11a59d1b8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 2d9953105SMichael Ellerman /* 3d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 4d9953105SMichael Ellerman */ 5d9953105SMichael Ellerman 6d9953105SMichael Ellerman #include <linux/sched.h> 7d9953105SMichael Ellerman #include <linux/interrupt.h> 8d9953105SMichael Ellerman #include <linux/irq.h> 990128997SAnton Blanchard #include <linux/of.h> 1055fc0c56SAnton Blanchard #include <linux/fs.h> 1155fc0c56SAnton Blanchard #include <linux/reboot.h> 1294675cceSMahesh Salgaonkar #include <linux/irq_work.h> 13d9953105SMichael Ellerman 14d9953105SMichael Ellerman #include <asm/machdep.h> 15d9953105SMichael Ellerman #include <asm/rtas.h> 168c4f1f29SMichael Ellerman #include <asm/firmware.h> 17a43c1590SMahesh Salgaonkar #include <asm/mce.h> 18d9953105SMichael Ellerman 19577830b0SMichael Ellerman #include "pseries.h" 20c902be71SArnd Bergmann 21d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 22d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 23d9953105SMichael Ellerman 24d9953105SMichael Ellerman static int ras_check_exception_token; 25d9953105SMichael Ellerman 2694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 2794675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 2894675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 2994675cceSMahesh Salgaonkar }; 3094675cceSMahesh Salgaonkar 31d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 32d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 33d9953105SMichael Ellerman 34b4af279aSVipin K Parashar /* EPOW events counter variable */ 35b4af279aSVipin K Parashar static int num_epow_events; 36b4af279aSVipin K Parashar 37b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 387d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 397d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 40d9953105SMichael Ellerman 4104fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 4204fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 4304fce21cSMahesh Salgaonkar __be32 fru_id; 4404fce21cSMahesh Salgaonkar __be32 proc_id; 4504fce21cSMahesh Salgaonkar u8 error_type; 4604fce21cSMahesh Salgaonkar /* 4704fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 4804fce21cSMahesh Salgaonkar * 4904fce21cSMahesh Salgaonkar * MSB0 5004fce21cSMahesh Salgaonkar * | 5104fce21cSMahesh Salgaonkar * V 5204fce21cSMahesh Salgaonkar * 01234567 5304fce21cSMahesh Salgaonkar * XXXXXXXX 5404fce21cSMahesh Salgaonkar * 5504fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 5604fce21cSMahesh Salgaonkar * XXXXXXXX 5704fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 5804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 5904fce21cSMahesh Salgaonkar * X 1: Logical address provided. 6004fce21cSMahesh Salgaonkar * XX 2: Reserved. 6104fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 6204fce21cSMahesh Salgaonkar * 6304fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 6404fce21cSMahesh Salgaonkar * XXXXXXXX 6504fce21cSMahesh Salgaonkar * X 1: Effective address provided. 6604fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 6704fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 6804fce21cSMahesh Salgaonkar */ 6904fce21cSMahesh Salgaonkar u8 sub_err_type; 7004fce21cSMahesh Salgaonkar u8 reserved_1[6]; 7104fce21cSMahesh Salgaonkar __be64 effective_address; 7204fce21cSMahesh Salgaonkar __be64 logical_address; 7304fce21cSMahesh Salgaonkar } __packed; 7404fce21cSMahesh Salgaonkar 7504fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 7604fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 7704fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 7804fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 799ca766f9SNicholas Piggin #define MC_ERROR_TYPE_UNKNOWN 0x03 8004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 8104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 8204fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 8304fce21cSMahesh Salgaonkar 8404fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 8504fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 8604fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 8704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 8804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 8904fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 9004fce21cSMahesh Salgaonkar 919ca766f9SNicholas Piggin #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 929ca766f9SNicholas Piggin #define UE_LOGICAL_ADDR_PROVIDED 0x20 939ca766f9SNicholas Piggin 9404fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 9504fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 9604fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 9704fce21cSMahesh Salgaonkar 9804fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 10104fce21cSMahesh Salgaonkar 10204fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 10304fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 10404fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 10504fce21cSMahesh Salgaonkar 10604fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 10704fce21cSMahesh Salgaonkar { 10804fce21cSMahesh Salgaonkar switch (mlog->error_type) { 10904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 11004fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 11104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 11204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 11304fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 11404fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 11504fce21cSMahesh Salgaonkar default: 11604fce21cSMahesh Salgaonkar return 0; 11704fce21cSMahesh Salgaonkar } 11804fce21cSMahesh Salgaonkar } 11904fce21cSMahesh Salgaonkar 120d9953105SMichael Ellerman /* 121c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 122c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 123c9dccf1dSSam Bobroff * subsys stage. 124c9dccf1dSSam Bobroff */ 125c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 126c9dccf1dSSam Bobroff { 127c9dccf1dSSam Bobroff struct device_node *np; 128c9dccf1dSSam Bobroff 129c9dccf1dSSam Bobroff /* Hotplug Events */ 130c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 131c9dccf1dSSam Bobroff if (np != NULL) { 132c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 133c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 134c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 135c9dccf1dSSam Bobroff of_node_put(np); 136c9dccf1dSSam Bobroff } 137c9dccf1dSSam Bobroff 138c9dccf1dSSam Bobroff return 0; 139c9dccf1dSSam Bobroff } 140c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 141c9dccf1dSSam Bobroff 142c9dccf1dSSam Bobroff /* 143d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 144d9953105SMichael Ellerman * and power system events. 145d9953105SMichael Ellerman */ 146d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 147d9953105SMichael Ellerman { 148d9953105SMichael Ellerman struct device_node *np; 149d9953105SMichael Ellerman 150d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 151d9953105SMichael Ellerman 152d9953105SMichael Ellerman /* Internal Errors */ 153d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 154d9953105SMichael Ellerman if (np != NULL) { 15532c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 15632c96f77SMark Nelson "RAS_ERROR"); 157d9953105SMichael Ellerman of_node_put(np); 158d9953105SMichael Ellerman } 159d9953105SMichael Ellerman 160d9953105SMichael Ellerman /* EPOW Events */ 161d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 162d9953105SMichael Ellerman if (np != NULL) { 16332c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 164d9953105SMichael Ellerman of_node_put(np); 165d9953105SMichael Ellerman } 166d9953105SMichael Ellerman 16769ed3324SAnton Blanchard return 0; 168d9953105SMichael Ellerman } 1698e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 170d9953105SMichael Ellerman 17155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 17255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 17355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 17455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 17555fc0c56SAnton Blanchard 17655fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 17755fc0c56SAnton Blanchard { 17855fc0c56SAnton Blanchard switch (event_modifier) { 17955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 180b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1811b7e0cbeSliguang orderly_poweroff(true); 18255fc0c56SAnton Blanchard break; 18355fc0c56SAnton Blanchard 18455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 185b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 186b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 18779872e35SAnshuman Khandual orderly_poweroff(true); 18855fc0c56SAnton Blanchard break; 18955fc0c56SAnton Blanchard 19055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 191b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 192b4af279aSVipin K Parashar " RTAS error log for details\n"); 1931b7e0cbeSliguang orderly_poweroff(true); 19455fc0c56SAnton Blanchard break; 19555fc0c56SAnton Blanchard 19655fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 197b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 198b4af279aSVipin K Parashar " error log for details\n"); 1991b7e0cbeSliguang orderly_poweroff(true); 20055fc0c56SAnton Blanchard break; 20155fc0c56SAnton Blanchard 20255fc0c56SAnton Blanchard default: 203b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 20455fc0c56SAnton Blanchard event_modifier); 20555fc0c56SAnton Blanchard } 20655fc0c56SAnton Blanchard } 20755fc0c56SAnton Blanchard 20855fc0c56SAnton Blanchard struct epow_errorlog { 20955fc0c56SAnton Blanchard unsigned char sensor_value; 21055fc0c56SAnton Blanchard unsigned char event_modifier; 21155fc0c56SAnton Blanchard unsigned char extended_modifier; 21255fc0c56SAnton Blanchard unsigned char reserved; 21355fc0c56SAnton Blanchard unsigned char platform_reason; 21455fc0c56SAnton Blanchard }; 21555fc0c56SAnton Blanchard 21655fc0c56SAnton Blanchard #define EPOW_RESET 0 21755fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 21855fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 21955fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 22055fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 22155fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 22255fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 22355fc0c56SAnton Blanchard 224e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 22555fc0c56SAnton Blanchard { 22655fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 22755fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 22855fc0c56SAnton Blanchard char action_code; 22955fc0c56SAnton Blanchard char modifier; 23055fc0c56SAnton Blanchard 23155fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 23255fc0c56SAnton Blanchard if (pseries_log == NULL) 23355fc0c56SAnton Blanchard return; 23455fc0c56SAnton Blanchard 23555fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 23655fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 23755fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 23855fc0c56SAnton Blanchard 23955fc0c56SAnton Blanchard switch (action_code) { 24055fc0c56SAnton Blanchard case EPOW_RESET: 241b4af279aSVipin K Parashar if (num_epow_events) { 242b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 243b4af279aSVipin K Parashar num_epow_events--; 244b4af279aSVipin K Parashar } 24555fc0c56SAnton Blanchard break; 24655fc0c56SAnton Blanchard 24755fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 248b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 249b4af279aSVipin K Parashar " log for details\n"); 25055fc0c56SAnton Blanchard break; 25155fc0c56SAnton Blanchard 25255fc0c56SAnton Blanchard case EPOW_WARN_POWER: 253b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 254b4af279aSVipin K Parashar " log for details\n"); 25555fc0c56SAnton Blanchard break; 25655fc0c56SAnton Blanchard 25755fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 258d273fa91SYueHaibing handle_system_shutdown(modifier); 25955fc0c56SAnton Blanchard break; 26055fc0c56SAnton Blanchard 26155fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 262b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 263b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2641b7e0cbeSliguang orderly_poweroff(true); 26555fc0c56SAnton Blanchard break; 26655fc0c56SAnton Blanchard 26755fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 26855fc0c56SAnton Blanchard case EPOW_POWER_OFF: 269b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 270b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 27155fc0c56SAnton Blanchard emergency_sync(); 27255fc0c56SAnton Blanchard kernel_power_off(); 27355fc0c56SAnton Blanchard break; 27455fc0c56SAnton Blanchard 27555fc0c56SAnton Blanchard default: 276b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 27755fc0c56SAnton Blanchard action_code); 27855fc0c56SAnton Blanchard } 279b4af279aSVipin K Parashar 280b4af279aSVipin K Parashar /* Increment epow events counter variable */ 281b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 282b4af279aSVipin K Parashar num_epow_events++; 28355fc0c56SAnton Blanchard } 28455fc0c56SAnton Blanchard 285b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 286b7d9eb39SJohn Allen { 287b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 288b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 289b7d9eb39SJohn Allen 290b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 291b7d9eb39SJohn Allen 292b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 293b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 294b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 295b7d9eb39SJohn Allen rtas_get_error_log_max()); 296b7d9eb39SJohn Allen 297b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 298b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 299b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 300b7d9eb39SJohn Allen 301b7d9eb39SJohn Allen /* 302b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 303b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 304b7d9eb39SJohn Allen */ 305b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 3064c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || 3074c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) 308fd12527aSNathan Fontenot queue_hotplug_event(hp_elog); 309b7d9eb39SJohn Allen else 310b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 311b7d9eb39SJohn Allen 312b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 313b7d9eb39SJohn Allen return IRQ_HANDLED; 314b7d9eb39SJohn Allen } 315b7d9eb39SJohn Allen 31655fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3177d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 318d9953105SMichael Ellerman { 31955fc0c56SAnton Blanchard int status; 32055fc0c56SAnton Blanchard int state; 321d9953105SMichael Ellerman int critical; 322d9953105SMichael Ellerman 3231c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3241c2cb594SThomas Huth &state); 325d9953105SMichael Ellerman 326d9953105SMichael Ellerman if (state > 3) 327d9953105SMichael Ellerman critical = 1; /* Time Critical */ 328d9953105SMichael Ellerman else 329d9953105SMichael Ellerman critical = 0; 330d9953105SMichael Ellerman 331d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 332d9953105SMichael Ellerman 333d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 334b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 335476eb491SGrant Likely virq_to_hw(irq), 3366f43747fSAnton Blanchard RTAS_EPOW_WARNING, 337d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 338d9953105SMichael Ellerman rtas_get_error_log_max()); 339d9953105SMichael Ellerman 340d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 341d9953105SMichael Ellerman 34255fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 34355fc0c56SAnton Blanchard 344d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 345d9953105SMichael Ellerman return IRQ_HANDLED; 346d9953105SMichael Ellerman } 347d9953105SMichael Ellerman 348d9953105SMichael Ellerman /* 349d9953105SMichael Ellerman * Handle hardware error interrupts. 350d9953105SMichael Ellerman * 351d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 352d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 353d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 354d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 355d9953105SMichael Ellerman */ 3567d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 357d9953105SMichael Ellerman { 358d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 359cc8b5263SAnton Blanchard int status; 360d9953105SMichael Ellerman int fatal; 361d9953105SMichael Ellerman 362d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 363d9953105SMichael Ellerman 364d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 365b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 366476eb491SGrant Likely virq_to_hw(irq), 367d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 368d9953105SMichael Ellerman __pa(&ras_log_buf), 369d9953105SMichael Ellerman rtas_get_error_log_max()); 370d9953105SMichael Ellerman 371d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 372d9953105SMichael Ellerman 373a08a53eaSGreg Kurz if (status == 0 && 374a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 375d9953105SMichael Ellerman fatal = 1; 376d9953105SMichael Ellerman else 377d9953105SMichael Ellerman fatal = 0; 378d9953105SMichael Ellerman 379d9953105SMichael Ellerman /* format and print the extended information */ 380d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 381d9953105SMichael Ellerman 382d9953105SMichael Ellerman if (fatal) { 383b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 384b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 385cc8b5263SAnton Blanchard emergency_sync(); 386cc8b5263SAnton Blanchard kernel_power_off(); 387d9953105SMichael Ellerman } else { 388b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 389d9953105SMichael Ellerman } 390d9953105SMichael Ellerman 391d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 392d9953105SMichael Ellerman return IRQ_HANDLED; 393d9953105SMichael Ellerman } 394d9953105SMichael Ellerman 395d368514cSAnton Blanchard /* 396d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 397d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 398d368514cSAnton Blanchard */ 399d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 400d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 401d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 402d368514cSAnton Blanchard 40394675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 40494675cceSMahesh Salgaonkar { 40594675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 40694675cceSMahesh Salgaonkar } 40794675cceSMahesh Salgaonkar 408d368514cSAnton Blanchard /* 409d368514cSAnton Blanchard * Get the error information for errors coming through the 410d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 411d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 412d9953105SMichael Ellerman * will be returned if found. 413d9953105SMichael Ellerman * 41494675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 415d368514cSAnton Blanchard * 41694675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 417d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 418d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 419d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 420d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 421d9953105SMichael Ellerman * second machine check did come in. 422d9953105SMichael Ellerman */ 423d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 424d9953105SMichael Ellerman { 425d9953105SMichael Ellerman unsigned long *savep; 42694675cceSMahesh Salgaonkar struct rtas_error_log *h; 427d9953105SMichael Ellerman 428ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 429ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 430ee1dd1e3SMahesh Salgaonkar 431d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 432f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 433d368514cSAnton Blanchard return NULL; 434d9953105SMichael Ellerman } 435d368514cSAnton Blanchard 436d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 437cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 438d368514cSAnton Blanchard 439d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 44094675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 44194675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 442a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 44394675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 444d368514cSAnton Blanchard } else { 445a08a53eaSGreg Kurz int len, error_log_length; 446d368514cSAnton Blanchard 447a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 44874e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 44994675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 450d368514cSAnton Blanchard } 451d368514cSAnton Blanchard 45294675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 453d9953105SMichael Ellerman } 454d9953105SMichael Ellerman 455d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 456d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 457d9953105SMichael Ellerman * partition to receive FWNMI errors. 458d9953105SMichael Ellerman */ 459d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 460d9953105SMichael Ellerman { 461d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 462d9953105SMichael Ellerman if (ret != 0) 463d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 464d9953105SMichael Ellerman } 465d9953105SMichael Ellerman 466c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 467d9953105SMichael Ellerman { 468bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 469bded0706SNicholas Piggin /* 470bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 471bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 472bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 473bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 474bded0706SNicholas Piggin */ 475bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 476bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 477bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 478bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 479bded0706SNicholas Piggin regs->msr = 0; 480bded0706SNicholas Piggin } 481bded0706SNicholas Piggin #endif 482bded0706SNicholas Piggin 483d9953105SMichael Ellerman if (fwnmi_active) { 484d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 485d9953105SMichael Ellerman if (errhdr) { 486d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 487d9953105SMichael Ellerman } 488d9953105SMichael Ellerman fwnmi_release_errinfo(); 489d9953105SMichael Ellerman } 490102c05e8SNicholas Piggin 491102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 492102c05e8SNicholas Piggin return 1; 493102c05e8SNicholas Piggin 494c902be71SArnd Bergmann return 0; /* need to perform reset */ 495d9953105SMichael Ellerman } 496d9953105SMichael Ellerman 4978f0b8056SMahesh Salgaonkar 4989ca766f9SNicholas Piggin static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) 4998f0b8056SMahesh Salgaonkar { 5009ca766f9SNicholas Piggin struct mce_error_info mce_err = { 0 }; 5019ca766f9SNicholas Piggin unsigned long eaddr = 0, paddr = 0; 5028f0b8056SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 5038f0b8056SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 5049ca766f9SNicholas Piggin int disposition = rtas_error_disposition(errp); 5059ca766f9SNicholas Piggin int initiator = rtas_error_initiator(errp); 5069ca766f9SNicholas Piggin int severity = rtas_error_severity(errp); 5078f0b8056SMahesh Salgaonkar u8 error_type, err_sub_type; 5088f0b8056SMahesh Salgaonkar 5099ca766f9SNicholas Piggin if (initiator == RTAS_INITIATOR_UNKNOWN) 5109ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5119ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_CPU) 5129ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_CPU; 5139ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_PCI) 5149ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_PCI; 5159ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_ISA) 5169ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_ISA; 5179ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_MEMORY) 5189ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_MEMORY; 5199ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_POWERMGM) 5209ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_POWERMGM; 5219ca766f9SNicholas Piggin else 5229ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5238f0b8056SMahesh Salgaonkar 5249ca766f9SNicholas Piggin if (severity == RTAS_SEVERITY_NO_ERROR) 5259ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_NO_ERROR; 5269ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_EVENT) 5279ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5289ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_WARNING) 5299ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5309ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR_SYNC) 5319ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5329ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR) 5339ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5349ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_FATAL) 5359ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 5369ca766f9SNicholas Piggin else 5379ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 5388f0b8056SMahesh Salgaonkar 5399ca766f9SNicholas Piggin if (severity <= RTAS_SEVERITY_ERROR_SYNC) 5409ca766f9SNicholas Piggin mce_err.sync_error = true; 5419ca766f9SNicholas Piggin else 5429ca766f9SNicholas Piggin mce_err.sync_error = false; 5438f0b8056SMahesh Salgaonkar 5449ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 5459ca766f9SNicholas Piggin mce_err.error_class = MCE_ECLASS_UNKNOWN; 546a43c1590SMahesh Salgaonkar 547a43c1590SMahesh Salgaonkar if (!rtas_error_extended(errp)) 548a43c1590SMahesh Salgaonkar goto out; 549a43c1590SMahesh Salgaonkar 550a43c1590SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 551a43c1590SMahesh Salgaonkar if (pseries_log == NULL) 552a43c1590SMahesh Salgaonkar goto out; 553a43c1590SMahesh Salgaonkar 554a43c1590SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 555a43c1590SMahesh Salgaonkar error_type = mce_log->error_type; 5569ca766f9SNicholas Piggin err_sub_type = rtas_mc_error_sub_type(mce_log); 5579ca766f9SNicholas Piggin 5589ca766f9SNicholas Piggin switch (mce_log->error_type) { 5599ca766f9SNicholas Piggin case MC_ERROR_TYPE_UE: 5609ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UE; 561*efbc4303SGanesh Goudar mce_common_process_ue(regs, &mce_err); 562*efbc4303SGanesh Goudar if (mce_err.ignore_event) 563*efbc4303SGanesh Goudar disposition = RTAS_DISP_FULLY_RECOVERED; 5649ca766f9SNicholas Piggin switch (err_sub_type) { 5659ca766f9SNicholas Piggin case MC_ERROR_UE_IFETCH: 5669ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; 5679ca766f9SNicholas Piggin break; 5689ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: 5699ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; 5709ca766f9SNicholas Piggin break; 5719ca766f9SNicholas Piggin case MC_ERROR_UE_LOAD_STORE: 5729ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; 5739ca766f9SNicholas Piggin break; 5749ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: 5759ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; 5769ca766f9SNicholas Piggin break; 5779ca766f9SNicholas Piggin case MC_ERROR_UE_INDETERMINATE: 5789ca766f9SNicholas Piggin default: 5799ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; 5809ca766f9SNicholas Piggin break; 5819ca766f9SNicholas Piggin } 5829ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) 5839ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 5849ca766f9SNicholas Piggin 5859ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { 5869ca766f9SNicholas Piggin paddr = be64_to_cpu(mce_log->logical_address); 5879ca766f9SNicholas Piggin } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { 5889ca766f9SNicholas Piggin unsigned long pfn; 5899ca766f9SNicholas Piggin 5909ca766f9SNicholas Piggin pfn = addr_to_pfn(regs, eaddr); 5919ca766f9SNicholas Piggin if (pfn != ULONG_MAX) 5929ca766f9SNicholas Piggin paddr = pfn << PAGE_SHIFT; 5939ca766f9SNicholas Piggin } 5949ca766f9SNicholas Piggin 5959ca766f9SNicholas Piggin break; 5969ca766f9SNicholas Piggin case MC_ERROR_TYPE_SLB: 5979ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_SLB; 5989ca766f9SNicholas Piggin switch (err_sub_type) { 5999ca766f9SNicholas Piggin case MC_ERROR_SLB_PARITY: 6009ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; 6019ca766f9SNicholas Piggin break; 6029ca766f9SNicholas Piggin case MC_ERROR_SLB_MULTIHIT: 6039ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; 6049ca766f9SNicholas Piggin break; 6059ca766f9SNicholas Piggin case MC_ERROR_SLB_INDETERMINATE: 6069ca766f9SNicholas Piggin default: 6079ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; 6089ca766f9SNicholas Piggin break; 6099ca766f9SNicholas Piggin } 6109ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6119ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6129ca766f9SNicholas Piggin break; 6139ca766f9SNicholas Piggin case MC_ERROR_TYPE_ERAT: 6149ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_ERAT; 6159ca766f9SNicholas Piggin switch (err_sub_type) { 6169ca766f9SNicholas Piggin case MC_ERROR_ERAT_PARITY: 6179ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; 6189ca766f9SNicholas Piggin break; 6199ca766f9SNicholas Piggin case MC_ERROR_ERAT_MULTIHIT: 6209ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 6219ca766f9SNicholas Piggin break; 6229ca766f9SNicholas Piggin case MC_ERROR_ERAT_INDETERMINATE: 6239ca766f9SNicholas Piggin default: 6249ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; 6259ca766f9SNicholas Piggin break; 6269ca766f9SNicholas Piggin } 6279ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6289ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6299ca766f9SNicholas Piggin break; 6309ca766f9SNicholas Piggin case MC_ERROR_TYPE_TLB: 6319ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_TLB; 6329ca766f9SNicholas Piggin switch (err_sub_type) { 6339ca766f9SNicholas Piggin case MC_ERROR_TLB_PARITY: 6349ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; 6359ca766f9SNicholas Piggin break; 6369ca766f9SNicholas Piggin case MC_ERROR_TLB_MULTIHIT: 6379ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; 6389ca766f9SNicholas Piggin break; 6399ca766f9SNicholas Piggin case MC_ERROR_TLB_INDETERMINATE: 6409ca766f9SNicholas Piggin default: 6419ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; 6429ca766f9SNicholas Piggin break; 6439ca766f9SNicholas Piggin } 6449ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6459ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6469ca766f9SNicholas Piggin break; 6479ca766f9SNicholas Piggin case MC_ERROR_TYPE_D_CACHE: 6489ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_DCACHE; 6499ca766f9SNicholas Piggin break; 6509ca766f9SNicholas Piggin case MC_ERROR_TYPE_I_CACHE: 6519ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_DCACHE; 6529ca766f9SNicholas Piggin break; 6539ca766f9SNicholas Piggin case MC_ERROR_TYPE_UNKNOWN: 6549ca766f9SNicholas Piggin default: 6559ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 6569ca766f9SNicholas Piggin break; 6579ca766f9SNicholas Piggin } 658a43c1590SMahesh Salgaonkar 659a43c1590SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 660a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_NOT_RECOVERED) { 661a43c1590SMahesh Salgaonkar switch (error_type) { 662a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 663a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 664c6d15258SMahesh Salgaonkar /* 665c6d15258SMahesh Salgaonkar * Store the old slb content in paca before flushing. 666c6d15258SMahesh Salgaonkar * Print this when we go to virtual mode. 667c6d15258SMahesh Salgaonkar * There are chances that we may hit MCE again if there 668c6d15258SMahesh Salgaonkar * is a parity error on the SLB entry we trying to read 669c6d15258SMahesh Salgaonkar * for saving. Hence limit the slb saving to single 670c6d15258SMahesh Salgaonkar * level of recursion. 671c6d15258SMahesh Salgaonkar */ 672c6d15258SMahesh Salgaonkar if (local_paca->in_mce == 1) 673c6d15258SMahesh Salgaonkar slb_save_contents(local_paca->mce_faulty_slbs); 674a43c1590SMahesh Salgaonkar flush_and_reload_slb(); 675a43c1590SMahesh Salgaonkar disposition = RTAS_DISP_FULLY_RECOVERED; 676a43c1590SMahesh Salgaonkar break; 677a43c1590SMahesh Salgaonkar default: 678a43c1590SMahesh Salgaonkar break; 679a43c1590SMahesh Salgaonkar } 6809ca766f9SNicholas Piggin } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 6819ca766f9SNicholas Piggin /* Platform corrected itself but could be degraded */ 6829ca766f9SNicholas Piggin printk(KERN_ERR "MCE: limited recovery, system may " 6839ca766f9SNicholas Piggin "be degraded\n"); 6849ca766f9SNicholas Piggin disposition = RTAS_DISP_FULLY_RECOVERED; 685a43c1590SMahesh Salgaonkar } 686a43c1590SMahesh Salgaonkar #endif 687a43c1590SMahesh Salgaonkar 688a43c1590SMahesh Salgaonkar out: 6899ca766f9SNicholas Piggin save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, 6909ca766f9SNicholas Piggin &mce_err, regs->nip, eaddr, paddr); 6919ca766f9SNicholas Piggin 692a43c1590SMahesh Salgaonkar return disposition; 693a43c1590SMahesh Salgaonkar } 694a43c1590SMahesh Salgaonkar 695d9953105SMichael Ellerman /* 69694675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 69794675cceSMahesh Salgaonkar */ 69894675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 69994675cceSMahesh Salgaonkar { 70094675cceSMahesh Salgaonkar struct rtas_error_log *err; 70194675cceSMahesh Salgaonkar 70294675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 70394675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 70494675cceSMahesh Salgaonkar } 70594675cceSMahesh Salgaonkar 70694675cceSMahesh Salgaonkar /* 707d9953105SMichael Ellerman * See if we can recover from a machine check exception. 708d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 709d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 710d9953105SMichael Ellerman * which provides the error analysis for us. 711d9953105SMichael Ellerman * 712d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 713d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 714d9953105SMichael Ellerman */ 7159ca766f9SNicholas Piggin static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) 716d9953105SMichael Ellerman { 717d47d1d8aSAnton Blanchard int recovered = 0; 7188f0b8056SMahesh Salgaonkar 719d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 720d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 7218f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 722d47d1d8aSAnton Blanchard recovered = 0; 7239ca766f9SNicholas Piggin } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { 724d9953105SMichael Ellerman /* Platform corrected itself */ 725d47d1d8aSAnton Blanchard recovered = 1; 7269ca766f9SNicholas Piggin } else if (evt->severity == MCE_SEV_FATAL) { 7279ca766f9SNicholas Piggin /* Fatal machine check */ 7289ca766f9SNicholas Piggin pr_err("Machine check interrupt is fatal\n"); 7299ca766f9SNicholas Piggin recovered = 0; 730d9953105SMichael Ellerman } 731d9953105SMichael Ellerman 7329ca766f9SNicholas Piggin if (!recovered && evt->sync_error) { 7339ca766f9SNicholas Piggin /* 7349ca766f9SNicholas Piggin * Try to kill processes if we get a synchronous machine check 7359ca766f9SNicholas Piggin * (e.g., one caused by execution of this instruction). This 7369ca766f9SNicholas Piggin * will devolve into a panic if we try to kill init or are in 7379ca766f9SNicholas Piggin * an interrupt etc. 7389ca766f9SNicholas Piggin * 7399ca766f9SNicholas Piggin * TODO: Queue up this address for hwpoisioning later. 7409ca766f9SNicholas Piggin * TODO: This is not quite right for d-side machine 7419ca766f9SNicholas Piggin * checks ->nip is not necessarily the important 7429ca766f9SNicholas Piggin * address. 7439ca766f9SNicholas Piggin */ 7449ca766f9SNicholas Piggin if ((user_mode(regs))) { 7459ca766f9SNicholas Piggin _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 7469ca766f9SNicholas Piggin recovered = 1; 7479ca766f9SNicholas Piggin } else if (die_will_crash()) { 7489ca766f9SNicholas Piggin /* 7499ca766f9SNicholas Piggin * die() would kill the kernel, so better to go via 7509ca766f9SNicholas Piggin * the platform reboot code that will log the 7519ca766f9SNicholas Piggin * machine check. 7529ca766f9SNicholas Piggin */ 7539ca766f9SNicholas Piggin recovered = 0; 7549ca766f9SNicholas Piggin } else { 7559ca766f9SNicholas Piggin die("Machine check", regs, SIGBUS); 7569ca766f9SNicholas Piggin recovered = 1; 7579ca766f9SNicholas Piggin } 7589ca766f9SNicholas Piggin } 759d9953105SMichael Ellerman 760d47d1d8aSAnton Blanchard return recovered; 761d9953105SMichael Ellerman } 762d9953105SMichael Ellerman 763d9953105SMichael Ellerman /* 764d9953105SMichael Ellerman * Handle a machine check. 765d9953105SMichael Ellerman * 766d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 767d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 768d9953105SMichael Ellerman * error was recovered (never true if RI=0). 769d9953105SMichael Ellerman * 770d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 771d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 772d9953105SMichael Ellerman */ 773d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 774d9953105SMichael Ellerman { 7759ca766f9SNicholas Piggin struct machine_check_event evt; 776d9953105SMichael Ellerman 7779ca766f9SNicholas Piggin if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 7789ca766f9SNicholas Piggin return 0; 7799ca766f9SNicholas Piggin 7809ca766f9SNicholas Piggin /* Print things out */ 7819ca766f9SNicholas Piggin if (evt.version != MCE_V1) { 7829ca766f9SNicholas Piggin pr_err("Machine Check Exception, Unknown event version %d !\n", 7839ca766f9SNicholas Piggin evt.version); 7849ca766f9SNicholas Piggin return 0; 785d9953105SMichael Ellerman } 7869ca766f9SNicholas Piggin machine_check_print_event_info(&evt, user_mode(regs), false); 7879ca766f9SNicholas Piggin 7889ca766f9SNicholas Piggin if (recover_mce(regs, &evt)) 7899ca766f9SNicholas Piggin return 1; 790d9953105SMichael Ellerman 791d9953105SMichael Ellerman return 0; 792d9953105SMichael Ellerman } 793a43c1590SMahesh Salgaonkar 794a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 795a43c1590SMahesh Salgaonkar { 796a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 797a43c1590SMahesh Salgaonkar int disposition; 798a43c1590SMahesh Salgaonkar 799a43c1590SMahesh Salgaonkar if (fwnmi_active) { 800a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 801a43c1590SMahesh Salgaonkar /* 802a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 803a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 804a43c1590SMahesh Salgaonkar * virtual mode. 805a43c1590SMahesh Salgaonkar */ 8069ca766f9SNicholas Piggin disposition = mce_handle_error(regs, errp); 8079ca766f9SNicholas Piggin fwnmi_release_errinfo(); 8089ca766f9SNicholas Piggin 8099ca766f9SNicholas Piggin /* Queue irq work to log this rtas event later. */ 8109ca766f9SNicholas Piggin irq_work_queue(&mce_errlog_process_work); 8119ca766f9SNicholas Piggin 812a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 813a43c1590SMahesh Salgaonkar return 1; 814a43c1590SMahesh Salgaonkar } 815a43c1590SMahesh Salgaonkar 816a43c1590SMahesh Salgaonkar return 0; 817a43c1590SMahesh Salgaonkar } 818