11a59d1b8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 2d9953105SMichael Ellerman /* 3d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 4d9953105SMichael Ellerman */ 5d9953105SMichael Ellerman 6d9953105SMichael Ellerman #include <linux/sched.h> 7d9953105SMichael Ellerman #include <linux/interrupt.h> 8d9953105SMichael Ellerman #include <linux/irq.h> 990128997SAnton Blanchard #include <linux/of.h> 1055fc0c56SAnton Blanchard #include <linux/fs.h> 1155fc0c56SAnton Blanchard #include <linux/reboot.h> 1294675cceSMahesh Salgaonkar #include <linux/irq_work.h> 13d9953105SMichael Ellerman 14d9953105SMichael Ellerman #include <asm/machdep.h> 15d9953105SMichael Ellerman #include <asm/rtas.h> 168c4f1f29SMichael Ellerman #include <asm/firmware.h> 17a43c1590SMahesh Salgaonkar #include <asm/mce.h> 18d9953105SMichael Ellerman 19577830b0SMichael Ellerman #include "pseries.h" 20c902be71SArnd Bergmann 21d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 22d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 23d9953105SMichael Ellerman 24d9953105SMichael Ellerman static int ras_check_exception_token; 25d9953105SMichael Ellerman 2694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 2794675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 2894675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 2994675cceSMahesh Salgaonkar }; 3094675cceSMahesh Salgaonkar 31d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 32d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 33d9953105SMichael Ellerman 34b4af279aSVipin K Parashar /* EPOW events counter variable */ 35b4af279aSVipin K Parashar static int num_epow_events; 36b4af279aSVipin K Parashar 37b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 387d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 397d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 40d9953105SMichael Ellerman 4104fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 4204fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 4304fce21cSMahesh Salgaonkar __be32 fru_id; 4404fce21cSMahesh Salgaonkar __be32 proc_id; 4504fce21cSMahesh Salgaonkar u8 error_type; 4604fce21cSMahesh Salgaonkar /* 4704fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 4804fce21cSMahesh Salgaonkar * 4904fce21cSMahesh Salgaonkar * MSB0 5004fce21cSMahesh Salgaonkar * | 5104fce21cSMahesh Salgaonkar * V 5204fce21cSMahesh Salgaonkar * 01234567 5304fce21cSMahesh Salgaonkar * XXXXXXXX 5404fce21cSMahesh Salgaonkar * 5504fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 5604fce21cSMahesh Salgaonkar * XXXXXXXX 5704fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 5804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 5904fce21cSMahesh Salgaonkar * X 1: Logical address provided. 6004fce21cSMahesh Salgaonkar * XX 2: Reserved. 6104fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 6204fce21cSMahesh Salgaonkar * 6304fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 6404fce21cSMahesh Salgaonkar * XXXXXXXX 6504fce21cSMahesh Salgaonkar * X 1: Effective address provided. 6604fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 6704fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 6804fce21cSMahesh Salgaonkar */ 6904fce21cSMahesh Salgaonkar u8 sub_err_type; 7004fce21cSMahesh Salgaonkar u8 reserved_1[6]; 7104fce21cSMahesh Salgaonkar __be64 effective_address; 7204fce21cSMahesh Salgaonkar __be64 logical_address; 7304fce21cSMahesh Salgaonkar } __packed; 7404fce21cSMahesh Salgaonkar 7504fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 7604fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 7704fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 7804fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 799ca766f9SNicholas Piggin #define MC_ERROR_TYPE_UNKNOWN 0x03 8004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 8104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 8204fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 8304fce21cSMahesh Salgaonkar 8404fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 8504fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 8604fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 8704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 8804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 8904fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 9004fce21cSMahesh Salgaonkar 919ca766f9SNicholas Piggin #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 929ca766f9SNicholas Piggin #define UE_LOGICAL_ADDR_PROVIDED 0x20 939ca766f9SNicholas Piggin 9404fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 9504fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 9604fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 9704fce21cSMahesh Salgaonkar 9804fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 10104fce21cSMahesh Salgaonkar 10204fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 10304fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 10404fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 10504fce21cSMahesh Salgaonkar 10604fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 10704fce21cSMahesh Salgaonkar { 10804fce21cSMahesh Salgaonkar switch (mlog->error_type) { 10904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 11004fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 11104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 11204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 11304fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 11404fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 11504fce21cSMahesh Salgaonkar default: 11604fce21cSMahesh Salgaonkar return 0; 11704fce21cSMahesh Salgaonkar } 11804fce21cSMahesh Salgaonkar } 11904fce21cSMahesh Salgaonkar 120d9953105SMichael Ellerman /* 121c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 122c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 123c9dccf1dSSam Bobroff * subsys stage. 124c9dccf1dSSam Bobroff */ 125c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 126c9dccf1dSSam Bobroff { 127c9dccf1dSSam Bobroff struct device_node *np; 128c9dccf1dSSam Bobroff 129c9dccf1dSSam Bobroff /* Hotplug Events */ 130c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 131c9dccf1dSSam Bobroff if (np != NULL) { 132c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 133c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 134c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 135c9dccf1dSSam Bobroff of_node_put(np); 136c9dccf1dSSam Bobroff } 137c9dccf1dSSam Bobroff 138c9dccf1dSSam Bobroff return 0; 139c9dccf1dSSam Bobroff } 140c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 141c9dccf1dSSam Bobroff 142c9dccf1dSSam Bobroff /* 143d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 144d9953105SMichael Ellerman * and power system events. 145d9953105SMichael Ellerman */ 146d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 147d9953105SMichael Ellerman { 148d9953105SMichael Ellerman struct device_node *np; 149d9953105SMichael Ellerman 150d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 151d9953105SMichael Ellerman 152d9953105SMichael Ellerman /* Internal Errors */ 153d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 154d9953105SMichael Ellerman if (np != NULL) { 15532c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 15632c96f77SMark Nelson "RAS_ERROR"); 157d9953105SMichael Ellerman of_node_put(np); 158d9953105SMichael Ellerman } 159d9953105SMichael Ellerman 160d9953105SMichael Ellerman /* EPOW Events */ 161d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 162d9953105SMichael Ellerman if (np != NULL) { 16332c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 164d9953105SMichael Ellerman of_node_put(np); 165d9953105SMichael Ellerman } 166d9953105SMichael Ellerman 16769ed3324SAnton Blanchard return 0; 168d9953105SMichael Ellerman } 1698e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 170d9953105SMichael Ellerman 17155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 17255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 17355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 17455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 17555fc0c56SAnton Blanchard 17655fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 17755fc0c56SAnton Blanchard { 17855fc0c56SAnton Blanchard switch (event_modifier) { 17955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 180b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1811b7e0cbeSliguang orderly_poweroff(true); 18255fc0c56SAnton Blanchard break; 18355fc0c56SAnton Blanchard 18455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 185b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 186b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 18779872e35SAnshuman Khandual orderly_poweroff(true); 18855fc0c56SAnton Blanchard break; 18955fc0c56SAnton Blanchard 19055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 191b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 192b4af279aSVipin K Parashar " RTAS error log for details\n"); 1931b7e0cbeSliguang orderly_poweroff(true); 19455fc0c56SAnton Blanchard break; 19555fc0c56SAnton Blanchard 19655fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 197b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 198b4af279aSVipin K Parashar " error log for details\n"); 1991b7e0cbeSliguang orderly_poweroff(true); 20055fc0c56SAnton Blanchard break; 20155fc0c56SAnton Blanchard 20255fc0c56SAnton Blanchard default: 203b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 20455fc0c56SAnton Blanchard event_modifier); 20555fc0c56SAnton Blanchard } 20655fc0c56SAnton Blanchard } 20755fc0c56SAnton Blanchard 20855fc0c56SAnton Blanchard struct epow_errorlog { 20955fc0c56SAnton Blanchard unsigned char sensor_value; 21055fc0c56SAnton Blanchard unsigned char event_modifier; 21155fc0c56SAnton Blanchard unsigned char extended_modifier; 21255fc0c56SAnton Blanchard unsigned char reserved; 21355fc0c56SAnton Blanchard unsigned char platform_reason; 21455fc0c56SAnton Blanchard }; 21555fc0c56SAnton Blanchard 21655fc0c56SAnton Blanchard #define EPOW_RESET 0 21755fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 21855fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 21955fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 22055fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 22155fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 22255fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 22355fc0c56SAnton Blanchard 224e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 22555fc0c56SAnton Blanchard { 22655fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 22755fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 22855fc0c56SAnton Blanchard char action_code; 22955fc0c56SAnton Blanchard char modifier; 23055fc0c56SAnton Blanchard 23155fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 23255fc0c56SAnton Blanchard if (pseries_log == NULL) 23355fc0c56SAnton Blanchard return; 23455fc0c56SAnton Blanchard 23555fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 23655fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 23755fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 23855fc0c56SAnton Blanchard 23955fc0c56SAnton Blanchard switch (action_code) { 24055fc0c56SAnton Blanchard case EPOW_RESET: 241b4af279aSVipin K Parashar if (num_epow_events) { 242b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 243b4af279aSVipin K Parashar num_epow_events--; 244b4af279aSVipin K Parashar } 24555fc0c56SAnton Blanchard break; 24655fc0c56SAnton Blanchard 24755fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 248b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 249b4af279aSVipin K Parashar " log for details\n"); 25055fc0c56SAnton Blanchard break; 25155fc0c56SAnton Blanchard 25255fc0c56SAnton Blanchard case EPOW_WARN_POWER: 253b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 254b4af279aSVipin K Parashar " log for details\n"); 25555fc0c56SAnton Blanchard break; 25655fc0c56SAnton Blanchard 25755fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 258d273fa91SYueHaibing handle_system_shutdown(modifier); 25955fc0c56SAnton Blanchard break; 26055fc0c56SAnton Blanchard 26155fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 262b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 263b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2641b7e0cbeSliguang orderly_poweroff(true); 26555fc0c56SAnton Blanchard break; 26655fc0c56SAnton Blanchard 26755fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 26855fc0c56SAnton Blanchard case EPOW_POWER_OFF: 269b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 270b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 27155fc0c56SAnton Blanchard emergency_sync(); 27255fc0c56SAnton Blanchard kernel_power_off(); 27355fc0c56SAnton Blanchard break; 27455fc0c56SAnton Blanchard 27555fc0c56SAnton Blanchard default: 276b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 27755fc0c56SAnton Blanchard action_code); 27855fc0c56SAnton Blanchard } 279b4af279aSVipin K Parashar 280b4af279aSVipin K Parashar /* Increment epow events counter variable */ 281b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 282b4af279aSVipin K Parashar num_epow_events++; 28355fc0c56SAnton Blanchard } 28455fc0c56SAnton Blanchard 285b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 286b7d9eb39SJohn Allen { 287b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 288b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 289b7d9eb39SJohn Allen 290b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 291b7d9eb39SJohn Allen 292b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 293b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 294b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 295b7d9eb39SJohn Allen rtas_get_error_log_max()); 296b7d9eb39SJohn Allen 297b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 298b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 299b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 300b7d9eb39SJohn Allen 301b7d9eb39SJohn Allen /* 302b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 303b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 304b7d9eb39SJohn Allen */ 305b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 3064c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || 3074c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) 308fd12527aSNathan Fontenot queue_hotplug_event(hp_elog); 309b7d9eb39SJohn Allen else 310b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 311b7d9eb39SJohn Allen 312b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 313b7d9eb39SJohn Allen return IRQ_HANDLED; 314b7d9eb39SJohn Allen } 315b7d9eb39SJohn Allen 31655fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3177d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 318d9953105SMichael Ellerman { 31955fc0c56SAnton Blanchard int status; 32055fc0c56SAnton Blanchard int state; 321d9953105SMichael Ellerman int critical; 322d9953105SMichael Ellerman 3231c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3241c2cb594SThomas Huth &state); 325d9953105SMichael Ellerman 326d9953105SMichael Ellerman if (state > 3) 327d9953105SMichael Ellerman critical = 1; /* Time Critical */ 328d9953105SMichael Ellerman else 329d9953105SMichael Ellerman critical = 0; 330d9953105SMichael Ellerman 331d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 332d9953105SMichael Ellerman 333d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 334b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 335476eb491SGrant Likely virq_to_hw(irq), 3366f43747fSAnton Blanchard RTAS_EPOW_WARNING, 337d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 338d9953105SMichael Ellerman rtas_get_error_log_max()); 339d9953105SMichael Ellerman 340d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 341d9953105SMichael Ellerman 34255fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 34355fc0c56SAnton Blanchard 344d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 345d9953105SMichael Ellerman return IRQ_HANDLED; 346d9953105SMichael Ellerman } 347d9953105SMichael Ellerman 348d9953105SMichael Ellerman /* 349d9953105SMichael Ellerman * Handle hardware error interrupts. 350d9953105SMichael Ellerman * 351d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 352d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 353d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 354d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 355d9953105SMichael Ellerman */ 3567d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 357d9953105SMichael Ellerman { 358d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 359cc8b5263SAnton Blanchard int status; 360d9953105SMichael Ellerman int fatal; 361d9953105SMichael Ellerman 362d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 363d9953105SMichael Ellerman 364d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 365b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 366476eb491SGrant Likely virq_to_hw(irq), 367d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 368d9953105SMichael Ellerman __pa(&ras_log_buf), 369d9953105SMichael Ellerman rtas_get_error_log_max()); 370d9953105SMichael Ellerman 371d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 372d9953105SMichael Ellerman 373a08a53eaSGreg Kurz if (status == 0 && 374a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 375d9953105SMichael Ellerman fatal = 1; 376d9953105SMichael Ellerman else 377d9953105SMichael Ellerman fatal = 0; 378d9953105SMichael Ellerman 379d9953105SMichael Ellerman /* format and print the extended information */ 380d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 381d9953105SMichael Ellerman 382d9953105SMichael Ellerman if (fatal) { 383b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 384b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 385cc8b5263SAnton Blanchard emergency_sync(); 386cc8b5263SAnton Blanchard kernel_power_off(); 387d9953105SMichael Ellerman } else { 388b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 389d9953105SMichael Ellerman } 390d9953105SMichael Ellerman 391d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 392d9953105SMichael Ellerman return IRQ_HANDLED; 393d9953105SMichael Ellerman } 394d9953105SMichael Ellerman 395d368514cSAnton Blanchard /* 396d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 397d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 398deb70f7aSNicholas Piggin * Minimum size of the buffer is 16 bytes. 399d368514cSAnton Blanchard */ 400d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 401deb70f7aSNicholas Piggin ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \ 402deb70f7aSNicholas Piggin (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16)))) 403d368514cSAnton Blanchard 40494675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 40594675cceSMahesh Salgaonkar { 40694675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 40794675cceSMahesh Salgaonkar } 40894675cceSMahesh Salgaonkar 409d7b14c5cSNicholas Piggin static __be64 *fwnmi_get_savep(struct pt_regs *regs) 410d7b14c5cSNicholas Piggin { 411d7b14c5cSNicholas Piggin unsigned long savep_ra; 412d7b14c5cSNicholas Piggin 413d7b14c5cSNicholas Piggin /* Mask top two bits */ 414d7b14c5cSNicholas Piggin savep_ra = regs->gpr[3] & ~(0x3UL << 62); 415d7b14c5cSNicholas Piggin if (!VALID_FWNMI_BUFFER(savep_ra)) { 416d7b14c5cSNicholas Piggin printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 417d7b14c5cSNicholas Piggin return NULL; 418d7b14c5cSNicholas Piggin } 419d7b14c5cSNicholas Piggin 420d7b14c5cSNicholas Piggin return __va(savep_ra); 421d7b14c5cSNicholas Piggin } 422d7b14c5cSNicholas Piggin 423d368514cSAnton Blanchard /* 424d368514cSAnton Blanchard * Get the error information for errors coming through the 425d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 426d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 427d9953105SMichael Ellerman * will be returned if found. 428d9953105SMichael Ellerman * 42994675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 430d368514cSAnton Blanchard * 43194675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 432d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 433d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 434d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 435d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 436d9953105SMichael Ellerman * second machine check did come in. 437d9953105SMichael Ellerman */ 438d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 439d9953105SMichael Ellerman { 44094675cceSMahesh Salgaonkar struct rtas_error_log *h; 441d7b14c5cSNicholas Piggin __be64 *savep; 442d9953105SMichael Ellerman 443d7b14c5cSNicholas Piggin savep = fwnmi_get_savep(regs); 444d7b14c5cSNicholas Piggin if (!savep) 445d368514cSAnton Blanchard return NULL; 446d368514cSAnton Blanchard 447cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 448d368514cSAnton Blanchard 449d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 45094675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 45194675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 452a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 45394675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 454d368514cSAnton Blanchard } else { 455a08a53eaSGreg Kurz int len, error_log_length; 456d368514cSAnton Blanchard 457a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 45874e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 45994675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 460d368514cSAnton Blanchard } 461d368514cSAnton Blanchard 46294675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 463d9953105SMichael Ellerman } 464d9953105SMichael Ellerman 465d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 466d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 467d9953105SMichael Ellerman * partition to receive FWNMI errors. 468d9953105SMichael Ellerman */ 469d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 470d9953105SMichael Ellerman { 471*2576f5f9SNicholas Piggin struct rtas_args rtas_args; 472*2576f5f9SNicholas Piggin int ret; 473*2576f5f9SNicholas Piggin 474*2576f5f9SNicholas Piggin /* 475*2576f5f9SNicholas Piggin * On pseries, the machine check stack is limited to under 4GB, so 476*2576f5f9SNicholas Piggin * args can be on-stack. 477*2576f5f9SNicholas Piggin */ 478*2576f5f9SNicholas Piggin rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL); 479*2576f5f9SNicholas Piggin ret = be32_to_cpu(rtas_args.rets[0]); 480d9953105SMichael Ellerman if (ret != 0) 481d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 482d9953105SMichael Ellerman } 483d9953105SMichael Ellerman 484c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 485d9953105SMichael Ellerman { 486bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 487bded0706SNicholas Piggin /* 488bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 489bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 490bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 491bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 492bded0706SNicholas Piggin */ 493bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 494bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 495bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 496bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 497bded0706SNicholas Piggin regs->msr = 0; 498bded0706SNicholas Piggin } 499bded0706SNicholas Piggin #endif 500bded0706SNicholas Piggin 501d9953105SMichael Ellerman if (fwnmi_active) { 502d7b14c5cSNicholas Piggin __be64 *savep; 503d7b14c5cSNicholas Piggin 504d7b14c5cSNicholas Piggin /* 505d7b14c5cSNicholas Piggin * Firmware (PowerVM and KVM) saves r3 to a save area like 506d7b14c5cSNicholas Piggin * machine check, which is not exactly what PAPR (2.9) 507d7b14c5cSNicholas Piggin * suggests but there is no way to detect otherwise, so this 508d7b14c5cSNicholas Piggin * is the interface now. 509d7b14c5cSNicholas Piggin * 510d7b14c5cSNicholas Piggin * System resets do not save any error log or require an 511d7b14c5cSNicholas Piggin * "ibm,nmi-interlock" rtas call to release. 512d7b14c5cSNicholas Piggin */ 513d7b14c5cSNicholas Piggin 514d7b14c5cSNicholas Piggin savep = fwnmi_get_savep(regs); 515d7b14c5cSNicholas Piggin if (savep) 516d7b14c5cSNicholas Piggin regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 517d9953105SMichael Ellerman } 518102c05e8SNicholas Piggin 519102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 520102c05e8SNicholas Piggin return 1; 521102c05e8SNicholas Piggin 522c902be71SArnd Bergmann return 0; /* need to perform reset */ 523d9953105SMichael Ellerman } 524d9953105SMichael Ellerman 5258f0b8056SMahesh Salgaonkar 5269ca766f9SNicholas Piggin static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) 5278f0b8056SMahesh Salgaonkar { 5289ca766f9SNicholas Piggin struct mce_error_info mce_err = { 0 }; 5299ca766f9SNicholas Piggin unsigned long eaddr = 0, paddr = 0; 5308f0b8056SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 5318f0b8056SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 5329ca766f9SNicholas Piggin int disposition = rtas_error_disposition(errp); 5339ca766f9SNicholas Piggin int initiator = rtas_error_initiator(errp); 5349ca766f9SNicholas Piggin int severity = rtas_error_severity(errp); 5358f0b8056SMahesh Salgaonkar u8 error_type, err_sub_type; 5368f0b8056SMahesh Salgaonkar 5379ca766f9SNicholas Piggin if (initiator == RTAS_INITIATOR_UNKNOWN) 5389ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5399ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_CPU) 5409ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_CPU; 5419ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_PCI) 5429ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_PCI; 5439ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_ISA) 5449ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_ISA; 5459ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_MEMORY) 5469ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_MEMORY; 5479ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_POWERMGM) 5489ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_POWERMGM; 5499ca766f9SNicholas Piggin else 5509ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5518f0b8056SMahesh Salgaonkar 5529ca766f9SNicholas Piggin if (severity == RTAS_SEVERITY_NO_ERROR) 5539ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_NO_ERROR; 5549ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_EVENT) 5559ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5569ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_WARNING) 5579ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5589ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR_SYNC) 5599ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5609ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR) 5619ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5629ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_FATAL) 5639ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 5649ca766f9SNicholas Piggin else 5659ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 5668f0b8056SMahesh Salgaonkar 5679ca766f9SNicholas Piggin if (severity <= RTAS_SEVERITY_ERROR_SYNC) 5689ca766f9SNicholas Piggin mce_err.sync_error = true; 5699ca766f9SNicholas Piggin else 5709ca766f9SNicholas Piggin mce_err.sync_error = false; 5718f0b8056SMahesh Salgaonkar 5729ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 5739ca766f9SNicholas Piggin mce_err.error_class = MCE_ECLASS_UNKNOWN; 574a43c1590SMahesh Salgaonkar 575a43c1590SMahesh Salgaonkar if (!rtas_error_extended(errp)) 576a43c1590SMahesh Salgaonkar goto out; 577a43c1590SMahesh Salgaonkar 578a43c1590SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 579a43c1590SMahesh Salgaonkar if (pseries_log == NULL) 580a43c1590SMahesh Salgaonkar goto out; 581a43c1590SMahesh Salgaonkar 582a43c1590SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 583a43c1590SMahesh Salgaonkar error_type = mce_log->error_type; 5849ca766f9SNicholas Piggin err_sub_type = rtas_mc_error_sub_type(mce_log); 5859ca766f9SNicholas Piggin 5869ca766f9SNicholas Piggin switch (mce_log->error_type) { 5879ca766f9SNicholas Piggin case MC_ERROR_TYPE_UE: 5889ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UE; 589efbc4303SGanesh Goudar mce_common_process_ue(regs, &mce_err); 590efbc4303SGanesh Goudar if (mce_err.ignore_event) 591efbc4303SGanesh Goudar disposition = RTAS_DISP_FULLY_RECOVERED; 5929ca766f9SNicholas Piggin switch (err_sub_type) { 5939ca766f9SNicholas Piggin case MC_ERROR_UE_IFETCH: 5949ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; 5959ca766f9SNicholas Piggin break; 5969ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: 5979ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; 5989ca766f9SNicholas Piggin break; 5999ca766f9SNicholas Piggin case MC_ERROR_UE_LOAD_STORE: 6009ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; 6019ca766f9SNicholas Piggin break; 6029ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: 6039ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; 6049ca766f9SNicholas Piggin break; 6059ca766f9SNicholas Piggin case MC_ERROR_UE_INDETERMINATE: 6069ca766f9SNicholas Piggin default: 6079ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; 6089ca766f9SNicholas Piggin break; 6099ca766f9SNicholas Piggin } 6109ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) 6119ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6129ca766f9SNicholas Piggin 6139ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { 6149ca766f9SNicholas Piggin paddr = be64_to_cpu(mce_log->logical_address); 6159ca766f9SNicholas Piggin } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { 6169ca766f9SNicholas Piggin unsigned long pfn; 6179ca766f9SNicholas Piggin 6189ca766f9SNicholas Piggin pfn = addr_to_pfn(regs, eaddr); 6199ca766f9SNicholas Piggin if (pfn != ULONG_MAX) 6209ca766f9SNicholas Piggin paddr = pfn << PAGE_SHIFT; 6219ca766f9SNicholas Piggin } 6229ca766f9SNicholas Piggin 6239ca766f9SNicholas Piggin break; 6249ca766f9SNicholas Piggin case MC_ERROR_TYPE_SLB: 6259ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_SLB; 6269ca766f9SNicholas Piggin switch (err_sub_type) { 6279ca766f9SNicholas Piggin case MC_ERROR_SLB_PARITY: 6289ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; 6299ca766f9SNicholas Piggin break; 6309ca766f9SNicholas Piggin case MC_ERROR_SLB_MULTIHIT: 6319ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; 6329ca766f9SNicholas Piggin break; 6339ca766f9SNicholas Piggin case MC_ERROR_SLB_INDETERMINATE: 6349ca766f9SNicholas Piggin default: 6359ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; 6369ca766f9SNicholas Piggin break; 6379ca766f9SNicholas Piggin } 6389ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6399ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6409ca766f9SNicholas Piggin break; 6419ca766f9SNicholas Piggin case MC_ERROR_TYPE_ERAT: 6429ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_ERAT; 6439ca766f9SNicholas Piggin switch (err_sub_type) { 6449ca766f9SNicholas Piggin case MC_ERROR_ERAT_PARITY: 6459ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; 6469ca766f9SNicholas Piggin break; 6479ca766f9SNicholas Piggin case MC_ERROR_ERAT_MULTIHIT: 6489ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 6499ca766f9SNicholas Piggin break; 6509ca766f9SNicholas Piggin case MC_ERROR_ERAT_INDETERMINATE: 6519ca766f9SNicholas Piggin default: 6529ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; 6539ca766f9SNicholas Piggin break; 6549ca766f9SNicholas Piggin } 6559ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6569ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6579ca766f9SNicholas Piggin break; 6589ca766f9SNicholas Piggin case MC_ERROR_TYPE_TLB: 6599ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_TLB; 6609ca766f9SNicholas Piggin switch (err_sub_type) { 6619ca766f9SNicholas Piggin case MC_ERROR_TLB_PARITY: 6629ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; 6639ca766f9SNicholas Piggin break; 6649ca766f9SNicholas Piggin case MC_ERROR_TLB_MULTIHIT: 6659ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; 6669ca766f9SNicholas Piggin break; 6679ca766f9SNicholas Piggin case MC_ERROR_TLB_INDETERMINATE: 6689ca766f9SNicholas Piggin default: 6699ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; 6709ca766f9SNicholas Piggin break; 6719ca766f9SNicholas Piggin } 6729ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6739ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6749ca766f9SNicholas Piggin break; 6759ca766f9SNicholas Piggin case MC_ERROR_TYPE_D_CACHE: 6769ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_DCACHE; 6779ca766f9SNicholas Piggin break; 6789ca766f9SNicholas Piggin case MC_ERROR_TYPE_I_CACHE: 6799ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_DCACHE; 6809ca766f9SNicholas Piggin break; 6819ca766f9SNicholas Piggin case MC_ERROR_TYPE_UNKNOWN: 6829ca766f9SNicholas Piggin default: 6839ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 6849ca766f9SNicholas Piggin break; 6859ca766f9SNicholas Piggin } 686a43c1590SMahesh Salgaonkar 687a43c1590SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 688a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_NOT_RECOVERED) { 689a43c1590SMahesh Salgaonkar switch (error_type) { 690a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 691a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 692c6d15258SMahesh Salgaonkar /* 693c6d15258SMahesh Salgaonkar * Store the old slb content in paca before flushing. 694c6d15258SMahesh Salgaonkar * Print this when we go to virtual mode. 695c6d15258SMahesh Salgaonkar * There are chances that we may hit MCE again if there 696c6d15258SMahesh Salgaonkar * is a parity error on the SLB entry we trying to read 697c6d15258SMahesh Salgaonkar * for saving. Hence limit the slb saving to single 698c6d15258SMahesh Salgaonkar * level of recursion. 699c6d15258SMahesh Salgaonkar */ 700c6d15258SMahesh Salgaonkar if (local_paca->in_mce == 1) 701c6d15258SMahesh Salgaonkar slb_save_contents(local_paca->mce_faulty_slbs); 702a43c1590SMahesh Salgaonkar flush_and_reload_slb(); 703a43c1590SMahesh Salgaonkar disposition = RTAS_DISP_FULLY_RECOVERED; 704a43c1590SMahesh Salgaonkar break; 705a43c1590SMahesh Salgaonkar default: 706a43c1590SMahesh Salgaonkar break; 707a43c1590SMahesh Salgaonkar } 7089ca766f9SNicholas Piggin } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 7099ca766f9SNicholas Piggin /* Platform corrected itself but could be degraded */ 7109ca766f9SNicholas Piggin printk(KERN_ERR "MCE: limited recovery, system may " 7119ca766f9SNicholas Piggin "be degraded\n"); 7129ca766f9SNicholas Piggin disposition = RTAS_DISP_FULLY_RECOVERED; 713a43c1590SMahesh Salgaonkar } 714a43c1590SMahesh Salgaonkar #endif 715a43c1590SMahesh Salgaonkar 716a43c1590SMahesh Salgaonkar out: 717a95a0a16SGanesh Goudar /* 718a95a0a16SGanesh Goudar * Enable translation as we will be accessing per-cpu variables 719a95a0a16SGanesh Goudar * in save_mce_event() which may fall outside RMO region, also 720a95a0a16SGanesh Goudar * leave it enabled because subsequently we will be queuing work 721a95a0a16SGanesh Goudar * to workqueues where again per-cpu variables accessed, besides 722a95a0a16SGanesh Goudar * fwnmi_release_errinfo() crashes when called in realmode on 723a95a0a16SGanesh Goudar * pseries. 724a95a0a16SGanesh Goudar * Note: All the realmode handling like flushing SLB entries for 725a95a0a16SGanesh Goudar * SLB multihit is done by now. 726a95a0a16SGanesh Goudar */ 727a95a0a16SGanesh Goudar mtmsr(mfmsr() | MSR_IR | MSR_DR); 7289ca766f9SNicholas Piggin save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, 7299ca766f9SNicholas Piggin &mce_err, regs->nip, eaddr, paddr); 7309ca766f9SNicholas Piggin 731a43c1590SMahesh Salgaonkar return disposition; 732a43c1590SMahesh Salgaonkar } 733a43c1590SMahesh Salgaonkar 734d9953105SMichael Ellerman /* 73594675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 73694675cceSMahesh Salgaonkar */ 73794675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 73894675cceSMahesh Salgaonkar { 73994675cceSMahesh Salgaonkar struct rtas_error_log *err; 74094675cceSMahesh Salgaonkar 74194675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 74294675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 74394675cceSMahesh Salgaonkar } 74494675cceSMahesh Salgaonkar 74594675cceSMahesh Salgaonkar /* 746d9953105SMichael Ellerman * See if we can recover from a machine check exception. 747d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 748d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 749d9953105SMichael Ellerman * which provides the error analysis for us. 750d9953105SMichael Ellerman * 751d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 752d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 753d9953105SMichael Ellerman */ 7549ca766f9SNicholas Piggin static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) 755d9953105SMichael Ellerman { 756d47d1d8aSAnton Blanchard int recovered = 0; 7578f0b8056SMahesh Salgaonkar 758d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 759d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 7608f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 761d47d1d8aSAnton Blanchard recovered = 0; 7629ca766f9SNicholas Piggin } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { 763d9953105SMichael Ellerman /* Platform corrected itself */ 764d47d1d8aSAnton Blanchard recovered = 1; 7659ca766f9SNicholas Piggin } else if (evt->severity == MCE_SEV_FATAL) { 7669ca766f9SNicholas Piggin /* Fatal machine check */ 7679ca766f9SNicholas Piggin pr_err("Machine check interrupt is fatal\n"); 7689ca766f9SNicholas Piggin recovered = 0; 769d9953105SMichael Ellerman } 770d9953105SMichael Ellerman 7719ca766f9SNicholas Piggin if (!recovered && evt->sync_error) { 7729ca766f9SNicholas Piggin /* 7739ca766f9SNicholas Piggin * Try to kill processes if we get a synchronous machine check 7749ca766f9SNicholas Piggin * (e.g., one caused by execution of this instruction). This 7759ca766f9SNicholas Piggin * will devolve into a panic if we try to kill init or are in 7769ca766f9SNicholas Piggin * an interrupt etc. 7779ca766f9SNicholas Piggin * 7789ca766f9SNicholas Piggin * TODO: Queue up this address for hwpoisioning later. 7799ca766f9SNicholas Piggin * TODO: This is not quite right for d-side machine 7809ca766f9SNicholas Piggin * checks ->nip is not necessarily the important 7819ca766f9SNicholas Piggin * address. 7829ca766f9SNicholas Piggin */ 7839ca766f9SNicholas Piggin if ((user_mode(regs))) { 7849ca766f9SNicholas Piggin _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 7859ca766f9SNicholas Piggin recovered = 1; 7869ca766f9SNicholas Piggin } else if (die_will_crash()) { 7879ca766f9SNicholas Piggin /* 7889ca766f9SNicholas Piggin * die() would kill the kernel, so better to go via 7899ca766f9SNicholas Piggin * the platform reboot code that will log the 7909ca766f9SNicholas Piggin * machine check. 7919ca766f9SNicholas Piggin */ 7929ca766f9SNicholas Piggin recovered = 0; 7939ca766f9SNicholas Piggin } else { 7949ca766f9SNicholas Piggin die("Machine check", regs, SIGBUS); 7959ca766f9SNicholas Piggin recovered = 1; 7969ca766f9SNicholas Piggin } 7979ca766f9SNicholas Piggin } 798d9953105SMichael Ellerman 799d47d1d8aSAnton Blanchard return recovered; 800d9953105SMichael Ellerman } 801d9953105SMichael Ellerman 802d9953105SMichael Ellerman /* 803d9953105SMichael Ellerman * Handle a machine check. 804d9953105SMichael Ellerman * 805d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 806d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 807d9953105SMichael Ellerman * error was recovered (never true if RI=0). 808d9953105SMichael Ellerman * 809d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 810d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 811d9953105SMichael Ellerman */ 812d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 813d9953105SMichael Ellerman { 8149ca766f9SNicholas Piggin struct machine_check_event evt; 815d9953105SMichael Ellerman 8169ca766f9SNicholas Piggin if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 8179ca766f9SNicholas Piggin return 0; 8189ca766f9SNicholas Piggin 8199ca766f9SNicholas Piggin /* Print things out */ 8209ca766f9SNicholas Piggin if (evt.version != MCE_V1) { 8219ca766f9SNicholas Piggin pr_err("Machine Check Exception, Unknown event version %d !\n", 8229ca766f9SNicholas Piggin evt.version); 8239ca766f9SNicholas Piggin return 0; 824d9953105SMichael Ellerman } 8259ca766f9SNicholas Piggin machine_check_print_event_info(&evt, user_mode(regs), false); 8269ca766f9SNicholas Piggin 8279ca766f9SNicholas Piggin if (recover_mce(regs, &evt)) 8289ca766f9SNicholas Piggin return 1; 829d9953105SMichael Ellerman 830d9953105SMichael Ellerman return 0; 831d9953105SMichael Ellerman } 832a43c1590SMahesh Salgaonkar 833a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 834a43c1590SMahesh Salgaonkar { 835a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 836a43c1590SMahesh Salgaonkar int disposition; 837a43c1590SMahesh Salgaonkar 838a43c1590SMahesh Salgaonkar if (fwnmi_active) { 839a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 840a43c1590SMahesh Salgaonkar /* 841a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 842a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 843a43c1590SMahesh Salgaonkar * virtual mode. 844a43c1590SMahesh Salgaonkar */ 8459ca766f9SNicholas Piggin disposition = mce_handle_error(regs, errp); 8469ca766f9SNicholas Piggin fwnmi_release_errinfo(); 8479ca766f9SNicholas Piggin 8489ca766f9SNicholas Piggin /* Queue irq work to log this rtas event later. */ 8499ca766f9SNicholas Piggin irq_work_queue(&mce_errlog_process_work); 8509ca766f9SNicholas Piggin 851a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 852a43c1590SMahesh Salgaonkar return 1; 853a43c1590SMahesh Salgaonkar } 854a43c1590SMahesh Salgaonkar 855a43c1590SMahesh Salgaonkar return 0; 856a43c1590SMahesh Salgaonkar } 857