11a59d1b8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 2d9953105SMichael Ellerman /* 3d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 4d9953105SMichael Ellerman */ 5d9953105SMichael Ellerman 6d9953105SMichael Ellerman #include <linux/sched.h> 7d9953105SMichael Ellerman #include <linux/interrupt.h> 8d9953105SMichael Ellerman #include <linux/irq.h> 990128997SAnton Blanchard #include <linux/of.h> 1055fc0c56SAnton Blanchard #include <linux/fs.h> 1155fc0c56SAnton Blanchard #include <linux/reboot.h> 1294675cceSMahesh Salgaonkar #include <linux/irq_work.h> 13d9953105SMichael Ellerman 14d9953105SMichael Ellerman #include <asm/machdep.h> 15d9953105SMichael Ellerman #include <asm/rtas.h> 168c4f1f29SMichael Ellerman #include <asm/firmware.h> 17a43c1590SMahesh Salgaonkar #include <asm/mce.h> 18d9953105SMichael Ellerman 19577830b0SMichael Ellerman #include "pseries.h" 20c902be71SArnd Bergmann 21d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 22d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 23d9953105SMichael Ellerman 24d9953105SMichael Ellerman static int ras_check_exception_token; 25d9953105SMichael Ellerman 2694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 2794675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 2894675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 2994675cceSMahesh Salgaonkar }; 3094675cceSMahesh Salgaonkar 31d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 32d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 33d9953105SMichael Ellerman 34b4af279aSVipin K Parashar /* EPOW events counter variable */ 35b4af279aSVipin K Parashar static int num_epow_events; 36b4af279aSVipin K Parashar 37b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 387d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 397d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 40d9953105SMichael Ellerman 4104fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 4204fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 4304fce21cSMahesh Salgaonkar __be32 fru_id; 4404fce21cSMahesh Salgaonkar __be32 proc_id; 4504fce21cSMahesh Salgaonkar u8 error_type; 4604fce21cSMahesh Salgaonkar /* 4704fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 4804fce21cSMahesh Salgaonkar * 4904fce21cSMahesh Salgaonkar * MSB0 5004fce21cSMahesh Salgaonkar * | 5104fce21cSMahesh Salgaonkar * V 5204fce21cSMahesh Salgaonkar * 01234567 5304fce21cSMahesh Salgaonkar * XXXXXXXX 5404fce21cSMahesh Salgaonkar * 5504fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 5604fce21cSMahesh Salgaonkar * XXXXXXXX 5704fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 5804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 5904fce21cSMahesh Salgaonkar * X 1: Logical address provided. 6004fce21cSMahesh Salgaonkar * XX 2: Reserved. 6104fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 6204fce21cSMahesh Salgaonkar * 6304fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 6404fce21cSMahesh Salgaonkar * XXXXXXXX 6504fce21cSMahesh Salgaonkar * X 1: Effective address provided. 6604fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 6704fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 6804fce21cSMahesh Salgaonkar */ 6904fce21cSMahesh Salgaonkar u8 sub_err_type; 7004fce21cSMahesh Salgaonkar u8 reserved_1[6]; 7104fce21cSMahesh Salgaonkar __be64 effective_address; 7204fce21cSMahesh Salgaonkar __be64 logical_address; 7304fce21cSMahesh Salgaonkar } __packed; 7404fce21cSMahesh Salgaonkar 7504fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 7604fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 7704fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 7804fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 799ca766f9SNicholas Piggin #define MC_ERROR_TYPE_UNKNOWN 0x03 8004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 8104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 8204fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 8304fce21cSMahesh Salgaonkar 8404fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 8504fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 8604fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 8704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 8804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 8904fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 9004fce21cSMahesh Salgaonkar 919ca766f9SNicholas Piggin #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 929ca766f9SNicholas Piggin #define UE_LOGICAL_ADDR_PROVIDED 0x20 939ca766f9SNicholas Piggin 9404fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 9504fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 9604fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 9704fce21cSMahesh Salgaonkar 9804fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 10104fce21cSMahesh Salgaonkar 10204fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 10304fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 10404fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 10504fce21cSMahesh Salgaonkar 10604fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 10704fce21cSMahesh Salgaonkar { 10804fce21cSMahesh Salgaonkar switch (mlog->error_type) { 10904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 11004fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 11104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 11204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 11304fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 11404fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 11504fce21cSMahesh Salgaonkar default: 11604fce21cSMahesh Salgaonkar return 0; 11704fce21cSMahesh Salgaonkar } 11804fce21cSMahesh Salgaonkar } 11904fce21cSMahesh Salgaonkar 120d9953105SMichael Ellerman /* 121c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 122c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 123c9dccf1dSSam Bobroff * subsys stage. 124c9dccf1dSSam Bobroff */ 12590db8bf2SCédric Le Goater static int __init init_ras_hotplug_IRQ(void) 126c9dccf1dSSam Bobroff { 127c9dccf1dSSam Bobroff struct device_node *np; 128c9dccf1dSSam Bobroff 129c9dccf1dSSam Bobroff /* Hotplug Events */ 130c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 131c9dccf1dSSam Bobroff if (np != NULL) { 132c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 133c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 134c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 135c9dccf1dSSam Bobroff of_node_put(np); 136c9dccf1dSSam Bobroff } 137c9dccf1dSSam Bobroff 138c9dccf1dSSam Bobroff return 0; 139c9dccf1dSSam Bobroff } 140c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 141c9dccf1dSSam Bobroff 142c9dccf1dSSam Bobroff /* 143d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 144d9953105SMichael Ellerman * and power system events. 145d9953105SMichael Ellerman */ 146d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 147d9953105SMichael Ellerman { 148d9953105SMichael Ellerman struct device_node *np; 149d9953105SMichael Ellerman 150d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 151d9953105SMichael Ellerman 152d9953105SMichael Ellerman /* Internal Errors */ 153d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 154d9953105SMichael Ellerman if (np != NULL) { 15532c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 15632c96f77SMark Nelson "RAS_ERROR"); 157d9953105SMichael Ellerman of_node_put(np); 158d9953105SMichael Ellerman } 159d9953105SMichael Ellerman 160d9953105SMichael Ellerman /* EPOW Events */ 161d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 162d9953105SMichael Ellerman if (np != NULL) { 16332c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 164d9953105SMichael Ellerman of_node_put(np); 165d9953105SMichael Ellerman } 166d9953105SMichael Ellerman 16769ed3324SAnton Blanchard return 0; 168d9953105SMichael Ellerman } 1698e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 170d9953105SMichael Ellerman 17155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 17255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 17355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 17455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 17555fc0c56SAnton Blanchard 17655fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 17755fc0c56SAnton Blanchard { 17855fc0c56SAnton Blanchard switch (event_modifier) { 17955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 180b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1811b7e0cbeSliguang orderly_poweroff(true); 18255fc0c56SAnton Blanchard break; 18355fc0c56SAnton Blanchard 18455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 185b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 186b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 18755fc0c56SAnton Blanchard break; 18855fc0c56SAnton Blanchard 18955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 190b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 191b4af279aSVipin K Parashar " RTAS error log for details\n"); 1921b7e0cbeSliguang orderly_poweroff(true); 19355fc0c56SAnton Blanchard break; 19455fc0c56SAnton Blanchard 19555fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 196b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 197b4af279aSVipin K Parashar " error log for details\n"); 1981b7e0cbeSliguang orderly_poweroff(true); 19955fc0c56SAnton Blanchard break; 20055fc0c56SAnton Blanchard 20155fc0c56SAnton Blanchard default: 202b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 20355fc0c56SAnton Blanchard event_modifier); 20455fc0c56SAnton Blanchard } 20555fc0c56SAnton Blanchard } 20655fc0c56SAnton Blanchard 20755fc0c56SAnton Blanchard struct epow_errorlog { 20855fc0c56SAnton Blanchard unsigned char sensor_value; 20955fc0c56SAnton Blanchard unsigned char event_modifier; 21055fc0c56SAnton Blanchard unsigned char extended_modifier; 21155fc0c56SAnton Blanchard unsigned char reserved; 21255fc0c56SAnton Blanchard unsigned char platform_reason; 21355fc0c56SAnton Blanchard }; 21455fc0c56SAnton Blanchard 21555fc0c56SAnton Blanchard #define EPOW_RESET 0 21655fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 21755fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 21855fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 21955fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 22055fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 22155fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 22255fc0c56SAnton Blanchard 223e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 22455fc0c56SAnton Blanchard { 22555fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 22655fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 22755fc0c56SAnton Blanchard char action_code; 22855fc0c56SAnton Blanchard char modifier; 22955fc0c56SAnton Blanchard 23055fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 23155fc0c56SAnton Blanchard if (pseries_log == NULL) 23255fc0c56SAnton Blanchard return; 23355fc0c56SAnton Blanchard 23455fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 23555fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 23655fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 23755fc0c56SAnton Blanchard 23855fc0c56SAnton Blanchard switch (action_code) { 23955fc0c56SAnton Blanchard case EPOW_RESET: 240b4af279aSVipin K Parashar if (num_epow_events) { 241b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 242b4af279aSVipin K Parashar num_epow_events--; 243b4af279aSVipin K Parashar } 24455fc0c56SAnton Blanchard break; 24555fc0c56SAnton Blanchard 24655fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 247b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 248b4af279aSVipin K Parashar " log for details\n"); 24955fc0c56SAnton Blanchard break; 25055fc0c56SAnton Blanchard 25155fc0c56SAnton Blanchard case EPOW_WARN_POWER: 252b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 253b4af279aSVipin K Parashar " log for details\n"); 25455fc0c56SAnton Blanchard break; 25555fc0c56SAnton Blanchard 25655fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 257d273fa91SYueHaibing handle_system_shutdown(modifier); 25855fc0c56SAnton Blanchard break; 25955fc0c56SAnton Blanchard 26055fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 261b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 262b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2631b7e0cbeSliguang orderly_poweroff(true); 26455fc0c56SAnton Blanchard break; 26555fc0c56SAnton Blanchard 26655fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 26755fc0c56SAnton Blanchard case EPOW_POWER_OFF: 268b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 269b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 27055fc0c56SAnton Blanchard emergency_sync(); 27155fc0c56SAnton Blanchard kernel_power_off(); 27255fc0c56SAnton Blanchard break; 27355fc0c56SAnton Blanchard 27455fc0c56SAnton Blanchard default: 275b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 27655fc0c56SAnton Blanchard action_code); 27755fc0c56SAnton Blanchard } 278b4af279aSVipin K Parashar 279b4af279aSVipin K Parashar /* Increment epow events counter variable */ 280b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 281b4af279aSVipin K Parashar num_epow_events++; 28255fc0c56SAnton Blanchard } 28355fc0c56SAnton Blanchard 284b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 285b7d9eb39SJohn Allen { 286b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 287b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 288b7d9eb39SJohn Allen 289b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 290b7d9eb39SJohn Allen 291b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 292b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 293b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 294b7d9eb39SJohn Allen rtas_get_error_log_max()); 295b7d9eb39SJohn Allen 296b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 297b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 298b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 299b7d9eb39SJohn Allen 300b7d9eb39SJohn Allen /* 301b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 302b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 303b7d9eb39SJohn Allen */ 304b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 3054c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || 3064c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) 307fd12527aSNathan Fontenot queue_hotplug_event(hp_elog); 308b7d9eb39SJohn Allen else 309b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 310b7d9eb39SJohn Allen 311b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 312b7d9eb39SJohn Allen return IRQ_HANDLED; 313b7d9eb39SJohn Allen } 314b7d9eb39SJohn Allen 31555fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3167d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 317d9953105SMichael Ellerman { 31855fc0c56SAnton Blanchard int state; 319d9953105SMichael Ellerman int critical; 320d9953105SMichael Ellerman 321aa23ea0cSCédric Le Goater rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); 322d9953105SMichael Ellerman 323d9953105SMichael Ellerman if (state > 3) 324d9953105SMichael Ellerman critical = 1; /* Time Critical */ 325d9953105SMichael Ellerman else 326d9953105SMichael Ellerman critical = 0; 327d9953105SMichael Ellerman 328d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 329d9953105SMichael Ellerman 330aa23ea0cSCédric Le Goater rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, 331aa23ea0cSCédric Le Goater virq_to_hw(irq), RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf), 332d9953105SMichael Ellerman rtas_get_error_log_max()); 333d9953105SMichael Ellerman 334d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 335d9953105SMichael Ellerman 33655fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 33755fc0c56SAnton Blanchard 338d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 339d9953105SMichael Ellerman return IRQ_HANDLED; 340d9953105SMichael Ellerman } 341d9953105SMichael Ellerman 342d9953105SMichael Ellerman /* 343d9953105SMichael Ellerman * Handle hardware error interrupts. 344d9953105SMichael Ellerman * 345d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 346d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 347d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 348d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 349d9953105SMichael Ellerman */ 3507d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 351d9953105SMichael Ellerman { 352d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 353cc8b5263SAnton Blanchard int status; 354d9953105SMichael Ellerman int fatal; 355d9953105SMichael Ellerman 356d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 357d9953105SMichael Ellerman 358d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 359b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 360476eb491SGrant Likely virq_to_hw(irq), 361d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 362d9953105SMichael Ellerman __pa(&ras_log_buf), 363d9953105SMichael Ellerman rtas_get_error_log_max()); 364d9953105SMichael Ellerman 365d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 366d9953105SMichael Ellerman 367a08a53eaSGreg Kurz if (status == 0 && 368a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 369d9953105SMichael Ellerman fatal = 1; 370d9953105SMichael Ellerman else 371d9953105SMichael Ellerman fatal = 0; 372d9953105SMichael Ellerman 373d9953105SMichael Ellerman /* format and print the extended information */ 374d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 375d9953105SMichael Ellerman 376d9953105SMichael Ellerman if (fatal) { 377b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 378b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 379cc8b5263SAnton Blanchard emergency_sync(); 380cc8b5263SAnton Blanchard kernel_power_off(); 381d9953105SMichael Ellerman } else { 382b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 383d9953105SMichael Ellerman } 384d9953105SMichael Ellerman 385d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 386d9953105SMichael Ellerman return IRQ_HANDLED; 387d9953105SMichael Ellerman } 388d9953105SMichael Ellerman 389d368514cSAnton Blanchard /* 390d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 391d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 392deb70f7aSNicholas Piggin * Minimum size of the buffer is 16 bytes. 393d368514cSAnton Blanchard */ 394d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 395deb70f7aSNicholas Piggin ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \ 396deb70f7aSNicholas Piggin (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16)))) 397d368514cSAnton Blanchard 39894675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 39994675cceSMahesh Salgaonkar { 40094675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 40194675cceSMahesh Salgaonkar } 40294675cceSMahesh Salgaonkar 403d7b14c5cSNicholas Piggin static __be64 *fwnmi_get_savep(struct pt_regs *regs) 404d7b14c5cSNicholas Piggin { 405d7b14c5cSNicholas Piggin unsigned long savep_ra; 406d7b14c5cSNicholas Piggin 407d7b14c5cSNicholas Piggin /* Mask top two bits */ 408d7b14c5cSNicholas Piggin savep_ra = regs->gpr[3] & ~(0x3UL << 62); 409d7b14c5cSNicholas Piggin if (!VALID_FWNMI_BUFFER(savep_ra)) { 410d7b14c5cSNicholas Piggin printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 411d7b14c5cSNicholas Piggin return NULL; 412d7b14c5cSNicholas Piggin } 413d7b14c5cSNicholas Piggin 414d7b14c5cSNicholas Piggin return __va(savep_ra); 415d7b14c5cSNicholas Piggin } 416d7b14c5cSNicholas Piggin 417d368514cSAnton Blanchard /* 418d368514cSAnton Blanchard * Get the error information for errors coming through the 419d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 420d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 421d9953105SMichael Ellerman * will be returned if found. 422d9953105SMichael Ellerman * 42394675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 424d368514cSAnton Blanchard * 42594675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 426d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 427d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 428d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 429d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 430d9953105SMichael Ellerman * second machine check did come in. 431d9953105SMichael Ellerman */ 432d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 433d9953105SMichael Ellerman { 43494675cceSMahesh Salgaonkar struct rtas_error_log *h; 435d7b14c5cSNicholas Piggin __be64 *savep; 436d9953105SMichael Ellerman 437d7b14c5cSNicholas Piggin savep = fwnmi_get_savep(regs); 438d7b14c5cSNicholas Piggin if (!savep) 439d368514cSAnton Blanchard return NULL; 440d368514cSAnton Blanchard 441cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 442d368514cSAnton Blanchard 443d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 44494675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 44594675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 446a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 44794675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 448d368514cSAnton Blanchard } else { 449a08a53eaSGreg Kurz int len, error_log_length; 450d368514cSAnton Blanchard 451a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 45274e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 45394675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 454d368514cSAnton Blanchard } 455d368514cSAnton Blanchard 45694675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 457d9953105SMichael Ellerman } 458d9953105SMichael Ellerman 459d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 460d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 461d9953105SMichael Ellerman * partition to receive FWNMI errors. 462d9953105SMichael Ellerman */ 463d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 464d9953105SMichael Ellerman { 4652576f5f9SNicholas Piggin struct rtas_args rtas_args; 4662576f5f9SNicholas Piggin int ret; 4672576f5f9SNicholas Piggin 4682576f5f9SNicholas Piggin /* 4692576f5f9SNicholas Piggin * On pseries, the machine check stack is limited to under 4GB, so 4702576f5f9SNicholas Piggin * args can be on-stack. 4712576f5f9SNicholas Piggin */ 4722576f5f9SNicholas Piggin rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL); 4732576f5f9SNicholas Piggin ret = be32_to_cpu(rtas_args.rets[0]); 474d9953105SMichael Ellerman if (ret != 0) 475d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 476d9953105SMichael Ellerman } 477d9953105SMichael Ellerman 478c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 479d9953105SMichael Ellerman { 480bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 481bded0706SNicholas Piggin /* 482bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 483bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 484bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 485bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 486bded0706SNicholas Piggin */ 487bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 488bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 489bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 490bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 491bded0706SNicholas Piggin regs->msr = 0; 492bded0706SNicholas Piggin } 493bded0706SNicholas Piggin #endif 494bded0706SNicholas Piggin 495d9953105SMichael Ellerman if (fwnmi_active) { 496d7b14c5cSNicholas Piggin __be64 *savep; 497d7b14c5cSNicholas Piggin 498d7b14c5cSNicholas Piggin /* 499d7b14c5cSNicholas Piggin * Firmware (PowerVM and KVM) saves r3 to a save area like 500d7b14c5cSNicholas Piggin * machine check, which is not exactly what PAPR (2.9) 501d7b14c5cSNicholas Piggin * suggests but there is no way to detect otherwise, so this 502d7b14c5cSNicholas Piggin * is the interface now. 503d7b14c5cSNicholas Piggin * 504d7b14c5cSNicholas Piggin * System resets do not save any error log or require an 505d7b14c5cSNicholas Piggin * "ibm,nmi-interlock" rtas call to release. 506d7b14c5cSNicholas Piggin */ 507d7b14c5cSNicholas Piggin 508d7b14c5cSNicholas Piggin savep = fwnmi_get_savep(regs); 509d7b14c5cSNicholas Piggin if (savep) 510d7b14c5cSNicholas Piggin regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 511d9953105SMichael Ellerman } 512102c05e8SNicholas Piggin 513102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 514102c05e8SNicholas Piggin return 1; 515102c05e8SNicholas Piggin 516c902be71SArnd Bergmann return 0; /* need to perform reset */ 517d9953105SMichael Ellerman } 518d9953105SMichael Ellerman 5194ff753feSGanesh Goudar static int mce_handle_err_realmode(int disposition, u8 error_type) 5204ff753feSGanesh Goudar { 5214ff753feSGanesh Goudar #ifdef CONFIG_PPC_BOOK3S_64 5224ff753feSGanesh Goudar if (disposition == RTAS_DISP_NOT_RECOVERED) { 5234ff753feSGanesh Goudar switch (error_type) { 5244ff753feSGanesh Goudar case MC_ERROR_TYPE_ERAT: 52582f70a05SNicholas Piggin flush_erat(); 52682f70a05SNicholas Piggin disposition = RTAS_DISP_FULLY_RECOVERED; 52782f70a05SNicholas Piggin break; 52882f70a05SNicholas Piggin case MC_ERROR_TYPE_SLB: 5294ff753feSGanesh Goudar /* 5304ff753feSGanesh Goudar * Store the old slb content in paca before flushing. 5314ff753feSGanesh Goudar * Print this when we go to virtual mode. 5324ff753feSGanesh Goudar * There are chances that we may hit MCE again if there 5334ff753feSGanesh Goudar * is a parity error on the SLB entry we trying to read 5344ff753feSGanesh Goudar * for saving. Hence limit the slb saving to single 5354ff753feSGanesh Goudar * level of recursion. 5364ff753feSGanesh Goudar */ 5374ff753feSGanesh Goudar if (local_paca->in_mce == 1) 5384ff753feSGanesh Goudar slb_save_contents(local_paca->mce_faulty_slbs); 5394ff753feSGanesh Goudar flush_and_reload_slb(); 5404ff753feSGanesh Goudar disposition = RTAS_DISP_FULLY_RECOVERED; 5414ff753feSGanesh Goudar break; 5424ff753feSGanesh Goudar default: 5434ff753feSGanesh Goudar break; 5444ff753feSGanesh Goudar } 5454ff753feSGanesh Goudar } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 5464ff753feSGanesh Goudar /* Platform corrected itself but could be degraded */ 5474ff753feSGanesh Goudar pr_err("MCE: limited recovery, system may be degraded\n"); 5484ff753feSGanesh Goudar disposition = RTAS_DISP_FULLY_RECOVERED; 5494ff753feSGanesh Goudar } 5504ff753feSGanesh Goudar #endif 5514ff753feSGanesh Goudar return disposition; 5524ff753feSGanesh Goudar } 5538f0b8056SMahesh Salgaonkar 5544ff753feSGanesh Goudar static int mce_handle_err_virtmode(struct pt_regs *regs, 5554ff753feSGanesh Goudar struct rtas_error_log *errp, 5564ff753feSGanesh Goudar struct pseries_mc_errorlog *mce_log, 5574ff753feSGanesh Goudar int disposition) 5588f0b8056SMahesh Salgaonkar { 5599ca766f9SNicholas Piggin struct mce_error_info mce_err = { 0 }; 5609ca766f9SNicholas Piggin int initiator = rtas_error_initiator(errp); 5619ca766f9SNicholas Piggin int severity = rtas_error_severity(errp); 5624ff753feSGanesh Goudar unsigned long eaddr = 0, paddr = 0; 5638f0b8056SMahesh Salgaonkar u8 error_type, err_sub_type; 5648f0b8056SMahesh Salgaonkar 5654ff753feSGanesh Goudar if (!mce_log) 5664ff753feSGanesh Goudar goto out; 5674ff753feSGanesh Goudar 5684ff753feSGanesh Goudar error_type = mce_log->error_type; 5694ff753feSGanesh Goudar err_sub_type = rtas_mc_error_sub_type(mce_log); 5704ff753feSGanesh Goudar 5719ca766f9SNicholas Piggin if (initiator == RTAS_INITIATOR_UNKNOWN) 5729ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5739ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_CPU) 5749ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_CPU; 5759ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_PCI) 5769ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_PCI; 5779ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_ISA) 5789ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_ISA; 5799ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_MEMORY) 5809ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_MEMORY; 5819ca766f9SNicholas Piggin else if (initiator == RTAS_INITIATOR_POWERMGM) 5829ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_POWERMGM; 5839ca766f9SNicholas Piggin else 5849ca766f9SNicholas Piggin mce_err.initiator = MCE_INITIATOR_UNKNOWN; 5858f0b8056SMahesh Salgaonkar 5869ca766f9SNicholas Piggin if (severity == RTAS_SEVERITY_NO_ERROR) 5879ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_NO_ERROR; 5889ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_EVENT) 5899ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5909ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_WARNING) 5919ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_WARNING; 5929ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR_SYNC) 5939ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5949ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_ERROR) 5959ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_SEVERE; 5969ca766f9SNicholas Piggin else if (severity == RTAS_SEVERITY_FATAL) 5979ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 5989ca766f9SNicholas Piggin else 5999ca766f9SNicholas Piggin mce_err.severity = MCE_SEV_FATAL; 6008f0b8056SMahesh Salgaonkar 6019ca766f9SNicholas Piggin if (severity <= RTAS_SEVERITY_ERROR_SYNC) 6029ca766f9SNicholas Piggin mce_err.sync_error = true; 6039ca766f9SNicholas Piggin else 6049ca766f9SNicholas Piggin mce_err.sync_error = false; 6058f0b8056SMahesh Salgaonkar 6069ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 6079ca766f9SNicholas Piggin mce_err.error_class = MCE_ECLASS_UNKNOWN; 608a43c1590SMahesh Salgaonkar 6094ff753feSGanesh Goudar switch (error_type) { 6109ca766f9SNicholas Piggin case MC_ERROR_TYPE_UE: 6119ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UE; 612efbc4303SGanesh Goudar mce_common_process_ue(regs, &mce_err); 613efbc4303SGanesh Goudar if (mce_err.ignore_event) 614efbc4303SGanesh Goudar disposition = RTAS_DISP_FULLY_RECOVERED; 6159ca766f9SNicholas Piggin switch (err_sub_type) { 6169ca766f9SNicholas Piggin case MC_ERROR_UE_IFETCH: 6179ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; 6189ca766f9SNicholas Piggin break; 6199ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: 6209ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; 6219ca766f9SNicholas Piggin break; 6229ca766f9SNicholas Piggin case MC_ERROR_UE_LOAD_STORE: 6239ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; 6249ca766f9SNicholas Piggin break; 6259ca766f9SNicholas Piggin case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: 6269ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; 6279ca766f9SNicholas Piggin break; 6289ca766f9SNicholas Piggin case MC_ERROR_UE_INDETERMINATE: 6299ca766f9SNicholas Piggin default: 6309ca766f9SNicholas Piggin mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; 6319ca766f9SNicholas Piggin break; 6329ca766f9SNicholas Piggin } 6339ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) 6349ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6359ca766f9SNicholas Piggin 6369ca766f9SNicholas Piggin if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { 6379ca766f9SNicholas Piggin paddr = be64_to_cpu(mce_log->logical_address); 6389ca766f9SNicholas Piggin } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { 6399ca766f9SNicholas Piggin unsigned long pfn; 6409ca766f9SNicholas Piggin 6419ca766f9SNicholas Piggin pfn = addr_to_pfn(regs, eaddr); 6429ca766f9SNicholas Piggin if (pfn != ULONG_MAX) 6439ca766f9SNicholas Piggin paddr = pfn << PAGE_SHIFT; 6449ca766f9SNicholas Piggin } 6459ca766f9SNicholas Piggin 6469ca766f9SNicholas Piggin break; 6479ca766f9SNicholas Piggin case MC_ERROR_TYPE_SLB: 6489ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_SLB; 6499ca766f9SNicholas Piggin switch (err_sub_type) { 6509ca766f9SNicholas Piggin case MC_ERROR_SLB_PARITY: 6519ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; 6529ca766f9SNicholas Piggin break; 6539ca766f9SNicholas Piggin case MC_ERROR_SLB_MULTIHIT: 6549ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; 6559ca766f9SNicholas Piggin break; 6569ca766f9SNicholas Piggin case MC_ERROR_SLB_INDETERMINATE: 6579ca766f9SNicholas Piggin default: 6589ca766f9SNicholas Piggin mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; 6599ca766f9SNicholas Piggin break; 6609ca766f9SNicholas Piggin } 6619ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6629ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6639ca766f9SNicholas Piggin break; 6649ca766f9SNicholas Piggin case MC_ERROR_TYPE_ERAT: 6659ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_ERAT; 6669ca766f9SNicholas Piggin switch (err_sub_type) { 6679ca766f9SNicholas Piggin case MC_ERROR_ERAT_PARITY: 6689ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; 6699ca766f9SNicholas Piggin break; 6709ca766f9SNicholas Piggin case MC_ERROR_ERAT_MULTIHIT: 6719ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 6729ca766f9SNicholas Piggin break; 6739ca766f9SNicholas Piggin case MC_ERROR_ERAT_INDETERMINATE: 6749ca766f9SNicholas Piggin default: 6759ca766f9SNicholas Piggin mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; 6769ca766f9SNicholas Piggin break; 6779ca766f9SNicholas Piggin } 6789ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6799ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6809ca766f9SNicholas Piggin break; 6819ca766f9SNicholas Piggin case MC_ERROR_TYPE_TLB: 6829ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_TLB; 6839ca766f9SNicholas Piggin switch (err_sub_type) { 6849ca766f9SNicholas Piggin case MC_ERROR_TLB_PARITY: 6859ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; 6869ca766f9SNicholas Piggin break; 6879ca766f9SNicholas Piggin case MC_ERROR_TLB_MULTIHIT: 6889ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; 6899ca766f9SNicholas Piggin break; 6909ca766f9SNicholas Piggin case MC_ERROR_TLB_INDETERMINATE: 6919ca766f9SNicholas Piggin default: 6929ca766f9SNicholas Piggin mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; 6939ca766f9SNicholas Piggin break; 6949ca766f9SNicholas Piggin } 6959ca766f9SNicholas Piggin if (mce_log->sub_err_type & 0x80) 6969ca766f9SNicholas Piggin eaddr = be64_to_cpu(mce_log->effective_address); 6979ca766f9SNicholas Piggin break; 6989ca766f9SNicholas Piggin case MC_ERROR_TYPE_D_CACHE: 6999ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_DCACHE; 7009ca766f9SNicholas Piggin break; 7019ca766f9SNicholas Piggin case MC_ERROR_TYPE_I_CACHE: 702*864ec4d4SGanesh Goudar mce_err.error_type = MCE_ERROR_TYPE_ICACHE; 7039ca766f9SNicholas Piggin break; 7049ca766f9SNicholas Piggin case MC_ERROR_TYPE_UNKNOWN: 7059ca766f9SNicholas Piggin default: 7069ca766f9SNicholas Piggin mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; 7079ca766f9SNicholas Piggin break; 7089ca766f9SNicholas Piggin } 709a43c1590SMahesh Salgaonkar out: 7104ff753feSGanesh Goudar save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, 7114ff753feSGanesh Goudar &mce_err, regs->nip, eaddr, paddr); 7124ff753feSGanesh Goudar return disposition; 7134ff753feSGanesh Goudar } 7144ff753feSGanesh Goudar 7154ff753feSGanesh Goudar static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) 7164ff753feSGanesh Goudar { 7174ff753feSGanesh Goudar struct pseries_errorlog *pseries_log; 7184ff753feSGanesh Goudar struct pseries_mc_errorlog *mce_log = NULL; 7194ff753feSGanesh Goudar int disposition = rtas_error_disposition(errp); 72074c3354bSNicholas Piggin unsigned long msr; 7214ff753feSGanesh Goudar u8 error_type; 7224ff753feSGanesh Goudar 7234ff753feSGanesh Goudar if (!rtas_error_extended(errp)) 7244ff753feSGanesh Goudar goto out; 7254ff753feSGanesh Goudar 7264ff753feSGanesh Goudar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 7274ff753feSGanesh Goudar if (!pseries_log) 7284ff753feSGanesh Goudar goto out; 7294ff753feSGanesh Goudar 7304ff753feSGanesh Goudar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 7314ff753feSGanesh Goudar error_type = mce_log->error_type; 7324ff753feSGanesh Goudar 7334ff753feSGanesh Goudar disposition = mce_handle_err_realmode(disposition, error_type); 7344ff753feSGanesh Goudar 735a95a0a16SGanesh Goudar /* 736a95a0a16SGanesh Goudar * Enable translation as we will be accessing per-cpu variables 737a95a0a16SGanesh Goudar * in save_mce_event() which may fall outside RMO region, also 738a95a0a16SGanesh Goudar * leave it enabled because subsequently we will be queuing work 739a95a0a16SGanesh Goudar * to workqueues where again per-cpu variables accessed, besides 740a95a0a16SGanesh Goudar * fwnmi_release_errinfo() crashes when called in realmode on 741a95a0a16SGanesh Goudar * pseries. 742a95a0a16SGanesh Goudar * Note: All the realmode handling like flushing SLB entries for 743a95a0a16SGanesh Goudar * SLB multihit is done by now. 744a95a0a16SGanesh Goudar */ 7454ff753feSGanesh Goudar out: 74674c3354bSNicholas Piggin msr = mfmsr(); 74774c3354bSNicholas Piggin mtmsr(msr | MSR_IR | MSR_DR); 74874c3354bSNicholas Piggin 7494ff753feSGanesh Goudar disposition = mce_handle_err_virtmode(regs, errp, mce_log, 7504ff753feSGanesh Goudar disposition); 75174c3354bSNicholas Piggin 75274c3354bSNicholas Piggin /* 75374c3354bSNicholas Piggin * Queue irq work to log this rtas event later. 75474c3354bSNicholas Piggin * irq_work_queue uses per-cpu variables, so do this in virt 75574c3354bSNicholas Piggin * mode as well. 75674c3354bSNicholas Piggin */ 75774c3354bSNicholas Piggin irq_work_queue(&mce_errlog_process_work); 75874c3354bSNicholas Piggin 75974c3354bSNicholas Piggin mtmsr(msr); 76074c3354bSNicholas Piggin 761a43c1590SMahesh Salgaonkar return disposition; 762a43c1590SMahesh Salgaonkar } 763a43c1590SMahesh Salgaonkar 764d9953105SMichael Ellerman /* 76594675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 76694675cceSMahesh Salgaonkar */ 76794675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 76894675cceSMahesh Salgaonkar { 76994675cceSMahesh Salgaonkar struct rtas_error_log *err; 77094675cceSMahesh Salgaonkar 77194675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 77294675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 77394675cceSMahesh Salgaonkar } 77494675cceSMahesh Salgaonkar 77594675cceSMahesh Salgaonkar /* 776d9953105SMichael Ellerman * See if we can recover from a machine check exception. 777d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 778d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 779d9953105SMichael Ellerman * which provides the error analysis for us. 780d9953105SMichael Ellerman * 781d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 782d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 783d9953105SMichael Ellerman */ 7849ca766f9SNicholas Piggin static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) 785d9953105SMichael Ellerman { 786d47d1d8aSAnton Blanchard int recovered = 0; 7878f0b8056SMahesh Salgaonkar 788d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 789d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 7908f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 791d47d1d8aSAnton Blanchard recovered = 0; 7929ca766f9SNicholas Piggin } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { 793d9953105SMichael Ellerman /* Platform corrected itself */ 794d47d1d8aSAnton Blanchard recovered = 1; 7959ca766f9SNicholas Piggin } else if (evt->severity == MCE_SEV_FATAL) { 7969ca766f9SNicholas Piggin /* Fatal machine check */ 7979ca766f9SNicholas Piggin pr_err("Machine check interrupt is fatal\n"); 7989ca766f9SNicholas Piggin recovered = 0; 799d9953105SMichael Ellerman } 800d9953105SMichael Ellerman 8019ca766f9SNicholas Piggin if (!recovered && evt->sync_error) { 8029ca766f9SNicholas Piggin /* 8039ca766f9SNicholas Piggin * Try to kill processes if we get a synchronous machine check 8049ca766f9SNicholas Piggin * (e.g., one caused by execution of this instruction). This 8059ca766f9SNicholas Piggin * will devolve into a panic if we try to kill init or are in 8069ca766f9SNicholas Piggin * an interrupt etc. 8079ca766f9SNicholas Piggin * 8089ca766f9SNicholas Piggin * TODO: Queue up this address for hwpoisioning later. 8099ca766f9SNicholas Piggin * TODO: This is not quite right for d-side machine 8109ca766f9SNicholas Piggin * checks ->nip is not necessarily the important 8119ca766f9SNicholas Piggin * address. 8129ca766f9SNicholas Piggin */ 8139ca766f9SNicholas Piggin if ((user_mode(regs))) { 8149ca766f9SNicholas Piggin _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 8159ca766f9SNicholas Piggin recovered = 1; 8169ca766f9SNicholas Piggin } else if (die_will_crash()) { 8179ca766f9SNicholas Piggin /* 8189ca766f9SNicholas Piggin * die() would kill the kernel, so better to go via 8199ca766f9SNicholas Piggin * the platform reboot code that will log the 8209ca766f9SNicholas Piggin * machine check. 8219ca766f9SNicholas Piggin */ 8229ca766f9SNicholas Piggin recovered = 0; 8239ca766f9SNicholas Piggin } else { 824209e9d50SNicholas Piggin die_mce("Machine check", regs, SIGBUS); 8259ca766f9SNicholas Piggin recovered = 1; 8269ca766f9SNicholas Piggin } 8279ca766f9SNicholas Piggin } 828d9953105SMichael Ellerman 829d47d1d8aSAnton Blanchard return recovered; 830d9953105SMichael Ellerman } 831d9953105SMichael Ellerman 832d9953105SMichael Ellerman /* 833d9953105SMichael Ellerman * Handle a machine check. 834d9953105SMichael Ellerman * 835d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 836d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 837d9953105SMichael Ellerman * error was recovered (never true if RI=0). 838d9953105SMichael Ellerman * 839d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 840d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 841d9953105SMichael Ellerman */ 842d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 843d9953105SMichael Ellerman { 8449ca766f9SNicholas Piggin struct machine_check_event evt; 845d9953105SMichael Ellerman 8469ca766f9SNicholas Piggin if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 8479ca766f9SNicholas Piggin return 0; 8489ca766f9SNicholas Piggin 8499ca766f9SNicholas Piggin /* Print things out */ 8509ca766f9SNicholas Piggin if (evt.version != MCE_V1) { 8519ca766f9SNicholas Piggin pr_err("Machine Check Exception, Unknown event version %d !\n", 8529ca766f9SNicholas Piggin evt.version); 8539ca766f9SNicholas Piggin return 0; 854d9953105SMichael Ellerman } 8559ca766f9SNicholas Piggin machine_check_print_event_info(&evt, user_mode(regs), false); 8569ca766f9SNicholas Piggin 8579ca766f9SNicholas Piggin if (recover_mce(regs, &evt)) 8589ca766f9SNicholas Piggin return 1; 859d9953105SMichael Ellerman 860d9953105SMichael Ellerman return 0; 861d9953105SMichael Ellerman } 862a43c1590SMahesh Salgaonkar 863a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 864a43c1590SMahesh Salgaonkar { 865a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 866a43c1590SMahesh Salgaonkar int disposition; 867a43c1590SMahesh Salgaonkar 868a43c1590SMahesh Salgaonkar if (fwnmi_active) { 869a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 870a43c1590SMahesh Salgaonkar /* 871a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 872a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 873a43c1590SMahesh Salgaonkar * virtual mode. 874a43c1590SMahesh Salgaonkar */ 8759ca766f9SNicholas Piggin disposition = mce_handle_error(regs, errp); 8769ca766f9SNicholas Piggin 87774c3354bSNicholas Piggin fwnmi_release_errinfo(); 8789ca766f9SNicholas Piggin 879a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 880a43c1590SMahesh Salgaonkar return 1; 881a43c1590SMahesh Salgaonkar } 882a43c1590SMahesh Salgaonkar 883a43c1590SMahesh Salgaonkar return 0; 884a43c1590SMahesh Salgaonkar } 885