1*1a59d1b8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 2d9953105SMichael Ellerman /* 3d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 4d9953105SMichael Ellerman */ 5d9953105SMichael Ellerman 6d9953105SMichael Ellerman #include <linux/sched.h> 7d9953105SMichael Ellerman #include <linux/interrupt.h> 8d9953105SMichael Ellerman #include <linux/irq.h> 990128997SAnton Blanchard #include <linux/of.h> 1055fc0c56SAnton Blanchard #include <linux/fs.h> 1155fc0c56SAnton Blanchard #include <linux/reboot.h> 1294675cceSMahesh Salgaonkar #include <linux/irq_work.h> 13d9953105SMichael Ellerman 14d9953105SMichael Ellerman #include <asm/machdep.h> 15d9953105SMichael Ellerman #include <asm/rtas.h> 168c4f1f29SMichael Ellerman #include <asm/firmware.h> 17a43c1590SMahesh Salgaonkar #include <asm/mce.h> 18d9953105SMichael Ellerman 19577830b0SMichael Ellerman #include "pseries.h" 20c902be71SArnd Bergmann 21d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 22d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 23d9953105SMichael Ellerman 24d9953105SMichael Ellerman static int ras_check_exception_token; 25d9953105SMichael Ellerman 2694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 2794675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 2894675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 2994675cceSMahesh Salgaonkar }; 3094675cceSMahesh Salgaonkar 31d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 32d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 33d9953105SMichael Ellerman 34b4af279aSVipin K Parashar /* EPOW events counter variable */ 35b4af279aSVipin K Parashar static int num_epow_events; 36b4af279aSVipin K Parashar 37b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 387d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 397d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 40d9953105SMichael Ellerman 4104fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 4204fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 4304fce21cSMahesh Salgaonkar __be32 fru_id; 4404fce21cSMahesh Salgaonkar __be32 proc_id; 4504fce21cSMahesh Salgaonkar u8 error_type; 4604fce21cSMahesh Salgaonkar /* 4704fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 4804fce21cSMahesh Salgaonkar * 4904fce21cSMahesh Salgaonkar * MSB0 5004fce21cSMahesh Salgaonkar * | 5104fce21cSMahesh Salgaonkar * V 5204fce21cSMahesh Salgaonkar * 01234567 5304fce21cSMahesh Salgaonkar * XXXXXXXX 5404fce21cSMahesh Salgaonkar * 5504fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 5604fce21cSMahesh Salgaonkar * XXXXXXXX 5704fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 5804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 5904fce21cSMahesh Salgaonkar * X 1: Logical address provided. 6004fce21cSMahesh Salgaonkar * XX 2: Reserved. 6104fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 6204fce21cSMahesh Salgaonkar * 6304fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 6404fce21cSMahesh Salgaonkar * XXXXXXXX 6504fce21cSMahesh Salgaonkar * X 1: Effective address provided. 6604fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 6704fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 6804fce21cSMahesh Salgaonkar */ 6904fce21cSMahesh Salgaonkar u8 sub_err_type; 7004fce21cSMahesh Salgaonkar u8 reserved_1[6]; 7104fce21cSMahesh Salgaonkar __be64 effective_address; 7204fce21cSMahesh Salgaonkar __be64 logical_address; 7304fce21cSMahesh Salgaonkar } __packed; 7404fce21cSMahesh Salgaonkar 7504fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 7604fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 7704fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 7804fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 7904fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 8004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 8104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 8204fce21cSMahesh Salgaonkar 8304fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 8404fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 8504fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 8604fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 8704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 8804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 8904fce21cSMahesh Salgaonkar 9004fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 9104fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 9204fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 9304fce21cSMahesh Salgaonkar 9404fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 9504fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 9604fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 9704fce21cSMahesh Salgaonkar 9804fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 10104fce21cSMahesh Salgaonkar 10204fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 10304fce21cSMahesh Salgaonkar { 10404fce21cSMahesh Salgaonkar switch (mlog->error_type) { 10504fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 10604fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 10704fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 10804fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 10904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 11004fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 11104fce21cSMahesh Salgaonkar default: 11204fce21cSMahesh Salgaonkar return 0; 11304fce21cSMahesh Salgaonkar } 11404fce21cSMahesh Salgaonkar } 11504fce21cSMahesh Salgaonkar 11604fce21cSMahesh Salgaonkar static 11704fce21cSMahesh Salgaonkar inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) 11804fce21cSMahesh Salgaonkar { 11904fce21cSMahesh Salgaonkar __be64 addr = 0; 12004fce21cSMahesh Salgaonkar 12104fce21cSMahesh Salgaonkar switch (mlog->error_type) { 12204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 12304fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x40) 12404fce21cSMahesh Salgaonkar addr = mlog->effective_address; 12504fce21cSMahesh Salgaonkar break; 12604fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 12704fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 12804fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 12904fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x80) 13004fce21cSMahesh Salgaonkar addr = mlog->effective_address; 13104fce21cSMahesh Salgaonkar default: 13204fce21cSMahesh Salgaonkar break; 13304fce21cSMahesh Salgaonkar } 13404fce21cSMahesh Salgaonkar return be64_to_cpu(addr); 13504fce21cSMahesh Salgaonkar } 1360ebfff14SBenjamin Herrenschmidt 137d9953105SMichael Ellerman /* 138c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 139c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 140c9dccf1dSSam Bobroff * subsys stage. 141c9dccf1dSSam Bobroff */ 142c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 143c9dccf1dSSam Bobroff { 144c9dccf1dSSam Bobroff struct device_node *np; 145c9dccf1dSSam Bobroff 146c9dccf1dSSam Bobroff /* Hotplug Events */ 147c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 148c9dccf1dSSam Bobroff if (np != NULL) { 149c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 150c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 151c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 152c9dccf1dSSam Bobroff of_node_put(np); 153c9dccf1dSSam Bobroff } 154c9dccf1dSSam Bobroff 155c9dccf1dSSam Bobroff return 0; 156c9dccf1dSSam Bobroff } 157c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 158c9dccf1dSSam Bobroff 159c9dccf1dSSam Bobroff /* 160d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 161d9953105SMichael Ellerman * and power system events. 162d9953105SMichael Ellerman */ 163d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 164d9953105SMichael Ellerman { 165d9953105SMichael Ellerman struct device_node *np; 166d9953105SMichael Ellerman 167d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 168d9953105SMichael Ellerman 169d9953105SMichael Ellerman /* Internal Errors */ 170d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 171d9953105SMichael Ellerman if (np != NULL) { 17232c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 17332c96f77SMark Nelson "RAS_ERROR"); 174d9953105SMichael Ellerman of_node_put(np); 175d9953105SMichael Ellerman } 176d9953105SMichael Ellerman 177d9953105SMichael Ellerman /* EPOW Events */ 178d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 179d9953105SMichael Ellerman if (np != NULL) { 18032c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 181d9953105SMichael Ellerman of_node_put(np); 182d9953105SMichael Ellerman } 183d9953105SMichael Ellerman 18469ed3324SAnton Blanchard return 0; 185d9953105SMichael Ellerman } 1868e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 187d9953105SMichael Ellerman 18855fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 18955fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 19055fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 19155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 19255fc0c56SAnton Blanchard 19355fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 19455fc0c56SAnton Blanchard { 19555fc0c56SAnton Blanchard switch (event_modifier) { 19655fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 197b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1981b7e0cbeSliguang orderly_poweroff(true); 19955fc0c56SAnton Blanchard break; 20055fc0c56SAnton Blanchard 20155fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 202b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 203b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 20479872e35SAnshuman Khandual orderly_poweroff(true); 20555fc0c56SAnton Blanchard break; 20655fc0c56SAnton Blanchard 20755fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 208b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 209b4af279aSVipin K Parashar " RTAS error log for details\n"); 2101b7e0cbeSliguang orderly_poweroff(true); 21155fc0c56SAnton Blanchard break; 21255fc0c56SAnton Blanchard 21355fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 214b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 215b4af279aSVipin K Parashar " error log for details\n"); 2161b7e0cbeSliguang orderly_poweroff(true); 21755fc0c56SAnton Blanchard break; 21855fc0c56SAnton Blanchard 21955fc0c56SAnton Blanchard default: 220b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 22155fc0c56SAnton Blanchard event_modifier); 22255fc0c56SAnton Blanchard } 22355fc0c56SAnton Blanchard } 22455fc0c56SAnton Blanchard 22555fc0c56SAnton Blanchard struct epow_errorlog { 22655fc0c56SAnton Blanchard unsigned char sensor_value; 22755fc0c56SAnton Blanchard unsigned char event_modifier; 22855fc0c56SAnton Blanchard unsigned char extended_modifier; 22955fc0c56SAnton Blanchard unsigned char reserved; 23055fc0c56SAnton Blanchard unsigned char platform_reason; 23155fc0c56SAnton Blanchard }; 23255fc0c56SAnton Blanchard 23355fc0c56SAnton Blanchard #define EPOW_RESET 0 23455fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 23555fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 23655fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 23755fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 23855fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 23955fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 24055fc0c56SAnton Blanchard 241e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 24255fc0c56SAnton Blanchard { 24355fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 24455fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 24555fc0c56SAnton Blanchard char action_code; 24655fc0c56SAnton Blanchard char modifier; 24755fc0c56SAnton Blanchard 24855fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 24955fc0c56SAnton Blanchard if (pseries_log == NULL) 25055fc0c56SAnton Blanchard return; 25155fc0c56SAnton Blanchard 25255fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 25355fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 25455fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 25555fc0c56SAnton Blanchard 25655fc0c56SAnton Blanchard switch (action_code) { 25755fc0c56SAnton Blanchard case EPOW_RESET: 258b4af279aSVipin K Parashar if (num_epow_events) { 259b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 260b4af279aSVipin K Parashar num_epow_events--; 261b4af279aSVipin K Parashar } 26255fc0c56SAnton Blanchard break; 26355fc0c56SAnton Blanchard 26455fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 265b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 266b4af279aSVipin K Parashar " log for details\n"); 26755fc0c56SAnton Blanchard break; 26855fc0c56SAnton Blanchard 26955fc0c56SAnton Blanchard case EPOW_WARN_POWER: 270b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 271b4af279aSVipin K Parashar " log for details\n"); 27255fc0c56SAnton Blanchard break; 27355fc0c56SAnton Blanchard 27455fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 27555fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 27655fc0c56SAnton Blanchard break; 27755fc0c56SAnton Blanchard 27855fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 279b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 280b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2811b7e0cbeSliguang orderly_poweroff(true); 28255fc0c56SAnton Blanchard break; 28355fc0c56SAnton Blanchard 28455fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 28555fc0c56SAnton Blanchard case EPOW_POWER_OFF: 286b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 287b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 28855fc0c56SAnton Blanchard emergency_sync(); 28955fc0c56SAnton Blanchard kernel_power_off(); 29055fc0c56SAnton Blanchard break; 29155fc0c56SAnton Blanchard 29255fc0c56SAnton Blanchard default: 293b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 29455fc0c56SAnton Blanchard action_code); 29555fc0c56SAnton Blanchard } 296b4af279aSVipin K Parashar 297b4af279aSVipin K Parashar /* Increment epow events counter variable */ 298b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 299b4af279aSVipin K Parashar num_epow_events++; 30055fc0c56SAnton Blanchard } 30155fc0c56SAnton Blanchard 302b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 303b7d9eb39SJohn Allen { 304b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 305b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 306b7d9eb39SJohn Allen 307b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 308b7d9eb39SJohn Allen 309b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 310b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 311b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 312b7d9eb39SJohn Allen rtas_get_error_log_max()); 313b7d9eb39SJohn Allen 314b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 315b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 316b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 317b7d9eb39SJohn Allen 318b7d9eb39SJohn Allen /* 319b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 320b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 321b7d9eb39SJohn Allen */ 322b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 3234c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || 3244c5d87dbSOliver O'Halloran hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) 325fd12527aSNathan Fontenot queue_hotplug_event(hp_elog); 326b7d9eb39SJohn Allen else 327b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 328b7d9eb39SJohn Allen 329b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 330b7d9eb39SJohn Allen return IRQ_HANDLED; 331b7d9eb39SJohn Allen } 332b7d9eb39SJohn Allen 33355fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3347d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 335d9953105SMichael Ellerman { 33655fc0c56SAnton Blanchard int status; 33755fc0c56SAnton Blanchard int state; 338d9953105SMichael Ellerman int critical; 339d9953105SMichael Ellerman 3401c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3411c2cb594SThomas Huth &state); 342d9953105SMichael Ellerman 343d9953105SMichael Ellerman if (state > 3) 344d9953105SMichael Ellerman critical = 1; /* Time Critical */ 345d9953105SMichael Ellerman else 346d9953105SMichael Ellerman critical = 0; 347d9953105SMichael Ellerman 348d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 349d9953105SMichael Ellerman 350d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 351b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 352476eb491SGrant Likely virq_to_hw(irq), 3536f43747fSAnton Blanchard RTAS_EPOW_WARNING, 354d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 355d9953105SMichael Ellerman rtas_get_error_log_max()); 356d9953105SMichael Ellerman 357d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 358d9953105SMichael Ellerman 35955fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 36055fc0c56SAnton Blanchard 361d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 362d9953105SMichael Ellerman return IRQ_HANDLED; 363d9953105SMichael Ellerman } 364d9953105SMichael Ellerman 365d9953105SMichael Ellerman /* 366d9953105SMichael Ellerman * Handle hardware error interrupts. 367d9953105SMichael Ellerman * 368d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 369d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 370d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 371d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 372d9953105SMichael Ellerman */ 3737d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 374d9953105SMichael Ellerman { 375d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 376cc8b5263SAnton Blanchard int status; 377d9953105SMichael Ellerman int fatal; 378d9953105SMichael Ellerman 379d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 380d9953105SMichael Ellerman 381d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 382b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 383476eb491SGrant Likely virq_to_hw(irq), 384d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 385d9953105SMichael Ellerman __pa(&ras_log_buf), 386d9953105SMichael Ellerman rtas_get_error_log_max()); 387d9953105SMichael Ellerman 388d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 389d9953105SMichael Ellerman 390a08a53eaSGreg Kurz if (status == 0 && 391a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 392d9953105SMichael Ellerman fatal = 1; 393d9953105SMichael Ellerman else 394d9953105SMichael Ellerman fatal = 0; 395d9953105SMichael Ellerman 396d9953105SMichael Ellerman /* format and print the extended information */ 397d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 398d9953105SMichael Ellerman 399d9953105SMichael Ellerman if (fatal) { 400b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 401b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 402cc8b5263SAnton Blanchard emergency_sync(); 403cc8b5263SAnton Blanchard kernel_power_off(); 404d9953105SMichael Ellerman } else { 405b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 406d9953105SMichael Ellerman } 407d9953105SMichael Ellerman 408d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 409d9953105SMichael Ellerman return IRQ_HANDLED; 410d9953105SMichael Ellerman } 411d9953105SMichael Ellerman 412d368514cSAnton Blanchard /* 413d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 414d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 415d368514cSAnton Blanchard */ 416d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 417d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 418d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 419d368514cSAnton Blanchard 42094675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 42194675cceSMahesh Salgaonkar { 42294675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 42394675cceSMahesh Salgaonkar } 42494675cceSMahesh Salgaonkar 425d368514cSAnton Blanchard /* 426d368514cSAnton Blanchard * Get the error information for errors coming through the 427d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 428d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 429d9953105SMichael Ellerman * will be returned if found. 430d9953105SMichael Ellerman * 43194675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 432d368514cSAnton Blanchard * 43394675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 434d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 435d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 436d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 437d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 438d9953105SMichael Ellerman * second machine check did come in. 439d9953105SMichael Ellerman */ 440d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 441d9953105SMichael Ellerman { 442d9953105SMichael Ellerman unsigned long *savep; 44394675cceSMahesh Salgaonkar struct rtas_error_log *h; 444d9953105SMichael Ellerman 445ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 446ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 447ee1dd1e3SMahesh Salgaonkar 448d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 449f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 450d368514cSAnton Blanchard return NULL; 451d9953105SMichael Ellerman } 452d368514cSAnton Blanchard 453d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 454cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 455d368514cSAnton Blanchard 456d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 45794675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 45894675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 459a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 46094675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 461d368514cSAnton Blanchard } else { 462a08a53eaSGreg Kurz int len, error_log_length; 463d368514cSAnton Blanchard 464a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 46574e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 46694675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 467d368514cSAnton Blanchard } 468d368514cSAnton Blanchard 46994675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 470d9953105SMichael Ellerman } 471d9953105SMichael Ellerman 472d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 473d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 474d9953105SMichael Ellerman * partition to receive FWNMI errors. 475d9953105SMichael Ellerman */ 476d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 477d9953105SMichael Ellerman { 478d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 479d9953105SMichael Ellerman if (ret != 0) 480d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 481d9953105SMichael Ellerman } 482d9953105SMichael Ellerman 483c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 484d9953105SMichael Ellerman { 485bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 486bded0706SNicholas Piggin /* 487bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 488bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 489bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 490bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 491bded0706SNicholas Piggin */ 492bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 493bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 494bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 495bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 496bded0706SNicholas Piggin regs->msr = 0; 497bded0706SNicholas Piggin } 498bded0706SNicholas Piggin #endif 499bded0706SNicholas Piggin 500d9953105SMichael Ellerman if (fwnmi_active) { 501d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 502d9953105SMichael Ellerman if (errhdr) { 503d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 504d9953105SMichael Ellerman } 505d9953105SMichael Ellerman fwnmi_release_errinfo(); 506d9953105SMichael Ellerman } 507102c05e8SNicholas Piggin 508102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 509102c05e8SNicholas Piggin return 1; 510102c05e8SNicholas Piggin 511c902be71SArnd Bergmann return 0; /* need to perform reset */ 512d9953105SMichael Ellerman } 513d9953105SMichael Ellerman 5148f0b8056SMahesh Salgaonkar #define VAL_TO_STRING(ar, val) \ 5158f0b8056SMahesh Salgaonkar (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown") 5168f0b8056SMahesh Salgaonkar 5178f0b8056SMahesh Salgaonkar static void pseries_print_mce_info(struct pt_regs *regs, 5188f0b8056SMahesh Salgaonkar struct rtas_error_log *errp) 5198f0b8056SMahesh Salgaonkar { 5208f0b8056SMahesh Salgaonkar const char *level, *sevstr; 5218f0b8056SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 5228f0b8056SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 5238f0b8056SMahesh Salgaonkar u8 error_type, err_sub_type; 5248f0b8056SMahesh Salgaonkar u64 addr; 5258f0b8056SMahesh Salgaonkar u8 initiator = rtas_error_initiator(errp); 5268f0b8056SMahesh Salgaonkar int disposition = rtas_error_disposition(errp); 5278f0b8056SMahesh Salgaonkar 5288f0b8056SMahesh Salgaonkar static const char * const initiators[] = { 529c9d8dda4SMahesh Salgaonkar [0] = "Unknown", 530c9d8dda4SMahesh Salgaonkar [1] = "CPU", 531c9d8dda4SMahesh Salgaonkar [2] = "PCI", 532c9d8dda4SMahesh Salgaonkar [3] = "ISA", 533c9d8dda4SMahesh Salgaonkar [4] = "Memory", 534c9d8dda4SMahesh Salgaonkar [5] = "Power Mgmt", 5358f0b8056SMahesh Salgaonkar }; 5368f0b8056SMahesh Salgaonkar static const char * const mc_err_types[] = { 537c9d8dda4SMahesh Salgaonkar [0] = "UE", 538c9d8dda4SMahesh Salgaonkar [1] = "SLB", 539c9d8dda4SMahesh Salgaonkar [2] = "ERAT", 540c9d8dda4SMahesh Salgaonkar [3] = "Unknown", 541c9d8dda4SMahesh Salgaonkar [4] = "TLB", 542c9d8dda4SMahesh Salgaonkar [5] = "D-Cache", 543c9d8dda4SMahesh Salgaonkar [6] = "Unknown", 544c9d8dda4SMahesh Salgaonkar [7] = "I-Cache", 5458f0b8056SMahesh Salgaonkar }; 5468f0b8056SMahesh Salgaonkar static const char * const mc_ue_types[] = { 547c9d8dda4SMahesh Salgaonkar [0] = "Indeterminate", 548c9d8dda4SMahesh Salgaonkar [1] = "Instruction fetch", 549c9d8dda4SMahesh Salgaonkar [2] = "Page table walk ifetch", 550c9d8dda4SMahesh Salgaonkar [3] = "Load/Store", 551c9d8dda4SMahesh Salgaonkar [4] = "Page table walk Load/Store", 5528f0b8056SMahesh Salgaonkar }; 5538f0b8056SMahesh Salgaonkar 5548f0b8056SMahesh Salgaonkar /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ 5558f0b8056SMahesh Salgaonkar static const char * const mc_slb_types[] = { 556c9d8dda4SMahesh Salgaonkar [0] = "Parity", 557c9d8dda4SMahesh Salgaonkar [1] = "Multihit", 558c9d8dda4SMahesh Salgaonkar [2] = "Indeterminate", 5598f0b8056SMahesh Salgaonkar }; 5608f0b8056SMahesh Salgaonkar 5618f0b8056SMahesh Salgaonkar /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ 5628f0b8056SMahesh Salgaonkar static const char * const mc_soft_types[] = { 563c9d8dda4SMahesh Salgaonkar [0] = "Unknown", 564c9d8dda4SMahesh Salgaonkar [1] = "Parity", 565c9d8dda4SMahesh Salgaonkar [2] = "Multihit", 566c9d8dda4SMahesh Salgaonkar [3] = "Indeterminate", 5678f0b8056SMahesh Salgaonkar }; 5688f0b8056SMahesh Salgaonkar 5698f0b8056SMahesh Salgaonkar if (!rtas_error_extended(errp)) { 5708f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt: Missing extended error log\n"); 5718f0b8056SMahesh Salgaonkar return; 5728f0b8056SMahesh Salgaonkar } 5738f0b8056SMahesh Salgaonkar 5748f0b8056SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 5758f0b8056SMahesh Salgaonkar if (pseries_log == NULL) 5768f0b8056SMahesh Salgaonkar return; 5778f0b8056SMahesh Salgaonkar 5788f0b8056SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 5798f0b8056SMahesh Salgaonkar 5808f0b8056SMahesh Salgaonkar error_type = mce_log->error_type; 5818f0b8056SMahesh Salgaonkar err_sub_type = rtas_mc_error_sub_type(mce_log); 5828f0b8056SMahesh Salgaonkar 5838f0b8056SMahesh Salgaonkar switch (rtas_error_severity(errp)) { 5848f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_NO_ERROR: 5858f0b8056SMahesh Salgaonkar level = KERN_INFO; 5868f0b8056SMahesh Salgaonkar sevstr = "Harmless"; 5878f0b8056SMahesh Salgaonkar break; 5888f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_WARNING: 5898f0b8056SMahesh Salgaonkar level = KERN_WARNING; 5908f0b8056SMahesh Salgaonkar sevstr = ""; 5918f0b8056SMahesh Salgaonkar break; 5928f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_ERROR: 5938f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_ERROR_SYNC: 5948f0b8056SMahesh Salgaonkar level = KERN_ERR; 5958f0b8056SMahesh Salgaonkar sevstr = "Severe"; 5968f0b8056SMahesh Salgaonkar break; 5978f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_FATAL: 5988f0b8056SMahesh Salgaonkar default: 5998f0b8056SMahesh Salgaonkar level = KERN_ERR; 6008f0b8056SMahesh Salgaonkar sevstr = "Fatal"; 6018f0b8056SMahesh Salgaonkar break; 6028f0b8056SMahesh Salgaonkar } 6038f0b8056SMahesh Salgaonkar 604c6d15258SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 605c6d15258SMahesh Salgaonkar /* Display faulty slb contents for SLB errors. */ 606c6d15258SMahesh Salgaonkar if (error_type == MC_ERROR_TYPE_SLB) 607c6d15258SMahesh Salgaonkar slb_dump_contents(local_paca->mce_faulty_slbs); 608c6d15258SMahesh Salgaonkar #endif 609c6d15258SMahesh Salgaonkar 6108f0b8056SMahesh Salgaonkar printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 6118f0b8056SMahesh Salgaonkar disposition == RTAS_DISP_FULLY_RECOVERED ? 6128f0b8056SMahesh Salgaonkar "Recovered" : "Not recovered"); 6138f0b8056SMahesh Salgaonkar if (user_mode(regs)) { 6148f0b8056SMahesh Salgaonkar printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, 6158f0b8056SMahesh Salgaonkar regs->nip, current->pid, current->comm); 6168f0b8056SMahesh Salgaonkar } else { 6178f0b8056SMahesh Salgaonkar printk("%s NIP [%016lx]: %pS\n", level, regs->nip, 6188f0b8056SMahesh Salgaonkar (void *)regs->nip); 6198f0b8056SMahesh Salgaonkar } 6208f0b8056SMahesh Salgaonkar printk("%s Initiator: %s\n", level, 6218f0b8056SMahesh Salgaonkar VAL_TO_STRING(initiators, initiator)); 6228f0b8056SMahesh Salgaonkar 6238f0b8056SMahesh Salgaonkar switch (error_type) { 6248f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_UE: 6258f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 6268f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 6278f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_ue_types, err_sub_type)); 6288f0b8056SMahesh Salgaonkar break; 6298f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 6308f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 6318f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 6328f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_slb_types, err_sub_type)); 6338f0b8056SMahesh Salgaonkar break; 6348f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 6358f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 6368f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 6378f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 6388f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_soft_types, err_sub_type)); 6398f0b8056SMahesh Salgaonkar break; 6408f0b8056SMahesh Salgaonkar default: 6418f0b8056SMahesh Salgaonkar printk("%s Error type: %s\n", level, 6428f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type)); 6438f0b8056SMahesh Salgaonkar break; 6448f0b8056SMahesh Salgaonkar } 6458f0b8056SMahesh Salgaonkar 6468f0b8056SMahesh Salgaonkar addr = rtas_mc_get_effective_addr(mce_log); 6478f0b8056SMahesh Salgaonkar if (addr) 6488f0b8056SMahesh Salgaonkar printk("%s Effective address: %016llx\n", level, addr); 6498f0b8056SMahesh Salgaonkar } 6508f0b8056SMahesh Salgaonkar 651a43c1590SMahesh Salgaonkar static int mce_handle_error(struct rtas_error_log *errp) 652a43c1590SMahesh Salgaonkar { 653a43c1590SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 654a43c1590SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 655a43c1590SMahesh Salgaonkar int disposition = rtas_error_disposition(errp); 656a43c1590SMahesh Salgaonkar u8 error_type; 657a43c1590SMahesh Salgaonkar 658a43c1590SMahesh Salgaonkar if (!rtas_error_extended(errp)) 659a43c1590SMahesh Salgaonkar goto out; 660a43c1590SMahesh Salgaonkar 661a43c1590SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 662a43c1590SMahesh Salgaonkar if (pseries_log == NULL) 663a43c1590SMahesh Salgaonkar goto out; 664a43c1590SMahesh Salgaonkar 665a43c1590SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 666a43c1590SMahesh Salgaonkar error_type = mce_log->error_type; 667a43c1590SMahesh Salgaonkar 668a43c1590SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 669a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_NOT_RECOVERED) { 670a43c1590SMahesh Salgaonkar switch (error_type) { 671a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 672a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 673c6d15258SMahesh Salgaonkar /* 674c6d15258SMahesh Salgaonkar * Store the old slb content in paca before flushing. 675c6d15258SMahesh Salgaonkar * Print this when we go to virtual mode. 676c6d15258SMahesh Salgaonkar * There are chances that we may hit MCE again if there 677c6d15258SMahesh Salgaonkar * is a parity error on the SLB entry we trying to read 678c6d15258SMahesh Salgaonkar * for saving. Hence limit the slb saving to single 679c6d15258SMahesh Salgaonkar * level of recursion. 680c6d15258SMahesh Salgaonkar */ 681c6d15258SMahesh Salgaonkar if (local_paca->in_mce == 1) 682c6d15258SMahesh Salgaonkar slb_save_contents(local_paca->mce_faulty_slbs); 683a43c1590SMahesh Salgaonkar flush_and_reload_slb(); 684a43c1590SMahesh Salgaonkar disposition = RTAS_DISP_FULLY_RECOVERED; 685a43c1590SMahesh Salgaonkar rtas_set_disposition_recovered(errp); 686a43c1590SMahesh Salgaonkar break; 687a43c1590SMahesh Salgaonkar default: 688a43c1590SMahesh Salgaonkar break; 689a43c1590SMahesh Salgaonkar } 690a43c1590SMahesh Salgaonkar } 691a43c1590SMahesh Salgaonkar #endif 692a43c1590SMahesh Salgaonkar 693a43c1590SMahesh Salgaonkar out: 694a43c1590SMahesh Salgaonkar return disposition; 695a43c1590SMahesh Salgaonkar } 696a43c1590SMahesh Salgaonkar 6977f177f98SGanesh Goudar #ifdef CONFIG_MEMORY_FAILURE 6987f177f98SGanesh Goudar 6997f177f98SGanesh Goudar static DEFINE_PER_CPU(int, rtas_ue_count); 7007f177f98SGanesh Goudar static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); 7017f177f98SGanesh Goudar 7027f177f98SGanesh Goudar #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 7037f177f98SGanesh Goudar #define UE_LOGICAL_ADDR_PROVIDED 0x20 7047f177f98SGanesh Goudar 7057f177f98SGanesh Goudar 7067f177f98SGanesh Goudar static void pseries_hwpoison_work_fn(struct work_struct *work) 7077f177f98SGanesh Goudar { 7087f177f98SGanesh Goudar unsigned long paddr; 7097f177f98SGanesh Goudar int index; 7107f177f98SGanesh Goudar 7117f177f98SGanesh Goudar while (__this_cpu_read(rtas_ue_count) > 0) { 7127f177f98SGanesh Goudar index = __this_cpu_read(rtas_ue_count) - 1; 7137f177f98SGanesh Goudar paddr = __this_cpu_read(rtas_ue_paddr[index]); 7147f177f98SGanesh Goudar memory_failure(paddr >> PAGE_SHIFT, 0); 7157f177f98SGanesh Goudar __this_cpu_dec(rtas_ue_count); 7167f177f98SGanesh Goudar } 7177f177f98SGanesh Goudar } 7187f177f98SGanesh Goudar 7197f177f98SGanesh Goudar static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); 7207f177f98SGanesh Goudar 7217f177f98SGanesh Goudar static void queue_ue_paddr(unsigned long paddr) 7227f177f98SGanesh Goudar { 7237f177f98SGanesh Goudar int index; 7247f177f98SGanesh Goudar 7257f177f98SGanesh Goudar index = __this_cpu_inc_return(rtas_ue_count) - 1; 7267f177f98SGanesh Goudar if (index >= MAX_MC_EVT) { 7277f177f98SGanesh Goudar __this_cpu_dec(rtas_ue_count); 7287f177f98SGanesh Goudar return; 7297f177f98SGanesh Goudar } 7307f177f98SGanesh Goudar this_cpu_write(rtas_ue_paddr[index], paddr); 7317f177f98SGanesh Goudar schedule_work(&hwpoison_work); 7327f177f98SGanesh Goudar } 7337f177f98SGanesh Goudar 7347f177f98SGanesh Goudar static void pseries_do_memory_failure(struct pt_regs *regs, 7357f177f98SGanesh Goudar struct pseries_mc_errorlog *mce_log) 7367f177f98SGanesh Goudar { 7377f177f98SGanesh Goudar unsigned long paddr; 7387f177f98SGanesh Goudar 7397f177f98SGanesh Goudar if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { 7407f177f98SGanesh Goudar paddr = be64_to_cpu(mce_log->logical_address); 7417f177f98SGanesh Goudar } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { 7427f177f98SGanesh Goudar unsigned long pfn; 7437f177f98SGanesh Goudar 7447f177f98SGanesh Goudar pfn = addr_to_pfn(regs, 7457f177f98SGanesh Goudar be64_to_cpu(mce_log->effective_address)); 7467f177f98SGanesh Goudar if (pfn == ULONG_MAX) 7477f177f98SGanesh Goudar return; 7487f177f98SGanesh Goudar paddr = pfn << PAGE_SHIFT; 7497f177f98SGanesh Goudar } else { 7507f177f98SGanesh Goudar return; 7517f177f98SGanesh Goudar } 7527f177f98SGanesh Goudar queue_ue_paddr(paddr); 7537f177f98SGanesh Goudar } 7547f177f98SGanesh Goudar 7557f177f98SGanesh Goudar static void pseries_process_ue(struct pt_regs *regs, 7567f177f98SGanesh Goudar struct rtas_error_log *errp) 7577f177f98SGanesh Goudar { 7587f177f98SGanesh Goudar struct pseries_errorlog *pseries_log; 7597f177f98SGanesh Goudar struct pseries_mc_errorlog *mce_log; 7607f177f98SGanesh Goudar 7617f177f98SGanesh Goudar if (!rtas_error_extended(errp)) 7627f177f98SGanesh Goudar return; 7637f177f98SGanesh Goudar 7647f177f98SGanesh Goudar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 7657f177f98SGanesh Goudar if (!pseries_log) 7667f177f98SGanesh Goudar return; 7677f177f98SGanesh Goudar 7687f177f98SGanesh Goudar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 7697f177f98SGanesh Goudar 7707f177f98SGanesh Goudar if (mce_log->error_type == MC_ERROR_TYPE_UE) 7717f177f98SGanesh Goudar pseries_do_memory_failure(regs, mce_log); 7727f177f98SGanesh Goudar } 7737f177f98SGanesh Goudar #else 7747f177f98SGanesh Goudar static inline void pseries_process_ue(struct pt_regs *regs, 7757f177f98SGanesh Goudar struct rtas_error_log *errp) { } 7767f177f98SGanesh Goudar #endif /*CONFIG_MEMORY_FAILURE */ 7777f177f98SGanesh Goudar 778d9953105SMichael Ellerman /* 77994675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 78094675cceSMahesh Salgaonkar */ 78194675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 78294675cceSMahesh Salgaonkar { 78394675cceSMahesh Salgaonkar struct rtas_error_log *err; 78494675cceSMahesh Salgaonkar 78594675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 78694675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 78794675cceSMahesh Salgaonkar } 78894675cceSMahesh Salgaonkar 78994675cceSMahesh Salgaonkar /* 790d9953105SMichael Ellerman * See if we can recover from a machine check exception. 791d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 792d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 793d9953105SMichael Ellerman * which provides the error analysis for us. 794d9953105SMichael Ellerman * 795d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 796d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 797d9953105SMichael Ellerman */ 798d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 799d9953105SMichael Ellerman { 800d47d1d8aSAnton Blanchard int recovered = 0; 801a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 802d9953105SMichael Ellerman 8038f0b8056SMahesh Salgaonkar pseries_print_mce_info(regs, err); 8048f0b8056SMahesh Salgaonkar 805d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 806d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 8078f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 808d47d1d8aSAnton Blanchard recovered = 0; 809d47d1d8aSAnton Blanchard 810a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 811d9953105SMichael Ellerman /* Platform corrected itself */ 812d47d1d8aSAnton Blanchard recovered = 1; 813d47d1d8aSAnton Blanchard 814a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 815d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 816d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 817d47d1d8aSAnton Blanchard "be degraded\n"); 818d47d1d8aSAnton Blanchard recovered = 1; 819d47d1d8aSAnton Blanchard 820d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 821a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 822d47d1d8aSAnton Blanchard 823d47d1d8aSAnton Blanchard /* 824d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 825d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 826d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 827d47d1d8aSAnton Blanchard * fields being valid here. 828d47d1d8aSAnton Blanchard */ 829d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 830d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 831d47d1d8aSAnton Blanchard 832d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 833d47d1d8aSAnton Blanchard recovered = 1; 834d9953105SMichael Ellerman } 835d9953105SMichael Ellerman 8367f177f98SGanesh Goudar pseries_process_ue(regs, err); 8377f177f98SGanesh Goudar 83894675cceSMahesh Salgaonkar /* Queue irq work to log this rtas event later. */ 83994675cceSMahesh Salgaonkar irq_work_queue(&mce_errlog_process_work); 840d9953105SMichael Ellerman 841d47d1d8aSAnton Blanchard return recovered; 842d9953105SMichael Ellerman } 843d9953105SMichael Ellerman 844d9953105SMichael Ellerman /* 845d9953105SMichael Ellerman * Handle a machine check. 846d9953105SMichael Ellerman * 847d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 848d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 849d9953105SMichael Ellerman * error was recovered (never true if RI=0). 850d9953105SMichael Ellerman * 851d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 852d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 853d9953105SMichael Ellerman */ 854d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 855d9953105SMichael Ellerman { 856d9953105SMichael Ellerman struct rtas_error_log *errp; 857d9953105SMichael Ellerman 858d9953105SMichael Ellerman if (fwnmi_active) { 859d9953105SMichael Ellerman fwnmi_release_errinfo(); 860a43c1590SMahesh Salgaonkar errp = fwnmi_get_errlog(); 861d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 862d9953105SMichael Ellerman return 1; 863d9953105SMichael Ellerman } 864d9953105SMichael Ellerman 865d9953105SMichael Ellerman return 0; 866d9953105SMichael Ellerman } 867a43c1590SMahesh Salgaonkar 868a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 869a43c1590SMahesh Salgaonkar { 870a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 871a43c1590SMahesh Salgaonkar int disposition; 872a43c1590SMahesh Salgaonkar 873a43c1590SMahesh Salgaonkar if (fwnmi_active) { 874a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 875a43c1590SMahesh Salgaonkar /* 876a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 877a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 878a43c1590SMahesh Salgaonkar * virtual mode. 879a43c1590SMahesh Salgaonkar */ 880a43c1590SMahesh Salgaonkar disposition = mce_handle_error(errp); 881a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 882a43c1590SMahesh Salgaonkar return 1; 883a43c1590SMahesh Salgaonkar } 884a43c1590SMahesh Salgaonkar 885a43c1590SMahesh Salgaonkar return 0; 886a43c1590SMahesh Salgaonkar } 887