1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman #include <linux/sched.h> 20d9953105SMichael Ellerman #include <linux/interrupt.h> 21d9953105SMichael Ellerman #include <linux/irq.h> 2290128997SAnton Blanchard #include <linux/of.h> 2355fc0c56SAnton Blanchard #include <linux/fs.h> 2455fc0c56SAnton Blanchard #include <linux/reboot.h> 2594675cceSMahesh Salgaonkar #include <linux/irq_work.h> 26d9953105SMichael Ellerman 27d9953105SMichael Ellerman #include <asm/machdep.h> 28d9953105SMichael Ellerman #include <asm/rtas.h> 298c4f1f29SMichael Ellerman #include <asm/firmware.h> 30a43c1590SMahesh Salgaonkar #include <asm/mce.h> 31d9953105SMichael Ellerman 32577830b0SMichael Ellerman #include "pseries.h" 33c902be71SArnd Bergmann 34d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 35d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 36d9953105SMichael Ellerman 37d9953105SMichael Ellerman static int ras_check_exception_token; 38d9953105SMichael Ellerman 3994675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 4094675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 4194675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 4294675cceSMahesh Salgaonkar }; 4394675cceSMahesh Salgaonkar 44d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 45d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 46d9953105SMichael Ellerman 47b4af279aSVipin K Parashar /* EPOW events counter variable */ 48b4af279aSVipin K Parashar static int num_epow_events; 49b4af279aSVipin K Parashar 50b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 517d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 527d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 53d9953105SMichael Ellerman 5404fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 5504fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 5604fce21cSMahesh Salgaonkar __be32 fru_id; 5704fce21cSMahesh Salgaonkar __be32 proc_id; 5804fce21cSMahesh Salgaonkar u8 error_type; 5904fce21cSMahesh Salgaonkar /* 6004fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 6104fce21cSMahesh Salgaonkar * 6204fce21cSMahesh Salgaonkar * MSB0 6304fce21cSMahesh Salgaonkar * | 6404fce21cSMahesh Salgaonkar * V 6504fce21cSMahesh Salgaonkar * 01234567 6604fce21cSMahesh Salgaonkar * XXXXXXXX 6704fce21cSMahesh Salgaonkar * 6804fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 6904fce21cSMahesh Salgaonkar * XXXXXXXX 7004fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 7104fce21cSMahesh Salgaonkar * X 1: Effective address provided. 7204fce21cSMahesh Salgaonkar * X 1: Logical address provided. 7304fce21cSMahesh Salgaonkar * XX 2: Reserved. 7404fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 7504fce21cSMahesh Salgaonkar * 7604fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 7704fce21cSMahesh Salgaonkar * XXXXXXXX 7804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 7904fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 8004fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 8104fce21cSMahesh Salgaonkar */ 8204fce21cSMahesh Salgaonkar u8 sub_err_type; 8304fce21cSMahesh Salgaonkar u8 reserved_1[6]; 8404fce21cSMahesh Salgaonkar __be64 effective_address; 8504fce21cSMahesh Salgaonkar __be64 logical_address; 8604fce21cSMahesh Salgaonkar } __packed; 8704fce21cSMahesh Salgaonkar 8804fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 8904fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 9004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 9104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 9204fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 9304fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 9404fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 9504fce21cSMahesh Salgaonkar 9604fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 9704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 9804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 10104fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 10204fce21cSMahesh Salgaonkar 10304fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 10404fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 10504fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 10604fce21cSMahesh Salgaonkar 10704fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 10804fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 10904fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 11004fce21cSMahesh Salgaonkar 11104fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 11204fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 11304fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 11404fce21cSMahesh Salgaonkar 11504fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 11604fce21cSMahesh Salgaonkar { 11704fce21cSMahesh Salgaonkar switch (mlog->error_type) { 11804fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 11904fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 12004fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 12104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 12204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 12304fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 12404fce21cSMahesh Salgaonkar default: 12504fce21cSMahesh Salgaonkar return 0; 12604fce21cSMahesh Salgaonkar } 12704fce21cSMahesh Salgaonkar } 12804fce21cSMahesh Salgaonkar 12904fce21cSMahesh Salgaonkar static 13004fce21cSMahesh Salgaonkar inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) 13104fce21cSMahesh Salgaonkar { 13204fce21cSMahesh Salgaonkar __be64 addr = 0; 13304fce21cSMahesh Salgaonkar 13404fce21cSMahesh Salgaonkar switch (mlog->error_type) { 13504fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 13604fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x40) 13704fce21cSMahesh Salgaonkar addr = mlog->effective_address; 13804fce21cSMahesh Salgaonkar break; 13904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 14004fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 14104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 14204fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x80) 14304fce21cSMahesh Salgaonkar addr = mlog->effective_address; 14404fce21cSMahesh Salgaonkar default: 14504fce21cSMahesh Salgaonkar break; 14604fce21cSMahesh Salgaonkar } 14704fce21cSMahesh Salgaonkar return be64_to_cpu(addr); 14804fce21cSMahesh Salgaonkar } 1490ebfff14SBenjamin Herrenschmidt 150d9953105SMichael Ellerman /* 151c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 152c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 153c9dccf1dSSam Bobroff * subsys stage. 154c9dccf1dSSam Bobroff */ 155c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 156c9dccf1dSSam Bobroff { 157c9dccf1dSSam Bobroff struct device_node *np; 158c9dccf1dSSam Bobroff 159c9dccf1dSSam Bobroff /* Hotplug Events */ 160c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 161c9dccf1dSSam Bobroff if (np != NULL) { 162c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 163c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 164c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 165c9dccf1dSSam Bobroff of_node_put(np); 166c9dccf1dSSam Bobroff } 167c9dccf1dSSam Bobroff 168c9dccf1dSSam Bobroff return 0; 169c9dccf1dSSam Bobroff } 170c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 171c9dccf1dSSam Bobroff 172c9dccf1dSSam Bobroff /* 173d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 174d9953105SMichael Ellerman * and power system events. 175d9953105SMichael Ellerman */ 176d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 177d9953105SMichael Ellerman { 178d9953105SMichael Ellerman struct device_node *np; 179d9953105SMichael Ellerman 180d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 181d9953105SMichael Ellerman 182d9953105SMichael Ellerman /* Internal Errors */ 183d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 184d9953105SMichael Ellerman if (np != NULL) { 18532c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 18632c96f77SMark Nelson "RAS_ERROR"); 187d9953105SMichael Ellerman of_node_put(np); 188d9953105SMichael Ellerman } 189d9953105SMichael Ellerman 190d9953105SMichael Ellerman /* EPOW Events */ 191d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 192d9953105SMichael Ellerman if (np != NULL) { 19332c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 194d9953105SMichael Ellerman of_node_put(np); 195d9953105SMichael Ellerman } 196d9953105SMichael Ellerman 19769ed3324SAnton Blanchard return 0; 198d9953105SMichael Ellerman } 1998e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 200d9953105SMichael Ellerman 20155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 20255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 20355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 20455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 20555fc0c56SAnton Blanchard 20655fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 20755fc0c56SAnton Blanchard { 20855fc0c56SAnton Blanchard switch (event_modifier) { 20955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 210b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 2111b7e0cbeSliguang orderly_poweroff(true); 21255fc0c56SAnton Blanchard break; 21355fc0c56SAnton Blanchard 21455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 215b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 216b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 21779872e35SAnshuman Khandual orderly_poweroff(true); 21855fc0c56SAnton Blanchard break; 21955fc0c56SAnton Blanchard 22055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 221b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 222b4af279aSVipin K Parashar " RTAS error log for details\n"); 2231b7e0cbeSliguang orderly_poweroff(true); 22455fc0c56SAnton Blanchard break; 22555fc0c56SAnton Blanchard 22655fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 227b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 228b4af279aSVipin K Parashar " error log for details\n"); 2291b7e0cbeSliguang orderly_poweroff(true); 23055fc0c56SAnton Blanchard break; 23155fc0c56SAnton Blanchard 23255fc0c56SAnton Blanchard default: 233b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 23455fc0c56SAnton Blanchard event_modifier); 23555fc0c56SAnton Blanchard } 23655fc0c56SAnton Blanchard } 23755fc0c56SAnton Blanchard 23855fc0c56SAnton Blanchard struct epow_errorlog { 23955fc0c56SAnton Blanchard unsigned char sensor_value; 24055fc0c56SAnton Blanchard unsigned char event_modifier; 24155fc0c56SAnton Blanchard unsigned char extended_modifier; 24255fc0c56SAnton Blanchard unsigned char reserved; 24355fc0c56SAnton Blanchard unsigned char platform_reason; 24455fc0c56SAnton Blanchard }; 24555fc0c56SAnton Blanchard 24655fc0c56SAnton Blanchard #define EPOW_RESET 0 24755fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 24855fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 24955fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 25055fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 25155fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 25255fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 25355fc0c56SAnton Blanchard 254e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 25555fc0c56SAnton Blanchard { 25655fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 25755fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 25855fc0c56SAnton Blanchard char action_code; 25955fc0c56SAnton Blanchard char modifier; 26055fc0c56SAnton Blanchard 26155fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 26255fc0c56SAnton Blanchard if (pseries_log == NULL) 26355fc0c56SAnton Blanchard return; 26455fc0c56SAnton Blanchard 26555fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 26655fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 26755fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 26855fc0c56SAnton Blanchard 26955fc0c56SAnton Blanchard switch (action_code) { 27055fc0c56SAnton Blanchard case EPOW_RESET: 271b4af279aSVipin K Parashar if (num_epow_events) { 272b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 273b4af279aSVipin K Parashar num_epow_events--; 274b4af279aSVipin K Parashar } 27555fc0c56SAnton Blanchard break; 27655fc0c56SAnton Blanchard 27755fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 278b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 279b4af279aSVipin K Parashar " log for details\n"); 28055fc0c56SAnton Blanchard break; 28155fc0c56SAnton Blanchard 28255fc0c56SAnton Blanchard case EPOW_WARN_POWER: 283b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 284b4af279aSVipin K Parashar " log for details\n"); 28555fc0c56SAnton Blanchard break; 28655fc0c56SAnton Blanchard 28755fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 28855fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 28955fc0c56SAnton Blanchard break; 29055fc0c56SAnton Blanchard 29155fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 292b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 293b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2941b7e0cbeSliguang orderly_poweroff(true); 29555fc0c56SAnton Blanchard break; 29655fc0c56SAnton Blanchard 29755fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 29855fc0c56SAnton Blanchard case EPOW_POWER_OFF: 299b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 300b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 30155fc0c56SAnton Blanchard emergency_sync(); 30255fc0c56SAnton Blanchard kernel_power_off(); 30355fc0c56SAnton Blanchard break; 30455fc0c56SAnton Blanchard 30555fc0c56SAnton Blanchard default: 306b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 30755fc0c56SAnton Blanchard action_code); 30855fc0c56SAnton Blanchard } 309b4af279aSVipin K Parashar 310b4af279aSVipin K Parashar /* Increment epow events counter variable */ 311b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 312b4af279aSVipin K Parashar num_epow_events++; 31355fc0c56SAnton Blanchard } 31455fc0c56SAnton Blanchard 315b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 316b7d9eb39SJohn Allen { 317b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 318b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 319b7d9eb39SJohn Allen 320b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 321b7d9eb39SJohn Allen 322b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 323b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 324b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 325b7d9eb39SJohn Allen rtas_get_error_log_max()); 326b7d9eb39SJohn Allen 327b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 328b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 329b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 330b7d9eb39SJohn Allen 331b7d9eb39SJohn Allen /* 332b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 333b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 334b7d9eb39SJohn Allen */ 335b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 336b7d9eb39SJohn Allen hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) 337b7d9eb39SJohn Allen queue_hotplug_event(hp_elog, NULL, NULL); 338b7d9eb39SJohn Allen else 339b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 340b7d9eb39SJohn Allen 341b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 342b7d9eb39SJohn Allen return IRQ_HANDLED; 343b7d9eb39SJohn Allen } 344b7d9eb39SJohn Allen 34555fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3467d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 347d9953105SMichael Ellerman { 34855fc0c56SAnton Blanchard int status; 34955fc0c56SAnton Blanchard int state; 350d9953105SMichael Ellerman int critical; 351d9953105SMichael Ellerman 3521c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3531c2cb594SThomas Huth &state); 354d9953105SMichael Ellerman 355d9953105SMichael Ellerman if (state > 3) 356d9953105SMichael Ellerman critical = 1; /* Time Critical */ 357d9953105SMichael Ellerman else 358d9953105SMichael Ellerman critical = 0; 359d9953105SMichael Ellerman 360d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 361d9953105SMichael Ellerman 362d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 363b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 364476eb491SGrant Likely virq_to_hw(irq), 3656f43747fSAnton Blanchard RTAS_EPOW_WARNING, 366d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 367d9953105SMichael Ellerman rtas_get_error_log_max()); 368d9953105SMichael Ellerman 369d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 370d9953105SMichael Ellerman 37155fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 37255fc0c56SAnton Blanchard 373d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 374d9953105SMichael Ellerman return IRQ_HANDLED; 375d9953105SMichael Ellerman } 376d9953105SMichael Ellerman 377d9953105SMichael Ellerman /* 378d9953105SMichael Ellerman * Handle hardware error interrupts. 379d9953105SMichael Ellerman * 380d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 381d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 382d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 383d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 384d9953105SMichael Ellerman */ 3857d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 386d9953105SMichael Ellerman { 387d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 388cc8b5263SAnton Blanchard int status; 389d9953105SMichael Ellerman int fatal; 390d9953105SMichael Ellerman 391d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 392d9953105SMichael Ellerman 393d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 394b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 395476eb491SGrant Likely virq_to_hw(irq), 396d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 397d9953105SMichael Ellerman __pa(&ras_log_buf), 398d9953105SMichael Ellerman rtas_get_error_log_max()); 399d9953105SMichael Ellerman 400d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 401d9953105SMichael Ellerman 402a08a53eaSGreg Kurz if (status == 0 && 403a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 404d9953105SMichael Ellerman fatal = 1; 405d9953105SMichael Ellerman else 406d9953105SMichael Ellerman fatal = 0; 407d9953105SMichael Ellerman 408d9953105SMichael Ellerman /* format and print the extended information */ 409d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 410d9953105SMichael Ellerman 411d9953105SMichael Ellerman if (fatal) { 412b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 413b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 414cc8b5263SAnton Blanchard emergency_sync(); 415cc8b5263SAnton Blanchard kernel_power_off(); 416d9953105SMichael Ellerman } else { 417b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 418d9953105SMichael Ellerman } 419d9953105SMichael Ellerman 420d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 421d9953105SMichael Ellerman return IRQ_HANDLED; 422d9953105SMichael Ellerman } 423d9953105SMichael Ellerman 424d368514cSAnton Blanchard /* 425d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 426d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 427d368514cSAnton Blanchard */ 428d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 429d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 430d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 431d368514cSAnton Blanchard 43294675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 43394675cceSMahesh Salgaonkar { 43494675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 43594675cceSMahesh Salgaonkar } 43694675cceSMahesh Salgaonkar 437d368514cSAnton Blanchard /* 438d368514cSAnton Blanchard * Get the error information for errors coming through the 439d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 440d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 441d9953105SMichael Ellerman * will be returned if found. 442d9953105SMichael Ellerman * 44394675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 444d368514cSAnton Blanchard * 44594675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 446d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 447d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 448d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 449d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 450d9953105SMichael Ellerman * second machine check did come in. 451d9953105SMichael Ellerman */ 452d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 453d9953105SMichael Ellerman { 454d9953105SMichael Ellerman unsigned long *savep; 45594675cceSMahesh Salgaonkar struct rtas_error_log *h; 456d9953105SMichael Ellerman 457ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 458ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 459ee1dd1e3SMahesh Salgaonkar 460d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 461f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 462d368514cSAnton Blanchard return NULL; 463d9953105SMichael Ellerman } 464d368514cSAnton Blanchard 465d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 466cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 467d368514cSAnton Blanchard 468d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 46994675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 47094675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 471a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 47294675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 473d368514cSAnton Blanchard } else { 474a08a53eaSGreg Kurz int len, error_log_length; 475d368514cSAnton Blanchard 476a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 47774e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 47894675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 479d368514cSAnton Blanchard } 480d368514cSAnton Blanchard 48194675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 482d9953105SMichael Ellerman } 483d9953105SMichael Ellerman 484d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 485d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 486d9953105SMichael Ellerman * partition to receive FWNMI errors. 487d9953105SMichael Ellerman */ 488d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 489d9953105SMichael Ellerman { 490d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 491d9953105SMichael Ellerman if (ret != 0) 492d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 493d9953105SMichael Ellerman } 494d9953105SMichael Ellerman 495c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 496d9953105SMichael Ellerman { 497bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 498bded0706SNicholas Piggin /* 499bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 500bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 501bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 502bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 503bded0706SNicholas Piggin */ 504bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 505bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 506bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 507bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 508bded0706SNicholas Piggin regs->msr = 0; 509bded0706SNicholas Piggin } 510bded0706SNicholas Piggin #endif 511bded0706SNicholas Piggin 512d9953105SMichael Ellerman if (fwnmi_active) { 513d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 514d9953105SMichael Ellerman if (errhdr) { 515d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 516d9953105SMichael Ellerman } 517d9953105SMichael Ellerman fwnmi_release_errinfo(); 518d9953105SMichael Ellerman } 519102c05e8SNicholas Piggin 520102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 521102c05e8SNicholas Piggin return 1; 522102c05e8SNicholas Piggin 523c902be71SArnd Bergmann return 0; /* need to perform reset */ 524d9953105SMichael Ellerman } 525d9953105SMichael Ellerman 526*8f0b8056SMahesh Salgaonkar #define VAL_TO_STRING(ar, val) \ 527*8f0b8056SMahesh Salgaonkar (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown") 528*8f0b8056SMahesh Salgaonkar 529*8f0b8056SMahesh Salgaonkar static void pseries_print_mce_info(struct pt_regs *regs, 530*8f0b8056SMahesh Salgaonkar struct rtas_error_log *errp) 531*8f0b8056SMahesh Salgaonkar { 532*8f0b8056SMahesh Salgaonkar const char *level, *sevstr; 533*8f0b8056SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 534*8f0b8056SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 535*8f0b8056SMahesh Salgaonkar u8 error_type, err_sub_type; 536*8f0b8056SMahesh Salgaonkar u64 addr; 537*8f0b8056SMahesh Salgaonkar u8 initiator = rtas_error_initiator(errp); 538*8f0b8056SMahesh Salgaonkar int disposition = rtas_error_disposition(errp); 539*8f0b8056SMahesh Salgaonkar 540*8f0b8056SMahesh Salgaonkar static const char * const initiators[] = { 541*8f0b8056SMahesh Salgaonkar "Unknown", 542*8f0b8056SMahesh Salgaonkar "CPU", 543*8f0b8056SMahesh Salgaonkar "PCI", 544*8f0b8056SMahesh Salgaonkar "ISA", 545*8f0b8056SMahesh Salgaonkar "Memory", 546*8f0b8056SMahesh Salgaonkar "Power Mgmt", 547*8f0b8056SMahesh Salgaonkar }; 548*8f0b8056SMahesh Salgaonkar static const char * const mc_err_types[] = { 549*8f0b8056SMahesh Salgaonkar "UE", 550*8f0b8056SMahesh Salgaonkar "SLB", 551*8f0b8056SMahesh Salgaonkar "ERAT", 552*8f0b8056SMahesh Salgaonkar "TLB", 553*8f0b8056SMahesh Salgaonkar "D-Cache", 554*8f0b8056SMahesh Salgaonkar "Unknown", 555*8f0b8056SMahesh Salgaonkar "I-Cache", 556*8f0b8056SMahesh Salgaonkar }; 557*8f0b8056SMahesh Salgaonkar static const char * const mc_ue_types[] = { 558*8f0b8056SMahesh Salgaonkar "Indeterminate", 559*8f0b8056SMahesh Salgaonkar "Instruction fetch", 560*8f0b8056SMahesh Salgaonkar "Page table walk ifetch", 561*8f0b8056SMahesh Salgaonkar "Load/Store", 562*8f0b8056SMahesh Salgaonkar "Page table walk Load/Store", 563*8f0b8056SMahesh Salgaonkar }; 564*8f0b8056SMahesh Salgaonkar 565*8f0b8056SMahesh Salgaonkar /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ 566*8f0b8056SMahesh Salgaonkar static const char * const mc_slb_types[] = { 567*8f0b8056SMahesh Salgaonkar "Parity", 568*8f0b8056SMahesh Salgaonkar "Multihit", 569*8f0b8056SMahesh Salgaonkar "Indeterminate", 570*8f0b8056SMahesh Salgaonkar }; 571*8f0b8056SMahesh Salgaonkar 572*8f0b8056SMahesh Salgaonkar /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ 573*8f0b8056SMahesh Salgaonkar static const char * const mc_soft_types[] = { 574*8f0b8056SMahesh Salgaonkar "Unknown", 575*8f0b8056SMahesh Salgaonkar "Parity", 576*8f0b8056SMahesh Salgaonkar "Multihit", 577*8f0b8056SMahesh Salgaonkar "Indeterminate", 578*8f0b8056SMahesh Salgaonkar }; 579*8f0b8056SMahesh Salgaonkar 580*8f0b8056SMahesh Salgaonkar if (!rtas_error_extended(errp)) { 581*8f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt: Missing extended error log\n"); 582*8f0b8056SMahesh Salgaonkar return; 583*8f0b8056SMahesh Salgaonkar } 584*8f0b8056SMahesh Salgaonkar 585*8f0b8056SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 586*8f0b8056SMahesh Salgaonkar if (pseries_log == NULL) 587*8f0b8056SMahesh Salgaonkar return; 588*8f0b8056SMahesh Salgaonkar 589*8f0b8056SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 590*8f0b8056SMahesh Salgaonkar 591*8f0b8056SMahesh Salgaonkar error_type = mce_log->error_type; 592*8f0b8056SMahesh Salgaonkar err_sub_type = rtas_mc_error_sub_type(mce_log); 593*8f0b8056SMahesh Salgaonkar 594*8f0b8056SMahesh Salgaonkar switch (rtas_error_severity(errp)) { 595*8f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_NO_ERROR: 596*8f0b8056SMahesh Salgaonkar level = KERN_INFO; 597*8f0b8056SMahesh Salgaonkar sevstr = "Harmless"; 598*8f0b8056SMahesh Salgaonkar break; 599*8f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_WARNING: 600*8f0b8056SMahesh Salgaonkar level = KERN_WARNING; 601*8f0b8056SMahesh Salgaonkar sevstr = ""; 602*8f0b8056SMahesh Salgaonkar break; 603*8f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_ERROR: 604*8f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_ERROR_SYNC: 605*8f0b8056SMahesh Salgaonkar level = KERN_ERR; 606*8f0b8056SMahesh Salgaonkar sevstr = "Severe"; 607*8f0b8056SMahesh Salgaonkar break; 608*8f0b8056SMahesh Salgaonkar case RTAS_SEVERITY_FATAL: 609*8f0b8056SMahesh Salgaonkar default: 610*8f0b8056SMahesh Salgaonkar level = KERN_ERR; 611*8f0b8056SMahesh Salgaonkar sevstr = "Fatal"; 612*8f0b8056SMahesh Salgaonkar break; 613*8f0b8056SMahesh Salgaonkar } 614*8f0b8056SMahesh Salgaonkar 615*8f0b8056SMahesh Salgaonkar printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 616*8f0b8056SMahesh Salgaonkar disposition == RTAS_DISP_FULLY_RECOVERED ? 617*8f0b8056SMahesh Salgaonkar "Recovered" : "Not recovered"); 618*8f0b8056SMahesh Salgaonkar if (user_mode(regs)) { 619*8f0b8056SMahesh Salgaonkar printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, 620*8f0b8056SMahesh Salgaonkar regs->nip, current->pid, current->comm); 621*8f0b8056SMahesh Salgaonkar } else { 622*8f0b8056SMahesh Salgaonkar printk("%s NIP [%016lx]: %pS\n", level, regs->nip, 623*8f0b8056SMahesh Salgaonkar (void *)regs->nip); 624*8f0b8056SMahesh Salgaonkar } 625*8f0b8056SMahesh Salgaonkar printk("%s Initiator: %s\n", level, 626*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(initiators, initiator)); 627*8f0b8056SMahesh Salgaonkar 628*8f0b8056SMahesh Salgaonkar switch (error_type) { 629*8f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_UE: 630*8f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 631*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 632*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_ue_types, err_sub_type)); 633*8f0b8056SMahesh Salgaonkar break; 634*8f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 635*8f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 636*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 637*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_slb_types, err_sub_type)); 638*8f0b8056SMahesh Salgaonkar break; 639*8f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 640*8f0b8056SMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 641*8f0b8056SMahesh Salgaonkar printk("%s Error type: %s [%s]\n", level, 642*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type), 643*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_soft_types, err_sub_type)); 644*8f0b8056SMahesh Salgaonkar break; 645*8f0b8056SMahesh Salgaonkar default: 646*8f0b8056SMahesh Salgaonkar printk("%s Error type: %s\n", level, 647*8f0b8056SMahesh Salgaonkar VAL_TO_STRING(mc_err_types, error_type)); 648*8f0b8056SMahesh Salgaonkar break; 649*8f0b8056SMahesh Salgaonkar } 650*8f0b8056SMahesh Salgaonkar 651*8f0b8056SMahesh Salgaonkar addr = rtas_mc_get_effective_addr(mce_log); 652*8f0b8056SMahesh Salgaonkar if (addr) 653*8f0b8056SMahesh Salgaonkar printk("%s Effective address: %016llx\n", level, addr); 654*8f0b8056SMahesh Salgaonkar } 655*8f0b8056SMahesh Salgaonkar 656a43c1590SMahesh Salgaonkar static int mce_handle_error(struct rtas_error_log *errp) 657a43c1590SMahesh Salgaonkar { 658a43c1590SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 659a43c1590SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 660a43c1590SMahesh Salgaonkar int disposition = rtas_error_disposition(errp); 661a43c1590SMahesh Salgaonkar u8 error_type; 662a43c1590SMahesh Salgaonkar 663a43c1590SMahesh Salgaonkar if (!rtas_error_extended(errp)) 664a43c1590SMahesh Salgaonkar goto out; 665a43c1590SMahesh Salgaonkar 666a43c1590SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 667a43c1590SMahesh Salgaonkar if (pseries_log == NULL) 668a43c1590SMahesh Salgaonkar goto out; 669a43c1590SMahesh Salgaonkar 670a43c1590SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 671a43c1590SMahesh Salgaonkar error_type = mce_log->error_type; 672a43c1590SMahesh Salgaonkar 673a43c1590SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 674a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_NOT_RECOVERED) { 675a43c1590SMahesh Salgaonkar switch (error_type) { 676a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 677a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 678a43c1590SMahesh Salgaonkar /* Store the old slb content someplace. */ 679a43c1590SMahesh Salgaonkar flush_and_reload_slb(); 680a43c1590SMahesh Salgaonkar disposition = RTAS_DISP_FULLY_RECOVERED; 681a43c1590SMahesh Salgaonkar rtas_set_disposition_recovered(errp); 682a43c1590SMahesh Salgaonkar break; 683a43c1590SMahesh Salgaonkar default: 684a43c1590SMahesh Salgaonkar break; 685a43c1590SMahesh Salgaonkar } 686a43c1590SMahesh Salgaonkar } 687a43c1590SMahesh Salgaonkar #endif 688a43c1590SMahesh Salgaonkar 689a43c1590SMahesh Salgaonkar out: 690a43c1590SMahesh Salgaonkar return disposition; 691a43c1590SMahesh Salgaonkar } 692a43c1590SMahesh Salgaonkar 693d9953105SMichael Ellerman /* 69494675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 69594675cceSMahesh Salgaonkar */ 69694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 69794675cceSMahesh Salgaonkar { 69894675cceSMahesh Salgaonkar struct rtas_error_log *err; 69994675cceSMahesh Salgaonkar 70094675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 70194675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 70294675cceSMahesh Salgaonkar } 70394675cceSMahesh Salgaonkar 70494675cceSMahesh Salgaonkar /* 705d9953105SMichael Ellerman * See if we can recover from a machine check exception. 706d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 707d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 708d9953105SMichael Ellerman * which provides the error analysis for us. 709d9953105SMichael Ellerman * 710d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 711d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 712d9953105SMichael Ellerman */ 713d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 714d9953105SMichael Ellerman { 715d47d1d8aSAnton Blanchard int recovered = 0; 716a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 717d9953105SMichael Ellerman 718*8f0b8056SMahesh Salgaonkar pseries_print_mce_info(regs, err); 719*8f0b8056SMahesh Salgaonkar 720d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 721d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 722*8f0b8056SMahesh Salgaonkar pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 723d47d1d8aSAnton Blanchard recovered = 0; 724d47d1d8aSAnton Blanchard 725a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 726d9953105SMichael Ellerman /* Platform corrected itself */ 727d47d1d8aSAnton Blanchard recovered = 1; 728d47d1d8aSAnton Blanchard 729a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 730d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 731d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 732d47d1d8aSAnton Blanchard "be degraded\n"); 733d47d1d8aSAnton Blanchard recovered = 1; 734d47d1d8aSAnton Blanchard 735d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 736a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 737d47d1d8aSAnton Blanchard 738d47d1d8aSAnton Blanchard /* 739d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 740d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 741d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 742d47d1d8aSAnton Blanchard * fields being valid here. 743d47d1d8aSAnton Blanchard */ 744d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 745d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 746d47d1d8aSAnton Blanchard 747d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 748d47d1d8aSAnton Blanchard recovered = 1; 749d9953105SMichael Ellerman } 750d9953105SMichael Ellerman 75194675cceSMahesh Salgaonkar /* Queue irq work to log this rtas event later. */ 75294675cceSMahesh Salgaonkar irq_work_queue(&mce_errlog_process_work); 753d9953105SMichael Ellerman 754d47d1d8aSAnton Blanchard return recovered; 755d9953105SMichael Ellerman } 756d9953105SMichael Ellerman 757d9953105SMichael Ellerman /* 758d9953105SMichael Ellerman * Handle a machine check. 759d9953105SMichael Ellerman * 760d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 761d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 762d9953105SMichael Ellerman * error was recovered (never true if RI=0). 763d9953105SMichael Ellerman * 764d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 765d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 766d9953105SMichael Ellerman */ 767d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 768d9953105SMichael Ellerman { 769d9953105SMichael Ellerman struct rtas_error_log *errp; 770d9953105SMichael Ellerman 771d9953105SMichael Ellerman if (fwnmi_active) { 772d9953105SMichael Ellerman fwnmi_release_errinfo(); 773a43c1590SMahesh Salgaonkar errp = fwnmi_get_errlog(); 774d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 775d9953105SMichael Ellerman return 1; 776d9953105SMichael Ellerman } 777d9953105SMichael Ellerman 778d9953105SMichael Ellerman return 0; 779d9953105SMichael Ellerman } 780a43c1590SMahesh Salgaonkar 781a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 782a43c1590SMahesh Salgaonkar { 783a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 784a43c1590SMahesh Salgaonkar int disposition; 785a43c1590SMahesh Salgaonkar 786a43c1590SMahesh Salgaonkar if (fwnmi_active) { 787a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 788a43c1590SMahesh Salgaonkar /* 789a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 790a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 791a43c1590SMahesh Salgaonkar * virtual mode. 792a43c1590SMahesh Salgaonkar */ 793a43c1590SMahesh Salgaonkar disposition = mce_handle_error(errp); 794a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 795a43c1590SMahesh Salgaonkar return 1; 796a43c1590SMahesh Salgaonkar } 797a43c1590SMahesh Salgaonkar 798a43c1590SMahesh Salgaonkar return 0; 799a43c1590SMahesh Salgaonkar } 800