1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman #include <linux/sched.h> 20d9953105SMichael Ellerman #include <linux/interrupt.h> 21d9953105SMichael Ellerman #include <linux/irq.h> 2290128997SAnton Blanchard #include <linux/of.h> 2355fc0c56SAnton Blanchard #include <linux/fs.h> 2455fc0c56SAnton Blanchard #include <linux/reboot.h> 2594675cceSMahesh Salgaonkar #include <linux/irq_work.h> 26d9953105SMichael Ellerman 27d9953105SMichael Ellerman #include <asm/machdep.h> 28d9953105SMichael Ellerman #include <asm/rtas.h> 298c4f1f29SMichael Ellerman #include <asm/firmware.h> 30*a43c1590SMahesh Salgaonkar #include <asm/mce.h> 31d9953105SMichael Ellerman 32577830b0SMichael Ellerman #include "pseries.h" 33c902be71SArnd Bergmann 34d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 35d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 36d9953105SMichael Ellerman 37d9953105SMichael Ellerman static int ras_check_exception_token; 38d9953105SMichael Ellerman 3994675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 4094675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 4194675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 4294675cceSMahesh Salgaonkar }; 4394675cceSMahesh Salgaonkar 44d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 45d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 46d9953105SMichael Ellerman 47b4af279aSVipin K Parashar /* EPOW events counter variable */ 48b4af279aSVipin K Parashar static int num_epow_events; 49b4af279aSVipin K Parashar 50b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 517d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 527d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 53d9953105SMichael Ellerman 5404fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 5504fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 5604fce21cSMahesh Salgaonkar __be32 fru_id; 5704fce21cSMahesh Salgaonkar __be32 proc_id; 5804fce21cSMahesh Salgaonkar u8 error_type; 5904fce21cSMahesh Salgaonkar /* 6004fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 6104fce21cSMahesh Salgaonkar * 6204fce21cSMahesh Salgaonkar * MSB0 6304fce21cSMahesh Salgaonkar * | 6404fce21cSMahesh Salgaonkar * V 6504fce21cSMahesh Salgaonkar * 01234567 6604fce21cSMahesh Salgaonkar * XXXXXXXX 6704fce21cSMahesh Salgaonkar * 6804fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 6904fce21cSMahesh Salgaonkar * XXXXXXXX 7004fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 7104fce21cSMahesh Salgaonkar * X 1: Effective address provided. 7204fce21cSMahesh Salgaonkar * X 1: Logical address provided. 7304fce21cSMahesh Salgaonkar * XX 2: Reserved. 7404fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 7504fce21cSMahesh Salgaonkar * 7604fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 7704fce21cSMahesh Salgaonkar * XXXXXXXX 7804fce21cSMahesh Salgaonkar * X 1: Effective address provided. 7904fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 8004fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 8104fce21cSMahesh Salgaonkar */ 8204fce21cSMahesh Salgaonkar u8 sub_err_type; 8304fce21cSMahesh Salgaonkar u8 reserved_1[6]; 8404fce21cSMahesh Salgaonkar __be64 effective_address; 8504fce21cSMahesh Salgaonkar __be64 logical_address; 8604fce21cSMahesh Salgaonkar } __packed; 8704fce21cSMahesh Salgaonkar 8804fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 8904fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 9004fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 9104fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 9204fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 9304fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 9404fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 9504fce21cSMahesh Salgaonkar 9604fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 9704fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 9804fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 9904fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 10004fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 10104fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 10204fce21cSMahesh Salgaonkar 10304fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 10404fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 10504fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 10604fce21cSMahesh Salgaonkar 10704fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 10804fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 10904fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 11004fce21cSMahesh Salgaonkar 11104fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 11204fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 11304fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 11404fce21cSMahesh Salgaonkar 11504fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 11604fce21cSMahesh Salgaonkar { 11704fce21cSMahesh Salgaonkar switch (mlog->error_type) { 11804fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 11904fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 12004fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 12104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 12204fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 12304fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 12404fce21cSMahesh Salgaonkar default: 12504fce21cSMahesh Salgaonkar return 0; 12604fce21cSMahesh Salgaonkar } 12704fce21cSMahesh Salgaonkar } 12804fce21cSMahesh Salgaonkar 12904fce21cSMahesh Salgaonkar static 13004fce21cSMahesh Salgaonkar inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) 13104fce21cSMahesh Salgaonkar { 13204fce21cSMahesh Salgaonkar __be64 addr = 0; 13304fce21cSMahesh Salgaonkar 13404fce21cSMahesh Salgaonkar switch (mlog->error_type) { 13504fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 13604fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x40) 13704fce21cSMahesh Salgaonkar addr = mlog->effective_address; 13804fce21cSMahesh Salgaonkar break; 13904fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 14004fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 14104fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 14204fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x80) 14304fce21cSMahesh Salgaonkar addr = mlog->effective_address; 14404fce21cSMahesh Salgaonkar default: 14504fce21cSMahesh Salgaonkar break; 14604fce21cSMahesh Salgaonkar } 14704fce21cSMahesh Salgaonkar return be64_to_cpu(addr); 14804fce21cSMahesh Salgaonkar } 1490ebfff14SBenjamin Herrenschmidt 150d9953105SMichael Ellerman /* 151c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 152c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 153c9dccf1dSSam Bobroff * subsys stage. 154c9dccf1dSSam Bobroff */ 155c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 156c9dccf1dSSam Bobroff { 157c9dccf1dSSam Bobroff struct device_node *np; 158c9dccf1dSSam Bobroff 159c9dccf1dSSam Bobroff /* Hotplug Events */ 160c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 161c9dccf1dSSam Bobroff if (np != NULL) { 162c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 163c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 164c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 165c9dccf1dSSam Bobroff of_node_put(np); 166c9dccf1dSSam Bobroff } 167c9dccf1dSSam Bobroff 168c9dccf1dSSam Bobroff return 0; 169c9dccf1dSSam Bobroff } 170c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 171c9dccf1dSSam Bobroff 172c9dccf1dSSam Bobroff /* 173d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 174d9953105SMichael Ellerman * and power system events. 175d9953105SMichael Ellerman */ 176d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 177d9953105SMichael Ellerman { 178d9953105SMichael Ellerman struct device_node *np; 179d9953105SMichael Ellerman 180d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 181d9953105SMichael Ellerman 182d9953105SMichael Ellerman /* Internal Errors */ 183d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 184d9953105SMichael Ellerman if (np != NULL) { 18532c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 18632c96f77SMark Nelson "RAS_ERROR"); 187d9953105SMichael Ellerman of_node_put(np); 188d9953105SMichael Ellerman } 189d9953105SMichael Ellerman 190d9953105SMichael Ellerman /* EPOW Events */ 191d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 192d9953105SMichael Ellerman if (np != NULL) { 19332c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 194d9953105SMichael Ellerman of_node_put(np); 195d9953105SMichael Ellerman } 196d9953105SMichael Ellerman 19769ed3324SAnton Blanchard return 0; 198d9953105SMichael Ellerman } 1998e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 200d9953105SMichael Ellerman 20155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 20255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 20355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 20455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 20555fc0c56SAnton Blanchard 20655fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 20755fc0c56SAnton Blanchard { 20855fc0c56SAnton Blanchard switch (event_modifier) { 20955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 210b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 2111b7e0cbeSliguang orderly_poweroff(true); 21255fc0c56SAnton Blanchard break; 21355fc0c56SAnton Blanchard 21455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 215b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 216b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 21779872e35SAnshuman Khandual orderly_poweroff(true); 21855fc0c56SAnton Blanchard break; 21955fc0c56SAnton Blanchard 22055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 221b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 222b4af279aSVipin K Parashar " RTAS error log for details\n"); 2231b7e0cbeSliguang orderly_poweroff(true); 22455fc0c56SAnton Blanchard break; 22555fc0c56SAnton Blanchard 22655fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 227b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 228b4af279aSVipin K Parashar " error log for details\n"); 2291b7e0cbeSliguang orderly_poweroff(true); 23055fc0c56SAnton Blanchard break; 23155fc0c56SAnton Blanchard 23255fc0c56SAnton Blanchard default: 233b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 23455fc0c56SAnton Blanchard event_modifier); 23555fc0c56SAnton Blanchard } 23655fc0c56SAnton Blanchard } 23755fc0c56SAnton Blanchard 23855fc0c56SAnton Blanchard struct epow_errorlog { 23955fc0c56SAnton Blanchard unsigned char sensor_value; 24055fc0c56SAnton Blanchard unsigned char event_modifier; 24155fc0c56SAnton Blanchard unsigned char extended_modifier; 24255fc0c56SAnton Blanchard unsigned char reserved; 24355fc0c56SAnton Blanchard unsigned char platform_reason; 24455fc0c56SAnton Blanchard }; 24555fc0c56SAnton Blanchard 24655fc0c56SAnton Blanchard #define EPOW_RESET 0 24755fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 24855fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 24955fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 25055fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 25155fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 25255fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 25355fc0c56SAnton Blanchard 254e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 25555fc0c56SAnton Blanchard { 25655fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 25755fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 25855fc0c56SAnton Blanchard char action_code; 25955fc0c56SAnton Blanchard char modifier; 26055fc0c56SAnton Blanchard 26155fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 26255fc0c56SAnton Blanchard if (pseries_log == NULL) 26355fc0c56SAnton Blanchard return; 26455fc0c56SAnton Blanchard 26555fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 26655fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 26755fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 26855fc0c56SAnton Blanchard 26955fc0c56SAnton Blanchard switch (action_code) { 27055fc0c56SAnton Blanchard case EPOW_RESET: 271b4af279aSVipin K Parashar if (num_epow_events) { 272b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 273b4af279aSVipin K Parashar num_epow_events--; 274b4af279aSVipin K Parashar } 27555fc0c56SAnton Blanchard break; 27655fc0c56SAnton Blanchard 27755fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 278b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 279b4af279aSVipin K Parashar " log for details\n"); 28055fc0c56SAnton Blanchard break; 28155fc0c56SAnton Blanchard 28255fc0c56SAnton Blanchard case EPOW_WARN_POWER: 283b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 284b4af279aSVipin K Parashar " log for details\n"); 28555fc0c56SAnton Blanchard break; 28655fc0c56SAnton Blanchard 28755fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 28855fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 28955fc0c56SAnton Blanchard break; 29055fc0c56SAnton Blanchard 29155fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 292b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 293b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2941b7e0cbeSliguang orderly_poweroff(true); 29555fc0c56SAnton Blanchard break; 29655fc0c56SAnton Blanchard 29755fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 29855fc0c56SAnton Blanchard case EPOW_POWER_OFF: 299b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 300b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 30155fc0c56SAnton Blanchard emergency_sync(); 30255fc0c56SAnton Blanchard kernel_power_off(); 30355fc0c56SAnton Blanchard break; 30455fc0c56SAnton Blanchard 30555fc0c56SAnton Blanchard default: 306b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 30755fc0c56SAnton Blanchard action_code); 30855fc0c56SAnton Blanchard } 309b4af279aSVipin K Parashar 310b4af279aSVipin K Parashar /* Increment epow events counter variable */ 311b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 312b4af279aSVipin K Parashar num_epow_events++; 31355fc0c56SAnton Blanchard } 31455fc0c56SAnton Blanchard 315b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 316b7d9eb39SJohn Allen { 317b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 318b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 319b7d9eb39SJohn Allen 320b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 321b7d9eb39SJohn Allen 322b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 323b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 324b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 325b7d9eb39SJohn Allen rtas_get_error_log_max()); 326b7d9eb39SJohn Allen 327b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 328b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 329b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 330b7d9eb39SJohn Allen 331b7d9eb39SJohn Allen /* 332b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 333b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 334b7d9eb39SJohn Allen */ 335b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 336b7d9eb39SJohn Allen hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) 337b7d9eb39SJohn Allen queue_hotplug_event(hp_elog, NULL, NULL); 338b7d9eb39SJohn Allen else 339b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 340b7d9eb39SJohn Allen 341b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 342b7d9eb39SJohn Allen return IRQ_HANDLED; 343b7d9eb39SJohn Allen } 344b7d9eb39SJohn Allen 34555fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3467d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 347d9953105SMichael Ellerman { 34855fc0c56SAnton Blanchard int status; 34955fc0c56SAnton Blanchard int state; 350d9953105SMichael Ellerman int critical; 351d9953105SMichael Ellerman 3521c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3531c2cb594SThomas Huth &state); 354d9953105SMichael Ellerman 355d9953105SMichael Ellerman if (state > 3) 356d9953105SMichael Ellerman critical = 1; /* Time Critical */ 357d9953105SMichael Ellerman else 358d9953105SMichael Ellerman critical = 0; 359d9953105SMichael Ellerman 360d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 361d9953105SMichael Ellerman 362d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 363b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 364476eb491SGrant Likely virq_to_hw(irq), 3656f43747fSAnton Blanchard RTAS_EPOW_WARNING, 366d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 367d9953105SMichael Ellerman rtas_get_error_log_max()); 368d9953105SMichael Ellerman 369d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 370d9953105SMichael Ellerman 37155fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 37255fc0c56SAnton Blanchard 373d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 374d9953105SMichael Ellerman return IRQ_HANDLED; 375d9953105SMichael Ellerman } 376d9953105SMichael Ellerman 377d9953105SMichael Ellerman /* 378d9953105SMichael Ellerman * Handle hardware error interrupts. 379d9953105SMichael Ellerman * 380d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 381d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 382d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 383d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 384d9953105SMichael Ellerman */ 3857d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 386d9953105SMichael Ellerman { 387d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 388cc8b5263SAnton Blanchard int status; 389d9953105SMichael Ellerman int fatal; 390d9953105SMichael Ellerman 391d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 392d9953105SMichael Ellerman 393d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 394b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 395476eb491SGrant Likely virq_to_hw(irq), 396d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 397d9953105SMichael Ellerman __pa(&ras_log_buf), 398d9953105SMichael Ellerman rtas_get_error_log_max()); 399d9953105SMichael Ellerman 400d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 401d9953105SMichael Ellerman 402a08a53eaSGreg Kurz if (status == 0 && 403a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 404d9953105SMichael Ellerman fatal = 1; 405d9953105SMichael Ellerman else 406d9953105SMichael Ellerman fatal = 0; 407d9953105SMichael Ellerman 408d9953105SMichael Ellerman /* format and print the extended information */ 409d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 410d9953105SMichael Ellerman 411d9953105SMichael Ellerman if (fatal) { 412b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 413b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 414cc8b5263SAnton Blanchard emergency_sync(); 415cc8b5263SAnton Blanchard kernel_power_off(); 416d9953105SMichael Ellerman } else { 417b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 418d9953105SMichael Ellerman } 419d9953105SMichael Ellerman 420d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 421d9953105SMichael Ellerman return IRQ_HANDLED; 422d9953105SMichael Ellerman } 423d9953105SMichael Ellerman 424d368514cSAnton Blanchard /* 425d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 426d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 427d368514cSAnton Blanchard */ 428d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 429d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 430d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 431d368514cSAnton Blanchard 43294675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 43394675cceSMahesh Salgaonkar { 43494675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 43594675cceSMahesh Salgaonkar } 43694675cceSMahesh Salgaonkar 437d368514cSAnton Blanchard /* 438d368514cSAnton Blanchard * Get the error information for errors coming through the 439d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 440d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 441d9953105SMichael Ellerman * will be returned if found. 442d9953105SMichael Ellerman * 44394675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 444d368514cSAnton Blanchard * 44594675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 446d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 447d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 448d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 449d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 450d9953105SMichael Ellerman * second machine check did come in. 451d9953105SMichael Ellerman */ 452d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 453d9953105SMichael Ellerman { 454d9953105SMichael Ellerman unsigned long *savep; 45594675cceSMahesh Salgaonkar struct rtas_error_log *h; 456d9953105SMichael Ellerman 457ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 458ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 459ee1dd1e3SMahesh Salgaonkar 460d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 461f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 462d368514cSAnton Blanchard return NULL; 463d9953105SMichael Ellerman } 464d368514cSAnton Blanchard 465d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 466cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 467d368514cSAnton Blanchard 468d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 46994675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 47094675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 471a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 47294675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 473d368514cSAnton Blanchard } else { 474a08a53eaSGreg Kurz int len, error_log_length; 475d368514cSAnton Blanchard 476a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 47774e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 47894675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 479d368514cSAnton Blanchard } 480d368514cSAnton Blanchard 48194675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 482d9953105SMichael Ellerman } 483d9953105SMichael Ellerman 484d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 485d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 486d9953105SMichael Ellerman * partition to receive FWNMI errors. 487d9953105SMichael Ellerman */ 488d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 489d9953105SMichael Ellerman { 490d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 491d9953105SMichael Ellerman if (ret != 0) 492d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 493d9953105SMichael Ellerman } 494d9953105SMichael Ellerman 495c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 496d9953105SMichael Ellerman { 497bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 498bded0706SNicholas Piggin /* 499bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 500bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 501bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 502bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 503bded0706SNicholas Piggin */ 504bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 505bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 506bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 507bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 508bded0706SNicholas Piggin regs->msr = 0; 509bded0706SNicholas Piggin } 510bded0706SNicholas Piggin #endif 511bded0706SNicholas Piggin 512d9953105SMichael Ellerman if (fwnmi_active) { 513d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 514d9953105SMichael Ellerman if (errhdr) { 515d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 516d9953105SMichael Ellerman } 517d9953105SMichael Ellerman fwnmi_release_errinfo(); 518d9953105SMichael Ellerman } 519102c05e8SNicholas Piggin 520102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 521102c05e8SNicholas Piggin return 1; 522102c05e8SNicholas Piggin 523c902be71SArnd Bergmann return 0; /* need to perform reset */ 524d9953105SMichael Ellerman } 525d9953105SMichael Ellerman 526*a43c1590SMahesh Salgaonkar static int mce_handle_error(struct rtas_error_log *errp) 527*a43c1590SMahesh Salgaonkar { 528*a43c1590SMahesh Salgaonkar struct pseries_errorlog *pseries_log; 529*a43c1590SMahesh Salgaonkar struct pseries_mc_errorlog *mce_log; 530*a43c1590SMahesh Salgaonkar int disposition = rtas_error_disposition(errp); 531*a43c1590SMahesh Salgaonkar u8 error_type; 532*a43c1590SMahesh Salgaonkar 533*a43c1590SMahesh Salgaonkar if (!rtas_error_extended(errp)) 534*a43c1590SMahesh Salgaonkar goto out; 535*a43c1590SMahesh Salgaonkar 536*a43c1590SMahesh Salgaonkar pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 537*a43c1590SMahesh Salgaonkar if (pseries_log == NULL) 538*a43c1590SMahesh Salgaonkar goto out; 539*a43c1590SMahesh Salgaonkar 540*a43c1590SMahesh Salgaonkar mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 541*a43c1590SMahesh Salgaonkar error_type = mce_log->error_type; 542*a43c1590SMahesh Salgaonkar 543*a43c1590SMahesh Salgaonkar #ifdef CONFIG_PPC_BOOK3S_64 544*a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_NOT_RECOVERED) { 545*a43c1590SMahesh Salgaonkar switch (error_type) { 546*a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 547*a43c1590SMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 548*a43c1590SMahesh Salgaonkar /* Store the old slb content someplace. */ 549*a43c1590SMahesh Salgaonkar flush_and_reload_slb(); 550*a43c1590SMahesh Salgaonkar disposition = RTAS_DISP_FULLY_RECOVERED; 551*a43c1590SMahesh Salgaonkar rtas_set_disposition_recovered(errp); 552*a43c1590SMahesh Salgaonkar break; 553*a43c1590SMahesh Salgaonkar default: 554*a43c1590SMahesh Salgaonkar break; 555*a43c1590SMahesh Salgaonkar } 556*a43c1590SMahesh Salgaonkar } 557*a43c1590SMahesh Salgaonkar #endif 558*a43c1590SMahesh Salgaonkar 559*a43c1590SMahesh Salgaonkar out: 560*a43c1590SMahesh Salgaonkar return disposition; 561*a43c1590SMahesh Salgaonkar } 562*a43c1590SMahesh Salgaonkar 563d9953105SMichael Ellerman /* 56494675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 56594675cceSMahesh Salgaonkar */ 56694675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 56794675cceSMahesh Salgaonkar { 56894675cceSMahesh Salgaonkar struct rtas_error_log *err; 56994675cceSMahesh Salgaonkar 57094675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 57194675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 57294675cceSMahesh Salgaonkar } 57394675cceSMahesh Salgaonkar 57494675cceSMahesh Salgaonkar /* 575d9953105SMichael Ellerman * See if we can recover from a machine check exception. 576d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 577d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 578d9953105SMichael Ellerman * which provides the error analysis for us. 579d9953105SMichael Ellerman * 580d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 581d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 582d9953105SMichael Ellerman */ 583d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 584d9953105SMichael Ellerman { 585d47d1d8aSAnton Blanchard int recovered = 0; 586a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 587d9953105SMichael Ellerman 588d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 589d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 590d47d1d8aSAnton Blanchard recovered = 0; 591d47d1d8aSAnton Blanchard 592a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 593d9953105SMichael Ellerman /* Platform corrected itself */ 594d47d1d8aSAnton Blanchard recovered = 1; 595d47d1d8aSAnton Blanchard 596a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 597d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 598d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 599d47d1d8aSAnton Blanchard "be degraded\n"); 600d47d1d8aSAnton Blanchard recovered = 1; 601d47d1d8aSAnton Blanchard 602d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 603a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 604d47d1d8aSAnton Blanchard 605d47d1d8aSAnton Blanchard /* 606d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 607d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 608d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 609d47d1d8aSAnton Blanchard * fields being valid here. 610d47d1d8aSAnton Blanchard */ 611d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 612d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 613d47d1d8aSAnton Blanchard 614d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 615d47d1d8aSAnton Blanchard recovered = 1; 616d9953105SMichael Ellerman } 617d9953105SMichael Ellerman 61894675cceSMahesh Salgaonkar /* Queue irq work to log this rtas event later. */ 61994675cceSMahesh Salgaonkar irq_work_queue(&mce_errlog_process_work); 620d9953105SMichael Ellerman 621d47d1d8aSAnton Blanchard return recovered; 622d9953105SMichael Ellerman } 623d9953105SMichael Ellerman 624d9953105SMichael Ellerman /* 625d9953105SMichael Ellerman * Handle a machine check. 626d9953105SMichael Ellerman * 627d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 628d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 629d9953105SMichael Ellerman * error was recovered (never true if RI=0). 630d9953105SMichael Ellerman * 631d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 632d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 633d9953105SMichael Ellerman */ 634d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 635d9953105SMichael Ellerman { 636d9953105SMichael Ellerman struct rtas_error_log *errp; 637d9953105SMichael Ellerman 638d9953105SMichael Ellerman if (fwnmi_active) { 639d9953105SMichael Ellerman fwnmi_release_errinfo(); 640*a43c1590SMahesh Salgaonkar errp = fwnmi_get_errlog(); 641d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 642d9953105SMichael Ellerman return 1; 643d9953105SMichael Ellerman } 644d9953105SMichael Ellerman 645d9953105SMichael Ellerman return 0; 646d9953105SMichael Ellerman } 647*a43c1590SMahesh Salgaonkar 648*a43c1590SMahesh Salgaonkar long pseries_machine_check_realmode(struct pt_regs *regs) 649*a43c1590SMahesh Salgaonkar { 650*a43c1590SMahesh Salgaonkar struct rtas_error_log *errp; 651*a43c1590SMahesh Salgaonkar int disposition; 652*a43c1590SMahesh Salgaonkar 653*a43c1590SMahesh Salgaonkar if (fwnmi_active) { 654*a43c1590SMahesh Salgaonkar errp = fwnmi_get_errinfo(regs); 655*a43c1590SMahesh Salgaonkar /* 656*a43c1590SMahesh Salgaonkar * Call to fwnmi_release_errinfo() in real mode causes kernel 657*a43c1590SMahesh Salgaonkar * to panic. Hence we will call it as soon as we go into 658*a43c1590SMahesh Salgaonkar * virtual mode. 659*a43c1590SMahesh Salgaonkar */ 660*a43c1590SMahesh Salgaonkar disposition = mce_handle_error(errp); 661*a43c1590SMahesh Salgaonkar if (disposition == RTAS_DISP_FULLY_RECOVERED) 662*a43c1590SMahesh Salgaonkar return 1; 663*a43c1590SMahesh Salgaonkar } 664*a43c1590SMahesh Salgaonkar 665*a43c1590SMahesh Salgaonkar return 0; 666*a43c1590SMahesh Salgaonkar } 667