1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman #include <linux/sched.h> 20d9953105SMichael Ellerman #include <linux/interrupt.h> 21d9953105SMichael Ellerman #include <linux/irq.h> 2290128997SAnton Blanchard #include <linux/of.h> 2355fc0c56SAnton Blanchard #include <linux/fs.h> 2455fc0c56SAnton Blanchard #include <linux/reboot.h> 2594675cceSMahesh Salgaonkar #include <linux/irq_work.h> 26d9953105SMichael Ellerman 27d9953105SMichael Ellerman #include <asm/machdep.h> 28d9953105SMichael Ellerman #include <asm/rtas.h> 298c4f1f29SMichael Ellerman #include <asm/firmware.h> 30d9953105SMichael Ellerman 31577830b0SMichael Ellerman #include "pseries.h" 32c902be71SArnd Bergmann 33d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 34d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 35d9953105SMichael Ellerman 36d9953105SMichael Ellerman static int ras_check_exception_token; 37d9953105SMichael Ellerman 3894675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 3994675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 4094675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 4194675cceSMahesh Salgaonkar }; 4294675cceSMahesh Salgaonkar 43d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 44d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 45d9953105SMichael Ellerman 46b4af279aSVipin K Parashar /* EPOW events counter variable */ 47b4af279aSVipin K Parashar static int num_epow_events; 48b4af279aSVipin K Parashar 49b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 507d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 517d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 52d9953105SMichael Ellerman 53*04fce21cSMahesh Salgaonkar /* RTAS pseries MCE errorlog section. */ 54*04fce21cSMahesh Salgaonkar struct pseries_mc_errorlog { 55*04fce21cSMahesh Salgaonkar __be32 fru_id; 56*04fce21cSMahesh Salgaonkar __be32 proc_id; 57*04fce21cSMahesh Salgaonkar u8 error_type; 58*04fce21cSMahesh Salgaonkar /* 59*04fce21cSMahesh Salgaonkar * sub_err_type (1 byte). Bit fields depends on error_type 60*04fce21cSMahesh Salgaonkar * 61*04fce21cSMahesh Salgaonkar * MSB0 62*04fce21cSMahesh Salgaonkar * | 63*04fce21cSMahesh Salgaonkar * V 64*04fce21cSMahesh Salgaonkar * 01234567 65*04fce21cSMahesh Salgaonkar * XXXXXXXX 66*04fce21cSMahesh Salgaonkar * 67*04fce21cSMahesh Salgaonkar * For error_type == MC_ERROR_TYPE_UE 68*04fce21cSMahesh Salgaonkar * XXXXXXXX 69*04fce21cSMahesh Salgaonkar * X 1: Permanent or Transient UE. 70*04fce21cSMahesh Salgaonkar * X 1: Effective address provided. 71*04fce21cSMahesh Salgaonkar * X 1: Logical address provided. 72*04fce21cSMahesh Salgaonkar * XX 2: Reserved. 73*04fce21cSMahesh Salgaonkar * XXX 3: Type of UE error. 74*04fce21cSMahesh Salgaonkar * 75*04fce21cSMahesh Salgaonkar * For error_type != MC_ERROR_TYPE_UE 76*04fce21cSMahesh Salgaonkar * XXXXXXXX 77*04fce21cSMahesh Salgaonkar * X 1: Effective address provided. 78*04fce21cSMahesh Salgaonkar * XXXXX 5: Reserved. 79*04fce21cSMahesh Salgaonkar * XX 2: Type of SLB/ERAT/TLB error. 80*04fce21cSMahesh Salgaonkar */ 81*04fce21cSMahesh Salgaonkar u8 sub_err_type; 82*04fce21cSMahesh Salgaonkar u8 reserved_1[6]; 83*04fce21cSMahesh Salgaonkar __be64 effective_address; 84*04fce21cSMahesh Salgaonkar __be64 logical_address; 85*04fce21cSMahesh Salgaonkar } __packed; 86*04fce21cSMahesh Salgaonkar 87*04fce21cSMahesh Salgaonkar /* RTAS pseries MCE error types */ 88*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_UE 0x00 89*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_SLB 0x01 90*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_ERAT 0x02 91*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_TLB 0x04 92*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_D_CACHE 0x05 93*04fce21cSMahesh Salgaonkar #define MC_ERROR_TYPE_I_CACHE 0x07 94*04fce21cSMahesh Salgaonkar 95*04fce21cSMahesh Salgaonkar /* RTAS pseries MCE error sub types */ 96*04fce21cSMahesh Salgaonkar #define MC_ERROR_UE_INDETERMINATE 0 97*04fce21cSMahesh Salgaonkar #define MC_ERROR_UE_IFETCH 1 98*04fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 99*04fce21cSMahesh Salgaonkar #define MC_ERROR_UE_LOAD_STORE 3 100*04fce21cSMahesh Salgaonkar #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 101*04fce21cSMahesh Salgaonkar 102*04fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_PARITY 0 103*04fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_MULTIHIT 1 104*04fce21cSMahesh Salgaonkar #define MC_ERROR_SLB_INDETERMINATE 2 105*04fce21cSMahesh Salgaonkar 106*04fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_PARITY 1 107*04fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_MULTIHIT 2 108*04fce21cSMahesh Salgaonkar #define MC_ERROR_ERAT_INDETERMINATE 3 109*04fce21cSMahesh Salgaonkar 110*04fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_PARITY 1 111*04fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_MULTIHIT 2 112*04fce21cSMahesh Salgaonkar #define MC_ERROR_TLB_INDETERMINATE 3 113*04fce21cSMahesh Salgaonkar 114*04fce21cSMahesh Salgaonkar static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) 115*04fce21cSMahesh Salgaonkar { 116*04fce21cSMahesh Salgaonkar switch (mlog->error_type) { 117*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 118*04fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x07); 119*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 120*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 121*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 122*04fce21cSMahesh Salgaonkar return (mlog->sub_err_type & 0x03); 123*04fce21cSMahesh Salgaonkar default: 124*04fce21cSMahesh Salgaonkar return 0; 125*04fce21cSMahesh Salgaonkar } 126*04fce21cSMahesh Salgaonkar } 127*04fce21cSMahesh Salgaonkar 128*04fce21cSMahesh Salgaonkar static 129*04fce21cSMahesh Salgaonkar inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) 130*04fce21cSMahesh Salgaonkar { 131*04fce21cSMahesh Salgaonkar __be64 addr = 0; 132*04fce21cSMahesh Salgaonkar 133*04fce21cSMahesh Salgaonkar switch (mlog->error_type) { 134*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_UE: 135*04fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x40) 136*04fce21cSMahesh Salgaonkar addr = mlog->effective_address; 137*04fce21cSMahesh Salgaonkar break; 138*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_SLB: 139*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_ERAT: 140*04fce21cSMahesh Salgaonkar case MC_ERROR_TYPE_TLB: 141*04fce21cSMahesh Salgaonkar if (mlog->sub_err_type & 0x80) 142*04fce21cSMahesh Salgaonkar addr = mlog->effective_address; 143*04fce21cSMahesh Salgaonkar default: 144*04fce21cSMahesh Salgaonkar break; 145*04fce21cSMahesh Salgaonkar } 146*04fce21cSMahesh Salgaonkar return be64_to_cpu(addr); 147*04fce21cSMahesh Salgaonkar } 1480ebfff14SBenjamin Herrenschmidt 149d9953105SMichael Ellerman /* 150c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 151c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 152c9dccf1dSSam Bobroff * subsys stage. 153c9dccf1dSSam Bobroff */ 154c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 155c9dccf1dSSam Bobroff { 156c9dccf1dSSam Bobroff struct device_node *np; 157c9dccf1dSSam Bobroff 158c9dccf1dSSam Bobroff /* Hotplug Events */ 159c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 160c9dccf1dSSam Bobroff if (np != NULL) { 161c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 162c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 163c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 164c9dccf1dSSam Bobroff of_node_put(np); 165c9dccf1dSSam Bobroff } 166c9dccf1dSSam Bobroff 167c9dccf1dSSam Bobroff return 0; 168c9dccf1dSSam Bobroff } 169c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 170c9dccf1dSSam Bobroff 171c9dccf1dSSam Bobroff /* 172d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 173d9953105SMichael Ellerman * and power system events. 174d9953105SMichael Ellerman */ 175d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 176d9953105SMichael Ellerman { 177d9953105SMichael Ellerman struct device_node *np; 178d9953105SMichael Ellerman 179d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 180d9953105SMichael Ellerman 181d9953105SMichael Ellerman /* Internal Errors */ 182d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 183d9953105SMichael Ellerman if (np != NULL) { 18432c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 18532c96f77SMark Nelson "RAS_ERROR"); 186d9953105SMichael Ellerman of_node_put(np); 187d9953105SMichael Ellerman } 188d9953105SMichael Ellerman 189d9953105SMichael Ellerman /* EPOW Events */ 190d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 191d9953105SMichael Ellerman if (np != NULL) { 19232c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 193d9953105SMichael Ellerman of_node_put(np); 194d9953105SMichael Ellerman } 195d9953105SMichael Ellerman 19669ed3324SAnton Blanchard return 0; 197d9953105SMichael Ellerman } 1988e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 199d9953105SMichael Ellerman 20055fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 20155fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 20255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 20355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 20455fc0c56SAnton Blanchard 20555fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 20655fc0c56SAnton Blanchard { 20755fc0c56SAnton Blanchard switch (event_modifier) { 20855fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 209b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 2101b7e0cbeSliguang orderly_poweroff(true); 21155fc0c56SAnton Blanchard break; 21255fc0c56SAnton Blanchard 21355fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 214b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 215b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 21679872e35SAnshuman Khandual orderly_poweroff(true); 21755fc0c56SAnton Blanchard break; 21855fc0c56SAnton Blanchard 21955fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 220b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 221b4af279aSVipin K Parashar " RTAS error log for details\n"); 2221b7e0cbeSliguang orderly_poweroff(true); 22355fc0c56SAnton Blanchard break; 22455fc0c56SAnton Blanchard 22555fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 226b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 227b4af279aSVipin K Parashar " error log for details\n"); 2281b7e0cbeSliguang orderly_poweroff(true); 22955fc0c56SAnton Blanchard break; 23055fc0c56SAnton Blanchard 23155fc0c56SAnton Blanchard default: 232b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 23355fc0c56SAnton Blanchard event_modifier); 23455fc0c56SAnton Blanchard } 23555fc0c56SAnton Blanchard } 23655fc0c56SAnton Blanchard 23755fc0c56SAnton Blanchard struct epow_errorlog { 23855fc0c56SAnton Blanchard unsigned char sensor_value; 23955fc0c56SAnton Blanchard unsigned char event_modifier; 24055fc0c56SAnton Blanchard unsigned char extended_modifier; 24155fc0c56SAnton Blanchard unsigned char reserved; 24255fc0c56SAnton Blanchard unsigned char platform_reason; 24355fc0c56SAnton Blanchard }; 24455fc0c56SAnton Blanchard 24555fc0c56SAnton Blanchard #define EPOW_RESET 0 24655fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 24755fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 24855fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 24955fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 25055fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 25155fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 25255fc0c56SAnton Blanchard 253e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 25455fc0c56SAnton Blanchard { 25555fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 25655fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 25755fc0c56SAnton Blanchard char action_code; 25855fc0c56SAnton Blanchard char modifier; 25955fc0c56SAnton Blanchard 26055fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 26155fc0c56SAnton Blanchard if (pseries_log == NULL) 26255fc0c56SAnton Blanchard return; 26355fc0c56SAnton Blanchard 26455fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 26555fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 26655fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 26755fc0c56SAnton Blanchard 26855fc0c56SAnton Blanchard switch (action_code) { 26955fc0c56SAnton Blanchard case EPOW_RESET: 270b4af279aSVipin K Parashar if (num_epow_events) { 271b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 272b4af279aSVipin K Parashar num_epow_events--; 273b4af279aSVipin K Parashar } 27455fc0c56SAnton Blanchard break; 27555fc0c56SAnton Blanchard 27655fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 277b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 278b4af279aSVipin K Parashar " log for details\n"); 27955fc0c56SAnton Blanchard break; 28055fc0c56SAnton Blanchard 28155fc0c56SAnton Blanchard case EPOW_WARN_POWER: 282b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 283b4af279aSVipin K Parashar " log for details\n"); 28455fc0c56SAnton Blanchard break; 28555fc0c56SAnton Blanchard 28655fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 28755fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 28855fc0c56SAnton Blanchard break; 28955fc0c56SAnton Blanchard 29055fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 291b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 292b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 2931b7e0cbeSliguang orderly_poweroff(true); 29455fc0c56SAnton Blanchard break; 29555fc0c56SAnton Blanchard 29655fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 29755fc0c56SAnton Blanchard case EPOW_POWER_OFF: 298b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 299b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 30055fc0c56SAnton Blanchard emergency_sync(); 30155fc0c56SAnton Blanchard kernel_power_off(); 30255fc0c56SAnton Blanchard break; 30355fc0c56SAnton Blanchard 30455fc0c56SAnton Blanchard default: 305b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 30655fc0c56SAnton Blanchard action_code); 30755fc0c56SAnton Blanchard } 308b4af279aSVipin K Parashar 309b4af279aSVipin K Parashar /* Increment epow events counter variable */ 310b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 311b4af279aSVipin K Parashar num_epow_events++; 31255fc0c56SAnton Blanchard } 31355fc0c56SAnton Blanchard 314b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 315b7d9eb39SJohn Allen { 316b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 317b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 318b7d9eb39SJohn Allen 319b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 320b7d9eb39SJohn Allen 321b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 322b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 323b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 324b7d9eb39SJohn Allen rtas_get_error_log_max()); 325b7d9eb39SJohn Allen 326b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 327b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 328b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 329b7d9eb39SJohn Allen 330b7d9eb39SJohn Allen /* 331b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 332b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 333b7d9eb39SJohn Allen */ 334b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 335b7d9eb39SJohn Allen hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) 336b7d9eb39SJohn Allen queue_hotplug_event(hp_elog, NULL, NULL); 337b7d9eb39SJohn Allen else 338b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 339b7d9eb39SJohn Allen 340b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 341b7d9eb39SJohn Allen return IRQ_HANDLED; 342b7d9eb39SJohn Allen } 343b7d9eb39SJohn Allen 34455fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 3457d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 346d9953105SMichael Ellerman { 34755fc0c56SAnton Blanchard int status; 34855fc0c56SAnton Blanchard int state; 349d9953105SMichael Ellerman int critical; 350d9953105SMichael Ellerman 3511c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 3521c2cb594SThomas Huth &state); 353d9953105SMichael Ellerman 354d9953105SMichael Ellerman if (state > 3) 355d9953105SMichael Ellerman critical = 1; /* Time Critical */ 356d9953105SMichael Ellerman else 357d9953105SMichael Ellerman critical = 0; 358d9953105SMichael Ellerman 359d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 360d9953105SMichael Ellerman 361d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 362b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 363476eb491SGrant Likely virq_to_hw(irq), 3646f43747fSAnton Blanchard RTAS_EPOW_WARNING, 365d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 366d9953105SMichael Ellerman rtas_get_error_log_max()); 367d9953105SMichael Ellerman 368d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 369d9953105SMichael Ellerman 37055fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 37155fc0c56SAnton Blanchard 372d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 373d9953105SMichael Ellerman return IRQ_HANDLED; 374d9953105SMichael Ellerman } 375d9953105SMichael Ellerman 376d9953105SMichael Ellerman /* 377d9953105SMichael Ellerman * Handle hardware error interrupts. 378d9953105SMichael Ellerman * 379d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 380d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 381d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 382d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 383d9953105SMichael Ellerman */ 3847d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 385d9953105SMichael Ellerman { 386d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 387cc8b5263SAnton Blanchard int status; 388d9953105SMichael Ellerman int fatal; 389d9953105SMichael Ellerman 390d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 391d9953105SMichael Ellerman 392d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 393b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 394476eb491SGrant Likely virq_to_hw(irq), 395d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 396d9953105SMichael Ellerman __pa(&ras_log_buf), 397d9953105SMichael Ellerman rtas_get_error_log_max()); 398d9953105SMichael Ellerman 399d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 400d9953105SMichael Ellerman 401a08a53eaSGreg Kurz if (status == 0 && 402a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 403d9953105SMichael Ellerman fatal = 1; 404d9953105SMichael Ellerman else 405d9953105SMichael Ellerman fatal = 0; 406d9953105SMichael Ellerman 407d9953105SMichael Ellerman /* format and print the extended information */ 408d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 409d9953105SMichael Ellerman 410d9953105SMichael Ellerman if (fatal) { 411b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 412b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 413cc8b5263SAnton Blanchard emergency_sync(); 414cc8b5263SAnton Blanchard kernel_power_off(); 415d9953105SMichael Ellerman } else { 416b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 417d9953105SMichael Ellerman } 418d9953105SMichael Ellerman 419d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 420d9953105SMichael Ellerman return IRQ_HANDLED; 421d9953105SMichael Ellerman } 422d9953105SMichael Ellerman 423d368514cSAnton Blanchard /* 424d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 425d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 426d368514cSAnton Blanchard */ 427d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 428d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 429d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 430d368514cSAnton Blanchard 43194675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 43294675cceSMahesh Salgaonkar { 43394675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 43494675cceSMahesh Salgaonkar } 43594675cceSMahesh Salgaonkar 436d368514cSAnton Blanchard /* 437d368514cSAnton Blanchard * Get the error information for errors coming through the 438d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 439d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 440d9953105SMichael Ellerman * will be returned if found. 441d9953105SMichael Ellerman * 44294675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 443d368514cSAnton Blanchard * 44494675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 445d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 446d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 447d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 448d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 449d9953105SMichael Ellerman * second machine check did come in. 450d9953105SMichael Ellerman */ 451d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 452d9953105SMichael Ellerman { 453d9953105SMichael Ellerman unsigned long *savep; 45494675cceSMahesh Salgaonkar struct rtas_error_log *h; 455d9953105SMichael Ellerman 456ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 457ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 458ee1dd1e3SMahesh Salgaonkar 459d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 460f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 461d368514cSAnton Blanchard return NULL; 462d9953105SMichael Ellerman } 463d368514cSAnton Blanchard 464d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 465cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 466d368514cSAnton Blanchard 467d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 46894675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 46994675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 470a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 47194675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 472d368514cSAnton Blanchard } else { 473a08a53eaSGreg Kurz int len, error_log_length; 474d368514cSAnton Blanchard 475a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 47674e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 47794675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 478d368514cSAnton Blanchard } 479d368514cSAnton Blanchard 48094675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 481d9953105SMichael Ellerman } 482d9953105SMichael Ellerman 483d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 484d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 485d9953105SMichael Ellerman * partition to receive FWNMI errors. 486d9953105SMichael Ellerman */ 487d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 488d9953105SMichael Ellerman { 489d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 490d9953105SMichael Ellerman if (ret != 0) 491d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 492d9953105SMichael Ellerman } 493d9953105SMichael Ellerman 494c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 495d9953105SMichael Ellerman { 496bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 497bded0706SNicholas Piggin /* 498bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 499bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 500bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 501bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 502bded0706SNicholas Piggin */ 503bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 504bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 505bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 506bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 507bded0706SNicholas Piggin regs->msr = 0; 508bded0706SNicholas Piggin } 509bded0706SNicholas Piggin #endif 510bded0706SNicholas Piggin 511d9953105SMichael Ellerman if (fwnmi_active) { 512d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 513d9953105SMichael Ellerman if (errhdr) { 514d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 515d9953105SMichael Ellerman } 516d9953105SMichael Ellerman fwnmi_release_errinfo(); 517d9953105SMichael Ellerman } 518102c05e8SNicholas Piggin 519102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 520102c05e8SNicholas Piggin return 1; 521102c05e8SNicholas Piggin 522c902be71SArnd Bergmann return 0; /* need to perform reset */ 523d9953105SMichael Ellerman } 524d9953105SMichael Ellerman 525d9953105SMichael Ellerman /* 52694675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 52794675cceSMahesh Salgaonkar */ 52894675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 52994675cceSMahesh Salgaonkar { 53094675cceSMahesh Salgaonkar struct rtas_error_log *err; 53194675cceSMahesh Salgaonkar 53294675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 53394675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 53494675cceSMahesh Salgaonkar } 53594675cceSMahesh Salgaonkar 53694675cceSMahesh Salgaonkar /* 537d9953105SMichael Ellerman * See if we can recover from a machine check exception. 538d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 539d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 540d9953105SMichael Ellerman * which provides the error analysis for us. 541d9953105SMichael Ellerman * 542d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 543d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 544d9953105SMichael Ellerman */ 545d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 546d9953105SMichael Ellerman { 547d47d1d8aSAnton Blanchard int recovered = 0; 548a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 549d9953105SMichael Ellerman 550d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 551d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 552d47d1d8aSAnton Blanchard recovered = 0; 553d47d1d8aSAnton Blanchard 554a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 555d9953105SMichael Ellerman /* Platform corrected itself */ 556d47d1d8aSAnton Blanchard recovered = 1; 557d47d1d8aSAnton Blanchard 558a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 559d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 560d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 561d47d1d8aSAnton Blanchard "be degraded\n"); 562d47d1d8aSAnton Blanchard recovered = 1; 563d47d1d8aSAnton Blanchard 564d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 565a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 566d47d1d8aSAnton Blanchard 567d47d1d8aSAnton Blanchard /* 568d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 569d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 570d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 571d47d1d8aSAnton Blanchard * fields being valid here. 572d47d1d8aSAnton Blanchard */ 573d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 574d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 575d47d1d8aSAnton Blanchard 576d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 577d47d1d8aSAnton Blanchard recovered = 1; 578d9953105SMichael Ellerman } 579d9953105SMichael Ellerman 58094675cceSMahesh Salgaonkar /* Queue irq work to log this rtas event later. */ 58194675cceSMahesh Salgaonkar irq_work_queue(&mce_errlog_process_work); 582d9953105SMichael Ellerman 583d47d1d8aSAnton Blanchard return recovered; 584d9953105SMichael Ellerman } 585d9953105SMichael Ellerman 586d9953105SMichael Ellerman /* 587d9953105SMichael Ellerman * Handle a machine check. 588d9953105SMichael Ellerman * 589d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 590d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 591d9953105SMichael Ellerman * error was recovered (never true if RI=0). 592d9953105SMichael Ellerman * 593d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 594d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 595d9953105SMichael Ellerman */ 596d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 597d9953105SMichael Ellerman { 598d9953105SMichael Ellerman struct rtas_error_log *errp; 599d9953105SMichael Ellerman 600d9953105SMichael Ellerman if (fwnmi_active) { 601d9953105SMichael Ellerman errp = fwnmi_get_errinfo(regs); 602d9953105SMichael Ellerman fwnmi_release_errinfo(); 603d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 604d9953105SMichael Ellerman return 1; 605d9953105SMichael Ellerman } 606d9953105SMichael Ellerman 607d9953105SMichael Ellerman return 0; 608d9953105SMichael Ellerman } 609