1 /* 2 * Machine check exception handling. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * 18 * Copyright 2013 IBM Corporation 19 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 20 */ 21 22 #undef DEBUG 23 #define pr_fmt(fmt) "mce: " fmt 24 25 #include <linux/types.h> 26 #include <linux/ptrace.h> 27 #include <linux/percpu.h> 28 #include <linux/export.h> 29 #include <linux/irq_work.h> 30 #include <asm/mce.h> 31 32 static DEFINE_PER_CPU(int, mce_nest_count); 33 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); 34 35 /* Queue for delayed MCE events. */ 36 static DEFINE_PER_CPU(int, mce_queue_count); 37 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); 38 39 static void machine_check_process_queued_event(struct irq_work *work); 40 static struct irq_work mce_event_process_work = { 41 .func = machine_check_process_queued_event, 42 }; 43 44 static void mce_set_error_info(struct machine_check_event *mce, 45 struct mce_error_info *mce_err) 46 { 47 mce->error_type = mce_err->error_type; 48 switch (mce_err->error_type) { 49 case MCE_ERROR_TYPE_UE: 50 mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type; 51 break; 52 case MCE_ERROR_TYPE_SLB: 53 mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type; 54 break; 55 case MCE_ERROR_TYPE_ERAT: 56 mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type; 57 break; 58 case MCE_ERROR_TYPE_TLB: 59 mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type; 60 break; 61 case MCE_ERROR_TYPE_UNKNOWN: 62 default: 63 break; 64 } 65 } 66 67 /* 68 * Decode and save high level MCE information into per cpu buffer which 69 * is an array of machine_check_event structure. 70 */ 71 void save_mce_event(struct pt_regs *regs, long handled, 72 struct mce_error_info *mce_err, 73 uint64_t nip, uint64_t addr) 74 { 75 int index = __this_cpu_inc_return(mce_nest_count) - 1; 76 struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); 77 78 /* 79 * Return if we don't have enough space to log mce event. 80 * mce_nest_count may go beyond MAX_MC_EVT but that's ok, 81 * the check below will stop buffer overrun. 82 */ 83 if (index >= MAX_MC_EVT) 84 return; 85 86 /* Populate generic machine check info */ 87 mce->version = MCE_V1; 88 mce->srr0 = nip; 89 mce->srr1 = regs->msr; 90 mce->gpr3 = regs->gpr[3]; 91 mce->in_use = 1; 92 93 mce->initiator = MCE_INITIATOR_CPU; 94 /* Mark it recovered if we have handled it and MSR(RI=1). */ 95 if (handled && (regs->msr & MSR_RI)) 96 mce->disposition = MCE_DISPOSITION_RECOVERED; 97 else 98 mce->disposition = MCE_DISPOSITION_NOT_RECOVERED; 99 mce->severity = MCE_SEV_ERROR_SYNC; 100 101 /* 102 * Populate the mce error_type and type-specific error_type. 103 */ 104 mce_set_error_info(mce, mce_err); 105 106 if (!addr) 107 return; 108 109 if (mce->error_type == MCE_ERROR_TYPE_TLB) { 110 mce->u.tlb_error.effective_address_provided = true; 111 mce->u.tlb_error.effective_address = addr; 112 } else if (mce->error_type == MCE_ERROR_TYPE_SLB) { 113 mce->u.slb_error.effective_address_provided = true; 114 mce->u.slb_error.effective_address = addr; 115 } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) { 116 mce->u.erat_error.effective_address_provided = true; 117 mce->u.erat_error.effective_address = addr; 118 } else if (mce->error_type == MCE_ERROR_TYPE_UE) { 119 mce->u.ue_error.effective_address_provided = true; 120 mce->u.ue_error.effective_address = addr; 121 } 122 return; 123 } 124 125 /* 126 * get_mce_event: 127 * mce Pointer to machine_check_event structure to be filled. 128 * release Flag to indicate whether to free the event slot or not. 129 * 0 <= do not release the mce event. Caller will invoke 130 * release_mce_event() once event has been consumed. 131 * 1 <= release the slot. 132 * 133 * return 1 = success 134 * 0 = failure 135 * 136 * get_mce_event() will be called by platform specific machine check 137 * handle routine and in KVM. 138 * When we call get_mce_event(), we are still in interrupt context and 139 * preemption will not be scheduled until ret_from_expect() routine 140 * is called. 141 */ 142 int get_mce_event(struct machine_check_event *mce, bool release) 143 { 144 int index = __this_cpu_read(mce_nest_count) - 1; 145 struct machine_check_event *mc_evt; 146 int ret = 0; 147 148 /* Sanity check */ 149 if (index < 0) 150 return ret; 151 152 /* Check if we have MCE info to process. */ 153 if (index < MAX_MC_EVT) { 154 mc_evt = this_cpu_ptr(&mce_event[index]); 155 /* Copy the event structure and release the original */ 156 if (mce) 157 *mce = *mc_evt; 158 if (release) 159 mc_evt->in_use = 0; 160 ret = 1; 161 } 162 /* Decrement the count to free the slot. */ 163 if (release) 164 __this_cpu_dec(mce_nest_count); 165 166 return ret; 167 } 168 169 void release_mce_event(void) 170 { 171 get_mce_event(NULL, true); 172 } 173 174 /* 175 * Queue up the MCE event which then can be handled later. 176 */ 177 void machine_check_queue_event(void) 178 { 179 int index; 180 struct machine_check_event evt; 181 182 if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 183 return; 184 185 index = __this_cpu_inc_return(mce_queue_count) - 1; 186 /* If queue is full, just return for now. */ 187 if (index >= MAX_MC_EVT) { 188 __this_cpu_dec(mce_queue_count); 189 return; 190 } 191 memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt)); 192 193 /* Queue irq work to process this event later. */ 194 irq_work_queue(&mce_event_process_work); 195 } 196 197 /* 198 * process pending MCE event from the mce event queue. This function will be 199 * called during syscall exit. 200 */ 201 static void machine_check_process_queued_event(struct irq_work *work) 202 { 203 int index; 204 205 /* 206 * For now just print it to console. 207 * TODO: log this error event to FSP or nvram. 208 */ 209 while (__this_cpu_read(mce_queue_count) > 0) { 210 index = __this_cpu_read(mce_queue_count) - 1; 211 machine_check_print_event_info( 212 this_cpu_ptr(&mce_event_queue[index])); 213 __this_cpu_dec(mce_queue_count); 214 } 215 } 216 217 void machine_check_print_event_info(struct machine_check_event *evt) 218 { 219 const char *level, *sevstr, *subtype; 220 static const char *mc_ue_types[] = { 221 "Indeterminate", 222 "Instruction fetch", 223 "Page table walk ifetch", 224 "Load/Store", 225 "Page table walk Load/Store", 226 }; 227 static const char *mc_slb_types[] = { 228 "Indeterminate", 229 "Parity", 230 "Multihit", 231 }; 232 static const char *mc_erat_types[] = { 233 "Indeterminate", 234 "Parity", 235 "Multihit", 236 }; 237 static const char *mc_tlb_types[] = { 238 "Indeterminate", 239 "Parity", 240 "Multihit", 241 }; 242 243 /* Print things out */ 244 if (evt->version != MCE_V1) { 245 pr_err("Machine Check Exception, Unknown event version %d !\n", 246 evt->version); 247 return; 248 } 249 switch (evt->severity) { 250 case MCE_SEV_NO_ERROR: 251 level = KERN_INFO; 252 sevstr = "Harmless"; 253 break; 254 case MCE_SEV_WARNING: 255 level = KERN_WARNING; 256 sevstr = ""; 257 break; 258 case MCE_SEV_ERROR_SYNC: 259 level = KERN_ERR; 260 sevstr = "Severe"; 261 break; 262 case MCE_SEV_FATAL: 263 default: 264 level = KERN_ERR; 265 sevstr = "Fatal"; 266 break; 267 } 268 269 printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 270 evt->disposition == MCE_DISPOSITION_RECOVERED ? 271 "Recovered" : "[Not recovered"); 272 printk("%s Initiator: %s\n", level, 273 evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); 274 switch (evt->error_type) { 275 case MCE_ERROR_TYPE_UE: 276 subtype = evt->u.ue_error.ue_error_type < 277 ARRAY_SIZE(mc_ue_types) ? 278 mc_ue_types[evt->u.ue_error.ue_error_type] 279 : "Unknown"; 280 printk("%s Error type: UE [%s]\n", level, subtype); 281 if (evt->u.ue_error.effective_address_provided) 282 printk("%s Effective address: %016llx\n", 283 level, evt->u.ue_error.effective_address); 284 if (evt->u.ue_error.physical_address_provided) 285 printk("%s Physical address: %016llx\n", 286 level, evt->u.ue_error.physical_address); 287 break; 288 case MCE_ERROR_TYPE_SLB: 289 subtype = evt->u.slb_error.slb_error_type < 290 ARRAY_SIZE(mc_slb_types) ? 291 mc_slb_types[evt->u.slb_error.slb_error_type] 292 : "Unknown"; 293 printk("%s Error type: SLB [%s]\n", level, subtype); 294 if (evt->u.slb_error.effective_address_provided) 295 printk("%s Effective address: %016llx\n", 296 level, evt->u.slb_error.effective_address); 297 break; 298 case MCE_ERROR_TYPE_ERAT: 299 subtype = evt->u.erat_error.erat_error_type < 300 ARRAY_SIZE(mc_erat_types) ? 301 mc_erat_types[evt->u.erat_error.erat_error_type] 302 : "Unknown"; 303 printk("%s Error type: ERAT [%s]\n", level, subtype); 304 if (evt->u.erat_error.effective_address_provided) 305 printk("%s Effective address: %016llx\n", 306 level, evt->u.erat_error.effective_address); 307 break; 308 case MCE_ERROR_TYPE_TLB: 309 subtype = evt->u.tlb_error.tlb_error_type < 310 ARRAY_SIZE(mc_tlb_types) ? 311 mc_tlb_types[evt->u.tlb_error.tlb_error_type] 312 : "Unknown"; 313 printk("%s Error type: TLB [%s]\n", level, subtype); 314 if (evt->u.tlb_error.effective_address_provided) 315 printk("%s Effective address: %016llx\n", 316 level, evt->u.tlb_error.effective_address); 317 break; 318 default: 319 case MCE_ERROR_TYPE_UNKNOWN: 320 printk("%s Error type: Unknown\n", level); 321 break; 322 } 323 } 324 325 uint64_t get_mce_fault_addr(struct machine_check_event *evt) 326 { 327 switch (evt->error_type) { 328 case MCE_ERROR_TYPE_UE: 329 if (evt->u.ue_error.effective_address_provided) 330 return evt->u.ue_error.effective_address; 331 break; 332 case MCE_ERROR_TYPE_SLB: 333 if (evt->u.slb_error.effective_address_provided) 334 return evt->u.slb_error.effective_address; 335 break; 336 case MCE_ERROR_TYPE_ERAT: 337 if (evt->u.erat_error.effective_address_provided) 338 return evt->u.erat_error.effective_address; 339 break; 340 case MCE_ERROR_TYPE_TLB: 341 if (evt->u.tlb_error.effective_address_provided) 342 return evt->u.tlb_error.effective_address; 343 break; 344 default: 345 case MCE_ERROR_TYPE_UNKNOWN: 346 break; 347 } 348 return 0; 349 } 350 EXPORT_SYMBOL(get_mce_fault_addr); 351