/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 * Copyright (C) 2011 Don Zickus Red Hat, Inc.
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>

#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>

struct nmi_desc {
	spinlock_t lock;
	struct list_head head;
};

static struct nmi_desc nmi_desc[NMI_MAX] =
{
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
		.head = LIST_HEAD_INIT(nmi_desc[0].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
		.head = LIST_HEAD_INIT(nmi_desc[1].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
		.head = LIST_HEAD_INIT(nmi_desc[2].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
		.head = LIST_HEAD_INIT(nmi_desc[3].head),
	},

};

struct nmi_stats {
	unsigned int normal;
	unsigned int unknown;
	unsigned int external;
	unsigned int swallow;
};

static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);

static int ignore_nmis;

int unknown_nmi_panic;
/*
 * Prevent NMI reason port (0x61) being accessed simultaneously, can
 * only be used in NMI handler.
 */
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);

static int __init setup_unknown_nmi_panic(char *str)
{
	unknown_nmi_panic = 1;
	return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);

#define nmi_to_desc(type) (&nmi_desc[type])

static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *a;
	int handled = 0;

	rcu_read_lock();

	/*
	 * NMIs are edge-triggered, which means if you have enough
	 * of them concurrently, you can lose some because only one
	 * can be latched at any given time.  Walk the whole list
	 * to handle those situations.
	 */
	list_for_each_entry_rcu(a, &desc->head, list)
		handled += a->handler(type, regs);

	rcu_read_unlock();

	/* return total number of NMI events handled */
	return handled;
}

int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	unsigned long flags;

	if (!action->handler)
		return -EINVAL;

	spin_lock_irqsave(&desc->lock, flags);

	/*
	 * Most handlers of type NMI_UNKNOWN never return because
	 * they just assume the NMI is theirs.  This is just a sanity
	 * check to manage expectations.
	 */
	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));

	/*
	 * Some handlers need to be executed first, otherwise a fake
	 * event confuses the other handlers (kdump uses this flag).
	 */
	if (action->flags & NMI_FLAG_FIRST)
		list_add_rcu(&action->list, &desc->head);
	else
		list_add_tail_rcu(&action->list, &desc->head);

	spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);

void unregister_nmi_handler(unsigned int type, const char *name)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *n;
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);

	list_for_each_entry_rcu(n, &desc->head, list) {
		/*
		 * The name passed in to describe the nmi handler
		 * is used as the lookup key.
		 */
		if (!strcmp(n->name, name)) {
			WARN(in_nmi(),
				"Trying to free NMI (%s) from NMI context!\n", n->name);
			list_del_rcu(&n->list);
			break;
		}
	}

	spin_unlock_irqrestore(&desc->lock, flags);
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
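
/*
 * Illustrative sketch of how a client would hook the NMI_LOCAL chain with the
 * registration API above.  The "example_*" names and the device check are
 * hypothetical; NMI_HANDLED and NMI_DONE are assumed to be the usual
 * <asm/nmi.h> return conventions (non-zero means "this NMI was mine").  Most
 * callers would use the register_nmi_handler() wrapper from <asm/nmi.h>
 * rather than open-coding the nmiaction as done here.
 */
#if 0	/* example only */
static int example_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	if (!example_device_raised_nmi())	/* hypothetical device check */
		return NMI_DONE;		/* not ours; let other handlers look */

	/* ... acknowledge/clear the event on the device here ... */
	return NMI_HANDLED;			/* counted in nmi_stats.normal */
}

static struct nmiaction example_nmi_action = {
	.handler = example_nmi_handler,
	.name	 = "example",
	.flags	 = 0,		/* NMI_FLAG_FIRST would put it at the list head */
};

static int __init example_nmi_init(void)
{
	return __register_nmi_handler(NMI_LOCAL, &example_nmi_action);
}

static void example_nmi_exit(void)
{
	/* the name string passed at registration is the lookup key */
	unregister_nmi_handler(NMI_LOCAL, "example");
}
#endif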

static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_SERR, regs, false))
		return;

	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	/*
	 * On some machines, PCI SERR line is used to report memory
	 * errors. EDAC makes use of it.
	 */
#if defined(CONFIG_EDAC)
	if (edac_handler_set()) {
		edac_atomic_assert_error();
		return;
	}
#endif

	if (panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");

	/* Clear and disable the PCI SERR error line. */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}

static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
	unsigned long i;

	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_IO_CHECK, regs, false))
		return;

	pr_emerg(
	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());
	show_regs(regs);

	if (panic_on_io_nmi)
		panic("NMI IOCK error: Not continuing");

	/* Re-enable the IOCK line, wait for a few seconds */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);

	i = 20000;
	while (--i) {
		touch_nmi_watchdog();
		udelay(100);
	}

	reason &= ~NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);
}

static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	int handled;

	/*
	 * Use 'false' as back-to-back NMIs are dealt with one level up.
	 * Of course this makes having multiple 'unknown' handlers useless
	 * as only the first one is ever run (unless it can actually determine
	 * if it caused the NMI).
	 */
	handled = nmi_handle(NMI_UNKNOWN, regs, false);
	if (handled) {
		__this_cpu_add(nmi_stats.unknown, handled);
		return;
	}

	__this_cpu_add(nmi_stats.unknown, 1);

	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	pr_emerg("Do you have a strange power saving mode enabled?\n");
	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");
}

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);

static __kprobes void default_do_nmi(struct pt_regs *regs)
{
	unsigned char reason = 0;
	int handled;
	bool b2b = false;

	/*
	 * CPU-specific NMI must be processed before non-CPU-specific
	 * NMI, otherwise we may lose it, because the CPU-specific
	 * NMI cannot be detected/processed on other CPUs.
	 */

	/*
	 * Back-to-back NMIs are interesting because they can either
	 * be two NMIs or more than two NMIs (anything over two is dropped
	 * due to NMI being edge-triggered).  If this is the second half
	 * of the back-to-back NMI, assume we dropped things and process
	 * more handlers.  Otherwise reset the 'swallow' NMI behaviour.
	 */
	if (regs->ip == __this_cpu_read(last_nmi_rip))
		b2b = true;
	else
		__this_cpu_write(swallow_nmi, false);

	__this_cpu_write(last_nmi_rip, regs->ip);

	handled = nmi_handle(NMI_LOCAL, regs, b2b);
	__this_cpu_add(nmi_stats.normal, handled);
	if (handled) {
		/*
		 * There are cases when an NMI handler handles multiple
		 * events in the current NMI.  One of these events may
		 * be queued for the next NMI.  Because the event is
		 * already handled, the next NMI will result in an unknown
		 * NMI.  Instead let's flag this for a potential NMI to
		 * swallow.
		 */
		if (handled > 1)
			__this_cpu_write(swallow_nmi, true);
		return;
	}

	/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
	raw_spin_lock(&nmi_reason_lock);
	reason = x86_platform.get_nmi_reason();

	if (reason & NMI_REASON_MASK) {
		if (reason & NMI_REASON_SERR)
			pci_serr_error(reason, regs);
		else if (reason & NMI_REASON_IOCHK)
			io_check_error(reason, regs);
#ifdef CONFIG_X86_32
		/*
		 * Reassert NMI in case it became active
		 * meanwhile as it's edge-triggered:
		 */
		reassert_nmi();
#endif
		__this_cpu_add(nmi_stats.external, 1);
		raw_spin_unlock(&nmi_reason_lock);
		return;
	}
	raw_spin_unlock(&nmi_reason_lock);

	/*
	 * Only one NMI can be latched at a time.  To handle
	 * this we may process multiple nmi handlers at once to
	 * cover the case where an NMI is dropped.  The downside
	 * to this approach is we may process an NMI prematurely,
	 * while its real NMI is sitting latched.  This will cause
	 * an unknown NMI on the next run of the NMI processing.
	 *
	 * We tried to flag that condition above, by setting the
	 * swallow_nmi flag when we process more than one event.
	 * This condition is also only present on the second half
	 * of a back-to-back NMI, so we flag that condition too.
	 *
	 * If both are true, we assume we already processed this
	 * NMI previously and we swallow it.  Otherwise we reset
	 * the logic.
	 *
	 * There are scenarios where we may accidentally swallow
	 * a 'real' unknown NMI.  For example, while processing
	 * a perf NMI another perf NMI comes in along with a
	 * 'real' unknown NMI.  These two NMIs get combined into
	 * one (as described above).  When the next NMI gets
	 * processed, it will be flagged by perf as handled, but
	 * no one will know that there was a 'real' unknown NMI sent
	 * also.  As a result it gets swallowed.  Or if the first
	 * perf NMI returns two events handled then the second
	 * NMI will get eaten by the logic below, again losing a
	 * 'real' unknown NMI.  But this is the best we can do
	 * for now.
	 */
	if (b2b && __this_cpu_read(swallow_nmi))
		__this_cpu_add(nmi_stats.swallow, 1);
	else
		unknown_nmi_error(reason, regs);
}

/*
 * NMIs can hit breakpoints, which will cause the CPU to lose its
 * NMI context when the breakpoint handler does an iret.
 */
#ifdef CONFIG_X86_32
/*
 * For i386, NMIs use the same stack as the kernel, and we can
 * add a workaround to the iret problem in C (preventing nested
 * NMIs if an NMI takes a trap). Simply have 3 states the NMI
 * can be in:
 *
 * 1) not running
 * 2) executing
 * 3) latched
 *
 * When no NMI is in progress, it is in the "not running" state.
 * When an NMI comes in, it goes into the "executing" state.
 * Normally, if another NMI is triggered, it does not interrupt
 * the running NMI and the HW will simply latch it so that when
 * the first NMI finishes, it will restart the second NMI.
 * (Note, the latch is binary, thus multiple NMIs triggering,
 * when one is running, are ignored. Only one NMI is restarted.)
 *
 * If an NMI hits a breakpoint that executes an iret, another
 * NMI can preempt it. We do not want to allow this new NMI
 * to run, but we want to execute it when the first one finishes.
 * We set the state to "latched", and the exit of the first NMI will
 * perform a dec_return; if the result is zero (NOT_RUNNING), then
 * it will simply exit the NMI handler. If not, the dec_return
 * would have set the state to NMI_EXECUTING (what we want it to
 * be when we are running). In this case, we simply jump back
 * to rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
 * at this point.
 *
 * In case the NMI takes a page fault, we need to save off the CR2
 * because the NMI could have preempted another page fault and would
 * corrupt the CR2 that is about to be read. As nested NMIs must be
 * restarted and they cannot take breakpoints or page faults, the update
 * of the CR2 must be done before converting the nmi state back to
 * NOT_RUNNING. Otherwise, there would be a race of another nested NMI
 * coming in after setting state to NOT_RUNNING but before updating the
 * nmi_cr2.
 */
enum nmi_states {
	NMI_NOT_RUNNING = 0,
	NMI_EXECUTING,
	NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);

#define nmi_nesting_preprocess(regs)					\
	do {								\
		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {	\
			this_cpu_write(nmi_state, NMI_LATCHED);		\
			return;						\
		}							\
		this_cpu_write(nmi_state, NMI_EXECUTING);		\
		this_cpu_write(nmi_cr2, read_cr2());			\
	} while (0);							\
	nmi_restart:

#define nmi_nesting_postprocess()					\
	do {								\
		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))	\
			write_cr2(this_cpu_read(nmi_cr2));		\
		if (this_cpu_dec_return(nmi_state))			\
			goto nmi_restart;				\
	} while (0)
#else /* x86_64 */
/*
 * In x86_64 things are a bit more difficult. This has the same problem
 * where an NMI hitting a breakpoint that calls iret will remove the
 * NMI context, allowing a nested NMI to enter. What makes this more
 * difficult is that both NMIs and breakpoints have their own stack.
 * When a new NMI or breakpoint is executed, the stack is set to a fixed
 * point. If an NMI is nested, it will have its stack set at that same
 * fixed address that the first NMI had, and will start corrupting the
 * stack. This is handled in entry_64.S, but the same problem exists with
 * the breakpoint stack.
 *
 * If a breakpoint is being processed, and the debug stack is being used,
 * and an NMI comes in and also hits a breakpoint, the stack pointer
 * will be set to the same fixed address as the breakpoint that was
 * interrupted, causing that stack to be corrupted. To handle this case,
 * check if the stack that was interrupted is the debug stack, and if
 * so, change the IDT so that new breakpoints will use the current stack
 * and not switch to the fixed address. On return of the NMI, switch back
 * to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);

static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too. We need to
	 * change the IDT such that breakpoints that happen here
	 * continue to use the NMI stack.
	 */
	if (unlikely(is_debug_stack(regs->sp))) {
		debug_stack_set_zero();
		this_cpu_write(update_debug_stack, 1);
	}
}

static inline void nmi_nesting_postprocess(void)
{
	if (unlikely(this_cpu_read(update_debug_stack))) {
		debug_stack_reset();
		this_cpu_write(update_debug_stack, 0);
	}
}
#endif

dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
	nmi_nesting_preprocess(regs);

	nmi_enter();

	inc_irq_stat(__nmi_count);

	if (!ignore_nmis)
		default_do_nmi(regs);

	nmi_exit();

	/* On i386, may loop back to preprocess */
	nmi_nesting_postprocess();
}

void stop_nmi(void)
{
	ignore_nmis++;
}

void restart_nmi(void)
{
	ignore_nmis--;
}

/* reset the back-to-back NMI logic */
void local_touch_nmi(void)
{
	__this_cpu_write(last_nmi_rip, 0);
}
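
/*
 * Illustrative sketch: local_touch_nmi() is meant for code that parks the
 * CPU at a single RIP for a long time (an idle loop, for instance), where a
 * later NMI would otherwise match last_nmi_rip and be treated as the
 * back-to-back case above.  The surrounding idle routine is hypothetical;
 * only local_touch_nmi() and safe_halt() are real kernel helpers.
 */
#if 0	/* example only */
static void example_idle(void)
{
	local_touch_nmi();	/* forget last_nmi_rip before parking on hlt */
	safe_halt();		/* an NMI taken here won't look back-to-back */
}
#endif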