/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 * Copyright (C) 2011 Don Zickus Red Hat, Inc.
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>

#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>

struct nmi_desc {
	spinlock_t lock;
	struct list_head head;
};

static struct nmi_desc nmi_desc[NMI_MAX] =
{
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
		.head = LIST_HEAD_INIT(nmi_desc[0].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
		.head = LIST_HEAD_INIT(nmi_desc[1].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
		.head = LIST_HEAD_INIT(nmi_desc[2].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
		.head = LIST_HEAD_INIT(nmi_desc[3].head),
	},

};

struct nmi_stats {
	unsigned int normal;
	unsigned int unknown;
	unsigned int external;
	unsigned int swallow;
};

static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);

static int ignore_nmis;

int unknown_nmi_panic;
/*
 * Prevent the NMI reason port (0x61) from being accessed simultaneously;
 * this lock may only be taken from the NMI handler.
 */
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);

static int __init setup_unknown_nmi_panic(char *str)
{
	unknown_nmi_panic = 1;
	return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);

#define nmi_to_desc(type) (&nmi_desc[type])

static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
static int __init nmi_warning_debugfs(void)
{
	debugfs_create_u64("nmi_longest_ns", 0644,
			arch_debugfs_dir, &nmi_longest_ns);
	return 0;
}
fs_initcall(nmi_warning_debugfs);

static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *a;
	int handled = 0;

	rcu_read_lock();

	/*
	 * NMIs are edge-triggered, which means if you have enough
	 * of them concurrently, you can lose some because only one
	 * can be latched at any given time.  Walk the whole list
	 * to handle those situations.
	 */
	list_for_each_entry_rcu(a, &desc->head, list) {
		u64 before, delta, whole_msecs;
		int remainder_ns, decimal_msecs, thishandled;

		before = sched_clock();
		thishandled = a->handler(type, regs);
		handled += thishandled;
		delta = sched_clock() - before;
		trace_nmi_handler(a->handler, (int)delta, thishandled);

		if (delta < nmi_longest_ns)
			continue;

		nmi_longest_ns = delta;
		whole_msecs = delta;
		remainder_ns = do_div(whole_msecs, (1000 * 1000));
		decimal_msecs = remainder_ns / 1000;
		printk_ratelimited(KERN_INFO
			"INFO: NMI handler (%ps) took too long to run: "
			"%lld.%03d msecs\n", a->handler, whole_msecs,
			decimal_msecs);
	}

	rcu_read_unlock();

	/* return total number of NMI events handled */
	return handled;
}

int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	unsigned long flags;

	if (!action->handler)
		return -EINVAL;

	spin_lock_irqsave(&desc->lock, flags);

	/*
	 * Most handlers of type NMI_UNKNOWN never return because
	 * they just assume the NMI is theirs.  This is just a sanity
	 * check to manage expectations.
	 */
	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));

	/*
	 * Some handlers need to be executed first, otherwise a fake
	 * event confuses the other handlers (kdump uses this flag).
	 */
	if (action->flags & NMI_FLAG_FIRST)
		list_add_rcu(&action->list, &desc->head);
	else
		list_add_tail_rcu(&action->list, &desc->head);

	spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);

void unregister_nmi_handler(unsigned int type, const char *name)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *n;
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);

	list_for_each_entry_rcu(n, &desc->head, list) {
		/*
		 * The name passed in to describe the nmi handler
		 * is used as the lookup key.
		 */
		if (!strcmp(n->name, name)) {
			WARN(in_nmi(),
				"Trying to free NMI (%s) from NMI context!\n", n->name);
			list_del_rcu(&n->list);
			break;
		}
	}

	spin_unlock_irqrestore(&desc->lock, flags);
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);

static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_SERR, regs, false))
		return;

	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	/*
	 * On some machines, the PCI SERR line is used to report memory
	 * errors.  EDAC makes use of it.
	 */
#if defined(CONFIG_EDAC)
	if (edac_handler_set()) {
		edac_atomic_assert_error();
		return;
	}
#endif

	if (panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");

	/* Clear and disable the PCI SERR error line. */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}

static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
	unsigned long i;

	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_IO_CHECK, regs, false))
		return;

	pr_emerg(
	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());
	show_regs(regs);

	if (panic_on_io_nmi)
		panic("NMI IOCK error: Not continuing");

	/* Re-enable the IOCK line, wait for a few seconds */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);

	i = 20000;
	while (--i) {
		touch_nmi_watchdog();
		udelay(100);
	}

	reason &= ~NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);
}

static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	int handled;

	/*
	 * Use 'false' as back-to-back NMIs are dealt with one level up.
	 * Of course this makes having multiple 'unknown' handlers useless,
	 * as only the first one is ever run (unless it can actually determine
	 * whether it caused the NMI).
	 */
	handled = nmi_handle(NMI_UNKNOWN, regs, false);
	if (handled) {
		__this_cpu_add(nmi_stats.unknown, handled);
		return;
	}

	__this_cpu_add(nmi_stats.unknown, 1);

	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	pr_emerg("Do you have a strange power saving mode enabled?\n");
	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");
}

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);

static __kprobes void default_do_nmi(struct pt_regs *regs)
{
	unsigned char reason = 0;
	int handled;
	bool b2b = false;

	/*
	 * CPU-specific NMI must be processed before non-CPU-specific
	 * NMI, otherwise we may lose it, because the CPU-specific
	 * NMI can not be detected/processed on other CPUs.
	 */

	/*
	 * Back-to-back NMIs are interesting because they can either
	 * be two NMIs or more than two NMIs (anything over two is dropped
	 * due to NMIs being edge-triggered).  If this is the second half
	 * of a back-to-back NMI, assume we dropped things and process
	 * more handlers.  Otherwise, reset the 'swallow' NMI behaviour.
	 */
	if (regs->ip == __this_cpu_read(last_nmi_rip))
		b2b = true;
	else
		__this_cpu_write(swallow_nmi, false);

	__this_cpu_write(last_nmi_rip, regs->ip);

	handled = nmi_handle(NMI_LOCAL, regs, b2b);
	__this_cpu_add(nmi_stats.normal, handled);
	if (handled) {
		/*
		 * There are cases when an NMI handler handles multiple
		 * events in the current NMI.  One of these events may
		 * be queued to fire as the next NMI.  Because that event
		 * is already handled, the next NMI will result in an
		 * unknown NMI.  Instead, flag this as a potential NMI
		 * to swallow.
330 */ 331 if (handled > 1) 332 __this_cpu_write(swallow_nmi, true); 333 return; 334 } 335 336 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ 337 raw_spin_lock(&nmi_reason_lock); 338 reason = x86_platform.get_nmi_reason(); 339 340 if (reason & NMI_REASON_MASK) { 341 if (reason & NMI_REASON_SERR) 342 pci_serr_error(reason, regs); 343 else if (reason & NMI_REASON_IOCHK) 344 io_check_error(reason, regs); 345 #ifdef CONFIG_X86_32 346 /* 347 * Reassert NMI in case it became active 348 * meanwhile as it's edge-triggered: 349 */ 350 reassert_nmi(); 351 #endif 352 __this_cpu_add(nmi_stats.external, 1); 353 raw_spin_unlock(&nmi_reason_lock); 354 return; 355 } 356 raw_spin_unlock(&nmi_reason_lock); 357 358 /* 359 * Only one NMI can be latched at a time. To handle 360 * this we may process multiple nmi handlers at once to 361 * cover the case where an NMI is dropped. The downside 362 * to this approach is we may process an NMI prematurely, 363 * while its real NMI is sitting latched. This will cause 364 * an unknown NMI on the next run of the NMI processing. 365 * 366 * We tried to flag that condition above, by setting the 367 * swallow_nmi flag when we process more than one event. 368 * This condition is also only present on the second half 369 * of a back-to-back NMI, so we flag that condition too. 370 * 371 * If both are true, we assume we already processed this 372 * NMI previously and we swallow it. Otherwise we reset 373 * the logic. 374 * 375 * There are scenarios where we may accidentally swallow 376 * a 'real' unknown NMI. For example, while processing 377 * a perf NMI another perf NMI comes in along with a 378 * 'real' unknown NMI. These two NMIs get combined into 379 * one (as descibed above). When the next NMI gets 380 * processed, it will be flagged by perf as handled, but 381 * noone will know that there was a 'real' unknown NMI sent 382 * also. As a result it gets swallowed. Or if the first 383 * perf NMI returns two events handled then the second 384 * NMI will get eaten by the logic below, again losing a 385 * 'real' unknown NMI. But this is the best we can do 386 * for now. 387 */ 388 if (b2b && __this_cpu_read(swallow_nmi)) 389 __this_cpu_add(nmi_stats.swallow, 1); 390 else 391 unknown_nmi_error(reason, regs); 392 } 393 394 /* 395 * NMIs can hit breakpoints which will cause it to lose its 396 * NMI context with the CPU when the breakpoint does an iret. 397 */ 398 #ifdef CONFIG_X86_32 399 /* 400 * For i386, NMIs use the same stack as the kernel, and we can 401 * add a workaround to the iret problem in C (preventing nested 402 * NMIs if an NMI takes a trap). Simply have 3 states the NMI 403 * can be in: 404 * 405 * 1) not running 406 * 2) executing 407 * 3) latched 408 * 409 * When no NMI is in progress, it is in the "not running" state. 410 * When an NMI comes in, it goes into the "executing" state. 411 * Normally, if another NMI is triggered, it does not interrupt 412 * the running NMI and the HW will simply latch it so that when 413 * the first NMI finishes, it will restart the second NMI. 414 * (Note, the latch is binary, thus multiple NMIs triggering, 415 * when one is running, are ignored. Only one NMI is restarted.) 416 * 417 * If an NMI hits a breakpoint that executes an iret, another 418 * NMI can preempt it. We do not want to allow this new NMI 419 * to run, but we want to execute it when the first one finishes. 
 * We set the state to "latched", and the exit of the first NMI will
 * perform a dec_return; if the result is zero (NOT_RUNNING), then
 * it will simply exit the NMI handler.  If not, the dec_return
 * will have set the state to NMI_EXECUTING (which is what we want it
 * to be when we are running).  In this case, we simply jump back
 * to rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
 * and setting it to NMI_EXECUTING.  The HW will prevent nested NMIs
 * at this point.
 *
 * In case the NMI takes a page fault, we need to save off the CR2,
 * because the NMI could have preempted another page fault and corrupted
 * the CR2 that is about to be read.  As nested NMIs must be restarted
 * and they cannot take breakpoints or page faults, the update of the
 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
 * Otherwise, there would be a race of another nested NMI coming in
 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
 */
enum nmi_states {
	NMI_NOT_RUNNING = 0,
	NMI_EXECUTING,
	NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);

#define nmi_nesting_preprocess(regs)					\
	do {								\
		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {	\
			this_cpu_write(nmi_state, NMI_LATCHED);		\
			return;						\
		}							\
		this_cpu_write(nmi_state, NMI_EXECUTING);		\
		this_cpu_write(nmi_cr2, read_cr2());			\
	} while (0);							\
	nmi_restart:

#define nmi_nesting_postprocess()					\
	do {								\
		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))	\
			write_cr2(this_cpu_read(nmi_cr2));		\
		if (this_cpu_dec_return(nmi_state))			\
			goto nmi_restart;				\
	} while (0)
#else /* x86_64 */
/*
 * On x86_64 things are a bit more difficult.  This has the same problem
 * where an NMI hitting a breakpoint that calls iret will remove the
 * NMI context, allowing a nested NMI to enter.  What makes this more
 * difficult is that both NMIs and breakpoints have their own stack.
 * When a new NMI or breakpoint is executed, the stack is set to a fixed
 * point.  If an NMI is nested, it will have its stack set at that same
 * fixed address that the first NMI had, and will start corrupting the
 * stack.  This is handled in entry_64.S, but the same problem exists with
 * the breakpoint stack.
 *
 * If a breakpoint is being processed while the debug stack is in use,
 * and an NMI comes in that also hits a breakpoint, the stack pointer
 * will be set to the same fixed address as that of the breakpoint that
 * was interrupted, causing that stack to be corrupted.  To handle this
 * case, check whether the stack that was interrupted is the debug stack,
 * and if so, change the IDT so that new breakpoints will use the current
 * stack and not switch to the fixed address.  On return of the NMI,
 * switch back to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);

static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too.  We need to
	 * change the IDT such that breakpoints that happen here
	 * continue to use the NMI stack.
496 */ 497 if (unlikely(is_debug_stack(regs->sp))) { 498 debug_stack_set_zero(); 499 this_cpu_write(update_debug_stack, 1); 500 } 501 } 502 503 static inline void nmi_nesting_postprocess(void) 504 { 505 if (unlikely(this_cpu_read(update_debug_stack))) { 506 debug_stack_reset(); 507 this_cpu_write(update_debug_stack, 0); 508 } 509 } 510 #endif 511 512 dotraplinkage notrace __kprobes void 513 do_nmi(struct pt_regs *regs, long error_code) 514 { 515 nmi_nesting_preprocess(regs); 516 517 nmi_enter(); 518 519 inc_irq_stat(__nmi_count); 520 521 if (!ignore_nmis) 522 default_do_nmi(regs); 523 524 nmi_exit(); 525 526 /* On i386, may loop back to preprocess */ 527 nmi_nesting_postprocess(); 528 } 529 530 void stop_nmi(void) 531 { 532 ignore_nmis++; 533 } 534 535 void restart_nmi(void) 536 { 537 ignore_nmis--; 538 } 539 540 /* reset the back-to-back NMI logic */ 541 void local_touch_nmi(void) 542 { 543 __this_cpu_write(last_nmi_rip, 0); 544 } 545 EXPORT_SYMBOL_GPL(local_touch_nmi); 546