1 /* 2 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs 4 * Copyright (C) 2011 Don Zickus Red Hat, Inc. 5 * 6 * Pentium III FXSR, SSE support 7 * Gareth Hughes <gareth@valinux.com>, May 2000 8 */ 9 10 /* 11 * Handle hardware traps and faults. 12 */ 13 #include <linux/spinlock.h> 14 #include <linux/kprobes.h> 15 #include <linux/kdebug.h> 16 #include <linux/sched/debug.h> 17 #include <linux/nmi.h> 18 #include <linux/debugfs.h> 19 #include <linux/delay.h> 20 #include <linux/hardirq.h> 21 #include <linux/ratelimit.h> 22 #include <linux/slab.h> 23 #include <linux/export.h> 24 #include <linux/atomic.h> 25 #include <linux/sched/clock.h> 26 27 #if defined(CONFIG_EDAC) 28 #include <linux/edac.h> 29 #endif 30 31 #include <asm/cpu_entry_area.h> 32 #include <asm/traps.h> 33 #include <asm/mach_traps.h> 34 #include <asm/nmi.h> 35 #include <asm/x86_init.h> 36 #include <asm/reboot.h> 37 #include <asm/cache.h> 38 #include <asm/nospec-branch.h> 39 40 #define CREATE_TRACE_POINTS 41 #include <trace/events/nmi.h> 42 43 struct nmi_desc { 44 raw_spinlock_t lock; 45 struct list_head head; 46 }; 47 48 static struct nmi_desc nmi_desc[NMI_MAX] = 49 { 50 { 51 .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), 52 .head = LIST_HEAD_INIT(nmi_desc[0].head), 53 }, 54 { 55 .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), 56 .head = LIST_HEAD_INIT(nmi_desc[1].head), 57 }, 58 { 59 .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), 60 .head = LIST_HEAD_INIT(nmi_desc[2].head), 61 }, 62 { 63 .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), 64 .head = LIST_HEAD_INIT(nmi_desc[3].head), 65 }, 66 67 }; 68 69 struct nmi_stats { 70 unsigned int normal; 71 unsigned int unknown; 72 unsigned int external; 73 unsigned int swallow; 74 }; 75 76 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); 77 78 static int ignore_nmis __read_mostly; 79 80 int unknown_nmi_panic; 81 /* 82 * Prevent NMI reason port (0x61) being accessed simultaneously, can 83 * only be used in NMI handler. 84 */ 85 static DEFINE_RAW_SPINLOCK(nmi_reason_lock); 86 87 static int __init setup_unknown_nmi_panic(char *str) 88 { 89 unknown_nmi_panic = 1; 90 return 1; 91 } 92 __setup("unknown_nmi_panic", setup_unknown_nmi_panic); 93 94 #define nmi_to_desc(type) (&nmi_desc[type]) 95 96 static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; 97 98 static int __init nmi_warning_debugfs(void) 99 { 100 debugfs_create_u64("nmi_longest_ns", 0644, 101 arch_debugfs_dir, &nmi_longest_ns); 102 return 0; 103 } 104 fs_initcall(nmi_warning_debugfs); 105 106 static void nmi_max_handler(struct irq_work *w) 107 { 108 struct nmiaction *a = container_of(w, struct nmiaction, irq_work); 109 int remainder_ns, decimal_msecs; 110 u64 whole_msecs = READ_ONCE(a->max_duration); 111 112 remainder_ns = do_div(whole_msecs, (1000 * 1000)); 113 decimal_msecs = remainder_ns / 1000; 114 115 printk_ratelimited(KERN_INFO 116 "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", 117 a->handler, whole_msecs, decimal_msecs); 118 } 119 120 static int nmi_handle(unsigned int type, struct pt_regs *regs) 121 { 122 struct nmi_desc *desc = nmi_to_desc(type); 123 struct nmiaction *a; 124 int handled=0; 125 126 rcu_read_lock(); 127 128 /* 129 * NMIs are edge-triggered, which means if you have enough 130 * of them concurrently, you can lose some because only one 131 * can be latched at any given time. Walk the whole list 132 * to handle those situations. 133 */ 134 list_for_each_entry_rcu(a, &desc->head, list) { 135 int thishandled; 136 u64 delta; 137 138 delta = sched_clock(); 139 thishandled = a->handler(type, regs); 140 handled += thishandled; 141 delta = sched_clock() - delta; 142 trace_nmi_handler(a->handler, (int)delta, thishandled); 143 144 if (delta < nmi_longest_ns || delta < a->max_duration) 145 continue; 146 147 a->max_duration = delta; 148 irq_work_queue(&a->irq_work); 149 } 150 151 rcu_read_unlock(); 152 153 /* return total number of NMI events handled */ 154 return handled; 155 } 156 NOKPROBE_SYMBOL(nmi_handle); 157 158 int __register_nmi_handler(unsigned int type, struct nmiaction *action) 159 { 160 struct nmi_desc *desc = nmi_to_desc(type); 161 unsigned long flags; 162 163 if (!action->handler) 164 return -EINVAL; 165 166 init_irq_work(&action->irq_work, nmi_max_handler); 167 168 raw_spin_lock_irqsave(&desc->lock, flags); 169 170 /* 171 * Indicate if there are multiple registrations on the 172 * internal NMI handler call chains (SERR and IO_CHECK). 173 */ 174 WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); 175 WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); 176 177 /* 178 * some handlers need to be executed first otherwise a fake 179 * event confuses some handlers (kdump uses this flag) 180 */ 181 if (action->flags & NMI_FLAG_FIRST) 182 list_add_rcu(&action->list, &desc->head); 183 else 184 list_add_tail_rcu(&action->list, &desc->head); 185 186 raw_spin_unlock_irqrestore(&desc->lock, flags); 187 return 0; 188 } 189 EXPORT_SYMBOL(__register_nmi_handler); 190 191 void unregister_nmi_handler(unsigned int type, const char *name) 192 { 193 struct nmi_desc *desc = nmi_to_desc(type); 194 struct nmiaction *n; 195 unsigned long flags; 196 197 raw_spin_lock_irqsave(&desc->lock, flags); 198 199 list_for_each_entry_rcu(n, &desc->head, list) { 200 /* 201 * the name passed in to describe the nmi handler 202 * is used as the lookup key 203 */ 204 if (!strcmp(n->name, name)) { 205 WARN(in_nmi(), 206 "Trying to free NMI (%s) from NMI context!\n", n->name); 207 list_del_rcu(&n->list); 208 break; 209 } 210 } 211 212 raw_spin_unlock_irqrestore(&desc->lock, flags); 213 synchronize_rcu(); 214 } 215 EXPORT_SYMBOL_GPL(unregister_nmi_handler); 216 217 static void 218 pci_serr_error(unsigned char reason, struct pt_regs *regs) 219 { 220 /* check to see if anyone registered against these types of errors */ 221 if (nmi_handle(NMI_SERR, regs)) 222 return; 223 224 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", 225 reason, smp_processor_id()); 226 227 if (panic_on_unrecovered_nmi) 228 nmi_panic(regs, "NMI: Not continuing"); 229 230 pr_emerg("Dazed and confused, but trying to continue\n"); 231 232 /* Clear and disable the PCI SERR error line. */ 233 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; 234 outb(reason, NMI_REASON_PORT); 235 } 236 NOKPROBE_SYMBOL(pci_serr_error); 237 238 static void 239 io_check_error(unsigned char reason, struct pt_regs *regs) 240 { 241 unsigned long i; 242 243 /* check to see if anyone registered against these types of errors */ 244 if (nmi_handle(NMI_IO_CHECK, regs)) 245 return; 246 247 pr_emerg( 248 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", 249 reason, smp_processor_id()); 250 show_regs(regs); 251 252 if (panic_on_io_nmi) { 253 nmi_panic(regs, "NMI IOCK error: Not continuing"); 254 255 /* 256 * If we end up here, it means we have received an NMI while 257 * processing panic(). Simply return without delaying and 258 * re-enabling NMIs. 259 */ 260 return; 261 } 262 263 /* Re-enable the IOCK line, wait for a few seconds */ 264 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; 265 outb(reason, NMI_REASON_PORT); 266 267 i = 20000; 268 while (--i) { 269 touch_nmi_watchdog(); 270 udelay(100); 271 } 272 273 reason &= ~NMI_REASON_CLEAR_IOCHK; 274 outb(reason, NMI_REASON_PORT); 275 } 276 NOKPROBE_SYMBOL(io_check_error); 277 278 static void 279 unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 280 { 281 int handled; 282 283 /* 284 * Use 'false' as back-to-back NMIs are dealt with one level up. 285 * Of course this makes having multiple 'unknown' handlers useless 286 * as only the first one is ever run (unless it can actually determine 287 * if it caused the NMI) 288 */ 289 handled = nmi_handle(NMI_UNKNOWN, regs); 290 if (handled) { 291 __this_cpu_add(nmi_stats.unknown, handled); 292 return; 293 } 294 295 __this_cpu_add(nmi_stats.unknown, 1); 296 297 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 298 reason, smp_processor_id()); 299 300 pr_emerg("Do you have a strange power saving mode enabled?\n"); 301 if (unknown_nmi_panic || panic_on_unrecovered_nmi) 302 nmi_panic(regs, "NMI: Not continuing"); 303 304 pr_emerg("Dazed and confused, but trying to continue\n"); 305 } 306 NOKPROBE_SYMBOL(unknown_nmi_error); 307 308 static DEFINE_PER_CPU(bool, swallow_nmi); 309 static DEFINE_PER_CPU(unsigned long, last_nmi_rip); 310 311 static void default_do_nmi(struct pt_regs *regs) 312 { 313 unsigned char reason = 0; 314 int handled; 315 bool b2b = false; 316 317 /* 318 * CPU-specific NMI must be processed before non-CPU-specific 319 * NMI, otherwise we may lose it, because the CPU-specific 320 * NMI can not be detected/processed on other CPUs. 321 */ 322 323 /* 324 * Back-to-back NMIs are interesting because they can either 325 * be two NMI or more than two NMIs (any thing over two is dropped 326 * due to NMI being edge-triggered). If this is the second half 327 * of the back-to-back NMI, assume we dropped things and process 328 * more handlers. Otherwise reset the 'swallow' NMI behaviour 329 */ 330 if (regs->ip == __this_cpu_read(last_nmi_rip)) 331 b2b = true; 332 else 333 __this_cpu_write(swallow_nmi, false); 334 335 __this_cpu_write(last_nmi_rip, regs->ip); 336 337 handled = nmi_handle(NMI_LOCAL, regs); 338 __this_cpu_add(nmi_stats.normal, handled); 339 if (handled) { 340 /* 341 * There are cases when a NMI handler handles multiple 342 * events in the current NMI. One of these events may 343 * be queued for in the next NMI. Because the event is 344 * already handled, the next NMI will result in an unknown 345 * NMI. Instead lets flag this for a potential NMI to 346 * swallow. 347 */ 348 if (handled > 1) 349 __this_cpu_write(swallow_nmi, true); 350 return; 351 } 352 353 /* 354 * Non-CPU-specific NMI: NMI sources can be processed on any CPU. 355 * 356 * Another CPU may be processing panic routines while holding 357 * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, 358 * and if so, call its callback directly. If there is no CPU preparing 359 * crash dump, we simply loop here. 360 */ 361 while (!raw_spin_trylock(&nmi_reason_lock)) { 362 run_crash_ipi_callback(regs); 363 cpu_relax(); 364 } 365 366 reason = x86_platform.get_nmi_reason(); 367 368 if (reason & NMI_REASON_MASK) { 369 if (reason & NMI_REASON_SERR) 370 pci_serr_error(reason, regs); 371 else if (reason & NMI_REASON_IOCHK) 372 io_check_error(reason, regs); 373 #ifdef CONFIG_X86_32 374 /* 375 * Reassert NMI in case it became active 376 * meanwhile as it's edge-triggered: 377 */ 378 reassert_nmi(); 379 #endif 380 __this_cpu_add(nmi_stats.external, 1); 381 raw_spin_unlock(&nmi_reason_lock); 382 return; 383 } 384 raw_spin_unlock(&nmi_reason_lock); 385 386 /* 387 * Only one NMI can be latched at a time. To handle 388 * this we may process multiple nmi handlers at once to 389 * cover the case where an NMI is dropped. The downside 390 * to this approach is we may process an NMI prematurely, 391 * while its real NMI is sitting latched. This will cause 392 * an unknown NMI on the next run of the NMI processing. 393 * 394 * We tried to flag that condition above, by setting the 395 * swallow_nmi flag when we process more than one event. 396 * This condition is also only present on the second half 397 * of a back-to-back NMI, so we flag that condition too. 398 * 399 * If both are true, we assume we already processed this 400 * NMI previously and we swallow it. Otherwise we reset 401 * the logic. 402 * 403 * There are scenarios where we may accidentally swallow 404 * a 'real' unknown NMI. For example, while processing 405 * a perf NMI another perf NMI comes in along with a 406 * 'real' unknown NMI. These two NMIs get combined into 407 * one (as descibed above). When the next NMI gets 408 * processed, it will be flagged by perf as handled, but 409 * noone will know that there was a 'real' unknown NMI sent 410 * also. As a result it gets swallowed. Or if the first 411 * perf NMI returns two events handled then the second 412 * NMI will get eaten by the logic below, again losing a 413 * 'real' unknown NMI. But this is the best we can do 414 * for now. 415 */ 416 if (b2b && __this_cpu_read(swallow_nmi)) 417 __this_cpu_add(nmi_stats.swallow, 1); 418 else 419 unknown_nmi_error(reason, regs); 420 } 421 NOKPROBE_SYMBOL(default_do_nmi); 422 423 /* 424 * NMIs can page fault or hit breakpoints which will cause it to lose 425 * its NMI context with the CPU when the breakpoint or page fault does an IRET. 426 * 427 * As a result, NMIs can nest if NMIs get unmasked due an IRET during 428 * NMI processing. On x86_64, the asm glue protects us from nested NMIs 429 * if the outer NMI came from kernel mode, but we can still nest if the 430 * outer NMI came from user mode. 431 * 432 * To handle these nested NMIs, we have three states: 433 * 434 * 1) not running 435 * 2) executing 436 * 3) latched 437 * 438 * When no NMI is in progress, it is in the "not running" state. 439 * When an NMI comes in, it goes into the "executing" state. 440 * Normally, if another NMI is triggered, it does not interrupt 441 * the running NMI and the HW will simply latch it so that when 442 * the first NMI finishes, it will restart the second NMI. 443 * (Note, the latch is binary, thus multiple NMIs triggering, 444 * when one is running, are ignored. Only one NMI is restarted.) 445 * 446 * If an NMI executes an iret, another NMI can preempt it. We do not 447 * want to allow this new NMI to run, but we want to execute it when the 448 * first one finishes. We set the state to "latched", and the exit of 449 * the first NMI will perform a dec_return, if the result is zero 450 * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the 451 * dec_return would have set the state to NMI_EXECUTING (what we want it 452 * to be when we are running). In this case, we simply jump back to 453 * rerun the NMI handler again, and restart the 'latched' NMI. 454 * 455 * No trap (breakpoint or page fault) should be hit before nmi_restart, 456 * thus there is no race between the first check of state for NOT_RUNNING 457 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs 458 * at this point. 459 * 460 * In case the NMI takes a page fault, we need to save off the CR2 461 * because the NMI could have preempted another page fault and corrupt 462 * the CR2 that is about to be read. As nested NMIs must be restarted 463 * and they can not take breakpoints or page faults, the update of the 464 * CR2 must be done before converting the nmi state back to NOT_RUNNING. 465 * Otherwise, there would be a race of another nested NMI coming in 466 * after setting state to NOT_RUNNING but before updating the nmi_cr2. 467 */ 468 enum nmi_states { 469 NMI_NOT_RUNNING = 0, 470 NMI_EXECUTING, 471 NMI_LATCHED, 472 }; 473 static DEFINE_PER_CPU(enum nmi_states, nmi_state); 474 static DEFINE_PER_CPU(unsigned long, nmi_cr2); 475 476 #ifdef CONFIG_X86_64 477 /* 478 * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without 479 * some care, the inner breakpoint will clobber the outer breakpoint's 480 * stack. 481 * 482 * If a breakpoint is being processed, and the debug stack is being 483 * used, if an NMI comes in and also hits a breakpoint, the stack 484 * pointer will be set to the same fixed address as the breakpoint that 485 * was interrupted, causing that stack to be corrupted. To handle this 486 * case, check if the stack that was interrupted is the debug stack, and 487 * if so, change the IDT so that new breakpoints will use the current 488 * stack and not switch to the fixed address. On return of the NMI, 489 * switch back to the original IDT. 490 */ 491 static DEFINE_PER_CPU(int, update_debug_stack); 492 493 static bool notrace is_debug_stack(unsigned long addr) 494 { 495 struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); 496 unsigned long top = CEA_ESTACK_TOP(cs, DB); 497 unsigned long bot = CEA_ESTACK_BOT(cs, DB1); 498 499 if (__this_cpu_read(debug_stack_usage)) 500 return true; 501 /* 502 * Note, this covers the guard page between DB and DB1 as well to 503 * avoid two checks. But by all means @addr can never point into 504 * the guard page. 505 */ 506 return addr >= bot && addr < top; 507 } 508 NOKPROBE_SYMBOL(is_debug_stack); 509 #endif 510 511 dotraplinkage notrace void 512 do_nmi(struct pt_regs *regs, long error_code) 513 { 514 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { 515 this_cpu_write(nmi_state, NMI_LATCHED); 516 return; 517 } 518 this_cpu_write(nmi_state, NMI_EXECUTING); 519 this_cpu_write(nmi_cr2, read_cr2()); 520 nmi_restart: 521 522 #ifdef CONFIG_X86_64 523 /* 524 * If we interrupted a breakpoint, it is possible that 525 * the nmi handler will have breakpoints too. We need to 526 * change the IDT such that breakpoints that happen here 527 * continue to use the NMI stack. 528 */ 529 if (unlikely(is_debug_stack(regs->sp))) { 530 debug_stack_set_zero(); 531 this_cpu_write(update_debug_stack, 1); 532 } 533 #endif 534 535 nmi_enter(); 536 537 inc_irq_stat(__nmi_count); 538 539 if (!ignore_nmis) 540 default_do_nmi(regs); 541 542 nmi_exit(); 543 544 #ifdef CONFIG_X86_64 545 if (unlikely(this_cpu_read(update_debug_stack))) { 546 debug_stack_reset(); 547 this_cpu_write(update_debug_stack, 0); 548 } 549 #endif 550 551 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) 552 write_cr2(this_cpu_read(nmi_cr2)); 553 if (this_cpu_dec_return(nmi_state)) 554 goto nmi_restart; 555 556 if (user_mode(regs)) 557 mds_user_clear_cpu_buffers(); 558 } 559 NOKPROBE_SYMBOL(do_nmi); 560 561 void stop_nmi(void) 562 { 563 ignore_nmis++; 564 } 565 566 void restart_nmi(void) 567 { 568 ignore_nmis--; 569 } 570 571 /* reset the back-to-back NMI logic */ 572 void local_touch_nmi(void) 573 { 574 __this_cpu_write(last_nmi_rip, 0); 575 } 576 EXPORT_SYMBOL_GPL(local_touch_nmi); 577