/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>

#include "internal.h"

static DEFINE_MUTEX(mce_log_mutex);

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
static int cpu_missing;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
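 *
 * As a rough illustration (not part of this file), a decoder module would
 * typically hook in as sketched below. The callback and notifier_block
 * names are made up for the example; mce_register_decode_chain() and the
 * MCE_PRIO_* priorities are the real interface used further down:
 *
 *	static int example_mce_decode(struct notifier_block *nb,
 *				      unsigned long val, void *data)
 *	{
 *		struct mce *m = data;
 *
 *		if (!m)
 *			return NOTIFY_DONE;
 *
 *		pr_info("decoded MCE: bank %d status 0x%llx\n",
 *			m->bank, m->status);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mce_dec_nb = {
 *		.notifier_call	= example_mce_decode,
 *		.priority	= MCE_PRIO_EDAC,
 *	};
 *
 *	mce_register_decode_chain(&example_mce_dec_nb);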
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	/* need the internal __ version to avoid deadlocks */
	m->time = __ktime_get_real_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);

	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
		rdmsrl(MSR_PPIN, m->ppin);

	m->microcode = boot_cpu_data.microcode;
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

void mce_log(struct mce *m)
{
	if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);
}

void mce_inject_log(struct mce *m)
{
	mutex_lock(&mce_log_mutex);
	mce_log(m);
	mutex_unlock(&mce_log_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

/*
 * We run the default notifier if we have only the SRAO, the first and the
 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
 * notifiers registered on the chain.
 */
#define NUM_DEFAULT_NOTIFIERS	3
static atomic_t num_notifiers;

void mce_register_decode_chain(struct notifier_block *nb)
{
	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
		return;

	atomic_inc(&num_notifiers);

	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_dec(&num_notifiers);

	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

static inline u32 ctl_reg(int bank)
{
	return MSR_IA32_MCx_CTL(bank);
}

static inline u32 status_reg(int bank)
{
	return MSR_IA32_MCx_STATUS(bank);
}

static inline u32 addr_reg(int bank)
{
	return MSR_IA32_MCx_ADDR(bank);
}

static inline u32 misc_reg(int bank)
{
	return MSR_IA32_MCx_MISC(bank);
}

static inline u32 smca_ctl_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_CTL(bank);
}

static inline u32 smca_status_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_STATUS(bank);
}

static inline u32 smca_addr_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_ADDR(bank);
}

static inline u32 smca_misc_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_MISC(bank);
}

struct mca_msr_regs msr_ops = {
	.ctl	= ctl_reg,
	.status	= status_reg,
	.addr	= addr_reg,
	.misc	= misc_reg
};

static void __print_mce(struct mce *m)
{
	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
		m->extcpu,
		(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
		m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
: "", 237 m->cs, m->ip); 238 239 if (m->cs == __KERNEL_CS) 240 pr_cont("{%pS}", (void *)(unsigned long)m->ip); 241 pr_cont("\n"); 242 } 243 244 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 245 if (m->addr) 246 pr_cont("ADDR %llx ", m->addr); 247 if (m->misc) 248 pr_cont("MISC %llx ", m->misc); 249 250 if (mce_flags.smca) { 251 if (m->synd) 252 pr_cont("SYND %llx ", m->synd); 253 if (m->ipid) 254 pr_cont("IPID %llx ", m->ipid); 255 } 256 257 pr_cont("\n"); 258 /* 259 * Note this output is parsed by external tools and old fields 260 * should not be changed. 261 */ 262 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 263 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 264 m->microcode); 265 } 266 267 static void print_mce(struct mce *m) 268 { 269 __print_mce(m); 270 271 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) 272 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 273 } 274 275 #define PANIC_TIMEOUT 5 /* 5 seconds */ 276 277 static atomic_t mce_panicked; 278 279 static int fake_panic; 280 static atomic_t mce_fake_panicked; 281 282 /* Panic in progress. Enable interrupts and wait for final IPI */ 283 static void wait_for_panic(void) 284 { 285 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 286 287 preempt_disable(); 288 local_irq_enable(); 289 while (timeout-- > 0) 290 udelay(1); 291 if (panic_timeout == 0) 292 panic_timeout = mca_cfg.panic_timeout; 293 panic("Panicing machine check CPU died"); 294 } 295 296 static void mce_panic(const char *msg, struct mce *final, char *exp) 297 { 298 int apei_err = 0; 299 struct llist_node *pending; 300 struct mce_evt_llist *l; 301 302 if (!fake_panic) { 303 /* 304 * Make sure only one CPU runs in machine check panic 305 */ 306 if (atomic_inc_return(&mce_panicked) > 1) 307 wait_for_panic(); 308 barrier(); 309 310 bust_spinlocks(1); 311 console_verbose(); 312 } else { 313 /* Don't log too much for fake panic */ 314 if (atomic_inc_return(&mce_fake_panicked) > 1) 315 return; 316 } 317 pending = mce_gen_pool_prepare_records(); 318 /* First print corrected ones that are still unlogged */ 319 llist_for_each_entry(l, pending, llnode) { 320 struct mce *m = &l->mce; 321 if (!(m->status & MCI_STATUS_UC)) { 322 print_mce(m); 323 if (!apei_err) 324 apei_err = apei_write_mce(m); 325 } 326 } 327 /* Now print uncorrected but with the final one last */ 328 llist_for_each_entry(l, pending, llnode) { 329 struct mce *m = &l->mce; 330 if (!(m->status & MCI_STATUS_UC)) 331 continue; 332 if (!final || mce_cmp(m, final)) { 333 print_mce(m); 334 if (!apei_err) 335 apei_err = apei_write_mce(m); 336 } 337 } 338 if (final) { 339 print_mce(final); 340 if (!apei_err) 341 apei_err = apei_write_mce(final); 342 } 343 if (cpu_missing) 344 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 345 if (exp) 346 pr_emerg(HW_ERR "Machine check: %s\n", exp); 347 if (!fake_panic) { 348 if (panic_timeout == 0) 349 panic_timeout = mca_cfg.panic_timeout; 350 panic(msg); 351 } else 352 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 353 } 354 355 /* Support code for software error injection */ 356 357 static int msr_to_offset(u32 msr) 358 { 359 unsigned bank = __this_cpu_read(injectm.bank); 360 361 if (msr == mca_cfg.rip_msr) 362 return offsetof(struct mce, ip); 363 if (msr == msr_ops.status(bank)) 364 return offsetof(struct mce, status); 365 if (msr == msr_ops.addr(bank)) 366 return offsetof(struct mce, addr); 367 if (msr == msr_ops.misc(bank)) 368 return offsetof(struct mce, misc); 369 if (msr == 
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_gen_pool_empty())
		schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_schedule_work();
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 1;

	if (!(m->status & MCI_STATUS_MISCV))
		return 0;

	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;

	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD ||
	    m->cpuvendor == X86_VENDOR_HYGON) {
		return amd_mce_is_memory_error(m);
	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error.
		 * Bit 8 is used for indicating a cache hierarchy error. The
		 * combination of bit 2 and bit 3 is used for indicating a
		 * `generic' cache hierarchy error. But we can't just blindly
		 * check the above bits, because if bit 11 is set, then it is
		 * a bus/interconnect error - and either way the above bits
		 * just give more detail on what bus/interconnect error
		 * happened. Note that bit 12 can be ignored, as it's the
		 * "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	return false;
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

bool mce_is_correctable(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(mce_is_correctable);

static bool cec_add_mce(struct mce *m)
{
	if (!m)
		return false;

	/* We eat only correctable DRAM errors with usable addresses. */
	if (mce_is_memory_error(m) &&
	    mce_is_correctable(m) &&
	    mce_usable_address(m))
		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
			return true;

	return false;
}

static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	if (cec_add_mce(m))
		return NOTIFY_STOP;

	/* Emit the trace record: */
	trace_mce_record(m);

	set_bit(0, &mce_need_notify);

	mce_notify_irq();

	return NOTIFY_DONE;
}

static struct notifier_block first_nb = {
	.notifier_call	= mce_first_notifier,
	.priority	= MCE_PRIO_FIRST,
};

static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned long pfn;

	if (!mce)
		return NOTIFY_DONE;

	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
		pfn = mce->addr >> PAGE_SHIFT;
		if (!memory_failure(pfn, 0))
			set_mce_nospec(pfn);
	}

	return NOTIFY_OK;
}
static struct notifier_block mce_srao_nb = {
	.notifier_call	= srao_decode_notifier,
	.priority	= MCE_PRIO_SRAO,
};

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
		return NOTIFY_DONE;

	__print_mce(m);

	return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
	.notifier_call	= mce_default_notifier,
	/* lowest prio, we want it to run last. */
	.priority	= MCE_PRIO_LOWEST,
};

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(msr_ops.misc(i));

	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(msr_ops.addr(i));

		/*
		 * Mask the reported address by the reported granularity.
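		 * For example, an MCI_MISC-reported LSB of 6 means the
		 * address is only valid to 64-byte granularity, so the
		 * shifts below clear bits [5:0]: 0x12345678 becomes
		 * 0x12345640.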
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}

		/*
		 * Extract [55:<lsb>] where lsb is the least significant
		 * *valid* bit of the address bits.
		 */
		if (mce_flags.smca) {
			u8 lsb = (m->addr >> 56) & 0x3f;

			m->addr &= GENMASK_ULL(55, lsb);
		}
	}

	if (mce_flags.smca) {
		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

		if (m->status & MCI_STATUS_SYNDV)
			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	bool error_seen = false;
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	if (flags & MCP_TIMESTAMP)
		m.tsc = rdtsc();

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		barrier();
		m.status = mce_rdmsrl(msr_ops.status(i));

		/* If this entry is not valid, ignore it */
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * If we are logging everything (at CPU online) or this
		 * is a corrected error, then we must log it.
		 */
		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
			goto log_it;

		/*
		 * Newer Intel systems that support software error
		 * recovery need to make additional checks. Other
		 * CPUs should skip over uncorrected errors, but log
		 * everything else.
		 */
		if (!mca_cfg.ser) {
			if (m.status & MCI_STATUS_UC)
				continue;
			goto log_it;
		}

		/* Log "not enabled" (speculative) errors */
		if (!(m.status & MCI_STATUS_EN))
			goto log_it;

		/*
		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
		 * UC == 1 && PCC == 0 && S == 0
		 */
		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
			goto log_it;

		/*
		 * Skip anything else. Presumption is that our read of this
		 * bank is racing with a machine check. Leave the log alone
		 * for do_machine_check() to deal with it.
		 */
		continue;

log_it:
		error_seen = true;

		mce_read_aux(&m, i);

		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);
		else if (mce_usable_address(&m)) {
			/*
			 * Although we skipped logging this, we still want
			 * to take action. Add to the pool so the registered
			 * notifiers will see it.
			 */
			if (!mce_gen_pool_add(&m))
				mce_schedule_work();
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(msr_ops.status(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	char *tmp;
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		__set_bit(i, validp);
		if (quirk_no_way_out)
			quirk_no_way_out(i, m, regs);

		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
			m->bank = i;
			mce_read_aux(m, i);
			*msg = tmp;
			return 1;
		}
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic(msg, NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
884 */ 885 for_each_possible_cpu(cpu) { 886 int severity = mce_severity(&per_cpu(mces_seen, cpu), 887 mca_cfg.tolerant, 888 &nmsg, true); 889 if (severity > global_worst) { 890 msg = nmsg; 891 global_worst = severity; 892 m = &per_cpu(mces_seen, cpu); 893 } 894 } 895 896 /* 897 * Cannot recover? Panic here then. 898 * This dumps all the mces in the log buffer and stops the 899 * other CPUs. 900 */ 901 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) 902 mce_panic("Fatal machine check", m, msg); 903 904 /* 905 * For UC somewhere we let the CPU who detects it handle it. 906 * Also must let continue the others, otherwise the handling 907 * CPU could deadlock on a lock. 908 */ 909 910 /* 911 * No machine check event found. Must be some external 912 * source or one CPU is hung. Panic. 913 */ 914 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) 915 mce_panic("Fatal machine check from unknown source", NULL, NULL); 916 917 /* 918 * Now clear all the mces_seen so that they don't reappear on 919 * the next mce. 920 */ 921 for_each_possible_cpu(cpu) 922 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 923 } 924 925 static atomic_t global_nwo; 926 927 /* 928 * Start of Monarch synchronization. This waits until all CPUs have 929 * entered the exception handler and then determines if any of them 930 * saw a fatal event that requires panic. Then it executes them 931 * in the entry order. 932 * TBD double check parallel CPU hotunplug 933 */ 934 static int mce_start(int *no_way_out) 935 { 936 int order; 937 int cpus = num_online_cpus(); 938 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 939 940 if (!timeout) 941 return -1; 942 943 atomic_add(*no_way_out, &global_nwo); 944 /* 945 * Rely on the implied barrier below, such that global_nwo 946 * is updated before mce_callin. 947 */ 948 order = atomic_inc_return(&mce_callin); 949 950 /* 951 * Wait for everyone. 952 */ 953 while (atomic_read(&mce_callin) != cpus) { 954 if (mce_timed_out(&timeout, 955 "Timeout: Not all CPUs entered broadcast exception handler")) { 956 atomic_set(&global_nwo, 0); 957 return -1; 958 } 959 ndelay(SPINUNIT); 960 } 961 962 /* 963 * mce_callin should be read before global_nwo 964 */ 965 smp_rmb(); 966 967 if (order == 1) { 968 /* 969 * Monarch: Starts executing now, the others wait. 970 */ 971 atomic_set(&mce_executing, 1); 972 } else { 973 /* 974 * Subject: Now start the scanning loop one by one in 975 * the original callin order. 976 * This way when there are any shared banks it will be 977 * only seen by one CPU before cleared, avoiding duplicates. 978 */ 979 while (atomic_read(&mce_executing) < order) { 980 if (mce_timed_out(&timeout, 981 "Timeout: Subject CPUs unable to finish machine check processing")) { 982 atomic_set(&global_nwo, 0); 983 return -1; 984 } 985 ndelay(SPINUNIT); 986 } 987 } 988 989 /* 990 * Cache the global no_way_out state. 991 */ 992 *no_way_out = atomic_read(&global_nwo); 993 994 return order; 995 } 996 997 /* 998 * Synchronize between CPUs after main scanning loop. 999 * This invokes the bulk of the Monarch processing. 1000 */ 1001 static int mce_end(int order) 1002 { 1003 int ret = -1; 1004 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 1005 1006 if (!timeout) 1007 goto reset; 1008 if (order < 0) 1009 goto reset; 1010 1011 /* 1012 * Allow others to run. 1013 */ 1014 atomic_inc(&mce_executing); 1015 1016 if (order == 1) { 1017 /* CHECKME: Can this race with a parallel hotplug? 
*/ 1018 int cpus = num_online_cpus(); 1019 1020 /* 1021 * Monarch: Wait for everyone to go through their scanning 1022 * loops. 1023 */ 1024 while (atomic_read(&mce_executing) <= cpus) { 1025 if (mce_timed_out(&timeout, 1026 "Timeout: Monarch CPU unable to finish machine check processing")) 1027 goto reset; 1028 ndelay(SPINUNIT); 1029 } 1030 1031 mce_reign(); 1032 barrier(); 1033 ret = 0; 1034 } else { 1035 /* 1036 * Subject: Wait for Monarch to finish. 1037 */ 1038 while (atomic_read(&mce_executing) != 0) { 1039 if (mce_timed_out(&timeout, 1040 "Timeout: Monarch CPU did not finish machine check processing")) 1041 goto reset; 1042 ndelay(SPINUNIT); 1043 } 1044 1045 /* 1046 * Don't reset anything. That's done by the Monarch. 1047 */ 1048 return 0; 1049 } 1050 1051 /* 1052 * Reset all global state. 1053 */ 1054 reset: 1055 atomic_set(&global_nwo, 0); 1056 atomic_set(&mce_callin, 0); 1057 barrier(); 1058 1059 /* 1060 * Let others run again. 1061 */ 1062 atomic_set(&mce_executing, 0); 1063 return ret; 1064 } 1065 1066 static void mce_clear_state(unsigned long *toclear) 1067 { 1068 int i; 1069 1070 for (i = 0; i < mca_cfg.banks; i++) { 1071 if (test_bit(i, toclear)) 1072 mce_wrmsrl(msr_ops.status(i), 0); 1073 } 1074 } 1075 1076 static int do_memory_failure(struct mce *m) 1077 { 1078 int flags = MF_ACTION_REQUIRED; 1079 int ret; 1080 1081 pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); 1082 if (!(m->mcgstatus & MCG_STATUS_RIPV)) 1083 flags |= MF_MUST_KILL; 1084 ret = memory_failure(m->addr >> PAGE_SHIFT, flags); 1085 if (ret) 1086 pr_err("Memory error not recovered"); 1087 else 1088 set_mce_nospec(m->addr >> PAGE_SHIFT); 1089 return ret; 1090 } 1091 1092 1093 /* 1094 * Cases where we avoid rendezvous handler timeout: 1095 * 1) If this CPU is offline. 1096 * 1097 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to 1098 * skip those CPUs which remain looping in the 1st kernel - see 1099 * crash_nmi_callback(). 1100 * 1101 * Note: there still is a small window between kexec-ing and the new, 1102 * kdump kernel establishing a new #MC handler where a broadcasted MCE 1103 * might not get handled properly. 1104 */ 1105 static bool __mc_check_crashing_cpu(int cpu) 1106 { 1107 if (cpu_is_offline(cpu) || 1108 (crashing_cpu != -1 && crashing_cpu != cpu)) { 1109 u64 mcgstatus; 1110 1111 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 1112 if (mcgstatus & MCG_STATUS_RIPV) { 1113 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1114 return true; 1115 } 1116 } 1117 return false; 1118 } 1119 1120 static void __mc_scan_banks(struct mce *m, struct mce *final, 1121 unsigned long *toclear, unsigned long *valid_banks, 1122 int no_way_out, int *worst) 1123 { 1124 struct mca_config *cfg = &mca_cfg; 1125 int severity, i; 1126 1127 for (i = 0; i < cfg->banks; i++) { 1128 __clear_bit(i, toclear); 1129 if (!test_bit(i, valid_banks)) 1130 continue; 1131 1132 if (!mce_banks[i].ctl) 1133 continue; 1134 1135 m->misc = 0; 1136 m->addr = 0; 1137 m->bank = i; 1138 1139 m->status = mce_rdmsrl(msr_ops.status(i)); 1140 if (!(m->status & MCI_STATUS_VAL)) 1141 continue; 1142 1143 /* 1144 * Corrected or non-signaled errors are handled by 1145 * machine_check_poll(). Leave them alone, unless this panics. 1146 */ 1147 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 1148 !no_way_out) 1149 continue; 1150 1151 /* Set taint even when machine check was not enabled. 
*/ 1152 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 1153 1154 severity = mce_severity(m, cfg->tolerant, NULL, true); 1155 1156 /* 1157 * When machine check was for corrected/deferred handler don't 1158 * touch, unless we're panicking. 1159 */ 1160 if ((severity == MCE_KEEP_SEVERITY || 1161 severity == MCE_UCNA_SEVERITY) && !no_way_out) 1162 continue; 1163 1164 __set_bit(i, toclear); 1165 1166 /* Machine check event was not enabled. Clear, but ignore. */ 1167 if (severity == MCE_NO_SEVERITY) 1168 continue; 1169 1170 mce_read_aux(m, i); 1171 1172 /* assuming valid severity level != 0 */ 1173 m->severity = severity; 1174 1175 mce_log(m); 1176 1177 if (severity > *worst) { 1178 *final = *m; 1179 *worst = severity; 1180 } 1181 } 1182 1183 /* mce_clear_state will clear *final, save locally for use later */ 1184 *m = *final; 1185 } 1186 1187 /* 1188 * The actual machine check handler. This only handles real 1189 * exceptions when something got corrupted coming in through int 18. 1190 * 1191 * This is executed in NMI context not subject to normal locking rules. This 1192 * implies that most kernel services cannot be safely used. Don't even 1193 * think about putting a printk in there! 1194 * 1195 * On Intel systems this is entered on all CPUs in parallel through 1196 * MCE broadcast. However some CPUs might be broken beyond repair, 1197 * so be always careful when synchronizing with others. 1198 */ 1199 void do_machine_check(struct pt_regs *regs, long error_code) 1200 { 1201 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); 1202 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1203 struct mca_config *cfg = &mca_cfg; 1204 int cpu = smp_processor_id(); 1205 char *msg = "Unknown"; 1206 struct mce m, *final; 1207 int worst = 0; 1208 1209 /* 1210 * Establish sequential order between the CPUs entering the machine 1211 * check handler. 1212 */ 1213 int order = -1; 1214 1215 /* 1216 * If no_way_out gets set, there is no safe way to recover from this 1217 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. 1218 */ 1219 int no_way_out = 0; 1220 1221 /* 1222 * If kill_it gets set, there might be a way to recover from this 1223 * error. 1224 */ 1225 int kill_it = 0; 1226 1227 /* 1228 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES 1229 * on Intel. 1230 */ 1231 int lmce = 1; 1232 1233 if (__mc_check_crashing_cpu(cpu)) 1234 return; 1235 1236 ist_enter(regs); 1237 1238 this_cpu_inc(mce_exception_count); 1239 1240 mce_gather_info(&m, regs); 1241 m.tsc = rdtsc(); 1242 1243 final = this_cpu_ptr(&mces_seen); 1244 *final = m; 1245 1246 memset(valid_banks, 0, sizeof(valid_banks)); 1247 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); 1248 1249 barrier(); 1250 1251 /* 1252 * When no restart IP might need to kill or panic. 1253 * Assume the worst for now, but if we find the 1254 * severity is MCE_AR_SEVERITY we have other options. 1255 */ 1256 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 1257 kill_it = 1; 1258 1259 /* 1260 * Check if this MCE is signaled to only this logical processor, 1261 * on Intel only. 1262 */ 1263 if (m.cpuvendor == X86_VENDOR_INTEL) 1264 lmce = m.mcgstatus & MCG_STATUS_LMCES; 1265 1266 /* 1267 * Local machine check may already know that we have to panic. 1268 * Broadcast machine check begins rendezvous in mce_start() 1269 * Go through all banks in exclusion of the other CPUs. This way we 1270 * don't report duplicated events on shared banks because the first one 1271 * to see it will clear it. 
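	 * (In other words: for an LMCE the rendezvous below is skipped
	 * entirely and this CPU scans its banks right away; for a broadcast
	 * MCE, mce_start()/mce_end() order all CPUs through the scan.)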
1272 */ 1273 if (lmce) { 1274 if (no_way_out) 1275 mce_panic("Fatal local machine check", &m, msg); 1276 } else { 1277 order = mce_start(&no_way_out); 1278 } 1279 1280 __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); 1281 1282 if (!no_way_out) 1283 mce_clear_state(toclear); 1284 1285 /* 1286 * Do most of the synchronization with other CPUs. 1287 * When there's any problem use only local no_way_out state. 1288 */ 1289 if (!lmce) { 1290 if (mce_end(order) < 0) 1291 no_way_out = worst >= MCE_PANIC_SEVERITY; 1292 } else { 1293 /* 1294 * If there was a fatal machine check we should have 1295 * already called mce_panic earlier in this function. 1296 * Since we re-read the banks, we might have found 1297 * something new. Check again to see if we found a 1298 * fatal error. We call "mce_severity()" again to 1299 * make sure we have the right "msg". 1300 */ 1301 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { 1302 mce_severity(&m, cfg->tolerant, &msg, true); 1303 mce_panic("Local fatal machine check!", &m, msg); 1304 } 1305 } 1306 1307 /* 1308 * If tolerant is at an insane level we drop requests to kill 1309 * processes and continue even when there is no way out. 1310 */ 1311 if (cfg->tolerant == 3) 1312 kill_it = 0; 1313 else if (no_way_out) 1314 mce_panic("Fatal machine check on current CPU", &m, msg); 1315 1316 if (worst > 0) 1317 irq_work_queue(&mce_irq_work); 1318 1319 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1320 1321 sync_core(); 1322 1323 if (worst != MCE_AR_SEVERITY && !kill_it) 1324 goto out_ist; 1325 1326 /* Fault was in user mode and we need to take some action */ 1327 if ((m.cs & 3) == 3) { 1328 ist_begin_non_atomic(regs); 1329 local_irq_enable(); 1330 1331 if (kill_it || do_memory_failure(&m)) 1332 force_sig(SIGBUS, current); 1333 local_irq_disable(); 1334 ist_end_non_atomic(); 1335 } else { 1336 if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) 1337 mce_panic("Failed kernel mode recovery", &m, NULL); 1338 } 1339 1340 out_ist: 1341 ist_exit(regs); 1342 } 1343 EXPORT_SYMBOL_GPL(do_machine_check); 1344 1345 #ifndef CONFIG_MEMORY_FAILURE 1346 int memory_failure(unsigned long pfn, int flags) 1347 { 1348 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1349 BUG_ON(flags & MF_ACTION_REQUIRED); 1350 pr_err("Uncorrected memory error in page 0x%lx ignored\n" 1351 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", 1352 pfn); 1353 1354 return 0; 1355 } 1356 #endif 1357 1358 /* 1359 * Periodic polling timer for "silent" machine check errors. If the 1360 * poller finds an MCE, poll 2x faster. When the poller finds no more 1361 * errors, poll 2x slower (up to check_interval seconds). 
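 * For example, starting from a 320s interval, consecutive runs that find
 * errors shrink it to 160s, 80s, 40s, ... down to a floor of HZ/100
 * jiffies (10ms), while quiet runs double it again until it is capped at
 * check_interval seconds (see mce_timer_fn() below).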
1362 */ 1363 static unsigned long check_interval = INITIAL_CHECK_INTERVAL; 1364 1365 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1366 static DEFINE_PER_CPU(struct timer_list, mce_timer); 1367 1368 static unsigned long mce_adjust_timer_default(unsigned long interval) 1369 { 1370 return interval; 1371 } 1372 1373 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; 1374 1375 static void __start_timer(struct timer_list *t, unsigned long interval) 1376 { 1377 unsigned long when = jiffies + interval; 1378 unsigned long flags; 1379 1380 local_irq_save(flags); 1381 1382 if (!timer_pending(t) || time_before(when, t->expires)) 1383 mod_timer(t, round_jiffies(when)); 1384 1385 local_irq_restore(flags); 1386 } 1387 1388 static void mce_timer_fn(struct timer_list *t) 1389 { 1390 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); 1391 unsigned long iv; 1392 1393 WARN_ON(cpu_t != t); 1394 1395 iv = __this_cpu_read(mce_next_interval); 1396 1397 if (mce_available(this_cpu_ptr(&cpu_info))) { 1398 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); 1399 1400 if (mce_intel_cmci_poll()) { 1401 iv = mce_adjust_timer(iv); 1402 goto done; 1403 } 1404 } 1405 1406 /* 1407 * Alert userspace if needed. If we logged an MCE, reduce the polling 1408 * interval, otherwise increase the polling interval. 1409 */ 1410 if (mce_notify_irq()) 1411 iv = max(iv / 2, (unsigned long) HZ/100); 1412 else 1413 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1414 1415 done: 1416 __this_cpu_write(mce_next_interval, iv); 1417 __start_timer(t, iv); 1418 } 1419 1420 /* 1421 * Ensure that the timer is firing in @interval from now. 1422 */ 1423 void mce_timer_kick(unsigned long interval) 1424 { 1425 struct timer_list *t = this_cpu_ptr(&mce_timer); 1426 unsigned long iv = __this_cpu_read(mce_next_interval); 1427 1428 __start_timer(t, interval); 1429 1430 if (interval < iv) 1431 __this_cpu_write(mce_next_interval, interval); 1432 } 1433 1434 /* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1435 static void mce_timer_delete_all(void) 1436 { 1437 int cpu; 1438 1439 for_each_online_cpu(cpu) 1440 del_timer_sync(&per_cpu(mce_timer, cpu)); 1441 } 1442 1443 /* 1444 * Notify the user(s) about new machine check events. 1445 * Can be called from interrupt context, but not from machine check/NMI 1446 * context. 1447 */ 1448 int mce_notify_irq(void) 1449 { 1450 /* Not more than two messages every minute */ 1451 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1452 1453 if (test_and_clear_bit(0, &mce_need_notify)) { 1454 mce_work_trigger(); 1455 1456 if (__ratelimit(&ratelimit)) 1457 pr_info(HW_ERR "Machine check events logged\n"); 1458 1459 return 1; 1460 } 1461 return 0; 1462 } 1463 EXPORT_SYMBOL_GPL(mce_notify_irq); 1464 1465 static int __mcheck_cpu_mce_banks_init(void) 1466 { 1467 int i; 1468 1469 mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); 1470 if (!mce_banks) 1471 return -ENOMEM; 1472 1473 for (i = 0; i < MAX_NR_BANKS; i++) { 1474 struct mce_bank *b = &mce_banks[i]; 1475 1476 b->ctl = -1ULL; 1477 b->init = 1; 1478 } 1479 return 0; 1480 } 1481 1482 /* 1483 * Initialize Machine Checks for a CPU. 
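 *
 * (For reference: of the MCG_CAP bits read below, bits 7:0 give the bank
 * count, MCG_EXT_P/MCG_EXT_CNT describe the extended state registers used
 * for accurate RIP reporting, and MCG_SER_P advertises software error
 * recovery support.)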
1484 */ 1485 static int __mcheck_cpu_cap_init(void) 1486 { 1487 u64 cap; 1488 u8 b; 1489 1490 rdmsrl(MSR_IA32_MCG_CAP, cap); 1491 1492 b = cap & MCG_BANKCNT_MASK; 1493 if (WARN_ON_ONCE(b > MAX_NR_BANKS)) 1494 b = MAX_NR_BANKS; 1495 1496 mca_cfg.banks = max(mca_cfg.banks, b); 1497 1498 if (!mce_banks) { 1499 int err = __mcheck_cpu_mce_banks_init(); 1500 if (err) 1501 return err; 1502 } 1503 1504 /* Use accurate RIP reporting if available. */ 1505 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1506 mca_cfg.rip_msr = MSR_IA32_MCG_EIP; 1507 1508 if (cap & MCG_SER_P) 1509 mca_cfg.ser = 1; 1510 1511 return 0; 1512 } 1513 1514 static void __mcheck_cpu_init_generic(void) 1515 { 1516 enum mcp_flags m_fl = 0; 1517 mce_banks_t all_banks; 1518 u64 cap; 1519 1520 if (!mca_cfg.bootlog) 1521 m_fl = MCP_DONTLOG; 1522 1523 /* 1524 * Log the machine checks left over from the previous reset. 1525 */ 1526 bitmap_fill(all_banks, MAX_NR_BANKS); 1527 machine_check_poll(MCP_UC | m_fl, &all_banks); 1528 1529 cr4_set_bits(X86_CR4_MCE); 1530 1531 rdmsrl(MSR_IA32_MCG_CAP, cap); 1532 if (cap & MCG_CTL_P) 1533 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1534 } 1535 1536 static void __mcheck_cpu_init_clear_banks(void) 1537 { 1538 int i; 1539 1540 for (i = 0; i < mca_cfg.banks; i++) { 1541 struct mce_bank *b = &mce_banks[i]; 1542 1543 if (!b->init) 1544 continue; 1545 wrmsrl(msr_ops.ctl(i), b->ctl); 1546 wrmsrl(msr_ops.status(i), 0); 1547 } 1548 } 1549 1550 /* 1551 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and 1552 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM 1553 * Vol 3B Table 15-20). But this confuses both the code that determines 1554 * whether the machine check occurred in kernel or user mode, and also 1555 * the severity assessment code. Pretend that EIPV was set, and take the 1556 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 1557 */ 1558 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) 1559 { 1560 if (bank != 0) 1561 return; 1562 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) 1563 return; 1564 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| 1565 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| 1566 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| 1567 MCACOD)) != 1568 (MCI_STATUS_UC|MCI_STATUS_EN| 1569 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| 1570 MCI_STATUS_AR|MCACOD_INSTR)) 1571 return; 1572 1573 m->mcgstatus |= MCG_STATUS_EIPV; 1574 m->ip = regs->ip; 1575 m->cs = regs->cs; 1576 } 1577 1578 /* Add per CPU specific workarounds here */ 1579 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1580 { 1581 struct mca_config *cfg = &mca_cfg; 1582 1583 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1584 pr_info("unknown CPU type - not enabling MCE support\n"); 1585 return -EOPNOTSUPP; 1586 } 1587 1588 /* This should be disabled by the BIOS, but isn't always */ 1589 if (c->x86_vendor == X86_VENDOR_AMD) { 1590 if (c->x86 == 15 && cfg->banks > 4) { 1591 /* 1592 * disable GART TBL walk error reporting, which 1593 * trips off incorrectly with the IOMMU & 3ware 1594 * & Cerberus: 1595 */ 1596 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1597 } 1598 if (c->x86 < 0x11 && cfg->bootlog < 0) { 1599 /* 1600 * Lots of broken BIOS around that don't clear them 1601 * by default and leave crap in there. Don't log: 1602 */ 1603 cfg->bootlog = 0; 1604 } 1605 /* 1606 * Various K7s with broken bank 0 around. Always disable 1607 * by default. 
1608 */ 1609 if (c->x86 == 6 && cfg->banks > 0) 1610 mce_banks[0].ctl = 0; 1611 1612 /* 1613 * overflow_recov is supported for F15h Models 00h-0fh 1614 * even though we don't have a CPUID bit for it. 1615 */ 1616 if (c->x86 == 0x15 && c->x86_model <= 0xf) 1617 mce_flags.overflow_recov = 1; 1618 1619 } 1620 1621 if (c->x86_vendor == X86_VENDOR_INTEL) { 1622 /* 1623 * SDM documents that on family 6 bank 0 should not be written 1624 * because it aliases to another special BIOS controlled 1625 * register. 1626 * But it's not aliased anymore on model 0x1a+ 1627 * Don't ignore bank 0 completely because there could be a 1628 * valid event later, merely don't write CTL0. 1629 */ 1630 1631 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) 1632 mce_banks[0].init = 0; 1633 1634 /* 1635 * All newer Intel systems support MCE broadcasting. Enable 1636 * synchronization with a one second timeout. 1637 */ 1638 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1639 cfg->monarch_timeout < 0) 1640 cfg->monarch_timeout = USEC_PER_SEC; 1641 1642 /* 1643 * There are also broken BIOSes on some Pentium M and 1644 * earlier systems: 1645 */ 1646 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) 1647 cfg->bootlog = 0; 1648 1649 if (c->x86 == 6 && c->x86_model == 45) 1650 quirk_no_way_out = quirk_sandybridge_ifu; 1651 } 1652 if (cfg->monarch_timeout < 0) 1653 cfg->monarch_timeout = 0; 1654 if (cfg->bootlog != 0) 1655 cfg->panic_timeout = 30; 1656 1657 return 0; 1658 } 1659 1660 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1661 { 1662 if (c->x86 != 5) 1663 return 0; 1664 1665 switch (c->x86_vendor) { 1666 case X86_VENDOR_INTEL: 1667 intel_p5_mcheck_init(c); 1668 return 1; 1669 break; 1670 case X86_VENDOR_CENTAUR: 1671 winchip_mcheck_init(c); 1672 return 1; 1673 break; 1674 default: 1675 return 0; 1676 } 1677 1678 return 0; 1679 } 1680 1681 /* 1682 * Init basic CPU features needed for early decoding of MCEs. 1683 */ 1684 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) 1685 { 1686 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { 1687 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); 1688 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); 1689 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); 1690 1691 if (mce_flags.smca) { 1692 msr_ops.ctl = smca_ctl_reg; 1693 msr_ops.status = smca_status_reg; 1694 msr_ops.addr = smca_addr_reg; 1695 msr_ops.misc = smca_misc_reg; 1696 } 1697 } 1698 } 1699 1700 static void mce_centaur_feature_init(struct cpuinfo_x86 *c) 1701 { 1702 struct mca_config *cfg = &mca_cfg; 1703 1704 /* 1705 * All newer Centaur CPUs support MCE broadcasting. Enable 1706 * synchronization with a one second timeout. 
1707 */ 1708 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || 1709 c->x86 > 6) { 1710 if (cfg->monarch_timeout < 0) 1711 cfg->monarch_timeout = USEC_PER_SEC; 1712 } 1713 } 1714 1715 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1716 { 1717 switch (c->x86_vendor) { 1718 case X86_VENDOR_INTEL: 1719 mce_intel_feature_init(c); 1720 mce_adjust_timer = cmci_intel_adjust_timer; 1721 break; 1722 1723 case X86_VENDOR_AMD: { 1724 mce_amd_feature_init(c); 1725 break; 1726 } 1727 1728 case X86_VENDOR_HYGON: 1729 mce_hygon_feature_init(c); 1730 break; 1731 1732 case X86_VENDOR_CENTAUR: 1733 mce_centaur_feature_init(c); 1734 break; 1735 1736 default: 1737 break; 1738 } 1739 } 1740 1741 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) 1742 { 1743 switch (c->x86_vendor) { 1744 case X86_VENDOR_INTEL: 1745 mce_intel_feature_clear(c); 1746 break; 1747 default: 1748 break; 1749 } 1750 } 1751 1752 static void mce_start_timer(struct timer_list *t) 1753 { 1754 unsigned long iv = check_interval * HZ; 1755 1756 if (mca_cfg.ignore_ce || !iv) 1757 return; 1758 1759 this_cpu_write(mce_next_interval, iv); 1760 __start_timer(t, iv); 1761 } 1762 1763 static void __mcheck_cpu_setup_timer(void) 1764 { 1765 struct timer_list *t = this_cpu_ptr(&mce_timer); 1766 1767 timer_setup(t, mce_timer_fn, TIMER_PINNED); 1768 } 1769 1770 static void __mcheck_cpu_init_timer(void) 1771 { 1772 struct timer_list *t = this_cpu_ptr(&mce_timer); 1773 1774 timer_setup(t, mce_timer_fn, TIMER_PINNED); 1775 mce_start_timer(t); 1776 } 1777 1778 bool filter_mce(struct mce *m) 1779 { 1780 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1781 return amd_filter_mce(m); 1782 1783 return false; 1784 } 1785 1786 /* Handle unconfigured int18 (should never happen) */ 1787 static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1788 { 1789 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1790 smp_processor_id()); 1791 } 1792 1793 /* Call the installed machine check handler for this CPU setup. */ 1794 void (*machine_check_vector)(struct pt_regs *, long error_code) = 1795 unexpected_machine_check; 1796 1797 dotraplinkage void do_mce(struct pt_regs *regs, long error_code) 1798 { 1799 machine_check_vector(regs, error_code); 1800 } 1801 1802 /* 1803 * Called for each booted CPU to set up machine checks. 
1804 * Must be called with preempt off: 1805 */ 1806 void mcheck_cpu_init(struct cpuinfo_x86 *c) 1807 { 1808 if (mca_cfg.disabled) 1809 return; 1810 1811 if (__mcheck_cpu_ancient_init(c)) 1812 return; 1813 1814 if (!mce_available(c)) 1815 return; 1816 1817 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1818 mca_cfg.disabled = 1; 1819 return; 1820 } 1821 1822 if (mce_gen_pool_init()) { 1823 mca_cfg.disabled = 1; 1824 pr_emerg("Couldn't allocate MCE records pool!\n"); 1825 return; 1826 } 1827 1828 machine_check_vector = do_machine_check; 1829 1830 __mcheck_cpu_init_early(c); 1831 __mcheck_cpu_init_generic(); 1832 __mcheck_cpu_init_vendor(c); 1833 __mcheck_cpu_init_clear_banks(); 1834 __mcheck_cpu_setup_timer(); 1835 } 1836 1837 /* 1838 * Called for each booted CPU to clear some machine checks opt-ins 1839 */ 1840 void mcheck_cpu_clear(struct cpuinfo_x86 *c) 1841 { 1842 if (mca_cfg.disabled) 1843 return; 1844 1845 if (!mce_available(c)) 1846 return; 1847 1848 /* 1849 * Possibly to clear general settings generic to x86 1850 * __mcheck_cpu_clear_generic(c); 1851 */ 1852 __mcheck_cpu_clear_vendor(c); 1853 1854 } 1855 1856 static void __mce_disable_bank(void *arg) 1857 { 1858 int bank = *((int *)arg); 1859 __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); 1860 cmci_disable_bank(bank); 1861 } 1862 1863 void mce_disable_bank(int bank) 1864 { 1865 if (bank >= mca_cfg.banks) { 1866 pr_warn(FW_BUG 1867 "Ignoring request to disable invalid MCA bank %d.\n", 1868 bank); 1869 return; 1870 } 1871 set_bit(bank, mce_banks_ce_disabled); 1872 on_each_cpu(__mce_disable_bank, &bank, 1); 1873 } 1874 1875 /* 1876 * mce=off Disables machine check 1877 * mce=no_cmci Disables CMCI 1878 * mce=no_lmce Disables LMCE 1879 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1880 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1881 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1882 * monarchtimeout is how long to wait for other CPUs on machine 1883 * check, or 0 to not wait 1884 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h 1885 and older. 1886 * mce=nobootlog Don't log MCEs from before booting. 1887 * mce=bios_cmci_threshold Don't program the CMCI threshold 1888 * mce=recovery force enable memcpy_mcsafe() 1889 */ 1890 static int __init mcheck_enable(char *str) 1891 { 1892 struct mca_config *cfg = &mca_cfg; 1893 1894 if (*str == 0) { 1895 enable_p5_mce(); 1896 return 1; 1897 } 1898 if (*str == '=') 1899 str++; 1900 if (!strcmp(str, "off")) 1901 cfg->disabled = 1; 1902 else if (!strcmp(str, "no_cmci")) 1903 cfg->cmci_disabled = true; 1904 else if (!strcmp(str, "no_lmce")) 1905 cfg->lmce_disabled = 1; 1906 else if (!strcmp(str, "dont_log_ce")) 1907 cfg->dont_log_ce = true; 1908 else if (!strcmp(str, "ignore_ce")) 1909 cfg->ignore_ce = true; 1910 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1911 cfg->bootlog = (str[0] == 'b'); 1912 else if (!strcmp(str, "bios_cmci_threshold")) 1913 cfg->bios_cmci_threshold = 1; 1914 else if (!strcmp(str, "recovery")) 1915 cfg->recovery = 1; 1916 else if (isdigit(str[0])) { 1917 if (get_option(&str, &cfg->tolerant) == 2) 1918 get_option(&str, &(cfg->monarch_timeout)); 1919 } else { 1920 pr_info("mce argument %s ignored. 
Please use /sys\n", str); 1921 return 0; 1922 } 1923 return 1; 1924 } 1925 __setup("mce", mcheck_enable); 1926 1927 int __init mcheck_init(void) 1928 { 1929 mcheck_intel_therm_init(); 1930 mce_register_decode_chain(&first_nb); 1931 mce_register_decode_chain(&mce_srao_nb); 1932 mce_register_decode_chain(&mce_default_nb); 1933 mcheck_vendor_init_severity(); 1934 1935 INIT_WORK(&mce_work, mce_gen_pool_process); 1936 init_irq_work(&mce_irq_work, mce_irq_work_cb); 1937 1938 return 0; 1939 } 1940 1941 /* 1942 * mce_syscore: PM support 1943 */ 1944 1945 /* 1946 * Disable machine checks on suspend and shutdown. We can't really handle 1947 * them later. 1948 */ 1949 static void mce_disable_error_reporting(void) 1950 { 1951 int i; 1952 1953 for (i = 0; i < mca_cfg.banks; i++) { 1954 struct mce_bank *b = &mce_banks[i]; 1955 1956 if (b->init) 1957 wrmsrl(msr_ops.ctl(i), 0); 1958 } 1959 return; 1960 } 1961 1962 static void vendor_disable_error_reporting(void) 1963 { 1964 /* 1965 * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs 1966 * are socket-wide. 1967 * Disabling them for just a single offlined CPU is bad, since it will 1968 * inhibit reporting for all shared resources on the socket like the 1969 * last level cache (LLC), the integrated memory controller (iMC), etc. 1970 */ 1971 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || 1972 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || 1973 boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1974 return; 1975 1976 mce_disable_error_reporting(); 1977 } 1978 1979 static int mce_syscore_suspend(void) 1980 { 1981 vendor_disable_error_reporting(); 1982 return 0; 1983 } 1984 1985 static void mce_syscore_shutdown(void) 1986 { 1987 vendor_disable_error_reporting(); 1988 } 1989 1990 /* 1991 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
1992 * Only one CPU is active at this time, the others get re-added later using 1993 * CPU hotplug: 1994 */ 1995 static void mce_syscore_resume(void) 1996 { 1997 __mcheck_cpu_init_generic(); 1998 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); 1999 __mcheck_cpu_init_clear_banks(); 2000 } 2001 2002 static struct syscore_ops mce_syscore_ops = { 2003 .suspend = mce_syscore_suspend, 2004 .shutdown = mce_syscore_shutdown, 2005 .resume = mce_syscore_resume, 2006 }; 2007 2008 /* 2009 * mce_device: Sysfs support 2010 */ 2011 2012 static void mce_cpu_restart(void *data) 2013 { 2014 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2015 return; 2016 __mcheck_cpu_init_generic(); 2017 __mcheck_cpu_init_clear_banks(); 2018 __mcheck_cpu_init_timer(); 2019 } 2020 2021 /* Reinit MCEs after user configuration changes */ 2022 static void mce_restart(void) 2023 { 2024 mce_timer_delete_all(); 2025 on_each_cpu(mce_cpu_restart, NULL, 1); 2026 } 2027 2028 /* Toggle features for corrected errors */ 2029 static void mce_disable_cmci(void *data) 2030 { 2031 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2032 return; 2033 cmci_clear(); 2034 } 2035 2036 static void mce_enable_ce(void *all) 2037 { 2038 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2039 return; 2040 cmci_reenable(); 2041 cmci_recheck(); 2042 if (all) 2043 __mcheck_cpu_init_timer(); 2044 } 2045 2046 static struct bus_type mce_subsys = { 2047 .name = "machinecheck", 2048 .dev_name = "machinecheck", 2049 }; 2050 2051 DEFINE_PER_CPU(struct device *, mce_device); 2052 2053 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 2054 { 2055 return container_of(attr, struct mce_bank, attr); 2056 } 2057 2058 static ssize_t show_bank(struct device *s, struct device_attribute *attr, 2059 char *buf) 2060 { 2061 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 2062 } 2063 2064 static ssize_t set_bank(struct device *s, struct device_attribute *attr, 2065 const char *buf, size_t size) 2066 { 2067 u64 new; 2068 2069 if (kstrtou64(buf, 0, &new) < 0) 2070 return -EINVAL; 2071 2072 attr_to_bank(attr)->ctl = new; 2073 mce_restart(); 2074 2075 return size; 2076 } 2077 2078 static ssize_t set_ignore_ce(struct device *s, 2079 struct device_attribute *attr, 2080 const char *buf, size_t size) 2081 { 2082 u64 new; 2083 2084 if (kstrtou64(buf, 0, &new) < 0) 2085 return -EINVAL; 2086 2087 mutex_lock(&mce_sysfs_mutex); 2088 if (mca_cfg.ignore_ce ^ !!new) { 2089 if (new) { 2090 /* disable ce features */ 2091 mce_timer_delete_all(); 2092 on_each_cpu(mce_disable_cmci, NULL, 1); 2093 mca_cfg.ignore_ce = true; 2094 } else { 2095 /* enable ce features */ 2096 mca_cfg.ignore_ce = false; 2097 on_each_cpu(mce_enable_ce, (void *)1, 1); 2098 } 2099 } 2100 mutex_unlock(&mce_sysfs_mutex); 2101 2102 return size; 2103 } 2104 2105 static ssize_t set_cmci_disabled(struct device *s, 2106 struct device_attribute *attr, 2107 const char *buf, size_t size) 2108 { 2109 u64 new; 2110 2111 if (kstrtou64(buf, 0, &new) < 0) 2112 return -EINVAL; 2113 2114 mutex_lock(&mce_sysfs_mutex); 2115 if (mca_cfg.cmci_disabled ^ !!new) { 2116 if (new) { 2117 /* disable cmci */ 2118 on_each_cpu(mce_disable_cmci, NULL, 1); 2119 mca_cfg.cmci_disabled = true; 2120 } else { 2121 /* enable cmci */ 2122 mca_cfg.cmci_disabled = false; 2123 on_each_cpu(mce_enable_ce, NULL, 1); 2124 } 2125 } 2126 mutex_unlock(&mce_sysfs_mutex); 2127 2128 return size; 2129 } 2130 2131 static ssize_t store_int_with_restart(struct device *s, 2132 struct device_attribute *attr, 2133 const char *buf, size_t size) 2134 { 2135 unsigned 
static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	unsigned long old_check_interval = check_interval;
	ssize_t ret = device_store_ulong(s, attr, buf, size);

	if (check_interval == old_check_interval)
		return ret;

	mutex_lock(&mce_sysfs_mutex);
	mce_restart();
	mutex_unlock(&mce_sysfs_mutex);

	return ret;
}

static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
	&dev_attr_trigger,
#endif
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = per_cpu(mce_device, cpu);
	if (dev)
		return 0;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}
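/*
 * Illustrative note (not part of the original file): the error paths in
 * mce_device_create() above follow the usual "unwind in reverse order" idiom,
 * where each loop index doubles as a high-water mark so the matching label
 * only tears down what was actually created. A stripped-down sketch of the
 * same pattern, using made-up helpers (make_a(), make_b(), destroy_a(),
 * destroy_b(), NUM_A, NUM_B do not exist in this file):
 *
 *	int create_all(void)
 *	{
 *		int i, j, err;
 *
 *		for (i = 0; i < NUM_A; i++) {
 *			err = make_a(i);
 *			if (err)
 *				goto undo_a;
 *		}
 *		for (j = 0; j < NUM_B; j++) {
 *			err = make_b(j);
 *			if (err)
 *				goto undo_b;
 *		}
 *		return 0;
 *
 *	undo_b:
 *		while (--j >= 0)
 *			destroy_b(j);
 *	undo_a:
 *		while (--i >= 0)
 *			destroy_a(i);
 *		return err;
 *	}
 */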
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!cpuhp_tasks_frozen)
		cmci_clear();

	vendor_disable_error_reporting();
}

static void mce_reenable_cpu(void)
{
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!cpuhp_tasks_frozen)
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(msr_ops.ctl(i), b->ctl);
	}
}

static int mce_cpu_dead(unsigned int cpu)
{
	mce_intel_hcpu_update(cpu);

	/* intentionally ignoring frozen here */
	if (!cpuhp_tasks_frozen)
		cmci_rediscover();
	return 0;
}

static int mce_cpu_online(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	int ret;

	mce_device_create(cpu);

	ret = mce_threshold_create_device(cpu);
	if (ret) {
		mce_device_remove(cpu);
		return ret;
	}
	mce_reenable_cpu();
	mce_start_timer(t);
	return 0;
}

static int mce_cpu_pre_down(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	mce_disable_cpu();
	del_timer_sync(t);
	mce_threshold_remove_device(cpu);
	mce_device_remove(cpu);
	return 0;
}

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name = b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;

	/*
	 * Check if we have a spare virtual bit. This will only become
	 * a problem if/when we move beyond 5-level page tables.
	 */
	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
				mce_cpu_dead);
	if (err)
		goto err_out_mem;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
				mce_cpu_online, mce_cpu_pre_down);
	if (err < 0)
		goto err_out_online;

	register_syscore_ops(&mce_syscore_ops);

	return 0;

err_out_online:
	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init MCE device (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);
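/*
 * Illustrative note (not part of the original file): the hotplug wiring in
 * mcheck_init_device() is an instance of the generic cpuhp_setup_state()
 * pattern. A minimal sketch for a hypothetical driver follows; my_online(),
 * my_prepare_down(), my_hp_state and the "x86/mydrv:online" name are made up
 * for illustration only:
 *
 *	#include <linux/cpuhotplug.h>
 *
 *	static enum cpuhp_state my_hp_state;
 *
 *	static int my_online(unsigned int cpu)
 *	{
 *		// per-CPU setup when this CPU comes online
 *		return 0;
 *	}
 *
 *	static int my_prepare_down(unsigned int cpu)
 *	{
 *		// per-CPU teardown before this CPU goes offline
 *		return 0;
 *	}
 *
 *	static int __init my_init(void)
 *	{
 *		int ret;
 *
 *		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mydrv:online",
 *					my_online, my_prepare_down);
 *		if (ret < 0)
 *			return ret;
 *		my_hp_state = ret;	// dynamic states return the state number
 *		return 0;
 *	}
 *
 * Teardown for such a dynamic state would go through
 * cpuhp_remove_state(my_hp_state).
 */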
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_panicked, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
			 "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
						 NULL, &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif

DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);

static int __init mcheck_late_init(void)
{
	pr_info("Using %d MCE banks\n", mca_cfg.banks);

	if (mca_cfg.recovery)
		static_branch_inc(&mcsafe_key);

	mcheck_debugfs_init();
	cec_init();

	/*
	 * Flush out everything that has been logged during early boot, now that
	 * everything has been initialized (workqueues, decoders, ...).
	 */
	mce_schedule_work();

	return 0;
}
late_initcall(mcheck_late_init);
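/*
 * Illustrative note (not part of the original file): mcsafe_key is a static
 * key that mcheck_late_init() bumps when recovery is enabled; it is consumed
 * outside this file (e.g. by the machine-check aware memory copy path). For
 * reference, the general static-key pattern it relies on looks roughly like
 * the sketch below; my_feature_key, do_safe() and do_fast() are hypothetical
 * names used only for illustration:
 *
 *	#include <linux/jump_label.h>
 *
 *	DEFINE_STATIC_KEY_FALSE(my_feature_key);
 *
 *	void hot_path(void)
 *	{
 *		if (static_branch_unlikely(&my_feature_key))
 *			do_safe();	// rare path, patched in at runtime
 *		else
 *			do_fast();	// default straight-line code
 *	}
 *
 *	void enable_feature(void)
 *	{
 *		static_branch_inc(&my_feature_key);	// flips the branch
 *	}
 */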