// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>

#include "internal.h"

static DEFINE_MUTEX(mce_log_mutex);

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
static int cpu_missing;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
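 *
 * Illustrative sketch (not from this file; names are hypothetical): a
 * decoder such as an EDAC driver would typically hook into this chain
 * with one of the MCE_PRIO_* priorities, e.g.:
 *
 *	static int example_decode(struct notifier_block *nb,
 *				  unsigned long val, void *data)
 *	{
 *		struct mce *m = data;
 *
 *		// decode and print *m here
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_decoder_nb = {
 *		.notifier_call	= example_decode,
 *		.priority	= MCE_PRIO_EDAC,
 *	};
 *
 *	mce_register_decode_chain(&example_decoder_nb);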
113 */ 114 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); 115 116 /* Do initial initialization of a struct mce */ 117 void mce_setup(struct mce *m) 118 { 119 memset(m, 0, sizeof(struct mce)); 120 m->cpu = m->extcpu = smp_processor_id(); 121 /* need the internal __ version to avoid deadlocks */ 122 m->time = __ktime_get_real_seconds(); 123 m->cpuvendor = boot_cpu_data.x86_vendor; 124 m->cpuid = cpuid_eax(1); 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126 m->apicid = cpu_data(m->extcpu).initial_apicid; 127 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 128 129 if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) 130 rdmsrl(MSR_PPIN, m->ppin); 131 132 m->microcode = boot_cpu_data.microcode; 133 } 134 135 DEFINE_PER_CPU(struct mce, injectm); 136 EXPORT_PER_CPU_SYMBOL_GPL(injectm); 137 138 void mce_log(struct mce *m) 139 { 140 if (!mce_gen_pool_add(m)) 141 irq_work_queue(&mce_irq_work); 142 } 143 144 void mce_inject_log(struct mce *m) 145 { 146 mutex_lock(&mce_log_mutex); 147 mce_log(m); 148 mutex_unlock(&mce_log_mutex); 149 } 150 EXPORT_SYMBOL_GPL(mce_inject_log); 151 152 static struct notifier_block mce_srao_nb; 153 154 /* 155 * We run the default notifier if we have only the SRAO, the first and the 156 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS 157 * notifiers registered on the chain. 158 */ 159 #define NUM_DEFAULT_NOTIFIERS 3 160 static atomic_t num_notifiers; 161 162 void mce_register_decode_chain(struct notifier_block *nb) 163 { 164 if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) 165 return; 166 167 atomic_inc(&num_notifiers); 168 169 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); 170 } 171 EXPORT_SYMBOL_GPL(mce_register_decode_chain); 172 173 void mce_unregister_decode_chain(struct notifier_block *nb) 174 { 175 atomic_dec(&num_notifiers); 176 177 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); 178 } 179 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); 180 181 static inline u32 ctl_reg(int bank) 182 { 183 return MSR_IA32_MCx_CTL(bank); 184 } 185 186 static inline u32 status_reg(int bank) 187 { 188 return MSR_IA32_MCx_STATUS(bank); 189 } 190 191 static inline u32 addr_reg(int bank) 192 { 193 return MSR_IA32_MCx_ADDR(bank); 194 } 195 196 static inline u32 misc_reg(int bank) 197 { 198 return MSR_IA32_MCx_MISC(bank); 199 } 200 201 static inline u32 smca_ctl_reg(int bank) 202 { 203 return MSR_AMD64_SMCA_MCx_CTL(bank); 204 } 205 206 static inline u32 smca_status_reg(int bank) 207 { 208 return MSR_AMD64_SMCA_MCx_STATUS(bank); 209 } 210 211 static inline u32 smca_addr_reg(int bank) 212 { 213 return MSR_AMD64_SMCA_MCx_ADDR(bank); 214 } 215 216 static inline u32 smca_misc_reg(int bank) 217 { 218 return MSR_AMD64_SMCA_MCx_MISC(bank); 219 } 220 221 struct mca_msr_regs msr_ops = { 222 .ctl = ctl_reg, 223 .status = status_reg, 224 .addr = addr_reg, 225 .misc = misc_reg 226 }; 227 228 static void __print_mce(struct mce *m) 229 { 230 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", 231 m->extcpu, 232 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), 233 m->mcgstatus, m->bank, m->status); 234 235 if (m->ip) { 236 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 237 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", 238 m->cs, m->ip); 239 240 if (m->cs == __KERNEL_CS) 241 pr_cont("{%pS}", (void *)(unsigned long)m->ip); 242 pr_cont("\n"); 243 } 244 245 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 246 if (m->addr) 247 pr_cont("ADDR %llx ", m->addr); 248 if (m->misc) 249 pr_cont("MISC %llx ", m->misc); 250 251 if (mce_flags.smca) { 252 if (m->synd) 253 pr_cont("SYND %llx ", m->synd); 254 if (m->ipid) 255 pr_cont("IPID %llx ", m->ipid); 256 } 257 258 pr_cont("\n"); 259 /* 260 * Note this output is parsed by external tools and old fields 261 * should not be changed. 262 */ 263 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 264 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 265 m->microcode); 266 } 267 268 static void print_mce(struct mce *m) 269 { 270 __print_mce(m); 271 272 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) 273 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 274 } 275 276 #define PANIC_TIMEOUT 5 /* 5 seconds */ 277 278 static atomic_t mce_panicked; 279 280 static int fake_panic; 281 static atomic_t mce_fake_panicked; 282 283 /* Panic in progress. Enable interrupts and wait for final IPI */ 284 static void wait_for_panic(void) 285 { 286 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 287 288 preempt_disable(); 289 local_irq_enable(); 290 while (timeout-- > 0) 291 udelay(1); 292 if (panic_timeout == 0) 293 panic_timeout = mca_cfg.panic_timeout; 294 panic("Panicing machine check CPU died"); 295 } 296 297 static void mce_panic(const char *msg, struct mce *final, char *exp) 298 { 299 int apei_err = 0; 300 struct llist_node *pending; 301 struct mce_evt_llist *l; 302 303 if (!fake_panic) { 304 /* 305 * Make sure only one CPU runs in machine check panic 306 */ 307 if (atomic_inc_return(&mce_panicked) > 1) 308 wait_for_panic(); 309 barrier(); 310 311 bust_spinlocks(1); 312 console_verbose(); 313 } else { 314 /* Don't log too much for fake panic */ 315 if (atomic_inc_return(&mce_fake_panicked) > 1) 316 return; 317 } 318 pending = mce_gen_pool_prepare_records(); 319 /* First print corrected ones that are still unlogged */ 320 llist_for_each_entry(l, pending, llnode) { 321 struct mce *m = &l->mce; 322 if (!(m->status & MCI_STATUS_UC)) { 323 print_mce(m); 324 if (!apei_err) 325 apei_err = apei_write_mce(m); 326 } 327 } 328 /* Now print uncorrected but with the final one last */ 329 llist_for_each_entry(l, pending, llnode) { 330 struct mce *m = &l->mce; 331 if (!(m->status & MCI_STATUS_UC)) 332 continue; 333 if (!final || mce_cmp(m, final)) { 334 print_mce(m); 335 if (!apei_err) 336 apei_err = apei_write_mce(m); 337 } 338 } 339 if (final) { 340 print_mce(final); 341 if (!apei_err) 342 apei_err = apei_write_mce(final); 343 } 344 if (cpu_missing) 345 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 346 if (exp) 347 pr_emerg(HW_ERR "Machine check: %s\n", exp); 348 if (!fake_panic) { 349 if (panic_timeout == 0) 350 panic_timeout = mca_cfg.panic_timeout; 351 panic(msg); 352 } else 353 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 354 } 355 356 /* Support code for software error injection */ 357 358 static int msr_to_offset(u32 msr) 359 { 360 unsigned bank = __this_cpu_read(injectm.bank); 361 362 if (msr == mca_cfg.rip_msr) 363 return offsetof(struct mce, ip); 364 if (msr == msr_ops.status(bank)) 365 return offsetof(struct mce, status); 366 if (msr == msr_ops.addr(bank)) 367 return offsetof(struct mce, addr); 368 if (msr == msr_ops.misc(bank)) 369 return offsetof(struct mce, misc); 370 if (msr == 
MSR_IA32_MCG_STATUS) 371 return offsetof(struct mce, mcgstatus); 372 return -1; 373 } 374 375 /* MSR access wrappers used for error injection */ 376 static u64 mce_rdmsrl(u32 msr) 377 { 378 u64 v; 379 380 if (__this_cpu_read(injectm.finished)) { 381 int offset = msr_to_offset(msr); 382 383 if (offset < 0) 384 return 0; 385 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); 386 } 387 388 if (rdmsrl_safe(msr, &v)) { 389 WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); 390 /* 391 * Return zero in case the access faulted. This should 392 * not happen normally but can happen if the CPU does 393 * something weird, or if the code is buggy. 394 */ 395 v = 0; 396 } 397 398 return v; 399 } 400 401 static void mce_wrmsrl(u32 msr, u64 v) 402 { 403 if (__this_cpu_read(injectm.finished)) { 404 int offset = msr_to_offset(msr); 405 406 if (offset >= 0) 407 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; 408 return; 409 } 410 wrmsrl(msr, v); 411 } 412 413 /* 414 * Collect all global (w.r.t. this processor) status about this machine 415 * check into our "mce" struct so that we can use it later to assess 416 * the severity of the problem as we read per-bank specific details. 417 */ 418 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 419 { 420 mce_setup(m); 421 422 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 423 if (regs) { 424 /* 425 * Get the address of the instruction at the time of 426 * the machine check error. 427 */ 428 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 429 m->ip = regs->ip; 430 m->cs = regs->cs; 431 432 /* 433 * When in VM86 mode make the cs look like ring 3 434 * always. This is a lie, but it's better than passing 435 * the additional vm86 bit around everywhere. 436 */ 437 if (v8086_mode(regs)) 438 m->cs |= 3; 439 } 440 /* Use accurate RIP reporting if available. */ 441 if (mca_cfg.rip_msr) 442 m->ip = mce_rdmsrl(mca_cfg.rip_msr); 443 } 444 } 445 446 int mce_available(struct cpuinfo_x86 *c) 447 { 448 if (mca_cfg.disabled) 449 return 0; 450 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 451 } 452 453 static void mce_schedule_work(void) 454 { 455 if (!mce_gen_pool_empty()) 456 schedule_work(&mce_work); 457 } 458 459 static void mce_irq_work_cb(struct irq_work *entry) 460 { 461 mce_schedule_work(); 462 } 463 464 /* 465 * Check if the address reported by the CPU is in a format we can parse. 466 * It would be possible to add code for most other cases, but all would 467 * be somewhat complicated (e.g. segment offset would require an instruction 468 * parser). So only support physical addresses up to page granuality for now. 469 */ 470 int mce_usable_address(struct mce *m) 471 { 472 if (!(m->status & MCI_STATUS_ADDRV)) 473 return 0; 474 475 /* Checks after this one are Intel-specific: */ 476 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 477 return 1; 478 479 if (!(m->status & MCI_STATUS_MISCV)) 480 return 0; 481 482 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 483 return 0; 484 485 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 486 return 0; 487 488 return 1; 489 } 490 EXPORT_SYMBOL_GPL(mce_usable_address); 491 492 bool mce_is_memory_error(struct mce *m) 493 { 494 if (m->cpuvendor == X86_VENDOR_AMD || 495 m->cpuvendor == X86_VENDOR_HYGON) { 496 return amd_mce_is_memory_error(m); 497 } else if (m->cpuvendor == X86_VENDOR_INTEL) { 498 /* 499 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes 500 * 501 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for 502 * indicating a memory error. 
Bit 8 is used for indicating a 503 * cache hierarchy error. The combination of bit 2 and bit 3 504 * is used for indicating a `generic' cache hierarchy error 505 * But we can't just blindly check the above bits, because if 506 * bit 11 is set, then it is a bus/interconnect error - and 507 * either way the above bits just gives more detail on what 508 * bus/interconnect error happened. Note that bit 12 can be 509 * ignored, as it's the "filter" bit. 510 */ 511 return (m->status & 0xef80) == BIT(7) || 512 (m->status & 0xef00) == BIT(8) || 513 (m->status & 0xeffc) == 0xc; 514 } 515 516 return false; 517 } 518 EXPORT_SYMBOL_GPL(mce_is_memory_error); 519 520 bool mce_is_correctable(struct mce *m) 521 { 522 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) 523 return false; 524 525 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) 526 return false; 527 528 if (m->status & MCI_STATUS_UC) 529 return false; 530 531 return true; 532 } 533 EXPORT_SYMBOL_GPL(mce_is_correctable); 534 535 static bool cec_add_mce(struct mce *m) 536 { 537 if (!m) 538 return false; 539 540 /* We eat only correctable DRAM errors with usable addresses. */ 541 if (mce_is_memory_error(m) && 542 mce_is_correctable(m) && 543 mce_usable_address(m)) 544 if (!cec_add_elem(m->addr >> PAGE_SHIFT)) 545 return true; 546 547 return false; 548 } 549 550 static int mce_first_notifier(struct notifier_block *nb, unsigned long val, 551 void *data) 552 { 553 struct mce *m = (struct mce *)data; 554 555 if (!m) 556 return NOTIFY_DONE; 557 558 if (cec_add_mce(m)) 559 return NOTIFY_STOP; 560 561 /* Emit the trace record: */ 562 trace_mce_record(m); 563 564 set_bit(0, &mce_need_notify); 565 566 mce_notify_irq(); 567 568 return NOTIFY_DONE; 569 } 570 571 static struct notifier_block first_nb = { 572 .notifier_call = mce_first_notifier, 573 .priority = MCE_PRIO_FIRST, 574 }; 575 576 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, 577 void *data) 578 { 579 struct mce *mce = (struct mce *)data; 580 unsigned long pfn; 581 582 if (!mce) 583 return NOTIFY_DONE; 584 585 if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { 586 pfn = mce->addr >> PAGE_SHIFT; 587 if (!memory_failure(pfn, 0)) 588 set_mce_nospec(pfn); 589 } 590 591 return NOTIFY_OK; 592 } 593 static struct notifier_block mce_srao_nb = { 594 .notifier_call = srao_decode_notifier, 595 .priority = MCE_PRIO_SRAO, 596 }; 597 598 static int mce_default_notifier(struct notifier_block *nb, unsigned long val, 599 void *data) 600 { 601 struct mce *m = (struct mce *)data; 602 603 if (!m) 604 return NOTIFY_DONE; 605 606 if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) 607 return NOTIFY_DONE; 608 609 __print_mce(m); 610 611 return NOTIFY_DONE; 612 } 613 614 static struct notifier_block mce_default_nb = { 615 .notifier_call = mce_default_notifier, 616 /* lowest prio, we want it to run last. */ 617 .priority = MCE_PRIO_LOWEST, 618 }; 619 620 /* 621 * Read ADDR and MISC registers. 622 */ 623 static void mce_read_aux(struct mce *m, int i) 624 { 625 if (m->status & MCI_STATUS_MISCV) 626 m->misc = mce_rdmsrl(msr_ops.misc(i)); 627 628 if (m->status & MCI_STATUS_ADDRV) { 629 m->addr = mce_rdmsrl(msr_ops.addr(i)); 630 631 /* 632 * Mask the reported address by the reported granularity. 
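		 *
		 * Worked example (illustrative): if MCI_MISC_ADDR_LSB(m->misc)
		 * is 12, the two shifts below clear the low 12 bits of m->addr,
		 * i.e. the address is only meaningful at 4K page granularity.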
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}

		/*
		 * Extract [55:<lsb>] where lsb is the least significant
		 * *valid* bit of the address bits.
		 */
		if (mce_flags.smca) {
			u8 lsb = (m->addr >> 56) & 0x3f;

			m->addr &= GENMASK_ULL(55, lsb);
		}
	}

	if (mce_flags.smca) {
		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

		if (m->status & MCI_STATUS_SYNDV)
			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However, this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	bool error_seen = false;
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	if (flags & MCP_TIMESTAMP)
		m.tsc = rdtsc();

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		barrier();
		m.status = mce_rdmsrl(msr_ops.status(i));

		/* If this entry is not valid, ignore it */
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * If we are logging everything (at CPU online) or this
		 * is a corrected error, then we must log it.
		 */
		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
			goto log_it;

		/*
		 * Newer Intel systems that support software error
		 * recovery need to make additional checks. Other
		 * CPUs should skip over uncorrected errors, but log
		 * everything else.
		 */
		if (!mca_cfg.ser) {
			if (m.status & MCI_STATUS_UC)
				continue;
			goto log_it;
		}

		/* Log "not enabled" (speculative) errors */
		if (!(m.status & MCI_STATUS_EN))
			goto log_it;

		/*
		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
		 * UC == 1 && PCC == 0 && S == 0
		 */
		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
			goto log_it;

		/*
		 * Skip anything else. Presumption is that our read of this
		 * bank is racing with a machine check. Leave the log alone
		 * for do_machine_check() to deal with it.
		 */
		continue;

log_it:
		error_seen = true;

		mce_read_aux(&m, i);

		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);
		else if (mce_usable_address(&m)) {
			/*
			 * Although we skipped logging this, we still want
			 * to take action. Add to the pool so the registered
			 * notifiers will see it.
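			 *
			 * mce_gen_pool_add() returns 0 on success, so the
			 * work is only scheduled when the record was actually
			 * queued for the notifiers.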
			 */
			if (!mce_gen_pool_add(&m))
				mce_schedule_work();
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(msr_ops.status(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	char *tmp;
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		__set_bit(i, validp);
		if (quirk_no_way_out)
			quirk_no_way_out(i, m, regs);

		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
			m->bank = i;
			mce_read_aux(m, i);
			*msg = tmp;
			return 1;
		}
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic(msg, NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU which entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are examined.
 *
 * Also, this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's OK to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
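	 *
	 * Illustrative example (based on the checks below, assuming
	 * mca_cfg.tolerant < 3): if one CPU recorded only a corrected
	 * error while another recorded an uncorrected error graded at
	 * MCE_PANIC_SEVERITY or above, the worst record wins and
	 * mce_panic() is called with it as the "final" mce.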
885 */ 886 for_each_possible_cpu(cpu) { 887 int severity = mce_severity(&per_cpu(mces_seen, cpu), 888 mca_cfg.tolerant, 889 &nmsg, true); 890 if (severity > global_worst) { 891 msg = nmsg; 892 global_worst = severity; 893 m = &per_cpu(mces_seen, cpu); 894 } 895 } 896 897 /* 898 * Cannot recover? Panic here then. 899 * This dumps all the mces in the log buffer and stops the 900 * other CPUs. 901 */ 902 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) 903 mce_panic("Fatal machine check", m, msg); 904 905 /* 906 * For UC somewhere we let the CPU who detects it handle it. 907 * Also must let continue the others, otherwise the handling 908 * CPU could deadlock on a lock. 909 */ 910 911 /* 912 * No machine check event found. Must be some external 913 * source or one CPU is hung. Panic. 914 */ 915 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) 916 mce_panic("Fatal machine check from unknown source", NULL, NULL); 917 918 /* 919 * Now clear all the mces_seen so that they don't reappear on 920 * the next mce. 921 */ 922 for_each_possible_cpu(cpu) 923 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 924 } 925 926 static atomic_t global_nwo; 927 928 /* 929 * Start of Monarch synchronization. This waits until all CPUs have 930 * entered the exception handler and then determines if any of them 931 * saw a fatal event that requires panic. Then it executes them 932 * in the entry order. 933 * TBD double check parallel CPU hotunplug 934 */ 935 static int mce_start(int *no_way_out) 936 { 937 int order; 938 int cpus = num_online_cpus(); 939 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 940 941 if (!timeout) 942 return -1; 943 944 atomic_add(*no_way_out, &global_nwo); 945 /* 946 * Rely on the implied barrier below, such that global_nwo 947 * is updated before mce_callin. 948 */ 949 order = atomic_inc_return(&mce_callin); 950 951 /* 952 * Wait for everyone. 953 */ 954 while (atomic_read(&mce_callin) != cpus) { 955 if (mce_timed_out(&timeout, 956 "Timeout: Not all CPUs entered broadcast exception handler")) { 957 atomic_set(&global_nwo, 0); 958 return -1; 959 } 960 ndelay(SPINUNIT); 961 } 962 963 /* 964 * mce_callin should be read before global_nwo 965 */ 966 smp_rmb(); 967 968 if (order == 1) { 969 /* 970 * Monarch: Starts executing now, the others wait. 971 */ 972 atomic_set(&mce_executing, 1); 973 } else { 974 /* 975 * Subject: Now start the scanning loop one by one in 976 * the original callin order. 977 * This way when there are any shared banks it will be 978 * only seen by one CPU before cleared, avoiding duplicates. 979 */ 980 while (atomic_read(&mce_executing) < order) { 981 if (mce_timed_out(&timeout, 982 "Timeout: Subject CPUs unable to finish machine check processing")) { 983 atomic_set(&global_nwo, 0); 984 return -1; 985 } 986 ndelay(SPINUNIT); 987 } 988 } 989 990 /* 991 * Cache the global no_way_out state. 992 */ 993 *no_way_out = atomic_read(&global_nwo); 994 995 return order; 996 } 997 998 /* 999 * Synchronize between CPUs after main scanning loop. 1000 * This invokes the bulk of the Monarch processing. 1001 */ 1002 static int mce_end(int order) 1003 { 1004 int ret = -1; 1005 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 1006 1007 if (!timeout) 1008 goto reset; 1009 if (order < 0) 1010 goto reset; 1011 1012 /* 1013 * Allow others to run. 1014 */ 1015 atomic_inc(&mce_executing); 1016 1017 if (order == 1) { 1018 /* CHECKME: Can this race with a parallel hotplug? 
*/ 1019 int cpus = num_online_cpus(); 1020 1021 /* 1022 * Monarch: Wait for everyone to go through their scanning 1023 * loops. 1024 */ 1025 while (atomic_read(&mce_executing) <= cpus) { 1026 if (mce_timed_out(&timeout, 1027 "Timeout: Monarch CPU unable to finish machine check processing")) 1028 goto reset; 1029 ndelay(SPINUNIT); 1030 } 1031 1032 mce_reign(); 1033 barrier(); 1034 ret = 0; 1035 } else { 1036 /* 1037 * Subject: Wait for Monarch to finish. 1038 */ 1039 while (atomic_read(&mce_executing) != 0) { 1040 if (mce_timed_out(&timeout, 1041 "Timeout: Monarch CPU did not finish machine check processing")) 1042 goto reset; 1043 ndelay(SPINUNIT); 1044 } 1045 1046 /* 1047 * Don't reset anything. That's done by the Monarch. 1048 */ 1049 return 0; 1050 } 1051 1052 /* 1053 * Reset all global state. 1054 */ 1055 reset: 1056 atomic_set(&global_nwo, 0); 1057 atomic_set(&mce_callin, 0); 1058 barrier(); 1059 1060 /* 1061 * Let others run again. 1062 */ 1063 atomic_set(&mce_executing, 0); 1064 return ret; 1065 } 1066 1067 static void mce_clear_state(unsigned long *toclear) 1068 { 1069 int i; 1070 1071 for (i = 0; i < mca_cfg.banks; i++) { 1072 if (test_bit(i, toclear)) 1073 mce_wrmsrl(msr_ops.status(i), 0); 1074 } 1075 } 1076 1077 static int do_memory_failure(struct mce *m) 1078 { 1079 int flags = MF_ACTION_REQUIRED; 1080 int ret; 1081 1082 pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); 1083 if (!(m->mcgstatus & MCG_STATUS_RIPV)) 1084 flags |= MF_MUST_KILL; 1085 ret = memory_failure(m->addr >> PAGE_SHIFT, flags); 1086 if (ret) 1087 pr_err("Memory error not recovered"); 1088 else 1089 set_mce_nospec(m->addr >> PAGE_SHIFT); 1090 return ret; 1091 } 1092 1093 1094 /* 1095 * Cases where we avoid rendezvous handler timeout: 1096 * 1) If this CPU is offline. 1097 * 1098 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to 1099 * skip those CPUs which remain looping in the 1st kernel - see 1100 * crash_nmi_callback(). 1101 * 1102 * Note: there still is a small window between kexec-ing and the new, 1103 * kdump kernel establishing a new #MC handler where a broadcasted MCE 1104 * might not get handled properly. 1105 */ 1106 static bool __mc_check_crashing_cpu(int cpu) 1107 { 1108 if (cpu_is_offline(cpu) || 1109 (crashing_cpu != -1 && crashing_cpu != cpu)) { 1110 u64 mcgstatus; 1111 1112 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 1113 if (mcgstatus & MCG_STATUS_RIPV) { 1114 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1115 return true; 1116 } 1117 } 1118 return false; 1119 } 1120 1121 static void __mc_scan_banks(struct mce *m, struct mce *final, 1122 unsigned long *toclear, unsigned long *valid_banks, 1123 int no_way_out, int *worst) 1124 { 1125 struct mca_config *cfg = &mca_cfg; 1126 int severity, i; 1127 1128 for (i = 0; i < cfg->banks; i++) { 1129 __clear_bit(i, toclear); 1130 if (!test_bit(i, valid_banks)) 1131 continue; 1132 1133 if (!mce_banks[i].ctl) 1134 continue; 1135 1136 m->misc = 0; 1137 m->addr = 0; 1138 m->bank = i; 1139 1140 m->status = mce_rdmsrl(msr_ops.status(i)); 1141 if (!(m->status & MCI_STATUS_VAL)) 1142 continue; 1143 1144 /* 1145 * Corrected or non-signaled errors are handled by 1146 * machine_check_poll(). Leave them alone, unless this panics. 1147 */ 1148 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 1149 !no_way_out) 1150 continue; 1151 1152 /* Set taint even when machine check was not enabled. 
*/ 1153 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 1154 1155 severity = mce_severity(m, cfg->tolerant, NULL, true); 1156 1157 /* 1158 * When machine check was for corrected/deferred handler don't 1159 * touch, unless we're panicking. 1160 */ 1161 if ((severity == MCE_KEEP_SEVERITY || 1162 severity == MCE_UCNA_SEVERITY) && !no_way_out) 1163 continue; 1164 1165 __set_bit(i, toclear); 1166 1167 /* Machine check event was not enabled. Clear, but ignore. */ 1168 if (severity == MCE_NO_SEVERITY) 1169 continue; 1170 1171 mce_read_aux(m, i); 1172 1173 /* assuming valid severity level != 0 */ 1174 m->severity = severity; 1175 1176 mce_log(m); 1177 1178 if (severity > *worst) { 1179 *final = *m; 1180 *worst = severity; 1181 } 1182 } 1183 1184 /* mce_clear_state will clear *final, save locally for use later */ 1185 *m = *final; 1186 } 1187 1188 /* 1189 * The actual machine check handler. This only handles real 1190 * exceptions when something got corrupted coming in through int 18. 1191 * 1192 * This is executed in NMI context not subject to normal locking rules. This 1193 * implies that most kernel services cannot be safely used. Don't even 1194 * think about putting a printk in there! 1195 * 1196 * On Intel systems this is entered on all CPUs in parallel through 1197 * MCE broadcast. However some CPUs might be broken beyond repair, 1198 * so be always careful when synchronizing with others. 1199 */ 1200 void do_machine_check(struct pt_regs *regs, long error_code) 1201 { 1202 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); 1203 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1204 struct mca_config *cfg = &mca_cfg; 1205 int cpu = smp_processor_id(); 1206 char *msg = "Unknown"; 1207 struct mce m, *final; 1208 int worst = 0; 1209 1210 /* 1211 * Establish sequential order between the CPUs entering the machine 1212 * check handler. 1213 */ 1214 int order = -1; 1215 1216 /* 1217 * If no_way_out gets set, there is no safe way to recover from this 1218 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. 1219 */ 1220 int no_way_out = 0; 1221 1222 /* 1223 * If kill_it gets set, there might be a way to recover from this 1224 * error. 1225 */ 1226 int kill_it = 0; 1227 1228 /* 1229 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES 1230 * on Intel. 1231 */ 1232 int lmce = 1; 1233 1234 if (__mc_check_crashing_cpu(cpu)) 1235 return; 1236 1237 ist_enter(regs); 1238 1239 this_cpu_inc(mce_exception_count); 1240 1241 mce_gather_info(&m, regs); 1242 m.tsc = rdtsc(); 1243 1244 final = this_cpu_ptr(&mces_seen); 1245 *final = m; 1246 1247 memset(valid_banks, 0, sizeof(valid_banks)); 1248 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); 1249 1250 barrier(); 1251 1252 /* 1253 * When no restart IP might need to kill or panic. 1254 * Assume the worst for now, but if we find the 1255 * severity is MCE_AR_SEVERITY we have other options. 1256 */ 1257 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 1258 kill_it = 1; 1259 1260 /* 1261 * Check if this MCE is signaled to only this logical processor, 1262 * on Intel only. 1263 */ 1264 if (m.cpuvendor == X86_VENDOR_INTEL) 1265 lmce = m.mcgstatus & MCG_STATUS_LMCES; 1266 1267 /* 1268 * Local machine check may already know that we have to panic. 1269 * Broadcast machine check begins rendezvous in mce_start() 1270 * Go through all banks in exclusion of the other CPUs. This way we 1271 * don't report duplicated events on shared banks because the first one 1272 * to see it will clear it. 
1273 */ 1274 if (lmce) { 1275 if (no_way_out) 1276 mce_panic("Fatal local machine check", &m, msg); 1277 } else { 1278 order = mce_start(&no_way_out); 1279 } 1280 1281 __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); 1282 1283 if (!no_way_out) 1284 mce_clear_state(toclear); 1285 1286 /* 1287 * Do most of the synchronization with other CPUs. 1288 * When there's any problem use only local no_way_out state. 1289 */ 1290 if (!lmce) { 1291 if (mce_end(order) < 0) 1292 no_way_out = worst >= MCE_PANIC_SEVERITY; 1293 } else { 1294 /* 1295 * If there was a fatal machine check we should have 1296 * already called mce_panic earlier in this function. 1297 * Since we re-read the banks, we might have found 1298 * something new. Check again to see if we found a 1299 * fatal error. We call "mce_severity()" again to 1300 * make sure we have the right "msg". 1301 */ 1302 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { 1303 mce_severity(&m, cfg->tolerant, &msg, true); 1304 mce_panic("Local fatal machine check!", &m, msg); 1305 } 1306 } 1307 1308 /* 1309 * If tolerant is at an insane level we drop requests to kill 1310 * processes and continue even when there is no way out. 1311 */ 1312 if (cfg->tolerant == 3) 1313 kill_it = 0; 1314 else if (no_way_out) 1315 mce_panic("Fatal machine check on current CPU", &m, msg); 1316 1317 if (worst > 0) 1318 irq_work_queue(&mce_irq_work); 1319 1320 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1321 1322 sync_core(); 1323 1324 if (worst != MCE_AR_SEVERITY && !kill_it) 1325 goto out_ist; 1326 1327 /* Fault was in user mode and we need to take some action */ 1328 if ((m.cs & 3) == 3) { 1329 ist_begin_non_atomic(regs); 1330 local_irq_enable(); 1331 1332 if (kill_it || do_memory_failure(&m)) 1333 force_sig(SIGBUS, current); 1334 local_irq_disable(); 1335 ist_end_non_atomic(); 1336 } else { 1337 if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) 1338 mce_panic("Failed kernel mode recovery", &m, NULL); 1339 } 1340 1341 out_ist: 1342 ist_exit(regs); 1343 } 1344 EXPORT_SYMBOL_GPL(do_machine_check); 1345 1346 #ifndef CONFIG_MEMORY_FAILURE 1347 int memory_failure(unsigned long pfn, int flags) 1348 { 1349 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1350 BUG_ON(flags & MF_ACTION_REQUIRED); 1351 pr_err("Uncorrected memory error in page 0x%lx ignored\n" 1352 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", 1353 pfn); 1354 1355 return 0; 1356 } 1357 #endif 1358 1359 /* 1360 * Periodic polling timer for "silent" machine check errors. If the 1361 * poller finds an MCE, poll 2x faster. When the poller finds no more 1362 * errors, poll 2x slower (up to check_interval seconds). 
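 *
 * Worked example (illustrative, assuming check_interval = 300 seconds
 * and HZ = 1000): mce_timer_fn() below halves the interval after a
 * logged event, iv = max(iv / 2, HZ/100), and otherwise doubles it,
 * iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)), so
 * the poll period settles between 10 ms and 5 minutes.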
1363 */ 1364 static unsigned long check_interval = INITIAL_CHECK_INTERVAL; 1365 1366 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1367 static DEFINE_PER_CPU(struct timer_list, mce_timer); 1368 1369 static unsigned long mce_adjust_timer_default(unsigned long interval) 1370 { 1371 return interval; 1372 } 1373 1374 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; 1375 1376 static void __start_timer(struct timer_list *t, unsigned long interval) 1377 { 1378 unsigned long when = jiffies + interval; 1379 unsigned long flags; 1380 1381 local_irq_save(flags); 1382 1383 if (!timer_pending(t) || time_before(when, t->expires)) 1384 mod_timer(t, round_jiffies(when)); 1385 1386 local_irq_restore(flags); 1387 } 1388 1389 static void mce_timer_fn(struct timer_list *t) 1390 { 1391 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); 1392 unsigned long iv; 1393 1394 WARN_ON(cpu_t != t); 1395 1396 iv = __this_cpu_read(mce_next_interval); 1397 1398 if (mce_available(this_cpu_ptr(&cpu_info))) { 1399 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); 1400 1401 if (mce_intel_cmci_poll()) { 1402 iv = mce_adjust_timer(iv); 1403 goto done; 1404 } 1405 } 1406 1407 /* 1408 * Alert userspace if needed. If we logged an MCE, reduce the polling 1409 * interval, otherwise increase the polling interval. 1410 */ 1411 if (mce_notify_irq()) 1412 iv = max(iv / 2, (unsigned long) HZ/100); 1413 else 1414 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1415 1416 done: 1417 __this_cpu_write(mce_next_interval, iv); 1418 __start_timer(t, iv); 1419 } 1420 1421 /* 1422 * Ensure that the timer is firing in @interval from now. 1423 */ 1424 void mce_timer_kick(unsigned long interval) 1425 { 1426 struct timer_list *t = this_cpu_ptr(&mce_timer); 1427 unsigned long iv = __this_cpu_read(mce_next_interval); 1428 1429 __start_timer(t, interval); 1430 1431 if (interval < iv) 1432 __this_cpu_write(mce_next_interval, interval); 1433 } 1434 1435 /* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1436 static void mce_timer_delete_all(void) 1437 { 1438 int cpu; 1439 1440 for_each_online_cpu(cpu) 1441 del_timer_sync(&per_cpu(mce_timer, cpu)); 1442 } 1443 1444 /* 1445 * Notify the user(s) about new machine check events. 1446 * Can be called from interrupt context, but not from machine check/NMI 1447 * context. 1448 */ 1449 int mce_notify_irq(void) 1450 { 1451 /* Not more than two messages every minute */ 1452 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1453 1454 if (test_and_clear_bit(0, &mce_need_notify)) { 1455 mce_work_trigger(); 1456 1457 if (__ratelimit(&ratelimit)) 1458 pr_info(HW_ERR "Machine check events logged\n"); 1459 1460 return 1; 1461 } 1462 return 0; 1463 } 1464 EXPORT_SYMBOL_GPL(mce_notify_irq); 1465 1466 static int __mcheck_cpu_mce_banks_init(void) 1467 { 1468 int i; 1469 1470 mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); 1471 if (!mce_banks) 1472 return -ENOMEM; 1473 1474 for (i = 0; i < MAX_NR_BANKS; i++) { 1475 struct mce_bank *b = &mce_banks[i]; 1476 1477 b->ctl = -1ULL; 1478 b->init = 1; 1479 } 1480 return 0; 1481 } 1482 1483 /* 1484 * Initialize Machine Checks for a CPU. 
1485 */ 1486 static int __mcheck_cpu_cap_init(void) 1487 { 1488 u64 cap; 1489 u8 b; 1490 1491 rdmsrl(MSR_IA32_MCG_CAP, cap); 1492 1493 b = cap & MCG_BANKCNT_MASK; 1494 if (WARN_ON_ONCE(b > MAX_NR_BANKS)) 1495 b = MAX_NR_BANKS; 1496 1497 mca_cfg.banks = max(mca_cfg.banks, b); 1498 1499 if (!mce_banks) { 1500 int err = __mcheck_cpu_mce_banks_init(); 1501 if (err) 1502 return err; 1503 } 1504 1505 /* Use accurate RIP reporting if available. */ 1506 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1507 mca_cfg.rip_msr = MSR_IA32_MCG_EIP; 1508 1509 if (cap & MCG_SER_P) 1510 mca_cfg.ser = 1; 1511 1512 return 0; 1513 } 1514 1515 static void __mcheck_cpu_init_generic(void) 1516 { 1517 enum mcp_flags m_fl = 0; 1518 mce_banks_t all_banks; 1519 u64 cap; 1520 1521 if (!mca_cfg.bootlog) 1522 m_fl = MCP_DONTLOG; 1523 1524 /* 1525 * Log the machine checks left over from the previous reset. 1526 */ 1527 bitmap_fill(all_banks, MAX_NR_BANKS); 1528 machine_check_poll(MCP_UC | m_fl, &all_banks); 1529 1530 cr4_set_bits(X86_CR4_MCE); 1531 1532 rdmsrl(MSR_IA32_MCG_CAP, cap); 1533 if (cap & MCG_CTL_P) 1534 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1535 } 1536 1537 static void __mcheck_cpu_init_clear_banks(void) 1538 { 1539 int i; 1540 1541 for (i = 0; i < mca_cfg.banks; i++) { 1542 struct mce_bank *b = &mce_banks[i]; 1543 1544 if (!b->init) 1545 continue; 1546 wrmsrl(msr_ops.ctl(i), b->ctl); 1547 wrmsrl(msr_ops.status(i), 0); 1548 } 1549 } 1550 1551 /* 1552 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and 1553 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM 1554 * Vol 3B Table 15-20). But this confuses both the code that determines 1555 * whether the machine check occurred in kernel or user mode, and also 1556 * the severity assessment code. Pretend that EIPV was set, and take the 1557 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 1558 */ 1559 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) 1560 { 1561 if (bank != 0) 1562 return; 1563 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) 1564 return; 1565 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| 1566 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| 1567 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| 1568 MCACOD)) != 1569 (MCI_STATUS_UC|MCI_STATUS_EN| 1570 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| 1571 MCI_STATUS_AR|MCACOD_INSTR)) 1572 return; 1573 1574 m->mcgstatus |= MCG_STATUS_EIPV; 1575 m->ip = regs->ip; 1576 m->cs = regs->cs; 1577 } 1578 1579 /* Add per CPU specific workarounds here */ 1580 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1581 { 1582 struct mca_config *cfg = &mca_cfg; 1583 1584 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1585 pr_info("unknown CPU type - not enabling MCE support\n"); 1586 return -EOPNOTSUPP; 1587 } 1588 1589 /* This should be disabled by the BIOS, but isn't always */ 1590 if (c->x86_vendor == X86_VENDOR_AMD) { 1591 if (c->x86 == 15 && cfg->banks > 4) { 1592 /* 1593 * disable GART TBL walk error reporting, which 1594 * trips off incorrectly with the IOMMU & 3ware 1595 * & Cerberus: 1596 */ 1597 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1598 } 1599 if (c->x86 < 0x11 && cfg->bootlog < 0) { 1600 /* 1601 * Lots of broken BIOS around that don't clear them 1602 * by default and leave crap in there. Don't log: 1603 */ 1604 cfg->bootlog = 0; 1605 } 1606 /* 1607 * Various K7s with broken bank 0 around. Always disable 1608 * by default. 
1609 */ 1610 if (c->x86 == 6 && cfg->banks > 0) 1611 mce_banks[0].ctl = 0; 1612 1613 /* 1614 * overflow_recov is supported for F15h Models 00h-0fh 1615 * even though we don't have a CPUID bit for it. 1616 */ 1617 if (c->x86 == 0x15 && c->x86_model <= 0xf) 1618 mce_flags.overflow_recov = 1; 1619 1620 } 1621 1622 if (c->x86_vendor == X86_VENDOR_INTEL) { 1623 /* 1624 * SDM documents that on family 6 bank 0 should not be written 1625 * because it aliases to another special BIOS controlled 1626 * register. 1627 * But it's not aliased anymore on model 0x1a+ 1628 * Don't ignore bank 0 completely because there could be a 1629 * valid event later, merely don't write CTL0. 1630 */ 1631 1632 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) 1633 mce_banks[0].init = 0; 1634 1635 /* 1636 * All newer Intel systems support MCE broadcasting. Enable 1637 * synchronization with a one second timeout. 1638 */ 1639 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1640 cfg->monarch_timeout < 0) 1641 cfg->monarch_timeout = USEC_PER_SEC; 1642 1643 /* 1644 * There are also broken BIOSes on some Pentium M and 1645 * earlier systems: 1646 */ 1647 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) 1648 cfg->bootlog = 0; 1649 1650 if (c->x86 == 6 && c->x86_model == 45) 1651 quirk_no_way_out = quirk_sandybridge_ifu; 1652 } 1653 if (cfg->monarch_timeout < 0) 1654 cfg->monarch_timeout = 0; 1655 if (cfg->bootlog != 0) 1656 cfg->panic_timeout = 30; 1657 1658 return 0; 1659 } 1660 1661 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1662 { 1663 if (c->x86 != 5) 1664 return 0; 1665 1666 switch (c->x86_vendor) { 1667 case X86_VENDOR_INTEL: 1668 intel_p5_mcheck_init(c); 1669 return 1; 1670 break; 1671 case X86_VENDOR_CENTAUR: 1672 winchip_mcheck_init(c); 1673 return 1; 1674 break; 1675 default: 1676 return 0; 1677 } 1678 1679 return 0; 1680 } 1681 1682 /* 1683 * Init basic CPU features needed for early decoding of MCEs. 1684 */ 1685 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) 1686 { 1687 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { 1688 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); 1689 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); 1690 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); 1691 1692 if (mce_flags.smca) { 1693 msr_ops.ctl = smca_ctl_reg; 1694 msr_ops.status = smca_status_reg; 1695 msr_ops.addr = smca_addr_reg; 1696 msr_ops.misc = smca_misc_reg; 1697 } 1698 } 1699 } 1700 1701 static void mce_centaur_feature_init(struct cpuinfo_x86 *c) 1702 { 1703 struct mca_config *cfg = &mca_cfg; 1704 1705 /* 1706 * All newer Centaur CPUs support MCE broadcasting. Enable 1707 * synchronization with a one second timeout. 
1708 */ 1709 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || 1710 c->x86 > 6) { 1711 if (cfg->monarch_timeout < 0) 1712 cfg->monarch_timeout = USEC_PER_SEC; 1713 } 1714 } 1715 1716 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1717 { 1718 switch (c->x86_vendor) { 1719 case X86_VENDOR_INTEL: 1720 mce_intel_feature_init(c); 1721 mce_adjust_timer = cmci_intel_adjust_timer; 1722 break; 1723 1724 case X86_VENDOR_AMD: { 1725 mce_amd_feature_init(c); 1726 break; 1727 } 1728 1729 case X86_VENDOR_HYGON: 1730 mce_hygon_feature_init(c); 1731 break; 1732 1733 case X86_VENDOR_CENTAUR: 1734 mce_centaur_feature_init(c); 1735 break; 1736 1737 default: 1738 break; 1739 } 1740 } 1741 1742 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) 1743 { 1744 switch (c->x86_vendor) { 1745 case X86_VENDOR_INTEL: 1746 mce_intel_feature_clear(c); 1747 break; 1748 default: 1749 break; 1750 } 1751 } 1752 1753 static void mce_start_timer(struct timer_list *t) 1754 { 1755 unsigned long iv = check_interval * HZ; 1756 1757 if (mca_cfg.ignore_ce || !iv) 1758 return; 1759 1760 this_cpu_write(mce_next_interval, iv); 1761 __start_timer(t, iv); 1762 } 1763 1764 static void __mcheck_cpu_setup_timer(void) 1765 { 1766 struct timer_list *t = this_cpu_ptr(&mce_timer); 1767 1768 timer_setup(t, mce_timer_fn, TIMER_PINNED); 1769 } 1770 1771 static void __mcheck_cpu_init_timer(void) 1772 { 1773 struct timer_list *t = this_cpu_ptr(&mce_timer); 1774 1775 timer_setup(t, mce_timer_fn, TIMER_PINNED); 1776 mce_start_timer(t); 1777 } 1778 1779 bool filter_mce(struct mce *m) 1780 { 1781 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1782 return amd_filter_mce(m); 1783 1784 return false; 1785 } 1786 1787 /* Handle unconfigured int18 (should never happen) */ 1788 static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1789 { 1790 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1791 smp_processor_id()); 1792 } 1793 1794 /* Call the installed machine check handler for this CPU setup. */ 1795 void (*machine_check_vector)(struct pt_regs *, long error_code) = 1796 unexpected_machine_check; 1797 1798 dotraplinkage void do_mce(struct pt_regs *regs, long error_code) 1799 { 1800 machine_check_vector(regs, error_code); 1801 } 1802 1803 /* 1804 * Called for each booted CPU to set up machine checks. 
1805 * Must be called with preempt off: 1806 */ 1807 void mcheck_cpu_init(struct cpuinfo_x86 *c) 1808 { 1809 if (mca_cfg.disabled) 1810 return; 1811 1812 if (__mcheck_cpu_ancient_init(c)) 1813 return; 1814 1815 if (!mce_available(c)) 1816 return; 1817 1818 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1819 mca_cfg.disabled = 1; 1820 return; 1821 } 1822 1823 if (mce_gen_pool_init()) { 1824 mca_cfg.disabled = 1; 1825 pr_emerg("Couldn't allocate MCE records pool!\n"); 1826 return; 1827 } 1828 1829 machine_check_vector = do_machine_check; 1830 1831 __mcheck_cpu_init_early(c); 1832 __mcheck_cpu_init_generic(); 1833 __mcheck_cpu_init_vendor(c); 1834 __mcheck_cpu_init_clear_banks(); 1835 __mcheck_cpu_setup_timer(); 1836 } 1837 1838 /* 1839 * Called for each booted CPU to clear some machine checks opt-ins 1840 */ 1841 void mcheck_cpu_clear(struct cpuinfo_x86 *c) 1842 { 1843 if (mca_cfg.disabled) 1844 return; 1845 1846 if (!mce_available(c)) 1847 return; 1848 1849 /* 1850 * Possibly to clear general settings generic to x86 1851 * __mcheck_cpu_clear_generic(c); 1852 */ 1853 __mcheck_cpu_clear_vendor(c); 1854 1855 } 1856 1857 static void __mce_disable_bank(void *arg) 1858 { 1859 int bank = *((int *)arg); 1860 __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); 1861 cmci_disable_bank(bank); 1862 } 1863 1864 void mce_disable_bank(int bank) 1865 { 1866 if (bank >= mca_cfg.banks) { 1867 pr_warn(FW_BUG 1868 "Ignoring request to disable invalid MCA bank %d.\n", 1869 bank); 1870 return; 1871 } 1872 set_bit(bank, mce_banks_ce_disabled); 1873 on_each_cpu(__mce_disable_bank, &bank, 1); 1874 } 1875 1876 /* 1877 * mce=off Disables machine check 1878 * mce=no_cmci Disables CMCI 1879 * mce=no_lmce Disables LMCE 1880 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1881 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1882 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1883 * monarchtimeout is how long to wait for other CPUs on machine 1884 * check, or 0 to not wait 1885 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h 1886 and older. 1887 * mce=nobootlog Don't log MCEs from before booting. 1888 * mce=bios_cmci_threshold Don't program the CMCI threshold 1889 * mce=recovery force enable memcpy_mcsafe() 1890 */ 1891 static int __init mcheck_enable(char *str) 1892 { 1893 struct mca_config *cfg = &mca_cfg; 1894 1895 if (*str == 0) { 1896 enable_p5_mce(); 1897 return 1; 1898 } 1899 if (*str == '=') 1900 str++; 1901 if (!strcmp(str, "off")) 1902 cfg->disabled = 1; 1903 else if (!strcmp(str, "no_cmci")) 1904 cfg->cmci_disabled = true; 1905 else if (!strcmp(str, "no_lmce")) 1906 cfg->lmce_disabled = 1; 1907 else if (!strcmp(str, "dont_log_ce")) 1908 cfg->dont_log_ce = true; 1909 else if (!strcmp(str, "ignore_ce")) 1910 cfg->ignore_ce = true; 1911 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1912 cfg->bootlog = (str[0] == 'b'); 1913 else if (!strcmp(str, "bios_cmci_threshold")) 1914 cfg->bios_cmci_threshold = 1; 1915 else if (!strcmp(str, "recovery")) 1916 cfg->recovery = 1; 1917 else if (isdigit(str[0])) { 1918 if (get_option(&str, &cfg->tolerant) == 2) 1919 get_option(&str, &(cfg->monarch_timeout)); 1920 } else { 1921 pr_info("mce argument %s ignored. 
Please use /sys\n", str); 1922 return 0; 1923 } 1924 return 1; 1925 } 1926 __setup("mce", mcheck_enable); 1927 1928 int __init mcheck_init(void) 1929 { 1930 mcheck_intel_therm_init(); 1931 mce_register_decode_chain(&first_nb); 1932 mce_register_decode_chain(&mce_srao_nb); 1933 mce_register_decode_chain(&mce_default_nb); 1934 mcheck_vendor_init_severity(); 1935 1936 INIT_WORK(&mce_work, mce_gen_pool_process); 1937 init_irq_work(&mce_irq_work, mce_irq_work_cb); 1938 1939 return 0; 1940 } 1941 1942 /* 1943 * mce_syscore: PM support 1944 */ 1945 1946 /* 1947 * Disable machine checks on suspend and shutdown. We can't really handle 1948 * them later. 1949 */ 1950 static void mce_disable_error_reporting(void) 1951 { 1952 int i; 1953 1954 for (i = 0; i < mca_cfg.banks; i++) { 1955 struct mce_bank *b = &mce_banks[i]; 1956 1957 if (b->init) 1958 wrmsrl(msr_ops.ctl(i), 0); 1959 } 1960 return; 1961 } 1962 1963 static void vendor_disable_error_reporting(void) 1964 { 1965 /* 1966 * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs 1967 * are socket-wide. 1968 * Disabling them for just a single offlined CPU is bad, since it will 1969 * inhibit reporting for all shared resources on the socket like the 1970 * last level cache (LLC), the integrated memory controller (iMC), etc. 1971 */ 1972 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || 1973 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || 1974 boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1975 return; 1976 1977 mce_disable_error_reporting(); 1978 } 1979 1980 static int mce_syscore_suspend(void) 1981 { 1982 vendor_disable_error_reporting(); 1983 return 0; 1984 } 1985 1986 static void mce_syscore_shutdown(void) 1987 { 1988 vendor_disable_error_reporting(); 1989 } 1990 1991 /* 1992 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
1993 * Only one CPU is active at this time, the others get re-added later using 1994 * CPU hotplug: 1995 */ 1996 static void mce_syscore_resume(void) 1997 { 1998 __mcheck_cpu_init_generic(); 1999 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); 2000 __mcheck_cpu_init_clear_banks(); 2001 } 2002 2003 static struct syscore_ops mce_syscore_ops = { 2004 .suspend = mce_syscore_suspend, 2005 .shutdown = mce_syscore_shutdown, 2006 .resume = mce_syscore_resume, 2007 }; 2008 2009 /* 2010 * mce_device: Sysfs support 2011 */ 2012 2013 static void mce_cpu_restart(void *data) 2014 { 2015 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2016 return; 2017 __mcheck_cpu_init_generic(); 2018 __mcheck_cpu_init_clear_banks(); 2019 __mcheck_cpu_init_timer(); 2020 } 2021 2022 /* Reinit MCEs after user configuration changes */ 2023 static void mce_restart(void) 2024 { 2025 mce_timer_delete_all(); 2026 on_each_cpu(mce_cpu_restart, NULL, 1); 2027 } 2028 2029 /* Toggle features for corrected errors */ 2030 static void mce_disable_cmci(void *data) 2031 { 2032 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2033 return; 2034 cmci_clear(); 2035 } 2036 2037 static void mce_enable_ce(void *all) 2038 { 2039 if (!mce_available(raw_cpu_ptr(&cpu_info))) 2040 return; 2041 cmci_reenable(); 2042 cmci_recheck(); 2043 if (all) 2044 __mcheck_cpu_init_timer(); 2045 } 2046 2047 static struct bus_type mce_subsys = { 2048 .name = "machinecheck", 2049 .dev_name = "machinecheck", 2050 }; 2051 2052 DEFINE_PER_CPU(struct device *, mce_device); 2053 2054 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 2055 { 2056 return container_of(attr, struct mce_bank, attr); 2057 } 2058 2059 static ssize_t show_bank(struct device *s, struct device_attribute *attr, 2060 char *buf) 2061 { 2062 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 2063 } 2064 2065 static ssize_t set_bank(struct device *s, struct device_attribute *attr, 2066 const char *buf, size_t size) 2067 { 2068 u64 new; 2069 2070 if (kstrtou64(buf, 0, &new) < 0) 2071 return -EINVAL; 2072 2073 attr_to_bank(attr)->ctl = new; 2074 mce_restart(); 2075 2076 return size; 2077 } 2078 2079 static ssize_t set_ignore_ce(struct device *s, 2080 struct device_attribute *attr, 2081 const char *buf, size_t size) 2082 { 2083 u64 new; 2084 2085 if (kstrtou64(buf, 0, &new) < 0) 2086 return -EINVAL; 2087 2088 mutex_lock(&mce_sysfs_mutex); 2089 if (mca_cfg.ignore_ce ^ !!new) { 2090 if (new) { 2091 /* disable ce features */ 2092 mce_timer_delete_all(); 2093 on_each_cpu(mce_disable_cmci, NULL, 1); 2094 mca_cfg.ignore_ce = true; 2095 } else { 2096 /* enable ce features */ 2097 mca_cfg.ignore_ce = false; 2098 on_each_cpu(mce_enable_ce, (void *)1, 1); 2099 } 2100 } 2101 mutex_unlock(&mce_sysfs_mutex); 2102 2103 return size; 2104 } 2105 2106 static ssize_t set_cmci_disabled(struct device *s, 2107 struct device_attribute *attr, 2108 const char *buf, size_t size) 2109 { 2110 u64 new; 2111 2112 if (kstrtou64(buf, 0, &new) < 0) 2113 return -EINVAL; 2114 2115 mutex_lock(&mce_sysfs_mutex); 2116 if (mca_cfg.cmci_disabled ^ !!new) { 2117 if (new) { 2118 /* disable cmci */ 2119 on_each_cpu(mce_disable_cmci, NULL, 1); 2120 mca_cfg.cmci_disabled = true; 2121 } else { 2122 /* enable cmci */ 2123 mca_cfg.cmci_disabled = false; 2124 on_each_cpu(mce_enable_ce, NULL, 1); 2125 } 2126 } 2127 mutex_unlock(&mce_sysfs_mutex); 2128 2129 return size; 2130 } 2131 2132 static ssize_t store_int_with_restart(struct device *s, 2133 struct device_attribute *attr, 2134 const char *buf, size_t size) 2135 { 2136 unsigned 
static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	unsigned long old_check_interval = check_interval;
	ssize_t ret = device_store_ulong(s, attr, buf, size);

	if (check_interval == old_check_interval)
		return ret;

	mutex_lock(&mce_sysfs_mutex);
	mce_restart();
	mutex_unlock(&mce_sysfs_mutex);

	return ret;
}

static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
	&dev_attr_trigger,
#endif
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per-CPU device init. All of the CPUs still share the same bank ctl values: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = per_cpu(mce_device, cpu);
	if (dev)
		return 0;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}
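/*
 * Rough CPU hotplug picture (wired up in mcheck_init_device() below):
 * mce_cpu_online() creates the per-CPU sysfs device and the threshold
 * device, re-enables the banks/CMCI and restarts the polling timer;
 * mce_cpu_pre_down() tears all of that down again before the CPU goes
 * offline; mce_cpu_dead() then lets the Intel code hand the dead CPU's
 * CMCI banks over to a surviving CPU.
 */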
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!cpuhp_tasks_frozen)
		cmci_clear();

	vendor_disable_error_reporting();
}

static void mce_reenable_cpu(void)
{
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!cpuhp_tasks_frozen)
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(msr_ops.ctl(i), b->ctl);
	}
}

static int mce_cpu_dead(unsigned int cpu)
{
	mce_intel_hcpu_update(cpu);

	/* intentionally ignoring frozen here */
	if (!cpuhp_tasks_frozen)
		cmci_rediscover();
	return 0;
}

static int mce_cpu_online(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	int ret;

	mce_device_create(cpu);

	ret = mce_threshold_create_device(cpu);
	if (ret) {
		mce_device_remove(cpu);
		return ret;
	}
	mce_reenable_cpu();
	mce_start_timer(t);
	return 0;
}

static int mce_cpu_pre_down(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	mce_disable_cpu();
	del_timer_sync(t);
	mce_threshold_remove_device(cpu);
	mce_device_remove(cpu);
	return 0;
}

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name = b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;

	/*
	 * Check if we have a spare virtual bit. This will only become
	 * a problem if/when we move beyond 5-level page tables.
	 */
	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
				mce_cpu_dead);
	if (err)
		goto err_out_mem;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
				mce_cpu_online, mce_cpu_pre_down);
	if (err < 0)
		goto err_out_online;

	register_syscore_ops(&mce_syscore_ops);

	return 0;

err_out_online:
	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init MCE device (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
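/*
 * Example: booting with "nomce" on the kernel command line sets
 * mca_cfg.disabled so machine check support is never initialized. The
 * newer "mce=off" switch (handled by mcheck_enable() earlier in this
 * file) has the same effect.
 */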
#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_panicked, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
			 "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
						 NULL, &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif

DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);

static int __init mcheck_late_init(void)
{
	pr_info("Using %d MCE banks\n", mca_cfg.banks);

	if (mca_cfg.recovery)
		static_branch_inc(&mcsafe_key);

	mcheck_debugfs_init();
	cec_init();

	/*
	 * Flush out everything that has been logged during early boot, now that
	 * everything has been initialized (workqueues, decoders, ...).
	 */
	mce_schedule_work();

	return 0;
}
late_initcall(mcheck_late_init);
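/*
 * The mcsafe_key static branch enabled above for "mce=recovery" gates
 * the machine-check-aware copy path in memcpy_mcsafe(); with the key
 * off, the plain memcpy fallback is used and an uncorrected memory
 * error consumed during the copy is typically fatal. This is only a
 * summary of behaviour implemented elsewhere (arch/x86 string/copy
 * helpers), recorded here because this file owns the key.
 */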