xref: /openbmc/linux/arch/x86/kernel/cpu/mce/core.c (revision 2c64e9cb)
1 /*
2  * Machine check handler.
3  *
4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5  * Rest from unknown author(s).
6  * 2004 Andi Kleen. Rewrote most of it.
7  * Copyright 2008 Intel Corporation
8  * Author: Andi Kleen
9  */
10 
11 #include <linux/thread_info.h>
12 #include <linux/capability.h>
13 #include <linux/miscdevice.h>
14 #include <linux/ratelimit.h>
15 #include <linux/rcupdate.h>
16 #include <linux/kobject.h>
17 #include <linux/uaccess.h>
18 #include <linux/kdebug.h>
19 #include <linux/kernel.h>
20 #include <linux/percpu.h>
21 #include <linux/string.h>
22 #include <linux/device.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/delay.h>
25 #include <linux/ctype.h>
26 #include <linux/sched.h>
27 #include <linux/sysfs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/init.h>
31 #include <linux/kmod.h>
32 #include <linux/poll.h>
33 #include <linux/nmi.h>
34 #include <linux/cpu.h>
35 #include <linux/ras.h>
36 #include <linux/smp.h>
37 #include <linux/fs.h>
38 #include <linux/mm.h>
39 #include <linux/debugfs.h>
40 #include <linux/irq_work.h>
41 #include <linux/export.h>
42 #include <linux/jump_label.h>
43 #include <linux/set_memory.h>
44 
45 #include <asm/intel-family.h>
46 #include <asm/processor.h>
47 #include <asm/traps.h>
48 #include <asm/tlbflush.h>
49 #include <asm/mce.h>
50 #include <asm/msr.h>
51 #include <asm/reboot.h>
52 
53 #include "internal.h"
54 
55 static DEFINE_MUTEX(mce_log_mutex);
56 
57 /* sysfs synchronization */
58 static DEFINE_MUTEX(mce_sysfs_mutex);
59 
60 #define CREATE_TRACE_POINTS
61 #include <trace/events/mce.h>
62 
63 #define SPINUNIT		100	/* 100ns */
64 
65 DEFINE_PER_CPU(unsigned, mce_exception_count);
66 
67 struct mce_bank *mce_banks __read_mostly;
68 struct mce_vendor_flags mce_flags __read_mostly;
69 
70 struct mca_config mca_cfg __read_mostly = {
71 	.bootlog  = -1,
72 	/*
73 	 * Tolerant levels:
74 	 * 0: always panic on uncorrected errors, log corrected errors
75 	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
77 	 * 3: never panic or SIGBUS, log all errors (for testing only)
78 	 */
79 	.tolerant = 1,
80 	.monarch_timeout = -1
81 };
82 
83 static DEFINE_PER_CPU(struct mce, mces_seen);
84 static unsigned long mce_need_notify;
85 static int cpu_missing;
86 
87 /*
88  * MCA banks polled by the period polling timer for corrected events.
89  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
90  */
91 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
92 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
93 };
94 
95 /*
96  * MCA banks controlled through firmware first for corrected errors.
97  * This is a global list of banks for which we won't enable CMCI and we
98  * won't poll. Firmware controls these banks and is responsible for
99  * reporting corrected errors through GHES. Uncorrected/recoverable
100  * errors are still notified through a machine check.
101  */
102 mce_banks_t mce_banks_ce_disabled;
103 
104 static struct work_struct mce_work;
105 static struct irq_work mce_irq_work;
106 
107 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
108 
109 /*
110  * CPU/chipset specific EDAC code can register a notifier call here to print
111  * MCE errors in a human-readable form.
112  */
113 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
114 
115 /* Do initial initialization of a struct mce */
116 void mce_setup(struct mce *m)
117 {
118 	memset(m, 0, sizeof(struct mce));
119 	m->cpu = m->extcpu = smp_processor_id();
120 	/* need the internal __ version to avoid deadlocks */
121 	m->time = __ktime_get_real_seconds();
122 	m->cpuvendor = boot_cpu_data.x86_vendor;
123 	m->cpuid = cpuid_eax(1);
124 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
125 	m->apicid = cpu_data(m->extcpu).initial_apicid;
126 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
127 
128 	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
129 		rdmsrl(MSR_PPIN, m->ppin);
130 
131 	m->microcode = boot_cpu_data.microcode;
132 }
133 
134 DEFINE_PER_CPU(struct mce, injectm);
135 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
136 
137 void mce_log(struct mce *m)
138 {
139 	if (!mce_gen_pool_add(m))
140 		irq_work_queue(&mce_irq_work);
141 }
142 
143 void mce_inject_log(struct mce *m)
144 {
145 	mutex_lock(&mce_log_mutex);
146 	mce_log(m);
147 	mutex_unlock(&mce_log_mutex);
148 }
149 EXPORT_SYMBOL_GPL(mce_inject_log);
150 
151 static struct notifier_block mce_srao_nb;
152 
153 /*
154  * We run the default notifier only if the SRAO, the first and the default
155  * notifiers are the only ones registered, i.e. only the mandatory
156  * NUM_DEFAULT_NOTIFIERS notifiers are on the chain.
157  */
158 #define NUM_DEFAULT_NOTIFIERS	3
159 static atomic_t num_notifiers;
160 
161 void mce_register_decode_chain(struct notifier_block *nb)
162 {
163 	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
164 		return;
165 
166 	atomic_inc(&num_notifiers);
167 
168 	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
169 }
170 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
171 
172 void mce_unregister_decode_chain(struct notifier_block *nb)
173 {
174 	atomic_dec(&num_notifiers);
175 
176 	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
177 }
178 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
179 
180 static inline u32 ctl_reg(int bank)
181 {
182 	return MSR_IA32_MCx_CTL(bank);
183 }
184 
185 static inline u32 status_reg(int bank)
186 {
187 	return MSR_IA32_MCx_STATUS(bank);
188 }
189 
190 static inline u32 addr_reg(int bank)
191 {
192 	return MSR_IA32_MCx_ADDR(bank);
193 }
194 
195 static inline u32 misc_reg(int bank)
196 {
197 	return MSR_IA32_MCx_MISC(bank);
198 }
199 
200 static inline u32 smca_ctl_reg(int bank)
201 {
202 	return MSR_AMD64_SMCA_MCx_CTL(bank);
203 }
204 
205 static inline u32 smca_status_reg(int bank)
206 {
207 	return MSR_AMD64_SMCA_MCx_STATUS(bank);
208 }
209 
210 static inline u32 smca_addr_reg(int bank)
211 {
212 	return MSR_AMD64_SMCA_MCx_ADDR(bank);
213 }
214 
215 static inline u32 smca_misc_reg(int bank)
216 {
217 	return MSR_AMD64_SMCA_MCx_MISC(bank);
218 }
219 
220 struct mca_msr_regs msr_ops = {
221 	.ctl	= ctl_reg,
222 	.status	= status_reg,
223 	.addr	= addr_reg,
224 	.misc	= misc_reg
225 };
226 
227 static void __print_mce(struct mce *m)
228 {
229 	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
230 		 m->extcpu,
231 		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
232 		 m->mcgstatus, m->bank, m->status);
233 
234 	if (m->ip) {
235 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
236 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
237 			m->cs, m->ip);
238 
239 		if (m->cs == __KERNEL_CS)
240 			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
241 		pr_cont("\n");
242 	}
243 
244 	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
245 	if (m->addr)
246 		pr_cont("ADDR %llx ", m->addr);
247 	if (m->misc)
248 		pr_cont("MISC %llx ", m->misc);
249 
250 	if (mce_flags.smca) {
251 		if (m->synd)
252 			pr_cont("SYND %llx ", m->synd);
253 		if (m->ipid)
254 			pr_cont("IPID %llx ", m->ipid);
255 	}
256 
257 	pr_cont("\n");
258 	/*
259 	 * Note this output is parsed by external tools and old fields
260 	 * should not be changed.
261 	 */
262 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
263 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
264 		m->microcode);
265 }
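/*
 * For reference only (all values below are made up), the above produces
 * lines of the following shape, each with the "[Hardware Error]:" prefix
 * supplied by HW_ERR:
 *
 *	CPU 2: Machine Check Exception: 5 Bank 4: b200000000070f0f
 *	TSC 1a2b3c4d5e ADDR 3fe7a9000 MISC 8c
 *	PROCESSOR 0:306c3 TIME 1520354051 SOCKET 0 APIC 2 microcode 22
 */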
266 
267 static void print_mce(struct mce *m)
268 {
269 	__print_mce(m);
270 
271 	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
272 		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
273 }
274 
275 #define PANIC_TIMEOUT 5 /* 5 seconds */
276 
277 static atomic_t mce_panicked;
278 
279 static int fake_panic;
280 static atomic_t mce_fake_panicked;
281 
282 /* Panic in progress. Enable interrupts and wait for final IPI */
283 static void wait_for_panic(void)
284 {
285 	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
286 
287 	preempt_disable();
288 	local_irq_enable();
289 	while (timeout-- > 0)
290 		udelay(1);
291 	if (panic_timeout == 0)
292 		panic_timeout = mca_cfg.panic_timeout;
293 	panic("Panicking machine check CPU died");
294 }
295 
296 static void mce_panic(const char *msg, struct mce *final, char *exp)
297 {
298 	int apei_err = 0;
299 	struct llist_node *pending;
300 	struct mce_evt_llist *l;
301 
302 	if (!fake_panic) {
303 		/*
304 		 * Make sure only one CPU runs in machine check panic
305 		 */
306 		if (atomic_inc_return(&mce_panicked) > 1)
307 			wait_for_panic();
308 		barrier();
309 
310 		bust_spinlocks(1);
311 		console_verbose();
312 	} else {
313 		/* Don't log too much for fake panic */
314 		if (atomic_inc_return(&mce_fake_panicked) > 1)
315 			return;
316 	}
317 	pending = mce_gen_pool_prepare_records();
318 	/* First print corrected ones that are still unlogged */
319 	llist_for_each_entry(l, pending, llnode) {
320 		struct mce *m = &l->mce;
321 		if (!(m->status & MCI_STATUS_UC)) {
322 			print_mce(m);
323 			if (!apei_err)
324 				apei_err = apei_write_mce(m);
325 		}
326 	}
327 	/* Now print uncorrected but with the final one last */
328 	llist_for_each_entry(l, pending, llnode) {
329 		struct mce *m = &l->mce;
330 		if (!(m->status & MCI_STATUS_UC))
331 			continue;
332 		if (!final || mce_cmp(m, final)) {
333 			print_mce(m);
334 			if (!apei_err)
335 				apei_err = apei_write_mce(m);
336 		}
337 	}
338 	if (final) {
339 		print_mce(final);
340 		if (!apei_err)
341 			apei_err = apei_write_mce(final);
342 	}
343 	if (cpu_missing)
344 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
345 	if (exp)
346 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
347 	if (!fake_panic) {
348 		if (panic_timeout == 0)
349 			panic_timeout = mca_cfg.panic_timeout;
350 		panic(msg);
351 	} else
352 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
353 }
354 
355 /* Support code for software error injection */
356 
357 static int msr_to_offset(u32 msr)
358 {
359 	unsigned bank = __this_cpu_read(injectm.bank);
360 
361 	if (msr == mca_cfg.rip_msr)
362 		return offsetof(struct mce, ip);
363 	if (msr == msr_ops.status(bank))
364 		return offsetof(struct mce, status);
365 	if (msr == msr_ops.addr(bank))
366 		return offsetof(struct mce, addr);
367 	if (msr == msr_ops.misc(bank))
368 		return offsetof(struct mce, misc);
369 	if (msr == MSR_IA32_MCG_STATUS)
370 		return offsetof(struct mce, mcgstatus);
371 	return -1;
372 }
373 
374 /* MSR access wrappers used for error injection */
375 static u64 mce_rdmsrl(u32 msr)
376 {
377 	u64 v;
378 
379 	if (__this_cpu_read(injectm.finished)) {
380 		int offset = msr_to_offset(msr);
381 
382 		if (offset < 0)
383 			return 0;
384 		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
385 	}
386 
387 	if (rdmsrl_safe(msr, &v)) {
388 		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
389 		/*
390 		 * Return zero in case the access faulted. This should
391 		 * not happen normally but can happen if the CPU does
392 		 * something weird, or if the code is buggy.
393 		 */
394 		v = 0;
395 	}
396 
397 	return v;
398 }
399 
400 static void mce_wrmsrl(u32 msr, u64 v)
401 {
402 	if (__this_cpu_read(injectm.finished)) {
403 		int offset = msr_to_offset(msr);
404 
405 		if (offset >= 0)
406 			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
407 		return;
408 	}
409 	wrmsrl(msr, v);
410 }
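/*
 * How the wrappers above are exercised (sketch; the injector itself lives in
 * inject.c and the exact flow there is an assumption): the injector fills
 * this CPU's injectm with a fake record and sets injectm.finished, then
 * raises a machine check. From that point mce_rdmsrl()/mce_wrmsrl() redirect
 * the bank MSR accesses resolved by msr_to_offset() to the injectm fields,
 * e.g. reading msr_ops.status(bank) returns injectm.status instead of
 * touching real hardware.
 */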
411 
412 /*
413  * Collect all global (w.r.t. this processor) status about this machine
414  * check into our "mce" struct so that we can use it later to assess
415  * the severity of the problem as we read per-bank specific details.
416  */
417 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
418 {
419 	mce_setup(m);
420 
421 	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
422 	if (regs) {
423 		/*
424 		 * Get the address of the instruction at the time of
425 		 * the machine check error.
426 		 */
427 		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
428 			m->ip = regs->ip;
429 			m->cs = regs->cs;
430 
431 			/*
432 			 * When in VM86 mode, make the cs always look like ring 3.
433 			 * This is a lie, but it's better than passing
434 			 * the additional vm86 bit around everywhere.
435 			 */
436 			if (v8086_mode(regs))
437 				m->cs |= 3;
438 		}
439 		/* Use accurate RIP reporting if available. */
440 		if (mca_cfg.rip_msr)
441 			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
442 	}
443 }
444 
445 int mce_available(struct cpuinfo_x86 *c)
446 {
447 	if (mca_cfg.disabled)
448 		return 0;
449 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
450 }
451 
452 static void mce_schedule_work(void)
453 {
454 	if (!mce_gen_pool_empty())
455 		schedule_work(&mce_work);
456 }
457 
458 static void mce_irq_work_cb(struct irq_work *entry)
459 {
460 	mce_schedule_work();
461 }
462 
463 /*
464  * Check if the address reported by the CPU is in a format we can parse.
465  * It would be possible to add code for most other cases, but all would
466  * be somewhat complicated (e.g. segment offset would require an instruction
467  * parser). So only support physical addresses up to page granularity for now.
468  */
469 int mce_usable_address(struct mce *m)
470 {
471 	if (!(m->status & MCI_STATUS_ADDRV))
472 		return 0;
473 
474 	/* Checks after this one are Intel-specific: */
475 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
476 		return 1;
477 
478 	if (!(m->status & MCI_STATUS_MISCV))
479 		return 0;
480 
481 	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
482 		return 0;
483 
484 	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
485 		return 0;
486 
487 	return 1;
488 }
489 EXPORT_SYMBOL_GPL(mce_usable_address);
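/*
 * Worked example for the LSB check above: MCI_MISC_ADDR_LSB() is the least
 * significant valid bit of ADDR, so an LSB of 6 means the address is precise
 * to a 64-byte line and passes the PAGE_SHIFT (12) test, while an LSB of 30
 * would only pin down a 1 GB region and gets rejected.
 */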
490 
491 bool mce_is_memory_error(struct mce *m)
492 {
493 	if (m->cpuvendor == X86_VENDOR_AMD ||
494 	    m->cpuvendor == X86_VENDOR_HYGON) {
495 		return amd_mce_is_memory_error(m);
496 	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
497 		/*
498 		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
499 		 *
500 		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
501 		 * indicating a memory error. Bit 8 is used for indicating a
502 		 * cache hierarchy error. The combination of bit 2 and bit 3
503 		 * is used for indicating a `generic' cache hierarchy error.
504 		 * But we can't just blindly check the above bits, because if
505 		 * bit 11 is set, then it is a bus/interconnect error - and
506 		 * either way the above bits just give more detail on what
507 		 * bus/interconnect error happened. Note that bit 12 can be
508 		 * ignored, as it's the "filter" bit.
509 		 */
510 		return (m->status & 0xef80) == BIT(7) ||
511 		       (m->status & 0xef00) == BIT(8) ||
512 		       (m->status & 0xeffc) == 0xc;
513 	}
514 
515 	return false;
516 }
517 EXPORT_SYMBOL_GPL(mce_is_memory_error);
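/*
 * Worked example for the Intel masks above (codes are illustrative): an
 * MCACOD of 0x0090 (memory read error, channel 0) gives
 * (0x0090 & 0xef80) == BIT(7) and is counted as a memory error, while a
 * bus/interconnect code such as 0x0e0b has bit 11 set, so
 * (0x0e0b & 0xef80) == 0x0e00 != BIT(7) and it is correctly skipped.
 */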
518 
519 bool mce_is_correctable(struct mce *m)
520 {
521 	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
522 		return false;
523 
524 	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
525 		return false;
526 
527 	if (m->status & MCI_STATUS_UC)
528 		return false;
529 
530 	return true;
531 }
532 EXPORT_SYMBOL_GPL(mce_is_correctable);
533 
534 static bool cec_add_mce(struct mce *m)
535 {
536 	if (!m)
537 		return false;
538 
539 	/* We eat only correctable DRAM errors with usable addresses. */
540 	if (mce_is_memory_error(m) &&
541 	    mce_is_correctable(m)  &&
542 	    mce_usable_address(m))
543 		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
544 			return true;
545 
546 	return false;
547 }
548 
549 static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
550 			      void *data)
551 {
552 	struct mce *m = (struct mce *)data;
553 
554 	if (!m)
555 		return NOTIFY_DONE;
556 
557 	if (cec_add_mce(m))
558 		return NOTIFY_STOP;
559 
560 	/* Emit the trace record: */
561 	trace_mce_record(m);
562 
563 	set_bit(0, &mce_need_notify);
564 
565 	mce_notify_irq();
566 
567 	return NOTIFY_DONE;
568 }
569 
570 static struct notifier_block first_nb = {
571 	.notifier_call	= mce_first_notifier,
572 	.priority	= MCE_PRIO_FIRST,
573 };
574 
575 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
576 				void *data)
577 {
578 	struct mce *mce = (struct mce *)data;
579 	unsigned long pfn;
580 
581 	if (!mce)
582 		return NOTIFY_DONE;
583 
584 	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
585 		pfn = mce->addr >> PAGE_SHIFT;
586 		if (!memory_failure(pfn, 0))
587 			set_mce_nospec(pfn);
588 	}
589 
590 	return NOTIFY_OK;
591 }
592 static struct notifier_block mce_srao_nb = {
593 	.notifier_call	= srao_decode_notifier,
594 	.priority	= MCE_PRIO_SRAO,
595 };
596 
597 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
598 				void *data)
599 {
600 	struct mce *m = (struct mce *)data;
601 
602 	if (!m)
603 		return NOTIFY_DONE;
604 
605 	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
606 		return NOTIFY_DONE;
607 
608 	__print_mce(m);
609 
610 	return NOTIFY_DONE;
611 }
612 
613 static struct notifier_block mce_default_nb = {
614 	.notifier_call	= mce_default_notifier,
615 	/* lowest prio, we want it to run last. */
616 	.priority	= MCE_PRIO_LOWEST,
617 };
618 
619 /*
620  * Read ADDR and MISC registers.
621  */
622 static void mce_read_aux(struct mce *m, int i)
623 {
624 	if (m->status & MCI_STATUS_MISCV)
625 		m->misc = mce_rdmsrl(msr_ops.misc(i));
626 
627 	if (m->status & MCI_STATUS_ADDRV) {
628 		m->addr = mce_rdmsrl(msr_ops.addr(i));
629 
630 		/*
631 		 * Mask the reported address by the reported granularity.
632 		 */
633 		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
634 			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
635 			m->addr >>= shift;
636 			m->addr <<= shift;
637 		}
638 
639 		/*
640 		 * Extract [55:<lsb>] where lsb is the least significant
641 		 * *valid* bit of the address bits.
642 		 */
643 		if (mce_flags.smca) {
644 			u8 lsb = (m->addr >> 56) & 0x3f;
645 
646 			m->addr &= GENMASK_ULL(55, lsb);
647 		}
648 	}
649 
650 	if (mce_flags.smca) {
651 		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
652 
653 		if (m->status & MCI_STATUS_SYNDV)
654 			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
655 	}
656 }
657 
658 DEFINE_PER_CPU(unsigned, mce_poll_count);
659 
660 /*
661  * Poll for corrected events or events that happened before reset.
662  * Those are just logged through /dev/mcelog.
663  *
664  * This is executed in standard interrupt context.
665  *
666  * Note: the spec recommends panicking for fatal unsignaled
667  * errors here. However this would be quite problematic --
668  * we would need to reimplement the Monarch handling and
669  * it would mess up the exclusion between the exception handler
670  * and the poll handler -- so we skip this for now.
671  * These cases should not happen anyway, or only when the CPU
672  * is already totally confused. In this case it's likely it will
673  * not fully execute the machine check handler either.
674  */
675 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
676 {
677 	bool error_seen = false;
678 	struct mce m;
679 	int i;
680 
681 	this_cpu_inc(mce_poll_count);
682 
683 	mce_gather_info(&m, NULL);
684 
685 	if (flags & MCP_TIMESTAMP)
686 		m.tsc = rdtsc();
687 
688 	for (i = 0; i < mca_cfg.banks; i++) {
689 		if (!mce_banks[i].ctl || !test_bit(i, *b))
690 			continue;
691 
692 		m.misc = 0;
693 		m.addr = 0;
694 		m.bank = i;
695 
696 		barrier();
697 		m.status = mce_rdmsrl(msr_ops.status(i));
698 
699 		/* If this entry is not valid, ignore it */
700 		if (!(m.status & MCI_STATUS_VAL))
701 			continue;
702 
703 		/*
704 		 * If we are logging everything (at CPU online) or this
705 		 * is a corrected error, then we must log it.
706 		 */
707 		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
708 			goto log_it;
709 
710 		/*
711 		 * Newer Intel systems that support software error
712 		 * recovery need to make additional checks. Other
713 		 * CPUs should skip over uncorrected errors, but log
714 		 * everything else.
715 		 */
716 		if (!mca_cfg.ser) {
717 			if (m.status & MCI_STATUS_UC)
718 				continue;
719 			goto log_it;
720 		}
721 
722 		/* Log "not enabled" (speculative) errors */
723 		if (!(m.status & MCI_STATUS_EN))
724 			goto log_it;
725 
726 		/*
727 		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
728 		 * UC == 1 && PCC == 0 && S == 0
729 		 */
730 		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
731 			goto log_it;
732 
733 		/*
734 		 * Skip anything else. Presumption is that our read of this
735 		 * bank is racing with a machine check. Leave the log alone
736 		 * for do_machine_check() to deal with it.
737 		 */
738 		continue;
739 
740 log_it:
741 		error_seen = true;
742 
743 		mce_read_aux(&m, i);
744 
745 		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
746 
747 		/*
748 		 * Don't get the IP here because it's unlikely to
749 		 * have anything to do with the actual error location.
750 		 */
751 		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
752 			mce_log(&m);
753 		else if (mce_usable_address(&m)) {
754 			/*
755 			 * Although we skipped logging this, we still want
756 			 * to take action. Add to the pool so the registered
757 			 * notifiers will see it.
758 			 */
759 			if (!mce_gen_pool_add(&m))
760 				mce_schedule_work();
761 		}
762 
763 		/*
764 		 * Clear state for this bank.
765 		 */
766 		mce_wrmsrl(msr_ops.status(i), 0);
767 	}
768 
769 	/*
770 	 * Don't clear MCG_STATUS here because it's only defined for
771 	 * exceptions.
772 	 */
773 
774 	sync_core();
775 
776 	return error_seen;
777 }
778 EXPORT_SYMBOL_GPL(machine_check_poll);
779 
780 /*
781  * Do a quick check if any of the events requires a panic.
782  * This decides if we keep the events around or clear them.
783  */
784 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
785 			  struct pt_regs *regs)
786 {
787 	char *tmp;
788 	int i;
789 
790 	for (i = 0; i < mca_cfg.banks; i++) {
791 		m->status = mce_rdmsrl(msr_ops.status(i));
792 		if (!(m->status & MCI_STATUS_VAL))
793 			continue;
794 
795 		__set_bit(i, validp);
796 		if (quirk_no_way_out)
797 			quirk_no_way_out(i, m, regs);
798 
799 		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
800 			m->bank = i;
801 			mce_read_aux(m, i);
802 			*msg = tmp;
803 			return 1;
804 		}
805 	}
806 	return 0;
807 }
808 
809 /*
810  * Variable to establish order between CPUs while scanning.
811  * Each CPU spins initially until mce_executing is equal to its number.
812  */
813 static atomic_t mce_executing;
814 
815 /*
816  * Defines order of CPUs on entry. First CPU becomes Monarch.
817  */
818 static atomic_t mce_callin;
819 
820 /*
821  * Check if a timeout waiting for other CPUs happened.
822  */
823 static int mce_timed_out(u64 *t, const char *msg)
824 {
825 	/*
826 	 * The others already did panic for some reason.
827 	 * Bail out like in a timeout.
828 	 * rmb() to tell the compiler that system_state
829 	 * might have been modified by someone else.
830 	 */
831 	rmb();
832 	if (atomic_read(&mce_panicked))
833 		wait_for_panic();
834 	if (!mca_cfg.monarch_timeout)
835 		goto out;
836 	if ((s64)*t < SPINUNIT) {
837 		if (mca_cfg.tolerant <= 1)
838 			mce_panic(msg, NULL, NULL);
839 		cpu_missing = 1;
840 		return 1;
841 	}
842 	*t -= SPINUNIT;
843 out:
844 	touch_nmi_watchdog();
845 	return 0;
846 }
847 
848 /*
849  * The Monarch's reign.  The Monarch is the CPU who entered
850  * the machine check handler first. It waits for the others to
851  * raise the exception too and then grades them. When any
852  * error is fatal, panic. Only then let the others continue.
853  *
854  * The other CPUs entering the MCE handler will be controlled by the
855  * Monarch. They are called Subjects.
856  *
857  * This way we prevent any potential data corruption in an unrecoverable case
858  * and also make sure that all CPUs' errors are always examined.
859  *
860  * Also this detects the case of a machine check event coming from outer
861  * space (not detected by any CPU). In this case some external agent wants
862  * us to shut down, so panic too.
863  *
864  * The other CPUs might still decide to panic if the handler happens
865  * in an unrecoverable place, but in this case the system is in a semi-stable
866  * state and won't corrupt anything by itself. It's ok to let the others
867  * continue for a bit first.
868  *
869  * All the spin loops have timeouts; when a timeout happens a CPU
870  * typically elects itself to be Monarch.
871  */
872 static void mce_reign(void)
873 {
874 	int cpu;
875 	struct mce *m = NULL;
876 	int global_worst = 0;
877 	char *msg = NULL;
878 	char *nmsg = NULL;
879 
880 	/*
881 	 * This CPU is the Monarch and the other CPUs have run
882 	 * through their handlers.
883 	 * Grade the severity of the errors of all the CPUs.
884 	 */
885 	for_each_possible_cpu(cpu) {
886 		int severity = mce_severity(&per_cpu(mces_seen, cpu),
887 					    mca_cfg.tolerant,
888 					    &nmsg, true);
889 		if (severity > global_worst) {
890 			msg = nmsg;
891 			global_worst = severity;
892 			m = &per_cpu(mces_seen, cpu);
893 		}
894 	}
895 
896 	/*
897 	 * Cannot recover? Panic here then.
898 	 * This dumps all the mces in the log buffer and stops the
899 	 * other CPUs.
900 	 */
901 	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
902 		mce_panic("Fatal machine check", m, msg);
903 
904 	/*
905 	 * For UC somewhere we let the CPU that detects it handle it.
906 	 * Also must let the others continue, otherwise the handling
907 	 * CPU could deadlock on a lock.
908 	 */
909 
910 	/*
911 	 * No machine check event found. Must be some external
912 	 * source or one CPU is hung. Panic.
913 	 */
914 	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
915 		mce_panic("Fatal machine check from unknown source", NULL, NULL);
916 
917 	/*
918 	 * Now clear all the mces_seen so that they don't reappear on
919 	 * the next mce.
920 	 */
921 	for_each_possible_cpu(cpu)
922 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
923 }
924 
925 static atomic_t global_nwo;
926 
927 /*
928  * Start of Monarch synchronization. This waits until all CPUs have
929  * entered the exception handler and then determines if any of them
930  * saw a fatal event that requires panic. Then it executes them
931  * in the entry order.
932  * TBD double check parallel CPU hotunplug
933  */
934 static int mce_start(int *no_way_out)
935 {
936 	int order;
937 	int cpus = num_online_cpus();
938 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
939 
940 	if (!timeout)
941 		return -1;
942 
943 	atomic_add(*no_way_out, &global_nwo);
944 	/*
945 	 * Rely on the implied barrier below, such that global_nwo
946 	 * is updated before mce_callin.
947 	 */
948 	order = atomic_inc_return(&mce_callin);
949 
950 	/*
951 	 * Wait for everyone.
952 	 */
953 	while (atomic_read(&mce_callin) != cpus) {
954 		if (mce_timed_out(&timeout,
955 				  "Timeout: Not all CPUs entered broadcast exception handler")) {
956 			atomic_set(&global_nwo, 0);
957 			return -1;
958 		}
959 		ndelay(SPINUNIT);
960 	}
961 
962 	/*
963 	 * mce_callin should be read before global_nwo
964 	 */
965 	smp_rmb();
966 
967 	if (order == 1) {
968 		/*
969 		 * Monarch: Starts executing now, the others wait.
970 		 */
971 		atomic_set(&mce_executing, 1);
972 	} else {
973 		/*
974 		 * Subject: Now start the scanning loop one by one in
975 		 * the original callin order.
976 		 * This way, when there are any shared banks, each bank will be
977 		 * seen by only one CPU before it is cleared, avoiding duplicates.
978 		 */
979 		while (atomic_read(&mce_executing) < order) {
980 			if (mce_timed_out(&timeout,
981 					  "Timeout: Subject CPUs unable to finish machine check processing")) {
982 				atomic_set(&global_nwo, 0);
983 				return -1;
984 			}
985 			ndelay(SPINUNIT);
986 		}
987 	}
988 
989 	/*
990 	 * Cache the global no_way_out state.
991 	 */
992 	*no_way_out = atomic_read(&global_nwo);
993 
994 	return order;
995 }
996 
997 /*
998  * Synchronize between CPUs after main scanning loop.
999  * This invokes the bulk of the Monarch processing.
1000  */
1001 static int mce_end(int order)
1002 {
1003 	int ret = -1;
1004 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1005 
1006 	if (!timeout)
1007 		goto reset;
1008 	if (order < 0)
1009 		goto reset;
1010 
1011 	/*
1012 	 * Allow others to run.
1013 	 */
1014 	atomic_inc(&mce_executing);
1015 
1016 	if (order == 1) {
1017 		/* CHECKME: Can this race with a parallel hotplug? */
1018 		int cpus = num_online_cpus();
1019 
1020 		/*
1021 		 * Monarch: Wait for everyone to go through their scanning
1022 		 * loops.
1023 		 */
1024 		while (atomic_read(&mce_executing) <= cpus) {
1025 			if (mce_timed_out(&timeout,
1026 					  "Timeout: Monarch CPU unable to finish machine check processing"))
1027 				goto reset;
1028 			ndelay(SPINUNIT);
1029 		}
1030 
1031 		mce_reign();
1032 		barrier();
1033 		ret = 0;
1034 	} else {
1035 		/*
1036 		 * Subject: Wait for Monarch to finish.
1037 		 */
1038 		while (atomic_read(&mce_executing) != 0) {
1039 			if (mce_timed_out(&timeout,
1040 					  "Timeout: Monarch CPU did not finish machine check processing"))
1041 				goto reset;
1042 			ndelay(SPINUNIT);
1043 		}
1044 
1045 		/*
1046 		 * Don't reset anything. That's done by the Monarch.
1047 		 */
1048 		return 0;
1049 	}
1050 
1051 	/*
1052 	 * Reset all global state.
1053 	 */
1054 reset:
1055 	atomic_set(&global_nwo, 0);
1056 	atomic_set(&mce_callin, 0);
1057 	barrier();
1058 
1059 	/*
1060 	 * Let others run again.
1061 	 */
1062 	atomic_set(&mce_executing, 0);
1063 	return ret;
1064 }
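/*
 * Rough sequence of the mce_start()/mce_end() rendezvous above, for
 * illustration (three CPUs, no timeouts): all CPUs first spin until
 * mce_callin reaches 3. The CPU that called in first (order 1, the Monarch)
 * sets mce_executing to 1 and scans its banks; each Subject spins until
 * mce_executing reaches its own order, scans, and bumps mce_executing in
 * mce_end(). Once the count passes the number of CPUs, the Monarch runs
 * mce_reign(), resets the counters and finally sets mce_executing back to 0,
 * which releases the Subjects waiting in mce_end().
 */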
1065 
1066 static void mce_clear_state(unsigned long *toclear)
1067 {
1068 	int i;
1069 
1070 	for (i = 0; i < mca_cfg.banks; i++) {
1071 		if (test_bit(i, toclear))
1072 			mce_wrmsrl(msr_ops.status(i), 0);
1073 	}
1074 }
1075 
1076 static int do_memory_failure(struct mce *m)
1077 {
1078 	int flags = MF_ACTION_REQUIRED;
1079 	int ret;
1080 
1081 	pr_err("Uncorrected hardware memory error in user-access at %llx\n", m->addr);
1082 	if (!(m->mcgstatus & MCG_STATUS_RIPV))
1083 		flags |= MF_MUST_KILL;
1084 	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1085 	if (ret)
1086 		pr_err("Memory error not recovered\n");
1087 	else
1088 		set_mce_nospec(m->addr >> PAGE_SHIFT);
1089 	return ret;
1090 }
1091 
1092 
1093 /*
1094  * Cases where we avoid rendezvous handler timeout:
1095  * 1) If this CPU is offline.
1096  *
1097  * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1098  *  skip those CPUs which remain looping in the 1st kernel - see
1099  *  crash_nmi_callback().
1100  *
1101  * Note: there still is a small window between kexec-ing and the new,
1102  * kdump kernel establishing a new #MC handler where a broadcasted MCE
1103  * might not get handled properly.
1104  */
1105 static bool __mc_check_crashing_cpu(int cpu)
1106 {
1107 	if (cpu_is_offline(cpu) ||
1108 	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
1109 		u64 mcgstatus;
1110 
1111 		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1112 		if (mcgstatus & MCG_STATUS_RIPV) {
1113 			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1114 			return true;
1115 		}
1116 	}
1117 	return false;
1118 }
1119 
1120 static void __mc_scan_banks(struct mce *m, struct mce *final,
1121 			    unsigned long *toclear, unsigned long *valid_banks,
1122 			    int no_way_out, int *worst)
1123 {
1124 	struct mca_config *cfg = &mca_cfg;
1125 	int severity, i;
1126 
1127 	for (i = 0; i < cfg->banks; i++) {
1128 		__clear_bit(i, toclear);
1129 		if (!test_bit(i, valid_banks))
1130 			continue;
1131 
1132 		if (!mce_banks[i].ctl)
1133 			continue;
1134 
1135 		m->misc = 0;
1136 		m->addr = 0;
1137 		m->bank = i;
1138 
1139 		m->status = mce_rdmsrl(msr_ops.status(i));
1140 		if (!(m->status & MCI_STATUS_VAL))
1141 			continue;
1142 
1143 		/*
1144 		 * Corrected or non-signaled errors are handled by
1145 		 * machine_check_poll(). Leave them alone, unless this panics.
1146 		 */
1147 		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1148 			!no_way_out)
1149 			continue;
1150 
1151 		/* Set taint even when machine check was not enabled. */
1152 		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1153 
1154 		severity = mce_severity(m, cfg->tolerant, NULL, true);
1155 
1156 		/*
1157 		 * When the machine check was for the corrected/deferred handler,
1158 		 * don't touch it unless we're panicking.
1159 		 */
1160 		if ((severity == MCE_KEEP_SEVERITY ||
1161 		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1162 			continue;
1163 
1164 		__set_bit(i, toclear);
1165 
1166 		/* Machine check event was not enabled. Clear, but ignore. */
1167 		if (severity == MCE_NO_SEVERITY)
1168 			continue;
1169 
1170 		mce_read_aux(m, i);
1171 
1172 		/* assuming valid severity level != 0 */
1173 		m->severity = severity;
1174 
1175 		mce_log(m);
1176 
1177 		if (severity > *worst) {
1178 			*final = *m;
1179 			*worst = severity;
1180 		}
1181 	}
1182 
1183 	/* mce_clear_state will clear *final, save locally for use later */
1184 	*m = *final;
1185 }
1186 
1187 /*
1188  * The actual machine check handler. This only handles real
1189  * exceptions when something got corrupted coming in through int 18.
1190  *
1191  * This is executed in NMI context not subject to normal locking rules. This
1192  * implies that most kernel services cannot be safely used. Don't even
1193  * think about putting a printk in there!
1194  *
1195  * On Intel systems this is entered on all CPUs in parallel through
1196  * MCE broadcast. However some CPUs might be broken beyond repair,
1197  * so be always careful when synchronizing with others.
1198  */
1199 void do_machine_check(struct pt_regs *regs, long error_code)
1200 {
1201 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1202 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1203 	struct mca_config *cfg = &mca_cfg;
1204 	int cpu = smp_processor_id();
1205 	char *msg = "Unknown";
1206 	struct mce m, *final;
1207 	int worst = 0;
1208 
1209 	/*
1210 	 * Establish sequential order between the CPUs entering the machine
1211 	 * check handler.
1212 	 */
1213 	int order = -1;
1214 
1215 	/*
1216 	 * If no_way_out gets set, there is no safe way to recover from this
1217 	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1218 	 */
1219 	int no_way_out = 0;
1220 
1221 	/*
1222 	 * If kill_it gets set, there might be a way to recover from this
1223 	 * error.
1224 	 */
1225 	int kill_it = 0;
1226 
1227 	/*
1228 	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1229 	 * on Intel.
1230 	 */
1231 	int lmce = 1;
1232 
1233 	if (__mc_check_crashing_cpu(cpu))
1234 		return;
1235 
1236 	ist_enter(regs);
1237 
1238 	this_cpu_inc(mce_exception_count);
1239 
1240 	mce_gather_info(&m, regs);
1241 	m.tsc = rdtsc();
1242 
1243 	final = this_cpu_ptr(&mces_seen);
1244 	*final = m;
1245 
1246 	memset(valid_banks, 0, sizeof(valid_banks));
1247 	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1248 
1249 	barrier();
1250 
1251 	/*
1252 	 * When there is no restart IP, we might need to kill or panic.
1253 	 * Assume the worst for now, but if we find the
1254 	 * severity is MCE_AR_SEVERITY we have other options.
1255 	 */
1256 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1257 		kill_it = 1;
1258 
1259 	/*
1260 	 * Check if this MCE is signaled to only this logical processor,
1261 	 * on Intel only.
1262 	 */
1263 	if (m.cpuvendor == X86_VENDOR_INTEL)
1264 		lmce = m.mcgstatus & MCG_STATUS_LMCES;
1265 
1266 	/*
1267 	 * A local machine check may already know that we have to panic.
1268 	 * A broadcast machine check begins the rendezvous in mce_start().
1269 	 * Go through all banks in exclusion of the other CPUs. This way we
1270 	 * don't report duplicated events on shared banks because the first one
1271 	 * to see it will clear it.
1272 	 */
1273 	if (lmce) {
1274 		if (no_way_out)
1275 			mce_panic("Fatal local machine check", &m, msg);
1276 	} else {
1277 		order = mce_start(&no_way_out);
1278 	}
1279 
1280 	__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
1281 
1282 	if (!no_way_out)
1283 		mce_clear_state(toclear);
1284 
1285 	/*
1286 	 * Do most of the synchronization with other CPUs.
1287 	 * When there's any problem, use only the local no_way_out state.
1288 	 */
1289 	if (!lmce) {
1290 		if (mce_end(order) < 0)
1291 			no_way_out = worst >= MCE_PANIC_SEVERITY;
1292 	} else {
1293 		/*
1294 		 * If there was a fatal machine check we should have
1295 		 * already called mce_panic earlier in this function.
1296 		 * Since we re-read the banks, we might have found
1297 		 * something new. Check again to see if we found a
1298 		 * fatal error. We call "mce_severity()" again to
1299 		 * make sure we have the right "msg".
1300 		 */
1301 		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1302 			mce_severity(&m, cfg->tolerant, &msg, true);
1303 			mce_panic("Local fatal machine check!", &m, msg);
1304 		}
1305 	}
1306 
1307 	/*
1308 	 * If tolerant is at an insane level we drop requests to kill
1309 	 * processes and continue even when there is no way out.
1310 	 */
1311 	if (cfg->tolerant == 3)
1312 		kill_it = 0;
1313 	else if (no_way_out)
1314 		mce_panic("Fatal machine check on current CPU", &m, msg);
1315 
1316 	if (worst > 0)
1317 		irq_work_queue(&mce_irq_work);
1318 
1319 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1320 
1321 	sync_core();
1322 
1323 	if (worst != MCE_AR_SEVERITY && !kill_it)
1324 		goto out_ist;
1325 
1326 	/* Fault was in user mode and we need to take some action */
1327 	if ((m.cs & 3) == 3) {
1328 		ist_begin_non_atomic(regs);
1329 		local_irq_enable();
1330 
1331 		if (kill_it || do_memory_failure(&m))
1332 			force_sig(SIGBUS, current);
1333 		local_irq_disable();
1334 		ist_end_non_atomic();
1335 	} else {
1336 		if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
1337 			mce_panic("Failed kernel mode recovery", &m, NULL);
1338 	}
1339 
1340 out_ist:
1341 	ist_exit(regs);
1342 }
1343 EXPORT_SYMBOL_GPL(do_machine_check);
1344 
1345 #ifndef CONFIG_MEMORY_FAILURE
1346 int memory_failure(unsigned long pfn, int flags)
1347 {
1348 	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1349 	BUG_ON(flags & MF_ACTION_REQUIRED);
1350 	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1351 	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1352 	       pfn);
1353 
1354 	return 0;
1355 }
1356 #endif
1357 
1358 /*
1359  * Periodic polling timer for "silent" machine check errors.  If the
1360  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1361  * errors, poll 2x slower (up to check_interval seconds).
1362  */
1363 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1364 
1365 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1366 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1367 
1368 static unsigned long mce_adjust_timer_default(unsigned long interval)
1369 {
1370 	return interval;
1371 }
1372 
1373 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1374 
1375 static void __start_timer(struct timer_list *t, unsigned long interval)
1376 {
1377 	unsigned long when = jiffies + interval;
1378 	unsigned long flags;
1379 
1380 	local_irq_save(flags);
1381 
1382 	if (!timer_pending(t) || time_before(when, t->expires))
1383 		mod_timer(t, round_jiffies(when));
1384 
1385 	local_irq_restore(flags);
1386 }
1387 
1388 static void mce_timer_fn(struct timer_list *t)
1389 {
1390 	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1391 	unsigned long iv;
1392 
1393 	WARN_ON(cpu_t != t);
1394 
1395 	iv = __this_cpu_read(mce_next_interval);
1396 
1397 	if (mce_available(this_cpu_ptr(&cpu_info))) {
1398 		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1399 
1400 		if (mce_intel_cmci_poll()) {
1401 			iv = mce_adjust_timer(iv);
1402 			goto done;
1403 		}
1404 	}
1405 
1406 	/*
1407 	 * Alert userspace if needed. If we logged an MCE, reduce the polling
1408 	 * interval, otherwise increase the polling interval.
1409 	 */
1410 	if (mce_notify_irq())
1411 		iv = max(iv / 2, (unsigned long) HZ/100);
1412 	else
1413 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1414 
1415 done:
1416 	__this_cpu_write(mce_next_interval, iv);
1417 	__start_timer(t, iv);
1418 }
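/*
 * Illustrative numbers for the adjustment above (assuming HZ=1000 and the
 * usual 5 minute INITIAL_CHECK_INTERVAL): the per-CPU interval starts at
 * check_interval * HZ = 300000 jiffies, is halved whenever events were
 * flagged for notification (down to the HZ/100 = 10 jiffy floor), and is
 * doubled again on quiet polls until it saturates at check_interval.
 */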
1419 
1420 /*
1421  * Ensure that the timer is firing in @interval from now.
1422  */
1423 void mce_timer_kick(unsigned long interval)
1424 {
1425 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1426 	unsigned long iv = __this_cpu_read(mce_next_interval);
1427 
1428 	__start_timer(t, interval);
1429 
1430 	if (interval < iv)
1431 		__this_cpu_write(mce_next_interval, interval);
1432 }
1433 
1434 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1435 static void mce_timer_delete_all(void)
1436 {
1437 	int cpu;
1438 
1439 	for_each_online_cpu(cpu)
1440 		del_timer_sync(&per_cpu(mce_timer, cpu));
1441 }
1442 
1443 /*
1444  * Notify the user(s) about new machine check events.
1445  * Can be called from interrupt context, but not from machine check/NMI
1446  * context.
1447  */
1448 int mce_notify_irq(void)
1449 {
1450 	/* Not more than two messages every minute */
1451 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1452 
1453 	if (test_and_clear_bit(0, &mce_need_notify)) {
1454 		mce_work_trigger();
1455 
1456 		if (__ratelimit(&ratelimit))
1457 			pr_info(HW_ERR "Machine check events logged\n");
1458 
1459 		return 1;
1460 	}
1461 	return 0;
1462 }
1463 EXPORT_SYMBOL_GPL(mce_notify_irq);
1464 
1465 static int __mcheck_cpu_mce_banks_init(void)
1466 {
1467 	int i;
1468 
1469 	mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
1470 	if (!mce_banks)
1471 		return -ENOMEM;
1472 
1473 	for (i = 0; i < MAX_NR_BANKS; i++) {
1474 		struct mce_bank *b = &mce_banks[i];
1475 
1476 		b->ctl = -1ULL;
1477 		b->init = 1;
1478 	}
1479 	return 0;
1480 }
1481 
1482 /*
1483  * Initialize Machine Checks for a CPU.
1484  */
1485 static int __mcheck_cpu_cap_init(void)
1486 {
1487 	u64 cap;
1488 	u8 b;
1489 
1490 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1491 
1492 	b = cap & MCG_BANKCNT_MASK;
1493 	if (WARN_ON_ONCE(b > MAX_NR_BANKS))
1494 		b = MAX_NR_BANKS;
1495 
1496 	mca_cfg.banks = max(mca_cfg.banks, b);
1497 
1498 	if (!mce_banks) {
1499 		int err = __mcheck_cpu_mce_banks_init();
1500 		if (err)
1501 			return err;
1502 	}
1503 
1504 	/* Use accurate RIP reporting if available. */
1505 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1506 		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1507 
1508 	if (cap & MCG_SER_P)
1509 		mca_cfg.ser = 1;
1510 
1511 	return 0;
1512 }
1513 
1514 static void __mcheck_cpu_init_generic(void)
1515 {
1516 	enum mcp_flags m_fl = 0;
1517 	mce_banks_t all_banks;
1518 	u64 cap;
1519 
1520 	if (!mca_cfg.bootlog)
1521 		m_fl = MCP_DONTLOG;
1522 
1523 	/*
1524 	 * Log the machine checks left over from the previous reset.
1525 	 */
1526 	bitmap_fill(all_banks, MAX_NR_BANKS);
1527 	machine_check_poll(MCP_UC | m_fl, &all_banks);
1528 
1529 	cr4_set_bits(X86_CR4_MCE);
1530 
1531 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1532 	if (cap & MCG_CTL_P)
1533 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1534 }
1535 
1536 static void __mcheck_cpu_init_clear_banks(void)
1537 {
1538 	int i;
1539 
1540 	for (i = 0; i < mca_cfg.banks; i++) {
1541 		struct mce_bank *b = &mce_banks[i];
1542 
1543 		if (!b->init)
1544 			continue;
1545 		wrmsrl(msr_ops.ctl(i), b->ctl);
1546 		wrmsrl(msr_ops.status(i), 0);
1547 	}
1548 }
1549 
1550 /*
1551  * During IFU recovery, Sandy Bridge-EP 4S processors set the RIPV and
1552  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1553  * Vol 3B Table 15-20). But this confuses both the code that determines
1554  * whether the machine check occurred in kernel or user mode, and also
1555  * the severity assessment code. Pretend that EIPV was set, and take the
1556  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1557  */
1558 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1559 {
1560 	if (bank != 0)
1561 		return;
1562 	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1563 		return;
1564 	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1565 		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1566 			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1567 			  MCACOD)) !=
1568 			 (MCI_STATUS_UC|MCI_STATUS_EN|
1569 			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1570 			  MCI_STATUS_AR|MCACOD_INSTR))
1571 		return;
1572 
1573 	m->mcgstatus |= MCG_STATUS_EIPV;
1574 	m->ip = regs->ip;
1575 	m->cs = regs->cs;
1576 }
1577 
1578 /* Add per CPU specific workarounds here */
1579 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1580 {
1581 	struct mca_config *cfg = &mca_cfg;
1582 
1583 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1584 		pr_info("unknown CPU type - not enabling MCE support\n");
1585 		return -EOPNOTSUPP;
1586 	}
1587 
1588 	/* This should be disabled by the BIOS, but isn't always */
1589 	if (c->x86_vendor == X86_VENDOR_AMD) {
1590 		if (c->x86 == 15 && cfg->banks > 4) {
1591 			/*
1592 			 * disable GART TBL walk error reporting, which
1593 			 * trips off incorrectly with the IOMMU & 3ware
1594 			 * & Cerberus:
1595 			 */
1596 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1597 		}
1598 		if (c->x86 < 0x11 && cfg->bootlog < 0) {
1599 			/*
1600 			 * Lots of broken BIOSes around that don't clear them
1601 			 * by default and leave crap in there. Don't log:
1602 			 */
1603 			cfg->bootlog = 0;
1604 		}
1605 		/*
1606 		 * Various K7s with broken bank 0 around. Always disable
1607 		 * by default.
1608 		 */
1609 		if (c->x86 == 6 && cfg->banks > 0)
1610 			mce_banks[0].ctl = 0;
1611 
1612 		/*
1613 		 * overflow_recov is supported for F15h Models 00h-0fh
1614 		 * even though we don't have a CPUID bit for it.
1615 		 */
1616 		if (c->x86 == 0x15 && c->x86_model <= 0xf)
1617 			mce_flags.overflow_recov = 1;
1618 
1619 	}
1620 
1621 	if (c->x86_vendor == X86_VENDOR_INTEL) {
1622 		/*
1623 		 * SDM documents that on family 6 bank 0 should not be written
1624 		 * because it aliases to another special BIOS controlled
1625 		 * register.
1626 		 * But it's not aliased anymore on model 0x1a+.
1627 		 * Don't ignore bank 0 completely because there could be a
1628 		 * valid event later, merely don't write CTL0.
1629 		 */
1630 
1631 		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1632 			mce_banks[0].init = 0;
1633 
1634 		/*
1635 		 * All newer Intel systems support MCE broadcasting. Enable
1636 		 * synchronization with a one second timeout.
1637 		 */
1638 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1639 			cfg->monarch_timeout < 0)
1640 			cfg->monarch_timeout = USEC_PER_SEC;
1641 
1642 		/*
1643 		 * There are also broken BIOSes on some Pentium M and
1644 		 * earlier systems:
1645 		 */
1646 		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1647 			cfg->bootlog = 0;
1648 
1649 		if (c->x86 == 6 && c->x86_model == 45)
1650 			quirk_no_way_out = quirk_sandybridge_ifu;
1651 	}
1652 	if (cfg->monarch_timeout < 0)
1653 		cfg->monarch_timeout = 0;
1654 	if (cfg->bootlog != 0)
1655 		cfg->panic_timeout = 30;
1656 
1657 	return 0;
1658 }
1659 
1660 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1661 {
1662 	if (c->x86 != 5)
1663 		return 0;
1664 
1665 	switch (c->x86_vendor) {
1666 	case X86_VENDOR_INTEL:
1667 		intel_p5_mcheck_init(c);
1668 		return 1;
1670 	case X86_VENDOR_CENTAUR:
1671 		winchip_mcheck_init(c);
1672 		return 1;
1674 	default:
1675 		return 0;
1676 	}
1677 
1678 	return 0;
1679 }
1680 
1681 /*
1682  * Init basic CPU features needed for early decoding of MCEs.
1683  */
1684 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1685 {
1686 	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
1687 		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1688 		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
1689 		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
1690 
1691 		if (mce_flags.smca) {
1692 			msr_ops.ctl	= smca_ctl_reg;
1693 			msr_ops.status	= smca_status_reg;
1694 			msr_ops.addr	= smca_addr_reg;
1695 			msr_ops.misc	= smca_misc_reg;
1696 		}
1697 	}
1698 }
1699 
1700 static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1701 {
1702 	struct mca_config *cfg = &mca_cfg;
1703 
1704 	 /*
1705 	  * All newer Centaur CPUs support MCE broadcasting. Enable
1706 	  * synchronization with a one second timeout.
1707 	  */
1708 	if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1709 	     c->x86 > 6) {
1710 		if (cfg->monarch_timeout < 0)
1711 			cfg->monarch_timeout = USEC_PER_SEC;
1712 	}
1713 }
1714 
1715 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1716 {
1717 	switch (c->x86_vendor) {
1718 	case X86_VENDOR_INTEL:
1719 		mce_intel_feature_init(c);
1720 		mce_adjust_timer = cmci_intel_adjust_timer;
1721 		break;
1722 
1723 	case X86_VENDOR_AMD:
1724 		mce_amd_feature_init(c);
1725 		break;
1727 
1728 	case X86_VENDOR_HYGON:
1729 		mce_hygon_feature_init(c);
1730 		break;
1731 
1732 	case X86_VENDOR_CENTAUR:
1733 		mce_centaur_feature_init(c);
1734 		break;
1735 
1736 	default:
1737 		break;
1738 	}
1739 }
1740 
1741 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1742 {
1743 	switch (c->x86_vendor) {
1744 	case X86_VENDOR_INTEL:
1745 		mce_intel_feature_clear(c);
1746 		break;
1747 	default:
1748 		break;
1749 	}
1750 }
1751 
1752 static void mce_start_timer(struct timer_list *t)
1753 {
1754 	unsigned long iv = check_interval * HZ;
1755 
1756 	if (mca_cfg.ignore_ce || !iv)
1757 		return;
1758 
1759 	this_cpu_write(mce_next_interval, iv);
1760 	__start_timer(t, iv);
1761 }
1762 
1763 static void __mcheck_cpu_setup_timer(void)
1764 {
1765 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1766 
1767 	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1768 }
1769 
1770 static void __mcheck_cpu_init_timer(void)
1771 {
1772 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1773 
1774 	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1775 	mce_start_timer(t);
1776 }
1777 
1778 bool filter_mce(struct mce *m)
1779 {
1780 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1781 		return amd_filter_mce(m);
1782 
1783 	return false;
1784 }
1785 
1786 /* Handle unconfigured int18 (should never happen) */
1787 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1788 {
1789 	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1790 	       smp_processor_id());
1791 }
1792 
1793 /* Call the installed machine check handler for this CPU setup. */
1794 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1795 						unexpected_machine_check;
1796 
1797 dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1798 {
1799 	machine_check_vector(regs, error_code);
1800 }
1801 
1802 /*
1803  * Called for each booted CPU to set up machine checks.
1804  * Must be called with preempt off:
1805  */
1806 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1807 {
1808 	if (mca_cfg.disabled)
1809 		return;
1810 
1811 	if (__mcheck_cpu_ancient_init(c))
1812 		return;
1813 
1814 	if (!mce_available(c))
1815 		return;
1816 
1817 	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1818 		mca_cfg.disabled = 1;
1819 		return;
1820 	}
1821 
1822 	if (mce_gen_pool_init()) {
1823 		mca_cfg.disabled = 1;
1824 		pr_emerg("Couldn't allocate MCE records pool!\n");
1825 		return;
1826 	}
1827 
1828 	machine_check_vector = do_machine_check;
1829 
1830 	__mcheck_cpu_init_early(c);
1831 	__mcheck_cpu_init_generic();
1832 	__mcheck_cpu_init_vendor(c);
1833 	__mcheck_cpu_init_clear_banks();
1834 	__mcheck_cpu_setup_timer();
1835 }
1836 
1837 /*
1838  * Called for each booted CPU to clear some machine check opt-ins.
1839  */
1840 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1841 {
1842 	if (mca_cfg.disabled)
1843 		return;
1844 
1845 	if (!mce_available(c))
1846 		return;
1847 
1848 	/*
1849 	 * Possibly to clear general settings generic to x86
1850 	 * __mcheck_cpu_clear_generic(c);
1851 	 */
1852 	__mcheck_cpu_clear_vendor(c);
1853 
1854 }
1855 
1856 static void __mce_disable_bank(void *arg)
1857 {
1858 	int bank = *((int *)arg);
1859 	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1860 	cmci_disable_bank(bank);
1861 }
1862 
1863 void mce_disable_bank(int bank)
1864 {
1865 	if (bank >= mca_cfg.banks) {
1866 		pr_warn(FW_BUG
1867 			"Ignoring request to disable invalid MCA bank %d.\n",
1868 			bank);
1869 		return;
1870 	}
1871 	set_bit(bank, mce_banks_ce_disabled);
1872 	on_each_cpu(__mce_disable_bank, &bank, 1);
1873 }
1874 
1875 /*
1876  * mce=off Disables machine check
1877  * mce=no_cmci Disables CMCI
1878  * mce=no_lmce Disables LMCE
1879  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1880  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1881  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1882  *	monarchtimeout is how long to wait for other CPUs on machine
1883  *	check, or 0 to not wait
1884  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1885  *	and older.
1886  * mce=nobootlog Don't log MCEs from before booting.
1887  * mce=bios_cmci_threshold Don't program the CMCI threshold
1888  * mce=recovery force enable memcpy_mcsafe()
1889  */
1890 static int __init mcheck_enable(char *str)
1891 {
1892 	struct mca_config *cfg = &mca_cfg;
1893 
1894 	if (*str == 0) {
1895 		enable_p5_mce();
1896 		return 1;
1897 	}
1898 	if (*str == '=')
1899 		str++;
1900 	if (!strcmp(str, "off"))
1901 		cfg->disabled = 1;
1902 	else if (!strcmp(str, "no_cmci"))
1903 		cfg->cmci_disabled = true;
1904 	else if (!strcmp(str, "no_lmce"))
1905 		cfg->lmce_disabled = 1;
1906 	else if (!strcmp(str, "dont_log_ce"))
1907 		cfg->dont_log_ce = true;
1908 	else if (!strcmp(str, "ignore_ce"))
1909 		cfg->ignore_ce = true;
1910 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1911 		cfg->bootlog = (str[0] == 'b');
1912 	else if (!strcmp(str, "bios_cmci_threshold"))
1913 		cfg->bios_cmci_threshold = 1;
1914 	else if (!strcmp(str, "recovery"))
1915 		cfg->recovery = 1;
1916 	else if (isdigit(str[0])) {
1917 		if (get_option(&str, &cfg->tolerant) == 2)
1918 			get_option(&str, &(cfg->monarch_timeout));
1919 	} else {
1920 		pr_info("mce argument %s ignored. Please use /sys\n", str);
1921 		return 0;
1922 	}
1923 	return 1;
1924 }
1925 __setup("mce", mcheck_enable);
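/*
 * Example command lines (illustrative): "mce=off" sets mca_cfg.disabled,
 * "mce=no_cmci" sets cmci_disabled, and a numeric form such as "mce=2,500"
 * is parsed by the get_option() calls above into tolerant=2 and
 * monarch_timeout=500 microseconds.
 */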
1926 
1927 int __init mcheck_init(void)
1928 {
1929 	mcheck_intel_therm_init();
1930 	mce_register_decode_chain(&first_nb);
1931 	mce_register_decode_chain(&mce_srao_nb);
1932 	mce_register_decode_chain(&mce_default_nb);
1933 	mcheck_vendor_init_severity();
1934 
1935 	INIT_WORK(&mce_work, mce_gen_pool_process);
1936 	init_irq_work(&mce_irq_work, mce_irq_work_cb);
1937 
1938 	return 0;
1939 }
1940 
1941 /*
1942  * mce_syscore: PM support
1943  */
1944 
1945 /*
1946  * Disable machine checks on suspend and shutdown. We can't really handle
1947  * them later.
1948  */
1949 static void mce_disable_error_reporting(void)
1950 {
1951 	int i;
1952 
1953 	for (i = 0; i < mca_cfg.banks; i++) {
1954 		struct mce_bank *b = &mce_banks[i];
1955 
1956 		if (b->init)
1957 			wrmsrl(msr_ops.ctl(i), 0);
1958 	}
1960 }
1961 
1962 static void vendor_disable_error_reporting(void)
1963 {
1964 	/*
1965 	 * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
1966 	 * are socket-wide.
1967 	 * Disabling them for just a single offlined CPU is bad, since it will
1968 	 * inhibit reporting for all shared resources on the socket like the
1969 	 * last level cache (LLC), the integrated memory controller (iMC), etc.
1970 	 */
1971 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
1972 	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
1973 	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1974 		return;
1975 
1976 	mce_disable_error_reporting();
1977 }
1978 
1979 static int mce_syscore_suspend(void)
1980 {
1981 	vendor_disable_error_reporting();
1982 	return 0;
1983 }
1984 
1985 static void mce_syscore_shutdown(void)
1986 {
1987 	vendor_disable_error_reporting();
1988 }
1989 
1990 /*
1991  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1992  * Only one CPU is active at this time, the others get re-added later using
1993  * CPU hotplug:
1994  */
1995 static void mce_syscore_resume(void)
1996 {
1997 	__mcheck_cpu_init_generic();
1998 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
1999 	__mcheck_cpu_init_clear_banks();
2000 }
2001 
2002 static struct syscore_ops mce_syscore_ops = {
2003 	.suspend	= mce_syscore_suspend,
2004 	.shutdown	= mce_syscore_shutdown,
2005 	.resume		= mce_syscore_resume,
2006 };
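
/*
 * Registered with register_syscore_ops() in mcheck_init_device(), so MCE
 * reporting is switched off for suspend/shutdown and reinitialized from
 * scratch on resume.
 */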
2007 
2008 /*
2009  * mce_device: Sysfs support
2010  */
2011 
2012 static void mce_cpu_restart(void *data)
2013 {
2014 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2015 		return;
2016 	__mcheck_cpu_init_generic();
2017 	__mcheck_cpu_init_clear_banks();
2018 	__mcheck_cpu_init_timer();
2019 }
2020 
2021 /* Reinitialize machine check handling after user configuration changes */
2022 static void mce_restart(void)
2023 {
2024 	mce_timer_delete_all();
2025 	on_each_cpu(mce_cpu_restart, NULL, 1);
2026 }
2027 
2028 /* Toggle features for corrected errors */
2029 static void mce_disable_cmci(void *data)
2030 {
2031 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2032 		return;
2033 	cmci_clear();
2034 }
2035 
2036 static void mce_enable_ce(void *all)
2037 {
2038 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2039 		return;
2040 	cmci_reenable();
2041 	cmci_recheck();
2042 	if (all)
2043 		__mcheck_cpu_init_timer();
2044 }
2045 
2046 static struct bus_type mce_subsys = {
2047 	.name		= "machinecheck",
2048 	.dev_name	= "machinecheck",
2049 };
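
/*
 * Registered through subsys_system_register() in mcheck_init_device(); the
 * per-CPU devices created in mce_device_create() then appear as
 * /sys/devices/system/machinecheck/machinecheck<N>/.
 */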
2050 
2051 DEFINE_PER_CPU(struct device *, mce_device);
2052 
2053 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2054 {
2055 	return container_of(attr, struct mce_bank, attr);
2056 }
2057 
2058 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2059 			 char *buf)
2060 {
2061 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2062 }
2063 
2064 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2065 			const char *buf, size_t size)
2066 {
2067 	u64 new;
2068 
2069 	if (kstrtou64(buf, 0, &new) < 0)
2070 		return -EINVAL;
2071 
2072 	attr_to_bank(attr)->ctl = new;
2073 	mce_restart();
2074 
2075 	return size;
2076 }
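
/*
 * The per-bank "bank<N>" attributes created in mce_init_banks() use the two
 * helpers above. Illustrative shell usage (the value shown is the all-ones
 * default control mask):
 *
 *	# cat /sys/devices/system/machinecheck/machinecheck0/bank0
 *	ffffffffffffffff
 *	# echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * A write updates the shared bank control value and calls mce_restart(),
 * which reprograms the bank control MSR on every online CPU.
 */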
2077 
2078 static ssize_t set_ignore_ce(struct device *s,
2079 			     struct device_attribute *attr,
2080 			     const char *buf, size_t size)
2081 {
2082 	u64 new;
2083 
2084 	if (kstrtou64(buf, 0, &new) < 0)
2085 		return -EINVAL;
2086 
2087 	mutex_lock(&mce_sysfs_mutex);
2088 	if (mca_cfg.ignore_ce ^ !!new) {
2089 		if (new) {
2090 			/* disable ce features */
2091 			mce_timer_delete_all();
2092 			on_each_cpu(mce_disable_cmci, NULL, 1);
2093 			mca_cfg.ignore_ce = true;
2094 		} else {
2095 			/* enable ce features */
2096 			mca_cfg.ignore_ce = false;
2097 			on_each_cpu(mce_enable_ce, (void *)1, 1);
2098 		}
2099 	}
2100 	mutex_unlock(&mce_sysfs_mutex);
2101 
2102 	return size;
2103 }
2104 
2105 static ssize_t set_cmci_disabled(struct device *s,
2106 				 struct device_attribute *attr,
2107 				 const char *buf, size_t size)
2108 {
2109 	u64 new;
2110 
2111 	if (kstrtou64(buf, 0, &new) < 0)
2112 		return -EINVAL;
2113 
2114 	mutex_lock(&mce_sysfs_mutex);
2115 	if (mca_cfg.cmci_disabled ^ !!new) {
2116 		if (new) {
2117 			/* disable cmci */
2118 			on_each_cpu(mce_disable_cmci, NULL, 1);
2119 			mca_cfg.cmci_disabled = true;
2120 		} else {
2121 			/* enable cmci */
2122 			mca_cfg.cmci_disabled = false;
2123 			on_each_cpu(mce_enable_ce, NULL, 1);
2124 		}
2125 	}
2126 	mutex_unlock(&mce_sysfs_mutex);
2127 
2128 	return size;
2129 }
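
/*
 * ignore_ce and cmci_disabled are global (mca_cfg) switches, so toggling
 * them through any CPU's sysfs directory affects all CPUs, e.g.
 * (illustrative):
 *
 *	# echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *	# echo 0 > /sys/devices/system/machinecheck/machinecheck0/cmci_disabled
 *
 * Any value is treated as a boolean; the handlers above only act when the
 * new value actually differs from the current one.
 */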
2130 
2131 static ssize_t store_int_with_restart(struct device *s,
2132 				      struct device_attribute *attr,
2133 				      const char *buf, size_t size)
2134 {
2135 	unsigned long old_check_interval = check_interval;
2136 	ssize_t ret = device_store_ulong(s, attr, buf, size);
2137 
2138 	if (check_interval == old_check_interval)
2139 		return ret;
2140 
2141 	mutex_lock(&mce_sysfs_mutex);
2142 	mce_restart();
2143 	mutex_unlock(&mce_sysfs_mutex);
2144 
2145 	return ret;
2146 }
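
/*
 * check_interval (defined earlier in this file) is the polling period, in
 * seconds, of the per-CPU MCE timer. Only an actual change of the value
 * triggers mce_restart(), e.g. (illustrative):
 *
 *	# echo 30 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 */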
2147 
2148 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2149 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2150 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2151 
2152 static struct dev_ext_attribute dev_attr_check_interval = {
2153 	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2154 	&check_interval
2155 };
2156 
2157 static struct dev_ext_attribute dev_attr_ignore_ce = {
2158 	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2159 	&mca_cfg.ignore_ce
2160 };
2161 
2162 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2163 	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2164 	&mca_cfg.cmci_disabled
2165 };
2166 
2167 static struct device_attribute *mce_device_attrs[] = {
2168 	&dev_attr_tolerant.attr,
2169 	&dev_attr_check_interval.attr,
2170 #ifdef CONFIG_X86_MCELOG_LEGACY
2171 	&dev_attr_trigger,
2172 #endif
2173 	&dev_attr_monarch_timeout.attr,
2174 	&dev_attr_dont_log_ce.attr,
2175 	&dev_attr_ignore_ce.attr,
2176 	&dev_attr_cmci_disabled.attr,
2177 	NULL
2178 };
2179 
2180 static cpumask_var_t mce_device_initialized;
2181 
2182 static void mce_device_release(struct device *dev)
2183 {
2184 	kfree(dev);
2185 }
2186 
2187 /* Per-CPU device init. All of the CPUs still share the same bank control values: */
2188 static int mce_device_create(unsigned int cpu)
2189 {
2190 	struct device *dev;
2191 	int err;
2192 	int i, j;
2193 
2194 	if (!mce_available(&boot_cpu_data))
2195 		return -EIO;
2196 
2197 	dev = per_cpu(mce_device, cpu);
2198 	if (dev)
2199 		return 0;
2200 
2201 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2202 	if (!dev)
2203 		return -ENOMEM;
2204 	dev->id  = cpu;
2205 	dev->bus = &mce_subsys;
2206 	dev->release = &mce_device_release;
2207 
2208 	err = device_register(dev);
2209 	if (err) {
2210 		put_device(dev);
2211 		return err;
2212 	}
2213 
2214 	for (i = 0; mce_device_attrs[i]; i++) {
2215 		err = device_create_file(dev, mce_device_attrs[i]);
2216 		if (err)
2217 			goto error;
2218 	}
2219 	for (j = 0; j < mca_cfg.banks; j++) {
2220 		err = device_create_file(dev, &mce_banks[j].attr);
2221 		if (err)
2222 			goto error2;
2223 	}
2224 	cpumask_set_cpu(cpu, mce_device_initialized);
2225 	per_cpu(mce_device, cpu) = dev;
2226 
2227 	return 0;
2228 error2:
2229 	while (--j >= 0)
2230 		device_remove_file(dev, &mce_banks[j].attr);
2231 error:
2232 	while (--i >= 0)
2233 		device_remove_file(dev, mce_device_attrs[i]);
2234 
2235 	device_unregister(dev);
2236 
2237 	return err;
2238 }
2239 
2240 static void mce_device_remove(unsigned int cpu)
2241 {
2242 	struct device *dev = per_cpu(mce_device, cpu);
2243 	int i;
2244 
2245 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2246 		return;
2247 
2248 	for (i = 0; mce_device_attrs[i]; i++)
2249 		device_remove_file(dev, mce_device_attrs[i]);
2250 
2251 	for (i = 0; i < mca_cfg.banks; i++)
2252 		device_remove_file(dev, &mce_banks[i].attr);
2253 
2254 	device_unregister(dev);
2255 	cpumask_clear_cpu(cpu, mce_device_initialized);
2256 	per_cpu(mce_device, cpu) = NULL;
2257 }
2258 
2259 /* Make sure there are no machine checks on offlined CPUs. */
2260 static void mce_disable_cpu(void)
2261 {
2262 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2263 		return;
2264 
2265 	if (!cpuhp_tasks_frozen)
2266 		cmci_clear();
2267 
2268 	vendor_disable_error_reporting();
2269 }
2270 
2271 static void mce_reenable_cpu(void)
2272 {
2273 	int i;
2274 
2275 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2276 		return;
2277 
2278 	if (!cpuhp_tasks_frozen)
2279 		cmci_reenable();
2280 	for (i = 0; i < mca_cfg.banks; i++) {
2281 		struct mce_bank *b = &mce_banks[i];
2282 
2283 		if (b->init)
2284 			wrmsrl(msr_ops.ctl(i), b->ctl);
2285 	}
2286 }
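
/*
 * Note: the offline/online pair above only clears/reenables CMCI when the
 * CPU change is not part of suspend/resume (cpuhp_tasks_frozen); in the
 * frozen case the syscore suspend/resume callbacks take care of MCE state.
 */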
2287 
2288 static int mce_cpu_dead(unsigned int cpu)
2289 {
2290 	mce_intel_hcpu_update(cpu);
2291 
2292 	/* intentionally ignoring frozen here */
2293 	if (!cpuhp_tasks_frozen)
2294 		cmci_rediscover();
2295 	return 0;
2296 }
2297 
2298 static int mce_cpu_online(unsigned int cpu)
2299 {
2300 	struct timer_list *t = this_cpu_ptr(&mce_timer);
2301 	int ret;
2302 
2303 	mce_device_create(cpu);
2304 
2305 	ret = mce_threshold_create_device(cpu);
2306 	if (ret) {
2307 		mce_device_remove(cpu);
2308 		return ret;
2309 	}
2310 	mce_reenable_cpu();
2311 	mce_start_timer(t);
2312 	return 0;
2313 }
2314 
2315 static int mce_cpu_pre_down(unsigned int cpu)
2316 {
2317 	struct timer_list *t = this_cpu_ptr(&mce_timer);
2318 
2319 	mce_disable_cpu();
2320 	del_timer_sync(t);
2321 	mce_threshold_remove_device(cpu);
2322 	mce_device_remove(cpu);
2323 	return 0;
2324 }
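
/*
 * mce_cpu_online()/mce_cpu_pre_down() are installed as the startup/teardown
 * callbacks of the dynamic CPU hotplug state registered in
 * mcheck_init_device() below, so they run on the CPU that is coming up or
 * going down.
 */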
2325 
2326 static __init void mce_init_banks(void)
2327 {
2328 	int i;
2329 
2330 	for (i = 0; i < mca_cfg.banks; i++) {
2331 		struct mce_bank *b = &mce_banks[i];
2332 		struct device_attribute *a = &b->attr;
2333 
2334 		sysfs_attr_init(&a->attr);
2335 		a->attr.name	= b->attrname;
2336 		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2337 
2338 		a->attr.mode	= 0644;
2339 		a->show		= show_bank;
2340 		a->store	= set_bank;
2341 	}
2342 }
2343 
2344 static __init int mcheck_init_device(void)
2345 {
2346 	int err;
2347 
2348 	/*
2349 	 * Check if we have a spare virtual bit. This will only become
2350 	 * a problem if/when we move beyond 5-level page tables.
2351 	 */
2352 	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2353 
2354 	if (!mce_available(&boot_cpu_data)) {
2355 		err = -EIO;
2356 		goto err_out;
2357 	}
2358 
2359 	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2360 		err = -ENOMEM;
2361 		goto err_out;
2362 	}
2363 
2364 	mce_init_banks();
2365 
2366 	err = subsys_system_register(&mce_subsys, NULL);
2367 	if (err)
2368 		goto err_out_mem;
2369 
2370 	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2371 				mce_cpu_dead);
2372 	if (err)
2373 		goto err_out_mem;
2374 
2375 	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2376 				mce_cpu_online, mce_cpu_pre_down);
2377 	if (err < 0)
2378 		goto err_out_online;
2379 
2380 	register_syscore_ops(&mce_syscore_ops);
2381 
2382 	return 0;
2383 
2384 err_out_online:
2385 	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2386 
2387 err_out_mem:
2388 	free_cpumask_var(mce_device_initialized);
2389 
2390 err_out:
2391 	pr_err("Unable to init MCE device (rc: %d)\n", err);
2392 
2393 	return err;
2394 }
2395 device_initcall_sync(mcheck_init_device);
2396 
2397 /*
2398  * Old style boot options parsing. Only for compatibility.
2399  */
2400 static int __init mcheck_disable(char *str)
2401 {
2402 	mca_cfg.disabled = 1;
2403 	return 1;
2404 }
2405 __setup("nomce", mcheck_disable);
2406 
2407 #ifdef CONFIG_DEBUG_FS
2408 struct dentry *mce_get_debugfs_dir(void)
2409 {
2410 	static struct dentry *dmce;
2411 
2412 	if (!dmce)
2413 		dmce = debugfs_create_dir("mce", NULL);
2414 
2415 	return dmce;
2416 }
2417 
2418 static void mce_reset(void)
2419 {
2420 	cpu_missing = 0;
2421 	atomic_set(&mce_fake_panicked, 0);
2422 	atomic_set(&mce_executing, 0);
2423 	atomic_set(&mce_callin, 0);
2424 	atomic_set(&global_nwo, 0);
2425 }
2426 
2427 static int fake_panic_get(void *data, u64 *val)
2428 {
2429 	*val = fake_panic;
2430 	return 0;
2431 }
2432 
2433 static int fake_panic_set(void *data, u64 val)
2434 {
2435 	mce_reset();
2436 	fake_panic = val;
2437 	return 0;
2438 }
2439 
2440 DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2441 			 "%llu\n");
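
/*
 * With debugfs mounted (conventionally at /sys/kernel/debug), the attribute
 * above appears as <debugfs>/mce/fake_panic. Writes go through
 * fake_panic_set(), which first resets the global MCE rendezvous state via
 * mce_reset() before updating fake_panic.
 */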
2442 
2443 static int __init mcheck_debugfs_init(void)
2444 {
2445 	struct dentry *dmce, *ffake_panic;
2446 
2447 	dmce = mce_get_debugfs_dir();
2448 	if (!dmce)
2449 		return -ENOMEM;
2450 	ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
2451 						 NULL, &fake_panic_fops);
2452 	if (!ffake_panic)
2453 		return -ENOMEM;
2454 
2455 	return 0;
2456 }
2457 #else
2458 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2459 #endif
2460 
2461 DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2462 EXPORT_SYMBOL_GPL(mcsafe_key);
2463 
2464 static int __init mcheck_late_init(void)
2465 {
2466 	pr_info("Using %d MCE banks\n", mca_cfg.banks);
2467 
2468 	if (mca_cfg.recovery)
2469 		static_branch_inc(&mcsafe_key);
2470 
2471 	mcheck_debugfs_init();
2472 	cec_init();
2473 
2474 	/*
2475 	 * Flush out everything that has been logged during early boot, now that
2476 	 * everything has been initialized (workqueues, decoders, ...).
2477 	 */
2478 	mce_schedule_work();
2479 
2480 	return 0;
2481 }
2482 late_initcall(mcheck_late_init);
2483