xref: /openbmc/linux/drivers/edac/mce_amd.c (revision baa7eb025ab14f3cba2e35c0a8648f9c9f01d24f)
1 #include <linux/module.h>
2 #include <linux/slab.h>
3 
4 #include "mce_amd.h"
5 
/* Per-family decoder ops, allocated and filled in by mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask for the error-CPU bits in NBSH; narrowed to 0x3 for Fam14h in init. */
static u8 nb_err_cpumask = 0xf;

/* When false, NB GART TLB errors (bank 4, xec 0x5) are filtered out. */
static bool report_gart_errors;
/* Optional DRAM ECC decoder, registered via amd_register_ecc_decoder(). */
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
12 
/* Enable/disable reporting of NB GART TLB errors (see amd_filter_mce()). */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18 
/* Register a DRAM ECC decoder callback, invoked from amd_decode_nb_mce(). */
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24 
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
26 {
27 	if (nb_bus_decoder) {
28 		WARN_ON(nb_bus_decoder != f);
29 
30 		nb_bus_decoder = NULL;
31 	}
32 }
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34 
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level */
const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
const char *rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char *to_msgs[] = { "no timeout",	"timed out" };
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);

/*
 * Fam10h NB extended error descriptions, indexed by (xec - offset) in
 * f10h_nb_mce(): [0..2] <- xec 0xa..0xc, [3] <- 0xe, [4..7] <- 0x1c..0x1f.
 */
static const char *f10h_nb_mce_desc[] = {
	"HT link data error",
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
76 
77 static bool f12h_dc_mce(u16 ec)
78 {
79 	bool ret = false;
80 
81 	if (MEM_ERROR(ec)) {
82 		u8 ll = ec & 0x3;
83 		ret = true;
84 
85 		if (ll == LL_L2)
86 			pr_cont("during L1 linefill from L2.\n");
87 		else if (ll == LL_L1)
88 			pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
89 		else
90 			ret = false;
91 	}
92 	return ret;
93 }
94 
95 static bool f10h_dc_mce(u16 ec)
96 {
97 	u8 r4  = (ec >> 4) & 0xf;
98 	u8 ll  = ec & 0x3;
99 
100 	if (r4 == R4_GEN && ll == LL_L1) {
101 		pr_cont("during data scrub.\n");
102 		return true;
103 	}
104 	return f12h_dc_mce(ec);
105 }
106 
107 static bool k8_dc_mce(u16 ec)
108 {
109 	if (BUS_ERROR(ec)) {
110 		pr_cont("during system linefill.\n");
111 		return true;
112 	}
113 
114 	return f10h_dc_mce(ec);
115 }
116 
/*
 * Decode a Data Cache MCE for Fam14h.
 *
 * @ec: the low 16 bits of MCi_STATUS (the MCA error code)
 *
 * Returns true if the code matched a known F14h DC signature and a
 * description was printed, false otherwise.
 */
static bool f14h_dc_mce(u16 ec)
{
	u8 r4	 = (ec >> 4) & 0xf;	/* memory transaction type */
	u8 ll	 = ec & 0x3;		/* cache level */
	u8 tt	 = (ec >> 2) & 0x3;	/* transaction type */
	u8 ii	 = tt;			/* II field occupies the same bits [3:2] */
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* only DATA transactions at L1 are valid DC mem errors */
		if (tt != TT_DATA || ll != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* bus errors must target MEM or IO at the LG cache level */
		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
171 
172 static void amd_decode_dc_mce(struct mce *m)
173 {
174 	u16 ec = m->status & 0xffff;
175 	u8 xec = (m->status >> 16) & 0xf;
176 
177 	pr_emerg(HW_ERR "Data Cache Error: ");
178 
179 	/* TLB error signatures are the same across families */
180 	if (TLB_ERROR(ec)) {
181 		u8 tt = (ec >> 2) & 0x3;
182 
183 		if (tt == TT_DATA) {
184 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
185 				(xec ? "multimatch" : "parity error"));
186 			return;
187 		}
188 		else
189 			goto wrong_dc_mce;
190 	}
191 
192 	if (!fam_ops->dc_mce(ec))
193 		goto wrong_dc_mce;
194 
195 	return;
196 
197 wrong_dc_mce:
198 	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
199 }
200 
201 static bool k8_ic_mce(u16 ec)
202 {
203 	u8 ll	 = ec & 0x3;
204 	u8 r4	 = (ec >> 4) & 0xf;
205 	bool ret = true;
206 
207 	if (!MEM_ERROR(ec))
208 		return false;
209 
210 	if (ll == 0x2)
211 		pr_cont("during a linefill from L2.\n");
212 	else if (ll == 0x1) {
213 		switch (r4) {
214 		case R4_IRD:
215 			pr_cont("Parity error during data load.\n");
216 			break;
217 
218 		case R4_EVICT:
219 			pr_cont("Copyback Parity/Victim error.\n");
220 			break;
221 
222 		case R4_SNOOP:
223 			pr_cont("Tag Snoop error.\n");
224 			break;
225 
226 		default:
227 			ret = false;
228 			break;
229 		}
230 	} else
231 		ret = false;
232 
233 	return ret;
234 }
235 
/*
 * Decode an Instruction Cache MCE for Fam14h.
 *
 * NOTE(review): when tt/ll don't match, ret is set to false but the r4
 * checks below still run and may print a description anyway before the
 * function reports failure — confirm against the F14h BKDG whether that
 * is intentional.
 */
static bool f14h_ic_mce(u16 ec)
{
	u8 ll    = ec & 0x3;		/* cache level */
	u8 tt    = (ec >> 2) & 0x3;	/* transaction type */
	u8 r4  = (ec >> 4) & 0xf;	/* memory transaction type */
	bool ret = true;

	if (MEM_ERROR(ec)) {
		if (tt != 0 || ll != 1)
			ret = false;

		if (r4 == R4_IRD)
			pr_cont("Data/tag array parity error for a tag hit.\n");
		else if (r4 == R4_SNOOP)
			pr_cont("Tag error during snoop/victimization.\n");
		else
			ret = false;
	}
	return ret;
}
256 
257 static void amd_decode_ic_mce(struct mce *m)
258 {
259 	u16 ec = m->status & 0xffff;
260 	u8 xec = (m->status >> 16) & 0xf;
261 
262 	pr_emerg(HW_ERR "Instruction Cache Error: ");
263 
264 	if (TLB_ERROR(ec))
265 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
266 			(xec ? "multimatch" : "parity error"));
267 	else if (BUS_ERROR(ec)) {
268 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
269 
270 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
271 	} else if (fam_ops->ic_mce(ec))
272 		;
273 	else
274 		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
275 }
276 
/*
 * Decode a Bus Unit (bank 2) MCE: write/victim buffer, L2 tag, Page
 * Descriptor Cache/Guest TLB and NB data read errors, keyed off the
 * extended error code in MCi_STATUS[19:16].
 */
static void amd_decode_bu_mce(struct mce *m)
{
	u32 ec = m->status & 0xffff;
	u32 xec = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Bus Unit Error");

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont(": %s error in a Page Descriptor Cache or "
				"Guest TLB.\n", TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				RRRR_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 rrrr = (ec >> 4) & 0xf;

			/* r4 >= 0x7: copyback; r4 <= 0x1: L2 data access */
			if (rrrr >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					RRRR_MSG(ec));
			else if (rrrr <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", RRRR_MSG(ec));
			else
				goto wrong_bu_mce;
		} else
			goto wrong_bu_mce;
	} else
		goto wrong_bu_mce;

	return;

wrong_bu_mce:
	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
}
318 
319 static void amd_decode_ls_mce(struct mce *m)
320 {
321 	u16 ec = m->status & 0xffff;
322 	u8 xec = (m->status >> 16) & 0xf;
323 
324 	if (boot_cpu_data.x86 == 0x14) {
325 		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
326 			 " please report on LKML.\n");
327 		return;
328 	}
329 
330 	pr_emerg(HW_ERR "Load Store Error");
331 
332 	if (xec == 0x0) {
333 		u8 r4 = (ec >> 4) & 0xf;
334 
335 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
336 			goto wrong_ls_mce;
337 
338 		pr_cont(" during %s.\n", RRRR_MSG(ec));
339 	} else
340 		goto wrong_ls_mce;
341 
342 	return;
343 
344 wrong_ls_mce:
345 	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
346 }
347 
/*
 * NB MCE signatures shared by K8 and later families (f10h_nb_mce() tries
 * these first).
 *
 * @ec:  the low 16 bits of MCi_STATUS (unused here)
 * @xec: the extended error code
 *
 * Returns true if @xec matched a known signature and text was printed.
 */
static bool k8_nb_mce(u16 ec, u8 xec)
{
	bool ret = true;

	switch (xec) {
	case 0x1:
		pr_cont("CRC error detected on HT link.\n");
		break;

	case 0x5:
		pr_cont("Invalid GART PTE entry during GART table walk.\n");
		break;

	case 0x6:
		pr_cont("Unsupported atomic RMW received from an IO link.\n");
		break;

	case 0x0:
	case 0x8:
		/* Fam11h has no DRAM ECC signature for these codes */
		if (boot_cpu_data.x86 == 0x11)
			return false;

		pr_cont("DRAM ECC error detected on the NB.\n");
		break;

	case 0xd:
		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
		break;

	default:
		ret = false;
		break;
	}

	return ret;
}
384 
385 static bool f10h_nb_mce(u16 ec, u8 xec)
386 {
387 	bool ret = true;
388 	u8 offset = 0;
389 
390 	if (k8_nb_mce(ec, xec))
391 		return true;
392 
393 	switch(xec) {
394 	case 0xa ... 0xc:
395 		offset = 10;
396 		break;
397 
398 	case 0xe:
399 		offset = 11;
400 		break;
401 
402 	case 0xf:
403 		if (TLB_ERROR(ec))
404 			pr_cont("GART Table Walk data error.\n");
405 		else if (BUS_ERROR(ec))
406 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
407 		else
408 			ret = false;
409 
410 		goto out;
411 		break;
412 
413 	case 0x1c ... 0x1f:
414 		offset = 24;
415 		break;
416 
417 	default:
418 		ret = false;
419 
420 		goto out;
421 		break;
422 	}
423 
424 	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
425 
426 out:
427 	return ret;
428 }
429 
/* Stub nb_mce handler for families with no NB signatures decoded here. */
static bool nb_noop_mce(u16 ec, u8 xec)
{
	return false;
}
434 
/*
 * Decode a Northbridge (bank 4) MCE and, for DRAM ECC errors (xec 0x0/0x8)
 * on K8/Fam10h, chain to the registered DRAM ECC decoder.
 *
 * @node_id: NB node the error was reported on
 * @m:       the MCE to decode
 * @nbcfg:   NB configuration, passed through to the DRAM ECC decoder
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec   = (m->status >> 16) & 0x1f;	/* extended error code */
	u16 ec   = m->status & 0xffff;		/* MCA error code */
	u32 nbsh = (u32)(m->status >> 32);	/* NB status high */

	pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		/* older encoding: a bitmask of affected cores */
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* xecs fully decoded by their extended code alone */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* DRAM ECC errors get passed on to the registered ECC decoder */
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
496 
497 static void amd_decode_fr_mce(struct mce *m)
498 {
499 	if (boot_cpu_data.x86 == 0xf ||
500 	    boot_cpu_data.x86 == 0x11)
501 		goto wrong_fr_mce;
502 
503 	/* we have only one error signature so match all fields at once. */
504 	if ((m->status & 0xffff) == 0x0f0f) {
505 		pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
506 		return;
507 	}
508 
509 wrong_fr_mce:
510 	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
511 }
512 
513 static inline void amd_decode_err_code(u16 ec)
514 {
515 	if (TLB_ERROR(ec)) {
516 		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
517 			 TT_MSG(ec), LL_MSG(ec));
518 	} else if (MEM_ERROR(ec)) {
519 		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
520 			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
521 	} else if (BUS_ERROR(ec)) {
522 		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
523 			 "Participating Processor: %s\n",
524 			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
525 			  PP_MSG(ec));
526 	} else
527 		pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
528 }
529 
530 /*
531  * Filter out unwanted MCE signatures here.
532  */
533 static bool amd_filter_mce(struct mce *m)
534 {
535 	u8 xec = (m->status >> 16) & 0x1f;
536 
537 	/*
538 	 * NB GART TLB error reporting is disabled by default.
539 	 */
540 	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
541 		return true;
542 
543 	return false;
544 }
545 
/*
 * Main MCE decode entry point, invoked through the x86 MCE decoder
 * notifier chain.
 *
 * @nb:   the registered notifier block (unused)
 * @val:  notifier action (unused)
 * @data: the struct mce to decode
 *
 * Prints a human-readable summary of the MCE, dispatches to the per-bank
 * decoder, and returns NOTIFY_STOP (also when the MCE is filtered).
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	int node, ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);

	pr_cont("%sorrected error, other errors lost: %s, "
		 "CPU context corrupt: %s",
		 ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
		 ((m->status & MCI_STATUS_OVER) ? "yes"  : "no"),
		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

	/* do the two bits[14:13] together (MCi_STATUS bits 46:45 here) */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));

	pr_cont("\n");

	/* per-bank dispatch: DC, IC, BU, LS, NB, FR */
	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	default:
		break;
	}

	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);
604 
/* Notifier hooked into the x86 MCE decoder chain by mce_amd_init(). */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
608 
/*
 * Early init: restrict to AMD families 0xf-0x12 plus Fam14h (models up to
 * 0xf), select the per-family decoder ops and register with the x86 MCE
 * decoder chain. Returns 0 on unsupported CPUs (no-op), -ENOMEM/-EINVAL
 * on failure.
 */
static int __init mce_amd_init(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
		return 0;

	if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
	    (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
		return 0;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (boot_cpu_data.x86) {
	case 0xf:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = k8_nb_mce;
		break;

	case 0x10:
		fam_ops->dc_mce = f10h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x11:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x12:
		fam_ops->dc_mce = f12h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	case 0x14:
		/* Fam14h reports only two cores in NBSH */
		nb_err_cpumask  = 0x3;
		fam_ops->dc_mce = f14h_dc_mce;
		fam_ops->ic_mce = f14h_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	default:
		printk(KERN_WARNING "Huh? What family is that: %d?!\n",
				    boot_cpu_data.x86);
		kfree(fam_ops);
		return -EINVAL;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
668 
#ifdef MODULE
/* Module unload: detach from the decoder chain and free the decoder ops. */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
681