xref: /openbmc/linux/drivers/edac/mce_amd.c (revision 6dfcd296)
1 #include <linux/module.h>
2 #include <linux/slab.h>
3 
4 #include "mce_amd.h"
5 
/*
 * Per-family MC0/MC1/MC2 decode callbacks. Allocated and filled in by
 * mce_amd_init(); NULL if that init failed on an unsupported family.
 */
static struct amd_decoder_ops *fam_ops;

/*
 * Mask passed to XEC() when extracting the extended error code. Starts at
 * 0xf and is widened to 0x1f or 0x3f for some families in mce_amd_init().
 */
static u8 xec_mask	 = 0xf;

/* When false, amd_filter_mce() drops bank 4 errors with xec 0x5. */
static bool report_gart_errors;
/*
 * Optional callback that decode_mc4_mce() invokes for DRAM ECC errors;
 * NULL while no decoder is registered.
 */
static void (*nb_bus_decoder)(int node_id, struct mce *m);
12 
/*
 * Enable or disable reporting of the bank 4 / xec 0x5 errors that
 * amd_filter_mce() otherwise drops.
 *
 * @v: true to report them, false to filter them out.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18 
/*
 * Install @f as the callback decode_mc4_mce() invokes for DRAM ECC errors
 * (xec 0x0 and 0x8). A previously installed callback is overwritten
 * without any check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24 
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
26 {
27 	if (nb_bus_decoder) {
28 		WARN_ON(nb_bus_decoder != f);
29 
30 		nb_bus_decoder = NULL;
31 	}
32 }
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34 
35 /*
36  * string representation for the different MCA reported error types, see F3x48
37  * or MSR0000_0411.
38  */
39 
/*
 * transaction type; presumably the table behind TT_MSG() (macro is defined
 * outside this file -- confirm there)
 */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level; presumably the table behind LL_MSG() */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/*
 * memory transaction type; presumably the table behind R4_MSG(). Only nine
 * entries are defined, so any bounds handling for larger values has to live
 * in that macro -- verify in the header.
 */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor; exported for use outside this file */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout; presumably the table behind TO_MSG() */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o; presumably the table behind II_MSG() */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type; presumably the table behind UU_MSG() */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
63 
/*
 * Fam15h MC1 descriptions, looked up by f15h_mc1_mce(). The table is packed,
 * so the index is not always the xec itself:
 *   xec 0x0-0xa   -> index xec
 *   xec 0xd       -> index xec - 2
 *   xec 0x10-0x15 -> index xec - 4 (0x11-0x15 are printed as
 *                    "Decoder <entry> parity error.")
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
84 
/*
 * Fam15h MC2 memory-error descriptions, looked up by f15h_mc2_mce():
 *   xec 0x4-0xc   -> index xec - 0x4
 *   xec 0x10-0x14 -> index xec - 0x7
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
101 
/*
 * MC4 descriptions, looked up by decode_mc4_mce():
 *   xec 0x0-0xe   -> index xec
 *   xec 0x1c-0x1f -> index xec - 13 (the last four entries)
 * xec 0xf and 0x19 are decoded without this table.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
123 
/*
 * MC5 descriptions, indexed directly by xec (0x0-0xd) in decode_mc5_mce().
 * Entries 0x0 and 0xc are printed as-is; all others get " parity error"
 * appended.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
140 
/*
 * MC6 descriptions, indexed directly by xec (0x0-0x5) in decode_mc6_mce(),
 * which prints each one followed by " parity error.".
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
149 
150 /* Scalable MCA error strings */
/*
 * Descriptions for the SMCA_LS bank type, indexed by extended error code
 * through smca_mce_descs[] in decode_smca_errors().
 */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};
174 
/*
 * The tables below are each bound to one SMCA bank type in smca_mce_descs[]
 * and are indexed by extended error code in decode_smca_errors().
 */

/* SMCA_IF bank descriptions */
static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

/* SMCA_L2_CACHE bank descriptions */
static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

/* SMCA_DE bank descriptions */
static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

/* SMCA_EX bank descriptions */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

/* SMCA_FP bank descriptions */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

/* SMCA_L3_CACHE bank descriptions */
static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

/* SMCA_CS bank descriptions */
static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

/* SMCA_PIE bank descriptions */
static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

/* SMCA_UMC bank descriptions */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

/* SMCA_PB bank descriptions */
static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

/* SMCA_PSP bank descriptions */
static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

/* SMCA_SMU bank descriptions */
static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};
285 
/* One SMCA bank type's description table together with its entry count. */
struct smca_mce_desc {
	const char * const *descs;	/* strings indexed by extended error code */
	unsigned int num_descs;		/* number of entries in @descs */
};

/*
 * Description tables keyed by SMCA bank type. decode_smca_errors() indexes
 * this with the bank's bank_type and uses num_descs to bound the xec lookup.
 */
static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};
306 
307 static bool f12h_mc0_mce(u16 ec, u8 xec)
308 {
309 	bool ret = false;
310 
311 	if (MEM_ERROR(ec)) {
312 		u8 ll = LL(ec);
313 		ret = true;
314 
315 		if (ll == LL_L2)
316 			pr_cont("during L1 linefill from L2.\n");
317 		else if (ll == LL_L1)
318 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
319 		else
320 			ret = false;
321 	}
322 	return ret;
323 }
324 
325 static bool f10h_mc0_mce(u16 ec, u8 xec)
326 {
327 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
328 		pr_cont("during data scrub.\n");
329 		return true;
330 	}
331 	return f12h_mc0_mce(ec, xec);
332 }
333 
334 static bool k8_mc0_mce(u16 ec, u8 xec)
335 {
336 	if (BUS_ERROR(ec)) {
337 		pr_cont("during system linefill.\n");
338 		return true;
339 	}
340 
341 	return f10h_mc0_mce(ec, xec);
342 }
343 
344 static bool cat_mc0_mce(u16 ec, u8 xec)
345 {
346 	u8 r4	 = R4(ec);
347 	bool ret = true;
348 
349 	if (MEM_ERROR(ec)) {
350 
351 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
352 			return false;
353 
354 		switch (r4) {
355 		case R4_DRD:
356 		case R4_DWR:
357 			pr_cont("Data/Tag parity error due to %s.\n",
358 				(r4 == R4_DRD ? "load/hw prf" : "store"));
359 			break;
360 		case R4_EVICT:
361 			pr_cont("Copyback parity error on a tag miss.\n");
362 			break;
363 		case R4_SNOOP:
364 			pr_cont("Tag parity error during snoop.\n");
365 			break;
366 		default:
367 			ret = false;
368 		}
369 	} else if (BUS_ERROR(ec)) {
370 
371 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
372 			return false;
373 
374 		pr_cont("System read data error on a ");
375 
376 		switch (r4) {
377 		case R4_RD:
378 			pr_cont("TLB reload.\n");
379 			break;
380 		case R4_DWR:
381 			pr_cont("store.\n");
382 			break;
383 		case R4_DRD:
384 			pr_cont("load.\n");
385 			break;
386 		default:
387 			ret = false;
388 		}
389 	} else {
390 		ret = false;
391 	}
392 
393 	return ret;
394 }
395 
396 static bool f15h_mc0_mce(u16 ec, u8 xec)
397 {
398 	bool ret = true;
399 
400 	if (MEM_ERROR(ec)) {
401 
402 		switch (xec) {
403 		case 0x0:
404 			pr_cont("Data Array access error.\n");
405 			break;
406 
407 		case 0x1:
408 			pr_cont("UC error during a linefill from L2/NB.\n");
409 			break;
410 
411 		case 0x2:
412 		case 0x11:
413 			pr_cont("STQ access error.\n");
414 			break;
415 
416 		case 0x3:
417 			pr_cont("SCB access error.\n");
418 			break;
419 
420 		case 0x10:
421 			pr_cont("Tag error.\n");
422 			break;
423 
424 		case 0x12:
425 			pr_cont("LDQ access error.\n");
426 			break;
427 
428 		default:
429 			ret = false;
430 		}
431 	} else if (BUS_ERROR(ec)) {
432 
433 		if (!xec)
434 			pr_cont("System Read Data Error.\n");
435 		else
436 			pr_cont(" Internal error condition type %d.\n", xec);
437 	} else if (INT_ERROR(ec)) {
438 		if (xec <= 0x1f)
439 			pr_cont("Hardware Assert.\n");
440 		else
441 			ret = false;
442 
443 	} else
444 		ret = false;
445 
446 	return ret;
447 }
448 
449 static void decode_mc0_mce(struct mce *m)
450 {
451 	u16 ec = EC(m->status);
452 	u8 xec = XEC(m->status, xec_mask);
453 
454 	pr_emerg(HW_ERR "MC0 Error: ");
455 
456 	/* TLB error signatures are the same across families */
457 	if (TLB_ERROR(ec)) {
458 		if (TT(ec) == TT_DATA) {
459 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
460 				((xec == 2) ? "locked miss"
461 					    : (xec ? "multimatch" : "parity")));
462 			return;
463 		}
464 	} else if (fam_ops->mc0_mce(ec, xec))
465 		;
466 	else
467 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
468 }
469 
470 static bool k8_mc1_mce(u16 ec, u8 xec)
471 {
472 	u8 ll	 = LL(ec);
473 	bool ret = true;
474 
475 	if (!MEM_ERROR(ec))
476 		return false;
477 
478 	if (ll == 0x2)
479 		pr_cont("during a linefill from L2.\n");
480 	else if (ll == 0x1) {
481 		switch (R4(ec)) {
482 		case R4_IRD:
483 			pr_cont("Parity error during data load.\n");
484 			break;
485 
486 		case R4_EVICT:
487 			pr_cont("Copyback Parity/Victim error.\n");
488 			break;
489 
490 		case R4_SNOOP:
491 			pr_cont("Tag Snoop error.\n");
492 			break;
493 
494 		default:
495 			ret = false;
496 			break;
497 		}
498 	} else
499 		ret = false;
500 
501 	return ret;
502 }
503 
504 static bool cat_mc1_mce(u16 ec, u8 xec)
505 {
506 	u8 r4    = R4(ec);
507 	bool ret = true;
508 
509 	if (!MEM_ERROR(ec))
510 		return false;
511 
512 	if (TT(ec) != TT_INSTR)
513 		return false;
514 
515 	if (r4 == R4_IRD)
516 		pr_cont("Data/tag array parity error for a tag hit.\n");
517 	else if (r4 == R4_SNOOP)
518 		pr_cont("Tag error during snoop/victimization.\n");
519 	else if (xec == 0x0)
520 		pr_cont("Tag parity error from victim castout.\n");
521 	else if (xec == 0x2)
522 		pr_cont("Microcode patch RAM parity error.\n");
523 	else
524 		ret = false;
525 
526 	return ret;
527 }
528 
529 static bool f15h_mc1_mce(u16 ec, u8 xec)
530 {
531 	bool ret = true;
532 
533 	if (!MEM_ERROR(ec))
534 		return false;
535 
536 	switch (xec) {
537 	case 0x0 ... 0xa:
538 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
539 		break;
540 
541 	case 0xd:
542 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
543 		break;
544 
545 	case 0x10:
546 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
547 		break;
548 
549 	case 0x11 ... 0x15:
550 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
551 		break;
552 
553 	default:
554 		ret = false;
555 	}
556 	return ret;
557 }
558 
559 static void decode_mc1_mce(struct mce *m)
560 {
561 	u16 ec = EC(m->status);
562 	u8 xec = XEC(m->status, xec_mask);
563 
564 	pr_emerg(HW_ERR "MC1 Error: ");
565 
566 	if (TLB_ERROR(ec))
567 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
568 			(xec ? "multimatch" : "parity error"));
569 	else if (BUS_ERROR(ec)) {
570 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
571 
572 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
573 	} else if (INT_ERROR(ec)) {
574 		if (xec <= 0x3f)
575 			pr_cont("Hardware Assert.\n");
576 		else
577 			goto wrong_mc1_mce;
578 	} else if (fam_ops->mc1_mce(ec, xec))
579 		;
580 	else
581 		goto wrong_mc1_mce;
582 
583 	return;
584 
585 wrong_mc1_mce:
586 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
587 }
588 
589 static bool k8_mc2_mce(u16 ec, u8 xec)
590 {
591 	bool ret = true;
592 
593 	if (xec == 0x1)
594 		pr_cont(" in the write data buffers.\n");
595 	else if (xec == 0x3)
596 		pr_cont(" in the victim data buffers.\n");
597 	else if (xec == 0x2 && MEM_ERROR(ec))
598 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
599 	else if (xec == 0x0) {
600 		if (TLB_ERROR(ec))
601 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
602 				TT_MSG(ec));
603 		else if (BUS_ERROR(ec))
604 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
605 				R4_MSG(ec), PP_MSG(ec));
606 		else if (MEM_ERROR(ec)) {
607 			u8 r4 = R4(ec);
608 
609 			if (r4 >= 0x7)
610 				pr_cont(": %s error during data copyback.\n",
611 					R4_MSG(ec));
612 			else if (r4 <= 0x1)
613 				pr_cont(": %s parity/ECC error during data "
614 					"access from L2.\n", R4_MSG(ec));
615 			else
616 				ret = false;
617 		} else
618 			ret = false;
619 	} else
620 		ret = false;
621 
622 	return ret;
623 }
624 
625 static bool f15h_mc2_mce(u16 ec, u8 xec)
626 {
627 	bool ret = true;
628 
629 	if (TLB_ERROR(ec)) {
630 		if (xec == 0x0)
631 			pr_cont("Data parity TLB read error.\n");
632 		else if (xec == 0x1)
633 			pr_cont("Poison data provided for TLB fill.\n");
634 		else
635 			ret = false;
636 	} else if (BUS_ERROR(ec)) {
637 		if (xec > 2)
638 			ret = false;
639 
640 		pr_cont("Error during attempted NB data read.\n");
641 	} else if (MEM_ERROR(ec)) {
642 		switch (xec) {
643 		case 0x4 ... 0xc:
644 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
645 			break;
646 
647 		case 0x10 ... 0x14:
648 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
649 			break;
650 
651 		default:
652 			ret = false;
653 		}
654 	} else if (INT_ERROR(ec)) {
655 		if (xec <= 0x3f)
656 			pr_cont("Hardware Assert.\n");
657 		else
658 			ret = false;
659 	}
660 
661 	return ret;
662 }
663 
664 static bool f16h_mc2_mce(u16 ec, u8 xec)
665 {
666 	u8 r4 = R4(ec);
667 
668 	if (!MEM_ERROR(ec))
669 		return false;
670 
671 	switch (xec) {
672 	case 0x04 ... 0x05:
673 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
674 		break;
675 
676 	case 0x09 ... 0x0b:
677 	case 0x0d ... 0x0f:
678 		pr_cont("ECC error in L2 tag (%s).\n",
679 			((r4 == R4_GEN)   ? "BankReq" :
680 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
681 		break;
682 
683 	case 0x10 ... 0x19:
684 	case 0x1b:
685 		pr_cont("ECC error in L2 data array (%s).\n",
686 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
687 			((r4 == R4_GEN)   ? "Attr" :
688 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
689 		break;
690 
691 	case 0x1c ... 0x1d:
692 	case 0x1f:
693 		pr_cont("Parity error in L2 attribute bits (%s).\n",
694 			((r4 == R4_RD)  ? "Hit"  :
695 			((r4 == R4_GEN) ? "Attr" : "Fill")));
696 		break;
697 
698 	default:
699 		return false;
700 	}
701 
702 	return true;
703 }
704 
705 static void decode_mc2_mce(struct mce *m)
706 {
707 	u16 ec = EC(m->status);
708 	u8 xec = XEC(m->status, xec_mask);
709 
710 	pr_emerg(HW_ERR "MC2 Error: ");
711 
712 	if (!fam_ops->mc2_mce(ec, xec))
713 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
714 }
715 
716 static void decode_mc3_mce(struct mce *m)
717 {
718 	u16 ec = EC(m->status);
719 	u8 xec = XEC(m->status, xec_mask);
720 
721 	if (boot_cpu_data.x86 >= 0x14) {
722 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
723 			 " please report on LKML.\n");
724 		return;
725 	}
726 
727 	pr_emerg(HW_ERR "MC3 Error");
728 
729 	if (xec == 0x0) {
730 		u8 r4 = R4(ec);
731 
732 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
733 			goto wrong_mc3_mce;
734 
735 		pr_cont(" during %s.\n", R4_MSG(ec));
736 	} else
737 		goto wrong_mc3_mce;
738 
739 	return;
740 
741  wrong_mc3_mce:
742 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
743 }
744 
745 static void decode_mc4_mce(struct mce *m)
746 {
747 	struct cpuinfo_x86 *c = &boot_cpu_data;
748 	int node_id = amd_get_nb_id(m->extcpu);
749 	u16 ec = EC(m->status);
750 	u8 xec = XEC(m->status, 0x1f);
751 	u8 offset = 0;
752 
753 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
754 
755 	switch (xec) {
756 	case 0x0 ... 0xe:
757 
758 		/* special handling for DRAM ECCs */
759 		if (xec == 0x0 || xec == 0x8) {
760 			/* no ECCs on F11h */
761 			if (c->x86 == 0x11)
762 				goto wrong_mc4_mce;
763 
764 			pr_cont("%s.\n", mc4_mce_desc[xec]);
765 
766 			if (nb_bus_decoder)
767 				nb_bus_decoder(node_id, m);
768 			return;
769 		}
770 		break;
771 
772 	case 0xf:
773 		if (TLB_ERROR(ec))
774 			pr_cont("GART Table Walk data error.\n");
775 		else if (BUS_ERROR(ec))
776 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
777 		else
778 			goto wrong_mc4_mce;
779 		return;
780 
781 	case 0x19:
782 		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
783 			pr_cont("Compute Unit Data Error.\n");
784 		else
785 			goto wrong_mc4_mce;
786 		return;
787 
788 	case 0x1c ... 0x1f:
789 		offset = 13;
790 		break;
791 
792 	default:
793 		goto wrong_mc4_mce;
794 	}
795 
796 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
797 	return;
798 
799  wrong_mc4_mce:
800 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
801 }
802 
803 static void decode_mc5_mce(struct mce *m)
804 {
805 	struct cpuinfo_x86 *c = &boot_cpu_data;
806 	u16 ec = EC(m->status);
807 	u8 xec = XEC(m->status, xec_mask);
808 
809 	if (c->x86 == 0xf || c->x86 == 0x11)
810 		goto wrong_mc5_mce;
811 
812 	pr_emerg(HW_ERR "MC5 Error: ");
813 
814 	if (INT_ERROR(ec)) {
815 		if (xec <= 0x1f) {
816 			pr_cont("Hardware Assert.\n");
817 			return;
818 		} else
819 			goto wrong_mc5_mce;
820 	}
821 
822 	if (xec == 0x0 || xec == 0xc)
823 		pr_cont("%s.\n", mc5_mce_desc[xec]);
824 	else if (xec <= 0xd)
825 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
826 	else
827 		goto wrong_mc5_mce;
828 
829 	return;
830 
831  wrong_mc5_mce:
832 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
833 }
834 
835 static void decode_mc6_mce(struct mce *m)
836 {
837 	u8 xec = XEC(m->status, xec_mask);
838 
839 	pr_emerg(HW_ERR "MC6 Error: ");
840 
841 	if (xec > 0x5)
842 		goto wrong_mc6_mce;
843 
844 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
845 	return;
846 
847  wrong_mc6_mce:
848 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
849 }
850 
851 /* Decode errors according to Scalable MCA specification */
852 static void decode_smca_errors(struct mce *m)
853 {
854 	struct smca_hwid_mcatype *type;
855 	unsigned int bank_type;
856 	const char *ip_name;
857 	u8 xec = XEC(m->status, xec_mask);
858 
859 	if (m->bank >= ARRAY_SIZE(smca_banks))
860 		return;
861 
862 	if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
863 		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
864 
865 	type = smca_banks[m->bank].type;
866 	if (!type)
867 		return;
868 
869 	bank_type = type->bank_type;
870 	ip_name = smca_bank_names[bank_type].long_name;
871 
872 	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
873 
874 	/* Only print the decode of valid error codes */
875 	if (xec < smca_mce_descs[bank_type].num_descs &&
876 			(type->xec_bitmap & BIT_ULL(xec))) {
877 		pr_emerg(HW_ERR "%s Error: ", ip_name);
878 		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
879 	}
880 }
881 
882 static inline void amd_decode_err_code(u16 ec)
883 {
884 	if (INT_ERROR(ec)) {
885 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
886 		return;
887 	}
888 
889 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
890 
891 	if (BUS_ERROR(ec))
892 		pr_cont(", mem/io: %s", II_MSG(ec));
893 	else
894 		pr_cont(", tx: %s", TT_MSG(ec));
895 
896 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
897 		pr_cont(", mem-tx: %s", R4_MSG(ec));
898 
899 		if (BUS_ERROR(ec))
900 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
901 	}
902 
903 	pr_cont("\n");
904 }
905 
906 /*
907  * Filter out unwanted MCE signatures here.
908  */
909 static bool amd_filter_mce(struct mce *m)
910 {
911 	u8 xec = (m->status >> 16) & 0x1f;
912 
913 	/*
914 	 * NB GART TLB error reporting is disabled by default.
915 	 */
916 	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
917 		return true;
918 
919 	return false;
920 }
921 
922 static const char *decode_error_status(struct mce *m)
923 {
924 	if (m->status & MCI_STATUS_UC) {
925 		if (m->status & MCI_STATUS_PCC)
926 			return "System Fatal error.";
927 		if (m->mcgstatus & MCG_STATUS_RIPV)
928 			return "Uncorrected, software restartable error.";
929 		return "Uncorrected, software containable error.";
930 	}
931 
932 	if (m->status & MCI_STATUS_DEFERRED)
933 		return "Deferred error.";
934 
935 	return "Corrected error, no action required.";
936 }
937 
938 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
939 {
940 	struct mce *m = (struct mce *)data;
941 	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
942 	int ecc;
943 
944 	if (amd_filter_mce(m))
945 		return NOTIFY_STOP;
946 
947 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
948 
949 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
950 		m->extcpu,
951 		c->x86, c->x86_model, c->x86_mask,
952 		m->bank,
953 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
954 		((m->status & MCI_STATUS_UC)	? "UE"	  :
955 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
956 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
957 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
958 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));
959 
960 	if (c->x86 >= 0x15)
961 		pr_cont("|%s|%s",
962 			((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
963 			((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
964 
965 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
966 		u32 low, high;
967 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
968 
969 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
970 
971 		if (!rdmsr_safe(addr, &low, &high) &&
972 		    (low & MCI_CONFIG_MCAX))
973 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
974 	}
975 
976 	/* do the two bits[14:13] together */
977 	ecc = (m->status >> 45) & 0x3;
978 	if (ecc)
979 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
980 
981 	pr_cont("]: 0x%016llx\n", m->status);
982 
983 	if (m->status & MCI_STATUS_ADDRV)
984 		pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
985 
986 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
987 		if (m->status & MCI_STATUS_SYNDV)
988 			pr_cont(", Syndrome: 0x%016llx", m->synd);
989 
990 		pr_cont(", IPID: 0x%016llx", m->ipid);
991 
992 		pr_cont("\n");
993 
994 		decode_smca_errors(m);
995 		goto err_code;
996 	} else
997 		pr_cont("\n");
998 
999 	if (!fam_ops)
1000 		goto err_code;
1001 
1002 	switch (m->bank) {
1003 	case 0:
1004 		decode_mc0_mce(m);
1005 		break;
1006 
1007 	case 1:
1008 		decode_mc1_mce(m);
1009 		break;
1010 
1011 	case 2:
1012 		decode_mc2_mce(m);
1013 		break;
1014 
1015 	case 3:
1016 		decode_mc3_mce(m);
1017 		break;
1018 
1019 	case 4:
1020 		decode_mc4_mce(m);
1021 		break;
1022 
1023 	case 5:
1024 		decode_mc5_mce(m);
1025 		break;
1026 
1027 	case 6:
1028 		decode_mc6_mce(m);
1029 		break;
1030 
1031 	default:
1032 		break;
1033 	}
1034 
1035  err_code:
1036 	amd_decode_err_code(m->status & 0xffff);
1037 
1038 	return NOTIFY_STOP;
1039 }
1040 EXPORT_SYMBOL_GPL(amd_decode_mce);
1041 
/*
 * Notifier block that mce_amd_init() registers on the MCE decode chain;
 * routes each record to amd_decode_mce().
 */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
1045 
1046 static int __init mce_amd_init(void)
1047 {
1048 	struct cpuinfo_x86 *c = &boot_cpu_data;
1049 
1050 	if (c->x86_vendor != X86_VENDOR_AMD)
1051 		return -ENODEV;
1052 
1053 	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1054 	if (!fam_ops)
1055 		return -ENOMEM;
1056 
1057 	switch (c->x86) {
1058 	case 0xf:
1059 		fam_ops->mc0_mce = k8_mc0_mce;
1060 		fam_ops->mc1_mce = k8_mc1_mce;
1061 		fam_ops->mc2_mce = k8_mc2_mce;
1062 		break;
1063 
1064 	case 0x10:
1065 		fam_ops->mc0_mce = f10h_mc0_mce;
1066 		fam_ops->mc1_mce = k8_mc1_mce;
1067 		fam_ops->mc2_mce = k8_mc2_mce;
1068 		break;
1069 
1070 	case 0x11:
1071 		fam_ops->mc0_mce = k8_mc0_mce;
1072 		fam_ops->mc1_mce = k8_mc1_mce;
1073 		fam_ops->mc2_mce = k8_mc2_mce;
1074 		break;
1075 
1076 	case 0x12:
1077 		fam_ops->mc0_mce = f12h_mc0_mce;
1078 		fam_ops->mc1_mce = k8_mc1_mce;
1079 		fam_ops->mc2_mce = k8_mc2_mce;
1080 		break;
1081 
1082 	case 0x14:
1083 		fam_ops->mc0_mce = cat_mc0_mce;
1084 		fam_ops->mc1_mce = cat_mc1_mce;
1085 		fam_ops->mc2_mce = k8_mc2_mce;
1086 		break;
1087 
1088 	case 0x15:
1089 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1090 
1091 		fam_ops->mc0_mce = f15h_mc0_mce;
1092 		fam_ops->mc1_mce = f15h_mc1_mce;
1093 		fam_ops->mc2_mce = f15h_mc2_mce;
1094 		break;
1095 
1096 	case 0x16:
1097 		xec_mask = 0x1f;
1098 		fam_ops->mc0_mce = cat_mc0_mce;
1099 		fam_ops->mc1_mce = cat_mc1_mce;
1100 		fam_ops->mc2_mce = f16h_mc2_mce;
1101 		break;
1102 
1103 	case 0x17:
1104 		xec_mask = 0x3f;
1105 		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1106 			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1107 			goto err_out;
1108 		}
1109 		break;
1110 
1111 	default:
1112 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1113 		goto err_out;
1114 	}
1115 
1116 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1117 
1118 	mce_register_decode_chain(&amd_mce_dec_nb);
1119 
1120 	return 0;
1121 
1122 err_out:
1123 	kfree(fam_ops);
1124 	fam_ops = NULL;
1125 	return -EINVAL;
1126 }
1127 early_initcall(mce_amd_init);
1128 
#ifdef MODULE
/*
 * Module teardown: remove amd_mce_dec_nb from the MCE decode chain, then
 * free the per-family callback table allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1141