xref: /openbmc/linux/drivers/edac/mce_amd.c (revision 57ee11ea)
#include <linux/module.h>
#include <linux/slab.h>

#include "mce_amd.h"

static struct amd_decoder_ops *fam_ops;

static u8 xec_mask	 = 0xf;

static bool report_gart_errors;
static void (*decode_dram_ecc)(int node_id, struct mce *m);

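/*
 * GART TLB walk errors are mostly benign, so they are filtered out by default
 * (see amd_filter_mce() below); callers can opt back in with this helper.
 */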
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

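/*
 * A memory-controller driver can hook in here to do node-level DRAM ECC
 * decoding; the hook is invoked from decode_mc4_mce() and, on SMCA systems,
 * from decode_smca_errors() for UMC DRAM ECC errors.
 */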
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * String representations for the different MCA-reported error types; see
 * F3x48 or MSR0000_0411.
 */

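/*
 * Quick reference for how the tables below map onto the error code bits, as
 * extracted by the TT()/LL()/RRRR()/II()/TO()/PP()/UU() helpers in mce_amd.h
 * (a summary for readers, not normative):
 *
 *   LL   [1:0]   cache level           -> ll_msgs[]
 *   TT   [3:2]   transaction type      -> tt_msgs[]   (mem/TLB errors)
 *   II   [3:2]   memory or i/o         -> ii_msgs[]   (bus errors)
 *   RRRR [7:4]   memory transaction    -> rrrr_msgs[]
 *   TO   [8]     request timeout       -> to_msgs[]   (bus errors)
 *   PP   [10:9]  participating proc.   -> pp_msgs[]   (bus errors)
 *   UU   [9:8]   internal error type   -> uu_msgs[]   (internal errors)
 *
 * For illustration, ec = 0x0151 decodes as a memory error with RRRR = 0x5
 * (IRD), TT = 0x0 (INSN) and LL = 0x1 (L1).
 */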
/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/* Scalable MCA error strings */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};

struct smca_mce_desc {
	const char * const *descs;
	unsigned int num_descs;
};

static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};

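/*
 * Family-specific decoders for the legacy MC0/MC1/MC2 banks, selected in
 * mce_amd_init(). Each returns true when it recognized and printed the
 * (ec, xec) signature, and false so the caller can flag a corrupted MCE.
 */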
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4	 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

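/*
 * Bank 0 decode: data TLB errors are handled here directly; anything else is
 * dispatched to the family-specific mc0_mce() hook set up in mce_amd_init().
 */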
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops->mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll	 = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4    = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

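/*
 * Bank 1 decode: TLB, bus and internal-error signatures are handled here;
 * memory error signatures go through the family-specific mc1_mce() hook.
 */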
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

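/* Bank 2 is decoded entirely by the family-specific mc2_mce() hook. */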
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family, please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

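/*
 * Bank 4 is the northbridge. DRAM ECC errors (xec 0x0/0x8) are additionally
 * handed to the registered DRAM ECC decoder for the reporting node.
 */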
static void decode_mc4_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

static void decode_mc5_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (c->x86 == 0xf || c->x86 == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m)
{
	struct smca_hwid *hwid;
	unsigned int bank_type;
	const char *ip_name;
	u8 xec = XEC(m->status, xec_mask);

	if (m->bank >= ARRAY_SIZE(smca_banks))
		return;

	if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");

	hwid = smca_banks[m->bank].hwid;
	if (!hwid)
		return;

	bank_type = hwid->bank_type;
	ip_name = smca_get_long_name(bank_type);

	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);

	/* Only print the decode of valid error codes */
	if (xec < smca_mce_descs[bank_type].num_descs &&
			(hwid->xec_bitmap & BIT_ULL(xec))) {
		pr_emerg(HW_ERR "%s Error: ", ip_name);
		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
	}

	/*
	 * amd_get_nb_id() returns the last level cache id.
	 * The last level cache on Fam17h is 1 level below the node.
	 */
	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
		decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
}

static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/*
 * Filter out unwanted MCE signatures here.
 */
static bool amd_filter_mce(struct mce *m)
{
	u8 xec = (m->status >> 16) & 0x1f;

	/*
	 * NB GART TLB error reporting is disabled by default.
	 */
	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
		return true;

	return false;
}

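/*
 * Classify the error severity from the UC/PCC/RIPV/Deferred status bits for
 * the one-line summary printed by amd_decode_mce().
 */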
static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

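/*
 * Main decode-chain callback: print a one-line summary of MCi_STATUS (and
 * address/IPID/syndrome where valid), then dispatch to the SMCA or the
 * legacy per-bank decoders.
 */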
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	if (c->x86 >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (c->x86 != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_errors(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

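/*
 * Set up the family-specific decoder hooks and extended-error-code mask, then
 * register on the MCE decode chain. Fam17h requires SMCA; unknown families
 * are rejected.
 */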
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif