xref: /openbmc/linux/drivers/edac/mce_amd.c (revision 0cabf991)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
/* Per-family decode callbacks; filled in by mce_amd_init() on non-SMCA CPUs. */
static struct amd_decoder_ops fam_ops;

/* Mask used when extracting the extended error code; mce_amd_init() may widen it. */
static u8 xec_mask	 = 0xf;

/* Optional DRAM ECC decoder hook; NULL until amd_register_ecc_decoder() sets it. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
/*
 * Install @f as the DRAM ECC decoder hook.
 *
 * The hook is invoked from decode_mc4_mce() and decode_smca_error() with a
 * node id and the MCE record. Any previously installed hook is overwritten
 * without notice.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20 
21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	if (decode_dram_ecc) {
24 		WARN_ON(decode_dram_ecc != f);
25 
26 		decode_dram_ecc = NULL;
27 	}
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30 
/*
 * String representations for the different MCA reported error types, see
 * F3x48 or MSR0000_0411.
 *
 * NOTE(review): these tables are presumably indexed by the raw field value
 * through the *_MSG() helpers in mce_amd.h - confirm against that header.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor (exported for use outside this file) */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
59 
/*
 * F15h MC1 error descriptions, consumed by f15h_mc1_mce().
 *
 * The table is packed, so the extended error code (xec) maps to an index as:
 *   xec 0x0  - 0xa  -> index xec
 *   xec 0xd         -> index xec - 2
 *   xec 0x10 - 0x15 -> index xec - 4
 * Entries for xec 0x11 - 0x15 are printed as "Decoder <entry> parity error".
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
80 
/*
 * F15h MC2 memory-error descriptions, consumed by f15h_mc2_mce().
 *
 * The table is packed, so the extended error code (xec) maps to an index as:
 *   xec 0x4  - 0xc  -> index xec - 0x4
 *   xec 0x10 - 0x14 -> index xec - 0x7
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
97 
/*
 * MC4 error descriptions, consumed by decode_mc4_mce().
 *
 * The table is packed, so the extended error code (xec) maps to an index as:
 *   xec 0x0  - 0xe  -> index xec
 *   xec 0x1c - 0x1f -> index xec - 13
 * xec 0xf and 0x19 are decoded in decode_mc4_mce() itself and have no entry.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
119 
/*
 * MC5 error descriptions, indexed directly by the extended error code
 * (0x0 - 0xd) in decode_mc5_mce(). Entries 0x0 and 0xc are printed verbatim;
 * every other entry is printed with a " parity error" suffix.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
136 
/*
 * MC6 error descriptions, indexed directly by the extended error code
 * (0x0 - 0x5) in decode_mc6_mce(), which appends " parity error" to each.
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
145 
/*
 * Scalable MCA error strings.
 *
 * Each smca_*_mce_desc table below is indexed by the extended error code and
 * is reached through smca_mce_descs[] from decode_smca_error().
 */

/* Error strings for bank type SMCA_LS. */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity error",
	"Store queue parity error",
	"Miss address buffer payload parity error",
	"Level 1 TLB parity error",
	"DC Tag error type 5",
	"DC Tag error type 6",
	"DC Tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"System Read Data Error Thread 0",
	"System Read Data Error Thread 1",
	"DC Tag error type 2",
	"DC Data error type 1 and poison consumption",
	"DC Data error type 2",
	"DC Data error type 3",
	"DC Tag error type 4",
	"Level 2 TLB parity error",
	"PDC parity error",
	"DC Tag error type 3",
	"DC Tag error type 5",
	"L2 Fill Data error",
};
170 
/* Error strings for bank type SMCA_LS_V2, indexed by extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	"An ECC error was detected on a data cache read by a probe or victimization",
	"An ECC error or L2 poison was detected on a data cache read by a load",
	"An ECC error was detected on a data cache read-modify-write by a store",
	"An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	"An ECC error or poison bit mismatch was detected on a tag read by a load",
	"An ECC error or poison bit mismatch was detected on a tag read by a store",
	"An ECC error was detected on an EMEM read by a load",
	"An ECC error was detected on an EMEM read-modify-write by a store",
	"A parity error was detected in an L1 TLB entry by any access",
	"A parity error was detected in an L2 TLB entry by any access",
	"A parity error was detected in a PWC entry by any access",
	"A parity error was detected in an STQ entry by any access",
	"A parity error was detected in an LDQ entry by any access",
	"A parity error was detected in a MAB entry by any access",
	"A parity error was detected in an SCB entry state field by any access",
	"A parity error was detected in an SCB entry address field by any access",
	"A parity error was detected in an SCB entry data field by any access",
	"A parity error was detected in a WCB entry by any access",
	"A poisoned line was detected in an SCB entry by any access",
	"A SystemReadDataError error was reported on read data returned from L2 for a load",
	"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	"A hardware assertion error was reported",
	"A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
197 
/* Error strings for bank type SMCA_IF, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	"Op Cache Microtag Probe Port Parity Error",
	"IC Microtag or Full Tag Multi-hit Error",
	"IC Full Tag Parity Error",
	"IC Data Array Parity Error",
	"Decoupling Queue PhysAddr Parity Error",
	"L0 ITLB Parity Error",
	"L1 ITLB Parity Error",
	"L2 ITLB Parity Error",
	"BPQ Thread 0 Snoop Parity Error",
	"BPQ Thread 1 Snoop Parity Error",
	"L1 BTB Multi-Match Error",
	"L2 BTB Multi-Match Error",
	"L2 Cache Response Poison Error",
	"System Read Data Error",
};
214 
/* Error strings for bank type SMCA_L2_CACHE, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	"L2M Tag Multiple-Way-Hit error",
	"L2M Tag or State Array ECC Error",
	"L2M Data Array ECC Error",
	"Hardware Assert Error",
};
221 
/* Error strings for bank type SMCA_DE, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	"Micro-op cache tag parity error",
	"Micro-op cache data parity error",
	"Instruction buffer parity error",
	"Micro-op queue parity error",
	"Instruction dispatch queue parity error",
	"Fetch address FIFO parity error",
	"Patch RAM data parity error",
	"Patch RAM sequencer parity error",
	"Micro-op buffer parity error"
};
233 
/* Error strings for bank type SMCA_EX, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog Timeout error",
	"Physical register file parity error",
	"Flag register file parity error",
	"Immediate displacement register file parity error",
	"Address generator payload parity error",
	"EX payload parity error",
	"Checkpoint queue parity error",
	"Retire dispatch queue parity error",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
	"Hardware Assertion error",
};
248 
/* Error strings for bank type SMCA_FP, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file (PRF) parity error",
	"Freelist (FL) parity error",
	"Schedule queue parity error",
	"NSQ parity error",
	"Retire queue (RQ) parity error",
	"Status register file (SRF) parity error",
	"Hardware assertion",
};
258 
/* Error strings for bank type SMCA_L3_CACHE, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	"Shadow Tag Macro ECC Error",
	"Shadow Tag Macro Multi-way-hit Error",
	"L3M Tag ECC Error",
	"L3M Tag Multi-way-hit Error",
	"L3M Data ECC Error",
	"SDP Parity Error or SystemReadDataError from XI",
	"L3 Victim Queue Parity Error",
	"L3 Hardware Assertion",
};
269 
/* Error strings for bank type SMCA_CS, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"Probe Filter ECC Error",
};
281 
/* Error strings for bank type SMCA_CS_V2, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"SDP read response had no match in the CS queue",
	"Probe Filter Protocol Error",
	"Probe Filter ECC Error",
	"SDP read response had an unexpected RETRY error",
	"Counter overflow error",
	"Counter underflow error",
};
298 
/* Error strings for bank type SMCA_PIE, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	"Hardware Assert",
	"Register security violation",
	"Link Error",
	"Poison data consumption",
	"A deferred error was detected in the DF"
};
306 
/*
 * Error strings for bank type SMCA_UMC, indexed by extended error code.
 * Code 0 is additionally handed to the DRAM ECC decoder hook, when one is
 * registered, by decode_smca_error().
 */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Address/Command parity error",
	"Write data CRC error",
	"DCQ SRAM ECC error",
	"AES SRAM ECC error",
};
317 
/* Error strings for bank type SMCA_PB, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	"An ECC error in the Parameter Block RAM array",
};
321 
/* Error strings for bank type SMCA_PSP, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	"An ECC or parity error in a PSP RAM instance",
};
325 
/* Error strings for bank type SMCA_PSP_V2, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Instruction Cache Bank 0 ECC or parity error",
	"Instruction Cache Bank 1 ECC or parity error",
	"Instruction Tag Ram 0 parity error",
	"Instruction Tag Ram 1 parity error",
	"Data Cache Bank 0 ECC or parity error",
	"Data Cache Bank 1 ECC or parity error",
	"Data Cache Bank 2 ECC or parity error",
	"Data Cache Bank 3 ECC or parity error",
	"Data Tag Bank 0 parity error",
	"Data Tag Bank 1 parity error",
	"Data Tag Bank 2 parity error",
	"Data Tag Bank 3 parity error",
	"Dirty Data Ram parity error",
	"TLB Bank 0 parity error",
	"TLB Bank 1 parity error",
	"System Hub Read Buffer ECC or parity error",
};
346 
/* Error strings for bank type SMCA_SMU, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	"An ECC or parity error in an SMU RAM instance",
};
350 
/* Error strings for bank type SMCA_SMU_V2, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",
	"System Hub Read Buffer ECC or parity error",
};
364 
/* Error strings for bank type SMCA_MP5, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",
};
377 
/* Error strings for bank type SMCA_NBIO, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	"ECC or Parity error",
	"PCIE error",
	"SDP ErrEvent error",
	"SDP Egress Poison Error",
	"IOHC Internal Poison Error",
};
385 
/* Error strings for bank type SMCA_PCIE, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	"CCIX PER Message logging",
	"CCIX Read Response with Status: Non-Data Error",
	"CCIX Write Response with Status: Non-Data Error",
	"CCIX Read Response with Status: Data Error",
	"CCIX Non-okay write response with data error",
};
393 
/* One SMCA bank type's description table together with its length. */
struct smca_mce_desc {
	const char * const *descs;	/* strings indexed by extended error code */
	unsigned int num_descs;		/* number of entries in @descs */
};
398 
/*
 * Description tables keyed by SMCA bank type; looked up in
 * decode_smca_error(). Bank types without a designated initializer here are
 * zero-initialized (NULL table, zero length), so no description is printed
 * for them.
 */
static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_LS_V2]	= { smca_ls2_mce_desc,	ARRAY_SIZE(smca_ls2_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
	[SMCA_SMU_V2]	= { smca_smu2_mce_desc,	ARRAY_SIZE(smca_smu2_mce_desc)	},
	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
};
421 
422 static bool f12h_mc0_mce(u16 ec, u8 xec)
423 {
424 	bool ret = false;
425 
426 	if (MEM_ERROR(ec)) {
427 		u8 ll = LL(ec);
428 		ret = true;
429 
430 		if (ll == LL_L2)
431 			pr_cont("during L1 linefill from L2.\n");
432 		else if (ll == LL_L1)
433 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
434 		else
435 			ret = false;
436 	}
437 	return ret;
438 }
439 
440 static bool f10h_mc0_mce(u16 ec, u8 xec)
441 {
442 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
443 		pr_cont("during data scrub.\n");
444 		return true;
445 	}
446 	return f12h_mc0_mce(ec, xec);
447 }
448 
449 static bool k8_mc0_mce(u16 ec, u8 xec)
450 {
451 	if (BUS_ERROR(ec)) {
452 		pr_cont("during system linefill.\n");
453 		return true;
454 	}
455 
456 	return f10h_mc0_mce(ec, xec);
457 }
458 
459 static bool cat_mc0_mce(u16 ec, u8 xec)
460 {
461 	u8 r4	 = R4(ec);
462 	bool ret = true;
463 
464 	if (MEM_ERROR(ec)) {
465 
466 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
467 			return false;
468 
469 		switch (r4) {
470 		case R4_DRD:
471 		case R4_DWR:
472 			pr_cont("Data/Tag parity error due to %s.\n",
473 				(r4 == R4_DRD ? "load/hw prf" : "store"));
474 			break;
475 		case R4_EVICT:
476 			pr_cont("Copyback parity error on a tag miss.\n");
477 			break;
478 		case R4_SNOOP:
479 			pr_cont("Tag parity error during snoop.\n");
480 			break;
481 		default:
482 			ret = false;
483 		}
484 	} else if (BUS_ERROR(ec)) {
485 
486 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
487 			return false;
488 
489 		pr_cont("System read data error on a ");
490 
491 		switch (r4) {
492 		case R4_RD:
493 			pr_cont("TLB reload.\n");
494 			break;
495 		case R4_DWR:
496 			pr_cont("store.\n");
497 			break;
498 		case R4_DRD:
499 			pr_cont("load.\n");
500 			break;
501 		default:
502 			ret = false;
503 		}
504 	} else {
505 		ret = false;
506 	}
507 
508 	return ret;
509 }
510 
511 static bool f15h_mc0_mce(u16 ec, u8 xec)
512 {
513 	bool ret = true;
514 
515 	if (MEM_ERROR(ec)) {
516 
517 		switch (xec) {
518 		case 0x0:
519 			pr_cont("Data Array access error.\n");
520 			break;
521 
522 		case 0x1:
523 			pr_cont("UC error during a linefill from L2/NB.\n");
524 			break;
525 
526 		case 0x2:
527 		case 0x11:
528 			pr_cont("STQ access error.\n");
529 			break;
530 
531 		case 0x3:
532 			pr_cont("SCB access error.\n");
533 			break;
534 
535 		case 0x10:
536 			pr_cont("Tag error.\n");
537 			break;
538 
539 		case 0x12:
540 			pr_cont("LDQ access error.\n");
541 			break;
542 
543 		default:
544 			ret = false;
545 		}
546 	} else if (BUS_ERROR(ec)) {
547 
548 		if (!xec)
549 			pr_cont("System Read Data Error.\n");
550 		else
551 			pr_cont(" Internal error condition type %d.\n", xec);
552 	} else if (INT_ERROR(ec)) {
553 		if (xec <= 0x1f)
554 			pr_cont("Hardware Assert.\n");
555 		else
556 			ret = false;
557 
558 	} else
559 		ret = false;
560 
561 	return ret;
562 }
563 
564 static void decode_mc0_mce(struct mce *m)
565 {
566 	u16 ec = EC(m->status);
567 	u8 xec = XEC(m->status, xec_mask);
568 
569 	pr_emerg(HW_ERR "MC0 Error: ");
570 
571 	/* TLB error signatures are the same across families */
572 	if (TLB_ERROR(ec)) {
573 		if (TT(ec) == TT_DATA) {
574 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
575 				((xec == 2) ? "locked miss"
576 					    : (xec ? "multimatch" : "parity")));
577 			return;
578 		}
579 	} else if (fam_ops.mc0_mce(ec, xec))
580 		;
581 	else
582 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
583 }
584 
585 static bool k8_mc1_mce(u16 ec, u8 xec)
586 {
587 	u8 ll	 = LL(ec);
588 	bool ret = true;
589 
590 	if (!MEM_ERROR(ec))
591 		return false;
592 
593 	if (ll == 0x2)
594 		pr_cont("during a linefill from L2.\n");
595 	else if (ll == 0x1) {
596 		switch (R4(ec)) {
597 		case R4_IRD:
598 			pr_cont("Parity error during data load.\n");
599 			break;
600 
601 		case R4_EVICT:
602 			pr_cont("Copyback Parity/Victim error.\n");
603 			break;
604 
605 		case R4_SNOOP:
606 			pr_cont("Tag Snoop error.\n");
607 			break;
608 
609 		default:
610 			ret = false;
611 			break;
612 		}
613 	} else
614 		ret = false;
615 
616 	return ret;
617 }
618 
619 static bool cat_mc1_mce(u16 ec, u8 xec)
620 {
621 	u8 r4    = R4(ec);
622 	bool ret = true;
623 
624 	if (!MEM_ERROR(ec))
625 		return false;
626 
627 	if (TT(ec) != TT_INSTR)
628 		return false;
629 
630 	if (r4 == R4_IRD)
631 		pr_cont("Data/tag array parity error for a tag hit.\n");
632 	else if (r4 == R4_SNOOP)
633 		pr_cont("Tag error during snoop/victimization.\n");
634 	else if (xec == 0x0)
635 		pr_cont("Tag parity error from victim castout.\n");
636 	else if (xec == 0x2)
637 		pr_cont("Microcode patch RAM parity error.\n");
638 	else
639 		ret = false;
640 
641 	return ret;
642 }
643 
644 static bool f15h_mc1_mce(u16 ec, u8 xec)
645 {
646 	bool ret = true;
647 
648 	if (!MEM_ERROR(ec))
649 		return false;
650 
651 	switch (xec) {
652 	case 0x0 ... 0xa:
653 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
654 		break;
655 
656 	case 0xd:
657 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
658 		break;
659 
660 	case 0x10:
661 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
662 		break;
663 
664 	case 0x11 ... 0x15:
665 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
666 		break;
667 
668 	default:
669 		ret = false;
670 	}
671 	return ret;
672 }
673 
674 static void decode_mc1_mce(struct mce *m)
675 {
676 	u16 ec = EC(m->status);
677 	u8 xec = XEC(m->status, xec_mask);
678 
679 	pr_emerg(HW_ERR "MC1 Error: ");
680 
681 	if (TLB_ERROR(ec))
682 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
683 			(xec ? "multimatch" : "parity error"));
684 	else if (BUS_ERROR(ec)) {
685 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
686 
687 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
688 	} else if (INT_ERROR(ec)) {
689 		if (xec <= 0x3f)
690 			pr_cont("Hardware Assert.\n");
691 		else
692 			goto wrong_mc1_mce;
693 	} else if (fam_ops.mc1_mce(ec, xec))
694 		;
695 	else
696 		goto wrong_mc1_mce;
697 
698 	return;
699 
700 wrong_mc1_mce:
701 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
702 }
703 
704 static bool k8_mc2_mce(u16 ec, u8 xec)
705 {
706 	bool ret = true;
707 
708 	if (xec == 0x1)
709 		pr_cont(" in the write data buffers.\n");
710 	else if (xec == 0x3)
711 		pr_cont(" in the victim data buffers.\n");
712 	else if (xec == 0x2 && MEM_ERROR(ec))
713 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
714 	else if (xec == 0x0) {
715 		if (TLB_ERROR(ec))
716 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
717 				TT_MSG(ec));
718 		else if (BUS_ERROR(ec))
719 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
720 				R4_MSG(ec), PP_MSG(ec));
721 		else if (MEM_ERROR(ec)) {
722 			u8 r4 = R4(ec);
723 
724 			if (r4 >= 0x7)
725 				pr_cont(": %s error during data copyback.\n",
726 					R4_MSG(ec));
727 			else if (r4 <= 0x1)
728 				pr_cont(": %s parity/ECC error during data "
729 					"access from L2.\n", R4_MSG(ec));
730 			else
731 				ret = false;
732 		} else
733 			ret = false;
734 	} else
735 		ret = false;
736 
737 	return ret;
738 }
739 
740 static bool f15h_mc2_mce(u16 ec, u8 xec)
741 {
742 	bool ret = true;
743 
744 	if (TLB_ERROR(ec)) {
745 		if (xec == 0x0)
746 			pr_cont("Data parity TLB read error.\n");
747 		else if (xec == 0x1)
748 			pr_cont("Poison data provided for TLB fill.\n");
749 		else
750 			ret = false;
751 	} else if (BUS_ERROR(ec)) {
752 		if (xec > 2)
753 			ret = false;
754 
755 		pr_cont("Error during attempted NB data read.\n");
756 	} else if (MEM_ERROR(ec)) {
757 		switch (xec) {
758 		case 0x4 ... 0xc:
759 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
760 			break;
761 
762 		case 0x10 ... 0x14:
763 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
764 			break;
765 
766 		default:
767 			ret = false;
768 		}
769 	} else if (INT_ERROR(ec)) {
770 		if (xec <= 0x3f)
771 			pr_cont("Hardware Assert.\n");
772 		else
773 			ret = false;
774 	}
775 
776 	return ret;
777 }
778 
779 static bool f16h_mc2_mce(u16 ec, u8 xec)
780 {
781 	u8 r4 = R4(ec);
782 
783 	if (!MEM_ERROR(ec))
784 		return false;
785 
786 	switch (xec) {
787 	case 0x04 ... 0x05:
788 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
789 		break;
790 
791 	case 0x09 ... 0x0b:
792 	case 0x0d ... 0x0f:
793 		pr_cont("ECC error in L2 tag (%s).\n",
794 			((r4 == R4_GEN)   ? "BankReq" :
795 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
796 		break;
797 
798 	case 0x10 ... 0x19:
799 	case 0x1b:
800 		pr_cont("ECC error in L2 data array (%s).\n",
801 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
802 			((r4 == R4_GEN)   ? "Attr" :
803 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
804 		break;
805 
806 	case 0x1c ... 0x1d:
807 	case 0x1f:
808 		pr_cont("Parity error in L2 attribute bits (%s).\n",
809 			((r4 == R4_RD)  ? "Hit"  :
810 			((r4 == R4_GEN) ? "Attr" : "Fill")));
811 		break;
812 
813 	default:
814 		return false;
815 	}
816 
817 	return true;
818 }
819 
820 static void decode_mc2_mce(struct mce *m)
821 {
822 	u16 ec = EC(m->status);
823 	u8 xec = XEC(m->status, xec_mask);
824 
825 	pr_emerg(HW_ERR "MC2 Error: ");
826 
827 	if (!fam_ops.mc2_mce(ec, xec))
828 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
829 }
830 
831 static void decode_mc3_mce(struct mce *m)
832 {
833 	u16 ec = EC(m->status);
834 	u8 xec = XEC(m->status, xec_mask);
835 
836 	if (boot_cpu_data.x86 >= 0x14) {
837 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
838 			 " please report on LKML.\n");
839 		return;
840 	}
841 
842 	pr_emerg(HW_ERR "MC3 Error");
843 
844 	if (xec == 0x0) {
845 		u8 r4 = R4(ec);
846 
847 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
848 			goto wrong_mc3_mce;
849 
850 		pr_cont(" during %s.\n", R4_MSG(ec));
851 	} else
852 		goto wrong_mc3_mce;
853 
854 	return;
855 
856  wrong_mc3_mce:
857 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
858 }
859 
860 static void decode_mc4_mce(struct mce *m)
861 {
862 	unsigned int fam = x86_family(m->cpuid);
863 	int node_id = amd_get_nb_id(m->extcpu);
864 	u16 ec = EC(m->status);
865 	u8 xec = XEC(m->status, 0x1f);
866 	u8 offset = 0;
867 
868 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
869 
870 	switch (xec) {
871 	case 0x0 ... 0xe:
872 
873 		/* special handling for DRAM ECCs */
874 		if (xec == 0x0 || xec == 0x8) {
875 			/* no ECCs on F11h */
876 			if (fam == 0x11)
877 				goto wrong_mc4_mce;
878 
879 			pr_cont("%s.\n", mc4_mce_desc[xec]);
880 
881 			if (decode_dram_ecc)
882 				decode_dram_ecc(node_id, m);
883 			return;
884 		}
885 		break;
886 
887 	case 0xf:
888 		if (TLB_ERROR(ec))
889 			pr_cont("GART Table Walk data error.\n");
890 		else if (BUS_ERROR(ec))
891 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
892 		else
893 			goto wrong_mc4_mce;
894 		return;
895 
896 	case 0x19:
897 		if (fam == 0x15 || fam == 0x16)
898 			pr_cont("Compute Unit Data Error.\n");
899 		else
900 			goto wrong_mc4_mce;
901 		return;
902 
903 	case 0x1c ... 0x1f:
904 		offset = 13;
905 		break;
906 
907 	default:
908 		goto wrong_mc4_mce;
909 	}
910 
911 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
912 	return;
913 
914  wrong_mc4_mce:
915 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
916 }
917 
918 static void decode_mc5_mce(struct mce *m)
919 {
920 	unsigned int fam = x86_family(m->cpuid);
921 	u16 ec = EC(m->status);
922 	u8 xec = XEC(m->status, xec_mask);
923 
924 	if (fam == 0xf || fam == 0x11)
925 		goto wrong_mc5_mce;
926 
927 	pr_emerg(HW_ERR "MC5 Error: ");
928 
929 	if (INT_ERROR(ec)) {
930 		if (xec <= 0x1f) {
931 			pr_cont("Hardware Assert.\n");
932 			return;
933 		} else
934 			goto wrong_mc5_mce;
935 	}
936 
937 	if (xec == 0x0 || xec == 0xc)
938 		pr_cont("%s.\n", mc5_mce_desc[xec]);
939 	else if (xec <= 0xd)
940 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
941 	else
942 		goto wrong_mc5_mce;
943 
944 	return;
945 
946  wrong_mc5_mce:
947 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
948 }
949 
950 static void decode_mc6_mce(struct mce *m)
951 {
952 	u8 xec = XEC(m->status, xec_mask);
953 
954 	pr_emerg(HW_ERR "MC6 Error: ");
955 
956 	if (xec > 0x5)
957 		goto wrong_mc6_mce;
958 
959 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
960 	return;
961 
962  wrong_mc6_mce:
963 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
964 }
965 
966 /* Decode errors according to Scalable MCA specification */
967 static void decode_smca_error(struct mce *m)
968 {
969 	struct smca_hwid *hwid;
970 	enum smca_bank_types bank_type;
971 	const char *ip_name;
972 	u8 xec = XEC(m->status, xec_mask);
973 
974 	if (m->bank >= ARRAY_SIZE(smca_banks))
975 		return;
976 
977 	hwid = smca_banks[m->bank].hwid;
978 	if (!hwid)
979 		return;
980 
981 	bank_type = hwid->bank_type;
982 
983 	if (bank_type == SMCA_RESERVED) {
984 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
985 		return;
986 	}
987 
988 	ip_name = smca_get_long_name(bank_type);
989 
990 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
991 
992 	/* Only print the decode of valid error codes */
993 	if (xec < smca_mce_descs[bank_type].num_descs &&
994 			(hwid->xec_bitmap & BIT_ULL(xec))) {
995 		pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
996 	}
997 
998 	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
999 		decode_dram_ecc(cpu_to_node(m->extcpu), m);
1000 }
1001 
1002 static inline void amd_decode_err_code(u16 ec)
1003 {
1004 	if (INT_ERROR(ec)) {
1005 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
1006 		return;
1007 	}
1008 
1009 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
1010 
1011 	if (BUS_ERROR(ec))
1012 		pr_cont(", mem/io: %s", II_MSG(ec));
1013 	else
1014 		pr_cont(", tx: %s", TT_MSG(ec));
1015 
1016 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
1017 		pr_cont(", mem-tx: %s", R4_MSG(ec));
1018 
1019 		if (BUS_ERROR(ec))
1020 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1021 	}
1022 
1023 	pr_cont("\n");
1024 }
1025 
1026 static const char *decode_error_status(struct mce *m)
1027 {
1028 	if (m->status & MCI_STATUS_UC) {
1029 		if (m->status & MCI_STATUS_PCC)
1030 			return "System Fatal error.";
1031 		if (m->mcgstatus & MCG_STATUS_RIPV)
1032 			return "Uncorrected, software restartable error.";
1033 		return "Uncorrected, software containable error.";
1034 	}
1035 
1036 	if (m->status & MCI_STATUS_DEFERRED)
1037 		return "Deferred error, no action required.";
1038 
1039 	return "Corrected error, no action required.";
1040 }
1041 
/*
 * MCE decode-chain notifier callback.
 *
 * @nb and @val are unused; @data points to the struct mce being reported.
 *
 * Prints the severity summary, a breakdown of the status flags, the optional
 * address/PPIN/IPID/syndrome/TSC fields, a bank-specific decode (SMCA or the
 * legacy per-bank decoders) and finally the generic error code breakdown.
 *
 * Returns NOTIFY_DONE without printing anything when the record was already
 * handled by the CEC; otherwise marks it MCE_HANDLED_EDAC and returns
 * NOTIFY_OK.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	/* Start the MCx_STATUS flag list; it is extended with pr_cont() below. */
	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		/* TCC is only shown when the bank's config MSR has MCAX set. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together (status bits 46:45) */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	/* Close the flag list and append the raw status value. */
	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		/* SMCA systems skip the TSC line and the legacy bank decoders. */
		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	/* Legacy decode: the bank number selects the decoder; others are skipped. */
	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	/* The generic error code is the low 16 bits of the status. */
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
1158 
/* Notifier block hooking amd_decode_mce() into the MCE decode chain. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
1163 
1164 static int __init mce_amd_init(void)
1165 {
1166 	struct cpuinfo_x86 *c = &boot_cpu_data;
1167 
1168 	if (c->x86_vendor != X86_VENDOR_AMD &&
1169 	    c->x86_vendor != X86_VENDOR_HYGON)
1170 		return -ENODEV;
1171 
1172 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1173 		xec_mask = 0x3f;
1174 		goto out;
1175 	}
1176 
1177 	switch (c->x86) {
1178 	case 0xf:
1179 		fam_ops.mc0_mce = k8_mc0_mce;
1180 		fam_ops.mc1_mce = k8_mc1_mce;
1181 		fam_ops.mc2_mce = k8_mc2_mce;
1182 		break;
1183 
1184 	case 0x10:
1185 		fam_ops.mc0_mce = f10h_mc0_mce;
1186 		fam_ops.mc1_mce = k8_mc1_mce;
1187 		fam_ops.mc2_mce = k8_mc2_mce;
1188 		break;
1189 
1190 	case 0x11:
1191 		fam_ops.mc0_mce = k8_mc0_mce;
1192 		fam_ops.mc1_mce = k8_mc1_mce;
1193 		fam_ops.mc2_mce = k8_mc2_mce;
1194 		break;
1195 
1196 	case 0x12:
1197 		fam_ops.mc0_mce = f12h_mc0_mce;
1198 		fam_ops.mc1_mce = k8_mc1_mce;
1199 		fam_ops.mc2_mce = k8_mc2_mce;
1200 		break;
1201 
1202 	case 0x14:
1203 		fam_ops.mc0_mce = cat_mc0_mce;
1204 		fam_ops.mc1_mce = cat_mc1_mce;
1205 		fam_ops.mc2_mce = k8_mc2_mce;
1206 		break;
1207 
1208 	case 0x15:
1209 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1210 
1211 		fam_ops.mc0_mce = f15h_mc0_mce;
1212 		fam_ops.mc1_mce = f15h_mc1_mce;
1213 		fam_ops.mc2_mce = f15h_mc2_mce;
1214 		break;
1215 
1216 	case 0x16:
1217 		xec_mask = 0x1f;
1218 		fam_ops.mc0_mce = cat_mc0_mce;
1219 		fam_ops.mc1_mce = cat_mc1_mce;
1220 		fam_ops.mc2_mce = f16h_mc2_mce;
1221 		break;
1222 
1223 	case 0x17:
1224 	case 0x18:
1225 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1226 		return -EINVAL;
1227 
1228 	default:
1229 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1230 		return -EINVAL;
1231 	}
1232 
1233 out:
1234 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1235 
1236 	mce_register_decode_chain(&amd_mce_dec_nb);
1237 
1238 	return 0;
1239 }
1240 early_initcall(mce_amd_init);
1241 
#ifdef MODULE
/* Module unload: take the decoder off the MCE decode chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1253