xref: /openbmc/linux/drivers/edac/mce_amd.c (revision c35977b0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
9 static struct amd_decoder_ops fam_ops;
10 
11 static u8 xec_mask	 = 0xf;
12 
13 static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
amd_register_ecc_decoder(void (* f)(int,struct mce *))15 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
16 {
17 	decode_dram_ecc = f;
18 }
19 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20 
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	if (decode_dram_ecc) {
24 		WARN_ON(decode_dram_ecc != f);
25 
26 		decode_dram_ecc = NULL;
27 	}
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30 
31 /*
32  * string representation for the different MCA reported error types, see F3x48
33  * or MSR0000_0411.
34  */
35 
/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
38 
/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
41 
/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
46 
/* participating processor (exported, so not static like its siblings) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
50 
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
53 
/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
56 
/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
59 
/*
 * MC1 descriptions consumed by f15h_mc1_mce(). The array is packed, so the
 * index is not the raw xec everywhere:
 *   xec 0x0..0xa   -> index xec
 *   xec 0xd        -> index xec - 2
 *   xec 0x10..0x15 -> index xec - 4
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",
};
80 
/*
 * MC2 descriptions consumed by f15h_mc2_mce(). The array is packed:
 *   xec 0x4..0xc   -> index xec - 0x4
 *   xec 0x10..0x14 -> index xec - 0x7
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",
};
97 
/*
 * MC4 error descriptions. The index is the position in this packed array,
 * not necessarily the raw xec (note the jump to xec 0x1c at index 15).
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",
};
119 
/* MC5 error descriptions, one string per array position. */
static const char * const mc5_mce_desc[] = {
	[0]  = "CPU Watchdog timer expire",
	[1]  = "Wakeup array dest tag",
	[2]  = "AG payload array",
	[3]  = "EX payload array",
	[4]  = "IDRF array",
	[5]  = "Retire dispatch queue",
	[6]  = "Mapper checkpoint array",
	[7]  = "Physical register file EX0 port",
	[8]  = "Physical register file EX1 port",
	[9]  = "Physical register file AG0 port",
	[10] = "Physical register file AG1 port",
	[11] = "Flag register file",
	[12] = "DE error occurred",
	[13] = "Retire status queue",
};
136 
/* MC6 error descriptions, one string per array position. */
static const char * const mc6_mce_desc[] = {
	[0] = "Hardware Assertion",
	[1] = "Free List",
	[2] = "Physical Register File",
	[3] = "Retire Queue",
	[4] = "Scheduler table",
	[5] = "Status Register File",
};
145 
/* Scalable MCA error strings */

/* Descriptions for the SMCA_LS bank type (see smca_mce_descs). */
static const char * const smca_ls_mce_desc[] = {
	[0]  = "Load queue parity error",
	[1]  = "Store queue parity error",
	[2]  = "Miss address buffer payload parity error",
	[3]  = "Level 1 TLB parity error",
	[4]  = "DC Tag error type 5",
	[5]  = "DC Tag error type 6",
	[6]  = "DC Tag error type 1",
	[7]  = "Internal error type 1",
	[8]  = "Internal error type 2",
	[9]  = "System Read Data Error Thread 0",
	[10] = "System Read Data Error Thread 1",
	[11] = "DC Tag error type 2",
	[12] = "DC Data error type 1 and poison consumption",
	[13] = "DC Data error type 2",
	[14] = "DC Data error type 3",
	[15] = "DC Tag error type 4",
	[16] = "Level 2 TLB parity error",
	[17] = "PDC parity error",
	[18] = "DC Tag error type 3",
	[19] = "DC Tag error type 5",
	[20] = "L2 Fill Data error",
};
170 
/* Descriptions for the SMCA_LS_V2 bank type (see smca_mce_descs). */
static const char * const smca_ls2_mce_desc[] = {
	[0]  = "An ECC error was detected on a data cache read by a probe or victimization",
	[1]  = "An ECC error or L2 poison was detected on a data cache read by a load",
	[2]  = "An ECC error was detected on a data cache read-modify-write by a store",
	[3]  = "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	[4]  = "An ECC error or poison bit mismatch was detected on a tag read by a load",
	[5]  = "An ECC error or poison bit mismatch was detected on a tag read by a store",
	[6]  = "An ECC error was detected on an EMEM read by a load",
	[7]  = "An ECC error was detected on an EMEM read-modify-write by a store",
	[8]  = "A parity error was detected in an L1 TLB entry by any access",
	[9]  = "A parity error was detected in an L2 TLB entry by any access",
	[10] = "A parity error was detected in a PWC entry by any access",
	[11] = "A parity error was detected in an STQ entry by any access",
	[12] = "A parity error was detected in an LDQ entry by any access",
	[13] = "A parity error was detected in a MAB entry by any access",
	[14] = "A parity error was detected in an SCB entry state field by any access",
	[15] = "A parity error was detected in an SCB entry address field by any access",
	[16] = "A parity error was detected in an SCB entry data field by any access",
	[17] = "A parity error was detected in a WCB entry by any access",
	[18] = "A poisoned line was detected in an SCB entry by any access",
	[19] = "A SystemReadDataError error was reported on read data returned from L2 for a load",
	[20] = "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	[21] = "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	[22] = "A hardware assertion error was reported",
	[23] = "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
197 
/* Descriptions for the SMCA_IF bank type (see smca_mce_descs). */
static const char * const smca_if_mce_desc[] = {
	[0]  = "Op Cache Microtag Probe Port Parity Error",
	[1]  = "IC Microtag or Full Tag Multi-hit Error",
	[2]  = "IC Full Tag Parity Error",
	[3]  = "IC Data Array Parity Error",
	[4]  = "Decoupling Queue PhysAddr Parity Error",
	[5]  = "L0 ITLB Parity Error",
	[6]  = "L1 ITLB Parity Error",
	[7]  = "L2 ITLB Parity Error",
	[8]  = "BPQ Thread 0 Snoop Parity Error",
	[9]  = "BPQ Thread 1 Snoop Parity Error",
	[10] = "L1 BTB Multi-Match Error",
	[11] = "L2 BTB Multi-Match Error",
	[12] = "L2 Cache Response Poison Error",
	[13] = "System Read Data Error",
	[14] = "Hardware Assertion Error",
	[15] = "L1-TLB Multi-Hit",
	[16] = "L2-TLB Multi-Hit",
	[17] = "BSR Parity Error",
	[18] = "CT MCE",
};
219 
/* Descriptions for the SMCA_L2_CACHE bank type (see smca_mce_descs). */
static const char * const smca_l2_mce_desc[] = {
	[0] = "L2M Tag Multiple-Way-Hit error",
	[1] = "L2M Tag or State Array ECC Error",
	[2] = "L2M Data Array ECC Error",
	[3] = "Hardware Assert Error",
};
226 
/* Descriptions for the SMCA_DE bank type (see smca_mce_descs). */
static const char * const smca_de_mce_desc[] = {
	[0] = "Micro-op cache tag parity error",
	[1] = "Micro-op cache data parity error",
	[2] = "Instruction buffer parity error",
	[3] = "Micro-op queue parity error",
	[4] = "Instruction dispatch queue parity error",
	[5] = "Fetch address FIFO parity error",
	[6] = "Patch RAM data parity error",
	[7] = "Patch RAM sequencer parity error",
	[8] = "Micro-op buffer parity error",
	[9] = "Hardware Assertion MCA Error",
};
239 
/* Descriptions for the SMCA_EX bank type (see smca_mce_descs). */
static const char * const smca_ex_mce_desc[] = {
	[0]  = "Watchdog Timeout error",
	[1]  = "Physical register file parity error",
	[2]  = "Flag register file parity error",
	[3]  = "Immediate displacement register file parity error",
	[4]  = "Address generator payload parity error",
	[5]  = "EX payload parity error",
	[6]  = "Checkpoint queue parity error",
	[7]  = "Retire dispatch queue parity error",
	[8]  = "Retire status queue parity error",
	[9]  = "Scheduling queue parity error",
	[10] = "Branch buffer queue parity error",
	[11] = "Hardware Assertion error",
	[12] = "Spec Map parity error",
	[13] = "Retire Map parity error",
};
256 
/* Descriptions for the SMCA_FP bank type (see smca_mce_descs). */
static const char * const smca_fp_mce_desc[] = {
	[0] = "Physical register file (PRF) parity error",
	[1] = "Freelist (FL) parity error",
	[2] = "Schedule queue parity error",
	[3] = "NSQ parity error",
	[4] = "Retire queue (RQ) parity error",
	[5] = "Status register file (SRF) parity error",
	[6] = "Hardware assertion",
};
266 
/* Descriptions for the SMCA_L3_CACHE bank type (see smca_mce_descs). */
static const char * const smca_l3_mce_desc[] = {
	[0] = "Shadow Tag Macro ECC Error",
	[1] = "Shadow Tag Macro Multi-way-hit Error",
	[2] = "L3M Tag ECC Error",
	[3] = "L3M Tag Multi-way-hit Error",
	[4] = "L3M Data ECC Error",
	[5] = "SDP Parity Error or SystemReadDataError from XI",
	[6] = "L3 Victim Queue Parity Error",
	[7] = "L3 Hardware Assertion",
};
277 
/* Descriptions for the SMCA_CS bank type (see smca_mce_descs). */
static const char * const smca_cs_mce_desc[] = {
	[0] = "Illegal Request",
	[1] = "Address Violation",
	[2] = "Security Violation",
	[3] = "Illegal Response",
	[4] = "Unexpected Response",
	[5] = "Request or Probe Parity Error",
	[6] = "Read Response Parity Error",
	[7] = "Atomic Request Parity Error",
	[8] = "Probe Filter ECC Error",
};
289 
/* Descriptions for the SMCA_CS_V2 bank type (see smca_mce_descs). */
static const char * const smca_cs2_mce_desc[] = {
	[0]  = "Illegal Request",
	[1]  = "Address Violation",
	[2]  = "Security Violation",
	[3]  = "Illegal Response",
	[4]  = "Unexpected Response",
	[5]  = "Request or Probe Parity Error",
	[6]  = "Read Response Parity Error",
	[7]  = "Atomic Request Parity Error",
	[8]  = "SDP read response had no match in the CS queue",
	[9]  = "Probe Filter Protocol Error",
	[10] = "Probe Filter ECC Error",
	[11] = "SDP read response had an unexpected RETRY error",
	[12] = "Counter overflow error",
	[13] = "Counter underflow error",
};
306 
/* Descriptions for the SMCA_PIE bank type (see smca_mce_descs). */
static const char * const smca_pie_mce_desc[] = {
	[0] = "Hardware Assert",
	[1] = "Register security violation",
	[2] = "Link Error",
	[3] = "Poison data consumption",
	[4] = "A deferred error was detected in the DF",
};
314 
/* Descriptions for the SMCA_UMC bank type (see smca_mce_descs). */
static const char * const smca_umc_mce_desc[] = {
	[0] = "DRAM ECC error",
	[1] = "Data poison error",
	[2] = "SDP parity error",
	[3] = "Advanced peripheral bus error",
	[4] = "Address/Command parity error",
	[5] = "Write data CRC error",
	[6] = "DCQ SRAM ECC error",
	[7] = "AES SRAM ECC error",
};
325 
/* Descriptions for the SMCA_UMC_V2 bank type (see smca_mce_descs). */
static const char * const smca_umc2_mce_desc[] = {
	[0]  = "DRAM ECC error",
	[1]  = "Data poison error",
	[2]  = "SDP parity error",
	[3]  = "Reserved",
	[4]  = "Address/Command parity error",
	[5]  = "Write data parity error",
	[6]  = "DCQ SRAM ECC error",
	[7]  = "Reserved",
	[8]  = "Read data parity error",
	[9]  = "Rdb SRAM ECC error",
	[10] = "RdRsp SRAM ECC error",
	[11] = "LM32 MP errors",
};
340 
/* Descriptions for the SMCA_PB bank type (see smca_mce_descs). */
static const char * const smca_pb_mce_desc[] = {
	[0] = "An ECC error in the Parameter Block RAM array",
};
344 
/* Descriptions for the SMCA_PSP bank type (see smca_mce_descs). */
static const char * const smca_psp_mce_desc[] = {
	[0] = "An ECC or parity error in a PSP RAM instance",
};
348 
/* Descriptions for the SMCA_PSP_V2 bank type (see smca_mce_descs). */
static const char * const smca_psp2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Instruction Cache Bank 0 ECC or parity error",
	[3]  = "Instruction Cache Bank 1 ECC or parity error",
	[4]  = "Instruction Tag Ram 0 parity error",
	[5]  = "Instruction Tag Ram 1 parity error",
	[6]  = "Data Cache Bank 0 ECC or parity error",
	[7]  = "Data Cache Bank 1 ECC or parity error",
	[8]  = "Data Cache Bank 2 ECC or parity error",
	[9]  = "Data Cache Bank 3 ECC or parity error",
	[10] = "Data Tag Bank 0 parity error",
	[11] = "Data Tag Bank 1 parity error",
	[12] = "Data Tag Bank 2 parity error",
	[13] = "Data Tag Bank 3 parity error",
	[14] = "Dirty Data Ram parity error",
	[15] = "TLB Bank 0 parity error",
	[16] = "TLB Bank 1 parity error",
	[17] = "System Hub Read Buffer ECC or parity error",
};
369 
/* Descriptions for the SMCA_SMU bank type (see smca_mce_descs). */
static const char * const smca_smu_mce_desc[] = {
	[0] = "An ECC or parity error in an SMU RAM instance",
};
373 
/* Descriptions for the SMCA_SMU_V2 bank type (see smca_mce_descs). */
static const char * const smca_smu2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Data Cache Bank A ECC or parity error",
	[3]  = "Data Cache Bank B ECC or parity error",
	[4]  = "Data Tag Cache Bank A ECC or parity error",
	[5]  = "Data Tag Cache Bank B ECC or parity error",
	[6]  = "Instruction Cache Bank A ECC or parity error",
	[7]  = "Instruction Cache Bank B ECC or parity error",
	[8]  = "Instruction Tag Cache Bank A ECC or parity error",
	[9]  = "Instruction Tag Cache Bank B ECC or parity error",
	[10] = "System Hub Read Buffer ECC or parity error",
	[11] = "PHY RAM ECC error",
};
388 
/* Descriptions for the SMCA_MP5 bank type (see smca_mce_descs). */
static const char * const smca_mp5_mce_desc[] = {
	[0] = "High SRAM ECC or parity error",
	[1] = "Low SRAM ECC or parity error",
	[2] = "Data Cache Bank A ECC or parity error",
	[3] = "Data Cache Bank B ECC or parity error",
	[4] = "Data Tag Cache Bank A ECC or parity error",
	[5] = "Data Tag Cache Bank B ECC or parity error",
	[6] = "Instruction Cache Bank A ECC or parity error",
	[7] = "Instruction Cache Bank B ECC or parity error",
	[8] = "Instruction Tag Cache Bank A ECC or parity error",
	[9] = "Instruction Tag Cache Bank B ECC or parity error",
};
401 
/*
 * Descriptions for the SMCA_MPDMA bank type (see smca_mce_descs).
 * Entries 4-11, 12-19 and 20-27 repeat the same eight cache strings.
 */
static const char * const smca_mpdma_mce_desc[] = {
	[0]  = "Main SRAM [31:0] bank ECC or parity error",
	[1]  = "Main SRAM [63:32] bank ECC or parity error",
	[2]  = "Main SRAM [95:64] bank ECC or parity error",
	[3]  = "Main SRAM [127:96] bank ECC or parity error",
	[4]  = "Data Cache Bank A ECC or parity error",
	[5]  = "Data Cache Bank B ECC or parity error",
	[6]  = "Data Tag Cache Bank A ECC or parity error",
	[7]  = "Data Tag Cache Bank B ECC or parity error",
	[8]  = "Instruction Cache Bank A ECC or parity error",
	[9]  = "Instruction Cache Bank B ECC or parity error",
	[10] = "Instruction Tag Cache Bank A ECC or parity error",
	[11] = "Instruction Tag Cache Bank B ECC or parity error",
	[12] = "Data Cache Bank A ECC or parity error",
	[13] = "Data Cache Bank B ECC or parity error",
	[14] = "Data Tag Cache Bank A ECC or parity error",
	[15] = "Data Tag Cache Bank B ECC or parity error",
	[16] = "Instruction Cache Bank A ECC or parity error",
	[17] = "Instruction Cache Bank B ECC or parity error",
	[18] = "Instruction Tag Cache Bank A ECC or parity error",
	[19] = "Instruction Tag Cache Bank B ECC or parity error",
	[20] = "Data Cache Bank A ECC or parity error",
	[21] = "Data Cache Bank B ECC or parity error",
	[22] = "Data Tag Cache Bank A ECC or parity error",
	[23] = "Data Tag Cache Bank B ECC or parity error",
	[24] = "Instruction Cache Bank A ECC or parity error",
	[25] = "Instruction Cache Bank B ECC or parity error",
	[26] = "Instruction Tag Cache Bank A ECC or parity error",
	[27] = "Instruction Tag Cache Bank B ECC or parity error",
	[28] = "System Hub Read Buffer ECC or parity error",
	[29] = "MPDMA TVF DVSEC Memory ECC or parity error",
	[30] = "MPDMA TVF MMIO Mailbox0 ECC or parity error",
	[31] = "MPDMA TVF MMIO Mailbox1 ECC or parity error",
	[32] = "MPDMA TVF Doorbell Memory ECC or parity error",
	[33] = "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
	[34] = "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
	[35] = "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
	[36] = "MPDMA TVF SDP Master Memory 0 ECC or parity error",
	[37] = "MPDMA TVF SDP Master Memory 1 ECC or parity error",
	[38] = "MPDMA TVF SDP Master Memory 2 ECC or parity error",
	[39] = "MPDMA TVF SDP Master Memory 3 ECC or parity error",
	[40] = "MPDMA TVF SDP Master Memory 4 ECC or parity error",
	[41] = "MPDMA TVF SDP Master Memory 5 ECC or parity error",
	[42] = "MPDMA TVF SDP Master Memory 6 ECC or parity error",
	[43] = "MPDMA PTE Command FIFO ECC or parity error",
	[44] = "MPDMA PTE Hub Data FIFO ECC or parity error",
	[45] = "MPDMA PTE Internal Data FIFO ECC or parity error",
	[46] = "MPDMA PTE Command Memory DMA ECC or parity error",
	[47] = "MPDMA PTE Command Memory Internal ECC or parity error",
	[48] = "MPDMA PTE DMA Completion FIFO ECC or parity error",
	[49] = "MPDMA PTE Tablewalk Completion FIFO ECC or parity error",
	[50] = "MPDMA PTE Descriptor Completion FIFO ECC or parity error",
	[51] = "MPDMA PTE ReadOnly Completion FIFO ECC or parity error",
	[52] = "MPDMA PTE DirectWrite Completion FIFO ECC or parity error",
	[53] = "SDP Watchdog Timer expired",
};
458 
/* Descriptions for the SMCA_NBIO bank type (see smca_mce_descs). */
static const char * const smca_nbio_mce_desc[] = {
	[0] = "ECC or Parity error",
	[1] = "PCIE error",
	[2] = "SDP ErrEvent error",
	[3] = "SDP Egress Poison Error",
	[4] = "IOHC Internal Poison Error",
};
466 
/* Descriptions for the SMCA_PCIE bank type (see smca_mce_descs). */
static const char * const smca_pcie_mce_desc[] = {
	[0] = "CCIX PER Message logging",
	[1] = "CCIX Read Response with Status: Non-Data Error",
	[2] = "CCIX Write Response with Status: Non-Data Error",
	[3] = "CCIX Read Response with Status: Data Error",
	[4] = "CCIX Non-okay write response with data error",
};
474 
/* Descriptions for the SMCA_PCIE_V2 bank type (see smca_mce_descs). */
static const char * const smca_pcie2_mce_desc[] = {
	[0] = "SDP Parity Error logging",
};
478 
/* Descriptions for the SMCA_XGMI_PCS bank type (see smca_mce_descs). */
static const char * const smca_xgmipcs_mce_desc[] = {
	[0]  = "Data Loss Error",
	[1]  = "Training Error",
	[2]  = "Flow Control Acknowledge Error",
	[3]  = "Rx Fifo Underflow Error",
	[4]  = "Rx Fifo Overflow Error",
	[5]  = "CRC Error",
	[6]  = "BER Exceeded Error",
	[7]  = "Tx Vcid Data Error",
	[8]  = "Replay Buffer Parity Error",
	[9]  = "Data Parity Error",
	[10] = "Replay Fifo Overflow Error",
	[11] = "Replay Fifo Underflow Error",
	[12] = "Elastic Fifo Overflow Error",
	[13] = "Deskew Error",
	[14] = "Flow Control CRC Error",
	[15] = "Data Startup Limit Error",
	[16] = "FC Init Timeout Error",
	[17] = "Recovery Timeout Error",
	[18] = "Ready Serial Timeout Error",
	[19] = "Ready Serial Attempt Error",
	[20] = "Recovery Attempt Error",
	[21] = "Recovery Relock Attempt Error",
	[22] = "Replay Attempt Error",
	[23] = "Sync Header Error",
	[24] = "Tx Replay Timeout Error",
	[25] = "Rx Replay Timeout Error",
	[26] = "LinkSub Tx Timeout Error",
	[27] = "LinkSub Rx Timeout Error",
	[28] = "Rx CMD Packet Error",
};
510 
/*
 * PHY descriptions; smca_mce_descs uses this one table for SMCA_XGMI_PHY,
 * SMCA_WAFL_PHY and SMCA_GMI_PHY.
 */
static const char * const smca_xgmiphy_mce_desc[] = {
	[0] = "RAM ECC Error",
	[1] = "ARC instruction buffer parity error",
	[2] = "ARC data buffer parity error",
	[3] = "PHY APB error",
};
517 
/* Descriptions shared by the SMCA_NBIF and SMCA_SHUB bank types (see smca_mce_descs). */
static const char * const smca_nbif_mce_desc[] = {
	[0] = "Timeout error from GMI",
	[1] = "SRAM ECC error",
	[2] = "NTB Error Event",
	[3] = "SDP Parity error",
};
524 
/* Descriptions for the SMCA_SATA bank type (see smca_mce_descs). */
static const char * const smca_sata_mce_desc[] = {
	[0] = "Parity error for port 0",
	[1] = "Parity error for port 1",
	[2] = "Parity error for port 2",
	[3] = "Parity error for port 3",
	[4] = "Parity error for port 4",
	[5] = "Parity error for port 5",
	[6] = "Parity error for port 6",
	[7] = "Parity error for port 7",
};
535 
/* Descriptions for the SMCA_USB bank type (see smca_mce_descs). */
static const char * const smca_usb_mce_desc[] = {
	[0] = "Parity error or ECC error for S0 RAM0",
	[1] = "Parity error or ECC error for S0 RAM1",
	[2] = "Parity error or ECC error for S0 RAM2",
	[3] = "Parity error for PHY RAM0",
	[4] = "Parity error for PHY RAM1",
	[5] = "AXI Slave Response error",
};
544 
/* Descriptions for the SMCA_GMI_PCS bank type (see smca_mce_descs). */
static const char * const smca_gmipcs_mce_desc[] = {
	[0]  = "Data Loss Error",
	[1]  = "Training Error",
	[2]  = "Replay Parity Error",
	[3]  = "Rx Fifo Underflow Error",
	[4]  = "Rx Fifo Overflow Error",
	[5]  = "CRC Error",
	[6]  = "BER Exceeded Error",
	[7]  = "Tx Fifo Underflow Error",
	[8]  = "Replay Buffer Parity Error",
	[9]  = "Tx Overflow Error",
	[10] = "Replay Fifo Overflow Error",
	[11] = "Replay Fifo Underflow Error",
	[12] = "Elastic Fifo Overflow Error",
	[13] = "Deskew Error",
	[14] = "Offline Error",
	[15] = "Data Startup Limit Error",
	[16] = "FC Init Timeout Error",
	[17] = "Recovery Timeout Error",
	[18] = "Ready Serial Timeout Error",
	[19] = "Ready Serial Attempt Error",
	[20] = "Recovery Attempt Error",
	[21] = "Recovery Relock Attempt Error",
	[22] = "Deskew Abort Error",
	[23] = "Rx Buffer Error",
	[24] = "Rx LFDS Fifo Overflow Error",
	[25] = "Rx LFDS Fifo Underflow Error",
	[26] = "LinkSub Tx Timeout Error",
	[27] = "LinkSub Rx Timeout Error",
	[28] = "Rx CMD Packet Error",
	[29] = "LFDS Training Timeout Error",
	[30] = "LFDS FC Init Timeout Error",
	[31] = "Data Loss Error",
};
579 
/* One description table together with its length. */
struct smca_mce_desc {
	/* Array of description strings. */
	const char * const *descs;
	/* Number of entries in @descs. */
	unsigned int num_descs;
};
584 
585 static struct smca_mce_desc smca_mce_descs[] = {
586 	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
587 	[SMCA_LS_V2]	= { smca_ls2_mce_desc,	ARRAY_SIZE(smca_ls2_mce_desc)	},
588 	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
589 	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
590 	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
591 	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
592 	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
593 	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
594 	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
595 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
596 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
597 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
598 	[SMCA_UMC_V2]	= { smca_umc2_mce_desc,	ARRAY_SIZE(smca_umc2_mce_desc)	},
599 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
600 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
601 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
602 	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
603 	[SMCA_SMU_V2]	= { smca_smu2_mce_desc,	ARRAY_SIZE(smca_smu2_mce_desc)	},
604 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
605 	[SMCA_MPDMA]	= { smca_mpdma_mce_desc,	ARRAY_SIZE(smca_mpdma_mce_desc)	},
606 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
607 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
608 	[SMCA_PCIE_V2]	= { smca_pcie2_mce_desc,   ARRAY_SIZE(smca_pcie2_mce_desc)	},
609 	[SMCA_XGMI_PCS]	= { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc)	},
610 	/* NBIF and SHUB have the same error descriptions, for now. */
611 	[SMCA_NBIF]	= { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc)	},
612 	[SMCA_SHUB]	= { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc)	},
613 	[SMCA_SATA]	= { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc)	},
614 	[SMCA_USB]	= { smca_usb_mce_desc,	ARRAY_SIZE(smca_usb_mce_desc)	},
615 	[SMCA_GMI_PCS]	= { smca_gmipcs_mce_desc,  ARRAY_SIZE(smca_gmipcs_mce_desc)	},
616 	/* All the PHY bank types have the same error descriptions, for now. */
617 	[SMCA_XGMI_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
618 	[SMCA_WAFL_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
619 	[SMCA_GMI_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
620 };
621 
f12h_mc0_mce(u16 ec,u8 xec)622 static bool f12h_mc0_mce(u16 ec, u8 xec)
623 {
624 	bool ret = false;
625 
626 	if (MEM_ERROR(ec)) {
627 		u8 ll = LL(ec);
628 		ret = true;
629 
630 		if (ll == LL_L2)
631 			pr_cont("during L1 linefill from L2.\n");
632 		else if (ll == LL_L1)
633 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
634 		else
635 			ret = false;
636 	}
637 	return ret;
638 }
639 
f10h_mc0_mce(u16 ec,u8 xec)640 static bool f10h_mc0_mce(u16 ec, u8 xec)
641 {
642 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
643 		pr_cont("during data scrub.\n");
644 		return true;
645 	}
646 	return f12h_mc0_mce(ec, xec);
647 }
648 
k8_mc0_mce(u16 ec,u8 xec)649 static bool k8_mc0_mce(u16 ec, u8 xec)
650 {
651 	if (BUS_ERROR(ec)) {
652 		pr_cont("during system linefill.\n");
653 		return true;
654 	}
655 
656 	return f10h_mc0_mce(ec, xec);
657 }
658 
cat_mc0_mce(u16 ec,u8 xec)659 static bool cat_mc0_mce(u16 ec, u8 xec)
660 {
661 	u8 r4	 = R4(ec);
662 	bool ret = true;
663 
664 	if (MEM_ERROR(ec)) {
665 
666 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
667 			return false;
668 
669 		switch (r4) {
670 		case R4_DRD:
671 		case R4_DWR:
672 			pr_cont("Data/Tag parity error due to %s.\n",
673 				(r4 == R4_DRD ? "load/hw prf" : "store"));
674 			break;
675 		case R4_EVICT:
676 			pr_cont("Copyback parity error on a tag miss.\n");
677 			break;
678 		case R4_SNOOP:
679 			pr_cont("Tag parity error during snoop.\n");
680 			break;
681 		default:
682 			ret = false;
683 		}
684 	} else if (BUS_ERROR(ec)) {
685 
686 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
687 			return false;
688 
689 		pr_cont("System read data error on a ");
690 
691 		switch (r4) {
692 		case R4_RD:
693 			pr_cont("TLB reload.\n");
694 			break;
695 		case R4_DWR:
696 			pr_cont("store.\n");
697 			break;
698 		case R4_DRD:
699 			pr_cont("load.\n");
700 			break;
701 		default:
702 			ret = false;
703 		}
704 	} else {
705 		ret = false;
706 	}
707 
708 	return ret;
709 }
710 
f15h_mc0_mce(u16 ec,u8 xec)711 static bool f15h_mc0_mce(u16 ec, u8 xec)
712 {
713 	bool ret = true;
714 
715 	if (MEM_ERROR(ec)) {
716 
717 		switch (xec) {
718 		case 0x0:
719 			pr_cont("Data Array access error.\n");
720 			break;
721 
722 		case 0x1:
723 			pr_cont("UC error during a linefill from L2/NB.\n");
724 			break;
725 
726 		case 0x2:
727 		case 0x11:
728 			pr_cont("STQ access error.\n");
729 			break;
730 
731 		case 0x3:
732 			pr_cont("SCB access error.\n");
733 			break;
734 
735 		case 0x10:
736 			pr_cont("Tag error.\n");
737 			break;
738 
739 		case 0x12:
740 			pr_cont("LDQ access error.\n");
741 			break;
742 
743 		default:
744 			ret = false;
745 		}
746 	} else if (BUS_ERROR(ec)) {
747 
748 		if (!xec)
749 			pr_cont("System Read Data Error.\n");
750 		else
751 			pr_cont(" Internal error condition type %d.\n", xec);
752 	} else if (INT_ERROR(ec)) {
753 		if (xec <= 0x1f)
754 			pr_cont("Hardware Assert.\n");
755 		else
756 			ret = false;
757 
758 	} else
759 		ret = false;
760 
761 	return ret;
762 }
763 
decode_mc0_mce(struct mce * m)764 static void decode_mc0_mce(struct mce *m)
765 {
766 	u16 ec = EC(m->status);
767 	u8 xec = XEC(m->status, xec_mask);
768 
769 	pr_emerg(HW_ERR "MC0 Error: ");
770 
771 	/* TLB error signatures are the same across families */
772 	if (TLB_ERROR(ec)) {
773 		if (TT(ec) == TT_DATA) {
774 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
775 				((xec == 2) ? "locked miss"
776 					    : (xec ? "multimatch" : "parity")));
777 			return;
778 		}
779 	} else if (fam_ops.mc0_mce(ec, xec))
780 		;
781 	else
782 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
783 }
784 
k8_mc1_mce(u16 ec,u8 xec)785 static bool k8_mc1_mce(u16 ec, u8 xec)
786 {
787 	u8 ll	 = LL(ec);
788 	bool ret = true;
789 
790 	if (!MEM_ERROR(ec))
791 		return false;
792 
793 	if (ll == 0x2)
794 		pr_cont("during a linefill from L2.\n");
795 	else if (ll == 0x1) {
796 		switch (R4(ec)) {
797 		case R4_IRD:
798 			pr_cont("Parity error during data load.\n");
799 			break;
800 
801 		case R4_EVICT:
802 			pr_cont("Copyback Parity/Victim error.\n");
803 			break;
804 
805 		case R4_SNOOP:
806 			pr_cont("Tag Snoop error.\n");
807 			break;
808 
809 		default:
810 			ret = false;
811 			break;
812 		}
813 	} else
814 		ret = false;
815 
816 	return ret;
817 }
818 
cat_mc1_mce(u16 ec,u8 xec)819 static bool cat_mc1_mce(u16 ec, u8 xec)
820 {
821 	u8 r4    = R4(ec);
822 	bool ret = true;
823 
824 	if (!MEM_ERROR(ec))
825 		return false;
826 
827 	if (TT(ec) != TT_INSTR)
828 		return false;
829 
830 	if (r4 == R4_IRD)
831 		pr_cont("Data/tag array parity error for a tag hit.\n");
832 	else if (r4 == R4_SNOOP)
833 		pr_cont("Tag error during snoop/victimization.\n");
834 	else if (xec == 0x0)
835 		pr_cont("Tag parity error from victim castout.\n");
836 	else if (xec == 0x2)
837 		pr_cont("Microcode patch RAM parity error.\n");
838 	else
839 		ret = false;
840 
841 	return ret;
842 }
843 
f15h_mc1_mce(u16 ec,u8 xec)844 static bool f15h_mc1_mce(u16 ec, u8 xec)
845 {
846 	bool ret = true;
847 
848 	if (!MEM_ERROR(ec))
849 		return false;
850 
851 	switch (xec) {
852 	case 0x0 ... 0xa:
853 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
854 		break;
855 
856 	case 0xd:
857 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
858 		break;
859 
860 	case 0x10:
861 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
862 		break;
863 
864 	case 0x11 ... 0x15:
865 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
866 		break;
867 
868 	default:
869 		ret = false;
870 	}
871 	return ret;
872 }
873 
decode_mc1_mce(struct mce * m)874 static void decode_mc1_mce(struct mce *m)
875 {
876 	u16 ec = EC(m->status);
877 	u8 xec = XEC(m->status, xec_mask);
878 
879 	pr_emerg(HW_ERR "MC1 Error: ");
880 
881 	if (TLB_ERROR(ec))
882 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
883 			(xec ? "multimatch" : "parity error"));
884 	else if (BUS_ERROR(ec)) {
885 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
886 
887 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
888 	} else if (INT_ERROR(ec)) {
889 		if (xec <= 0x3f)
890 			pr_cont("Hardware Assert.\n");
891 		else
892 			goto wrong_mc1_mce;
893 	} else if (fam_ops.mc1_mce(ec, xec))
894 		;
895 	else
896 		goto wrong_mc1_mce;
897 
898 	return;
899 
900 wrong_mc1_mce:
901 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
902 }
903 
k8_mc2_mce(u16 ec,u8 xec)904 static bool k8_mc2_mce(u16 ec, u8 xec)
905 {
906 	bool ret = true;
907 
908 	if (xec == 0x1)
909 		pr_cont(" in the write data buffers.\n");
910 	else if (xec == 0x3)
911 		pr_cont(" in the victim data buffers.\n");
912 	else if (xec == 0x2 && MEM_ERROR(ec))
913 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
914 	else if (xec == 0x0) {
915 		if (TLB_ERROR(ec))
916 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
917 				TT_MSG(ec));
918 		else if (BUS_ERROR(ec))
919 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
920 				R4_MSG(ec), PP_MSG(ec));
921 		else if (MEM_ERROR(ec)) {
922 			u8 r4 = R4(ec);
923 
924 			if (r4 >= 0x7)
925 				pr_cont(": %s error during data copyback.\n",
926 					R4_MSG(ec));
927 			else if (r4 <= 0x1)
928 				pr_cont(": %s parity/ECC error during data "
929 					"access from L2.\n", R4_MSG(ec));
930 			else
931 				ret = false;
932 		} else
933 			ret = false;
934 	} else
935 		ret = false;
936 
937 	return ret;
938 }
939 
f15h_mc2_mce(u16 ec,u8 xec)940 static bool f15h_mc2_mce(u16 ec, u8 xec)
941 {
942 	bool ret = true;
943 
944 	if (TLB_ERROR(ec)) {
945 		if (xec == 0x0)
946 			pr_cont("Data parity TLB read error.\n");
947 		else if (xec == 0x1)
948 			pr_cont("Poison data provided for TLB fill.\n");
949 		else
950 			ret = false;
951 	} else if (BUS_ERROR(ec)) {
952 		if (xec > 2)
953 			ret = false;
954 
955 		pr_cont("Error during attempted NB data read.\n");
956 	} else if (MEM_ERROR(ec)) {
957 		switch (xec) {
958 		case 0x4 ... 0xc:
959 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
960 			break;
961 
962 		case 0x10 ... 0x14:
963 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
964 			break;
965 
966 		default:
967 			ret = false;
968 		}
969 	} else if (INT_ERROR(ec)) {
970 		if (xec <= 0x3f)
971 			pr_cont("Hardware Assert.\n");
972 		else
973 			ret = false;
974 	}
975 
976 	return ret;
977 }
978 
f16h_mc2_mce(u16 ec,u8 xec)979 static bool f16h_mc2_mce(u16 ec, u8 xec)
980 {
981 	u8 r4 = R4(ec);
982 
983 	if (!MEM_ERROR(ec))
984 		return false;
985 
986 	switch (xec) {
987 	case 0x04 ... 0x05:
988 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
989 		break;
990 
991 	case 0x09 ... 0x0b:
992 	case 0x0d ... 0x0f:
993 		pr_cont("ECC error in L2 tag (%s).\n",
994 			((r4 == R4_GEN)   ? "BankReq" :
995 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
996 		break;
997 
998 	case 0x10 ... 0x19:
999 	case 0x1b:
1000 		pr_cont("ECC error in L2 data array (%s).\n",
1001 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
1002 			((r4 == R4_GEN)   ? "Attr" :
1003 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
1004 		break;
1005 
1006 	case 0x1c ... 0x1d:
1007 	case 0x1f:
1008 		pr_cont("Parity error in L2 attribute bits (%s).\n",
1009 			((r4 == R4_RD)  ? "Hit"  :
1010 			((r4 == R4_GEN) ? "Attr" : "Fill")));
1011 		break;
1012 
1013 	default:
1014 		return false;
1015 	}
1016 
1017 	return true;
1018 }
1019 
decode_mc2_mce(struct mce * m)1020 static void decode_mc2_mce(struct mce *m)
1021 {
1022 	u16 ec = EC(m->status);
1023 	u8 xec = XEC(m->status, xec_mask);
1024 
1025 	pr_emerg(HW_ERR "MC2 Error: ");
1026 
1027 	if (!fam_ops.mc2_mce(ec, xec))
1028 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
1029 }
1030 
decode_mc3_mce(struct mce * m)1031 static void decode_mc3_mce(struct mce *m)
1032 {
1033 	u16 ec = EC(m->status);
1034 	u8 xec = XEC(m->status, xec_mask);
1035 
1036 	if (boot_cpu_data.x86 >= 0x14) {
1037 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
1038 			 " please report on LKML.\n");
1039 		return;
1040 	}
1041 
1042 	pr_emerg(HW_ERR "MC3 Error");
1043 
1044 	if (xec == 0x0) {
1045 		u8 r4 = R4(ec);
1046 
1047 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
1048 			goto wrong_mc3_mce;
1049 
1050 		pr_cont(" during %s.\n", R4_MSG(ec));
1051 	} else
1052 		goto wrong_mc3_mce;
1053 
1054 	return;
1055 
1056  wrong_mc3_mce:
1057 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
1058 }
1059 
decode_mc4_mce(struct mce * m)1060 static void decode_mc4_mce(struct mce *m)
1061 {
1062 	unsigned int fam = x86_family(m->cpuid);
1063 	int node_id = topology_die_id(m->extcpu);
1064 	u16 ec = EC(m->status);
1065 	u8 xec = XEC(m->status, 0x1f);
1066 	u8 offset = 0;
1067 
1068 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
1069 
1070 	switch (xec) {
1071 	case 0x0 ... 0xe:
1072 
1073 		/* special handling for DRAM ECCs */
1074 		if (xec == 0x0 || xec == 0x8) {
1075 			/* no ECCs on F11h */
1076 			if (fam == 0x11)
1077 				goto wrong_mc4_mce;
1078 
1079 			pr_cont("%s.\n", mc4_mce_desc[xec]);
1080 
1081 			if (decode_dram_ecc)
1082 				decode_dram_ecc(node_id, m);
1083 			return;
1084 		}
1085 		break;
1086 
1087 	case 0xf:
1088 		if (TLB_ERROR(ec))
1089 			pr_cont("GART Table Walk data error.\n");
1090 		else if (BUS_ERROR(ec))
1091 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
1092 		else
1093 			goto wrong_mc4_mce;
1094 		return;
1095 
1096 	case 0x19:
1097 		if (fam == 0x15 || fam == 0x16)
1098 			pr_cont("Compute Unit Data Error.\n");
1099 		else
1100 			goto wrong_mc4_mce;
1101 		return;
1102 
1103 	case 0x1c ... 0x1f:
1104 		offset = 13;
1105 		break;
1106 
1107 	default:
1108 		goto wrong_mc4_mce;
1109 	}
1110 
1111 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
1112 	return;
1113 
1114  wrong_mc4_mce:
1115 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
1116 }
1117 
decode_mc5_mce(struct mce * m)1118 static void decode_mc5_mce(struct mce *m)
1119 {
1120 	unsigned int fam = x86_family(m->cpuid);
1121 	u16 ec = EC(m->status);
1122 	u8 xec = XEC(m->status, xec_mask);
1123 
1124 	if (fam == 0xf || fam == 0x11)
1125 		goto wrong_mc5_mce;
1126 
1127 	pr_emerg(HW_ERR "MC5 Error: ");
1128 
1129 	if (INT_ERROR(ec)) {
1130 		if (xec <= 0x1f) {
1131 			pr_cont("Hardware Assert.\n");
1132 			return;
1133 		} else
1134 			goto wrong_mc5_mce;
1135 	}
1136 
1137 	if (xec == 0x0 || xec == 0xc)
1138 		pr_cont("%s.\n", mc5_mce_desc[xec]);
1139 	else if (xec <= 0xd)
1140 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
1141 	else
1142 		goto wrong_mc5_mce;
1143 
1144 	return;
1145 
1146  wrong_mc5_mce:
1147 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
1148 }
1149 
decode_mc6_mce(struct mce * m)1150 static void decode_mc6_mce(struct mce *m)
1151 {
1152 	u8 xec = XEC(m->status, xec_mask);
1153 
1154 	pr_emerg(HW_ERR "MC6 Error: ");
1155 
1156 	if (xec > 0x5)
1157 		goto wrong_mc6_mce;
1158 
1159 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
1160 	return;
1161 
1162  wrong_mc6_mce:
1163 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
1164 }
1165 
1166 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)1167 static void decode_smca_error(struct mce *m)
1168 {
1169 	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
1170 	const char *ip_name;
1171 	u8 xec = XEC(m->status, xec_mask);
1172 
1173 	if (bank_type >= N_SMCA_BANK_TYPES)
1174 		return;
1175 
1176 	if (bank_type == SMCA_RESERVED) {
1177 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
1178 		return;
1179 	}
1180 
1181 	ip_name = smca_get_long_name(bank_type);
1182 
1183 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
1184 
1185 	/* Only print the decode of valid error codes */
1186 	if (xec < smca_mce_descs[bank_type].num_descs)
1187 		pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
1188 
1189 	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
1190 	    xec == 0 && decode_dram_ecc)
1191 		decode_dram_ecc(topology_die_id(m->extcpu), m);
1192 }
1193 
/*
 * Print the generic breakdown of the 16-bit error code on one line.
 *
 * Internal errors only get their internal error type. All other codes get
 * the cache level followed by:
 *   - bus errors:    mem/io type, memory transaction, participating
 *                    processor and timeout state
 *   - memory errors: transaction type and memory transaction
 *   - anything else: transaction type only
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec)) {
		pr_cont(", mem/io: %s", II_MSG(ec));
		pr_cont(", mem-tx: %s", R4_MSG(ec));
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	} else {
		pr_cont(", tx: %s", TT_MSG(ec));

		if (MEM_ERROR(ec))
			pr_cont(", mem-tx: %s", R4_MSG(ec));
	}

	pr_cont("\n");
}
1217 
decode_error_status(struct mce * m)1218 static const char *decode_error_status(struct mce *m)
1219 {
1220 	if (m->status & MCI_STATUS_UC) {
1221 		if (m->status & MCI_STATUS_PCC)
1222 			return "System Fatal error.";
1223 		if (m->mcgstatus & MCG_STATUS_RIPV)
1224 			return "Uncorrected, software restartable error.";
1225 		return "Uncorrected, software containable error.";
1226 	}
1227 
1228 	if (m->status & MCI_STATUS_DEFERRED)
1229 		return "Deferred error, no action required.";
1230 
1231 	return "Corrected error, no action required.";
1232 }
1233 
1234 static int
amd_decode_mce(struct notifier_block * nb,unsigned long val,void * data)1235 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1236 {
1237 	struct mce *m = (struct mce *)data;
1238 	unsigned int fam = x86_family(m->cpuid);
1239 	int ecc;
1240 
1241 	if (m->kflags & MCE_HANDLED_CEC)
1242 		return NOTIFY_DONE;
1243 
1244 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1245 
1246 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1247 		m->extcpu,
1248 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1249 		m->bank,
1250 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
1251 		((m->status & MCI_STATUS_UC)	? "UE"	  :
1252 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1253 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
1254 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
1255 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
1256 
1257 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1258 		u32 low, high;
1259 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1260 
1261 		if (!rdmsr_safe(addr, &low, &high) &&
1262 		    (low & MCI_CONFIG_MCAX))
1263 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1264 
1265 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1266 	}
1267 
1268 	/* do the two bits[14:13] together */
1269 	ecc = (m->status >> 45) & 0x3;
1270 	if (ecc)
1271 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1272 
1273 	if (fam >= 0x15) {
1274 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1275 
1276 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
1277 		if (fam != 0x15 || m->bank != 4)
1278 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1279 	}
1280 
1281 	if (fam >= 0x17)
1282 		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1283 
1284 	pr_cont("]: 0x%016llx\n", m->status);
1285 
1286 	if (m->status & MCI_STATUS_ADDRV)
1287 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1288 
1289 	if (m->ppin)
1290 		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
1291 
1292 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1293 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1294 
1295 		if (m->status & MCI_STATUS_SYNDV)
1296 			pr_cont(", Syndrome: 0x%016llx", m->synd);
1297 
1298 		pr_cont("\n");
1299 
1300 		decode_smca_error(m);
1301 		goto err_code;
1302 	}
1303 
1304 	if (m->tsc)
1305 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1306 
1307 	/* Doesn't matter which member to test. */
1308 	if (!fam_ops.mc0_mce)
1309 		goto err_code;
1310 
1311 	switch (m->bank) {
1312 	case 0:
1313 		decode_mc0_mce(m);
1314 		break;
1315 
1316 	case 1:
1317 		decode_mc1_mce(m);
1318 		break;
1319 
1320 	case 2:
1321 		decode_mc2_mce(m);
1322 		break;
1323 
1324 	case 3:
1325 		decode_mc3_mce(m);
1326 		break;
1327 
1328 	case 4:
1329 		decode_mc4_mce(m);
1330 		break;
1331 
1332 	case 5:
1333 		decode_mc5_mce(m);
1334 		break;
1335 
1336 	case 6:
1337 		decode_mc6_mce(m);
1338 		break;
1339 
1340 	default:
1341 		break;
1342 	}
1343 
1344  err_code:
1345 	amd_decode_err_code(m->status & 0xffff);
1346 
1347 	m->kflags |= MCE_HANDLED_EDAC;
1348 	return NOTIFY_OK;
1349 }
1350 
1351 static struct notifier_block amd_mce_dec_nb = {
1352 	.notifier_call	= amd_decode_mce,
1353 	.priority	= MCE_PRIO_EDAC,
1354 };
1355 
mce_amd_init(void)1356 static int __init mce_amd_init(void)
1357 {
1358 	struct cpuinfo_x86 *c = &boot_cpu_data;
1359 
1360 	if (c->x86_vendor != X86_VENDOR_AMD &&
1361 	    c->x86_vendor != X86_VENDOR_HYGON)
1362 		return -ENODEV;
1363 
1364 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
1365 		return -ENODEV;
1366 
1367 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1368 		xec_mask = 0x3f;
1369 		goto out;
1370 	}
1371 
1372 	switch (c->x86) {
1373 	case 0xf:
1374 		fam_ops.mc0_mce = k8_mc0_mce;
1375 		fam_ops.mc1_mce = k8_mc1_mce;
1376 		fam_ops.mc2_mce = k8_mc2_mce;
1377 		break;
1378 
1379 	case 0x10:
1380 		fam_ops.mc0_mce = f10h_mc0_mce;
1381 		fam_ops.mc1_mce = k8_mc1_mce;
1382 		fam_ops.mc2_mce = k8_mc2_mce;
1383 		break;
1384 
1385 	case 0x11:
1386 		fam_ops.mc0_mce = k8_mc0_mce;
1387 		fam_ops.mc1_mce = k8_mc1_mce;
1388 		fam_ops.mc2_mce = k8_mc2_mce;
1389 		break;
1390 
1391 	case 0x12:
1392 		fam_ops.mc0_mce = f12h_mc0_mce;
1393 		fam_ops.mc1_mce = k8_mc1_mce;
1394 		fam_ops.mc2_mce = k8_mc2_mce;
1395 		break;
1396 
1397 	case 0x14:
1398 		fam_ops.mc0_mce = cat_mc0_mce;
1399 		fam_ops.mc1_mce = cat_mc1_mce;
1400 		fam_ops.mc2_mce = k8_mc2_mce;
1401 		break;
1402 
1403 	case 0x15:
1404 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1405 
1406 		fam_ops.mc0_mce = f15h_mc0_mce;
1407 		fam_ops.mc1_mce = f15h_mc1_mce;
1408 		fam_ops.mc2_mce = f15h_mc2_mce;
1409 		break;
1410 
1411 	case 0x16:
1412 		xec_mask = 0x1f;
1413 		fam_ops.mc0_mce = cat_mc0_mce;
1414 		fam_ops.mc1_mce = cat_mc1_mce;
1415 		fam_ops.mc2_mce = f16h_mc2_mce;
1416 		break;
1417 
1418 	case 0x17:
1419 	case 0x18:
1420 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1421 		return -EINVAL;
1422 
1423 	default:
1424 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1425 		return -EINVAL;
1426 	}
1427 
1428 out:
1429 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1430 
1431 	mce_register_decode_chain(&amd_mce_dec_nb);
1432 
1433 	return 0;
1434 }
1435 early_initcall(mce_amd_init);
1436 
#ifdef MODULE
/* Module unload hook: take the decoder off the MCE decode chain again. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
#endif
1448