1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/module.h> 3 #include <linux/slab.h> 4 5 #include <asm/cpu.h> 6 7 #include "mce_amd.h" 8 9 static struct amd_decoder_ops fam_ops; 10 11 static u8 xec_mask = 0xf; 12 13 static void (*decode_dram_ecc)(int node_id, struct mce *m); 14 15 void amd_register_ecc_decoder(void (*f)(int, struct mce *)) 16 { 17 decode_dram_ecc = f; 18 } 19 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 20 21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) 22 { 23 if (decode_dram_ecc) { 24 WARN_ON(decode_dram_ecc != f); 25 26 decode_dram_ecc = NULL; 27 } 28 } 29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 30 31 /* 32 * string representation for the different MCA reported error types, see F3x48 33 * or MSR0000_0411. 34 */ 35 36 /* transaction type */ 37 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 38 39 /* cache level */ 40 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 41 42 /* memory transaction type */ 43 static const char * const rrrr_msgs[] = { 44 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 45 }; 46 47 /* participating processor */ 48 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 49 EXPORT_SYMBOL_GPL(pp_msgs); 50 51 /* request timeout */ 52 static const char * const to_msgs[] = { "no timeout", "timed out" }; 53 54 /* memory or i/o */ 55 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 56 57 /* internal error type */ 58 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; 59 60 static const char * const f15h_mc1_mce_desc[] = { 61 "UC during a demand linefill from L2", 62 "Parity error during data load from IC", 63 "Parity error for IC valid bit", 64 "Main tag parity error", 65 "Parity error in prediction queue", 66 "PFB data/address parity error", 67 "Parity error in the branch status reg", 68 "PFB promotion address error", 69 "Tag error during probe/victimization", 70 "Parity error for IC probe tag valid bit", 71 "PFB non-cacheable bit parity error", 72 "PFB valid bit parity error", /* xec = 0xd */ 73 "Microcode Patch Buffer", /* xec = 010 */ 74 "uop queue", 75 "insn buffer", 76 "predecode buffer", 77 "fetch address FIFO", 78 "dispatch uop queue" 79 }; 80 81 static const char * const f15h_mc2_mce_desc[] = { 82 "Fill ECC error on data fills", /* xec = 0x4 */ 83 "Fill parity error on insn fills", 84 "Prefetcher request FIFO parity error", 85 "PRQ address parity error", 86 "PRQ data parity error", 87 "WCC Tag ECC error", 88 "WCC Data ECC error", 89 "WCB Data parity error", 90 "VB Data ECC or parity error", 91 "L2 Tag ECC error", /* xec = 0x10 */ 92 "Hard L2 Tag ECC error", 93 "Multiple hits on L2 tag", 94 "XAB parity error", 95 "PRB address parity error" 96 }; 97 98 static const char * const mc4_mce_desc[] = { 99 "DRAM ECC error detected on the NB", 100 "CRC error detected on HT link", 101 "Link-defined sync error packets detected on HT link", 102 "HT Master abort", 103 "HT Target abort", 104 "Invalid GART PTE entry during GART table walk", 105 "Unsupported atomic RMW received from an IO link", 106 "Watchdog timeout due to lack of progress", 107 "DRAM ECC error detected on the NB", 108 "SVM DMA Exclusion Vector error", 109 "HT data error detected on link", 110 "Protocol error (link, L3, probe filter)", 111 "NB internal arrays parity error", 112 "DRAM addr/ctl signals parity error", 113 "IO link transmission error", 114 "L3 data cache ECC error", /* xec = 0x1c */ 115 "L3 cache tag error", 116 "L3 LRU parity bits error", 117 "ECC Error in the Probe Filter directory" 118 }; 119 120 static const char * const mc5_mce_desc[] = { 121 "CPU Watchdog timer expire", 122 "Wakeup array dest tag", 123 "AG payload array", 124 "EX payload array", 125 "IDRF array", 126 "Retire dispatch queue", 127 "Mapper checkpoint array", 128 "Physical register file EX0 port", 129 "Physical register file EX1 port", 130 "Physical register file AG0 port", 131 "Physical register file AG1 port", 132 "Flag register file", 133 "DE error occurred", 134 "Retire status queue" 135 }; 136 137 static const char * const mc6_mce_desc[] = { 138 "Hardware Assertion", 139 "Free List", 140 "Physical Register File", 141 "Retire Queue", 142 "Scheduler table", 143 "Status Register File", 144 }; 145 146 /* Scalable MCA error strings */ 147 static const char * const smca_ls_mce_desc[] = { 148 "Load queue parity error", 149 "Store queue parity error", 150 "Miss address buffer payload parity error", 151 "Level 1 TLB parity error", 152 "DC Tag error type 5", 153 "DC Tag error type 6", 154 "DC Tag error type 1", 155 "Internal error type 1", 156 "Internal error type 2", 157 "System Read Data Error Thread 0", 158 "System Read Data Error Thread 1", 159 "DC Tag error type 2", 160 "DC Data error type 1 and poison consumption", 161 "DC Data error type 2", 162 "DC Data error type 3", 163 "DC Tag error type 4", 164 "Level 2 TLB parity error", 165 "PDC parity error", 166 "DC Tag error type 3", 167 "DC Tag error type 5", 168 "L2 Fill Data error", 169 }; 170 171 static const char * const smca_ls2_mce_desc[] = { 172 "An ECC error was detected on a data cache read by a probe or victimization", 173 "An ECC error or L2 poison was detected on a data cache read by a load", 174 "An ECC error was detected on a data cache read-modify-write by a store", 175 "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", 176 "An ECC error or poison bit mismatch was detected on a tag read by a load", 177 "An ECC error or poison bit mismatch was detected on a tag read by a store", 178 "An ECC error was detected on an EMEM read by a load", 179 "An ECC error was detected on an EMEM read-modify-write by a store", 180 "A parity error was detected in an L1 TLB entry by any access", 181 "A parity error was detected in an L2 TLB entry by any access", 182 "A parity error was detected in a PWC entry by any access", 183 "A parity error was detected in an STQ entry by any access", 184 "A parity error was detected in an LDQ entry by any access", 185 "A parity error was detected in a MAB entry by any access", 186 "A parity error was detected in an SCB entry state field by any access", 187 "A parity error was detected in an SCB entry address field by any access", 188 "A parity error was detected in an SCB entry data field by any access", 189 "A parity error was detected in a WCB entry by any access", 190 "A poisoned line was detected in an SCB entry by any access", 191 "A SystemReadDataError error was reported on read data returned from L2 for a load", 192 "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", 193 "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", 194 "A hardware assertion error was reported", 195 "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", 196 }; 197 198 static const char * const smca_if_mce_desc[] = { 199 "Op Cache Microtag Probe Port Parity Error", 200 "IC Microtag or Full Tag Multi-hit Error", 201 "IC Full Tag Parity Error", 202 "IC Data Array Parity Error", 203 "Decoupling Queue PhysAddr Parity Error", 204 "L0 ITLB Parity Error", 205 "L1 ITLB Parity Error", 206 "L2 ITLB Parity Error", 207 "BPQ Thread 0 Snoop Parity Error", 208 "BPQ Thread 1 Snoop Parity Error", 209 "L1 BTB Multi-Match Error", 210 "L2 BTB Multi-Match Error", 211 "L2 Cache Response Poison Error", 212 "System Read Data Error", 213 }; 214 215 static const char * const smca_l2_mce_desc[] = { 216 "L2M Tag Multiple-Way-Hit error", 217 "L2M Tag or State Array ECC Error", 218 "L2M Data Array ECC Error", 219 "Hardware Assert Error", 220 }; 221 222 static const char * const smca_de_mce_desc[] = { 223 "Micro-op cache tag parity error", 224 "Micro-op cache data parity error", 225 "Instruction buffer parity error", 226 "Micro-op queue parity error", 227 "Instruction dispatch queue parity error", 228 "Fetch address FIFO parity error", 229 "Patch RAM data parity error", 230 "Patch RAM sequencer parity error", 231 "Micro-op buffer parity error" 232 }; 233 234 static const char * const smca_ex_mce_desc[] = { 235 "Watchdog Timeout error", 236 "Physical register file parity error", 237 "Flag register file parity error", 238 "Immediate displacement register file parity error", 239 "Address generator payload parity error", 240 "EX payload parity error", 241 "Checkpoint queue parity error", 242 "Retire dispatch queue parity error", 243 "Retire status queue parity error", 244 "Scheduling queue parity error", 245 "Branch buffer queue parity error", 246 "Hardware Assertion error", 247 }; 248 249 static const char * const smca_fp_mce_desc[] = { 250 "Physical register file (PRF) parity error", 251 "Freelist (FL) parity error", 252 "Schedule queue parity error", 253 "NSQ parity error", 254 "Retire queue (RQ) parity error", 255 "Status register file (SRF) parity error", 256 "Hardware assertion", 257 }; 258 259 static const char * const smca_l3_mce_desc[] = { 260 "Shadow Tag Macro ECC Error", 261 "Shadow Tag Macro Multi-way-hit Error", 262 "L3M Tag ECC Error", 263 "L3M Tag Multi-way-hit Error", 264 "L3M Data ECC Error", 265 "SDP Parity Error or SystemReadDataError from XI", 266 "L3 Victim Queue Parity Error", 267 "L3 Hardware Assertion", 268 }; 269 270 static const char * const smca_cs_mce_desc[] = { 271 "Illegal Request", 272 "Address Violation", 273 "Security Violation", 274 "Illegal Response", 275 "Unexpected Response", 276 "Request or Probe Parity Error", 277 "Read Response Parity Error", 278 "Atomic Request Parity Error", 279 "Probe Filter ECC Error", 280 }; 281 282 static const char * const smca_cs2_mce_desc[] = { 283 "Illegal Request", 284 "Address Violation", 285 "Security Violation", 286 "Illegal Response", 287 "Unexpected Response", 288 "Request or Probe Parity Error", 289 "Read Response Parity Error", 290 "Atomic Request Parity Error", 291 "SDP read response had no match in the CS queue", 292 "Probe Filter Protocol Error", 293 "Probe Filter ECC Error", 294 "SDP read response had an unexpected RETRY error", 295 "Counter overflow error", 296 "Counter underflow error", 297 }; 298 299 static const char * const smca_pie_mce_desc[] = { 300 "Hardware Assert", 301 "Register security violation", 302 "Link Error", 303 "Poison data consumption", 304 "A deferred error was detected in the DF" 305 }; 306 307 static const char * const smca_umc_mce_desc[] = { 308 "DRAM ECC error", 309 "Data poison error", 310 "SDP parity error", 311 "Advanced peripheral bus error", 312 "Address/Command parity error", 313 "Write data CRC error", 314 "DCQ SRAM ECC error", 315 "AES SRAM ECC error", 316 }; 317 318 static const char * const smca_pb_mce_desc[] = { 319 "An ECC error in the Parameter Block RAM array", 320 }; 321 322 static const char * const smca_psp_mce_desc[] = { 323 "An ECC or parity error in a PSP RAM instance", 324 }; 325 326 static const char * const smca_psp2_mce_desc[] = { 327 "High SRAM ECC or parity error", 328 "Low SRAM ECC or parity error", 329 "Instruction Cache Bank 0 ECC or parity error", 330 "Instruction Cache Bank 1 ECC or parity error", 331 "Instruction Tag Ram 0 parity error", 332 "Instruction Tag Ram 1 parity error", 333 "Data Cache Bank 0 ECC or parity error", 334 "Data Cache Bank 1 ECC or parity error", 335 "Data Cache Bank 2 ECC or parity error", 336 "Data Cache Bank 3 ECC or parity error", 337 "Data Tag Bank 0 parity error", 338 "Data Tag Bank 1 parity error", 339 "Data Tag Bank 2 parity error", 340 "Data Tag Bank 3 parity error", 341 "Dirty Data Ram parity error", 342 "TLB Bank 0 parity error", 343 "TLB Bank 1 parity error", 344 "System Hub Read Buffer ECC or parity error", 345 }; 346 347 static const char * const smca_smu_mce_desc[] = { 348 "An ECC or parity error in an SMU RAM instance", 349 }; 350 351 static const char * const smca_smu2_mce_desc[] = { 352 "High SRAM ECC or parity error", 353 "Low SRAM ECC or parity error", 354 "Data Cache Bank A ECC or parity error", 355 "Data Cache Bank B ECC or parity error", 356 "Data Tag Cache Bank A ECC or parity error", 357 "Data Tag Cache Bank B ECC or parity error", 358 "Instruction Cache Bank A ECC or parity error", 359 "Instruction Cache Bank B ECC or parity error", 360 "Instruction Tag Cache Bank A ECC or parity error", 361 "Instruction Tag Cache Bank B ECC or parity error", 362 "System Hub Read Buffer ECC or parity error", 363 }; 364 365 static const char * const smca_mp5_mce_desc[] = { 366 "High SRAM ECC or parity error", 367 "Low SRAM ECC or parity error", 368 "Data Cache Bank A ECC or parity error", 369 "Data Cache Bank B ECC or parity error", 370 "Data Tag Cache Bank A ECC or parity error", 371 "Data Tag Cache Bank B ECC or parity error", 372 "Instruction Cache Bank A ECC or parity error", 373 "Instruction Cache Bank B ECC or parity error", 374 "Instruction Tag Cache Bank A ECC or parity error", 375 "Instruction Tag Cache Bank B ECC or parity error", 376 }; 377 378 static const char * const smca_nbio_mce_desc[] = { 379 "ECC or Parity error", 380 "PCIE error", 381 "SDP ErrEvent error", 382 "SDP Egress Poison Error", 383 "IOHC Internal Poison Error", 384 }; 385 386 static const char * const smca_pcie_mce_desc[] = { 387 "CCIX PER Message logging", 388 "CCIX Read Response with Status: Non-Data Error", 389 "CCIX Write Response with Status: Non-Data Error", 390 "CCIX Read Response with Status: Data Error", 391 "CCIX Non-okay write response with data error", 392 }; 393 394 struct smca_mce_desc { 395 const char * const *descs; 396 unsigned int num_descs; 397 }; 398 399 static struct smca_mce_desc smca_mce_descs[] = { 400 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, 401 [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, 402 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, 403 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, 404 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, 405 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, 406 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, 407 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, 408 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, 409 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, 410 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, 411 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, 412 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, 413 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, 414 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, 415 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, 416 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, 417 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, 418 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, 419 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, 420 }; 421 422 static bool f12h_mc0_mce(u16 ec, u8 xec) 423 { 424 bool ret = false; 425 426 if (MEM_ERROR(ec)) { 427 u8 ll = LL(ec); 428 ret = true; 429 430 if (ll == LL_L2) 431 pr_cont("during L1 linefill from L2.\n"); 432 else if (ll == LL_L1) 433 pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); 434 else 435 ret = false; 436 } 437 return ret; 438 } 439 440 static bool f10h_mc0_mce(u16 ec, u8 xec) 441 { 442 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { 443 pr_cont("during data scrub.\n"); 444 return true; 445 } 446 return f12h_mc0_mce(ec, xec); 447 } 448 449 static bool k8_mc0_mce(u16 ec, u8 xec) 450 { 451 if (BUS_ERROR(ec)) { 452 pr_cont("during system linefill.\n"); 453 return true; 454 } 455 456 return f10h_mc0_mce(ec, xec); 457 } 458 459 static bool cat_mc0_mce(u16 ec, u8 xec) 460 { 461 u8 r4 = R4(ec); 462 bool ret = true; 463 464 if (MEM_ERROR(ec)) { 465 466 if (TT(ec) != TT_DATA || LL(ec) != LL_L1) 467 return false; 468 469 switch (r4) { 470 case R4_DRD: 471 case R4_DWR: 472 pr_cont("Data/Tag parity error due to %s.\n", 473 (r4 == R4_DRD ? "load/hw prf" : "store")); 474 break; 475 case R4_EVICT: 476 pr_cont("Copyback parity error on a tag miss.\n"); 477 break; 478 case R4_SNOOP: 479 pr_cont("Tag parity error during snoop.\n"); 480 break; 481 default: 482 ret = false; 483 } 484 } else if (BUS_ERROR(ec)) { 485 486 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) 487 return false; 488 489 pr_cont("System read data error on a "); 490 491 switch (r4) { 492 case R4_RD: 493 pr_cont("TLB reload.\n"); 494 break; 495 case R4_DWR: 496 pr_cont("store.\n"); 497 break; 498 case R4_DRD: 499 pr_cont("load.\n"); 500 break; 501 default: 502 ret = false; 503 } 504 } else { 505 ret = false; 506 } 507 508 return ret; 509 } 510 511 static bool f15h_mc0_mce(u16 ec, u8 xec) 512 { 513 bool ret = true; 514 515 if (MEM_ERROR(ec)) { 516 517 switch (xec) { 518 case 0x0: 519 pr_cont("Data Array access error.\n"); 520 break; 521 522 case 0x1: 523 pr_cont("UC error during a linefill from L2/NB.\n"); 524 break; 525 526 case 0x2: 527 case 0x11: 528 pr_cont("STQ access error.\n"); 529 break; 530 531 case 0x3: 532 pr_cont("SCB access error.\n"); 533 break; 534 535 case 0x10: 536 pr_cont("Tag error.\n"); 537 break; 538 539 case 0x12: 540 pr_cont("LDQ access error.\n"); 541 break; 542 543 default: 544 ret = false; 545 } 546 } else if (BUS_ERROR(ec)) { 547 548 if (!xec) 549 pr_cont("System Read Data Error.\n"); 550 else 551 pr_cont(" Internal error condition type %d.\n", xec); 552 } else if (INT_ERROR(ec)) { 553 if (xec <= 0x1f) 554 pr_cont("Hardware Assert.\n"); 555 else 556 ret = false; 557 558 } else 559 ret = false; 560 561 return ret; 562 } 563 564 static void decode_mc0_mce(struct mce *m) 565 { 566 u16 ec = EC(m->status); 567 u8 xec = XEC(m->status, xec_mask); 568 569 pr_emerg(HW_ERR "MC0 Error: "); 570 571 /* TLB error signatures are the same across families */ 572 if (TLB_ERROR(ec)) { 573 if (TT(ec) == TT_DATA) { 574 pr_cont("%s TLB %s.\n", LL_MSG(ec), 575 ((xec == 2) ? "locked miss" 576 : (xec ? "multimatch" : "parity"))); 577 return; 578 } 579 } else if (fam_ops.mc0_mce(ec, xec)) 580 ; 581 else 582 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); 583 } 584 585 static bool k8_mc1_mce(u16 ec, u8 xec) 586 { 587 u8 ll = LL(ec); 588 bool ret = true; 589 590 if (!MEM_ERROR(ec)) 591 return false; 592 593 if (ll == 0x2) 594 pr_cont("during a linefill from L2.\n"); 595 else if (ll == 0x1) { 596 switch (R4(ec)) { 597 case R4_IRD: 598 pr_cont("Parity error during data load.\n"); 599 break; 600 601 case R4_EVICT: 602 pr_cont("Copyback Parity/Victim error.\n"); 603 break; 604 605 case R4_SNOOP: 606 pr_cont("Tag Snoop error.\n"); 607 break; 608 609 default: 610 ret = false; 611 break; 612 } 613 } else 614 ret = false; 615 616 return ret; 617 } 618 619 static bool cat_mc1_mce(u16 ec, u8 xec) 620 { 621 u8 r4 = R4(ec); 622 bool ret = true; 623 624 if (!MEM_ERROR(ec)) 625 return false; 626 627 if (TT(ec) != TT_INSTR) 628 return false; 629 630 if (r4 == R4_IRD) 631 pr_cont("Data/tag array parity error for a tag hit.\n"); 632 else if (r4 == R4_SNOOP) 633 pr_cont("Tag error during snoop/victimization.\n"); 634 else if (xec == 0x0) 635 pr_cont("Tag parity error from victim castout.\n"); 636 else if (xec == 0x2) 637 pr_cont("Microcode patch RAM parity error.\n"); 638 else 639 ret = false; 640 641 return ret; 642 } 643 644 static bool f15h_mc1_mce(u16 ec, u8 xec) 645 { 646 bool ret = true; 647 648 if (!MEM_ERROR(ec)) 649 return false; 650 651 switch (xec) { 652 case 0x0 ... 0xa: 653 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); 654 break; 655 656 case 0xd: 657 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); 658 break; 659 660 case 0x10: 661 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); 662 break; 663 664 case 0x11 ... 0x15: 665 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); 666 break; 667 668 default: 669 ret = false; 670 } 671 return ret; 672 } 673 674 static void decode_mc1_mce(struct mce *m) 675 { 676 u16 ec = EC(m->status); 677 u8 xec = XEC(m->status, xec_mask); 678 679 pr_emerg(HW_ERR "MC1 Error: "); 680 681 if (TLB_ERROR(ec)) 682 pr_cont("%s TLB %s.\n", LL_MSG(ec), 683 (xec ? "multimatch" : "parity error")); 684 else if (BUS_ERROR(ec)) { 685 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); 686 687 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 688 } else if (INT_ERROR(ec)) { 689 if (xec <= 0x3f) 690 pr_cont("Hardware Assert.\n"); 691 else 692 goto wrong_mc1_mce; 693 } else if (fam_ops.mc1_mce(ec, xec)) 694 ; 695 else 696 goto wrong_mc1_mce; 697 698 return; 699 700 wrong_mc1_mce: 701 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); 702 } 703 704 static bool k8_mc2_mce(u16 ec, u8 xec) 705 { 706 bool ret = true; 707 708 if (xec == 0x1) 709 pr_cont(" in the write data buffers.\n"); 710 else if (xec == 0x3) 711 pr_cont(" in the victim data buffers.\n"); 712 else if (xec == 0x2 && MEM_ERROR(ec)) 713 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); 714 else if (xec == 0x0) { 715 if (TLB_ERROR(ec)) 716 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n", 717 TT_MSG(ec)); 718 else if (BUS_ERROR(ec)) 719 pr_cont(": %s/ECC error in data read from NB: %s.\n", 720 R4_MSG(ec), PP_MSG(ec)); 721 else if (MEM_ERROR(ec)) { 722 u8 r4 = R4(ec); 723 724 if (r4 >= 0x7) 725 pr_cont(": %s error during data copyback.\n", 726 R4_MSG(ec)); 727 else if (r4 <= 0x1) 728 pr_cont(": %s parity/ECC error during data " 729 "access from L2.\n", R4_MSG(ec)); 730 else 731 ret = false; 732 } else 733 ret = false; 734 } else 735 ret = false; 736 737 return ret; 738 } 739 740 static bool f15h_mc2_mce(u16 ec, u8 xec) 741 { 742 bool ret = true; 743 744 if (TLB_ERROR(ec)) { 745 if (xec == 0x0) 746 pr_cont("Data parity TLB read error.\n"); 747 else if (xec == 0x1) 748 pr_cont("Poison data provided for TLB fill.\n"); 749 else 750 ret = false; 751 } else if (BUS_ERROR(ec)) { 752 if (xec > 2) 753 ret = false; 754 755 pr_cont("Error during attempted NB data read.\n"); 756 } else if (MEM_ERROR(ec)) { 757 switch (xec) { 758 case 0x4 ... 0xc: 759 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); 760 break; 761 762 case 0x10 ... 0x14: 763 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); 764 break; 765 766 default: 767 ret = false; 768 } 769 } else if (INT_ERROR(ec)) { 770 if (xec <= 0x3f) 771 pr_cont("Hardware Assert.\n"); 772 else 773 ret = false; 774 } 775 776 return ret; 777 } 778 779 static bool f16h_mc2_mce(u16 ec, u8 xec) 780 { 781 u8 r4 = R4(ec); 782 783 if (!MEM_ERROR(ec)) 784 return false; 785 786 switch (xec) { 787 case 0x04 ... 0x05: 788 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); 789 break; 790 791 case 0x09 ... 0x0b: 792 case 0x0d ... 0x0f: 793 pr_cont("ECC error in L2 tag (%s).\n", 794 ((r4 == R4_GEN) ? "BankReq" : 795 ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); 796 break; 797 798 case 0x10 ... 0x19: 799 case 0x1b: 800 pr_cont("ECC error in L2 data array (%s).\n", 801 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : 802 ((r4 == R4_GEN) ? "Attr" : 803 ((r4 == R4_EVICT) ? "Vict" : "Fill")))); 804 break; 805 806 case 0x1c ... 0x1d: 807 case 0x1f: 808 pr_cont("Parity error in L2 attribute bits (%s).\n", 809 ((r4 == R4_RD) ? "Hit" : 810 ((r4 == R4_GEN) ? "Attr" : "Fill"))); 811 break; 812 813 default: 814 return false; 815 } 816 817 return true; 818 } 819 820 static void decode_mc2_mce(struct mce *m) 821 { 822 u16 ec = EC(m->status); 823 u8 xec = XEC(m->status, xec_mask); 824 825 pr_emerg(HW_ERR "MC2 Error: "); 826 827 if (!fam_ops.mc2_mce(ec, xec)) 828 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); 829 } 830 831 static void decode_mc3_mce(struct mce *m) 832 { 833 u16 ec = EC(m->status); 834 u8 xec = XEC(m->status, xec_mask); 835 836 if (boot_cpu_data.x86 >= 0x14) { 837 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," 838 " please report on LKML.\n"); 839 return; 840 } 841 842 pr_emerg(HW_ERR "MC3 Error"); 843 844 if (xec == 0x0) { 845 u8 r4 = R4(ec); 846 847 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 848 goto wrong_mc3_mce; 849 850 pr_cont(" during %s.\n", R4_MSG(ec)); 851 } else 852 goto wrong_mc3_mce; 853 854 return; 855 856 wrong_mc3_mce: 857 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); 858 } 859 860 static void decode_mc4_mce(struct mce *m) 861 { 862 unsigned int fam = x86_family(m->cpuid); 863 int node_id = amd_get_nb_id(m->extcpu); 864 u16 ec = EC(m->status); 865 u8 xec = XEC(m->status, 0x1f); 866 u8 offset = 0; 867 868 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); 869 870 switch (xec) { 871 case 0x0 ... 0xe: 872 873 /* special handling for DRAM ECCs */ 874 if (xec == 0x0 || xec == 0x8) { 875 /* no ECCs on F11h */ 876 if (fam == 0x11) 877 goto wrong_mc4_mce; 878 879 pr_cont("%s.\n", mc4_mce_desc[xec]); 880 881 if (decode_dram_ecc) 882 decode_dram_ecc(node_id, m); 883 return; 884 } 885 break; 886 887 case 0xf: 888 if (TLB_ERROR(ec)) 889 pr_cont("GART Table Walk data error.\n"); 890 else if (BUS_ERROR(ec)) 891 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 892 else 893 goto wrong_mc4_mce; 894 return; 895 896 case 0x19: 897 if (fam == 0x15 || fam == 0x16) 898 pr_cont("Compute Unit Data Error.\n"); 899 else 900 goto wrong_mc4_mce; 901 return; 902 903 case 0x1c ... 0x1f: 904 offset = 13; 905 break; 906 907 default: 908 goto wrong_mc4_mce; 909 } 910 911 pr_cont("%s.\n", mc4_mce_desc[xec - offset]); 912 return; 913 914 wrong_mc4_mce: 915 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); 916 } 917 918 static void decode_mc5_mce(struct mce *m) 919 { 920 unsigned int fam = x86_family(m->cpuid); 921 u16 ec = EC(m->status); 922 u8 xec = XEC(m->status, xec_mask); 923 924 if (fam == 0xf || fam == 0x11) 925 goto wrong_mc5_mce; 926 927 pr_emerg(HW_ERR "MC5 Error: "); 928 929 if (INT_ERROR(ec)) { 930 if (xec <= 0x1f) { 931 pr_cont("Hardware Assert.\n"); 932 return; 933 } else 934 goto wrong_mc5_mce; 935 } 936 937 if (xec == 0x0 || xec == 0xc) 938 pr_cont("%s.\n", mc5_mce_desc[xec]); 939 else if (xec <= 0xd) 940 pr_cont("%s parity error.\n", mc5_mce_desc[xec]); 941 else 942 goto wrong_mc5_mce; 943 944 return; 945 946 wrong_mc5_mce: 947 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); 948 } 949 950 static void decode_mc6_mce(struct mce *m) 951 { 952 u8 xec = XEC(m->status, xec_mask); 953 954 pr_emerg(HW_ERR "MC6 Error: "); 955 956 if (xec > 0x5) 957 goto wrong_mc6_mce; 958 959 pr_cont("%s parity error.\n", mc6_mce_desc[xec]); 960 return; 961 962 wrong_mc6_mce: 963 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); 964 } 965 966 /* Decode errors according to Scalable MCA specification */ 967 static void decode_smca_error(struct mce *m) 968 { 969 struct smca_hwid *hwid; 970 enum smca_bank_types bank_type; 971 const char *ip_name; 972 u8 xec = XEC(m->status, xec_mask); 973 974 if (m->bank >= ARRAY_SIZE(smca_banks)) 975 return; 976 977 hwid = smca_banks[m->bank].hwid; 978 if (!hwid) 979 return; 980 981 bank_type = hwid->bank_type; 982 983 if (bank_type == SMCA_RESERVED) { 984 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); 985 return; 986 } 987 988 ip_name = smca_get_long_name(bank_type); 989 990 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); 991 992 /* Only print the decode of valid error codes */ 993 if (xec < smca_mce_descs[bank_type].num_descs && 994 (hwid->xec_bitmap & BIT_ULL(xec))) { 995 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); 996 } 997 998 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) 999 decode_dram_ecc(cpu_to_node(m->extcpu), m); 1000 } 1001 1002 static inline void amd_decode_err_code(u16 ec) 1003 { 1004 if (INT_ERROR(ec)) { 1005 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); 1006 return; 1007 } 1008 1009 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); 1010 1011 if (BUS_ERROR(ec)) 1012 pr_cont(", mem/io: %s", II_MSG(ec)); 1013 else 1014 pr_cont(", tx: %s", TT_MSG(ec)); 1015 1016 if (MEM_ERROR(ec) || BUS_ERROR(ec)) { 1017 pr_cont(", mem-tx: %s", R4_MSG(ec)); 1018 1019 if (BUS_ERROR(ec)) 1020 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); 1021 } 1022 1023 pr_cont("\n"); 1024 } 1025 1026 static const char *decode_error_status(struct mce *m) 1027 { 1028 if (m->status & MCI_STATUS_UC) { 1029 if (m->status & MCI_STATUS_PCC) 1030 return "System Fatal error."; 1031 if (m->mcgstatus & MCG_STATUS_RIPV) 1032 return "Uncorrected, software restartable error."; 1033 return "Uncorrected, software containable error."; 1034 } 1035 1036 if (m->status & MCI_STATUS_DEFERRED) 1037 return "Deferred error, no action required."; 1038 1039 return "Corrected error, no action required."; 1040 } 1041 1042 static int 1043 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 1044 { 1045 struct mce *m = (struct mce *)data; 1046 unsigned int fam = x86_family(m->cpuid); 1047 int ecc; 1048 1049 if (m->kflags & MCE_HANDLED_CEC) 1050 return NOTIFY_DONE; 1051 1052 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 1053 1054 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", 1055 m->extcpu, 1056 fam, x86_model(m->cpuid), x86_stepping(m->cpuid), 1057 m->bank, 1058 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), 1059 ((m->status & MCI_STATUS_UC) ? "UE" : 1060 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), 1061 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), 1062 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), 1063 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); 1064 1065 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1066 u32 low, high; 1067 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); 1068 1069 if (!rdmsr_safe(addr, &low, &high) && 1070 (low & MCI_CONFIG_MCAX)) 1071 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); 1072 1073 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); 1074 } 1075 1076 /* do the two bits[14:13] together */ 1077 ecc = (m->status >> 45) & 0x3; 1078 if (ecc) 1079 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); 1080 1081 if (fam >= 0x15) { 1082 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); 1083 1084 /* F15h, bank4, bit 43 is part of McaStatSubCache. */ 1085 if (fam != 0x15 || m->bank != 4) 1086 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); 1087 } 1088 1089 if (fam >= 0x17) 1090 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); 1091 1092 pr_cont("]: 0x%016llx\n", m->status); 1093 1094 if (m->status & MCI_STATUS_ADDRV) 1095 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); 1096 1097 if (m->ppin) 1098 pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin); 1099 1100 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1101 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); 1102 1103 if (m->status & MCI_STATUS_SYNDV) 1104 pr_cont(", Syndrome: 0x%016llx", m->synd); 1105 1106 pr_cont("\n"); 1107 1108 decode_smca_error(m); 1109 goto err_code; 1110 } 1111 1112 if (m->tsc) 1113 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); 1114 1115 /* Doesn't matter which member to test. */ 1116 if (!fam_ops.mc0_mce) 1117 goto err_code; 1118 1119 switch (m->bank) { 1120 case 0: 1121 decode_mc0_mce(m); 1122 break; 1123 1124 case 1: 1125 decode_mc1_mce(m); 1126 break; 1127 1128 case 2: 1129 decode_mc2_mce(m); 1130 break; 1131 1132 case 3: 1133 decode_mc3_mce(m); 1134 break; 1135 1136 case 4: 1137 decode_mc4_mce(m); 1138 break; 1139 1140 case 5: 1141 decode_mc5_mce(m); 1142 break; 1143 1144 case 6: 1145 decode_mc6_mce(m); 1146 break; 1147 1148 default: 1149 break; 1150 } 1151 1152 err_code: 1153 amd_decode_err_code(m->status & 0xffff); 1154 1155 m->kflags |= MCE_HANDLED_EDAC; 1156 return NOTIFY_OK; 1157 } 1158 1159 static struct notifier_block amd_mce_dec_nb = { 1160 .notifier_call = amd_decode_mce, 1161 .priority = MCE_PRIO_EDAC, 1162 }; 1163 1164 static int __init mce_amd_init(void) 1165 { 1166 struct cpuinfo_x86 *c = &boot_cpu_data; 1167 1168 if (c->x86_vendor != X86_VENDOR_AMD && 1169 c->x86_vendor != X86_VENDOR_HYGON) 1170 return -ENODEV; 1171 1172 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1173 xec_mask = 0x3f; 1174 goto out; 1175 } 1176 1177 switch (c->x86) { 1178 case 0xf: 1179 fam_ops.mc0_mce = k8_mc0_mce; 1180 fam_ops.mc1_mce = k8_mc1_mce; 1181 fam_ops.mc2_mce = k8_mc2_mce; 1182 break; 1183 1184 case 0x10: 1185 fam_ops.mc0_mce = f10h_mc0_mce; 1186 fam_ops.mc1_mce = k8_mc1_mce; 1187 fam_ops.mc2_mce = k8_mc2_mce; 1188 break; 1189 1190 case 0x11: 1191 fam_ops.mc0_mce = k8_mc0_mce; 1192 fam_ops.mc1_mce = k8_mc1_mce; 1193 fam_ops.mc2_mce = k8_mc2_mce; 1194 break; 1195 1196 case 0x12: 1197 fam_ops.mc0_mce = f12h_mc0_mce; 1198 fam_ops.mc1_mce = k8_mc1_mce; 1199 fam_ops.mc2_mce = k8_mc2_mce; 1200 break; 1201 1202 case 0x14: 1203 fam_ops.mc0_mce = cat_mc0_mce; 1204 fam_ops.mc1_mce = cat_mc1_mce; 1205 fam_ops.mc2_mce = k8_mc2_mce; 1206 break; 1207 1208 case 0x15: 1209 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f; 1210 1211 fam_ops.mc0_mce = f15h_mc0_mce; 1212 fam_ops.mc1_mce = f15h_mc1_mce; 1213 fam_ops.mc2_mce = f15h_mc2_mce; 1214 break; 1215 1216 case 0x16: 1217 xec_mask = 0x1f; 1218 fam_ops.mc0_mce = cat_mc0_mce; 1219 fam_ops.mc1_mce = cat_mc1_mce; 1220 fam_ops.mc2_mce = f16h_mc2_mce; 1221 break; 1222 1223 case 0x17: 1224 case 0x18: 1225 pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); 1226 return -EINVAL; 1227 1228 default: 1229 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); 1230 return -EINVAL; 1231 } 1232 1233 out: 1234 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 1235 1236 mce_register_decode_chain(&amd_mce_dec_nb); 1237 1238 return 0; 1239 } 1240 early_initcall(mce_amd_init); 1241 1242 #ifdef MODULE 1243 static void __exit mce_amd_exit(void) 1244 { 1245 mce_unregister_decode_chain(&amd_mce_dec_nb); 1246 } 1247 1248 MODULE_DESCRIPTION("AMD MCE decoder"); 1249 MODULE_ALIAS("edac-mce-amd"); 1250 MODULE_LICENSE("GPL"); 1251 module_exit(mce_amd_exit); 1252 #endif 1253