// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>

#include "mce_amd.h"

/* Per-family MC0/MC1/MC2 decode callbacks; populated by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/*
 * Mask passed to XEC() when extracting the extended error code from
 * MCi_STATUS. mce_amd_init() widens it for some families.
 */
static u8 xec_mask = 0xf;

/* When false, ignore_mce() filters out bank 4 errors with xec 0x5. */
static bool report_gart_errors;

/* Optional DRAM ECC decoder hook; NULL while no decoder is registered. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);

/* Enable (@v true) or disable reporting of GART errors. */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

/* Install @f as the DRAM ECC decoder, overwriting any previous hook. */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

/*
 * Clear the DRAM ECC decoder hook. Warns when @f is not the decoder that is
 * currently installed, but the hook is cleared either way.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 *
 * NOTE(review): these tables are presumably consumed through the *_MSG()
 * accessor macros in mce_amd.h; entry order must match the field encoding.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

/*
 * F15h MC1 descriptions. The table is packed: f15h_mc1_mce() indexes it with
 * xec for 0x0-0xa, xec - 2 for 0xd and xec - 4 for 0x10-0x15.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

/*
 * F15h MC2 descriptions. Packed: f15h_mc2_mce() indexes it with xec - 0x4 for
 * 0x4-0xc and xec - 0x7 for 0x10-0x14.
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

/*
 * MC4 (northbridge) descriptions. decode_mc4_mce() indexes it with xec for
 * 0x0-0xe and with xec - 13 for 0x1c-0x1f.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

/* MC5 descriptions, indexed directly by xec (0x0-0xd) in decode_mc5_mce(). */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

/* MC6 descriptions, indexed directly by xec (0x0-0x5) in decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/*
 * Scalable MCA error strings
 *
 * Each smca_*_mce_desc[] table below is indexed directly by the extended
 * error code in decode_smca_error(), so entries must not be reordered.
 */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity error",
	"Store queue parity error",
	"Miss address buffer payload parity error",
	"Level 1 TLB parity error",
	"DC Tag error type 5",
	"DC Tag error type 6",
	"DC Tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"System Read Data Error Thread 0",
	"System Read Data Error Thread 1",
	"DC Tag error type 2",
	"DC Data error type 1 and poison consumption",
	"DC Data error type 2",
	"DC Data error type 3",
	"DC Tag error type 4",
	"Level 2 TLB parity error",
	"PDC parity error",
	"DC Tag error type 3",
	"DC Tag error type 5",
	"L2 Fill Data error",
};

static const char * const smca_ls2_mce_desc[] = {
	"An ECC error was detected on a data cache read by a probe or victimization",
	"An ECC error or L2 poison was detected on a data cache read by a load",
	"An ECC error was detected on a data cache read-modify-write by a store",
	"An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	"An ECC error or poison bit mismatch was detected on a tag read by a load",
	"An ECC error or poison bit mismatch was detected on a tag read by a store",
	"An ECC error was detected on an EMEM read by a load",
	"An ECC error was detected on an EMEM read-modify-write by a store",
	"A parity error was detected in an L1 TLB entry by any access",
	"A parity error was detected in an L2 TLB entry by any access",
	"A parity error was detected in a PWC entry by any access",
	"A parity error was detected in an STQ entry by any access",
	"A parity error was detected in an LDQ entry by any access",
	"A parity error was detected in a MAB entry by any access",
	"A parity error was detected in an SCB entry state field by any access",
	"A parity error was detected in an SCB entry address field by any access",
	"A parity error was detected in an SCB entry data field by any access",
	"A parity error was detected in a WCB entry by any access",
	"A poisoned line was detected in an SCB entry by any access",
	"A SystemReadDataError error was reported on read data returned from L2 for a load",
	"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	"A hardware assertion error was reported",
	"A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};

static const char * const smca_if_mce_desc[] = {
	"Op Cache Microtag Probe Port Parity Error",
	"IC Microtag or Full Tag Multi-hit Error",
	"IC Full Tag Parity Error",
	"IC Data Array Parity Error",
	"Decoupling Queue PhysAddr Parity Error",
	"L0 ITLB Parity Error",
	"L1 ITLB Parity Error",
	"L2 ITLB Parity Error",
	"BPQ Thread 0 Snoop Parity Error",
	"BPQ Thread 1 Snoop Parity Error",
	"L1 BTB Multi-Match Error",
	"L2 BTB Multi-Match Error",
	"L2 Cache Response Poison Error",
	"System Read Data Error",
};

static const char * const smca_l2_mce_desc[] = {
	"L2M Tag Multiple-Way-Hit error",
	"L2M Tag or State Array ECC Error",
	"L2M Data Array ECC Error",
	"Hardware Assert Error",
};

static const char * const smca_de_mce_desc[] = {
	"Micro-op cache tag parity error",
	"Micro-op cache data parity error",
	"Instruction buffer parity error",
	"Micro-op queue parity error",
	"Instruction dispatch queue parity error",
	"Fetch address FIFO parity error",
	"Patch RAM data parity error",
	"Patch RAM sequencer parity error",
	"Micro-op buffer parity error"
};

static const char * const smca_ex_mce_desc[] = {
	"Watchdog Timeout error",
	"Physical register file parity error",
	"Flag register file parity error",
	"Immediate displacement register file parity error",
	"Address generator payload parity error",
	"EX payload parity error",
	"Checkpoint queue parity error",
	"Retire dispatch queue parity error",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
	"Hardware Assertion error",
};

static const char * const smca_fp_mce_desc[] = {
	"Physical register file (PRF) parity error",
	"Freelist (FL) parity error",
	"Schedule queue parity error",
	"NSQ parity error",
	"Retire queue (RQ) parity error",
	"Status register file (SRF) parity error",
	"Hardware assertion",
};

static const char * const smca_l3_mce_desc[] = {
	"Shadow Tag Macro ECC Error",
	"Shadow Tag Macro Multi-way-hit Error",
	"L3M Tag ECC Error",
	"L3M Tag Multi-way-hit Error",
	"L3M Data ECC Error",
	"SDP Parity Error or SystemReadDataError from XI",
	"L3 Victim Queue Parity Error",
	"L3 Hardware Assertion",
};

static const char * const smca_cs_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"Probe Filter ECC Error",
};

static const char * const smca_cs2_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"SDP read response had no match in the CS queue",
	"Probe Filter Protocol Error",
	"Probe Filter ECC Error",
	"SDP read response had an unexpected RETRY error",
	"Counter overflow error",
	"Counter underflow error",
};

static const char * const smca_pie_mce_desc[] = {
	"Hardware Assert",
	"Register security violation",
	"Link Error",
	"Poison data consumption",
	"A deferred error was detected in the DF"
};

static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Address/Command parity error",
	"Write data CRC error",
	"DCQ SRAM ECC error",
	"AES SRAM ECC error",
};

static const char * const smca_pb_mce_desc[] = {
	"An ECC error in the Parameter Block RAM array",
};

static const char * const smca_psp_mce_desc[] = {
	"An ECC or parity error in a PSP RAM instance",
};

static const char * const smca_psp2_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Instruction Cache Bank 0 ECC or parity error",
	"Instruction Cache Bank 1 ECC or parity error",
	"Instruction Tag Ram 0 parity error",
	"Instruction Tag Ram 1 parity error",
	"Data Cache Bank 0 ECC or parity error",
	"Data Cache Bank 1 ECC or parity error",
	"Data Cache Bank 2 ECC or parity error",
	"Data Cache Bank 3 ECC or parity error",
	"Data Tag Bank 0 parity error",
	"Data Tag Bank 1 parity error",
	"Data Tag Bank 2 parity error",
	"Data Tag Bank 3 parity error",
	"Dirty Data Ram parity error",
	"TLB Bank 0 parity error",
	"TLB Bank 1 parity error",
	"System Hub Read Buffer ECC or parity error",
};

static const char * const smca_smu_mce_desc[] = {
	"An ECC or parity error in an SMU RAM instance",
};

static const char * const smca_smu2_mce_desc[] = { 359 "High SRAM ECC or parity error", 360 "Low SRAM ECC or parity error", 361 "Data Cache Bank A ECC or parity error", 362 "Data Cache Bank B ECC or parity error", 363 "Data Tag Cache Bank A ECC or parity error", 364 "Data Tag Cache Bank B ECC or parity error", 365 "Instruction Cache Bank A ECC or parity error", 366 "Instruction Cache Bank B ECC or parity error", 367 "Instruction Tag Cache Bank A ECC or parity error", 368 "Instruction Tag Cache Bank B ECC or parity error", 369 "System Hub Read Buffer ECC or parity error", 370 }; 371 372 static const char * const smca_mp5_mce_desc[] = { 373 "High SRAM ECC or parity error", 374 "Low SRAM ECC or parity error", 375 "Data Cache Bank A ECC or parity error", 376 "Data Cache Bank B ECC or parity error", 377 "Data Tag Cache Bank A ECC or parity error", 378 "Data Tag Cache Bank B ECC or parity error", 379 "Instruction Cache Bank A ECC or parity error", 380 "Instruction Cache Bank B ECC or parity error", 381 "Instruction Tag Cache Bank A ECC or parity error", 382 "Instruction Tag Cache Bank B ECC or parity error", 383 }; 384 385 static const char * const smca_nbio_mce_desc[] = { 386 "ECC or Parity error", 387 "PCIE error", 388 "SDP ErrEvent error", 389 "SDP Egress Poison Error", 390 "IOHC Internal Poison Error", 391 }; 392 393 static const char * const smca_pcie_mce_desc[] = { 394 "CCIX PER Message logging", 395 "CCIX Read Response with Status: Non-Data Error", 396 "CCIX Write Response with Status: Non-Data Error", 397 "CCIX Read Response with Status: Data Error", 398 "CCIX Non-okay write response with data error", 399 }; 400 401 struct smca_mce_desc { 402 const char * const *descs; 403 unsigned int num_descs; 404 }; 405 406 static struct smca_mce_desc smca_mce_descs[] = { 407 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, 408 [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, 409 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) 
}, 410 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, 411 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, 412 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, 413 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, 414 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, 415 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, 416 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, 417 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, 418 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, 419 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, 420 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, 421 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, 422 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, 423 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, 424 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, 425 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, 426 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, 427 }; 428 429 static bool f12h_mc0_mce(u16 ec, u8 xec) 430 { 431 bool ret = false; 432 433 if (MEM_ERROR(ec)) { 434 u8 ll = LL(ec); 435 ret = true; 436 437 if (ll == LL_L2) 438 pr_cont("during L1 linefill from L2.\n"); 439 else if (ll == LL_L1) 440 pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); 441 else 442 ret = false; 443 } 444 return ret; 445 } 446 447 static bool f10h_mc0_mce(u16 ec, u8 xec) 448 { 449 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { 450 pr_cont("during data scrub.\n"); 451 return true; 452 } 453 return f12h_mc0_mce(ec, xec); 454 } 455 456 static bool k8_mc0_mce(u16 ec, u8 xec) 457 { 458 if (BUS_ERROR(ec)) { 459 pr_cont("during system linefill.\n"); 460 return true; 461 } 462 463 return f10h_mc0_mce(ec, xec); 464 } 465 466 static bool cat_mc0_mce(u16 ec, u8 
xec) 467 { 468 u8 r4 = R4(ec); 469 bool ret = true; 470 471 if (MEM_ERROR(ec)) { 472 473 if (TT(ec) != TT_DATA || LL(ec) != LL_L1) 474 return false; 475 476 switch (r4) { 477 case R4_DRD: 478 case R4_DWR: 479 pr_cont("Data/Tag parity error due to %s.\n", 480 (r4 == R4_DRD ? "load/hw prf" : "store")); 481 break; 482 case R4_EVICT: 483 pr_cont("Copyback parity error on a tag miss.\n"); 484 break; 485 case R4_SNOOP: 486 pr_cont("Tag parity error during snoop.\n"); 487 break; 488 default: 489 ret = false; 490 } 491 } else if (BUS_ERROR(ec)) { 492 493 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) 494 return false; 495 496 pr_cont("System read data error on a "); 497 498 switch (r4) { 499 case R4_RD: 500 pr_cont("TLB reload.\n"); 501 break; 502 case R4_DWR: 503 pr_cont("store.\n"); 504 break; 505 case R4_DRD: 506 pr_cont("load.\n"); 507 break; 508 default: 509 ret = false; 510 } 511 } else { 512 ret = false; 513 } 514 515 return ret; 516 } 517 518 static bool f15h_mc0_mce(u16 ec, u8 xec) 519 { 520 bool ret = true; 521 522 if (MEM_ERROR(ec)) { 523 524 switch (xec) { 525 case 0x0: 526 pr_cont("Data Array access error.\n"); 527 break; 528 529 case 0x1: 530 pr_cont("UC error during a linefill from L2/NB.\n"); 531 break; 532 533 case 0x2: 534 case 0x11: 535 pr_cont("STQ access error.\n"); 536 break; 537 538 case 0x3: 539 pr_cont("SCB access error.\n"); 540 break; 541 542 case 0x10: 543 pr_cont("Tag error.\n"); 544 break; 545 546 case 0x12: 547 pr_cont("LDQ access error.\n"); 548 break; 549 550 default: 551 ret = false; 552 } 553 } else if (BUS_ERROR(ec)) { 554 555 if (!xec) 556 pr_cont("System Read Data Error.\n"); 557 else 558 pr_cont(" Internal error condition type %d.\n", xec); 559 } else if (INT_ERROR(ec)) { 560 if (xec <= 0x1f) 561 pr_cont("Hardware Assert.\n"); 562 else 563 ret = false; 564 565 } else 566 ret = false; 567 568 return ret; 569 } 570 571 static void decode_mc0_mce(struct mce *m) 572 { 573 u16 ec = EC(m->status); 574 u8 xec = XEC(m->status, 
xec_mask); 575 576 pr_emerg(HW_ERR "MC0 Error: "); 577 578 /* TLB error signatures are the same across families */ 579 if (TLB_ERROR(ec)) { 580 if (TT(ec) == TT_DATA) { 581 pr_cont("%s TLB %s.\n", LL_MSG(ec), 582 ((xec == 2) ? "locked miss" 583 : (xec ? "multimatch" : "parity"))); 584 return; 585 } 586 } else if (fam_ops.mc0_mce(ec, xec)) 587 ; 588 else 589 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); 590 } 591 592 static bool k8_mc1_mce(u16 ec, u8 xec) 593 { 594 u8 ll = LL(ec); 595 bool ret = true; 596 597 if (!MEM_ERROR(ec)) 598 return false; 599 600 if (ll == 0x2) 601 pr_cont("during a linefill from L2.\n"); 602 else if (ll == 0x1) { 603 switch (R4(ec)) { 604 case R4_IRD: 605 pr_cont("Parity error during data load.\n"); 606 break; 607 608 case R4_EVICT: 609 pr_cont("Copyback Parity/Victim error.\n"); 610 break; 611 612 case R4_SNOOP: 613 pr_cont("Tag Snoop error.\n"); 614 break; 615 616 default: 617 ret = false; 618 break; 619 } 620 } else 621 ret = false; 622 623 return ret; 624 } 625 626 static bool cat_mc1_mce(u16 ec, u8 xec) 627 { 628 u8 r4 = R4(ec); 629 bool ret = true; 630 631 if (!MEM_ERROR(ec)) 632 return false; 633 634 if (TT(ec) != TT_INSTR) 635 return false; 636 637 if (r4 == R4_IRD) 638 pr_cont("Data/tag array parity error for a tag hit.\n"); 639 else if (r4 == R4_SNOOP) 640 pr_cont("Tag error during snoop/victimization.\n"); 641 else if (xec == 0x0) 642 pr_cont("Tag parity error from victim castout.\n"); 643 else if (xec == 0x2) 644 pr_cont("Microcode patch RAM parity error.\n"); 645 else 646 ret = false; 647 648 return ret; 649 } 650 651 static bool f15h_mc1_mce(u16 ec, u8 xec) 652 { 653 bool ret = true; 654 655 if (!MEM_ERROR(ec)) 656 return false; 657 658 switch (xec) { 659 case 0x0 ... 0xa: 660 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); 661 break; 662 663 case 0xd: 664 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); 665 break; 666 667 case 0x10: 668 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); 669 break; 670 671 case 0x11 ... 
0x15: 672 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); 673 break; 674 675 default: 676 ret = false; 677 } 678 return ret; 679 } 680 681 static void decode_mc1_mce(struct mce *m) 682 { 683 u16 ec = EC(m->status); 684 u8 xec = XEC(m->status, xec_mask); 685 686 pr_emerg(HW_ERR "MC1 Error: "); 687 688 if (TLB_ERROR(ec)) 689 pr_cont("%s TLB %s.\n", LL_MSG(ec), 690 (xec ? "multimatch" : "parity error")); 691 else if (BUS_ERROR(ec)) { 692 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); 693 694 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 695 } else if (INT_ERROR(ec)) { 696 if (xec <= 0x3f) 697 pr_cont("Hardware Assert.\n"); 698 else 699 goto wrong_mc1_mce; 700 } else if (fam_ops.mc1_mce(ec, xec)) 701 ; 702 else 703 goto wrong_mc1_mce; 704 705 return; 706 707 wrong_mc1_mce: 708 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); 709 } 710 711 static bool k8_mc2_mce(u16 ec, u8 xec) 712 { 713 bool ret = true; 714 715 if (xec == 0x1) 716 pr_cont(" in the write data buffers.\n"); 717 else if (xec == 0x3) 718 pr_cont(" in the victim data buffers.\n"); 719 else if (xec == 0x2 && MEM_ERROR(ec)) 720 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); 721 else if (xec == 0x0) { 722 if (TLB_ERROR(ec)) 723 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n", 724 TT_MSG(ec)); 725 else if (BUS_ERROR(ec)) 726 pr_cont(": %s/ECC error in data read from NB: %s.\n", 727 R4_MSG(ec), PP_MSG(ec)); 728 else if (MEM_ERROR(ec)) { 729 u8 r4 = R4(ec); 730 731 if (r4 >= 0x7) 732 pr_cont(": %s error during data copyback.\n", 733 R4_MSG(ec)); 734 else if (r4 <= 0x1) 735 pr_cont(": %s parity/ECC error during data " 736 "access from L2.\n", R4_MSG(ec)); 737 else 738 ret = false; 739 } else 740 ret = false; 741 } else 742 ret = false; 743 744 return ret; 745 } 746 747 static bool f15h_mc2_mce(u16 ec, u8 xec) 748 { 749 bool ret = true; 750 751 if (TLB_ERROR(ec)) { 752 if (xec == 0x0) 753 pr_cont("Data parity TLB read error.\n"); 754 
else if (xec == 0x1) 755 pr_cont("Poison data provided for TLB fill.\n"); 756 else 757 ret = false; 758 } else if (BUS_ERROR(ec)) { 759 if (xec > 2) 760 ret = false; 761 762 pr_cont("Error during attempted NB data read.\n"); 763 } else if (MEM_ERROR(ec)) { 764 switch (xec) { 765 case 0x4 ... 0xc: 766 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); 767 break; 768 769 case 0x10 ... 0x14: 770 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); 771 break; 772 773 default: 774 ret = false; 775 } 776 } else if (INT_ERROR(ec)) { 777 if (xec <= 0x3f) 778 pr_cont("Hardware Assert.\n"); 779 else 780 ret = false; 781 } 782 783 return ret; 784 } 785 786 static bool f16h_mc2_mce(u16 ec, u8 xec) 787 { 788 u8 r4 = R4(ec); 789 790 if (!MEM_ERROR(ec)) 791 return false; 792 793 switch (xec) { 794 case 0x04 ... 0x05: 795 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); 796 break; 797 798 case 0x09 ... 0x0b: 799 case 0x0d ... 0x0f: 800 pr_cont("ECC error in L2 tag (%s).\n", 801 ((r4 == R4_GEN) ? "BankReq" : 802 ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); 803 break; 804 805 case 0x10 ... 0x19: 806 case 0x1b: 807 pr_cont("ECC error in L2 data array (%s).\n", 808 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : 809 ((r4 == R4_GEN) ? "Attr" : 810 ((r4 == R4_EVICT) ? "Vict" : "Fill")))); 811 break; 812 813 case 0x1c ... 0x1d: 814 case 0x1f: 815 pr_cont("Parity error in L2 attribute bits (%s).\n", 816 ((r4 == R4_RD) ? "Hit" : 817 ((r4 == R4_GEN) ? 
"Attr" : "Fill"))); 818 break; 819 820 default: 821 return false; 822 } 823 824 return true; 825 } 826 827 static void decode_mc2_mce(struct mce *m) 828 { 829 u16 ec = EC(m->status); 830 u8 xec = XEC(m->status, xec_mask); 831 832 pr_emerg(HW_ERR "MC2 Error: "); 833 834 if (!fam_ops.mc2_mce(ec, xec)) 835 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); 836 } 837 838 static void decode_mc3_mce(struct mce *m) 839 { 840 u16 ec = EC(m->status); 841 u8 xec = XEC(m->status, xec_mask); 842 843 if (boot_cpu_data.x86 >= 0x14) { 844 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," 845 " please report on LKML.\n"); 846 return; 847 } 848 849 pr_emerg(HW_ERR "MC3 Error"); 850 851 if (xec == 0x0) { 852 u8 r4 = R4(ec); 853 854 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 855 goto wrong_mc3_mce; 856 857 pr_cont(" during %s.\n", R4_MSG(ec)); 858 } else 859 goto wrong_mc3_mce; 860 861 return; 862 863 wrong_mc3_mce: 864 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); 865 } 866 867 static void decode_mc4_mce(struct mce *m) 868 { 869 unsigned int fam = x86_family(m->cpuid); 870 int node_id = amd_get_nb_id(m->extcpu); 871 u16 ec = EC(m->status); 872 u8 xec = XEC(m->status, 0x1f); 873 u8 offset = 0; 874 875 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); 876 877 switch (xec) { 878 case 0x0 ... 0xe: 879 880 /* special handling for DRAM ECCs */ 881 if (xec == 0x0 || xec == 0x8) { 882 /* no ECCs on F11h */ 883 if (fam == 0x11) 884 goto wrong_mc4_mce; 885 886 pr_cont("%s.\n", mc4_mce_desc[xec]); 887 888 if (decode_dram_ecc) 889 decode_dram_ecc(node_id, m); 890 return; 891 } 892 break; 893 894 case 0xf: 895 if (TLB_ERROR(ec)) 896 pr_cont("GART Table Walk data error.\n"); 897 else if (BUS_ERROR(ec)) 898 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 899 else 900 goto wrong_mc4_mce; 901 return; 902 903 case 0x19: 904 if (fam == 0x15 || fam == 0x16) 905 pr_cont("Compute Unit Data Error.\n"); 906 else 907 goto wrong_mc4_mce; 908 return; 909 910 case 0x1c ... 
0x1f: 911 offset = 13; 912 break; 913 914 default: 915 goto wrong_mc4_mce; 916 } 917 918 pr_cont("%s.\n", mc4_mce_desc[xec - offset]); 919 return; 920 921 wrong_mc4_mce: 922 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); 923 } 924 925 static void decode_mc5_mce(struct mce *m) 926 { 927 unsigned int fam = x86_family(m->cpuid); 928 u16 ec = EC(m->status); 929 u8 xec = XEC(m->status, xec_mask); 930 931 if (fam == 0xf || fam == 0x11) 932 goto wrong_mc5_mce; 933 934 pr_emerg(HW_ERR "MC5 Error: "); 935 936 if (INT_ERROR(ec)) { 937 if (xec <= 0x1f) { 938 pr_cont("Hardware Assert.\n"); 939 return; 940 } else 941 goto wrong_mc5_mce; 942 } 943 944 if (xec == 0x0 || xec == 0xc) 945 pr_cont("%s.\n", mc5_mce_desc[xec]); 946 else if (xec <= 0xd) 947 pr_cont("%s parity error.\n", mc5_mce_desc[xec]); 948 else 949 goto wrong_mc5_mce; 950 951 return; 952 953 wrong_mc5_mce: 954 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); 955 } 956 957 static void decode_mc6_mce(struct mce *m) 958 { 959 u8 xec = XEC(m->status, xec_mask); 960 961 pr_emerg(HW_ERR "MC6 Error: "); 962 963 if (xec > 0x5) 964 goto wrong_mc6_mce; 965 966 pr_cont("%s parity error.\n", mc6_mce_desc[xec]); 967 return; 968 969 wrong_mc6_mce: 970 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); 971 } 972 973 /* Decode errors according to Scalable MCA specification */ 974 static void decode_smca_error(struct mce *m) 975 { 976 struct smca_hwid *hwid; 977 enum smca_bank_types bank_type; 978 const char *ip_name; 979 u8 xec = XEC(m->status, xec_mask); 980 981 if (m->bank >= ARRAY_SIZE(smca_banks)) 982 return; 983 984 hwid = smca_banks[m->bank].hwid; 985 if (!hwid) 986 return; 987 988 bank_type = hwid->bank_type; 989 990 if (bank_type == SMCA_RESERVED) { 991 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); 992 return; 993 } 994 995 ip_name = smca_get_long_name(bank_type); 996 997 pr_emerg(HW_ERR "%s Ext. 
Error Code: %d", ip_name, xec); 998 999 /* Only print the decode of valid error codes */ 1000 if (xec < smca_mce_descs[bank_type].num_descs && 1001 (hwid->xec_bitmap & BIT_ULL(xec))) { 1002 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); 1003 } 1004 1005 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) 1006 decode_dram_ecc(cpu_to_node(m->extcpu), m); 1007 } 1008 1009 static inline void amd_decode_err_code(u16 ec) 1010 { 1011 if (INT_ERROR(ec)) { 1012 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); 1013 return; 1014 } 1015 1016 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); 1017 1018 if (BUS_ERROR(ec)) 1019 pr_cont(", mem/io: %s", II_MSG(ec)); 1020 else 1021 pr_cont(", tx: %s", TT_MSG(ec)); 1022 1023 if (MEM_ERROR(ec) || BUS_ERROR(ec)) { 1024 pr_cont(", mem-tx: %s", R4_MSG(ec)); 1025 1026 if (BUS_ERROR(ec)) 1027 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); 1028 } 1029 1030 pr_cont("\n"); 1031 } 1032 1033 /* 1034 * Filter out unwanted MCE signatures here. 1035 */ 1036 static bool ignore_mce(struct mce *m) 1037 { 1038 /* 1039 * NB GART TLB error reporting is disabled by default. 
1040 */ 1041 if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors) 1042 return true; 1043 1044 return false; 1045 } 1046 1047 static const char *decode_error_status(struct mce *m) 1048 { 1049 if (m->status & MCI_STATUS_UC) { 1050 if (m->status & MCI_STATUS_PCC) 1051 return "System Fatal error."; 1052 if (m->mcgstatus & MCG_STATUS_RIPV) 1053 return "Uncorrected, software restartable error."; 1054 return "Uncorrected, software containable error."; 1055 } 1056 1057 if (m->status & MCI_STATUS_DEFERRED) 1058 return "Deferred error, no action required."; 1059 1060 return "Corrected error, no action required."; 1061 } 1062 1063 static int 1064 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 1065 { 1066 struct mce *m = (struct mce *)data; 1067 unsigned int fam = x86_family(m->cpuid); 1068 int ecc; 1069 1070 if (ignore_mce(m)) 1071 return NOTIFY_STOP; 1072 1073 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 1074 1075 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", 1076 m->extcpu, 1077 fam, x86_model(m->cpuid), x86_stepping(m->cpuid), 1078 m->bank, 1079 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), 1080 ((m->status & MCI_STATUS_UC) ? "UE" : 1081 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), 1082 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), 1083 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), 1084 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); 1085 1086 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1087 u32 low, high; 1088 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); 1089 1090 if (!rdmsr_safe(addr, &low, &high) && 1091 (low & MCI_CONFIG_MCAX)) 1092 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); 1093 1094 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); 1095 } 1096 1097 /* do the two bits[14:13] together */ 1098 ecc = (m->status >> 45) & 0x3; 1099 if (ecc) 1100 pr_cont("|%sECC", ((ecc == 2) ? 
"C" : "U")); 1101 1102 if (fam >= 0x15) { 1103 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); 1104 1105 /* F15h, bank4, bit 43 is part of McaStatSubCache. */ 1106 if (fam != 0x15 || m->bank != 4) 1107 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); 1108 } 1109 1110 if (fam >= 0x17) 1111 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); 1112 1113 pr_cont("]: 0x%016llx\n", m->status); 1114 1115 if (m->status & MCI_STATUS_ADDRV) 1116 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); 1117 1118 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1119 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); 1120 1121 if (m->status & MCI_STATUS_SYNDV) 1122 pr_cont(", Syndrome: 0x%016llx", m->synd); 1123 1124 pr_cont("\n"); 1125 1126 decode_smca_error(m); 1127 goto err_code; 1128 } 1129 1130 if (m->tsc) 1131 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); 1132 1133 /* Doesn't matter which member to test. */ 1134 if (!fam_ops.mc0_mce) 1135 goto err_code; 1136 1137 switch (m->bank) { 1138 case 0: 1139 decode_mc0_mce(m); 1140 break; 1141 1142 case 1: 1143 decode_mc1_mce(m); 1144 break; 1145 1146 case 2: 1147 decode_mc2_mce(m); 1148 break; 1149 1150 case 3: 1151 decode_mc3_mce(m); 1152 break; 1153 1154 case 4: 1155 decode_mc4_mce(m); 1156 break; 1157 1158 case 5: 1159 decode_mc5_mce(m); 1160 break; 1161 1162 case 6: 1163 decode_mc6_mce(m); 1164 break; 1165 1166 default: 1167 break; 1168 } 1169 1170 err_code: 1171 amd_decode_err_code(m->status & 0xffff); 1172 1173 return NOTIFY_STOP; 1174 } 1175 1176 static struct notifier_block amd_mce_dec_nb = { 1177 .notifier_call = amd_decode_mce, 1178 .priority = MCE_PRIO_EDAC, 1179 }; 1180 1181 static int __init mce_amd_init(void) 1182 { 1183 struct cpuinfo_x86 *c = &boot_cpu_data; 1184 1185 if (c->x86_vendor != X86_VENDOR_AMD && 1186 c->x86_vendor != X86_VENDOR_HYGON) 1187 return -ENODEV; 1188 1189 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1190 xec_mask = 0x3f; 1191 goto out; 1192 } 1193 1194 switch (c->x86) { 1195 
case 0xf: 1196 fam_ops.mc0_mce = k8_mc0_mce; 1197 fam_ops.mc1_mce = k8_mc1_mce; 1198 fam_ops.mc2_mce = k8_mc2_mce; 1199 break; 1200 1201 case 0x10: 1202 fam_ops.mc0_mce = f10h_mc0_mce; 1203 fam_ops.mc1_mce = k8_mc1_mce; 1204 fam_ops.mc2_mce = k8_mc2_mce; 1205 break; 1206 1207 case 0x11: 1208 fam_ops.mc0_mce = k8_mc0_mce; 1209 fam_ops.mc1_mce = k8_mc1_mce; 1210 fam_ops.mc2_mce = k8_mc2_mce; 1211 break; 1212 1213 case 0x12: 1214 fam_ops.mc0_mce = f12h_mc0_mce; 1215 fam_ops.mc1_mce = k8_mc1_mce; 1216 fam_ops.mc2_mce = k8_mc2_mce; 1217 break; 1218 1219 case 0x14: 1220 fam_ops.mc0_mce = cat_mc0_mce; 1221 fam_ops.mc1_mce = cat_mc1_mce; 1222 fam_ops.mc2_mce = k8_mc2_mce; 1223 break; 1224 1225 case 0x15: 1226 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f; 1227 1228 fam_ops.mc0_mce = f15h_mc0_mce; 1229 fam_ops.mc1_mce = f15h_mc1_mce; 1230 fam_ops.mc2_mce = f15h_mc2_mce; 1231 break; 1232 1233 case 0x16: 1234 xec_mask = 0x1f; 1235 fam_ops.mc0_mce = cat_mc0_mce; 1236 fam_ops.mc1_mce = cat_mc1_mce; 1237 fam_ops.mc2_mce = f16h_mc2_mce; 1238 break; 1239 1240 case 0x17: 1241 case 0x18: 1242 pr_warn("Decoding supported only on Scalable MCA processors.\n"); 1243 return -EINVAL; 1244 1245 default: 1246 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); 1247 return -EINVAL; 1248 } 1249 1250 out: 1251 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 1252 1253 mce_register_decode_chain(&amd_mce_dec_nb); 1254 1255 return 0; 1256 } 1257 early_initcall(mce_amd_init); 1258 1259 #ifdef MODULE 1260 static void __exit mce_amd_exit(void) 1261 { 1262 mce_unregister_decode_chain(&amd_mce_dec_nb); 1263 } 1264 1265 MODULE_DESCRIPTION("AMD MCE decoder"); 1266 MODULE_ALIAS("edac-mce-amd"); 1267 MODULE_LICENSE("GPL"); 1268 module_exit(mce_amd_exit); 1269 #endif 1270