1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/module.h> 3 #include <linux/slab.h> 4 5 #include <asm/cpu.h> 6 7 #include "mce_amd.h" 8 9 static struct amd_decoder_ops fam_ops; 10 11 static u8 xec_mask = 0xf; 12 13 static void (*decode_dram_ecc)(int node_id, struct mce *m); 14 15 void amd_register_ecc_decoder(void (*f)(int, struct mce *)) 16 { 17 decode_dram_ecc = f; 18 } 19 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 20 21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) 22 { 23 if (decode_dram_ecc) { 24 WARN_ON(decode_dram_ecc != f); 25 26 decode_dram_ecc = NULL; 27 } 28 } 29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 30 31 /* 32 * string representation for the different MCA reported error types, see F3x48 33 * or MSR0000_0411. 34 */ 35 36 /* transaction type */ 37 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 38 39 /* cache level */ 40 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 41 42 /* memory transaction type */ 43 static const char * const rrrr_msgs[] = { 44 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 45 }; 46 47 /* participating processor */ 48 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 49 EXPORT_SYMBOL_GPL(pp_msgs); 50 51 /* request timeout */ 52 static const char * const to_msgs[] = { "no timeout", "timed out" }; 53 54 /* memory or i/o */ 55 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 56 57 /* internal error type */ 58 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; 59 60 static const char * const f15h_mc1_mce_desc[] = { 61 "UC during a demand linefill from L2", 62 "Parity error during data load from IC", 63 "Parity error for IC valid bit", 64 "Main tag parity error", 65 "Parity error in prediction queue", 66 "PFB data/address parity error", 67 "Parity error in the branch status reg", 68 "PFB promotion address error", 69 "Tag error during probe/victimization", 70 "Parity error for IC probe tag valid bit", 71 "PFB non-cacheable bit parity error", 72 "PFB valid bit parity error", /* xec = 0xd */ 73 "Microcode Patch Buffer", /* xec = 010 */ 74 "uop queue", 75 "insn buffer", 76 "predecode buffer", 77 "fetch address FIFO", 78 "dispatch uop queue" 79 }; 80 81 static const char * const f15h_mc2_mce_desc[] = { 82 "Fill ECC error on data fills", /* xec = 0x4 */ 83 "Fill parity error on insn fills", 84 "Prefetcher request FIFO parity error", 85 "PRQ address parity error", 86 "PRQ data parity error", 87 "WCC Tag ECC error", 88 "WCC Data ECC error", 89 "WCB Data parity error", 90 "VB Data ECC or parity error", 91 "L2 Tag ECC error", /* xec = 0x10 */ 92 "Hard L2 Tag ECC error", 93 "Multiple hits on L2 tag", 94 "XAB parity error", 95 "PRB address parity error" 96 }; 97 98 static const char * const mc4_mce_desc[] = { 99 "DRAM ECC error detected on the NB", 100 "CRC error detected on HT link", 101 "Link-defined sync error packets detected on HT link", 102 "HT Master abort", 103 "HT Target abort", 104 "Invalid GART PTE entry during GART table walk", 105 "Unsupported atomic RMW received from an IO link", 106 "Watchdog timeout due to lack of progress", 107 "DRAM ECC error detected on the NB", 108 "SVM DMA Exclusion Vector error", 109 "HT data error detected on link", 110 "Protocol error (link, L3, probe filter)", 111 "NB internal arrays parity error", 112 "DRAM addr/ctl signals parity error", 113 "IO link transmission error", 114 "L3 data cache ECC error", /* xec = 0x1c */ 115 "L3 cache tag error", 116 "L3 LRU parity bits error", 117 "ECC Error in the Probe Filter directory" 118 }; 119 120 static const char * const mc5_mce_desc[] = { 121 "CPU Watchdog timer expire", 122 "Wakeup array dest tag", 123 "AG payload array", 124 "EX payload array", 125 "IDRF array", 126 "Retire dispatch queue", 127 "Mapper checkpoint array", 128 "Physical register file EX0 port", 129 "Physical register file EX1 port", 130 "Physical register file AG0 port", 131 "Physical register file AG1 port", 132 "Flag register file", 133 "DE error occurred", 134 "Retire status queue" 135 }; 136 137 static const char * const mc6_mce_desc[] = { 138 "Hardware Assertion", 139 "Free List", 140 "Physical Register File", 141 "Retire Queue", 142 "Scheduler table", 143 "Status Register File", 144 }; 145 146 /* Scalable MCA error strings */ 147 static const char * const smca_ls_mce_desc[] = { 148 "Load queue parity error", 149 "Store queue parity error", 150 "Miss address buffer payload parity error", 151 "Level 1 TLB parity error", 152 "DC Tag error type 5", 153 "DC Tag error type 6", 154 "DC Tag error type 1", 155 "Internal error type 1", 156 "Internal error type 2", 157 "System Read Data Error Thread 0", 158 "System Read Data Error Thread 1", 159 "DC Tag error type 2", 160 "DC Data error type 1 and poison consumption", 161 "DC Data error type 2", 162 "DC Data error type 3", 163 "DC Tag error type 4", 164 "Level 2 TLB parity error", 165 "PDC parity error", 166 "DC Tag error type 3", 167 "DC Tag error type 5", 168 "L2 Fill Data error", 169 }; 170 171 static const char * const smca_ls2_mce_desc[] = { 172 "An ECC error was detected on a data cache read by a probe or victimization", 173 "An ECC error or L2 poison was detected on a data cache read by a load", 174 "An ECC error was detected on a data cache read-modify-write by a store", 175 "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", 176 "An ECC error or poison bit mismatch was detected on a tag read by a load", 177 "An ECC error or poison bit mismatch was detected on a tag read by a store", 178 "An ECC error was detected on an EMEM read by a load", 179 "An ECC error was detected on an EMEM read-modify-write by a store", 180 "A parity error was detected in an L1 TLB entry by any access", 181 "A parity error was detected in an L2 TLB entry by any access", 182 "A parity error was detected in a PWC entry by any access", 183 "A parity error was detected in an STQ entry by any access", 184 "A parity error was detected in an LDQ entry by any access", 185 "A parity error was detected in a MAB entry by any access", 186 "A parity error was detected in an SCB entry state field by any access", 187 "A parity error was detected in an SCB entry address field by any access", 188 "A parity error was detected in an SCB entry data field by any access", 189 "A parity error was detected in a WCB entry by any access", 190 "A poisoned line was detected in an SCB entry by any access", 191 "A SystemReadDataError error was reported on read data returned from L2 for a load", 192 "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", 193 "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", 194 "A hardware assertion error was reported", 195 "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", 196 }; 197 198 static const char * const smca_if_mce_desc[] = { 199 "Op Cache Microtag Probe Port Parity Error", 200 "IC Microtag or Full Tag Multi-hit Error", 201 "IC Full Tag Parity Error", 202 "IC Data Array Parity Error", 203 "Decoupling Queue PhysAddr Parity Error", 204 "L0 ITLB Parity Error", 205 "L1 ITLB Parity Error", 206 "L2 ITLB Parity Error", 207 "BPQ Thread 0 Snoop Parity Error", 208 "BPQ Thread 1 Snoop Parity Error", 209 "L1 BTB Multi-Match Error", 210 "L2 BTB Multi-Match Error", 211 "L2 Cache Response Poison Error", 212 "System Read Data Error", 213 "Hardware Assertion Error", 214 "L1-TLB Multi-Hit", 215 "L2-TLB Multi-Hit", 216 "BSR Parity Error", 217 "CT MCE", 218 }; 219 220 static const char * const smca_l2_mce_desc[] = { 221 "L2M Tag Multiple-Way-Hit error", 222 "L2M Tag or State Array ECC Error", 223 "L2M Data Array ECC Error", 224 "Hardware Assert Error", 225 }; 226 227 static const char * const smca_de_mce_desc[] = { 228 "Micro-op cache tag parity error", 229 "Micro-op cache data parity error", 230 "Instruction buffer parity error", 231 "Micro-op queue parity error", 232 "Instruction dispatch queue parity error", 233 "Fetch address FIFO parity error", 234 "Patch RAM data parity error", 235 "Patch RAM sequencer parity error", 236 "Micro-op buffer parity error", 237 "Hardware Assertion MCA Error", 238 }; 239 240 static const char * const smca_ex_mce_desc[] = { 241 "Watchdog Timeout error", 242 "Physical register file parity error", 243 "Flag register file parity error", 244 "Immediate displacement register file parity error", 245 "Address generator payload parity error", 246 "EX payload parity error", 247 "Checkpoint queue parity error", 248 "Retire dispatch queue parity error", 249 "Retire status queue parity error", 250 "Scheduling queue parity error", 251 "Branch buffer queue parity error", 252 "Hardware Assertion error", 253 "Spec Map parity error", 254 "Retire Map parity error", 255 }; 256 257 static const char * const smca_fp_mce_desc[] = { 258 "Physical register file (PRF) parity error", 259 "Freelist (FL) parity error", 260 "Schedule queue parity error", 261 "NSQ parity error", 262 "Retire queue (RQ) parity error", 263 "Status register file (SRF) parity error", 264 "Hardware assertion", 265 }; 266 267 static const char * const smca_l3_mce_desc[] = { 268 "Shadow Tag Macro ECC Error", 269 "Shadow Tag Macro Multi-way-hit Error", 270 "L3M Tag ECC Error", 271 "L3M Tag Multi-way-hit Error", 272 "L3M Data ECC Error", 273 "SDP Parity Error or SystemReadDataError from XI", 274 "L3 Victim Queue Parity Error", 275 "L3 Hardware Assertion", 276 }; 277 278 static const char * const smca_cs_mce_desc[] = { 279 "Illegal Request", 280 "Address Violation", 281 "Security Violation", 282 "Illegal Response", 283 "Unexpected Response", 284 "Request or Probe Parity Error", 285 "Read Response Parity Error", 286 "Atomic Request Parity Error", 287 "Probe Filter ECC Error", 288 }; 289 290 static const char * const smca_cs2_mce_desc[] = { 291 "Illegal Request", 292 "Address Violation", 293 "Security Violation", 294 "Illegal Response", 295 "Unexpected Response", 296 "Request or Probe Parity Error", 297 "Read Response Parity Error", 298 "Atomic Request Parity Error", 299 "SDP read response had no match in the CS queue", 300 "Probe Filter Protocol Error", 301 "Probe Filter ECC Error", 302 "SDP read response had an unexpected RETRY error", 303 "Counter overflow error", 304 "Counter underflow error", 305 }; 306 307 static const char * const smca_pie_mce_desc[] = { 308 "Hardware Assert", 309 "Register security violation", 310 "Link Error", 311 "Poison data consumption", 312 "A deferred error was detected in the DF" 313 }; 314 315 static const char * const smca_umc_mce_desc[] = { 316 "DRAM ECC error", 317 "Data poison error", 318 "SDP parity error", 319 "Advanced peripheral bus error", 320 "Address/Command parity error", 321 "Write data CRC error", 322 "DCQ SRAM ECC error", 323 "AES SRAM ECC error", 324 }; 325 326 static const char * const smca_umc2_mce_desc[] = { 327 "DRAM ECC error", 328 "Data poison error", 329 "SDP parity error", 330 "Reserved", 331 "Address/Command parity error", 332 "Write data parity error", 333 "DCQ SRAM ECC error", 334 "Reserved", 335 "Read data parity error", 336 "Rdb SRAM ECC error", 337 "RdRsp SRAM ECC error", 338 "LM32 MP errors", 339 }; 340 341 static const char * const smca_pb_mce_desc[] = { 342 "An ECC error in the Parameter Block RAM array", 343 }; 344 345 static const char * const smca_psp_mce_desc[] = { 346 "An ECC or parity error in a PSP RAM instance", 347 }; 348 349 static const char * const smca_psp2_mce_desc[] = { 350 "High SRAM ECC or parity error", 351 "Low SRAM ECC or parity error", 352 "Instruction Cache Bank 0 ECC or parity error", 353 "Instruction Cache Bank 1 ECC or parity error", 354 "Instruction Tag Ram 0 parity error", 355 "Instruction Tag Ram 1 parity error", 356 "Data Cache Bank 0 ECC or parity error", 357 "Data Cache Bank 1 ECC or parity error", 358 "Data Cache Bank 2 ECC or parity error", 359 "Data Cache Bank 3 ECC or parity error", 360 "Data Tag Bank 0 parity error", 361 "Data Tag Bank 1 parity error", 362 "Data Tag Bank 2 parity error", 363 "Data Tag Bank 3 parity error", 364 "Dirty Data Ram parity error", 365 "TLB Bank 0 parity error", 366 "TLB Bank 1 parity error", 367 "System Hub Read Buffer ECC or parity error", 368 }; 369 370 static const char * const smca_smu_mce_desc[] = { 371 "An ECC or parity error in an SMU RAM instance", 372 }; 373 374 static const char * const smca_smu2_mce_desc[] = { 375 "High SRAM ECC or parity error", 376 "Low SRAM ECC or parity error", 377 "Data Cache Bank A ECC or parity error", 378 "Data Cache Bank B ECC or parity error", 379 "Data Tag Cache Bank A ECC or parity error", 380 "Data Tag Cache Bank B ECC or parity error", 381 "Instruction Cache Bank A ECC or parity error", 382 "Instruction Cache Bank B ECC or parity error", 383 "Instruction Tag Cache Bank A ECC or parity error", 384 "Instruction Tag Cache Bank B ECC or parity error", 385 "System Hub Read Buffer ECC or parity error", 386 "PHY RAM ECC error", 387 }; 388 389 static const char * const smca_mp5_mce_desc[] = { 390 "High SRAM ECC or parity error", 391 "Low SRAM ECC or parity error", 392 "Data Cache Bank A ECC or parity error", 393 "Data Cache Bank B ECC or parity error", 394 "Data Tag Cache Bank A ECC or parity error", 395 "Data Tag Cache Bank B ECC or parity error", 396 "Instruction Cache Bank A ECC or parity error", 397 "Instruction Cache Bank B ECC or parity error", 398 "Instruction Tag Cache Bank A ECC or parity error", 399 "Instruction Tag Cache Bank B ECC or parity error", 400 }; 401 402 static const char * const smca_nbio_mce_desc[] = { 403 "ECC or Parity error", 404 "PCIE error", 405 "SDP ErrEvent error", 406 "SDP Egress Poison Error", 407 "IOHC Internal Poison Error", 408 }; 409 410 static const char * const smca_pcie_mce_desc[] = { 411 "CCIX PER Message logging", 412 "CCIX Read Response with Status: Non-Data Error", 413 "CCIX Write Response with Status: Non-Data Error", 414 "CCIX Read Response with Status: Data Error", 415 "CCIX Non-okay write response with data error", 416 }; 417 418 static const char * const smca_pcie2_mce_desc[] = { 419 "SDP Parity Error logging", 420 }; 421 422 static const char * const smca_xgmipcs_mce_desc[] = { 423 "Data Loss Error", 424 "Training Error", 425 "Flow Control Acknowledge Error", 426 "Rx Fifo Underflow Error", 427 "Rx Fifo Overflow Error", 428 "CRC Error", 429 "BER Exceeded Error", 430 "Tx Vcid Data Error", 431 "Replay Buffer Parity Error", 432 "Data Parity Error", 433 "Replay Fifo Overflow Error", 434 "Replay Fifo Underflow Error", 435 "Elastic Fifo Overflow Error", 436 "Deskew Error", 437 "Flow Control CRC Error", 438 "Data Startup Limit Error", 439 "FC Init Timeout Error", 440 "Recovery Timeout Error", 441 "Ready Serial Timeout Error", 442 "Ready Serial Attempt Error", 443 "Recovery Attempt Error", 444 "Recovery Relock Attempt Error", 445 "Replay Attempt Error", 446 "Sync Header Error", 447 "Tx Replay Timeout Error", 448 "Rx Replay Timeout Error", 449 "LinkSub Tx Timeout Error", 450 "LinkSub Rx Timeout Error", 451 "Rx CMD Pocket Error", 452 }; 453 454 static const char * const smca_xgmiphy_mce_desc[] = { 455 "RAM ECC Error", 456 "ARC instruction buffer parity error", 457 "ARC data buffer parity error", 458 "PHY APB error", 459 }; 460 461 static const char * const smca_waflphy_mce_desc[] = { 462 "RAM ECC Error", 463 "ARC instruction buffer parity error", 464 "ARC data buffer parity error", 465 "PHY APB error", 466 }; 467 468 struct smca_mce_desc { 469 const char * const *descs; 470 unsigned int num_descs; 471 }; 472 473 static struct smca_mce_desc smca_mce_descs[] = { 474 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, 475 [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, 476 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, 477 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, 478 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, 479 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, 480 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, 481 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, 482 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, 483 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, 484 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, 485 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, 486 [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, 487 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, 488 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, 489 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, 490 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, 491 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, 492 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, 493 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, 494 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, 495 [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, 496 [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, 497 [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, 498 [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, 499 }; 500 501 static bool f12h_mc0_mce(u16 ec, u8 xec) 502 { 503 bool ret = false; 504 505 if (MEM_ERROR(ec)) { 506 u8 ll = LL(ec); 507 ret = true; 508 509 if (ll == LL_L2) 510 pr_cont("during L1 linefill from L2.\n"); 511 else if (ll == LL_L1) 512 pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); 513 else 514 ret = false; 515 } 516 return ret; 517 } 518 519 static bool f10h_mc0_mce(u16 ec, u8 xec) 520 { 521 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { 522 pr_cont("during data scrub.\n"); 523 return true; 524 } 525 return f12h_mc0_mce(ec, xec); 526 } 527 528 static bool k8_mc0_mce(u16 ec, u8 xec) 529 { 530 if (BUS_ERROR(ec)) { 531 pr_cont("during system linefill.\n"); 532 return true; 533 } 534 535 return f10h_mc0_mce(ec, xec); 536 } 537 538 static bool cat_mc0_mce(u16 ec, u8 xec) 539 { 540 u8 r4 = R4(ec); 541 bool ret = true; 542 543 if (MEM_ERROR(ec)) { 544 545 if (TT(ec) != TT_DATA || LL(ec) != LL_L1) 546 return false; 547 548 switch (r4) { 549 case R4_DRD: 550 case R4_DWR: 551 pr_cont("Data/Tag parity error due to %s.\n", 552 (r4 == R4_DRD ? "load/hw prf" : "store")); 553 break; 554 case R4_EVICT: 555 pr_cont("Copyback parity error on a tag miss.\n"); 556 break; 557 case R4_SNOOP: 558 pr_cont("Tag parity error during snoop.\n"); 559 break; 560 default: 561 ret = false; 562 } 563 } else if (BUS_ERROR(ec)) { 564 565 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) 566 return false; 567 568 pr_cont("System read data error on a "); 569 570 switch (r4) { 571 case R4_RD: 572 pr_cont("TLB reload.\n"); 573 break; 574 case R4_DWR: 575 pr_cont("store.\n"); 576 break; 577 case R4_DRD: 578 pr_cont("load.\n"); 579 break; 580 default: 581 ret = false; 582 } 583 } else { 584 ret = false; 585 } 586 587 return ret; 588 } 589 590 static bool f15h_mc0_mce(u16 ec, u8 xec) 591 { 592 bool ret = true; 593 594 if (MEM_ERROR(ec)) { 595 596 switch (xec) { 597 case 0x0: 598 pr_cont("Data Array access error.\n"); 599 break; 600 601 case 0x1: 602 pr_cont("UC error during a linefill from L2/NB.\n"); 603 break; 604 605 case 0x2: 606 case 0x11: 607 pr_cont("STQ access error.\n"); 608 break; 609 610 case 0x3: 611 pr_cont("SCB access error.\n"); 612 break; 613 614 case 0x10: 615 pr_cont("Tag error.\n"); 616 break; 617 618 case 0x12: 619 pr_cont("LDQ access error.\n"); 620 break; 621 622 default: 623 ret = false; 624 } 625 } else if (BUS_ERROR(ec)) { 626 627 if (!xec) 628 pr_cont("System Read Data Error.\n"); 629 else 630 pr_cont(" Internal error condition type %d.\n", xec); 631 } else if (INT_ERROR(ec)) { 632 if (xec <= 0x1f) 633 pr_cont("Hardware Assert.\n"); 634 else 635 ret = false; 636 637 } else 638 ret = false; 639 640 return ret; 641 } 642 643 static void decode_mc0_mce(struct mce *m) 644 { 645 u16 ec = EC(m->status); 646 u8 xec = XEC(m->status, xec_mask); 647 648 pr_emerg(HW_ERR "MC0 Error: "); 649 650 /* TLB error signatures are the same across families */ 651 if (TLB_ERROR(ec)) { 652 if (TT(ec) == TT_DATA) { 653 pr_cont("%s TLB %s.\n", LL_MSG(ec), 654 ((xec == 2) ? "locked miss" 655 : (xec ? "multimatch" : "parity"))); 656 return; 657 } 658 } else if (fam_ops.mc0_mce(ec, xec)) 659 ; 660 else 661 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); 662 } 663 664 static bool k8_mc1_mce(u16 ec, u8 xec) 665 { 666 u8 ll = LL(ec); 667 bool ret = true; 668 669 if (!MEM_ERROR(ec)) 670 return false; 671 672 if (ll == 0x2) 673 pr_cont("during a linefill from L2.\n"); 674 else if (ll == 0x1) { 675 switch (R4(ec)) { 676 case R4_IRD: 677 pr_cont("Parity error during data load.\n"); 678 break; 679 680 case R4_EVICT: 681 pr_cont("Copyback Parity/Victim error.\n"); 682 break; 683 684 case R4_SNOOP: 685 pr_cont("Tag Snoop error.\n"); 686 break; 687 688 default: 689 ret = false; 690 break; 691 } 692 } else 693 ret = false; 694 695 return ret; 696 } 697 698 static bool cat_mc1_mce(u16 ec, u8 xec) 699 { 700 u8 r4 = R4(ec); 701 bool ret = true; 702 703 if (!MEM_ERROR(ec)) 704 return false; 705 706 if (TT(ec) != TT_INSTR) 707 return false; 708 709 if (r4 == R4_IRD) 710 pr_cont("Data/tag array parity error for a tag hit.\n"); 711 else if (r4 == R4_SNOOP) 712 pr_cont("Tag error during snoop/victimization.\n"); 713 else if (xec == 0x0) 714 pr_cont("Tag parity error from victim castout.\n"); 715 else if (xec == 0x2) 716 pr_cont("Microcode patch RAM parity error.\n"); 717 else 718 ret = false; 719 720 return ret; 721 } 722 723 static bool f15h_mc1_mce(u16 ec, u8 xec) 724 { 725 bool ret = true; 726 727 if (!MEM_ERROR(ec)) 728 return false; 729 730 switch (xec) { 731 case 0x0 ... 0xa: 732 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); 733 break; 734 735 case 0xd: 736 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); 737 break; 738 739 case 0x10: 740 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); 741 break; 742 743 case 0x11 ... 0x15: 744 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); 745 break; 746 747 default: 748 ret = false; 749 } 750 return ret; 751 } 752 753 static void decode_mc1_mce(struct mce *m) 754 { 755 u16 ec = EC(m->status); 756 u8 xec = XEC(m->status, xec_mask); 757 758 pr_emerg(HW_ERR "MC1 Error: "); 759 760 if (TLB_ERROR(ec)) 761 pr_cont("%s TLB %s.\n", LL_MSG(ec), 762 (xec ? "multimatch" : "parity error")); 763 else if (BUS_ERROR(ec)) { 764 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); 765 766 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 767 } else if (INT_ERROR(ec)) { 768 if (xec <= 0x3f) 769 pr_cont("Hardware Assert.\n"); 770 else 771 goto wrong_mc1_mce; 772 } else if (fam_ops.mc1_mce(ec, xec)) 773 ; 774 else 775 goto wrong_mc1_mce; 776 777 return; 778 779 wrong_mc1_mce: 780 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); 781 } 782 783 static bool k8_mc2_mce(u16 ec, u8 xec) 784 { 785 bool ret = true; 786 787 if (xec == 0x1) 788 pr_cont(" in the write data buffers.\n"); 789 else if (xec == 0x3) 790 pr_cont(" in the victim data buffers.\n"); 791 else if (xec == 0x2 && MEM_ERROR(ec)) 792 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); 793 else if (xec == 0x0) { 794 if (TLB_ERROR(ec)) 795 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n", 796 TT_MSG(ec)); 797 else if (BUS_ERROR(ec)) 798 pr_cont(": %s/ECC error in data read from NB: %s.\n", 799 R4_MSG(ec), PP_MSG(ec)); 800 else if (MEM_ERROR(ec)) { 801 u8 r4 = R4(ec); 802 803 if (r4 >= 0x7) 804 pr_cont(": %s error during data copyback.\n", 805 R4_MSG(ec)); 806 else if (r4 <= 0x1) 807 pr_cont(": %s parity/ECC error during data " 808 "access from L2.\n", R4_MSG(ec)); 809 else 810 ret = false; 811 } else 812 ret = false; 813 } else 814 ret = false; 815 816 return ret; 817 } 818 819 static bool f15h_mc2_mce(u16 ec, u8 xec) 820 { 821 bool ret = true; 822 823 if (TLB_ERROR(ec)) { 824 if (xec == 0x0) 825 pr_cont("Data parity TLB read error.\n"); 826 else if (xec == 0x1) 827 pr_cont("Poison data provided for TLB fill.\n"); 828 else 829 ret = false; 830 } else if (BUS_ERROR(ec)) { 831 if (xec > 2) 832 ret = false; 833 834 pr_cont("Error during attempted NB data read.\n"); 835 } else if (MEM_ERROR(ec)) { 836 switch (xec) { 837 case 0x4 ... 0xc: 838 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); 839 break; 840 841 case 0x10 ... 0x14: 842 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); 843 break; 844 845 default: 846 ret = false; 847 } 848 } else if (INT_ERROR(ec)) { 849 if (xec <= 0x3f) 850 pr_cont("Hardware Assert.\n"); 851 else 852 ret = false; 853 } 854 855 return ret; 856 } 857 858 static bool f16h_mc2_mce(u16 ec, u8 xec) 859 { 860 u8 r4 = R4(ec); 861 862 if (!MEM_ERROR(ec)) 863 return false; 864 865 switch (xec) { 866 case 0x04 ... 0x05: 867 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); 868 break; 869 870 case 0x09 ... 0x0b: 871 case 0x0d ... 0x0f: 872 pr_cont("ECC error in L2 tag (%s).\n", 873 ((r4 == R4_GEN) ? "BankReq" : 874 ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); 875 break; 876 877 case 0x10 ... 0x19: 878 case 0x1b: 879 pr_cont("ECC error in L2 data array (%s).\n", 880 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : 881 ((r4 == R4_GEN) ? "Attr" : 882 ((r4 == R4_EVICT) ? "Vict" : "Fill")))); 883 break; 884 885 case 0x1c ... 0x1d: 886 case 0x1f: 887 pr_cont("Parity error in L2 attribute bits (%s).\n", 888 ((r4 == R4_RD) ? "Hit" : 889 ((r4 == R4_GEN) ? "Attr" : "Fill"))); 890 break; 891 892 default: 893 return false; 894 } 895 896 return true; 897 } 898 899 static void decode_mc2_mce(struct mce *m) 900 { 901 u16 ec = EC(m->status); 902 u8 xec = XEC(m->status, xec_mask); 903 904 pr_emerg(HW_ERR "MC2 Error: "); 905 906 if (!fam_ops.mc2_mce(ec, xec)) 907 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); 908 } 909 910 static void decode_mc3_mce(struct mce *m) 911 { 912 u16 ec = EC(m->status); 913 u8 xec = XEC(m->status, xec_mask); 914 915 if (boot_cpu_data.x86 >= 0x14) { 916 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," 917 " please report on LKML.\n"); 918 return; 919 } 920 921 pr_emerg(HW_ERR "MC3 Error"); 922 923 if (xec == 0x0) { 924 u8 r4 = R4(ec); 925 926 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 927 goto wrong_mc3_mce; 928 929 pr_cont(" during %s.\n", R4_MSG(ec)); 930 } else 931 goto wrong_mc3_mce; 932 933 return; 934 935 wrong_mc3_mce: 936 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); 937 } 938 939 static void decode_mc4_mce(struct mce *m) 940 { 941 unsigned int fam = x86_family(m->cpuid); 942 int node_id = topology_die_id(m->extcpu); 943 u16 ec = EC(m->status); 944 u8 xec = XEC(m->status, 0x1f); 945 u8 offset = 0; 946 947 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); 948 949 switch (xec) { 950 case 0x0 ... 0xe: 951 952 /* special handling for DRAM ECCs */ 953 if (xec == 0x0 || xec == 0x8) { 954 /* no ECCs on F11h */ 955 if (fam == 0x11) 956 goto wrong_mc4_mce; 957 958 pr_cont("%s.\n", mc4_mce_desc[xec]); 959 960 if (decode_dram_ecc) 961 decode_dram_ecc(node_id, m); 962 return; 963 } 964 break; 965 966 case 0xf: 967 if (TLB_ERROR(ec)) 968 pr_cont("GART Table Walk data error.\n"); 969 else if (BUS_ERROR(ec)) 970 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 971 else 972 goto wrong_mc4_mce; 973 return; 974 975 case 0x19: 976 if (fam == 0x15 || fam == 0x16) 977 pr_cont("Compute Unit Data Error.\n"); 978 else 979 goto wrong_mc4_mce; 980 return; 981 982 case 0x1c ... 0x1f: 983 offset = 13; 984 break; 985 986 default: 987 goto wrong_mc4_mce; 988 } 989 990 pr_cont("%s.\n", mc4_mce_desc[xec - offset]); 991 return; 992 993 wrong_mc4_mce: 994 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); 995 } 996 997 static void decode_mc5_mce(struct mce *m) 998 { 999 unsigned int fam = x86_family(m->cpuid); 1000 u16 ec = EC(m->status); 1001 u8 xec = XEC(m->status, xec_mask); 1002 1003 if (fam == 0xf || fam == 0x11) 1004 goto wrong_mc5_mce; 1005 1006 pr_emerg(HW_ERR "MC5 Error: "); 1007 1008 if (INT_ERROR(ec)) { 1009 if (xec <= 0x1f) { 1010 pr_cont("Hardware Assert.\n"); 1011 return; 1012 } else 1013 goto wrong_mc5_mce; 1014 } 1015 1016 if (xec == 0x0 || xec == 0xc) 1017 pr_cont("%s.\n", mc5_mce_desc[xec]); 1018 else if (xec <= 0xd) 1019 pr_cont("%s parity error.\n", mc5_mce_desc[xec]); 1020 else 1021 goto wrong_mc5_mce; 1022 1023 return; 1024 1025 wrong_mc5_mce: 1026 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); 1027 } 1028 1029 static void decode_mc6_mce(struct mce *m) 1030 { 1031 u8 xec = XEC(m->status, xec_mask); 1032 1033 pr_emerg(HW_ERR "MC6 Error: "); 1034 1035 if (xec > 0x5) 1036 goto wrong_mc6_mce; 1037 1038 pr_cont("%s parity error.\n", mc6_mce_desc[xec]); 1039 return; 1040 1041 wrong_mc6_mce: 1042 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); 1043 } 1044 1045 /* Decode errors according to Scalable MCA specification */ 1046 static void decode_smca_error(struct mce *m) 1047 { 1048 struct smca_hwid *hwid; 1049 enum smca_bank_types bank_type; 1050 const char *ip_name; 1051 u8 xec = XEC(m->status, xec_mask); 1052 1053 if (m->bank >= ARRAY_SIZE(smca_banks)) 1054 return; 1055 1056 hwid = smca_banks[m->bank].hwid; 1057 if (!hwid) 1058 return; 1059 1060 bank_type = hwid->bank_type; 1061 1062 if (bank_type == SMCA_RESERVED) { 1063 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); 1064 return; 1065 } 1066 1067 ip_name = smca_get_long_name(bank_type); 1068 1069 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); 1070 1071 /* Only print the decode of valid error codes */ 1072 if (xec < smca_mce_descs[bank_type].num_descs) 1073 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); 1074 1075 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) 1076 decode_dram_ecc(topology_die_id(m->extcpu), m); 1077 } 1078 1079 static inline void amd_decode_err_code(u16 ec) 1080 { 1081 if (INT_ERROR(ec)) { 1082 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); 1083 return; 1084 } 1085 1086 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); 1087 1088 if (BUS_ERROR(ec)) 1089 pr_cont(", mem/io: %s", II_MSG(ec)); 1090 else 1091 pr_cont(", tx: %s", TT_MSG(ec)); 1092 1093 if (MEM_ERROR(ec) || BUS_ERROR(ec)) { 1094 pr_cont(", mem-tx: %s", R4_MSG(ec)); 1095 1096 if (BUS_ERROR(ec)) 1097 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); 1098 } 1099 1100 pr_cont("\n"); 1101 } 1102 1103 static const char *decode_error_status(struct mce *m) 1104 { 1105 if (m->status & MCI_STATUS_UC) { 1106 if (m->status & MCI_STATUS_PCC) 1107 return "System Fatal error."; 1108 if (m->mcgstatus & MCG_STATUS_RIPV) 1109 return "Uncorrected, software restartable error."; 1110 return "Uncorrected, software containable error."; 1111 } 1112 1113 if (m->status & MCI_STATUS_DEFERRED) 1114 return "Deferred error, no action required."; 1115 1116 return "Corrected error, no action required."; 1117 } 1118 1119 static int 1120 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 1121 { 1122 struct mce *m = (struct mce *)data; 1123 unsigned int fam = x86_family(m->cpuid); 1124 int ecc; 1125 1126 if (m->kflags & MCE_HANDLED_CEC) 1127 return NOTIFY_DONE; 1128 1129 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 1130 1131 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", 1132 m->extcpu, 1133 fam, x86_model(m->cpuid), x86_stepping(m->cpuid), 1134 m->bank, 1135 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), 1136 ((m->status & MCI_STATUS_UC) ? "UE" : 1137 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), 1138 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), 1139 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), 1140 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); 1141 1142 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1143 u32 low, high; 1144 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); 1145 1146 if (!rdmsr_safe(addr, &low, &high) && 1147 (low & MCI_CONFIG_MCAX)) 1148 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); 1149 1150 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); 1151 } 1152 1153 /* do the two bits[14:13] together */ 1154 ecc = (m->status >> 45) & 0x3; 1155 if (ecc) 1156 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); 1157 1158 if (fam >= 0x15) { 1159 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); 1160 1161 /* F15h, bank4, bit 43 is part of McaStatSubCache. */ 1162 if (fam != 0x15 || m->bank != 4) 1163 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); 1164 } 1165 1166 if (fam >= 0x17) 1167 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); 1168 1169 pr_cont("]: 0x%016llx\n", m->status); 1170 1171 if (m->status & MCI_STATUS_ADDRV) 1172 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); 1173 1174 if (m->ppin) 1175 pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin); 1176 1177 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1178 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); 1179 1180 if (m->status & MCI_STATUS_SYNDV) 1181 pr_cont(", Syndrome: 0x%016llx", m->synd); 1182 1183 pr_cont("\n"); 1184 1185 decode_smca_error(m); 1186 goto err_code; 1187 } 1188 1189 if (m->tsc) 1190 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); 1191 1192 /* Doesn't matter which member to test. */ 1193 if (!fam_ops.mc0_mce) 1194 goto err_code; 1195 1196 switch (m->bank) { 1197 case 0: 1198 decode_mc0_mce(m); 1199 break; 1200 1201 case 1: 1202 decode_mc1_mce(m); 1203 break; 1204 1205 case 2: 1206 decode_mc2_mce(m); 1207 break; 1208 1209 case 3: 1210 decode_mc3_mce(m); 1211 break; 1212 1213 case 4: 1214 decode_mc4_mce(m); 1215 break; 1216 1217 case 5: 1218 decode_mc5_mce(m); 1219 break; 1220 1221 case 6: 1222 decode_mc6_mce(m); 1223 break; 1224 1225 default: 1226 break; 1227 } 1228 1229 err_code: 1230 amd_decode_err_code(m->status & 0xffff); 1231 1232 m->kflags |= MCE_HANDLED_EDAC; 1233 return NOTIFY_OK; 1234 } 1235 1236 static struct notifier_block amd_mce_dec_nb = { 1237 .notifier_call = amd_decode_mce, 1238 .priority = MCE_PRIO_EDAC, 1239 }; 1240 1241 static int __init mce_amd_init(void) 1242 { 1243 struct cpuinfo_x86 *c = &boot_cpu_data; 1244 1245 if (c->x86_vendor != X86_VENDOR_AMD && 1246 c->x86_vendor != X86_VENDOR_HYGON) 1247 return -ENODEV; 1248 1249 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1250 xec_mask = 0x3f; 1251 goto out; 1252 } 1253 1254 switch (c->x86) { 1255 case 0xf: 1256 fam_ops.mc0_mce = k8_mc0_mce; 1257 fam_ops.mc1_mce = k8_mc1_mce; 1258 fam_ops.mc2_mce = k8_mc2_mce; 1259 break; 1260 1261 case 0x10: 1262 fam_ops.mc0_mce = f10h_mc0_mce; 1263 fam_ops.mc1_mce = k8_mc1_mce; 1264 fam_ops.mc2_mce = k8_mc2_mce; 1265 break; 1266 1267 case 0x11: 1268 fam_ops.mc0_mce = k8_mc0_mce; 1269 fam_ops.mc1_mce = k8_mc1_mce; 1270 fam_ops.mc2_mce = k8_mc2_mce; 1271 break; 1272 1273 case 0x12: 1274 fam_ops.mc0_mce = f12h_mc0_mce; 1275 fam_ops.mc1_mce = k8_mc1_mce; 1276 fam_ops.mc2_mce = k8_mc2_mce; 1277 break; 1278 1279 case 0x14: 1280 fam_ops.mc0_mce = cat_mc0_mce; 1281 fam_ops.mc1_mce = cat_mc1_mce; 1282 fam_ops.mc2_mce = k8_mc2_mce; 1283 break; 1284 1285 case 0x15: 1286 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f; 1287 1288 fam_ops.mc0_mce = f15h_mc0_mce; 1289 fam_ops.mc1_mce = f15h_mc1_mce; 1290 fam_ops.mc2_mce = f15h_mc2_mce; 1291 break; 1292 1293 case 0x16: 1294 xec_mask = 0x1f; 1295 fam_ops.mc0_mce = cat_mc0_mce; 1296 fam_ops.mc1_mce = cat_mc1_mce; 1297 fam_ops.mc2_mce = f16h_mc2_mce; 1298 break; 1299 1300 case 0x17: 1301 case 0x18: 1302 pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); 1303 return -EINVAL; 1304 1305 default: 1306 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); 1307 return -EINVAL; 1308 } 1309 1310 out: 1311 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 1312 1313 mce_register_decode_chain(&amd_mce_dec_nb); 1314 1315 return 0; 1316 } 1317 early_initcall(mce_amd_init); 1318 1319 #ifdef MODULE 1320 static void __exit mce_amd_exit(void) 1321 { 1322 mce_unregister_decode_chain(&amd_mce_dec_nb); 1323 } 1324 1325 MODULE_DESCRIPTION("AMD MCE decoder"); 1326 MODULE_ALIAS("edac-mce-amd"); 1327 MODULE_LICENSE("GPL"); 1328 module_exit(mce_amd_exit); 1329 #endif 1330