#include <linux/module.h>
#include <linux/slab.h>

#include "mce_amd.h"

static struct amd_decoder_ops *fam_ops;

static u8 xec_mask = 0xf;

static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct mce *m);

void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (nb_bus_decoder) {
		WARN_ON(nb_bus_decoder != f);

		nb_bus_decoder = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
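
/*
 * Example (editor's sketch, not part of the original file): how an ECC-aware
 * consumer, e.g. a driver along the lines of amd64_edac, would hook into the
 * three exported registration helpers above.  Only those exported functions
 * are real; the callback name and module wiring below are illustrative.
 *
 *	static void my_nb_ecc_decoder(int node_id, struct mce *m)
 *	{
 *		// translate m->addr into a csrow/channel and report via EDAC
 *	}
 *
 *	static int __init my_edac_init(void)
 *	{
 *		amd_report_gart_errors(false);	// keep GART TLB errors filtered
 *		amd_register_ecc_decoder(my_nb_ecc_decoder);
 *		return 0;
 *	}
 *
 *	static void __exit my_edac_exit(void)
 *	{
 *		amd_unregister_ecc_decoder(my_nb_ecc_decoder);
 *	}
 */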

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/* Scalable MCA error strings */
static const char * const f17h_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"",						/* reserved */
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

static const char * const f17h_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
};

static const char * const f17h_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

static const char * const f17h_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

static const char * const f17h_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
};

static const char * const f17h_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
};

static const char * const f17h_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

static const char * const f17h_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
violation", 243 "Illegal response from transport layer", 244 "Unexpected response", 245 "Parity error on incoming request or probe response data", 246 "Parity error on incoming read response data", 247 "Atomic request parity", 248 "ECC error on probe filter access", 249 }; 250 251 static const char * const f17h_pie_mce_desc[] = { 252 "HW assert", 253 "Internal PIE register security violation", 254 "Error on GMI link", 255 "Poison data written to internal PIE register", 256 }; 257 258 static const char * const f17h_umc_mce_desc[] = { 259 "DRAM ECC error", 260 "Data poison error on DRAM", 261 "SDP parity error", 262 "Advanced peripheral bus error", 263 "Command/address parity error", 264 "Write data CRC error", 265 }; 266 267 static const char * const f17h_pb_mce_desc[] = { 268 "Parameter Block RAM ECC error", 269 }; 270 271 static const char * const f17h_psp_mce_desc[] = { 272 "PSP RAM ECC or parity error", 273 }; 274 275 static const char * const f17h_smu_mce_desc[] = { 276 "SMU RAM ECC or parity error", 277 }; 278 279 static bool f12h_mc0_mce(u16 ec, u8 xec) 280 { 281 bool ret = false; 282 283 if (MEM_ERROR(ec)) { 284 u8 ll = LL(ec); 285 ret = true; 286 287 if (ll == LL_L2) 288 pr_cont("during L1 linefill from L2.\n"); 289 else if (ll == LL_L1) 290 pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); 291 else 292 ret = false; 293 } 294 return ret; 295 } 296 297 static bool f10h_mc0_mce(u16 ec, u8 xec) 298 { 299 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { 300 pr_cont("during data scrub.\n"); 301 return true; 302 } 303 return f12h_mc0_mce(ec, xec); 304 } 305 306 static bool k8_mc0_mce(u16 ec, u8 xec) 307 { 308 if (BUS_ERROR(ec)) { 309 pr_cont("during system linefill.\n"); 310 return true; 311 } 312 313 return f10h_mc0_mce(ec, xec); 314 } 315 316 static bool cat_mc0_mce(u16 ec, u8 xec) 317 { 318 u8 r4 = R4(ec); 319 bool ret = true; 320 321 if (MEM_ERROR(ec)) { 322 323 if (TT(ec) != TT_DATA || LL(ec) != LL_L1) 324 return false; 325 326 switch (r4) { 327 case R4_DRD: 328 case R4_DWR: 329 pr_cont("Data/Tag parity error due to %s.\n", 330 (r4 == R4_DRD ? 
"load/hw prf" : "store")); 331 break; 332 case R4_EVICT: 333 pr_cont("Copyback parity error on a tag miss.\n"); 334 break; 335 case R4_SNOOP: 336 pr_cont("Tag parity error during snoop.\n"); 337 break; 338 default: 339 ret = false; 340 } 341 } else if (BUS_ERROR(ec)) { 342 343 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) 344 return false; 345 346 pr_cont("System read data error on a "); 347 348 switch (r4) { 349 case R4_RD: 350 pr_cont("TLB reload.\n"); 351 break; 352 case R4_DWR: 353 pr_cont("store.\n"); 354 break; 355 case R4_DRD: 356 pr_cont("load.\n"); 357 break; 358 default: 359 ret = false; 360 } 361 } else { 362 ret = false; 363 } 364 365 return ret; 366 } 367 368 static bool f15h_mc0_mce(u16 ec, u8 xec) 369 { 370 bool ret = true; 371 372 if (MEM_ERROR(ec)) { 373 374 switch (xec) { 375 case 0x0: 376 pr_cont("Data Array access error.\n"); 377 break; 378 379 case 0x1: 380 pr_cont("UC error during a linefill from L2/NB.\n"); 381 break; 382 383 case 0x2: 384 case 0x11: 385 pr_cont("STQ access error.\n"); 386 break; 387 388 case 0x3: 389 pr_cont("SCB access error.\n"); 390 break; 391 392 case 0x10: 393 pr_cont("Tag error.\n"); 394 break; 395 396 case 0x12: 397 pr_cont("LDQ access error.\n"); 398 break; 399 400 default: 401 ret = false; 402 } 403 } else if (BUS_ERROR(ec)) { 404 405 if (!xec) 406 pr_cont("System Read Data Error.\n"); 407 else 408 pr_cont(" Internal error condition type %d.\n", xec); 409 } else if (INT_ERROR(ec)) { 410 if (xec <= 0x1f) 411 pr_cont("Hardware Assert.\n"); 412 else 413 ret = false; 414 415 } else 416 ret = false; 417 418 return ret; 419 } 420 421 static void decode_mc0_mce(struct mce *m) 422 { 423 u16 ec = EC(m->status); 424 u8 xec = XEC(m->status, xec_mask); 425 426 pr_emerg(HW_ERR "MC0 Error: "); 427 428 /* TLB error signatures are the same across families */ 429 if (TLB_ERROR(ec)) { 430 if (TT(ec) == TT_DATA) { 431 pr_cont("%s TLB %s.\n", LL_MSG(ec), 432 ((xec == 2) ? "locked miss" 433 : (xec ? "multimatch" : "parity"))); 434 return; 435 } 436 } else if (fam_ops->mc0_mce(ec, xec)) 437 ; 438 else 439 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); 440 } 441 442 static bool k8_mc1_mce(u16 ec, u8 xec) 443 { 444 u8 ll = LL(ec); 445 bool ret = true; 446 447 if (!MEM_ERROR(ec)) 448 return false; 449 450 if (ll == 0x2) 451 pr_cont("during a linefill from L2.\n"); 452 else if (ll == 0x1) { 453 switch (R4(ec)) { 454 case R4_IRD: 455 pr_cont("Parity error during data load.\n"); 456 break; 457 458 case R4_EVICT: 459 pr_cont("Copyback Parity/Victim error.\n"); 460 break; 461 462 case R4_SNOOP: 463 pr_cont("Tag Snoop error.\n"); 464 break; 465 466 default: 467 ret = false; 468 break; 469 } 470 } else 471 ret = false; 472 473 return ret; 474 } 475 476 static bool cat_mc1_mce(u16 ec, u8 xec) 477 { 478 u8 r4 = R4(ec); 479 bool ret = true; 480 481 if (!MEM_ERROR(ec)) 482 return false; 483 484 if (TT(ec) != TT_INSTR) 485 return false; 486 487 if (r4 == R4_IRD) 488 pr_cont("Data/tag array parity error for a tag hit.\n"); 489 else if (r4 == R4_SNOOP) 490 pr_cont("Tag error during snoop/victimization.\n"); 491 else if (xec == 0x0) 492 pr_cont("Tag parity error from victim castout.\n"); 493 else if (xec == 0x2) 494 pr_cont("Microcode patch RAM parity error.\n"); 495 else 496 ret = false; 497 498 return ret; 499 } 500 501 static bool f15h_mc1_mce(u16 ec, u8 xec) 502 { 503 bool ret = true; 504 505 if (!MEM_ERROR(ec)) 506 return false; 507 508 switch (xec) { 509 case 0x0 ... 
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

static void decode_mc4_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (nb_bus_decoder)
				nb_bus_decoder(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

static void decode_mc5_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (c->x86 == 0xf || c->x86 == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

static void decode_f17h_core_errors(const char *ip_name, u8 xec,
				    unsigned int mca_type)
{
	const char * const *error_desc_array;
	size_t len;

	pr_emerg(HW_ERR "%s Error: ", ip_name);

	switch (mca_type) {
	case SMCA_LS:
		error_desc_array = f17h_ls_mce_desc;
		len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;

		if (xec == 0x4) {
			pr_cont("Unrecognized LS MCA error code.\n");
			return;
		}
		break;

	case SMCA_IF:
		error_desc_array = f17h_if_mce_desc;
		len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
		break;

	case SMCA_L2_CACHE:
		error_desc_array = f17h_l2_mce_desc;
		len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
		break;

	case SMCA_DE:
		error_desc_array = f17h_de_mce_desc;
		len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
		break;

	case SMCA_EX:
		error_desc_array = f17h_ex_mce_desc;
		len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
		break;

	case SMCA_FP:
		error_desc_array = f17h_fp_mce_desc;
		len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
		break;

	case SMCA_L3_CACHE:
		error_desc_array = f17h_l3_mce_desc;
		len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
		break;

	default:
		pr_cont("Corrupted MCA core error info.\n");
		return;
	}

	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			amd_core_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}

static void decode_df_errors(u8 xec, unsigned int mca_type)
{
	const char * const *error_desc_array;
	size_t len;

	pr_emerg(HW_ERR "Data Fabric Error: ");

	switch (mca_type) {
	case SMCA_CS:
		error_desc_array = f17h_cs_mce_desc;
		len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
		break;

	case SMCA_PIE:
		error_desc_array = f17h_pie_mce_desc;
		len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
		break;

	default:
		pr_cont("Corrupted MCA Data Fabric info.\n");
		return;
	}

	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			amd_df_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m)
{
	u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
	unsigned int hwid, mca_type, i;
	u8 xec = XEC(m->status, xec_mask);
	const char * const *error_desc_array;
	const char *ip_name;
	u32 low, high;
	size_t len;

	if (rdmsr_safe(addr, &low, &high)) {
		pr_emerg("Invalid IP block specified, error information is unreliable.\n");
		return;
	}

	hwid = high & MCI_IPID_HWID;
	mca_type = (high & MCI_IPID_MCATYPE) >> 16;

	pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);

	/*
	 * Based on hwid and mca_type values, decode errors from respective IPs.
	 * Note: mca_type values make sense only in the context of an hwid.
	 */
	for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
		if (amd_hwids[i].hwid == hwid)
			break;

	switch (i) {
	case SMCA_F17H_CORE:
		ip_name = (mca_type == SMCA_L3_CACHE) ?
			  "L3 Cache" : "F17h Core";
		return decode_f17h_core_errors(ip_name, xec, mca_type);
		break;

	case SMCA_DF:
		return decode_df_errors(xec, mca_type);
		break;

	case SMCA_UMC:
		error_desc_array = f17h_umc_mce_desc;
		len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
		break;

	case SMCA_PB:
		error_desc_array = f17h_pb_mce_desc;
		len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
		break;

	case SMCA_PSP:
		error_desc_array = f17h_psp_mce_desc;
		len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
		break;

	case SMCA_SMU:
		error_desc_array = f17h_smu_mce_desc;
		len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
		break;

	default:
		pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
		return;
	}

	ip_name = amd_hwids[i].name;
	pr_emerg(HW_ERR "%s Error: ", ip_name);

	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}

static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/*
 * Filter out unwanted MCE signatures here.
 */
static bool amd_filter_mce(struct mce *m)
{
	u8 xec = (m->status >> 16) & 0x1f;

	/*
	 * NB GART TLB error reporting is disabled by default.
	 */
	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
		return true;

	return false;
}

static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error.";

	return "Corrected error, no action required.";
}

int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	if (c->x86 >= 0x15)
		pr_cont("|%s|%s",
			((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
			((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		decode_smca_errors(m);
		goto err_code;
	}

	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};

static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif