1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * 4 * Shared code by both skx_edac and i10nm_edac. Originally split out 5 * from the skx_edac driver. 6 * 7 * This file is linked into both skx_edac and i10nm_edac drivers. In 8 * order to avoid link errors, this file must be like a pure library 9 * without including symbols and defines which would otherwise conflict, 10 * when linked once into a module and into a built-in object, at the 11 * same time. For example, __this_module symbol references when that 12 * file is being linked into a built-in object. 13 * 14 * Copyright (c) 2018, Intel Corporation. 15 */ 16 17 #include <linux/acpi.h> 18 #include <linux/dmi.h> 19 #include <linux/adxl.h> 20 #include <acpi/nfit.h> 21 #include <asm/mce.h> 22 #include "edac_module.h" 23 #include "skx_common.h" 24 25 static const char * const component_names[] = { 26 [INDEX_SOCKET] = "ProcessorSocketId", 27 [INDEX_MEMCTRL] = "MemoryControllerId", 28 [INDEX_CHANNEL] = "ChannelId", 29 [INDEX_DIMM] = "DimmSlotId", 30 }; 31 32 static int component_indices[ARRAY_SIZE(component_names)]; 33 static int adxl_component_count; 34 static const char * const *adxl_component_names; 35 static u64 *adxl_values; 36 static char *adxl_msg; 37 38 static char skx_msg[MSG_SIZE]; 39 static skx_decode_f skx_decode; 40 static u64 skx_tolm, skx_tohm; 41 static LIST_HEAD(dev_edac_list); 42 43 int __init skx_adxl_get(void) 44 { 45 const char * const *names; 46 int i, j; 47 48 names = adxl_get_component_names(); 49 if (!names) { 50 skx_printk(KERN_NOTICE, "No firmware support for address translation.\n"); 51 return -ENODEV; 52 } 53 54 for (i = 0; i < INDEX_MAX; i++) { 55 for (j = 0; names[j]; j++) { 56 if (!strcmp(component_names[i], names[j])) { 57 component_indices[i] = j; 58 break; 59 } 60 } 61 62 if (!names[j]) 63 goto err; 64 } 65 66 adxl_component_names = names; 67 while (*names++) 68 adxl_component_count++; 69 70 adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values), 71 GFP_KERNEL); 72 if (!adxl_values) { 73 adxl_component_count = 0; 74 return -ENOMEM; 75 } 76 77 adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL); 78 if (!adxl_msg) { 79 adxl_component_count = 0; 80 kfree(adxl_values); 81 return -ENOMEM; 82 } 83 84 return 0; 85 err: 86 skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ", 87 component_names[i]); 88 for (j = 0; names[j]; j++) 89 skx_printk(KERN_CONT, "%s ", names[j]); 90 skx_printk(KERN_CONT, "\n"); 91 92 return -ENODEV; 93 } 94 95 void __exit skx_adxl_put(void) 96 { 97 kfree(adxl_values); 98 kfree(adxl_msg); 99 } 100 101 static bool skx_adxl_decode(struct decoded_addr *res) 102 { 103 int i, len = 0; 104 105 if (res->addr >= skx_tohm || (res->addr >= skx_tolm && 106 res->addr < BIT_ULL(32))) { 107 edac_dbg(0, "Address 0x%llx out of range\n", res->addr); 108 return false; 109 } 110 111 if (adxl_decode(res->addr, adxl_values)) { 112 edac_dbg(0, "Failed to decode 0x%llx\n", res->addr); 113 return false; 114 } 115 116 res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; 117 res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; 118 res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; 119 res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; 120 121 for (i = 0; i < adxl_component_count; i++) { 122 if (adxl_values[i] == ~0x0ull) 123 continue; 124 125 len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx", 126 adxl_component_names[i], adxl_values[i]); 127 if (MSG_SIZE - len <= 0) 128 break; 129 } 130 131 return true; 132 } 133 134 void skx_set_decode(skx_decode_f decode) 135 { 136 skx_decode = decode; 137 } 138 139 int skx_get_src_id(struct skx_dev *d, int off, u8 *id) 140 { 141 u32 reg; 142 143 if (pci_read_config_dword(d->util_all, off, ®)) { 144 skx_printk(KERN_ERR, "Failed to read src id\n"); 145 return -ENODEV; 146 } 147 148 *id = GET_BITFIELD(reg, 12, 14); 149 return 0; 150 } 151 152 int skx_get_node_id(struct skx_dev *d, u8 *id) 153 { 154 u32 reg; 155 156 if (pci_read_config_dword(d->util_all, 0xf4, ®)) { 157 skx_printk(KERN_ERR, "Failed to read node id\n"); 158 return -ENODEV; 159 } 160 161 *id = GET_BITFIELD(reg, 0, 2); 162 return 0; 163 } 164 165 static int get_width(u32 mtr) 166 { 167 switch (GET_BITFIELD(mtr, 8, 9)) { 168 case 0: 169 return DEV_X4; 170 case 1: 171 return DEV_X8; 172 case 2: 173 return DEV_X16; 174 } 175 return DEV_UNKNOWN; 176 } 177 178 /* 179 * We use the per-socket device @did to count how many sockets are present, 180 * and to detemine which PCI buses are associated with each socket. Allocate 181 * and build the full list of all the skx_dev structures that we need here. 182 */ 183 int skx_get_all_bus_mappings(unsigned int did, int off, enum type type, 184 struct list_head **list) 185 { 186 struct pci_dev *pdev, *prev; 187 struct skx_dev *d; 188 u32 reg; 189 int ndev = 0; 190 191 prev = NULL; 192 for (;;) { 193 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, did, prev); 194 if (!pdev) 195 break; 196 ndev++; 197 d = kzalloc(sizeof(*d), GFP_KERNEL); 198 if (!d) { 199 pci_dev_put(pdev); 200 return -ENOMEM; 201 } 202 203 if (pci_read_config_dword(pdev, off, ®)) { 204 kfree(d); 205 pci_dev_put(pdev); 206 skx_printk(KERN_ERR, "Failed to read bus idx\n"); 207 return -ENODEV; 208 } 209 210 d->bus[0] = GET_BITFIELD(reg, 0, 7); 211 d->bus[1] = GET_BITFIELD(reg, 8, 15); 212 if (type == SKX) { 213 d->seg = pci_domain_nr(pdev->bus); 214 d->bus[2] = GET_BITFIELD(reg, 16, 23); 215 d->bus[3] = GET_BITFIELD(reg, 24, 31); 216 } else { 217 d->seg = GET_BITFIELD(reg, 16, 23); 218 } 219 220 edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", 221 d->bus[0], d->bus[1], d->bus[2], d->bus[3]); 222 list_add_tail(&d->list, &dev_edac_list); 223 prev = pdev; 224 } 225 226 if (list) 227 *list = &dev_edac_list; 228 return ndev; 229 } 230 231 int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm) 232 { 233 struct pci_dev *pdev; 234 u32 reg; 235 236 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, did, NULL); 237 if (!pdev) { 238 skx_printk(KERN_ERR, "Can't get tolm/tohm\n"); 239 return -ENODEV; 240 } 241 242 if (pci_read_config_dword(pdev, off[0], ®)) { 243 skx_printk(KERN_ERR, "Failed to read tolm\n"); 244 goto fail; 245 } 246 skx_tolm = reg; 247 248 if (pci_read_config_dword(pdev, off[1], ®)) { 249 skx_printk(KERN_ERR, "Failed to read lower tohm\n"); 250 goto fail; 251 } 252 skx_tohm = reg; 253 254 if (pci_read_config_dword(pdev, off[2], ®)) { 255 skx_printk(KERN_ERR, "Failed to read upper tohm\n"); 256 goto fail; 257 } 258 skx_tohm |= (u64)reg << 32; 259 260 pci_dev_put(pdev); 261 *tolm = skx_tolm; 262 *tohm = skx_tohm; 263 edac_dbg(2, "tolm = 0x%llx tohm = 0x%llx\n", skx_tolm, skx_tohm); 264 return 0; 265 fail: 266 pci_dev_put(pdev); 267 return -ENODEV; 268 } 269 270 static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, 271 int minval, int maxval, const char *name) 272 { 273 u32 val = GET_BITFIELD(reg, lobit, hibit); 274 275 if (val < minval || val > maxval) { 276 edac_dbg(2, "bad %s = %d (raw=0x%x)\n", name, val, reg); 277 return -EINVAL; 278 } 279 return val + add; 280 } 281 282 #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks") 283 #define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 6, "rows") 284 #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") 285 286 int skx_get_dimm_info(u32 mtr, u32 amap, struct dimm_info *dimm, 287 struct skx_imc *imc, int chan, int dimmno) 288 { 289 int banks = 16, ranks, rows, cols, npages; 290 u64 size; 291 292 ranks = numrank(mtr); 293 rows = numrow(mtr); 294 cols = numcol(mtr); 295 296 /* 297 * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) 298 */ 299 size = ((1ull << (rows + cols + ranks)) * banks) >> (20 - 3); 300 npages = MiB_TO_PAGES(size); 301 302 edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld MiB (%d pages) bank: %d, rank: %d, row: 0x%x, col: 0x%x\n", 303 imc->mc, chan, dimmno, size, npages, 304 banks, 1 << ranks, rows, cols); 305 306 imc->chan[chan].dimms[dimmno].close_pg = GET_BITFIELD(mtr, 0, 0); 307 imc->chan[chan].dimms[dimmno].bank_xor_enable = GET_BITFIELD(mtr, 9, 9); 308 imc->chan[chan].dimms[dimmno].fine_grain_bank = GET_BITFIELD(amap, 0, 0); 309 imc->chan[chan].dimms[dimmno].rowbits = rows; 310 imc->chan[chan].dimms[dimmno].colbits = cols; 311 312 dimm->nr_pages = npages; 313 dimm->grain = 32; 314 dimm->dtype = get_width(mtr); 315 dimm->mtype = MEM_DDR4; 316 dimm->edac_mode = EDAC_SECDED; /* likely better than this */ 317 snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", 318 imc->src_id, imc->lmc, chan, dimmno); 319 320 return 1; 321 } 322 323 int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, 324 int chan, int dimmno, const char *mod_str) 325 { 326 int smbios_handle; 327 u32 dev_handle; 328 u16 flags; 329 u64 size = 0; 330 331 dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, 332 imc->src_id, 0); 333 334 smbios_handle = nfit_get_smbios_id(dev_handle, &flags); 335 if (smbios_handle == -EOPNOTSUPP) { 336 pr_warn_once("%s: Can't find size of NVDIMM. Try enabling CONFIG_ACPI_NFIT\n", mod_str); 337 goto unknown_size; 338 } 339 340 if (smbios_handle < 0) { 341 skx_printk(KERN_ERR, "Can't find handle for NVDIMM ADR=0x%x\n", dev_handle); 342 goto unknown_size; 343 } 344 345 if (flags & ACPI_NFIT_MEM_MAP_FAILED) { 346 skx_printk(KERN_ERR, "NVDIMM ADR=0x%x is not mapped\n", dev_handle); 347 goto unknown_size; 348 } 349 350 size = dmi_memdev_size(smbios_handle); 351 if (size == ~0ull) 352 skx_printk(KERN_ERR, "Can't find size for NVDIMM ADR=0x%x/SMBIOS=0x%x\n", 353 dev_handle, smbios_handle); 354 355 unknown_size: 356 dimm->nr_pages = size >> PAGE_SHIFT; 357 dimm->grain = 32; 358 dimm->dtype = DEV_UNKNOWN; 359 dimm->mtype = MEM_NVDIMM; 360 dimm->edac_mode = EDAC_SECDED; /* likely better than this */ 361 362 edac_dbg(0, "mc#%d: channel %d, dimm %d, %llu MiB (%u pages)\n", 363 imc->mc, chan, dimmno, size >> 20, dimm->nr_pages); 364 365 snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", 366 imc->src_id, imc->lmc, chan, dimmno); 367 368 return (size == 0 || size == ~0ull) ? 0 : 1; 369 } 370 371 int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, 372 const char *ctl_name, const char *mod_str, 373 get_dimm_config_f get_dimm_config) 374 { 375 struct mem_ctl_info *mci; 376 struct edac_mc_layer layers[2]; 377 struct skx_pvt *pvt; 378 int rc; 379 380 /* Allocate a new MC control structure */ 381 layers[0].type = EDAC_MC_LAYER_CHANNEL; 382 layers[0].size = NUM_CHANNELS; 383 layers[0].is_virt_csrow = false; 384 layers[1].type = EDAC_MC_LAYER_SLOT; 385 layers[1].size = NUM_DIMMS; 386 layers[1].is_virt_csrow = true; 387 mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, 388 sizeof(struct skx_pvt)); 389 390 if (unlikely(!mci)) 391 return -ENOMEM; 392 393 edac_dbg(0, "MC#%d: mci = %p\n", imc->mc, mci); 394 395 /* Associate skx_dev and mci for future usage */ 396 imc->mci = mci; 397 pvt = mci->pvt_info; 398 pvt->imc = imc; 399 400 mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name, 401 imc->node_id, imc->lmc); 402 if (!mci->ctl_name) { 403 rc = -ENOMEM; 404 goto fail0; 405 } 406 407 mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; 408 mci->edac_ctl_cap = EDAC_FLAG_NONE; 409 mci->edac_cap = EDAC_FLAG_NONE; 410 mci->mod_name = mod_str; 411 mci->dev_name = pci_name(pdev); 412 mci->ctl_page_to_phys = NULL; 413 414 rc = get_dimm_config(mci); 415 if (rc < 0) 416 goto fail; 417 418 /* Record ptr to the generic device */ 419 mci->pdev = &pdev->dev; 420 421 /* Add this new MC control structure to EDAC's list of MCs */ 422 if (unlikely(edac_mc_add_mc(mci))) { 423 edac_dbg(0, "MC: failed edac_mc_add_mc()\n"); 424 rc = -EINVAL; 425 goto fail; 426 } 427 428 return 0; 429 430 fail: 431 kfree(mci->ctl_name); 432 fail0: 433 edac_mc_free(mci); 434 imc->mci = NULL; 435 return rc; 436 } 437 438 static void skx_unregister_mci(struct skx_imc *imc) 439 { 440 struct mem_ctl_info *mci = imc->mci; 441 442 if (!mci) 443 return; 444 445 edac_dbg(0, "MC%d: mci = %p\n", imc->mc, mci); 446 447 /* Remove MC sysfs nodes */ 448 edac_mc_del_mc(mci->pdev); 449 450 edac_dbg(1, "%s: free mci struct\n", mci->ctl_name); 451 kfree(mci->ctl_name); 452 edac_mc_free(mci); 453 } 454 455 static struct mem_ctl_info *get_mci(int src_id, int lmc) 456 { 457 struct skx_dev *d; 458 459 if (lmc > NUM_IMC - 1) { 460 skx_printk(KERN_ERR, "Bad lmc %d\n", lmc); 461 return NULL; 462 } 463 464 list_for_each_entry(d, &dev_edac_list, list) { 465 if (d->imc[0].src_id == src_id) 466 return d->imc[lmc].mci; 467 } 468 469 skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc); 470 return NULL; 471 } 472 473 static void skx_mce_output_error(struct mem_ctl_info *mci, 474 const struct mce *m, 475 struct decoded_addr *res) 476 { 477 enum hw_event_mc_err_type tp_event; 478 char *type, *optype; 479 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); 480 bool overflow = GET_BITFIELD(m->status, 62, 62); 481 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); 482 bool recoverable; 483 u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); 484 u32 mscod = GET_BITFIELD(m->status, 16, 31); 485 u32 errcode = GET_BITFIELD(m->status, 0, 15); 486 u32 optypenum = GET_BITFIELD(m->status, 4, 6); 487 488 recoverable = GET_BITFIELD(m->status, 56, 56); 489 490 if (uncorrected_error) { 491 core_err_cnt = 1; 492 if (ripv) { 493 type = "FATAL"; 494 tp_event = HW_EVENT_ERR_FATAL; 495 } else { 496 type = "NON_FATAL"; 497 tp_event = HW_EVENT_ERR_UNCORRECTED; 498 } 499 } else { 500 type = "CORRECTED"; 501 tp_event = HW_EVENT_ERR_CORRECTED; 502 } 503 504 /* 505 * According to Intel Architecture spec vol 3B, 506 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" 507 * memory errors should fit one of these masks: 508 * 000f 0000 1mmm cccc (binary) 509 * 000f 0010 1mmm cccc (binary) [RAM used as cache] 510 * where: 511 * f = Correction Report Filtering Bit. If 1, subsequent errors 512 * won't be shown 513 * mmm = error type 514 * cccc = channel 515 * If the mask doesn't match, report an error to the parsing logic 516 */ 517 if (!((errcode & 0xef80) == 0x80 || (errcode & 0xef80) == 0x280)) { 518 optype = "Can't parse: it is not a mem"; 519 } else { 520 switch (optypenum) { 521 case 0: 522 optype = "generic undef request error"; 523 break; 524 case 1: 525 optype = "memory read error"; 526 break; 527 case 2: 528 optype = "memory write error"; 529 break; 530 case 3: 531 optype = "addr/cmd error"; 532 break; 533 case 4: 534 optype = "memory scrubbing error"; 535 break; 536 default: 537 optype = "reserved"; 538 break; 539 } 540 } 541 if (adxl_component_count) { 542 snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", 543 overflow ? " OVERFLOW" : "", 544 (uncorrected_error && recoverable) ? " recoverable" : "", 545 mscod, errcode, adxl_msg); 546 } else { 547 snprintf(skx_msg, MSG_SIZE, 548 "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", 549 overflow ? " OVERFLOW" : "", 550 (uncorrected_error && recoverable) ? " recoverable" : "", 551 mscod, errcode, 552 res->socket, res->imc, res->rank, 553 res->bank_group, res->bank_address, res->row, res->column); 554 } 555 556 edac_dbg(0, "%s\n", skx_msg); 557 558 /* Call the helper to output message */ 559 edac_mc_handle_error(tp_event, mci, core_err_cnt, 560 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, 561 res->channel, res->dimm, -1, 562 optype, skx_msg); 563 } 564 565 int skx_mce_check_error(struct notifier_block *nb, unsigned long val, 566 void *data) 567 { 568 struct mce *mce = (struct mce *)data; 569 struct decoded_addr res; 570 struct mem_ctl_info *mci; 571 char *type; 572 573 if (edac_get_report_status() == EDAC_REPORTING_DISABLED) 574 return NOTIFY_DONE; 575 576 /* ignore unless this is memory related with an address */ 577 if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) 578 return NOTIFY_DONE; 579 580 memset(&res, 0, sizeof(res)); 581 res.addr = mce->addr; 582 583 if (adxl_component_count) { 584 if (!skx_adxl_decode(&res)) 585 return NOTIFY_DONE; 586 587 mci = get_mci(res.socket, res.imc); 588 } else { 589 if (!skx_decode || !skx_decode(&res)) 590 return NOTIFY_DONE; 591 592 mci = res.dev->imc[res.imc].mci; 593 } 594 595 if (!mci) 596 return NOTIFY_DONE; 597 598 if (mce->mcgstatus & MCG_STATUS_MCIP) 599 type = "Exception"; 600 else 601 type = "Event"; 602 603 skx_mc_printk(mci, KERN_DEBUG, "HANDLING MCE MEMORY ERROR\n"); 604 605 skx_mc_printk(mci, KERN_DEBUG, "CPU %d: Machine Check %s: 0x%llx " 606 "Bank %d: 0x%llx\n", mce->extcpu, type, 607 mce->mcgstatus, mce->bank, mce->status); 608 skx_mc_printk(mci, KERN_DEBUG, "TSC 0x%llx ", mce->tsc); 609 skx_mc_printk(mci, KERN_DEBUG, "ADDR 0x%llx ", mce->addr); 610 skx_mc_printk(mci, KERN_DEBUG, "MISC 0x%llx ", mce->misc); 611 612 skx_mc_printk(mci, KERN_DEBUG, "PROCESSOR %u:0x%x TIME %llu SOCKET " 613 "%u APIC 0x%x\n", mce->cpuvendor, mce->cpuid, 614 mce->time, mce->socketid, mce->apicid); 615 616 skx_mce_output_error(mci, mce, &res); 617 618 return NOTIFY_DONE; 619 } 620 621 void skx_remove(void) 622 { 623 int i, j; 624 struct skx_dev *d, *tmp; 625 626 edac_dbg(0, "\n"); 627 628 list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { 629 list_del(&d->list); 630 for (i = 0; i < NUM_IMC; i++) { 631 if (d->imc[i].mci) 632 skx_unregister_mci(&d->imc[i]); 633 634 if (d->imc[i].mdev) 635 pci_dev_put(d->imc[i].mdev); 636 637 if (d->imc[i].mbase) 638 iounmap(d->imc[i].mbase); 639 640 for (j = 0; j < NUM_CHANNELS; j++) { 641 if (d->imc[i].chan[j].cdev) 642 pci_dev_put(d->imc[i].chan[j].cdev); 643 } 644 } 645 if (d->util_all) 646 pci_dev_put(d->util_all); 647 if (d->sad_all) 648 pci_dev_put(d->sad_all); 649 if (d->uracu) 650 pci_dev_put(d->uracu); 651 652 kfree(d); 653 } 654 } 655