1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * 4 * Shared code by both skx_edac and i10nm_edac. Originally split out 5 * from the skx_edac driver. 6 * 7 * This file is linked into both skx_edac and i10nm_edac drivers. In 8 * order to avoid link errors, this file must be like a pure library 9 * without including symbols and defines which would otherwise conflict, 10 * when linked once into a module and into a built-in object, at the 11 * same time. For example, __this_module symbol references when that 12 * file is being linked into a built-in object. 13 * 14 * Copyright (c) 2018, Intel Corporation. 15 */ 16 17 #include <linux/acpi.h> 18 #include <linux/dmi.h> 19 #include <linux/adxl.h> 20 #include <acpi/nfit.h> 21 #include <asm/mce.h> 22 #include "edac_module.h" 23 #include "skx_common.h" 24 25 static const char * const component_names[] = { 26 [INDEX_SOCKET] = "ProcessorSocketId", 27 [INDEX_MEMCTRL] = "MemoryControllerId", 28 [INDEX_CHANNEL] = "ChannelId", 29 [INDEX_DIMM] = "DimmSlotId", 30 }; 31 32 static int component_indices[ARRAY_SIZE(component_names)]; 33 static int adxl_component_count; 34 static const char * const *adxl_component_names; 35 static u64 *adxl_values; 36 static char *adxl_msg; 37 38 static char skx_msg[MSG_SIZE]; 39 static skx_decode_f skx_decode; 40 static skx_show_retry_log_f skx_show_retry_rd_err_log; 41 static u64 skx_tolm, skx_tohm; 42 static LIST_HEAD(dev_edac_list); 43 44 int __init skx_adxl_get(void) 45 { 46 const char * const *names; 47 int i, j; 48 49 names = adxl_get_component_names(); 50 if (!names) { 51 skx_printk(KERN_NOTICE, "No firmware support for address translation.\n"); 52 return -ENODEV; 53 } 54 55 for (i = 0; i < INDEX_MAX; i++) { 56 for (j = 0; names[j]; j++) { 57 if (!strcmp(component_names[i], names[j])) { 58 component_indices[i] = j; 59 break; 60 } 61 } 62 63 if (!names[j]) 64 goto err; 65 } 66 67 adxl_component_names = names; 68 while (*names++) 69 adxl_component_count++; 70 71 adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values), 72 GFP_KERNEL); 73 if (!adxl_values) { 74 adxl_component_count = 0; 75 return -ENOMEM; 76 } 77 78 adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL); 79 if (!adxl_msg) { 80 adxl_component_count = 0; 81 kfree(adxl_values); 82 return -ENOMEM; 83 } 84 85 return 0; 86 err: 87 skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ", 88 component_names[i]); 89 for (j = 0; names[j]; j++) 90 skx_printk(KERN_CONT, "%s ", names[j]); 91 skx_printk(KERN_CONT, "\n"); 92 93 return -ENODEV; 94 } 95 96 void __exit skx_adxl_put(void) 97 { 98 kfree(adxl_values); 99 kfree(adxl_msg); 100 } 101 102 static bool skx_adxl_decode(struct decoded_addr *res) 103 { 104 struct skx_dev *d; 105 int i, len = 0; 106 107 if (res->addr >= skx_tohm || (res->addr >= skx_tolm && 108 res->addr < BIT_ULL(32))) { 109 edac_dbg(0, "Address 0x%llx out of range\n", res->addr); 110 return false; 111 } 112 113 if (adxl_decode(res->addr, adxl_values)) { 114 edac_dbg(0, "Failed to decode 0x%llx\n", res->addr); 115 return false; 116 } 117 118 res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; 119 res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; 120 res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; 121 res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; 122 123 if (res->imc > NUM_IMC - 1) { 124 skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); 125 return false; 126 } 127 128 list_for_each_entry(d, &dev_edac_list, list) { 129 if (d->imc[0].src_id == res->socket) { 130 res->dev = d; 131 break; 132 } 133 } 134 135 if (!res->dev) { 136 skx_printk(KERN_ERR, "No device for src_id %d imc %d\n", 137 res->socket, res->imc); 138 return false; 139 } 140 141 for (i = 0; i < adxl_component_count; i++) { 142 if (adxl_values[i] == ~0x0ull) 143 continue; 144 145 len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx", 146 adxl_component_names[i], adxl_values[i]); 147 if (MSG_SIZE - len <= 0) 148 break; 149 } 150 151 return true; 152 } 153 154 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) 155 { 156 skx_decode = decode; 157 skx_show_retry_rd_err_log = show_retry_log; 158 } 159 160 int skx_get_src_id(struct skx_dev *d, int off, u8 *id) 161 { 162 u32 reg; 163 164 if (pci_read_config_dword(d->util_all, off, ®)) { 165 skx_printk(KERN_ERR, "Failed to read src id\n"); 166 return -ENODEV; 167 } 168 169 *id = GET_BITFIELD(reg, 12, 14); 170 return 0; 171 } 172 173 int skx_get_node_id(struct skx_dev *d, u8 *id) 174 { 175 u32 reg; 176 177 if (pci_read_config_dword(d->util_all, 0xf4, ®)) { 178 skx_printk(KERN_ERR, "Failed to read node id\n"); 179 return -ENODEV; 180 } 181 182 *id = GET_BITFIELD(reg, 0, 2); 183 return 0; 184 } 185 186 static int get_width(u32 mtr) 187 { 188 switch (GET_BITFIELD(mtr, 8, 9)) { 189 case 0: 190 return DEV_X4; 191 case 1: 192 return DEV_X8; 193 case 2: 194 return DEV_X16; 195 } 196 return DEV_UNKNOWN; 197 } 198 199 /* 200 * We use the per-socket device @cfg->did to count how many sockets are present, 201 * and to detemine which PCI buses are associated with each socket. Allocate 202 * and build the full list of all the skx_dev structures that we need here. 203 */ 204 int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) 205 { 206 struct pci_dev *pdev, *prev; 207 struct skx_dev *d; 208 u32 reg; 209 int ndev = 0; 210 211 prev = NULL; 212 for (;;) { 213 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, cfg->decs_did, prev); 214 if (!pdev) 215 break; 216 ndev++; 217 d = kzalloc(sizeof(*d), GFP_KERNEL); 218 if (!d) { 219 pci_dev_put(pdev); 220 return -ENOMEM; 221 } 222 223 if (pci_read_config_dword(pdev, cfg->busno_cfg_offset, ®)) { 224 kfree(d); 225 pci_dev_put(pdev); 226 skx_printk(KERN_ERR, "Failed to read bus idx\n"); 227 return -ENODEV; 228 } 229 230 d->bus[0] = GET_BITFIELD(reg, 0, 7); 231 d->bus[1] = GET_BITFIELD(reg, 8, 15); 232 if (cfg->type == SKX) { 233 d->seg = pci_domain_nr(pdev->bus); 234 d->bus[2] = GET_BITFIELD(reg, 16, 23); 235 d->bus[3] = GET_BITFIELD(reg, 24, 31); 236 } else { 237 d->seg = GET_BITFIELD(reg, 16, 23); 238 } 239 240 edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", 241 d->bus[0], d->bus[1], d->bus[2], d->bus[3]); 242 list_add_tail(&d->list, &dev_edac_list); 243 prev = pdev; 244 } 245 246 if (list) 247 *list = &dev_edac_list; 248 return ndev; 249 } 250 251 int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm) 252 { 253 struct pci_dev *pdev; 254 u32 reg; 255 256 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, did, NULL); 257 if (!pdev) { 258 edac_dbg(2, "Can't get tolm/tohm\n"); 259 return -ENODEV; 260 } 261 262 if (pci_read_config_dword(pdev, off[0], ®)) { 263 skx_printk(KERN_ERR, "Failed to read tolm\n"); 264 goto fail; 265 } 266 skx_tolm = reg; 267 268 if (pci_read_config_dword(pdev, off[1], ®)) { 269 skx_printk(KERN_ERR, "Failed to read lower tohm\n"); 270 goto fail; 271 } 272 skx_tohm = reg; 273 274 if (pci_read_config_dword(pdev, off[2], ®)) { 275 skx_printk(KERN_ERR, "Failed to read upper tohm\n"); 276 goto fail; 277 } 278 skx_tohm |= (u64)reg << 32; 279 280 pci_dev_put(pdev); 281 *tolm = skx_tolm; 282 *tohm = skx_tohm; 283 edac_dbg(2, "tolm = 0x%llx tohm = 0x%llx\n", skx_tolm, skx_tohm); 284 return 0; 285 fail: 286 pci_dev_put(pdev); 287 return -ENODEV; 288 } 289 290 static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, 291 int minval, int maxval, const char *name) 292 { 293 u32 val = GET_BITFIELD(reg, lobit, hibit); 294 295 if (val < minval || val > maxval) { 296 edac_dbg(2, "bad %s = %d (raw=0x%x)\n", name, val, reg); 297 return -EINVAL; 298 } 299 return val + add; 300 } 301 302 #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks") 303 #define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 6, "rows") 304 #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") 305 306 int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, 307 struct skx_imc *imc, int chan, int dimmno) 308 { 309 int banks = 16, ranks, rows, cols, npages; 310 u64 size; 311 312 ranks = numrank(mtr); 313 rows = numrow(mtr); 314 cols = numcol(mtr); 315 316 /* 317 * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) 318 */ 319 size = ((1ull << (rows + cols + ranks)) * banks) >> (20 - 3); 320 npages = MiB_TO_PAGES(size); 321 322 edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld MiB (%d pages) bank: %d, rank: %d, row: 0x%x, col: 0x%x\n", 323 imc->mc, chan, dimmno, size, npages, 324 banks, 1 << ranks, rows, cols); 325 326 imc->chan[chan].dimms[dimmno].close_pg = GET_BITFIELD(mcmtr, 0, 0); 327 imc->chan[chan].dimms[dimmno].bank_xor_enable = GET_BITFIELD(mcmtr, 9, 9); 328 imc->chan[chan].dimms[dimmno].fine_grain_bank = GET_BITFIELD(amap, 0, 0); 329 imc->chan[chan].dimms[dimmno].rowbits = rows; 330 imc->chan[chan].dimms[dimmno].colbits = cols; 331 332 dimm->nr_pages = npages; 333 dimm->grain = 32; 334 dimm->dtype = get_width(mtr); 335 dimm->mtype = MEM_DDR4; 336 dimm->edac_mode = EDAC_SECDED; /* likely better than this */ 337 snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", 338 imc->src_id, imc->lmc, chan, dimmno); 339 340 return 1; 341 } 342 343 int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, 344 int chan, int dimmno, const char *mod_str) 345 { 346 int smbios_handle; 347 u32 dev_handle; 348 u16 flags; 349 u64 size = 0; 350 351 dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, 352 imc->src_id, 0); 353 354 smbios_handle = nfit_get_smbios_id(dev_handle, &flags); 355 if (smbios_handle == -EOPNOTSUPP) { 356 pr_warn_once("%s: Can't find size of NVDIMM. Try enabling CONFIG_ACPI_NFIT\n", mod_str); 357 goto unknown_size; 358 } 359 360 if (smbios_handle < 0) { 361 skx_printk(KERN_ERR, "Can't find handle for NVDIMM ADR=0x%x\n", dev_handle); 362 goto unknown_size; 363 } 364 365 if (flags & ACPI_NFIT_MEM_MAP_FAILED) { 366 skx_printk(KERN_ERR, "NVDIMM ADR=0x%x is not mapped\n", dev_handle); 367 goto unknown_size; 368 } 369 370 size = dmi_memdev_size(smbios_handle); 371 if (size == ~0ull) 372 skx_printk(KERN_ERR, "Can't find size for NVDIMM ADR=0x%x/SMBIOS=0x%x\n", 373 dev_handle, smbios_handle); 374 375 unknown_size: 376 dimm->nr_pages = size >> PAGE_SHIFT; 377 dimm->grain = 32; 378 dimm->dtype = DEV_UNKNOWN; 379 dimm->mtype = MEM_NVDIMM; 380 dimm->edac_mode = EDAC_SECDED; /* likely better than this */ 381 382 edac_dbg(0, "mc#%d: channel %d, dimm %d, %llu MiB (%u pages)\n", 383 imc->mc, chan, dimmno, size >> 20, dimm->nr_pages); 384 385 snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", 386 imc->src_id, imc->lmc, chan, dimmno); 387 388 return (size == 0 || size == ~0ull) ? 0 : 1; 389 } 390 391 int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, 392 const char *ctl_name, const char *mod_str, 393 get_dimm_config_f get_dimm_config) 394 { 395 struct mem_ctl_info *mci; 396 struct edac_mc_layer layers[2]; 397 struct skx_pvt *pvt; 398 int rc; 399 400 /* Allocate a new MC control structure */ 401 layers[0].type = EDAC_MC_LAYER_CHANNEL; 402 layers[0].size = NUM_CHANNELS; 403 layers[0].is_virt_csrow = false; 404 layers[1].type = EDAC_MC_LAYER_SLOT; 405 layers[1].size = NUM_DIMMS; 406 layers[1].is_virt_csrow = true; 407 mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, 408 sizeof(struct skx_pvt)); 409 410 if (unlikely(!mci)) 411 return -ENOMEM; 412 413 edac_dbg(0, "MC#%d: mci = %p\n", imc->mc, mci); 414 415 /* Associate skx_dev and mci for future usage */ 416 imc->mci = mci; 417 pvt = mci->pvt_info; 418 pvt->imc = imc; 419 420 mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name, 421 imc->node_id, imc->lmc); 422 if (!mci->ctl_name) { 423 rc = -ENOMEM; 424 goto fail0; 425 } 426 427 mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; 428 mci->edac_ctl_cap = EDAC_FLAG_NONE; 429 mci->edac_cap = EDAC_FLAG_NONE; 430 mci->mod_name = mod_str; 431 mci->dev_name = pci_name(pdev); 432 mci->ctl_page_to_phys = NULL; 433 434 rc = get_dimm_config(mci); 435 if (rc < 0) 436 goto fail; 437 438 /* Record ptr to the generic device */ 439 mci->pdev = &pdev->dev; 440 441 /* Add this new MC control structure to EDAC's list of MCs */ 442 if (unlikely(edac_mc_add_mc(mci))) { 443 edac_dbg(0, "MC: failed edac_mc_add_mc()\n"); 444 rc = -EINVAL; 445 goto fail; 446 } 447 448 return 0; 449 450 fail: 451 kfree(mci->ctl_name); 452 fail0: 453 edac_mc_free(mci); 454 imc->mci = NULL; 455 return rc; 456 } 457 458 static void skx_unregister_mci(struct skx_imc *imc) 459 { 460 struct mem_ctl_info *mci = imc->mci; 461 462 if (!mci) 463 return; 464 465 edac_dbg(0, "MC%d: mci = %p\n", imc->mc, mci); 466 467 /* Remove MC sysfs nodes */ 468 edac_mc_del_mc(mci->pdev); 469 470 edac_dbg(1, "%s: free mci struct\n", mci->ctl_name); 471 kfree(mci->ctl_name); 472 edac_mc_free(mci); 473 } 474 475 static void skx_mce_output_error(struct mem_ctl_info *mci, 476 const struct mce *m, 477 struct decoded_addr *res) 478 { 479 enum hw_event_mc_err_type tp_event; 480 char *optype; 481 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); 482 bool overflow = GET_BITFIELD(m->status, 62, 62); 483 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); 484 bool recoverable; 485 int len; 486 u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); 487 u32 mscod = GET_BITFIELD(m->status, 16, 31); 488 u32 errcode = GET_BITFIELD(m->status, 0, 15); 489 u32 optypenum = GET_BITFIELD(m->status, 4, 6); 490 491 recoverable = GET_BITFIELD(m->status, 56, 56); 492 493 if (uncorrected_error) { 494 core_err_cnt = 1; 495 if (ripv) { 496 tp_event = HW_EVENT_ERR_FATAL; 497 } else { 498 tp_event = HW_EVENT_ERR_UNCORRECTED; 499 } 500 } else { 501 tp_event = HW_EVENT_ERR_CORRECTED; 502 } 503 504 /* 505 * According to Intel Architecture spec vol 3B, 506 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" 507 * memory errors should fit one of these masks: 508 * 000f 0000 1mmm cccc (binary) 509 * 000f 0010 1mmm cccc (binary) [RAM used as cache] 510 * where: 511 * f = Correction Report Filtering Bit. If 1, subsequent errors 512 * won't be shown 513 * mmm = error type 514 * cccc = channel 515 * If the mask doesn't match, report an error to the parsing logic 516 */ 517 if (!((errcode & 0xef80) == 0x80 || (errcode & 0xef80) == 0x280)) { 518 optype = "Can't parse: it is not a mem"; 519 } else { 520 switch (optypenum) { 521 case 0: 522 optype = "generic undef request error"; 523 break; 524 case 1: 525 optype = "memory read error"; 526 break; 527 case 2: 528 optype = "memory write error"; 529 break; 530 case 3: 531 optype = "addr/cmd error"; 532 break; 533 case 4: 534 optype = "memory scrubbing error"; 535 break; 536 default: 537 optype = "reserved"; 538 break; 539 } 540 } 541 if (adxl_component_count) { 542 len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", 543 overflow ? " OVERFLOW" : "", 544 (uncorrected_error && recoverable) ? " recoverable" : "", 545 mscod, errcode, adxl_msg); 546 } else { 547 len = snprintf(skx_msg, MSG_SIZE, 548 "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", 549 overflow ? " OVERFLOW" : "", 550 (uncorrected_error && recoverable) ? " recoverable" : "", 551 mscod, errcode, 552 res->socket, res->imc, res->rank, 553 res->bank_group, res->bank_address, res->row, res->column); 554 } 555 556 if (skx_show_retry_rd_err_log) 557 skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); 558 559 edac_dbg(0, "%s\n", skx_msg); 560 561 /* Call the helper to output message */ 562 edac_mc_handle_error(tp_event, mci, core_err_cnt, 563 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, 564 res->channel, res->dimm, -1, 565 optype, skx_msg); 566 } 567 568 int skx_mce_check_error(struct notifier_block *nb, unsigned long val, 569 void *data) 570 { 571 struct mce *mce = (struct mce *)data; 572 struct decoded_addr res; 573 struct mem_ctl_info *mci; 574 char *type; 575 576 if (mce->kflags & MCE_HANDLED_CEC) 577 return NOTIFY_DONE; 578 579 /* ignore unless this is memory related with an address */ 580 if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) 581 return NOTIFY_DONE; 582 583 memset(&res, 0, sizeof(res)); 584 res.addr = mce->addr; 585 586 if (adxl_component_count) { 587 if (!skx_adxl_decode(&res)) 588 return NOTIFY_DONE; 589 } else if (!skx_decode || !skx_decode(&res)) { 590 return NOTIFY_DONE; 591 } 592 593 mci = res.dev->imc[res.imc].mci; 594 595 if (!mci) 596 return NOTIFY_DONE; 597 598 if (mce->mcgstatus & MCG_STATUS_MCIP) 599 type = "Exception"; 600 else 601 type = "Event"; 602 603 skx_mc_printk(mci, KERN_DEBUG, "HANDLING MCE MEMORY ERROR\n"); 604 605 skx_mc_printk(mci, KERN_DEBUG, "CPU %d: Machine Check %s: 0x%llx " 606 "Bank %d: 0x%llx\n", mce->extcpu, type, 607 mce->mcgstatus, mce->bank, mce->status); 608 skx_mc_printk(mci, KERN_DEBUG, "TSC 0x%llx ", mce->tsc); 609 skx_mc_printk(mci, KERN_DEBUG, "ADDR 0x%llx ", mce->addr); 610 skx_mc_printk(mci, KERN_DEBUG, "MISC 0x%llx ", mce->misc); 611 612 skx_mc_printk(mci, KERN_DEBUG, "PROCESSOR %u:0x%x TIME %llu SOCKET " 613 "%u APIC 0x%x\n", mce->cpuvendor, mce->cpuid, 614 mce->time, mce->socketid, mce->apicid); 615 616 skx_mce_output_error(mci, mce, &res); 617 618 mce->kflags |= MCE_HANDLED_EDAC; 619 return NOTIFY_DONE; 620 } 621 622 void skx_remove(void) 623 { 624 int i, j; 625 struct skx_dev *d, *tmp; 626 627 edac_dbg(0, "\n"); 628 629 list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { 630 list_del(&d->list); 631 for (i = 0; i < NUM_IMC; i++) { 632 if (d->imc[i].mci) 633 skx_unregister_mci(&d->imc[i]); 634 635 if (d->imc[i].mdev) 636 pci_dev_put(d->imc[i].mdev); 637 638 if (d->imc[i].mbase) 639 iounmap(d->imc[i].mbase); 640 641 for (j = 0; j < NUM_CHANNELS; j++) { 642 if (d->imc[i].chan[j].cdev) 643 pci_dev_put(d->imc[i].chan[j].cdev); 644 } 645 } 646 if (d->util_all) 647 pci_dev_put(d->util_all); 648 if (d->sad_all) 649 pci_dev_put(d->sad_all); 650 if (d->uracu) 651 pci_dev_put(d->uracu); 652 653 kfree(d); 654 } 655 } 656