// SPDX-License-Identifier: GPL-2.0-only
/*
 * GHES/EDAC Linux driver
 *
 * Copyright (c) 2013 by Mauro Carvalho Chehab
 *
 * Red Hat Inc. http://www.redhat.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <acpi/ghes.h>
#include <linux/edac.h>
#include <linux/dmi.h>
#include "edac_module.h"
#include <ras/ras_event.h>

struct ghes_edac_pvt {
	struct list_head list;
	struct ghes *ghes;
	struct mem_ctl_info *mci;

	/* Buffers for the error handling routine */
	char detail_location[240];
	char other_detail[160];
	char msg[80];
};

static atomic_t ghes_init = ATOMIC_INIT(0);
static struct ghes_edac_pvt *ghes_pvt;

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* "ghes_edac.force_load=1" skips the platform check */
static bool __read_mostly force_load;
module_param(force_load, bool, 0);

/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;
	u16 data_width;
	u16 size;
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;
	u16 type_detail;
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));

struct ghes_edac_dimm_fill {
	struct mem_ctl_info *mci;
	unsigned int count;
};

static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
{
	int *num_dimm = arg;

	if (dh->type == DMI_ENTRY_MEM_DEVICE)
		(*num_dimm)++;
}

static int get_dimm_smbios_index(u16 handle)
{
	struct mem_ctl_info *mci = ghes_pvt->mci;
	int i;

	for (i = 0; i < mci->tot_dimms; i++) {
		if (mci->dimms[i]->smbios_handle == handle)
			return i;
	}
	return -1;
}

static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
{
	struct ghes_edac_dimm_fill *dimm_fill = arg;
	struct mem_ctl_info *mci = dimm_fill->mci;

	if (dh->type == DMI_ENTRY_MEM_DEVICE) {
		struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
						       mci->n_layers,
						       dimm_fill->count, 0, 0);
		u16 rdr_mask = BIT(7) | BIT(13);

		if (entry->size == 0xffff) {
			pr_info("Can't get DIMM%i size\n",
				dimm_fill->count);
			dimm->nr_pages = MiB_TO_PAGES(32);	/* Unknown */
		} else if (entry->size == 0x7fff) {
			dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
		} else {
			if (entry->size & BIT(15))
				dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
			else
				dimm->nr_pages = MiB_TO_PAGES(entry->size);
		}

		switch (entry->memory_type) {
		case 0x12:
			if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR;
			else
				dimm->mtype = MEM_DDR;
			break;
		case 0x13:
			if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR2;
			else
				dimm->mtype = MEM_DDR2;
			break;
		case 0x14:
			dimm->mtype = MEM_FB_DDR2;
			break;
		case 0x18:
			if (entry->type_detail & BIT(12))
				dimm->mtype = MEM_NVDIMM;
			else if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR3;
			else
				dimm->mtype = MEM_DDR3;
			break;
		case 0x1a:
			if (entry->type_detail & BIT(12))
				dimm->mtype = MEM_NVDIMM;
			else if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR4;
			else
				dimm->mtype = MEM_DDR4;
			break;
		default:
			if (entry->type_detail & BIT(6))
				dimm->mtype = MEM_RMBS;
			else if ((entry->type_detail & rdr_mask) == rdr_mask)
				dimm->mtype = MEM_RDR;
			else if (entry->type_detail & BIT(7))
				dimm->mtype = MEM_SDR;
			else if (entry->type_detail & BIT(9))
				dimm->mtype = MEM_EDO;
			else
				dimm->mtype = MEM_UNKNOWN;
		}

		/*
		 * Actually, we can only detect if the memory has bits for
		 * checksum or not
		 */
		if (entry->total_width == entry->data_width)
			dimm->edac_mode = EDAC_NONE;
		else
			dimm->edac_mode = EDAC_SECDED;

		dimm->dtype = DEV_UNKNOWN;
		dimm->grain = 128;	/* Likely, worst case */

		/*
		 * FIXME: It shouldn't be hard to also fill the DIMM labels
		 */

		if (dimm->nr_pages) {
			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
				 dimm_fill->count, edac_mem_types[dimm->mtype],
				 PAGES_TO_MiB(dimm->nr_pages),
				 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
			edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
				 entry->memory_type, entry->type_detail,
				 entry->total_width, entry->data_width);
		}

		dimm->smbios_handle = entry->handle;

		dimm_fill->count++;
	}
}

void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
	enum hw_event_mc_err_type type;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	struct ghes_edac_pvt *pvt = ghes_pvt;
	unsigned long flags;
	char *p;
	u8 grain_bits;

	if (!pvt)
		return;

	/*
	 * We can do the locking below because GHES defers error processing
	 * from NMI to IRQ context. Whenever that changes, we'd at least
	 * know.
	 */
	if (WARN_ON_ONCE(in_nmi()))
		return;

	spin_lock_irqsave(&ghes_lock, flags);

	mci = pvt->mci;
	e = &mci->error_desc;

	/* Cleans the error report buffer */
	memset(e, 0, sizeof(*e));
	e->error_count = 1;
	strcpy(e->label, "unknown label");
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->other_detail = '\0';
	*pvt->msg = '\0';

	switch (sev) {
	case GHES_SEV_CORRECTED:
		type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);

	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		p = pvt->msg;
		switch (mem_err->error_type) {
		case 0:
			p += sprintf(p, "Unknown");
			break;
		case 1:
			p += sprintf(p, "No error");
			break;
		case 2:
			p += sprintf(p, "Single-bit ECC");
			break;
		case 3:
			p += sprintf(p, "Multi-bit ECC");
			break;
		case 4:
			p += sprintf(p, "Single-symbol ChipKill ECC");
			break;
		case 5:
			p += sprintf(p, "Multi-symbol ChipKill ECC");
			break;
		case 6:
			p += sprintf(p, "Master abort");
			break;
		case 7:
			p += sprintf(p, "Target abort");
			break;
		case 8:
			p += sprintf(p, "Parity Error");
			break;
		case 9:
			p += sprintf(p, "Watchdog timeout");
			break;
		case 10:
			p += sprintf(p, "Invalid address");
			break;
		case 11:
			p += sprintf(p, "Mirror Broken");
			break;
		case 12:
			p += sprintf(p, "Memory Sparing");
			break;
		case 13:
			p += sprintf(p, "Scrub corrected error");
			break;
		case 14:
			p += sprintf(p, "Scrub uncorrected error");
			break;
		case 15:
			p += sprintf(p, "Physical Memory Map-out event");
			break;
		default:
			p += sprintf(p, "reserved error (%d)",
				     mem_err->error_type);
		}
	} else {
		strcpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
	}

	/* Error grain */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);

	/* Memory error location, mapped on e->location */
	p = e->location;
	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
		p += sprintf(p, "node:%d ", mem_err->node);
	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
		p += sprintf(p, "card:%d ", mem_err->card);
	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
		p += sprintf(p, "module:%d ", mem_err->module);
	if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
		p += sprintf(p, "rank:%d ", mem_err->rank);
	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
		p += sprintf(p, "bank:%d ", mem_err->bank);
	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
		p += sprintf(p, "row:%d ", mem_err->row);
	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
		p += sprintf(p, "col:%d ", mem_err->column);
	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
		const char *bank = NULL, *device = NULL;
		int index = -1;

		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
		if (bank != NULL && device != NULL)
			p += sprintf(p, "DIMM location:%s %s ", bank, device);
		else
			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
				     mem_err->mem_dev_handle);

		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
		if (index >= 0) {
			e->top_layer = index;
			e->enable_per_layer_report = true;
		}

	}
	if (p > e->location)
		*(p - 1) = '\0';

	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
		u64 status = mem_err->error_status;

		p += sprintf(p, "status(0x%016llx): ", (long long)status);
		switch ((status >> 8) & 0xff) {
		case 1:
			p += sprintf(p, "Error detected internal to the component ");
			break;
		case 16:
			p += sprintf(p, "Error detected in the bus ");
			break;
		case 4:
			p += sprintf(p, "Storage error in DRAM memory ");
			break;
		case 5:
			p += sprintf(p, "Storage error in TLB ");
			break;
		case 6:
			p += sprintf(p, "Storage error in cache ");
			break;
		case 7:
			p += sprintf(p, "Error in one or more functional units ");
			break;
		case 8:
			p += sprintf(p, "component failed self test ");
			break;
		case 9:
			p += sprintf(p, "Overflow or undervalue of internal queue ");
			break;
		case 17:
			p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
			break;
		case 18:
			p += sprintf(p, "Improper access error ");
			break;
		case 19:
			p += sprintf(p, "Access to a memory address which is not mapped to any component ");
			break;
		case 20:
			p += sprintf(p, "Loss of Lockstep ");
			break;
		case 21:
			p += sprintf(p, "Response not associated with a request ");
			break;
		case 22:
			p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
			break;
		case 23:
			p += sprintf(p, "Detection of a PATH_ERROR ");
			break;
		case 25:
			p += sprintf(p, "Bus operation timeout ");
			break;
		case 26:
			p += sprintf(p, "A read was issued to data that has been poisoned ");
			break;
		default:
			p += sprintf(p, "reserved ");
			break;
		}
	}
	if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
		p += sprintf(p, "requestorID: 0x%016llx ",
			     (long long)mem_err->requestor_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
		p += sprintf(p, "responderID: 0x%016llx ",
			     (long long)mem_err->responder_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
		p += sprintf(p, "targetID: 0x%016llx ",
			     (long long)mem_err->target_id);
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	/* Generate the trace event */
	grain_bits = fls_long(e->grain);
	snprintf(pvt->detail_location, sizeof(pvt->detail_location),
		 "APEI location: %s %s", e->location, e->other_detail);
	trace_mc_event(type, e->msg, e->label, e->error_count,
		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
		       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
		       grain_bits, e->syndrome, pvt->detail_location);

	edac_raw_mc_handle_error(type, mci, e);
	spin_unlock_irqrestore(&ghes_lock, flags);
}

/*
 * Known systems that are safe to enable this module.
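 * The allow-list is only consulted on x86; booting with
 * "ghes_edac.force_load=1" bypasses it (see ghes_edac_register()).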
 */
static struct acpi_platform_list plat_list[] = {
	{"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions},
	{ } /* End */
};

int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	bool fake = false;
	int rc, num_dimm = 0;
	struct mem_ctl_info *mci;
	struct edac_mc_layer layers[1];
	struct ghes_edac_dimm_fill dimm_fill;
	int idx = -1;

	if (IS_ENABLED(CONFIG_X86)) {
		/* Check if safe to enable on this system */
		idx = acpi_match_platform_list(plat_list);
		if (!force_load && idx < 0)
			return -ENODEV;
	} else {
		idx = 0;
	}

	/*
	 * We have only one logical memory controller to which all DIMMs belong.
	 */
	if (atomic_inc_return(&ghes_init) > 1)
		return 0;

	/* Get the number of DIMMs */
	dmi_walk(ghes_edac_count_dimms, &num_dimm);

	/* Check if we've got a bogus BIOS */
	if (num_dimm == 0) {
		fake = true;
		num_dimm = 1;
	}

	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = num_dimm;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		return -ENOMEM;
	}

	ghes_pvt = mci->pvt_info;
	ghes_pvt->ghes = ghes;
	ghes_pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such system. Use this driver with caution\n");
	} else if (idx < 0) {
		pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
		pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
		pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
		pr_info("If you find incorrect reports, please contact your hardware vendor\n");
		pr_info("to correct its BIOS.\n");
		pr_info("This system has %d DIMM sockets.\n", num_dimm);
	}

	if (!fake) {
		dimm_fill.count = 0;
		dimm_fill.mci = mci;
		dmi_walk(ghes_edac_dmidecode, &dimm_fill);
	} else {
		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
						       mci->n_layers, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register at EDAC core\n");
		edac_mc_free(mci);
		return -ENODEV;
	}
	return 0;
}

void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;

	if (!ghes_pvt)
		return;

	mci = ghes_pvt->mci;
	edac_mc_del_mc(mci->pdev);
	edac_mc_free(mci);
}