/*
 * GHES/EDAC Linux driver
 *
 * This file may be distributed under the terms of the GNU General Public
 * License version 2.
 *
 * Copyright (c) 2013 by Mauro Carvalho Chehab
 *
 * Red Hat Inc. http://www.redhat.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <acpi/ghes.h>
#include <linux/edac.h>
#include <linux/dmi.h>
#include "edac_module.h"
#include <ras/ras_event.h>

struct ghes_edac_pvt {
	struct list_head list;
	struct ghes *ghes;
	struct mem_ctl_info *mci;

	/* Buffers for the error handling routine */
	char detail_location[240];
	char other_detail[160];
	char msg[80];
};

static atomic_t ghes_init = ATOMIC_INIT(0);
static struct ghes_edac_pvt *ghes_pvt;

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* "ghes_edac.force_load=1" skips the platform check */
static bool __read_mostly force_load;
module_param(force_load, bool, 0);

/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;
	u16 data_width;
	u16 size;
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;
	u16 type_detail;
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));

struct ghes_edac_dimm_fill {
	struct mem_ctl_info *mci;
	unsigned int count;
};

static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
{
	int *num_dimm = arg;

	if (dh->type == DMI_ENTRY_MEM_DEVICE)
		(*num_dimm)++;
}

static int get_dimm_smbios_index(u16 handle)
{
	struct mem_ctl_info *mci = ghes_pvt->mci;
	int i;

	for (i = 0; i < mci->tot_dimms; i++) {
		if (mci->dimms[i]->smbios_handle == handle)
			return i;
	}
	return -1;
}
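/*
 * Decode one SMBIOS Type 17 (Memory Device) record into an EDAC dimm_info.
 *
 * Size encoding per the SMBIOS spec: 0xffff means the size is unknown,
 * 0x7fff means the real size lives in the 32-bit extended_size field
 * (in MiB), and otherwise bit 15 selects the unit of the low 15 bits
 * (set = KiB, clear = MiB).
 */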
static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
{
	struct ghes_edac_dimm_fill *dimm_fill = arg;
	struct mem_ctl_info *mci = dimm_fill->mci;

	if (dh->type == DMI_ENTRY_MEM_DEVICE) {
		struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
						       mci->n_layers,
						       dimm_fill->count, 0, 0);
		u16 rdr_mask = BIT(7) | BIT(13);

		if (entry->size == 0xffff) {
			pr_info("Can't get DIMM%i size\n",
				dimm_fill->count);
			dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
		} else if (entry->size == 0x7fff) {
			dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
		} else {
			if (entry->size & BIT(15))
				/* Size is given in KiB units: scale down to MiB */
				dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) >> 10);
			else
				dimm->nr_pages = MiB_TO_PAGES(entry->size);
		}

		switch (entry->memory_type) {
		case 0x12:
			if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR;
			else
				dimm->mtype = MEM_DDR;
			break;
		case 0x13:
			if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR2;
			else
				dimm->mtype = MEM_DDR2;
			break;
		case 0x14:
			dimm->mtype = MEM_FB_DDR2;
			break;
		case 0x18:
			if (entry->type_detail & BIT(12))
				dimm->mtype = MEM_NVDIMM;
			else if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR3;
			else
				dimm->mtype = MEM_DDR3;
			break;
		case 0x1a:
			if (entry->type_detail & BIT(12))
				dimm->mtype = MEM_NVDIMM;
			else if (entry->type_detail & BIT(13))
				dimm->mtype = MEM_RDDR4;
			else
				dimm->mtype = MEM_DDR4;
			break;
		default:
			if (entry->type_detail & BIT(6))
				dimm->mtype = MEM_RMBS;
			else if ((entry->type_detail & rdr_mask) == rdr_mask)
				dimm->mtype = MEM_RDR;
			else if (entry->type_detail & BIT(7))
				dimm->mtype = MEM_SDR;
			else if (entry->type_detail & BIT(9))
				dimm->mtype = MEM_EDO;
			else
				dimm->mtype = MEM_UNKNOWN;
		}

		/*
		 * We can only detect whether the memory has extra bits
		 * reserved for a checksum (ECC) or not.
		 */
		if (entry->total_width == entry->data_width)
			dimm->edac_mode = EDAC_NONE;
		else
			dimm->edac_mode = EDAC_SECDED;

		dimm->dtype = DEV_UNKNOWN;
		dimm->grain = 128;	/* Likely, worst case */

		/*
		 * FIXME: It shouldn't be hard to also fill the DIMM labels
		 */

		if (dimm->nr_pages) {
			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
				 dimm_fill->count, edac_mem_types[dimm->mtype],
				 PAGES_TO_MiB(dimm->nr_pages),
				 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
			edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
				 entry->memory_type, entry->type_detail,
				 entry->total_width, entry->data_width);
		}

		dimm->smbios_handle = entry->handle;

		dimm_fill->count++;
	}
}
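/*
 * ghes_edac_report_mem_error - report a CPER memory error through EDAC
 * @sev:	GHES severity (GHES_SEV_*) decoded from the error source
 * @mem_err:	CPER memory error section supplied by the firmware
 *
 * Translates the firmware-supplied record into an EDAC raw error
 * descriptor, emits an mc_event trace point, and hands the descriptor
 * to the EDAC core.
 */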
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
	enum hw_event_mc_err_type type;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	struct ghes_edac_pvt *pvt = ghes_pvt;
	unsigned long flags;
	char *p;
	u8 grain_bits;

	if (!pvt)
		return;

	/*
	 * We can do the locking below because GHES defers error processing
	 * from NMI to IRQ context. Whenever that changes, we'd at least
	 * know.
	 */
	if (WARN_ON_ONCE(in_nmi()))
		return;

	spin_lock_irqsave(&ghes_lock, flags);

	mci = pvt->mci;
	e = &mci->error_desc;

	/* Clean the error report buffer */
	memset(e, 0, sizeof(*e));
	e->error_count = 1;
	strcpy(e->label, "unknown label");
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->other_detail = '\0';
	*pvt->msg = '\0';

	switch (sev) {
	case GHES_SEV_CORRECTED:
		type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);

	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		p = pvt->msg;
		switch (mem_err->error_type) {
		case 0:
			p += sprintf(p, "Unknown");
			break;
		case 1:
			p += sprintf(p, "No error");
			break;
		case 2:
			p += sprintf(p, "Single-bit ECC");
			break;
		case 3:
			p += sprintf(p, "Multi-bit ECC");
			break;
		case 4:
			p += sprintf(p, "Single-symbol ChipKill ECC");
			break;
		case 5:
			p += sprintf(p, "Multi-symbol ChipKill ECC");
			break;
		case 6:
			p += sprintf(p, "Master abort");
			break;
		case 7:
			p += sprintf(p, "Target abort");
			break;
		case 8:
			p += sprintf(p, "Parity Error");
			break;
		case 9:
			p += sprintf(p, "Watchdog timeout");
			break;
		case 10:
			p += sprintf(p, "Invalid address");
			break;
		case 11:
			p += sprintf(p, "Mirror Broken");
			break;
		case 12:
			p += sprintf(p, "Memory Sparing");
			break;
		case 13:
			p += sprintf(p, "Scrub corrected error");
			break;
		case 14:
			p += sprintf(p, "Scrub uncorrected error");
			break;
		case 15:
			p += sprintf(p, "Physical Memory Map-out event");
			break;
		default:
			p += sprintf(p, "reserved error (%d)",
				     mem_err->error_type);
		}
	} else {
		strcpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
	}

	/* Error grain */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);

	/* Memory error location, mapped on e->location */
	p = e->location;
	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
		p += sprintf(p, "node:%d ", mem_err->node);
	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
		p += sprintf(p, "card:%d ", mem_err->card);
	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
		p += sprintf(p, "module:%d ", mem_err->module);
	if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
		p += sprintf(p, "rank:%d ", mem_err->rank);
	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
		p += sprintf(p, "bank:%d ", mem_err->bank);
	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
		p += sprintf(p, "row:%d ", mem_err->row);
	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
		p += sprintf(p, "col:%d ", mem_err->column);
	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
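	/*
	 * The firmware may identify the failing DIMM by its SMBIOS module
	 * handle. Map that handle back to the EDAC DIMM index filled in at
	 * registration time, so the error gets accounted to the right DIMM.
	 */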
	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
		const char *bank = NULL, *device = NULL;
		int index = -1;

		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
		if (bank != NULL && device != NULL)
			p += sprintf(p, "DIMM location:%s %s ", bank, device);
		else
			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
				     mem_err->mem_dev_handle);

		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
		if (index >= 0) {
			e->top_layer = index;
			e->enable_per_layer_report = true;
		}
	}
	if (p > e->location)
		*(p - 1) = '\0';

	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
		u64 status = mem_err->error_status;

		p += sprintf(p, "status(0x%016llx): ", (long long)status);
		switch ((status >> 8) & 0xff) {
		case 1:
			p += sprintf(p, "Error detected internal to the component ");
			break;
		case 16:
			p += sprintf(p, "Error detected in the bus ");
			break;
		case 4:
			p += sprintf(p, "Storage error in DRAM memory ");
			break;
		case 5:
			p += sprintf(p, "Storage error in TLB ");
			break;
		case 6:
			p += sprintf(p, "Storage error in cache ");
			break;
		case 7:
			p += sprintf(p, "Error in one or more functional units ");
			break;
		case 8:
			p += sprintf(p, "Component failed self test ");
			break;
		case 9:
			p += sprintf(p, "Overflow or undervalue of internal queue ");
			break;
		case 17:
			p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
			break;
		case 18:
			p += sprintf(p, "Improper access error ");
			break;
		case 19:
			p += sprintf(p, "Access to a memory address which is not mapped to any component ");
			break;
		case 20:
			p += sprintf(p, "Loss of Lockstep ");
			break;
		case 21:
			p += sprintf(p, "Response not associated with a request ");
			break;
		case 22:
			p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
			break;
		case 23:
			p += sprintf(p, "Detection of a PATH_ERROR ");
			break;
		case 25:
			p += sprintf(p, "Bus operation timeout ");
			break;
		case 26:
			p += sprintf(p, "A read was issued to data that has been poisoned ");
			break;
		default:
			p += sprintf(p, "reserved ");
			break;
		}
	}
	if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
		p += sprintf(p, "requestorID: 0x%016llx ",
			     (long long)mem_err->requestor_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
		p += sprintf(p, "responderID: 0x%016llx ",
			     (long long)mem_err->responder_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
		p += sprintf(p, "targetID: 0x%016llx ",
			     (long long)mem_err->target_id);
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	/* Generate the trace event */
	grain_bits = fls_long(e->grain);
	snprintf(pvt->detail_location, sizeof(pvt->detail_location),
		 "APEI location: %s %s", e->location, e->other_detail);
	trace_mc_event(type, e->msg, e->label, e->error_count,
		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
		       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
		       grain_bits, e->syndrome, pvt->detail_location);

	edac_raw_mc_handle_error(type, mci, e);
	spin_unlock_irqrestore(&ghes_lock, flags);
}

/*
 * Known systems that are safe to enable this module.
 */
static struct acpi_platform_list plat_list[] = {
	{"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions},
	{ } /* End */
};
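/*
 * ghes_edac_register - probe the system and register one EDAC MC
 * @ghes:	GHES instance this registration is done on behalf of
 * @dev:	device to use as mci->pdev
 *
 * Counts the DIMMs via SMBIOS/DMI, allocates a single mem_ctl_info
 * covering all of them, and registers it with the EDAC core. On x86,
 * only platforms in plat_list are enabled automatically; booting with
 * "ghes_edac.force_load=1" skips that check. Returns 0 on success (or
 * when another GHES instance has already registered), -ENODEV or
 * -ENOMEM on failure.
 */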
int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	bool fake = false;
	int rc, num_dimm = 0;
	struct mem_ctl_info *mci;
	struct edac_mc_layer layers[1];
	struct ghes_edac_dimm_fill dimm_fill;
	int idx = -1;

	if (IS_ENABLED(CONFIG_X86)) {
		/* Check if safe to enable on this system */
		idx = acpi_match_platform_list(plat_list);
		if (!force_load && idx < 0)
			return -ENODEV;
	} else {
		idx = 0;
	}

	/*
	 * We have only one logical memory controller, to which all DIMMs
	 * belong.
	 */
	if (atomic_inc_return(&ghes_init) > 1)
		return 0;

	/* Get the number of DIMMs */
	dmi_walk(ghes_edac_count_dimms, &num_dimm);

	/* Check if we've got a bogus BIOS */
	if (num_dimm == 0) {
		fake = true;
		num_dimm = 1;
	}

	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = num_dimm;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		return -ENOMEM;
	}

	ghes_pvt = mci->pvt_info;
	ghes_pvt->ghes = ghes;
	ghes_pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: it doesn't even list the DIMMs.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such a system. Use this driver with caution.\n");
	} else if (idx < 0) {
		pr_info("This EDAC driver relies on the BIOS to enumerate memory and get error reports.\n");
		pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
		pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
		pr_info("If you find incorrect reports, please contact your hardware vendor\n");
		pr_info("to correct its BIOS.\n");
		pr_info("This system has %d DIMM sockets.\n", num_dimm);
	}

	if (!fake) {
		dimm_fill.count = 0;
		dimm_fill.mci = mci;
		dmi_walk(ghes_edac_dmidecode, &dimm_fill);
	} else {
		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
						       mci->n_layers, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register with the EDAC core\n");
		edac_mc_free(mci);
		return -ENODEV;
	}
	return 0;
}

void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;

	if (!ghes_pvt)
		return;

	mci = ghes_pvt->mci;
	/* Don't leave a dangling pointer for the error handler to chase */
	ghes_pvt = NULL;
	edac_mc_del_mc(mci->pdev);
	edac_mc_free(mci);
}