1 /* 2 * GHES/EDAC Linux driver 3 * 4 * This file may be distributed under the terms of the GNU General Public 5 * License version 2. 6 * 7 * Copyright (c) 2013 by Mauro Carvalho Chehab 8 * 9 * Red Hat Inc. http://www.redhat.com 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <acpi/ghes.h> 15 #include <linux/edac.h> 16 #include <linux/dmi.h> 17 #include "edac_module.h" 18 #include <ras/ras_event.h> 19 20 struct ghes_edac_pvt { 21 struct list_head list; 22 struct ghes *ghes; 23 struct mem_ctl_info *mci; 24 25 /* Buffers for the error handling routine */ 26 char detail_location[240]; 27 char other_detail[160]; 28 char msg[80]; 29 }; 30 31 static atomic_t ghes_init = ATOMIC_INIT(0); 32 static struct ghes_edac_pvt *ghes_pvt; 33 34 /* 35 * Sync with other, potentially concurrent callers of 36 * ghes_edac_report_mem_error(). We don't know what the 37 * "inventive" firmware would do. 38 */ 39 static DEFINE_SPINLOCK(ghes_lock); 40 41 /* "ghes_edac.force_load=1" skips the platform check */ 42 static bool __read_mostly force_load; 43 module_param(force_load, bool, 0); 44 45 /* Memory Device - Type 17 of SMBIOS spec */ 46 struct memdev_dmi_entry { 47 u8 type; 48 u8 length; 49 u16 handle; 50 u16 phys_mem_array_handle; 51 u16 mem_err_info_handle; 52 u16 total_width; 53 u16 data_width; 54 u16 size; 55 u8 form_factor; 56 u8 device_set; 57 u8 device_locator; 58 u8 bank_locator; 59 u8 memory_type; 60 u16 type_detail; 61 u16 speed; 62 u8 manufacturer; 63 u8 serial_number; 64 u8 asset_tag; 65 u8 part_number; 66 u8 attributes; 67 u32 extended_size; 68 u16 conf_mem_clk_speed; 69 } __attribute__((__packed__)); 70 71 struct ghes_edac_dimm_fill { 72 struct mem_ctl_info *mci; 73 unsigned count; 74 }; 75 76 static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) 77 { 78 int *num_dimm = arg; 79 80 if (dh->type == DMI_ENTRY_MEM_DEVICE) 81 (*num_dimm)++; 82 } 83 84 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) 85 { 86 struct ghes_edac_dimm_fill *dimm_fill = arg; 87 struct mem_ctl_info *mci = dimm_fill->mci; 88 89 if (dh->type == DMI_ENTRY_MEM_DEVICE) { 90 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; 91 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 92 mci->n_layers, 93 dimm_fill->count, 0, 0); 94 95 if (entry->size == 0xffff) { 96 pr_info("Can't get DIMM%i size\n", 97 dimm_fill->count); 98 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ 99 } else if (entry->size == 0x7fff) { 100 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); 101 } else { 102 if (entry->size & 1 << 15) 103 dimm->nr_pages = MiB_TO_PAGES((entry->size & 104 0x7fff) << 10); 105 else 106 dimm->nr_pages = MiB_TO_PAGES(entry->size); 107 } 108 109 switch (entry->memory_type) { 110 case 0x12: 111 if (entry->type_detail & 1 << 13) 112 dimm->mtype = MEM_RDDR; 113 else 114 dimm->mtype = MEM_DDR; 115 break; 116 case 0x13: 117 if (entry->type_detail & 1 << 13) 118 dimm->mtype = MEM_RDDR2; 119 else 120 dimm->mtype = MEM_DDR2; 121 break; 122 case 0x14: 123 dimm->mtype = MEM_FB_DDR2; 124 break; 125 case 0x18: 126 if (entry->type_detail & 1 << 13) 127 dimm->mtype = MEM_RDDR3; 128 else 129 dimm->mtype = MEM_DDR3; 130 break; 131 default: 132 if (entry->type_detail & 1 << 6) 133 dimm->mtype = MEM_RMBS; 134 else if ((entry->type_detail & ((1 << 7) | (1 << 13))) 135 == ((1 << 7) | (1 << 13))) 136 dimm->mtype = MEM_RDR; 137 else if (entry->type_detail & 1 << 7) 138 dimm->mtype = MEM_SDR; 139 else if (entry->type_detail & 1 << 9) 140 dimm->mtype = MEM_EDO; 141 else 142 dimm->mtype = MEM_UNKNOWN; 143 } 144 145 /* 146 * Actually, we can only detect if the memory has bits for 147 * checksum or not 148 */ 149 if (entry->total_width == entry->data_width) 150 dimm->edac_mode = EDAC_NONE; 151 else 152 dimm->edac_mode = EDAC_SECDED; 153 154 dimm->dtype = DEV_UNKNOWN; 155 dimm->grain = 128; /* Likely, worse case */ 156 157 /* 158 * FIXME: It shouldn't be hard to also fill the DIMM labels 159 */ 160 161 if (dimm->nr_pages) { 162 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", 163 dimm_fill->count, edac_mem_types[dimm->mtype], 164 PAGES_TO_MiB(dimm->nr_pages), 165 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); 166 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", 167 entry->memory_type, entry->type_detail, 168 entry->total_width, entry->data_width); 169 } 170 171 dimm_fill->count++; 172 } 173 } 174 175 void ghes_edac_report_mem_error(struct ghes *ghes, int sev, 176 struct cper_sec_mem_err *mem_err) 177 { 178 enum hw_event_mc_err_type type; 179 struct edac_raw_error_desc *e; 180 struct mem_ctl_info *mci; 181 struct ghes_edac_pvt *pvt = ghes_pvt; 182 unsigned long flags; 183 char *p; 184 u8 grain_bits; 185 186 if (!pvt) { 187 pr_err("Internal error: Can't find EDAC structure\n"); 188 return; 189 } 190 191 /* 192 * We can do the locking below because GHES defers error processing 193 * from NMI to IRQ context. Whenever that changes, we'd at least 194 * know. 195 */ 196 if (WARN_ON_ONCE(in_nmi())) 197 return; 198 199 spin_lock_irqsave(&ghes_lock, flags); 200 201 mci = pvt->mci; 202 e = &mci->error_desc; 203 204 /* Cleans the error report buffer */ 205 memset(e, 0, sizeof (*e)); 206 e->error_count = 1; 207 strcpy(e->label, "unknown label"); 208 e->msg = pvt->msg; 209 e->other_detail = pvt->other_detail; 210 e->top_layer = -1; 211 e->mid_layer = -1; 212 e->low_layer = -1; 213 *pvt->other_detail = '\0'; 214 *pvt->msg = '\0'; 215 216 switch (sev) { 217 case GHES_SEV_CORRECTED: 218 type = HW_EVENT_ERR_CORRECTED; 219 break; 220 case GHES_SEV_RECOVERABLE: 221 type = HW_EVENT_ERR_UNCORRECTED; 222 break; 223 case GHES_SEV_PANIC: 224 type = HW_EVENT_ERR_FATAL; 225 break; 226 default: 227 case GHES_SEV_NO: 228 type = HW_EVENT_ERR_INFO; 229 } 230 231 edac_dbg(1, "error validation_bits: 0x%08llx\n", 232 (long long)mem_err->validation_bits); 233 234 /* Error type, mapped on e->msg */ 235 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 236 p = pvt->msg; 237 switch (mem_err->error_type) { 238 case 0: 239 p += sprintf(p, "Unknown"); 240 break; 241 case 1: 242 p += sprintf(p, "No error"); 243 break; 244 case 2: 245 p += sprintf(p, "Single-bit ECC"); 246 break; 247 case 3: 248 p += sprintf(p, "Multi-bit ECC"); 249 break; 250 case 4: 251 p += sprintf(p, "Single-symbol ChipKill ECC"); 252 break; 253 case 5: 254 p += sprintf(p, "Multi-symbol ChipKill ECC"); 255 break; 256 case 6: 257 p += sprintf(p, "Master abort"); 258 break; 259 case 7: 260 p += sprintf(p, "Target abort"); 261 break; 262 case 8: 263 p += sprintf(p, "Parity Error"); 264 break; 265 case 9: 266 p += sprintf(p, "Watchdog timeout"); 267 break; 268 case 10: 269 p += sprintf(p, "Invalid address"); 270 break; 271 case 11: 272 p += sprintf(p, "Mirror Broken"); 273 break; 274 case 12: 275 p += sprintf(p, "Memory Sparing"); 276 break; 277 case 13: 278 p += sprintf(p, "Scrub corrected error"); 279 break; 280 case 14: 281 p += sprintf(p, "Scrub uncorrected error"); 282 break; 283 case 15: 284 p += sprintf(p, "Physical Memory Map-out event"); 285 break; 286 default: 287 p += sprintf(p, "reserved error (%d)", 288 mem_err->error_type); 289 } 290 } else { 291 strcpy(pvt->msg, "unknown error"); 292 } 293 294 /* Error address */ 295 if (mem_err->validation_bits & CPER_MEM_VALID_PA) { 296 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; 297 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; 298 } 299 300 /* Error grain */ 301 if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) 302 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); 303 304 /* Memory error location, mapped on e->location */ 305 p = e->location; 306 if (mem_err->validation_bits & CPER_MEM_VALID_NODE) 307 p += sprintf(p, "node:%d ", mem_err->node); 308 if (mem_err->validation_bits & CPER_MEM_VALID_CARD) 309 p += sprintf(p, "card:%d ", mem_err->card); 310 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE) 311 p += sprintf(p, "module:%d ", mem_err->module); 312 if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER) 313 p += sprintf(p, "rank:%d ", mem_err->rank); 314 if (mem_err->validation_bits & CPER_MEM_VALID_BANK) 315 p += sprintf(p, "bank:%d ", mem_err->bank); 316 if (mem_err->validation_bits & CPER_MEM_VALID_ROW) 317 p += sprintf(p, "row:%d ", mem_err->row); 318 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) 319 p += sprintf(p, "col:%d ", mem_err->column); 320 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) 321 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos); 322 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { 323 const char *bank = NULL, *device = NULL; 324 dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device); 325 if (bank != NULL && device != NULL) 326 p += sprintf(p, "DIMM location:%s %s ", bank, device); 327 else 328 p += sprintf(p, "DIMM DMI handle: 0x%.4x ", 329 mem_err->mem_dev_handle); 330 } 331 if (p > e->location) 332 *(p - 1) = '\0'; 333 334 /* All other fields are mapped on e->other_detail */ 335 p = pvt->other_detail; 336 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) { 337 u64 status = mem_err->error_status; 338 339 p += sprintf(p, "status(0x%016llx): ", (long long)status); 340 switch ((status >> 8) & 0xff) { 341 case 1: 342 p += sprintf(p, "Error detected internal to the component "); 343 break; 344 case 16: 345 p += sprintf(p, "Error detected in the bus "); 346 break; 347 case 4: 348 p += sprintf(p, "Storage error in DRAM memory "); 349 break; 350 case 5: 351 p += sprintf(p, "Storage error in TLB "); 352 break; 353 case 6: 354 p += sprintf(p, "Storage error in cache "); 355 break; 356 case 7: 357 p += sprintf(p, "Error in one or more functional units "); 358 break; 359 case 8: 360 p += sprintf(p, "component failed self test "); 361 break; 362 case 9: 363 p += sprintf(p, "Overflow or undervalue of internal queue "); 364 break; 365 case 17: 366 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR "); 367 break; 368 case 18: 369 p += sprintf(p, "Improper access error "); 370 break; 371 case 19: 372 p += sprintf(p, "Access to a memory address which is not mapped to any component "); 373 break; 374 case 20: 375 p += sprintf(p, "Loss of Lockstep "); 376 break; 377 case 21: 378 p += sprintf(p, "Response not associated with a request "); 379 break; 380 case 22: 381 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits "); 382 break; 383 case 23: 384 p += sprintf(p, "Detection of a PATH_ERROR "); 385 break; 386 case 25: 387 p += sprintf(p, "Bus operation timeout "); 388 break; 389 case 26: 390 p += sprintf(p, "A read was issued to data that has been poisoned "); 391 break; 392 default: 393 p += sprintf(p, "reserved "); 394 break; 395 } 396 } 397 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) 398 p += sprintf(p, "requestorID: 0x%016llx ", 399 (long long)mem_err->requestor_id); 400 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID) 401 p += sprintf(p, "responderID: 0x%016llx ", 402 (long long)mem_err->responder_id); 403 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID) 404 p += sprintf(p, "targetID: 0x%016llx ", 405 (long long)mem_err->responder_id); 406 if (p > pvt->other_detail) 407 *(p - 1) = '\0'; 408 409 /* Generate the trace event */ 410 grain_bits = fls_long(e->grain); 411 snprintf(pvt->detail_location, sizeof(pvt->detail_location), 412 "APEI location: %s %s", e->location, e->other_detail); 413 trace_mc_event(type, e->msg, e->label, e->error_count, 414 mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, 415 (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, 416 grain_bits, e->syndrome, pvt->detail_location); 417 418 edac_raw_mc_handle_error(type, mci, e); 419 spin_unlock_irqrestore(&ghes_lock, flags); 420 } 421 422 /* 423 * Known systems that are safe to enable this module. 424 */ 425 static struct acpi_platform_list plat_list[] = { 426 {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, 427 { } /* End */ 428 }; 429 430 int ghes_edac_register(struct ghes *ghes, struct device *dev) 431 { 432 bool fake = false; 433 int rc, num_dimm = 0; 434 struct mem_ctl_info *mci; 435 struct edac_mc_layer layers[1]; 436 struct ghes_edac_dimm_fill dimm_fill; 437 int idx; 438 439 /* Check if safe to enable on this system */ 440 idx = acpi_match_platform_list(plat_list); 441 if (!force_load && idx < 0) 442 return 0; 443 444 /* 445 * We have only one logical memory controller to which all DIMMs belong. 446 */ 447 if (atomic_inc_return(&ghes_init) > 1) 448 return 0; 449 450 /* Get the number of DIMMs */ 451 dmi_walk(ghes_edac_count_dimms, &num_dimm); 452 453 /* Check if we've got a bogus BIOS */ 454 if (num_dimm == 0) { 455 fake = true; 456 num_dimm = 1; 457 } 458 459 layers[0].type = EDAC_MC_LAYER_ALL_MEM; 460 layers[0].size = num_dimm; 461 layers[0].is_virt_csrow = true; 462 463 mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt)); 464 if (!mci) { 465 pr_info("Can't allocate memory for EDAC data\n"); 466 return -ENOMEM; 467 } 468 469 ghes_pvt = mci->pvt_info; 470 ghes_pvt->ghes = ghes; 471 ghes_pvt->mci = mci; 472 473 mci->pdev = dev; 474 mci->mtype_cap = MEM_FLAG_EMPTY; 475 mci->edac_ctl_cap = EDAC_FLAG_NONE; 476 mci->edac_cap = EDAC_FLAG_NONE; 477 mci->mod_name = "ghes_edac.c"; 478 mci->ctl_name = "ghes_edac"; 479 mci->dev_name = "ghes"; 480 481 if (fake) { 482 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); 483 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); 484 pr_info("work on such system. Use this driver with caution\n"); 485 } else if (idx < 0) { 486 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); 487 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); 488 pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); 489 pr_info("If you find incorrect reports, please contact your hardware vendor\n"); 490 pr_info("to correct its BIOS.\n"); 491 pr_info("This system has %d DIMM sockets.\n", num_dimm); 492 } 493 494 if (!fake) { 495 dimm_fill.count = 0; 496 dimm_fill.mci = mci; 497 dmi_walk(ghes_edac_dmidecode, &dimm_fill); 498 } else { 499 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 500 mci->n_layers, 0, 0, 0); 501 502 dimm->nr_pages = 1; 503 dimm->grain = 128; 504 dimm->mtype = MEM_UNKNOWN; 505 dimm->dtype = DEV_UNKNOWN; 506 dimm->edac_mode = EDAC_SECDED; 507 } 508 509 rc = edac_mc_add_mc(mci); 510 if (rc < 0) { 511 pr_info("Can't register at EDAC core\n"); 512 edac_mc_free(mci); 513 return -ENODEV; 514 } 515 return 0; 516 } 517 518 void ghes_edac_unregister(struct ghes *ghes) 519 { 520 struct mem_ctl_info *mci; 521 522 mci = ghes_pvt->mci; 523 edac_mc_del_mc(mci->pdev); 524 edac_mc_free(mci); 525 } 526