1 /* 2 * GHES/EDAC Linux driver 3 * 4 * This file may be distributed under the terms of the GNU General Public 5 * License version 2. 6 * 7 * Copyright (c) 2013 by Mauro Carvalho Chehab 8 * 9 * Red Hat Inc. http://www.redhat.com 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <acpi/ghes.h> 15 #include <linux/edac.h> 16 #include <linux/dmi.h> 17 #include "edac_module.h" 18 #include <ras/ras_event.h> 19 20 struct ghes_edac_pvt { 21 struct list_head list; 22 struct ghes *ghes; 23 struct mem_ctl_info *mci; 24 25 /* Buffers for the error handling routine */ 26 char detail_location[240]; 27 char other_detail[160]; 28 char msg[80]; 29 }; 30 31 static LIST_HEAD(ghes_reglist); 32 static DEFINE_MUTEX(ghes_edac_lock); 33 static int ghes_edac_mc_num; 34 35 36 /* Memory Device - Type 17 of SMBIOS spec */ 37 struct memdev_dmi_entry { 38 u8 type; 39 u8 length; 40 u16 handle; 41 u16 phys_mem_array_handle; 42 u16 mem_err_info_handle; 43 u16 total_width; 44 u16 data_width; 45 u16 size; 46 u8 form_factor; 47 u8 device_set; 48 u8 device_locator; 49 u8 bank_locator; 50 u8 memory_type; 51 u16 type_detail; 52 u16 speed; 53 u8 manufacturer; 54 u8 serial_number; 55 u8 asset_tag; 56 u8 part_number; 57 u8 attributes; 58 u32 extended_size; 59 u16 conf_mem_clk_speed; 60 } __attribute__((__packed__)); 61 62 struct ghes_edac_dimm_fill { 63 struct mem_ctl_info *mci; 64 unsigned count; 65 }; 66 67 static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) 68 { 69 int *num_dimm = arg; 70 71 if (dh->type == DMI_ENTRY_MEM_DEVICE) 72 (*num_dimm)++; 73 } 74 75 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) 76 { 77 struct ghes_edac_dimm_fill *dimm_fill = arg; 78 struct mem_ctl_info *mci = dimm_fill->mci; 79 80 if (dh->type == DMI_ENTRY_MEM_DEVICE) { 81 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; 82 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 83 mci->n_layers, 84 dimm_fill->count, 0, 0); 85 86 if (entry->size == 0xffff) { 87 pr_info("Can't get DIMM%i size\n", 88 dimm_fill->count); 89 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ 90 } else if (entry->size == 0x7fff) { 91 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); 92 } else { 93 if (entry->size & 1 << 15) 94 dimm->nr_pages = MiB_TO_PAGES((entry->size & 95 0x7fff) << 10); 96 else 97 dimm->nr_pages = MiB_TO_PAGES(entry->size); 98 } 99 100 switch (entry->memory_type) { 101 case 0x12: 102 if (entry->type_detail & 1 << 13) 103 dimm->mtype = MEM_RDDR; 104 else 105 dimm->mtype = MEM_DDR; 106 break; 107 case 0x13: 108 if (entry->type_detail & 1 << 13) 109 dimm->mtype = MEM_RDDR2; 110 else 111 dimm->mtype = MEM_DDR2; 112 break; 113 case 0x14: 114 dimm->mtype = MEM_FB_DDR2; 115 break; 116 case 0x18: 117 if (entry->type_detail & 1 << 13) 118 dimm->mtype = MEM_RDDR3; 119 else 120 dimm->mtype = MEM_DDR3; 121 break; 122 default: 123 if (entry->type_detail & 1 << 6) 124 dimm->mtype = MEM_RMBS; 125 else if ((entry->type_detail & ((1 << 7) | (1 << 13))) 126 == ((1 << 7) | (1 << 13))) 127 dimm->mtype = MEM_RDR; 128 else if (entry->type_detail & 1 << 7) 129 dimm->mtype = MEM_SDR; 130 else if (entry->type_detail & 1 << 9) 131 dimm->mtype = MEM_EDO; 132 else 133 dimm->mtype = MEM_UNKNOWN; 134 } 135 136 /* 137 * Actually, we can only detect if the memory has bits for 138 * checksum or not 139 */ 140 if (entry->total_width == entry->data_width) 141 dimm->edac_mode = EDAC_NONE; 142 else 143 dimm->edac_mode = EDAC_SECDED; 144 145 dimm->dtype = DEV_UNKNOWN; 146 dimm->grain = 128; /* Likely, worse case */ 147 148 /* 149 * FIXME: It shouldn't be hard to also fill the DIMM labels 150 */ 151 152 if (dimm->nr_pages) { 153 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", 154 dimm_fill->count, edac_mem_types[dimm->mtype], 155 PAGES_TO_MiB(dimm->nr_pages), 156 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); 157 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", 158 entry->memory_type, entry->type_detail, 159 entry->total_width, entry->data_width); 160 } 161 162 dimm_fill->count++; 163 } 164 } 165 166 void ghes_edac_report_mem_error(struct ghes *ghes, int sev, 167 struct cper_sec_mem_err *mem_err) 168 { 169 enum hw_event_mc_err_type type; 170 struct edac_raw_error_desc *e; 171 struct mem_ctl_info *mci; 172 struct ghes_edac_pvt *pvt = NULL; 173 char *p; 174 u8 grain_bits; 175 176 list_for_each_entry(pvt, &ghes_reglist, list) { 177 if (ghes == pvt->ghes) 178 break; 179 } 180 if (!pvt) { 181 pr_err("Internal error: Can't find EDAC structure\n"); 182 return; 183 } 184 mci = pvt->mci; 185 e = &mci->error_desc; 186 187 /* Cleans the error report buffer */ 188 memset(e, 0, sizeof (*e)); 189 e->error_count = 1; 190 strcpy(e->label, "unknown label"); 191 e->msg = pvt->msg; 192 e->other_detail = pvt->other_detail; 193 e->top_layer = -1; 194 e->mid_layer = -1; 195 e->low_layer = -1; 196 *pvt->other_detail = '\0'; 197 *pvt->msg = '\0'; 198 199 switch (sev) { 200 case GHES_SEV_CORRECTED: 201 type = HW_EVENT_ERR_CORRECTED; 202 break; 203 case GHES_SEV_RECOVERABLE: 204 type = HW_EVENT_ERR_UNCORRECTED; 205 break; 206 case GHES_SEV_PANIC: 207 type = HW_EVENT_ERR_FATAL; 208 break; 209 default: 210 case GHES_SEV_NO: 211 type = HW_EVENT_ERR_INFO; 212 } 213 214 edac_dbg(1, "error validation_bits: 0x%08llx\n", 215 (long long)mem_err->validation_bits); 216 217 /* Error type, mapped on e->msg */ 218 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 219 p = pvt->msg; 220 switch (mem_err->error_type) { 221 case 0: 222 p += sprintf(p, "Unknown"); 223 break; 224 case 1: 225 p += sprintf(p, "No error"); 226 break; 227 case 2: 228 p += sprintf(p, "Single-bit ECC"); 229 break; 230 case 3: 231 p += sprintf(p, "Multi-bit ECC"); 232 break; 233 case 4: 234 p += sprintf(p, "Single-symbol ChipKill ECC"); 235 break; 236 case 5: 237 p += sprintf(p, "Multi-symbol ChipKill ECC"); 238 break; 239 case 6: 240 p += sprintf(p, "Master abort"); 241 break; 242 case 7: 243 p += sprintf(p, "Target abort"); 244 break; 245 case 8: 246 p += sprintf(p, "Parity Error"); 247 break; 248 case 9: 249 p += sprintf(p, "Watchdog timeout"); 250 break; 251 case 10: 252 p += sprintf(p, "Invalid address"); 253 break; 254 case 11: 255 p += sprintf(p, "Mirror Broken"); 256 break; 257 case 12: 258 p += sprintf(p, "Memory Sparing"); 259 break; 260 case 13: 261 p += sprintf(p, "Scrub corrected error"); 262 break; 263 case 14: 264 p += sprintf(p, "Scrub uncorrected error"); 265 break; 266 case 15: 267 p += sprintf(p, "Physical Memory Map-out event"); 268 break; 269 default: 270 p += sprintf(p, "reserved error (%d)", 271 mem_err->error_type); 272 } 273 } else { 274 strcpy(pvt->msg, "unknown error"); 275 } 276 277 /* Error address */ 278 if (mem_err->validation_bits & CPER_MEM_VALID_PA) { 279 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; 280 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; 281 } 282 283 /* Error grain */ 284 if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) 285 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); 286 287 /* Memory error location, mapped on e->location */ 288 p = e->location; 289 if (mem_err->validation_bits & CPER_MEM_VALID_NODE) 290 p += sprintf(p, "node:%d ", mem_err->node); 291 if (mem_err->validation_bits & CPER_MEM_VALID_CARD) 292 p += sprintf(p, "card:%d ", mem_err->card); 293 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE) 294 p += sprintf(p, "module:%d ", mem_err->module); 295 if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER) 296 p += sprintf(p, "rank:%d ", mem_err->rank); 297 if (mem_err->validation_bits & CPER_MEM_VALID_BANK) 298 p += sprintf(p, "bank:%d ", mem_err->bank); 299 if (mem_err->validation_bits & CPER_MEM_VALID_ROW) 300 p += sprintf(p, "row:%d ", mem_err->row); 301 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) 302 p += sprintf(p, "col:%d ", mem_err->column); 303 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) 304 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos); 305 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { 306 const char *bank = NULL, *device = NULL; 307 dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device); 308 if (bank != NULL && device != NULL) 309 p += sprintf(p, "DIMM location:%s %s ", bank, device); 310 else 311 p += sprintf(p, "DIMM DMI handle: 0x%.4x ", 312 mem_err->mem_dev_handle); 313 } 314 if (p > e->location) 315 *(p - 1) = '\0'; 316 317 /* All other fields are mapped on e->other_detail */ 318 p = pvt->other_detail; 319 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) { 320 u64 status = mem_err->error_status; 321 322 p += sprintf(p, "status(0x%016llx): ", (long long)status); 323 switch ((status >> 8) & 0xff) { 324 case 1: 325 p += sprintf(p, "Error detected internal to the component "); 326 break; 327 case 16: 328 p += sprintf(p, "Error detected in the bus "); 329 break; 330 case 4: 331 p += sprintf(p, "Storage error in DRAM memory "); 332 break; 333 case 5: 334 p += sprintf(p, "Storage error in TLB "); 335 break; 336 case 6: 337 p += sprintf(p, "Storage error in cache "); 338 break; 339 case 7: 340 p += sprintf(p, "Error in one or more functional units "); 341 break; 342 case 8: 343 p += sprintf(p, "component failed self test "); 344 break; 345 case 9: 346 p += sprintf(p, "Overflow or undervalue of internal queue "); 347 break; 348 case 17: 349 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR "); 350 break; 351 case 18: 352 p += sprintf(p, "Improper access error "); 353 break; 354 case 19: 355 p += sprintf(p, "Access to a memory address which is not mapped to any component "); 356 break; 357 case 20: 358 p += sprintf(p, "Loss of Lockstep "); 359 break; 360 case 21: 361 p += sprintf(p, "Response not associated with a request "); 362 break; 363 case 22: 364 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits "); 365 break; 366 case 23: 367 p += sprintf(p, "Detection of a PATH_ERROR "); 368 break; 369 case 25: 370 p += sprintf(p, "Bus operation timeout "); 371 break; 372 case 26: 373 p += sprintf(p, "A read was issued to data that has been poisoned "); 374 break; 375 default: 376 p += sprintf(p, "reserved "); 377 break; 378 } 379 } 380 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) 381 p += sprintf(p, "requestorID: 0x%016llx ", 382 (long long)mem_err->requestor_id); 383 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID) 384 p += sprintf(p, "responderID: 0x%016llx ", 385 (long long)mem_err->responder_id); 386 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID) 387 p += sprintf(p, "targetID: 0x%016llx ", 388 (long long)mem_err->responder_id); 389 if (p > pvt->other_detail) 390 *(p - 1) = '\0'; 391 392 /* Generate the trace event */ 393 grain_bits = fls_long(e->grain); 394 snprintf(pvt->detail_location, sizeof(pvt->detail_location), 395 "APEI location: %s %s", e->location, e->other_detail); 396 trace_mc_event(type, e->msg, e->label, e->error_count, 397 mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, 398 (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, 399 grain_bits, e->syndrome, pvt->detail_location); 400 401 /* Report the error via EDAC API */ 402 edac_raw_mc_handle_error(type, mci, e); 403 } 404 EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error); 405 406 int ghes_edac_register(struct ghes *ghes, struct device *dev) 407 { 408 bool fake = false; 409 int rc, num_dimm = 0; 410 struct mem_ctl_info *mci; 411 struct edac_mc_layer layers[1]; 412 struct ghes_edac_pvt *pvt; 413 struct ghes_edac_dimm_fill dimm_fill; 414 415 /* Get the number of DIMMs */ 416 dmi_walk(ghes_edac_count_dimms, &num_dimm); 417 418 /* Check if we've got a bogus BIOS */ 419 if (num_dimm == 0) { 420 fake = true; 421 num_dimm = 1; 422 } 423 424 layers[0].type = EDAC_MC_LAYER_ALL_MEM; 425 layers[0].size = num_dimm; 426 layers[0].is_virt_csrow = true; 427 428 /* 429 * We need to serialize edac_mc_alloc() and edac_mc_add_mc(), 430 * to avoid duplicated memory controller numbers 431 */ 432 mutex_lock(&ghes_edac_lock); 433 mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers, 434 sizeof(*pvt)); 435 if (!mci) { 436 pr_info("Can't allocate memory for EDAC data\n"); 437 mutex_unlock(&ghes_edac_lock); 438 return -ENOMEM; 439 } 440 441 pvt = mci->pvt_info; 442 memset(pvt, 0, sizeof(*pvt)); 443 list_add_tail(&pvt->list, &ghes_reglist); 444 pvt->ghes = ghes; 445 pvt->mci = mci; 446 mci->pdev = dev; 447 448 mci->mtype_cap = MEM_FLAG_EMPTY; 449 mci->edac_ctl_cap = EDAC_FLAG_NONE; 450 mci->edac_cap = EDAC_FLAG_NONE; 451 mci->mod_name = "ghes_edac.c"; 452 mci->ctl_name = "ghes_edac"; 453 mci->dev_name = "ghes"; 454 455 if (!ghes_edac_mc_num) { 456 if (!fake) { 457 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); 458 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); 459 pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); 460 pr_info("If you find incorrect reports, please contact your hardware vendor\n"); 461 pr_info("to correct its BIOS.\n"); 462 pr_info("This system has %d DIMM sockets.\n", 463 num_dimm); 464 } else { 465 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); 466 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); 467 pr_info("work on such system. Use this driver with caution\n"); 468 } 469 } 470 471 if (!fake) { 472 /* 473 * Fill DIMM info from DMI for the memory controller #0 474 * 475 * Keep it in blank for the other memory controllers, as 476 * there's no reliable way to properly credit each DIMM to 477 * the memory controller, as different BIOSes fill the 478 * DMI bank location fields on different ways 479 */ 480 if (!ghes_edac_mc_num) { 481 dimm_fill.count = 0; 482 dimm_fill.mci = mci; 483 dmi_walk(ghes_edac_dmidecode, &dimm_fill); 484 } 485 } else { 486 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 487 mci->n_layers, 0, 0, 0); 488 489 dimm->nr_pages = 1; 490 dimm->grain = 128; 491 dimm->mtype = MEM_UNKNOWN; 492 dimm->dtype = DEV_UNKNOWN; 493 dimm->edac_mode = EDAC_SECDED; 494 } 495 496 rc = edac_mc_add_mc(mci); 497 if (rc < 0) { 498 pr_info("Can't register at EDAC core\n"); 499 edac_mc_free(mci); 500 mutex_unlock(&ghes_edac_lock); 501 return -ENODEV; 502 } 503 504 ghes_edac_mc_num++; 505 mutex_unlock(&ghes_edac_lock); 506 return 0; 507 } 508 EXPORT_SYMBOL_GPL(ghes_edac_register); 509 510 void ghes_edac_unregister(struct ghes *ghes) 511 { 512 struct mem_ctl_info *mci; 513 struct ghes_edac_pvt *pvt, *tmp; 514 515 list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) { 516 if (ghes == pvt->ghes) { 517 mci = pvt->mci; 518 edac_mc_del_mc(mci->pdev); 519 edac_mc_free(mci); 520 list_del(&pvt->list); 521 } 522 } 523 } 524 EXPORT_SYMBOL_GPL(ghes_edac_unregister); 525