1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * UEFI Common Platform Error Record (CPER) support 4 * 5 * Copyright (C) 2010, Intel Corp. 6 * Author: Huang Ying <ying.huang@intel.com> 7 * 8 * CPER is the format used to describe platform hardware error by 9 * various tables, such as ERST, BERT and HEST etc. 10 * 11 * For more information about CPER, please refer to Appendix N of UEFI 12 * Specification version 2.4. 13 */ 14 15 #include <linux/kernel.h> 16 #include <linux/module.h> 17 #include <linux/time.h> 18 #include <linux/cper.h> 19 #include <linux/dmi.h> 20 #include <linux/acpi.h> 21 #include <linux/pci.h> 22 #include <linux/aer.h> 23 #include <linux/printk.h> 24 #include <linux/bcd.h> 25 #include <acpi/ghes.h> 26 #include <ras/ras_event.h> 27 28 static char rcd_decode_str[CPER_REC_LEN]; 29 30 /* 31 * CPER record ID need to be unique even after reboot, because record 32 * ID is used as index for ERST storage, while CPER records from 33 * multiple boot may co-exist in ERST. 34 */ 35 u64 cper_next_record_id(void) 36 { 37 static atomic64_t seq; 38 39 if (!atomic64_read(&seq)) { 40 time64_t time = ktime_get_real_seconds(); 41 42 /* 43 * This code is unlikely to still be needed in year 2106, 44 * but just in case, let's use a few more bits for timestamps 45 * after y2038 to be sure they keep increasing monotonically 46 * for the next few hundred years... 47 */ 48 if (time < 0x80000000) 49 atomic64_set(&seq, (ktime_get_real_seconds()) << 32); 50 else 51 atomic64_set(&seq, 0x8000000000000000ull | 52 ktime_get_real_seconds() << 24); 53 } 54 55 return atomic64_inc_return(&seq); 56 } 57 EXPORT_SYMBOL_GPL(cper_next_record_id); 58 59 static const char * const severity_strs[] = { 60 "recoverable", 61 "fatal", 62 "corrected", 63 "info", 64 }; 65 66 const char *cper_severity_str(unsigned int severity) 67 { 68 return severity < ARRAY_SIZE(severity_strs) ? 69 severity_strs[severity] : "unknown"; 70 } 71 EXPORT_SYMBOL_GPL(cper_severity_str); 72 73 /* 74 * cper_print_bits - print strings for set bits 75 * @pfx: prefix for each line, including log level and prefix string 76 * @bits: bit mask 77 * @strs: string array, indexed by bit position 78 * @strs_size: size of the string array: @strs 79 * 80 * For each set bit in @bits, print the corresponding string in @strs. 81 * If the output length is longer than 80, multiple line will be 82 * printed, with @pfx is printed at the beginning of each line. 83 */ 84 void cper_print_bits(const char *pfx, unsigned int bits, 85 const char * const strs[], unsigned int strs_size) 86 { 87 int i, len = 0; 88 const char *str; 89 char buf[84]; 90 91 for (i = 0; i < strs_size; i++) { 92 if (!(bits & (1U << i))) 93 continue; 94 str = strs[i]; 95 if (!str) 96 continue; 97 if (len && len + strlen(str) + 2 > 80) { 98 printk("%s\n", buf); 99 len = 0; 100 } 101 if (!len) 102 len = snprintf(buf, sizeof(buf), "%s%s", pfx, str); 103 else 104 len += scnprintf(buf+len, sizeof(buf)-len, ", %s", str); 105 } 106 if (len) 107 printk("%s\n", buf); 108 } 109 110 static const char * const proc_type_strs[] = { 111 "IA32/X64", 112 "IA64", 113 "ARM", 114 }; 115 116 static const char * const proc_isa_strs[] = { 117 "IA32", 118 "IA64", 119 "X64", 120 "ARM A32/T32", 121 "ARM A64", 122 }; 123 124 const char * const cper_proc_error_type_strs[] = { 125 "cache error", 126 "TLB error", 127 "bus error", 128 "micro-architectural error", 129 }; 130 131 static const char * const proc_op_strs[] = { 132 "unknown or generic", 133 "data read", 134 "data write", 135 "instruction execution", 136 }; 137 138 static const char * const proc_flag_strs[] = { 139 "restartable", 140 "precise IP", 141 "overflow", 142 "corrected", 143 }; 144 145 static void cper_print_proc_generic(const char *pfx, 146 const struct cper_sec_proc_generic *proc) 147 { 148 if (proc->validation_bits & CPER_PROC_VALID_TYPE) 149 printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type, 150 proc->proc_type < ARRAY_SIZE(proc_type_strs) ? 151 proc_type_strs[proc->proc_type] : "unknown"); 152 if (proc->validation_bits & CPER_PROC_VALID_ISA) 153 printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa, 154 proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ? 155 proc_isa_strs[proc->proc_isa] : "unknown"); 156 if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) { 157 printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type); 158 cper_print_bits(pfx, proc->proc_error_type, 159 cper_proc_error_type_strs, 160 ARRAY_SIZE(cper_proc_error_type_strs)); 161 } 162 if (proc->validation_bits & CPER_PROC_VALID_OPERATION) 163 printk("%s""operation: %d, %s\n", pfx, proc->operation, 164 proc->operation < ARRAY_SIZE(proc_op_strs) ? 165 proc_op_strs[proc->operation] : "unknown"); 166 if (proc->validation_bits & CPER_PROC_VALID_FLAGS) { 167 printk("%s""flags: 0x%02x\n", pfx, proc->flags); 168 cper_print_bits(pfx, proc->flags, proc_flag_strs, 169 ARRAY_SIZE(proc_flag_strs)); 170 } 171 if (proc->validation_bits & CPER_PROC_VALID_LEVEL) 172 printk("%s""level: %d\n", pfx, proc->level); 173 if (proc->validation_bits & CPER_PROC_VALID_VERSION) 174 printk("%s""version_info: 0x%016llx\n", pfx, proc->cpu_version); 175 if (proc->validation_bits & CPER_PROC_VALID_ID) 176 printk("%s""processor_id: 0x%016llx\n", pfx, proc->proc_id); 177 if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS) 178 printk("%s""target_address: 0x%016llx\n", 179 pfx, proc->target_addr); 180 if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID) 181 printk("%s""requestor_id: 0x%016llx\n", 182 pfx, proc->requestor_id); 183 if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID) 184 printk("%s""responder_id: 0x%016llx\n", 185 pfx, proc->responder_id); 186 if (proc->validation_bits & CPER_PROC_VALID_IP) 187 printk("%s""IP: 0x%016llx\n", pfx, proc->ip); 188 } 189 190 static const char * const mem_err_type_strs[] = { 191 "unknown", 192 "no error", 193 "single-bit ECC", 194 "multi-bit ECC", 195 "single-symbol chipkill ECC", 196 "multi-symbol chipkill ECC", 197 "master abort", 198 "target abort", 199 "parity error", 200 "watchdog timeout", 201 "invalid address", 202 "mirror Broken", 203 "memory sparing", 204 "scrub corrected error", 205 "scrub uncorrected error", 206 "physical memory map-out event", 207 }; 208 209 const char *cper_mem_err_type_str(unsigned int etype) 210 { 211 return etype < ARRAY_SIZE(mem_err_type_strs) ? 212 mem_err_type_strs[etype] : "unknown"; 213 } 214 EXPORT_SYMBOL_GPL(cper_mem_err_type_str); 215 216 static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) 217 { 218 u32 len, n; 219 220 if (!msg) 221 return 0; 222 223 n = 0; 224 len = CPER_REC_LEN - 1; 225 if (mem->validation_bits & CPER_MEM_VALID_NODE) 226 n += scnprintf(msg + n, len - n, "node: %d ", mem->node); 227 if (mem->validation_bits & CPER_MEM_VALID_CARD) 228 n += scnprintf(msg + n, len - n, "card: %d ", mem->card); 229 if (mem->validation_bits & CPER_MEM_VALID_MODULE) 230 n += scnprintf(msg + n, len - n, "module: %d ", mem->module); 231 if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) 232 n += scnprintf(msg + n, len - n, "rank: %d ", mem->rank); 233 if (mem->validation_bits & CPER_MEM_VALID_BANK) 234 n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank); 235 if (mem->validation_bits & CPER_MEM_VALID_DEVICE) 236 n += scnprintf(msg + n, len - n, "device: %d ", mem->device); 237 if (mem->validation_bits & CPER_MEM_VALID_ROW) 238 n += scnprintf(msg + n, len - n, "row: %d ", mem->row); 239 if (mem->validation_bits & CPER_MEM_VALID_COLUMN) 240 n += scnprintf(msg + n, len - n, "column: %d ", mem->column); 241 if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) 242 n += scnprintf(msg + n, len - n, "bit_position: %d ", 243 mem->bit_pos); 244 if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) 245 n += scnprintf(msg + n, len - n, "requestor_id: 0x%016llx ", 246 mem->requestor_id); 247 if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) 248 n += scnprintf(msg + n, len - n, "responder_id: 0x%016llx ", 249 mem->responder_id); 250 if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) 251 scnprintf(msg + n, len - n, "target_id: 0x%016llx ", 252 mem->target_id); 253 254 msg[n] = '\0'; 255 return n; 256 } 257 258 static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) 259 { 260 u32 len, n; 261 const char *bank = NULL, *device = NULL; 262 263 if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)) 264 return 0; 265 266 n = 0; 267 len = CPER_REC_LEN - 1; 268 dmi_memdev_name(mem->mem_dev_handle, &bank, &device); 269 if (bank && device) 270 n = snprintf(msg, len, "DIMM location: %s %s ", bank, device); 271 else 272 n = snprintf(msg, len, 273 "DIMM location: not present. DMI handle: 0x%.4x ", 274 mem->mem_dev_handle); 275 276 msg[n] = '\0'; 277 return n; 278 } 279 280 void cper_mem_err_pack(const struct cper_sec_mem_err *mem, 281 struct cper_mem_err_compact *cmem) 282 { 283 cmem->validation_bits = mem->validation_bits; 284 cmem->node = mem->node; 285 cmem->card = mem->card; 286 cmem->module = mem->module; 287 cmem->bank = mem->bank; 288 cmem->device = mem->device; 289 cmem->row = mem->row; 290 cmem->column = mem->column; 291 cmem->bit_pos = mem->bit_pos; 292 cmem->requestor_id = mem->requestor_id; 293 cmem->responder_id = mem->responder_id; 294 cmem->target_id = mem->target_id; 295 cmem->rank = mem->rank; 296 cmem->mem_array_handle = mem->mem_array_handle; 297 cmem->mem_dev_handle = mem->mem_dev_handle; 298 } 299 300 const char *cper_mem_err_unpack(struct trace_seq *p, 301 struct cper_mem_err_compact *cmem) 302 { 303 const char *ret = trace_seq_buffer_ptr(p); 304 305 if (cper_mem_err_location(cmem, rcd_decode_str)) 306 trace_seq_printf(p, "%s", rcd_decode_str); 307 if (cper_dimm_err_location(cmem, rcd_decode_str)) 308 trace_seq_printf(p, "%s", rcd_decode_str); 309 trace_seq_putc(p, '\0'); 310 311 return ret; 312 } 313 314 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, 315 int len) 316 { 317 struct cper_mem_err_compact cmem; 318 319 /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ 320 if (len == sizeof(struct cper_sec_mem_err_old) && 321 (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) { 322 pr_err(FW_WARN "valid bits set for fields beyond structure\n"); 323 return; 324 } 325 if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) 326 printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); 327 if (mem->validation_bits & CPER_MEM_VALID_PA) 328 printk("%s""physical_address: 0x%016llx\n", 329 pfx, mem->physical_addr); 330 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) 331 printk("%s""physical_address_mask: 0x%016llx\n", 332 pfx, mem->physical_addr_mask); 333 cper_mem_err_pack(mem, &cmem); 334 if (cper_mem_err_location(&cmem, rcd_decode_str)) 335 printk("%s%s\n", pfx, rcd_decode_str); 336 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 337 u8 etype = mem->error_type; 338 printk("%s""error_type: %d, %s\n", pfx, etype, 339 cper_mem_err_type_str(etype)); 340 } 341 if (cper_dimm_err_location(&cmem, rcd_decode_str)) 342 printk("%s%s\n", pfx, rcd_decode_str); 343 } 344 345 static const char * const pcie_port_type_strs[] = { 346 "PCIe end point", 347 "legacy PCI end point", 348 "unknown", 349 "unknown", 350 "root port", 351 "upstream switch port", 352 "downstream switch port", 353 "PCIe to PCI/PCI-X bridge", 354 "PCI/PCI-X to PCIe bridge", 355 "root complex integrated endpoint device", 356 "root complex event collector", 357 }; 358 359 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, 360 const struct acpi_hest_generic_data *gdata) 361 { 362 if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) 363 printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, 364 pcie->port_type < ARRAY_SIZE(pcie_port_type_strs) ? 365 pcie_port_type_strs[pcie->port_type] : "unknown"); 366 if (pcie->validation_bits & CPER_PCIE_VALID_VERSION) 367 printk("%s""version: %d.%d\n", pfx, 368 pcie->version.major, pcie->version.minor); 369 if (pcie->validation_bits & CPER_PCIE_VALID_COMMAND_STATUS) 370 printk("%s""command: 0x%04x, status: 0x%04x\n", pfx, 371 pcie->command, pcie->status); 372 if (pcie->validation_bits & CPER_PCIE_VALID_DEVICE_ID) { 373 const __u8 *p; 374 printk("%s""device_id: %04x:%02x:%02x.%x\n", pfx, 375 pcie->device_id.segment, pcie->device_id.bus, 376 pcie->device_id.device, pcie->device_id.function); 377 printk("%s""slot: %d\n", pfx, 378 pcie->device_id.slot >> CPER_PCIE_SLOT_SHIFT); 379 printk("%s""secondary_bus: 0x%02x\n", pfx, 380 pcie->device_id.secondary_bus); 381 printk("%s""vendor_id: 0x%04x, device_id: 0x%04x\n", pfx, 382 pcie->device_id.vendor_id, pcie->device_id.device_id); 383 p = pcie->device_id.class_code; 384 printk("%s""class_code: %02x%02x%02x\n", pfx, p[2], p[1], p[0]); 385 } 386 if (pcie->validation_bits & CPER_PCIE_VALID_SERIAL_NUMBER) 387 printk("%s""serial number: 0x%04x, 0x%04x\n", pfx, 388 pcie->serial_number.lower, pcie->serial_number.upper); 389 if (pcie->validation_bits & CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS) 390 printk( 391 "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", 392 pfx, pcie->bridge.secondary_status, pcie->bridge.control); 393 394 /* Fatal errors call __ghes_panic() before AER handler prints this */ 395 if ((pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) && 396 (gdata->error_severity & CPER_SEV_FATAL)) { 397 struct aer_capability_regs *aer; 398 399 aer = (struct aer_capability_regs *)pcie->aer_info; 400 printk("%saer_uncor_status: 0x%08x, aer_uncor_mask: 0x%08x\n", 401 pfx, aer->uncor_status, aer->uncor_mask); 402 printk("%saer_uncor_severity: 0x%08x\n", 403 pfx, aer->uncor_severity); 404 printk("%sTLP Header: %08x %08x %08x %08x\n", pfx, 405 aer->header_log.dw0, aer->header_log.dw1, 406 aer->header_log.dw2, aer->header_log.dw3); 407 } 408 } 409 410 static void cper_print_tstamp(const char *pfx, 411 struct acpi_hest_generic_data_v300 *gdata) 412 { 413 __u8 hour, min, sec, day, mon, year, century, *timestamp; 414 415 if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) { 416 timestamp = (__u8 *)&(gdata->time_stamp); 417 sec = bcd2bin(timestamp[0]); 418 min = bcd2bin(timestamp[1]); 419 hour = bcd2bin(timestamp[2]); 420 day = bcd2bin(timestamp[4]); 421 mon = bcd2bin(timestamp[5]); 422 year = bcd2bin(timestamp[6]); 423 century = bcd2bin(timestamp[7]); 424 425 printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx, 426 (timestamp[3] & 0x1 ? "precise " : "imprecise "), 427 century, year, mon, day, hour, min, sec); 428 } 429 } 430 431 static void 432 cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata, 433 int sec_no) 434 { 435 guid_t *sec_type = (guid_t *)gdata->section_type; 436 __u16 severity; 437 char newpfx[64]; 438 439 if (acpi_hest_get_version(gdata) >= 3) 440 cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata); 441 442 severity = gdata->error_severity; 443 printk("%s""Error %d, type: %s\n", pfx, sec_no, 444 cper_severity_str(severity)); 445 if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) 446 printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id); 447 if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) 448 printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); 449 450 snprintf(newpfx, sizeof(newpfx), "%s ", pfx); 451 if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { 452 struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata); 453 454 printk("%s""section_type: general processor error\n", newpfx); 455 if (gdata->error_data_length >= sizeof(*proc_err)) 456 cper_print_proc_generic(newpfx, proc_err); 457 else 458 goto err_section_too_small; 459 } else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { 460 struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); 461 462 printk("%s""section_type: memory error\n", newpfx); 463 if (gdata->error_data_length >= 464 sizeof(struct cper_sec_mem_err_old)) 465 cper_print_mem(newpfx, mem_err, 466 gdata->error_data_length); 467 else 468 goto err_section_too_small; 469 } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { 470 struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata); 471 472 printk("%s""section_type: PCIe error\n", newpfx); 473 if (gdata->error_data_length >= sizeof(*pcie)) 474 cper_print_pcie(newpfx, pcie, gdata); 475 else 476 goto err_section_too_small; 477 #if defined(CONFIG_ARM64) || defined(CONFIG_ARM) 478 } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { 479 struct cper_sec_proc_arm *arm_err = acpi_hest_get_payload(gdata); 480 481 printk("%ssection_type: ARM processor error\n", newpfx); 482 if (gdata->error_data_length >= sizeof(*arm_err)) 483 cper_print_proc_arm(newpfx, arm_err); 484 else 485 goto err_section_too_small; 486 #endif 487 #if defined(CONFIG_UEFI_CPER_X86) 488 } else if (guid_equal(sec_type, &CPER_SEC_PROC_IA)) { 489 struct cper_sec_proc_ia *ia_err = acpi_hest_get_payload(gdata); 490 491 printk("%ssection_type: IA32/X64 processor error\n", newpfx); 492 if (gdata->error_data_length >= sizeof(*ia_err)) 493 cper_print_proc_ia(newpfx, ia_err); 494 else 495 goto err_section_too_small; 496 #endif 497 } else { 498 const void *err = acpi_hest_get_payload(gdata); 499 500 printk("%ssection type: unknown, %pUl\n", newpfx, sec_type); 501 printk("%ssection length: %#x\n", newpfx, 502 gdata->error_data_length); 503 print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err, 504 gdata->error_data_length, true); 505 } 506 507 return; 508 509 err_section_too_small: 510 pr_err(FW_WARN "error section length is too small\n"); 511 } 512 513 void cper_estatus_print(const char *pfx, 514 const struct acpi_hest_generic_status *estatus) 515 { 516 struct acpi_hest_generic_data *gdata; 517 int sec_no = 0; 518 char newpfx[64]; 519 __u16 severity; 520 521 severity = estatus->error_severity; 522 if (severity == CPER_SEV_CORRECTED) 523 printk("%s%s\n", pfx, 524 "It has been corrected by h/w " 525 "and requires no further action"); 526 printk("%s""event severity: %s\n", pfx, cper_severity_str(severity)); 527 snprintf(newpfx, sizeof(newpfx), "%s ", pfx); 528 529 apei_estatus_for_each_section(estatus, gdata) { 530 cper_estatus_print_section(newpfx, gdata, sec_no); 531 sec_no++; 532 } 533 } 534 EXPORT_SYMBOL_GPL(cper_estatus_print); 535 536 int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus) 537 { 538 if (estatus->data_length && 539 estatus->data_length < sizeof(struct acpi_hest_generic_data)) 540 return -EINVAL; 541 if (estatus->raw_data_length && 542 estatus->raw_data_offset < sizeof(*estatus) + estatus->data_length) 543 return -EINVAL; 544 545 return 0; 546 } 547 EXPORT_SYMBOL_GPL(cper_estatus_check_header); 548 549 int cper_estatus_check(const struct acpi_hest_generic_status *estatus) 550 { 551 struct acpi_hest_generic_data *gdata; 552 unsigned int data_len, record_size; 553 int rc; 554 555 rc = cper_estatus_check_header(estatus); 556 if (rc) 557 return rc; 558 559 data_len = estatus->data_length; 560 561 apei_estatus_for_each_section(estatus, gdata) { 562 if (sizeof(struct acpi_hest_generic_data) > data_len) 563 return -EINVAL; 564 565 record_size = acpi_hest_get_record_size(gdata); 566 if (record_size > data_len) 567 return -EINVAL; 568 569 data_len -= record_size; 570 } 571 if (data_len) 572 return -EINVAL; 573 574 return 0; 575 } 576 EXPORT_SYMBOL_GPL(cper_estatus_check); 577