xref: /openbmc/linux/drivers/firmware/efi/cper.c (revision ddc141e5)
1 /*
2  * UEFI Common Platform Error Record (CPER) support
3  *
4  * Copyright (C) 2010, Intel Corp.
5  *	Author: Huang Ying <ying.huang@intel.com>
6  *
7  * CPER is the format used to describe platform hardware error by
8  * various tables, such as ERST, BERT and HEST etc.
9  *
10  * For more information about CPER, please refer to Appendix N of UEFI
11  * Specification version 2.4.
12  *
13  * This program is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU General Public License version
15  * 2 as published by the Free Software Foundation.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25  */
26 
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/time.h>
30 #include <linux/cper.h>
31 #include <linux/dmi.h>
32 #include <linux/acpi.h>
33 #include <linux/pci.h>
34 #include <linux/aer.h>
35 #include <linux/printk.h>
36 #include <linux/bcd.h>
37 #include <acpi/ghes.h>
38 #include <ras/ras_event.h>
39 
40 #define INDENT_SP	" "
41 
42 static char rcd_decode_str[CPER_REC_LEN];
43 
44 /*
45  * CPER record ID need to be unique even after reboot, because record
46  * ID is used as index for ERST storage, while CPER records from
47  * multiple boot may co-exist in ERST.
48  */
49 u64 cper_next_record_id(void)
50 {
51 	static atomic64_t seq;
52 
53 	if (!atomic64_read(&seq))
54 		atomic64_set(&seq, ((u64)get_seconds()) << 32);
55 
56 	return atomic64_inc_return(&seq);
57 }
58 EXPORT_SYMBOL_GPL(cper_next_record_id);
59 
/* Severity names, indexed by the numeric severity code. */
static const char * const severity_strs[] = {
	[0] = "recoverable",
	[1] = "fatal",
	[2] = "corrected",
	[3] = "info",
};

/*
 * cper_severity_str - map a severity code to its name
 * @severity: numeric severity code
 *
 * Returns the matching entry of severity_strs, or "unknown" when
 * @severity lies outside the table.
 */
const char *cper_severity_str(unsigned int severity)
{
	if (severity >= ARRAY_SIZE(severity_strs))
		return "unknown";

	return severity_strs[severity];
}
EXPORT_SYMBOL_GPL(cper_severity_str);
73 
/*
 * cper_print_bits - print strings for set bits
 * @pfx: prefix for each line, including log level and prefix string
 * @bits: bit mask
 * @strs: string array, indexed by bit position
 * @strs_size: size of the string array: @strs
 *
 * For each set bit in @bits, print the corresponding string in @strs.
 * NULL entries in @strs are skipped.  If the output would become longer
 * than 80 characters, multiple lines are printed, with @pfx printed at
 * the beginning of each line.
 */
void cper_print_bits(const char *pfx, unsigned int bits,
		     const char * const strs[], unsigned int strs_size)
{
	unsigned int i, nbits;
	int len = 0;
	const char *str;
	char buf[84];

	/*
	 * Never test more bit positions than @bits has (8 bits per byte):
	 * shifting by the type width or more is undefined, and such
	 * entries of @strs could never be selected anyway.
	 */
	nbits = 8 * sizeof(bits);
	if (strs_size < nbits)
		nbits = strs_size;

	for (i = 0; i < nbits; i++) {
		if (!(bits & (1U << i)))
			continue;
		str = strs[i];
		if (!str)
			continue;
		/* Flush the pending line if ", <str>" would not fit. */
		if (len && len + strlen(str) + 2 > 80) {
			printk("%s\n", buf);
			len = 0;
		}
		/*
		 * scnprintf() returns what was actually stored, so len can
		 * never run past the end of buf even on truncation.
		 */
		if (!len)
			len = scnprintf(buf, sizeof(buf), "%s%s", pfx, str);
		else
			len += scnprintf(buf + len, sizeof(buf) - len,
					 ", %s", str);
	}
	if (len)
		printk("%s\n", buf);
}
110 
/* Processor type names, indexed by the section's proc_type field. */
static const char * const proc_type_strs[] = {
	[0] = "IA32/X64",
	[1] = "IA64",
	[2] = "ARM",
};
116 
/* Instruction set names, indexed by the section's proc_isa field. */
static const char * const proc_isa_strs[] = {
	[0] = "IA32",
	[1] = "IA64",
	[2] = "X64",
	[3] = "ARM A32/T32",
	[4] = "ARM A64",
};
124 
/* Processor error type names, indexed by bit position in the type mask. */
const char * const cper_proc_error_type_strs[] = {
	[0] = "cache error",
	[1] = "TLB error",
	[2] = "bus error",
	[3] = "micro-architectural error",
};
131 
/* Operation names, indexed by the section's operation field. */
static const char * const proc_op_strs[] = {
	[0] = "unknown or generic",
	[1] = "data read",
	[2] = "data write",
	[3] = "instruction execution",
};
138 
/* Processor flag names, indexed by bit position in the flags field. */
static const char * const proc_flag_strs[] = {
	[0] = "restartable",
	[1] = "precise IP",
	[2] = "overflow",
	[3] = "corrected",
};
145 
146 static void cper_print_proc_generic(const char *pfx,
147 				    const struct cper_sec_proc_generic *proc)
148 {
149 	if (proc->validation_bits & CPER_PROC_VALID_TYPE)
150 		printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type,
151 		       proc->proc_type < ARRAY_SIZE(proc_type_strs) ?
152 		       proc_type_strs[proc->proc_type] : "unknown");
153 	if (proc->validation_bits & CPER_PROC_VALID_ISA)
154 		printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa,
155 		       proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ?
156 		       proc_isa_strs[proc->proc_isa] : "unknown");
157 	if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) {
158 		printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type);
159 		cper_print_bits(pfx, proc->proc_error_type,
160 				cper_proc_error_type_strs,
161 				ARRAY_SIZE(cper_proc_error_type_strs));
162 	}
163 	if (proc->validation_bits & CPER_PROC_VALID_OPERATION)
164 		printk("%s""operation: %d, %s\n", pfx, proc->operation,
165 		       proc->operation < ARRAY_SIZE(proc_op_strs) ?
166 		       proc_op_strs[proc->operation] : "unknown");
167 	if (proc->validation_bits & CPER_PROC_VALID_FLAGS) {
168 		printk("%s""flags: 0x%02x\n", pfx, proc->flags);
169 		cper_print_bits(pfx, proc->flags, proc_flag_strs,
170 				ARRAY_SIZE(proc_flag_strs));
171 	}
172 	if (proc->validation_bits & CPER_PROC_VALID_LEVEL)
173 		printk("%s""level: %d\n", pfx, proc->level);
174 	if (proc->validation_bits & CPER_PROC_VALID_VERSION)
175 		printk("%s""version_info: 0x%016llx\n", pfx, proc->cpu_version);
176 	if (proc->validation_bits & CPER_PROC_VALID_ID)
177 		printk("%s""processor_id: 0x%016llx\n", pfx, proc->proc_id);
178 	if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS)
179 		printk("%s""target_address: 0x%016llx\n",
180 		       pfx, proc->target_addr);
181 	if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID)
182 		printk("%s""requestor_id: 0x%016llx\n",
183 		       pfx, proc->requestor_id);
184 	if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID)
185 		printk("%s""responder_id: 0x%016llx\n",
186 		       pfx, proc->responder_id);
187 	if (proc->validation_bits & CPER_PROC_VALID_IP)
188 		printk("%s""IP: 0x%016llx\n", pfx, proc->ip);
189 }
190 
/* Memory error type names, indexed by the numeric error type. */
static const char * const mem_err_type_strs[] = {
	[0]  = "unknown",
	[1]  = "no error",
	[2]  = "single-bit ECC",
	[3]  = "multi-bit ECC",
	[4]  = "single-symbol chipkill ECC",
	[5]  = "multi-symbol chipkill ECC",
	[6]  = "master abort",
	[7]  = "target abort",
	[8]  = "parity error",
	[9]  = "watchdog timeout",
	[10] = "invalid address",
	[11] = "mirror Broken",
	[12] = "memory sparing",
	[13] = "scrub corrected error",
	[14] = "scrub uncorrected error",
	[15] = "physical memory map-out event",
};

/*
 * cper_mem_err_type_str - map a memory error type to its name
 * @etype: numeric memory error type
 *
 * Returns the matching entry of mem_err_type_strs, or "unknown" when
 * @etype lies outside the table.
 */
const char *cper_mem_err_type_str(unsigned int etype)
{
	if (etype >= ARRAY_SIZE(mem_err_type_strs))
		return "unknown";

	return mem_err_type_strs[etype];
}
EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
216 
217 static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
218 {
219 	u32 len, n;
220 
221 	if (!msg)
222 		return 0;
223 
224 	n = 0;
225 	len = CPER_REC_LEN - 1;
226 	if (mem->validation_bits & CPER_MEM_VALID_NODE)
227 		n += scnprintf(msg + n, len - n, "node: %d ", mem->node);
228 	if (mem->validation_bits & CPER_MEM_VALID_CARD)
229 		n += scnprintf(msg + n, len - n, "card: %d ", mem->card);
230 	if (mem->validation_bits & CPER_MEM_VALID_MODULE)
231 		n += scnprintf(msg + n, len - n, "module: %d ", mem->module);
232 	if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
233 		n += scnprintf(msg + n, len - n, "rank: %d ", mem->rank);
234 	if (mem->validation_bits & CPER_MEM_VALID_BANK)
235 		n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
236 	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
237 		n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
238 	if (mem->validation_bits & CPER_MEM_VALID_ROW)
239 		n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
240 	if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
241 		n += scnprintf(msg + n, len - n, "column: %d ", mem->column);
242 	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
243 		n += scnprintf(msg + n, len - n, "bit_position: %d ",
244 			       mem->bit_pos);
245 	if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
246 		n += scnprintf(msg + n, len - n, "requestor_id: 0x%016llx ",
247 			       mem->requestor_id);
248 	if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
249 		n += scnprintf(msg + n, len - n, "responder_id: 0x%016llx ",
250 			       mem->responder_id);
251 	if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
252 		scnprintf(msg + n, len - n, "target_id: 0x%016llx ",
253 			  mem->target_id);
254 
255 	msg[n] = '\0';
256 	return n;
257 }
258 
259 static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
260 {
261 	u32 len, n;
262 	const char *bank = NULL, *device = NULL;
263 
264 	if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE))
265 		return 0;
266 
267 	n = 0;
268 	len = CPER_REC_LEN - 1;
269 	dmi_memdev_name(mem->mem_dev_handle, &bank, &device);
270 	if (bank && device)
271 		n = snprintf(msg, len, "DIMM location: %s %s ", bank, device);
272 	else
273 		n = snprintf(msg, len,
274 			     "DIMM location: not present. DMI handle: 0x%.4x ",
275 			     mem->mem_dev_handle);
276 
277 	msg[n] = '\0';
278 	return n;
279 }
280 
281 void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
282 		       struct cper_mem_err_compact *cmem)
283 {
284 	cmem->validation_bits = mem->validation_bits;
285 	cmem->node = mem->node;
286 	cmem->card = mem->card;
287 	cmem->module = mem->module;
288 	cmem->bank = mem->bank;
289 	cmem->device = mem->device;
290 	cmem->row = mem->row;
291 	cmem->column = mem->column;
292 	cmem->bit_pos = mem->bit_pos;
293 	cmem->requestor_id = mem->requestor_id;
294 	cmem->responder_id = mem->responder_id;
295 	cmem->target_id = mem->target_id;
296 	cmem->rank = mem->rank;
297 	cmem->mem_array_handle = mem->mem_array_handle;
298 	cmem->mem_dev_handle = mem->mem_dev_handle;
299 }
300 
301 const char *cper_mem_err_unpack(struct trace_seq *p,
302 				struct cper_mem_err_compact *cmem)
303 {
304 	const char *ret = trace_seq_buffer_ptr(p);
305 
306 	if (cper_mem_err_location(cmem, rcd_decode_str))
307 		trace_seq_printf(p, "%s", rcd_decode_str);
308 	if (cper_dimm_err_location(cmem, rcd_decode_str))
309 		trace_seq_printf(p, "%s", rcd_decode_str);
310 	trace_seq_putc(p, '\0');
311 
312 	return ret;
313 }
314 
315 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem,
316 	int len)
317 {
318 	struct cper_mem_err_compact cmem;
319 
320 	/* Don't trust UEFI 2.1/2.2 structure with bad validation bits */
321 	if (len == sizeof(struct cper_sec_mem_err_old) &&
322 	    (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) {
323 		pr_err(FW_WARN "valid bits set for fields beyond structure\n");
324 		return;
325 	}
326 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
327 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
328 	if (mem->validation_bits & CPER_MEM_VALID_PA)
329 		printk("%s""physical_address: 0x%016llx\n",
330 		       pfx, mem->physical_addr);
331 	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
332 		printk("%s""physical_address_mask: 0x%016llx\n",
333 		       pfx, mem->physical_addr_mask);
334 	cper_mem_err_pack(mem, &cmem);
335 	if (cper_mem_err_location(&cmem, rcd_decode_str))
336 		printk("%s%s\n", pfx, rcd_decode_str);
337 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
338 		u8 etype = mem->error_type;
339 		printk("%s""error_type: %d, %s\n", pfx, etype,
340 		       cper_mem_err_type_str(etype));
341 	}
342 	if (cper_dimm_err_location(&cmem, rcd_decode_str))
343 		printk("%s%s\n", pfx, rcd_decode_str);
344 }
345 
/* PCIe port type names, indexed by the section's port_type field. */
static const char * const pcie_port_type_strs[] = {
	[0]  = "PCIe end point",
	[1]  = "legacy PCI end point",
	[2]  = "unknown",
	[3]  = "unknown",
	[4]  = "root port",
	[5]  = "upstream switch port",
	[6]  = "downstream switch port",
	[7]  = "PCIe to PCI/PCI-X bridge",
	[8]  = "PCI/PCI-X to PCIe bridge",
	[9]  = "root complex integrated endpoint device",
	[10] = "root complex event collector",
};
359 
360 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
361 			    const struct acpi_hest_generic_data *gdata)
362 {
363 	if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
364 		printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
365 		       pcie->port_type < ARRAY_SIZE(pcie_port_type_strs) ?
366 		       pcie_port_type_strs[pcie->port_type] : "unknown");
367 	if (pcie->validation_bits & CPER_PCIE_VALID_VERSION)
368 		printk("%s""version: %d.%d\n", pfx,
369 		       pcie->version.major, pcie->version.minor);
370 	if (pcie->validation_bits & CPER_PCIE_VALID_COMMAND_STATUS)
371 		printk("%s""command: 0x%04x, status: 0x%04x\n", pfx,
372 		       pcie->command, pcie->status);
373 	if (pcie->validation_bits & CPER_PCIE_VALID_DEVICE_ID) {
374 		const __u8 *p;
375 		printk("%s""device_id: %04x:%02x:%02x.%x\n", pfx,
376 		       pcie->device_id.segment, pcie->device_id.bus,
377 		       pcie->device_id.device, pcie->device_id.function);
378 		printk("%s""slot: %d\n", pfx,
379 		       pcie->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
380 		printk("%s""secondary_bus: 0x%02x\n", pfx,
381 		       pcie->device_id.secondary_bus);
382 		printk("%s""vendor_id: 0x%04x, device_id: 0x%04x\n", pfx,
383 		       pcie->device_id.vendor_id, pcie->device_id.device_id);
384 		p = pcie->device_id.class_code;
385 		printk("%s""class_code: %02x%02x%02x\n", pfx, p[0], p[1], p[2]);
386 	}
387 	if (pcie->validation_bits & CPER_PCIE_VALID_SERIAL_NUMBER)
388 		printk("%s""serial number: 0x%04x, 0x%04x\n", pfx,
389 		       pcie->serial_number.lower, pcie->serial_number.upper);
390 	if (pcie->validation_bits & CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS)
391 		printk(
392 	"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
393 	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
394 }
395 
396 static void cper_print_tstamp(const char *pfx,
397 				   struct acpi_hest_generic_data_v300 *gdata)
398 {
399 	__u8 hour, min, sec, day, mon, year, century, *timestamp;
400 
401 	if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
402 		timestamp = (__u8 *)&(gdata->time_stamp);
403 		sec       = bcd2bin(timestamp[0]);
404 		min       = bcd2bin(timestamp[1]);
405 		hour      = bcd2bin(timestamp[2]);
406 		day       = bcd2bin(timestamp[4]);
407 		mon       = bcd2bin(timestamp[5]);
408 		year      = bcd2bin(timestamp[6]);
409 		century   = bcd2bin(timestamp[7]);
410 
411 		printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx,
412 		       (timestamp[3] & 0x1 ? "precise " : "imprecise "),
413 		       century, year, mon, day, hour, min, sec);
414 	}
415 }
416 
417 static void
418 cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
419 			   int sec_no)
420 {
421 	guid_t *sec_type = (guid_t *)gdata->section_type;
422 	__u16 severity;
423 	char newpfx[64];
424 
425 	if (acpi_hest_get_version(gdata) >= 3)
426 		cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata);
427 
428 	severity = gdata->error_severity;
429 	printk("%s""Error %d, type: %s\n", pfx, sec_no,
430 	       cper_severity_str(severity));
431 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
432 		printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id);
433 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
434 		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
435 
436 	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
437 	if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
438 		struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
439 
440 		printk("%s""section_type: general processor error\n", newpfx);
441 		if (gdata->error_data_length >= sizeof(*proc_err))
442 			cper_print_proc_generic(newpfx, proc_err);
443 		else
444 			goto err_section_too_small;
445 	} else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
446 		struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
447 
448 		printk("%s""section_type: memory error\n", newpfx);
449 		if (gdata->error_data_length >=
450 		    sizeof(struct cper_sec_mem_err_old))
451 			cper_print_mem(newpfx, mem_err,
452 				       gdata->error_data_length);
453 		else
454 			goto err_section_too_small;
455 	} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
456 		struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata);
457 
458 		printk("%s""section_type: PCIe error\n", newpfx);
459 		if (gdata->error_data_length >= sizeof(*pcie))
460 			cper_print_pcie(newpfx, pcie, gdata);
461 		else
462 			goto err_section_too_small;
463 #if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
464 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_ARM)) {
465 		struct cper_sec_proc_arm *arm_err = acpi_hest_get_payload(gdata);
466 
467 		printk("%ssection_type: ARM processor error\n", newpfx);
468 		if (gdata->error_data_length >= sizeof(*arm_err))
469 			cper_print_proc_arm(newpfx, arm_err);
470 		else
471 			goto err_section_too_small;
472 #endif
473 	} else {
474 		const void *err = acpi_hest_get_payload(gdata);
475 
476 		printk("%ssection type: unknown, %pUl\n", newpfx, sec_type);
477 		printk("%ssection length: %#x\n", newpfx,
478 		       gdata->error_data_length);
479 		print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err,
480 			       gdata->error_data_length, true);
481 	}
482 
483 	return;
484 
485 err_section_too_small:
486 	pr_err(FW_WARN "error section length is too small\n");
487 }
488 
489 void cper_estatus_print(const char *pfx,
490 			const struct acpi_hest_generic_status *estatus)
491 {
492 	struct acpi_hest_generic_data *gdata;
493 	int sec_no = 0;
494 	char newpfx[64];
495 	__u16 severity;
496 
497 	severity = estatus->error_severity;
498 	if (severity == CPER_SEV_CORRECTED)
499 		printk("%s%s\n", pfx,
500 		       "It has been corrected by h/w "
501 		       "and requires no further action");
502 	printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
503 	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
504 
505 	apei_estatus_for_each_section(estatus, gdata) {
506 		cper_estatus_print_section(newpfx, gdata, sec_no);
507 		sec_no++;
508 	}
509 }
510 EXPORT_SYMBOL_GPL(cper_estatus_print);
511 
512 int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus)
513 {
514 	if (estatus->data_length &&
515 	    estatus->data_length < sizeof(struct acpi_hest_generic_data))
516 		return -EINVAL;
517 	if (estatus->raw_data_length &&
518 	    estatus->raw_data_offset < sizeof(*estatus) + estatus->data_length)
519 		return -EINVAL;
520 
521 	return 0;
522 }
523 EXPORT_SYMBOL_GPL(cper_estatus_check_header);
524 
525 int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
526 {
527 	struct acpi_hest_generic_data *gdata;
528 	unsigned int data_len, gedata_len;
529 	int rc;
530 
531 	rc = cper_estatus_check_header(estatus);
532 	if (rc)
533 		return rc;
534 	data_len = estatus->data_length;
535 
536 	apei_estatus_for_each_section(estatus, gdata) {
537 		gedata_len = acpi_hest_get_error_length(gdata);
538 		if (gedata_len > data_len - acpi_hest_get_size(gdata))
539 			return -EINVAL;
540 		data_len -= acpi_hest_get_record_size(gdata);
541 	}
542 	if (data_len)
543 		return -EINVAL;
544 
545 	return 0;
546 }
547 EXPORT_SYMBOL_GPL(cper_estatus_check);
548