xref: /openbmc/linux/drivers/acpi/apei/ghes.c (revision efe4a1ac)
/*
 * APEI Generic Hardware Error Source support
 *
 * Generic Hardware Error Source provides a way to report platform
 * hardware errors (such as those from the chipset). It works in the
 * so-called "Firmware First" mode: hardware errors are reported to
 * the firmware first and then forwarded to Linux by the firmware.
 * This way, the firmware can check non-standard hardware error
 * registers or non-standard hardware links to produce more hardware
 * error information for Linux.
 *
 * For more information about Generic Hardware Error Source, please
 * refer to ACPI Specification version 4.0, section 17.3.2.6
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
#include <linux/cper.h>
#include <linux/kdebug.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/genalloc.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>

#include <acpi/ghes.h>
#include <acpi/apei.h>
#include <asm/tlbflush.h>

#include "apei-internal.h"

#define GHES_PFX	"GHES: "

#define GHES_ESTATUS_MAX_SIZE		65536
#define GHES_ESOURCE_PREALLOC_MAX_SIZE	65536

#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3

/* This is just an estimate for the memory pool allocation */
#define GHES_ESTATUS_CACHE_AVG_SIZE	512

#define GHES_ESTATUS_CACHES_SIZE	4

#define GHES_ESTATUS_IN_CACHE_MAX_NSEC	10000000000ULL
/* Prevent too many caches from being allocated because of RCU */
#define GHES_ESTATUS_CACHE_ALLOCED_MAX	(GHES_ESTATUS_CACHES_SIZE * 3 / 2)

#define GHES_ESTATUS_CACHE_LEN(estatus_len)			\
	(sizeof(struct ghes_estatus_cache) + (estatus_len))
#define GHES_ESTATUS_FROM_CACHE(estatus_cache)			\
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_cache *)(estatus_cache) + 1))

#define GHES_ESTATUS_NODE_LEN(estatus_len)			\
	(sizeof(struct ghes_estatus_node) + (estatus_len))
#define GHES_ESTATUS_FROM_NODE(estatus_node)			\
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_node *)(estatus_node) + 1))
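
/*
 * Editor's note (not in the original source): both macro pairs above
 * assume the same single-allocation layout, with the raw estatus data
 * placed immediately after its bookkeeping header:
 *
 *	+-----------------------------+ <- allocation start
 *	| struct ghes_estatus_cache   |    (or struct ghes_estatus_node)
 *	+-----------------------------+ <- GHES_ESTATUS_FROM_CACHE()/_NODE()
 *	| struct acpi_hest_generic_   |
 *	| status + payload            |    (estatus_len bytes)
 *	+-----------------------------+
 *
 * The "+ 1" steps over exactly one header structure, so the cast yields
 * a pointer to the trailing estatus area.
 */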

/*
 * This driver isn't really modular; however, for the time being,
 * continuing to use module_param is the easiest way to remain
 * compatible with existing boot arg use cases.
 */
bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

/*
 * All error sources notified via SCI share one notifier function, so
 * they need to be linked and checked one by one.  The same applies to
 * NMI.
 *
 * RCU is used for these lists, so ghes_list_mutex is only needed for
 * modifying the lists, not for traversing them.
 */
static LIST_HEAD(ghes_sci);
static DEFINE_MUTEX(ghes_list_mutex);

/*
 * Because the memory area used to transfer hardware error information
 * from BIOS to Linux can be determined only in an NMI, IRQ or timer
 * handler, and the generic ioremap cannot be used in atomic context,
 * a special atomic version of ioremap is implemented here.
 */
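
/*
 * A minimal usage sketch (editor's illustration; the real call site is
 * ghes_copy_tofrom_phys() below).  In IRQ/process context the sequence is:
 *
 *	unsigned long flags;
 *	void __iomem *vaddr;
 *
 *	spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
 *	vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
 *	memcpy_fromio(buffer, vaddr + (paddr & ~PAGE_MASK), trunk);
 *	ghes_iounmap_irq(vaddr);
 *	spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
 *
 * The NMI variant is identical except that it takes ghes_ioremap_lock_nmi
 * (a raw spinlock) and uses the _nmi map/unmap helpers.  Because only one
 * page is mapped per pass, a single copy never crosses a page boundary.
 */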

/*
 * Two virtual pages are used, one for IRQ/process context and,
 * optionally, one for NMI context.
 */
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#define GHES_IOREMAP_PAGES           2
#else
#define GHES_IOREMAP_PAGES           1
#endif
#define GHES_IOREMAP_IRQ_PAGE(base)	(base)
#define GHES_IOREMAP_NMI_PAGE(base)	((base) + PAGE_SIZE)

/* virtual memory area for atomic ioremap */
static struct vm_struct *ghes_ioremap_area;
/*
 * These two spinlocks prevent the atomic-ioremap virtual memory areas
 * from being mapped simultaneously.
 */
static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);

static struct gen_pool *ghes_estatus_pool;
static unsigned long ghes_estatus_pool_size_request;

static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
static atomic_t ghes_estatus_cache_alloced;

static int ghes_ioremap_init(void)
{
	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
		VM_IOREMAP, VMALLOC_START, VMALLOC_END);
	if (!ghes_ioremap_area) {
		pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n");
		return -ENOMEM;
	}

	return 0;
}

static void ghes_ioremap_exit(void)
{
	free_vm_area(ghes_ioremap_area);
}

static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
{
	unsigned long vaddr;

	vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
			   pfn << PAGE_SHIFT, PAGE_KERNEL);

	return (void __iomem *)vaddr;
}

static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
{
	unsigned long vaddr, paddr;
	pgprot_t prot;

	vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);

	paddr = pfn << PAGE_SHIFT;
	prot = arch_apei_get_mem_attribute(paddr);

	ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot);

	return (void __iomem *)vaddr;
}

static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
{
	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
	void *base = ghes_ioremap_area->addr;

	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
	arch_apei_flush_tlb_one(vaddr);
}

static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
{
	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
	void *base = ghes_ioremap_area->addr;

	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
	arch_apei_flush_tlb_one(vaddr);
}

static int ghes_estatus_pool_init(void)
{
	ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
	if (!ghes_estatus_pool)
		return -ENOMEM;
	return 0;
}

static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
					      struct gen_pool_chunk *chunk,
					      void *data)
{
	free_page(chunk->start_addr);
}

static void ghes_estatus_pool_exit(void)
{
	gen_pool_for_each_chunk(ghes_estatus_pool,
				ghes_estatus_pool_free_chunk_page, NULL);
	gen_pool_destroy(ghes_estatus_pool);
}

static int ghes_estatus_pool_expand(unsigned long len)
{
	unsigned long i, pages, size, addr;
	int ret;

	ghes_estatus_pool_size_request += PAGE_ALIGN(len);
	size = gen_pool_size(ghes_estatus_pool);
	if (size >= ghes_estatus_pool_size_request)
		return 0;
	pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
	for (i = 0; i < pages; i++) {
		addr = __get_free_page(GFP_KERNEL);
		if (!addr)
			return -ENOMEM;
		ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
		if (ret)
			return ret;
	}

	return 0;
}

static struct ghes *ghes_new(struct acpi_hest_generic *generic)
{
	struct ghes *ghes;
	unsigned int error_block_length;
	int rc;

	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
	if (!ghes)
		return ERR_PTR(-ENOMEM);
	ghes->generic = generic;
	rc = apei_map_generic_address(&generic->error_status_address);
	if (rc)
		goto err_free;
	error_block_length = generic->error_block_length;
	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
		pr_warning(FW_WARN GHES_PFX
			   "Error status block length is too long: %u for "
			   "generic hardware error source: %d.\n",
			   error_block_length, generic->header.source_id);
		error_block_length = GHES_ESTATUS_MAX_SIZE;
	}
	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
	if (!ghes->estatus) {
		rc = -ENOMEM;
		goto err_unmap;
	}

	return ghes;

err_unmap:
	apei_unmap_generic_address(&generic->error_status_address);
err_free:
	kfree(ghes);
	return ERR_PTR(rc);
}

static void ghes_fini(struct ghes *ghes)
{
	kfree(ghes->estatus);
	apei_unmap_generic_address(&ghes->generic->error_status_address);
}

static inline int ghes_severity(int severity)
{
	switch (severity) {
	case CPER_SEV_INFORMATIONAL:
		return GHES_SEV_NO;
	case CPER_SEV_CORRECTED:
		return GHES_SEV_CORRECTED;
	case CPER_SEV_RECOVERABLE:
		return GHES_SEV_RECOVERABLE;
	case CPER_SEV_FATAL:
		return GHES_SEV_PANIC;
	default:
		/* Unknown, go panic */
		return GHES_SEV_PANIC;
	}
}

static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
				  int from_phys)
{
	void __iomem *vaddr;
	unsigned long flags = 0;
	int in_nmi = in_nmi();
	u64 offset;
	u32 trunk;

	while (len > 0) {
		offset = paddr - (paddr & PAGE_MASK);
		if (in_nmi) {
			raw_spin_lock(&ghes_ioremap_lock_nmi);
			vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
		} else {
			spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
			vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
		}
		trunk = PAGE_SIZE - offset;
		trunk = min(trunk, len);
		if (from_phys)
			memcpy_fromio(buffer, vaddr + offset, trunk);
		else
			memcpy_toio(vaddr + offset, buffer, trunk);
		len -= trunk;
		paddr += trunk;
		buffer += trunk;
		if (in_nmi) {
			ghes_iounmap_nmi(vaddr);
			raw_spin_unlock(&ghes_ioremap_lock_nmi);
		} else {
			ghes_iounmap_irq(vaddr);
			spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
		}
	}
}

static int ghes_read_estatus(struct ghes *ghes, int silent)
{
	struct acpi_hest_generic *g = ghes->generic;
	u64 buf_paddr;
	u32 len;
	int rc;

	rc = apei_read(&buf_paddr, &g->error_status_address);
	if (rc) {
		if (!silent && printk_ratelimit())
			pr_warning(FW_WARN GHES_PFX
"Failed to read error status block address for hardware error source: %d.\n",
				   g->header.source_id);
		return -EIO;
	}
	if (!buf_paddr)
		return -ENOENT;

	ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
			      sizeof(*ghes->estatus), 1);
	if (!ghes->estatus->block_status)
		return -ENOENT;

	ghes->buffer_paddr = buf_paddr;
	ghes->flags |= GHES_TO_CLEAR;

	rc = -EIO;
	len = cper_estatus_len(ghes->estatus);
	if (len < sizeof(*ghes->estatus))
		goto err_read_block;
	if (len > ghes->generic->error_block_length)
		goto err_read_block;
	if (cper_estatus_check_header(ghes->estatus))
		goto err_read_block;
	ghes_copy_tofrom_phys(ghes->estatus + 1,
			      buf_paddr + sizeof(*ghes->estatus),
			      len - sizeof(*ghes->estatus), 1);
	if (cper_estatus_check(ghes->estatus))
		goto err_read_block;
	rc = 0;

err_read_block:
	if (rc && !silent && printk_ratelimit())
		pr_warning(FW_WARN GHES_PFX
			   "Failed to read error status block!\n");
	return rc;
}

static void ghes_clear_estatus(struct ghes *ghes)
{
	ghes->estatus->block_status = 0;
	if (!(ghes->flags & GHES_TO_CLEAR))
		return;
	ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
			      sizeof(ghes->estatus->block_status), 0);
	ghes->flags &= ~GHES_TO_CLEAR;
}

static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
{
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
	unsigned long pfn;
	int flags = -1;
	int sec_sev = ghes_severity(gdata->error_severity);
	struct cper_sec_mem_err *mem_err;
	mem_err = (struct cper_sec_mem_err *)(gdata + 1);

	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
		return;

	pfn = mem_err->physical_addr >> PAGE_SHIFT;
	if (!pfn_valid(pfn)) {
		pr_warn_ratelimited(FW_WARN GHES_PFX
		"Invalid address in generic error data: %#llx\n",
		mem_err->physical_addr);
		return;
	}

	/* Only the following two error cases can be handled properly for now */
	if (sec_sev == GHES_SEV_CORRECTED &&
	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
		flags = MF_SOFT_OFFLINE;
	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
		flags = 0;

	if (flags != -1)
		memory_failure_queue(pfn, 0, flags);
#endif
}
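
/*
 * Editor's summary of the decision in ghes_handle_memory_failure() above:
 *
 *	overall sev	section sev + flags			action
 *	-----------	------------------------------------	--------------------
 *	any		CORRECTED + ERROR_THRESHOLD_EXCEEDED	MF_SOFT_OFFLINE
 *	RECOVERABLE	RECOVERABLE				flags = 0 (normal
 *								memory_failure path)
 *	other		other					ignored (flags = -1)
 */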

static void ghes_do_proc(struct ghes *ghes,
			 const struct acpi_hest_generic_status *estatus)
{
	int sev, sec_sev;
	struct acpi_hest_generic_data *gdata;

	sev = ghes_severity(estatus->error_severity);
	apei_estatus_for_each_section(estatus, gdata) {
		sec_sev = ghes_severity(gdata->error_severity);
		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
				 CPER_SEC_PLATFORM_MEM)) {
			struct cper_sec_mem_err *mem_err;
			mem_err = (struct cper_sec_mem_err *)(gdata+1);
			ghes_edac_report_mem_error(ghes, sev, mem_err);

			arch_apei_report_mem_error(sev, mem_err);
			ghes_handle_memory_failure(gdata, sev);
		}
#ifdef CONFIG_ACPI_APEI_PCIEAER
		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
				      CPER_SEC_PCIE)) {
			struct cper_sec_pcie *pcie_err;
			pcie_err = (struct cper_sec_pcie *)(gdata+1);
			if (sev == GHES_SEV_RECOVERABLE &&
			    sec_sev == GHES_SEV_RECOVERABLE &&
			    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
			    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
				unsigned int devfn;
				int aer_severity;

				devfn = PCI_DEVFN(pcie_err->device_id.device,
						  pcie_err->device_id.function);
				aer_severity = cper_severity_to_aer(gdata->error_severity);

				/*
				 * If firmware reset the component to contain
				 * the error, we must reinitialize it before
				 * use, so treat it as a fatal AER error.
				 */
				if (gdata->flags & CPER_SEC_RESET)
					aer_severity = AER_FATAL;

				aer_recover_queue(pcie_err->device_id.segment,
						  pcie_err->device_id.bus,
						  devfn, aer_severity,
						  (struct aer_capability_regs *)
						  pcie_err->aer_info);
			}

		}
#endif
	}
}

static void __ghes_print_estatus(const char *pfx,
				 const struct acpi_hest_generic *generic,
				 const struct acpi_hest_generic_status *estatus)
{
	static atomic_t seqno;
	unsigned int curr_seqno;
	char pfx_seq[64];

	if (pfx == NULL) {
		if (ghes_severity(estatus->error_severity) <=
		    GHES_SEV_CORRECTED)
			pfx = KERN_WARNING;
		else
			pfx = KERN_ERR;
	}
	curr_seqno = atomic_inc_return(&seqno);
	snprintf(pfx_seq, sizeof(pfx_seq), "%s{%u}" HW_ERR, pfx, curr_seqno);
	printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
	       pfx_seq, generic->header.source_id);
	cper_estatus_print(pfx_seq, estatus);
}

static int ghes_print_estatus(const char *pfx,
			      const struct acpi_hest_generic *generic,
			      const struct acpi_hest_generic_status *estatus)
{
	/* Not more than 2 messages every 5 seconds */
	static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
	struct ratelimit_state *ratelimit;

	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
		ratelimit = &ratelimit_corrected;
	else
		ratelimit = &ratelimit_uncorrected;
	if (__ratelimit(ratelimit)) {
		__ghes_print_estatus(pfx, generic, estatus);
		return 1;
	}
	return 0;
}

/*
 * Throttle GHES error status reporting so that more kinds of errors
 * are reported, instead of just the most frequently occurring ones.
 */
static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus)
{
	u32 len;
	int i, cached = 0;
	unsigned long long now;
	struct ghes_estatus_cache *cache;
	struct acpi_hest_generic_status *cache_estatus;

	len = cper_estatus_len(estatus);
	rcu_read_lock();
	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
		cache = rcu_dereference(ghes_estatus_caches[i]);
		if (cache == NULL)
			continue;
		if (len != cache->estatus_len)
			continue;
		cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
		if (memcmp(estatus, cache_estatus, len))
			continue;
		atomic_inc(&cache->count);
		now = sched_clock();
		if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC)
			cached = 1;
		break;
	}
	rcu_read_unlock();
	return cached;
}
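
/*
 * Editor's note on how the throttle is used (see ghes_proc() and
 * ghes_proc_in_irq() below): an estatus that byte-for-byte matches a cache
 * entry younger than GHES_ESTATUS_IN_CACHE_MAX_NSEC (10 s) is treated as
 * already reported and is not printed again:
 *
 *	if (!ghes_estatus_cached(estatus)) {
 *		if (ghes_print_estatus(NULL, generic, estatus))
 *			ghes_estatus_cache_add(generic, estatus);
 *	}
 */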

static struct ghes_estatus_cache *ghes_estatus_cache_alloc(
	struct acpi_hest_generic *generic,
	struct acpi_hest_generic_status *estatus)
{
	int alloced;
	u32 len, cache_len;
	struct ghes_estatus_cache *cache;
	struct acpi_hest_generic_status *cache_estatus;

	alloced = atomic_add_return(1, &ghes_estatus_cache_alloced);
	if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) {
		atomic_dec(&ghes_estatus_cache_alloced);
		return NULL;
	}
	len = cper_estatus_len(estatus);
	cache_len = GHES_ESTATUS_CACHE_LEN(len);
	cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len);
	if (!cache) {
		atomic_dec(&ghes_estatus_cache_alloced);
		return NULL;
	}
	cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
	memcpy(cache_estatus, estatus, len);
	cache->estatus_len = len;
	atomic_set(&cache->count, 0);
	cache->generic = generic;
	cache->time_in = sched_clock();
	return cache;
}

static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache)
{
	u32 len;

	len = cper_estatus_len(GHES_ESTATUS_FROM_CACHE(cache));
	len = GHES_ESTATUS_CACHE_LEN(len);
	gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len);
	atomic_dec(&ghes_estatus_cache_alloced);
}

static void ghes_estatus_cache_rcu_free(struct rcu_head *head)
{
	struct ghes_estatus_cache *cache;

	cache = container_of(head, struct ghes_estatus_cache, rcu);
	ghes_estatus_cache_free(cache);
}

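/*
 * Editor's note on slot selection below: the eviction score is the average
 * period between hits, duration / (count + 1).  Worked example (numbers
 * hypothetical): an entry hit 9 times over 9 ms scores 0.9 ms, while one
 * hit once over 4 ms scores 2 ms; the latter is colder and is evicted
 * first.  An empty slot, or an entry older than
 * GHES_ESTATUS_IN_CACHE_MAX_NSEC, is used/evicted immediately.
 */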
static void ghes_estatus_cache_add(
	struct acpi_hest_generic *generic,
	struct acpi_hest_generic_status *estatus)
{
	int i, slot = -1, count;
	unsigned long long now, duration, period, max_period = 0;
	struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache;

	new_cache = ghes_estatus_cache_alloc(generic, estatus);
	if (new_cache == NULL)
		return;
	rcu_read_lock();
	now = sched_clock();
	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
		cache = rcu_dereference(ghes_estatus_caches[i]);
		if (cache == NULL) {
			slot = i;
			slot_cache = NULL;
			break;
		}
		duration = now - cache->time_in;
		if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) {
			slot = i;
			slot_cache = cache;
			break;
		}
		count = atomic_read(&cache->count);
		period = duration;
		do_div(period, (count + 1));
		if (period > max_period) {
			max_period = period;
			slot = i;
			slot_cache = cache;
		}
	}
	/* new_cache must be put into array after its contents are written */
	smp_wmb();
	if (slot != -1 && cmpxchg(ghes_estatus_caches + slot,
				  slot_cache, new_cache) == slot_cache) {
		if (slot_cache)
			call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free);
	} else
		ghes_estatus_cache_free(new_cache);
	rcu_read_unlock();
}

static int ghes_proc(struct ghes *ghes)
{
	int rc;

	rc = ghes_read_estatus(ghes, 0);
	if (rc)
		goto out;
	if (!ghes_estatus_cached(ghes->estatus)) {
		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
	}
	ghes_do_proc(ghes, ghes->estatus);
out:
	ghes_clear_estatus(ghes);
	return rc;
}

static void ghes_add_timer(struct ghes *ghes)
{
	struct acpi_hest_generic *g = ghes->generic;
	unsigned long expire;

	if (!g->notify.poll_interval) {
		pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n",
			   g->header.source_id);
		return;
	}
	expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
	ghes->timer.expires = round_jiffies_relative(expire);
	add_timer(&ghes->timer);
}

static void ghes_poll_func(unsigned long data)
{
	struct ghes *ghes = (void *)data;

	ghes_proc(ghes);
	if (!(ghes->flags & GHES_EXITING))
		ghes_add_timer(ghes);
}

static irqreturn_t ghes_irq_func(int irq, void *data)
{
	struct ghes *ghes = data;
	int rc;

	rc = ghes_proc(ghes);
	if (rc)
		return IRQ_NONE;

	return IRQ_HANDLED;
}

static int ghes_notify_sci(struct notifier_block *this,
				  unsigned long event, void *data)
{
	struct ghes *ghes;
	int ret = NOTIFY_DONE;

	rcu_read_lock();
	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
		if (!ghes_proc(ghes))
			ret = NOTIFY_OK;
	}
	rcu_read_unlock();

	return ret;
}

static struct notifier_block ghes_notifier_sci = {
	.notifier_call = ghes_notify_sci,
};

#ifdef CONFIG_HAVE_ACPI_APEI_NMI
/*
 * printk() is not safe in NMI context.  So in the NMI handler, we
 * allocate the required memory from the lock-less memory allocator
 * (ghes_estatus_pool), save the estatus into it, put it on the
 * lock-less list (ghes_estatus_llist), and then delay the printk into
 * IRQ context via irq_work (ghes_proc_irq_work).
 * ghes_estatus_pool_size_request records the pool size required by
 * all NMI error sources.
 */
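
/*
 * Editor's summary of the resulting flow (all names are the real symbols
 * in this file):
 *
 *	NMI: ghes_notify_nmi()
 *	       -> ghes_read_estatus()	read estatus from firmware
 *	       -> __process_error()	copy into a ghes_estatus_pool node
 *	       -> llist_add(&estatus_node->llnode, &ghes_estatus_llist)
 *	       -> irq_work_queue(&ghes_proc_irq_work)
 *	IRQ: ghes_proc_in_irq()
 *	       -> llist_del_all() + llist_reverse_order()
 *	       -> ghes_do_proc() / ghes_print_estatus() per node
 *	       -> gen_pool_free() the node
 */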
static struct llist_head ghes_estatus_llist;
static struct irq_work ghes_proc_irq_work;

/*
 * An NMI may be triggered on any CPU, so ghes_in_nmi is used to ensure
 * that there is only one concurrent reader.
 */
static atomic_t ghes_in_nmi = ATOMIC_INIT(0);

static LIST_HEAD(ghes_nmi);

static int ghes_panic_timeout	__read_mostly = 30;

static void ghes_proc_in_irq(struct irq_work *irq_work)
{
	struct llist_node *llnode, *next;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u32 len, node_len;

	llnode = llist_del_all(&ghes_estatus_llist);
	/*
	 * Because the estatus entries on the list are in reverse time
	 * order, restore the proper order.
	 */
	llnode = llist_reverse_order(llnode);
	while (llnode) {
		next = llnode->next;
		estatus_node = llist_entry(llnode, struct ghes_estatus_node,
					   llnode);
		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
		len = cper_estatus_len(estatus);
		node_len = GHES_ESTATUS_NODE_LEN(len);
		ghes_do_proc(estatus_node->ghes, estatus);
		if (!ghes_estatus_cached(estatus)) {
			generic = estatus_node->generic;
			if (ghes_print_estatus(NULL, generic, estatus))
				ghes_estatus_cache_add(generic, estatus);
		}
		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
			      node_len);
		llnode = next;
	}
}

static void ghes_print_queued_estatus(void)
{
	struct llist_node *llnode;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u32 len, node_len;

	llnode = llist_del_all(&ghes_estatus_llist);
	/*
	 * Because the estatus entries on the list are in reverse time
	 * order, restore the proper order.
	 */
	llnode = llist_reverse_order(llnode);
	while (llnode) {
		estatus_node = llist_entry(llnode, struct ghes_estatus_node,
					   llnode);
		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
		len = cper_estatus_len(estatus);
		node_len = GHES_ESTATUS_NODE_LEN(len);
		generic = estatus_node->generic;
		ghes_print_estatus(NULL, generic, estatus);
		llnode = llnode->next;
	}
}

/* Save estatus for further processing in IRQ context */
static void __process_error(struct ghes *ghes)
{
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
	u32 len, node_len;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic_status *estatus;

	if (ghes_estatus_cached(ghes->estatus))
		return;

	len = cper_estatus_len(ghes->estatus);
	node_len = GHES_ESTATUS_NODE_LEN(len);

	estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
	if (!estatus_node)
		return;

	estatus_node->ghes = ghes;
	estatus_node->generic = ghes->generic;
	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
	memcpy(estatus, ghes->estatus, len);
	llist_add(&estatus_node->llnode, &ghes_estatus_llist);
#endif
}

static void __ghes_panic(struct ghes *ghes)
{
	oops_begin();
	ghes_print_queued_estatus();
	__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);

	/* reboot to log the error! */
	if (panic_timeout == 0)
		panic_timeout = ghes_panic_timeout;
	panic("Fatal hardware error!");
}

static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
	struct ghes *ghes;
	int sev, ret = NMI_DONE;

	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
		return ret;

	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
		if (ghes_read_estatus(ghes, 1)) {
			ghes_clear_estatus(ghes);
			continue;
		} else {
			ret = NMI_HANDLED;
		}

		sev = ghes_severity(ghes->estatus->error_severity);
		if (sev >= GHES_SEV_PANIC)
			__ghes_panic(ghes);

		if (!(ghes->flags & GHES_TO_CLEAR))
			continue;

		__process_error(ghes);
		ghes_clear_estatus(ghes);
	}

#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
	if (ret == NMI_HANDLED)
		irq_work_queue(&ghes_proc_irq_work);
#endif
	atomic_dec(&ghes_in_nmi);
	return ret;
}

static unsigned long ghes_esource_prealloc_size(
	const struct acpi_hest_generic *generic)
{
	unsigned long block_length, prealloc_records, prealloc_size;

	block_length = min_t(unsigned long, generic->error_block_length,
			     GHES_ESTATUS_MAX_SIZE);
	prealloc_records = max_t(unsigned long,
				 generic->records_to_preallocate, 1);
	prealloc_size = min_t(unsigned long, block_length * prealloc_records,
			      GHES_ESOURCE_PREALLOC_MAX_SIZE);

	return prealloc_size;
}
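
/*
 * Editor's worked example (input values hypothetical): with
 * error_block_length = 4096 and records_to_preallocate = 4, the function
 * above computes block_length = min(4096, 65536) = 4096 and
 * prealloc_records = max(4, 1) = 4, so prealloc_size =
 * min(4096 * 4, 65536) = 16384 bytes; very large requests are clamped to
 * GHES_ESOURCE_PREALLOC_MAX_SIZE (64 KiB).  The result is then
 * page-aligned by ghes_estatus_pool_expand().
 */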

static void ghes_estatus_pool_shrink(unsigned long len)
{
	ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
}

static void ghes_nmi_add(struct ghes *ghes)
{
	unsigned long len;

	len = ghes_esource_prealloc_size(ghes->generic);
	ghes_estatus_pool_expand(len);
	mutex_lock(&ghes_list_mutex);
	if (list_empty(&ghes_nmi))
		register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
	list_add_rcu(&ghes->list, &ghes_nmi);
	mutex_unlock(&ghes_list_mutex);
}

static void ghes_nmi_remove(struct ghes *ghes)
{
	unsigned long len;

	mutex_lock(&ghes_list_mutex);
	list_del_rcu(&ghes->list);
	if (list_empty(&ghes_nmi))
		unregister_nmi_handler(NMI_LOCAL, "ghes");
	mutex_unlock(&ghes_list_mutex);
	/*
	 * To synchronize with the NMI handler, the ghes can be freed
	 * only after the NMI handler has finished.
	 */
	synchronize_rcu();
	len = ghes_esource_prealloc_size(ghes->generic);
	ghes_estatus_pool_shrink(len);
}

static void ghes_nmi_init_cxt(void)
{
	init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
}
#else /* CONFIG_HAVE_ACPI_APEI_NMI */
static inline void ghes_nmi_add(struct ghes *ghes)
{
	pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n",
	       ghes->generic->header.source_id);
	BUG();
}

static inline void ghes_nmi_remove(struct ghes *ghes)
{
	pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n",
	       ghes->generic->header.source_id);
	BUG();
}

static inline void ghes_nmi_init_cxt(void)
{
}
#endif /* CONFIG_HAVE_ACPI_APEI_NMI */

static int ghes_probe(struct platform_device *ghes_dev)
{
	struct acpi_hest_generic *generic;
	struct ghes *ghes = NULL;

	int rc = -EINVAL;

	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
	if (!generic->enabled)
		return -ENODEV;

	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
	case ACPI_HEST_NOTIFY_EXTERNAL:
	case ACPI_HEST_NOTIFY_SCI:
		break;
	case ACPI_HEST_NOTIFY_NMI:
		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
			pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
				generic->header.source_id);
			goto err;
		}
		break;
	case ACPI_HEST_NOTIFY_LOCAL:
		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
			   generic->header.source_id);
		goto err;
	default:
		pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n",
			   generic->notify.type, generic->header.source_id);
		goto err;
	}

	rc = -EIO;
	if (generic->error_block_length <
	    sizeof(struct acpi_hest_generic_status)) {
		pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n",
			   generic->error_block_length,
			   generic->header.source_id);
		goto err;
	}
	ghes = ghes_new(generic);
	if (IS_ERR(ghes)) {
		rc = PTR_ERR(ghes);
		ghes = NULL;
		goto err;
	}

	rc = ghes_edac_register(ghes, &ghes_dev->dev);
	if (rc < 0)
		goto err;

	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
		setup_deferrable_timer(&ghes->timer, ghes_poll_func,
				       (unsigned long)ghes);
		ghes_add_timer(ghes);
		break;
	case ACPI_HEST_NOTIFY_EXTERNAL:
		/* External interrupt vector is GSI */
		rc = acpi_gsi_to_irq(generic->notify.vector, &ghes->irq);
		if (rc) {
			pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err_edac_unreg;
		}
		rc = request_irq(ghes->irq, ghes_irq_func, 0, "GHES IRQ", ghes);
		if (rc) {
			pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err_edac_unreg;
		}
		break;
	case ACPI_HEST_NOTIFY_SCI:
		mutex_lock(&ghes_list_mutex);
		if (list_empty(&ghes_sci))
			register_acpi_hed_notifier(&ghes_notifier_sci);
		list_add_rcu(&ghes->list, &ghes_sci);
		mutex_unlock(&ghes_list_mutex);
		break;
	case ACPI_HEST_NOTIFY_NMI:
		ghes_nmi_add(ghes);
		break;
	default:
		BUG();
	}
	platform_set_drvdata(ghes_dev, ghes);

	return 0;
err_edac_unreg:
	ghes_edac_unregister(ghes);
err:
	if (ghes) {
		ghes_fini(ghes);
		kfree(ghes);
	}
	return rc;
}

static int ghes_remove(struct platform_device *ghes_dev)
{
	struct ghes *ghes;
	struct acpi_hest_generic *generic;

	ghes = platform_get_drvdata(ghes_dev);
	generic = ghes->generic;

	ghes->flags |= GHES_EXITING;
	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
		del_timer_sync(&ghes->timer);
		break;
	case ACPI_HEST_NOTIFY_EXTERNAL:
		free_irq(ghes->irq, ghes);
		break;
	case ACPI_HEST_NOTIFY_SCI:
		mutex_lock(&ghes_list_mutex);
		list_del_rcu(&ghes->list);
		if (list_empty(&ghes_sci))
			unregister_acpi_hed_notifier(&ghes_notifier_sci);
		mutex_unlock(&ghes_list_mutex);
		synchronize_rcu();
		break;
	case ACPI_HEST_NOTIFY_NMI:
		ghes_nmi_remove(ghes);
		break;
	default:
		BUG();
		break;
	}

	ghes_fini(ghes);

	ghes_edac_unregister(ghes);

	kfree(ghes);

	platform_set_drvdata(ghes_dev, NULL);

	return 0;
}

static struct platform_driver ghes_platform_driver = {
	.driver		= {
		.name	= "GHES",
	},
	.probe		= ghes_probe,
	.remove		= ghes_remove,
};

static int __init ghes_init(void)
{
	int rc;

	if (acpi_disabled)
		return -ENODEV;

	if (hest_disable) {
		pr_info(GHES_PFX "HEST is not enabled!\n");
		return -EINVAL;
	}

	if (ghes_disable) {
		pr_info(GHES_PFX "GHES is not enabled!\n");
		return -EINVAL;
	}

	ghes_nmi_init_cxt();

	rc = ghes_ioremap_init();
	if (rc)
		goto err;

	rc = ghes_estatus_pool_init();
	if (rc)
		goto err_ioremap_exit;

	rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
				      GHES_ESTATUS_CACHE_ALLOCED_MAX);
	if (rc)
		goto err_pool_exit;

	rc = platform_driver_register(&ghes_platform_driver);
	if (rc)
		goto err_pool_exit;

	rc = apei_osc_setup();
	if (rc == 0 && osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
	else if (rc == 0 && !osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n");
	else if (rc && osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n");
	else
		pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");

	return 0;
err_pool_exit:
	ghes_estatus_pool_exit();
err_ioremap_exit:
	ghes_ioremap_exit();
err:
	return rc;
}
device_initcall(ghes_init);