xref: /openbmc/linux/drivers/acpi/apei/ghes.c (revision a2cce7a9)
/*
 * APEI Generic Hardware Error Source support
 *
 * Generic Hardware Error Source provides a way to report platform
 * hardware errors (such as those from the chipset). It works in the
 * so-called "Firmware First" mode: hardware errors are reported to
 * the firmware first, then forwarded to Linux by the firmware. This
 * way, the firmware can inspect non-standard hardware error registers
 * or a non-standard hardware link to produce richer error information
 * for Linux.
 *
 * For more information about Generic Hardware Error Source, please
 * refer to ACPI Specification version 4.0, section 17.3.2.6
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
#include <linux/cper.h>
#include <linux/kdebug.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/genalloc.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/nmi.h>

#include <acpi/ghes.h>
#include <acpi/apei.h>
#include <asm/tlbflush.h>

#include "apei-internal.h"

#define GHES_PFX	"GHES: "

#define GHES_ESTATUS_MAX_SIZE		65536
#define GHES_ESOURCE_PREALLOC_MAX_SIZE	65536

#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3

/* This is just an estimation for memory pool allocation */
#define GHES_ESTATUS_CACHE_AVG_SIZE	512

#define GHES_ESTATUS_CACHES_SIZE	4

#define GHES_ESTATUS_IN_CACHE_MAX_NSEC	10000000000ULL
/* Prevent too many caches from being allocated because of RCU */
#define GHES_ESTATUS_CACHE_ALLOCED_MAX	(GHES_ESTATUS_CACHES_SIZE * 3 / 2)

#define GHES_ESTATUS_CACHE_LEN(estatus_len)			\
	(sizeof(struct ghes_estatus_cache) + (estatus_len))
#define GHES_ESTATUS_FROM_CACHE(estatus_cache)			\
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_cache *)(estatus_cache) + 1))

#define GHES_ESTATUS_NODE_LEN(estatus_len)			\
	(sizeof(struct ghes_estatus_node) + (estatus_len))
#define GHES_ESTATUS_FROM_NODE(estatus_node)			\
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_node *)(estatus_node) + 1))

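/*
 * Layout sketch for the macros above: a cache entry or list node is its
 * header struct followed immediately by the raw estatus data, so "+ 1"
 * on the header pointer yields the payload:
 *
 *   [struct ghes_estatus_cache][estatus data (estatus_len bytes)]
 *   ^cache                     ^GHES_ESTATUS_FROM_CACHE(cache)
 */
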
bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

/*
 * All error sources notified with SCI share one notifier function, so
 * they need to be linked together and checked one by one.  The same
 * applies to NMI.
 *
 * RCU is used for these lists, so ghes_list_mutex is only needed for
 * list modification, not for traversal.
 */
static LIST_HEAD(ghes_sci);
static DEFINE_MUTEX(ghes_list_mutex);

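/*
 * Minimal sketch (illustrative only, not part of the original driver;
 * it mirrors what ghes_probe() does below for the SCI case): writers
 * serialize list updates with ghes_list_mutex, while readers such as
 * ghes_notify_sci() only take rcu_read_lock().
 */
static inline void ghes_sci_list_add_sketch(struct ghes *ghes)
{
	mutex_lock(&ghes_list_mutex);		/* writers serialize here */
	list_add_rcu(&ghes->list, &ghes_sci);	/* safe against RCU readers */
	mutex_unlock(&ghes_list_mutex);
}
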
/*
 * The memory area used to transfer hardware error information from
 * BIOS to Linux can be determined only at NMI, IRQ or timer handler
 * time, and the generic ioremap cannot be used in atomic context, so
 * a special version of atomic ioremap is implemented here.
 */

/*
 * Two virtual pages are used, one for IRQ/PROCESS context, the other for
 * NMI context (optionally).
 */
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#define GHES_IOREMAP_PAGES           2
#else
#define GHES_IOREMAP_PAGES           1
#endif
#define GHES_IOREMAP_IRQ_PAGE(base)	(base)
#define GHES_IOREMAP_NMI_PAGE(base)	((base) + PAGE_SIZE)

/* virtual memory area for atomic ioremap */
static struct vm_struct *ghes_ioremap_area;
/*
 * These two spinlocks prevent the atomic ioremap virtual memory pages
 * from being mapped simultaneously.
 */
static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);

static struct gen_pool *ghes_estatus_pool;
static unsigned long ghes_estatus_pool_size_request;

static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
static atomic_t ghes_estatus_cache_alloced;

static int ghes_ioremap_init(void)
{
	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
		VM_IOREMAP, VMALLOC_START, VMALLOC_END);
	if (!ghes_ioremap_area) {
		pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n");
		return -ENOMEM;
	}

	return 0;
}

static void ghes_ioremap_exit(void)
{
	free_vm_area(ghes_ioremap_area);
}

static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
{
	unsigned long vaddr;

	vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
			   pfn << PAGE_SHIFT, PAGE_KERNEL);

	return (void __iomem *)vaddr;
}

static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
{
	unsigned long vaddr;

	vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
			   pfn << PAGE_SHIFT, PAGE_KERNEL);

	return (void __iomem *)vaddr;
}

static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
{
	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
	void *base = ghes_ioremap_area->addr;

	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
	arch_apei_flush_tlb_one(vaddr);
}

static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
{
	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
	void *base = ghes_ioremap_area->addr;

	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
	arch_apei_flush_tlb_one(vaddr);
}

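/*
 * Minimal usage sketch for the helpers above (illustrative only, not
 * part of the original driver; the real user is ghes_copy_tofrom_phys()
 * below, which also handles the NMI variant and page-crossing copies).
 * Assumes the copy does not cross a page boundary.
 */
static inline void ghes_ioremap_usage_sketch(void *buf, u64 paddr, u32 len)
{
	unsigned long flags;
	u64 offset = paddr - (paddr & PAGE_MASK);
	void __iomem *vaddr;

	spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
	vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
	memcpy_fromio(buf, vaddr + offset, len);
	ghes_iounmap_irq(vaddr);
	spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
}
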
static int ghes_estatus_pool_init(void)
{
	ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
	if (!ghes_estatus_pool)
		return -ENOMEM;
	return 0;
}

static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
					      struct gen_pool_chunk *chunk,
					      void *data)
{
	free_page(chunk->start_addr);
}

static void ghes_estatus_pool_exit(void)
{
	gen_pool_for_each_chunk(ghes_estatus_pool,
				ghes_estatus_pool_free_chunk_page, NULL);
	gen_pool_destroy(ghes_estatus_pool);
}

static int ghes_estatus_pool_expand(unsigned long len)
{
	unsigned long i, pages, size, addr;
	int ret;

	ghes_estatus_pool_size_request += PAGE_ALIGN(len);
	size = gen_pool_size(ghes_estatus_pool);
	if (size >= ghes_estatus_pool_size_request)
		return 0;
	pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
	for (i = 0; i < pages; i++) {
		addr = __get_free_page(GFP_KERNEL);
		if (!addr)
			return -ENOMEM;
		ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
		if (ret)
			return ret;
	}

	return 0;
}

static struct ghes *ghes_new(struct acpi_hest_generic *generic)
{
	struct ghes *ghes;
	unsigned int error_block_length;
	int rc;

	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
	if (!ghes)
		return ERR_PTR(-ENOMEM);
	ghes->generic = generic;
	rc = apei_map_generic_address(&generic->error_status_address);
	if (rc)
		goto err_free;
	error_block_length = generic->error_block_length;
	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
		pr_warning(FW_WARN GHES_PFX
			   "Error status block length is too long: %u for "
			   "generic hardware error source: %d.\n",
			   error_block_length, generic->header.source_id);
		error_block_length = GHES_ESTATUS_MAX_SIZE;
	}
	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
	if (!ghes->estatus) {
		rc = -ENOMEM;
		goto err_unmap;
	}

	return ghes;

err_unmap:
	apei_unmap_generic_address(&generic->error_status_address);
err_free:
	kfree(ghes);
	return ERR_PTR(rc);
}

static void ghes_fini(struct ghes *ghes)
{
	kfree(ghes->estatus);
	apei_unmap_generic_address(&ghes->generic->error_status_address);
}

static inline int ghes_severity(int severity)
{
	switch (severity) {
	case CPER_SEV_INFORMATIONAL:
		return GHES_SEV_NO;
	case CPER_SEV_CORRECTED:
		return GHES_SEV_CORRECTED;
	case CPER_SEV_RECOVERABLE:
		return GHES_SEV_RECOVERABLE;
	case CPER_SEV_FATAL:
		return GHES_SEV_PANIC;
	default:
		/* Unknown, go panic */
		return GHES_SEV_PANIC;
	}
}

static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
				  int from_phys)
{
	void __iomem *vaddr;
	unsigned long flags = 0;
	int in_nmi = in_nmi();
	u64 offset;
	u32 trunk;

	while (len > 0) {
		offset = paddr - (paddr & PAGE_MASK);
		if (in_nmi) {
			raw_spin_lock(&ghes_ioremap_lock_nmi);
			vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
		} else {
			spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
			vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
		}
		/* copy at most up to the end of the current page */
		trunk = PAGE_SIZE - offset;
		trunk = min(trunk, len);
		if (from_phys)
			memcpy_fromio(buffer, vaddr + offset, trunk);
		else
			memcpy_toio(vaddr + offset, buffer, trunk);
		len -= trunk;
		paddr += trunk;
		buffer += trunk;
		if (in_nmi) {
			ghes_iounmap_nmi(vaddr);
			raw_spin_unlock(&ghes_ioremap_lock_nmi);
		} else {
			ghes_iounmap_irq(vaddr);
			spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
		}
	}
}

static int ghes_read_estatus(struct ghes *ghes, int silent)
{
	struct acpi_hest_generic *g = ghes->generic;
	u64 buf_paddr;
	u32 len;
	int rc;

	rc = apei_read(&buf_paddr, &g->error_status_address);
	if (rc) {
		if (!silent && printk_ratelimit())
			pr_warning(FW_WARN GHES_PFX
"Failed to read error status block address for hardware error source: %d.\n",
				   g->header.source_id);
		return -EIO;
	}
	if (!buf_paddr)
		return -ENOENT;

	ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
			      sizeof(*ghes->estatus), 1);
	if (!ghes->estatus->block_status)
		return -ENOENT;

	ghes->buffer_paddr = buf_paddr;
	ghes->flags |= GHES_TO_CLEAR;

	rc = -EIO;
	len = cper_estatus_len(ghes->estatus);
	if (len < sizeof(*ghes->estatus))
		goto err_read_block;
	if (len > ghes->generic->error_block_length)
		goto err_read_block;
	if (cper_estatus_check_header(ghes->estatus))
		goto err_read_block;
	ghes_copy_tofrom_phys(ghes->estatus + 1,
			      buf_paddr + sizeof(*ghes->estatus),
			      len - sizeof(*ghes->estatus), 1);
	if (cper_estatus_check(ghes->estatus))
		goto err_read_block;
	rc = 0;

err_read_block:
	if (rc && !silent && printk_ratelimit())
		pr_warning(FW_WARN GHES_PFX
			   "Failed to read error status block!\n");
	return rc;
}

static void ghes_clear_estatus(struct ghes *ghes)
{
	ghes->estatus->block_status = 0;
	if (!(ghes->flags & GHES_TO_CLEAR))
		return;
	ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
			      sizeof(ghes->estatus->block_status), 0);
	ghes->flags &= ~GHES_TO_CLEAR;
}

static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
{
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
	unsigned long pfn;
	int flags = -1;
	int sec_sev = ghes_severity(gdata->error_severity);
	struct cper_sec_mem_err *mem_err;
	mem_err = (struct cper_sec_mem_err *)(gdata + 1);

	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
		return;

	pfn = mem_err->physical_addr >> PAGE_SHIFT;
	if (!pfn_valid(pfn)) {
		pr_warn_ratelimited(FW_WARN GHES_PFX
		"Invalid address in generic error data: %#llx\n",
		mem_err->physical_addr);
		return;
	}

	/* Only the following two cases can be handled properly for now */
	if (sec_sev == GHES_SEV_CORRECTED &&
	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
		flags = MF_SOFT_OFFLINE;
	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
		flags = 0;

	if (flags != -1)
		memory_failure_queue(pfn, 0, flags);
#endif
}

static void ghes_do_proc(struct ghes *ghes,
			 const struct acpi_hest_generic_status *estatus)
{
	int sev, sec_sev;
	struct acpi_hest_generic_data *gdata;

	sev = ghes_severity(estatus->error_severity);
	apei_estatus_for_each_section(estatus, gdata) {
		sec_sev = ghes_severity(gdata->error_severity);
		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
				 CPER_SEC_PLATFORM_MEM)) {
			struct cper_sec_mem_err *mem_err;
			mem_err = (struct cper_sec_mem_err *)(gdata+1);
			ghes_edac_report_mem_error(ghes, sev, mem_err);

			arch_apei_report_mem_error(sev, mem_err);
			ghes_handle_memory_failure(gdata, sev);
		}
#ifdef CONFIG_ACPI_APEI_PCIEAER
		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
				      CPER_SEC_PCIE)) {
			struct cper_sec_pcie *pcie_err;
			pcie_err = (struct cper_sec_pcie *)(gdata+1);
			if (sev == GHES_SEV_RECOVERABLE &&
			    sec_sev == GHES_SEV_RECOVERABLE &&
			    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
			    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
				unsigned int devfn;
				int aer_severity;

				devfn = PCI_DEVFN(pcie_err->device_id.device,
						  pcie_err->device_id.function);
				aer_severity = cper_severity_to_aer(sev);

				/*
				 * If firmware reset the component to contain
				 * the error, we must reinitialize it before
				 * use, so treat it as a fatal AER error.
				 */
				if (gdata->flags & CPER_SEC_RESET)
					aer_severity = AER_FATAL;

				aer_recover_queue(pcie_err->device_id.segment,
						  pcie_err->device_id.bus,
						  devfn, aer_severity,
						  (struct aer_capability_regs *)
						  pcie_err->aer_info);
			}

		}
#endif
	}
}

static void __ghes_print_estatus(const char *pfx,
				 const struct acpi_hest_generic *generic,
				 const struct acpi_hest_generic_status *estatus)
{
	static atomic_t seqno;
	unsigned int curr_seqno;
	char pfx_seq[64];

	if (pfx == NULL) {
		if (ghes_severity(estatus->error_severity) <=
		    GHES_SEV_CORRECTED)
			pfx = KERN_WARNING;
		else
			pfx = KERN_ERR;
	}
	curr_seqno = atomic_inc_return(&seqno);
	snprintf(pfx_seq, sizeof(pfx_seq), "%s{%u}" HW_ERR, pfx, curr_seqno);
	printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
	       pfx_seq, generic->header.source_id);
	cper_estatus_print(pfx_seq, estatus);
}

static int ghes_print_estatus(const char *pfx,
			      const struct acpi_hest_generic *generic,
			      const struct acpi_hest_generic_status *estatus)
{
	/* Not more than 2 messages every 5 seconds */
	static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
	struct ratelimit_state *ratelimit;

	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
		ratelimit = &ratelimit_corrected;
	else
		ratelimit = &ratelimit_uncorrected;
	if (__ratelimit(ratelimit)) {
		__ghes_print_estatus(pfx, generic, estatus);
		return 1;
	}
	return 0;
}

/*
 * GHES error status reporting throttle: report more kinds of errors
 * instead of repeating the most frequently occurring ones.
 */
static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus)
{
	u32 len;
	int i, cached = 0;
	unsigned long long now;
	struct ghes_estatus_cache *cache;
	struct acpi_hest_generic_status *cache_estatus;

	len = cper_estatus_len(estatus);
	rcu_read_lock();
	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
		cache = rcu_dereference(ghes_estatus_caches[i]);
		if (cache == NULL)
			continue;
		if (len != cache->estatus_len)
			continue;
		cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
		if (memcmp(estatus, cache_estatus, len))
			continue;
		atomic_inc(&cache->count);
		now = sched_clock();
		if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC)
			cached = 1;
		break;
	}
	rcu_read_unlock();
	return cached;
}

static struct ghes_estatus_cache *ghes_estatus_cache_alloc(
	struct acpi_hest_generic *generic,
	struct acpi_hest_generic_status *estatus)
{
	int alloced;
	u32 len, cache_len;
	struct ghes_estatus_cache *cache;
	struct acpi_hest_generic_status *cache_estatus;

	alloced = atomic_add_return(1, &ghes_estatus_cache_alloced);
	if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) {
		atomic_dec(&ghes_estatus_cache_alloced);
		return NULL;
	}
	len = cper_estatus_len(estatus);
	cache_len = GHES_ESTATUS_CACHE_LEN(len);
	cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len);
	if (!cache) {
		atomic_dec(&ghes_estatus_cache_alloced);
		return NULL;
	}
	cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
	memcpy(cache_estatus, estatus, len);
	cache->estatus_len = len;
	atomic_set(&cache->count, 0);
	cache->generic = generic;
	cache->time_in = sched_clock();
	return cache;
}

static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache)
{
	u32 len;

	len = cper_estatus_len(GHES_ESTATUS_FROM_CACHE(cache));
	len = GHES_ESTATUS_CACHE_LEN(len);
	gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len);
	atomic_dec(&ghes_estatus_cache_alloced);
}

static void ghes_estatus_cache_rcu_free(struct rcu_head *head)
{
	struct ghes_estatus_cache *cache;

	cache = container_of(head, struct ghes_estatus_cache, rcu);
	ghes_estatus_cache_free(cache);
}

static void ghes_estatus_cache_add(
	struct acpi_hest_generic *generic,
	struct acpi_hest_generic_status *estatus)
{
	int i, slot = -1, count;
	unsigned long long now, duration, period, max_period = 0;
	struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache;

	new_cache = ghes_estatus_cache_alloc(generic, estatus);
	if (new_cache == NULL)
		return;
	rcu_read_lock();
	now = sched_clock();
	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
		cache = rcu_dereference(ghes_estatus_caches[i]);
		if (cache == NULL) {
			slot = i;
			slot_cache = NULL;
			break;
		}
		duration = now - cache->time_in;
		if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) {
			slot = i;
			slot_cache = cache;
			break;
		}
		count = atomic_read(&cache->count);
		period = duration;
		do_div(period, (count + 1));
		if (period > max_period) {
			max_period = period;
			slot = i;
			slot_cache = cache;
		}
	}
	/* new_cache must be put into array after its contents are written */
	smp_wmb();
	if (slot != -1 && cmpxchg(ghes_estatus_caches + slot,
				  slot_cache, new_cache) == slot_cache) {
		if (slot_cache)
			call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free);
	} else
		ghes_estatus_cache_free(new_cache);
	rcu_read_unlock();
}

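/*
 * Worked example for the eviction heuristic above (numbers are
 * illustrative): an entry cached 6s ago that matched twice (count = 2)
 * has an estimated reporting period of 6s / (2 + 1) = 2s.  The slot
 * with the largest such period, i.e. the least frequently reported
 * error, is evicted first; an empty slot or an entry older than
 * GHES_ESTATUS_IN_CACHE_MAX_NSEC (10s) is taken immediately.
 */
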
static int ghes_proc(struct ghes *ghes)
{
	int rc;

	rc = ghes_read_estatus(ghes, 0);
	if (rc)
		goto out;
	if (!ghes_estatus_cached(ghes->estatus)) {
		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
	}
	ghes_do_proc(ghes, ghes->estatus);
out:
	ghes_clear_estatus(ghes);
	return 0;
}

static void ghes_add_timer(struct ghes *ghes)
{
	struct acpi_hest_generic *g = ghes->generic;
	unsigned long expire;

	if (!g->notify.poll_interval) {
		pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n",
			   g->header.source_id);
		return;
	}
	expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
	ghes->timer.expires = round_jiffies_relative(expire);
	add_timer(&ghes->timer);
}

static void ghes_poll_func(unsigned long data)
{
	struct ghes *ghes = (void *)data;

	ghes_proc(ghes);
	if (!(ghes->flags & GHES_EXITING))
		ghes_add_timer(ghes);
}

static irqreturn_t ghes_irq_func(int irq, void *data)
{
	struct ghes *ghes = data;
	int rc;

	rc = ghes_proc(ghes);
	if (rc)
		return IRQ_NONE;

	return IRQ_HANDLED;
}

static int ghes_notify_sci(struct notifier_block *this,
				  unsigned long event, void *data)
{
	struct ghes *ghes;
	int ret = NOTIFY_DONE;

	rcu_read_lock();
	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
		if (!ghes_proc(ghes))
			ret = NOTIFY_OK;
	}
	rcu_read_unlock();

	return ret;
}

static struct notifier_block ghes_notifier_sci = {
	.notifier_call = ghes_notify_sci,
};

#ifdef CONFIG_HAVE_ACPI_APEI_NMI
/*
 * printk is not safe in NMI context.  So in the NMI handler, we
 * allocate the required memory from the lock-less memory allocator
 * (ghes_estatus_pool), save the estatus into it, put it onto the
 * lock-less list (ghes_estatus_llist), then delay printk into IRQ
 * context via irq_work (ghes_proc_irq_work).
 * ghes_estatus_pool_size_request records the pool size required by
 * all NMI error sources.
 */
static struct llist_head ghes_estatus_llist;
static struct irq_work ghes_proc_irq_work;

/*
 * NMI may be triggered on any CPU, so ghes_in_nmi is used to ensure
 * there is only one concurrent reader.
 */
static atomic_t ghes_in_nmi = ATOMIC_INIT(0);

static LIST_HEAD(ghes_nmi);

static int ghes_panic_timeout	__read_mostly = 30;

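/*
 * Minimal sketch (illustrative only, not part of the original driver)
 * of the lock-less handoff described above: the NMI producer only ever
 * pushes with llist_add(), which is NMI-safe, and the IRQ consumer
 * drains the whole list at once with llist_del_all(), as
 * ghes_proc_in_irq() below does.
 */
static inline void ghes_nmi_push_sketch(struct ghes_estatus_node *node)
{
	/* a single lock-less cmpxchg on the list head, safe in NMI context */
	llist_add(&node->llnode, &ghes_estatus_llist);
}
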
static void ghes_proc_in_irq(struct irq_work *irq_work)
{
	struct llist_node *llnode, *next;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u32 len, node_len;

	llnode = llist_del_all(&ghes_estatus_llist);
	/*
	 * The entries on the list are in reversed time order; restore
	 * the proper order before processing.
	 */
	llnode = llist_reverse_order(llnode);
	while (llnode) {
		next = llnode->next;
		estatus_node = llist_entry(llnode, struct ghes_estatus_node,
					   llnode);
		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
		len = cper_estatus_len(estatus);
		node_len = GHES_ESTATUS_NODE_LEN(len);
		ghes_do_proc(estatus_node->ghes, estatus);
		if (!ghes_estatus_cached(estatus)) {
			generic = estatus_node->generic;
			if (ghes_print_estatus(NULL, generic, estatus))
				ghes_estatus_cache_add(generic, estatus);
		}
		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
			      node_len);
		llnode = next;
	}
}

static void ghes_print_queued_estatus(void)
{
	struct llist_node *llnode;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u32 len, node_len;

	llnode = llist_del_all(&ghes_estatus_llist);
	/*
	 * The entries on the list are in reversed time order; restore
	 * the proper order before printing.
	 */
	llnode = llist_reverse_order(llnode);
	while (llnode) {
		estatus_node = llist_entry(llnode, struct ghes_estatus_node,
					   llnode);
		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
		len = cper_estatus_len(estatus);
		node_len = GHES_ESTATUS_NODE_LEN(len);
		generic = estatus_node->generic;
		ghes_print_estatus(NULL, generic, estatus);
		llnode = llnode->next;
	}
}


/* Save estatus for further processing in IRQ context */
static void __process_error(struct ghes *ghes)
{
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
	u32 len, node_len;
	struct ghes_estatus_node *estatus_node;
	struct acpi_hest_generic_status *estatus;

	if (ghes_estatus_cached(ghes->estatus))
		return;

	len = cper_estatus_len(ghes->estatus);
	node_len = GHES_ESTATUS_NODE_LEN(len);

	estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
	if (!estatus_node)
		return;

	estatus_node->ghes = ghes;
	estatus_node->generic = ghes->generic;
	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
	memcpy(estatus, ghes->estatus, len);
	llist_add(&estatus_node->llnode, &ghes_estatus_llist);
#endif
}

static void __ghes_panic(struct ghes *ghes)
{
	oops_begin();
	ghes_print_queued_estatus();
	__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);

	/* reboot to log the error! */
	if (panic_timeout == 0)
		panic_timeout = ghes_panic_timeout;
	panic("Fatal hardware error!");
}

static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
	struct ghes *ghes;
	int sev, ret = NMI_DONE;

	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
		return ret;

	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
		if (ghes_read_estatus(ghes, 1)) {
			ghes_clear_estatus(ghes);
			continue;
		}

		sev = ghes_severity(ghes->estatus->error_severity);
		if (sev >= GHES_SEV_PANIC)
			__ghes_panic(ghes);

		if (!(ghes->flags & GHES_TO_CLEAR))
			continue;

		__process_error(ghes);
		ghes_clear_estatus(ghes);

		ret = NMI_HANDLED;
	}

#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
	irq_work_queue(&ghes_proc_irq_work);
#endif
	atomic_dec(&ghes_in_nmi);
	return ret;
}

static unsigned long ghes_esource_prealloc_size(
	const struct acpi_hest_generic *generic)
{
	unsigned long block_length, prealloc_records, prealloc_size;

	block_length = min_t(unsigned long, generic->error_block_length,
			     GHES_ESTATUS_MAX_SIZE);
	prealloc_records = max_t(unsigned long,
				 generic->records_to_preallocate, 1);
	prealloc_size = min_t(unsigned long, block_length * prealloc_records,
			      GHES_ESOURCE_PREALLOC_MAX_SIZE);

	return prealloc_size;
}

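/*
 * Example with illustrative numbers: error_block_length = 4096 and
 * records_to_preallocate = 2 yield an 8 KiB preallocation; the block
 * length and the product are clamped by GHES_ESTATUS_MAX_SIZE and
 * GHES_ESOURCE_PREALLOC_MAX_SIZE (64 KiB each).
 */
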
static void ghes_estatus_pool_shrink(unsigned long len)
{
	ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
}

static void ghes_nmi_add(struct ghes *ghes)
{
	unsigned long len;

	len = ghes_esource_prealloc_size(ghes->generic);
	ghes_estatus_pool_expand(len);
	mutex_lock(&ghes_list_mutex);
	if (list_empty(&ghes_nmi))
		register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
	list_add_rcu(&ghes->list, &ghes_nmi);
	mutex_unlock(&ghes_list_mutex);
}

static void ghes_nmi_remove(struct ghes *ghes)
{
	unsigned long len;

	mutex_lock(&ghes_list_mutex);
	list_del_rcu(&ghes->list);
	if (list_empty(&ghes_nmi))
		unregister_nmi_handler(NMI_LOCAL, "ghes");
	mutex_unlock(&ghes_list_mutex);
	/*
	 * To synchronize with the NMI handler, ghes can only be freed
	 * after the NMI handler finishes.
	 */
	synchronize_rcu();
	len = ghes_esource_prealloc_size(ghes->generic);
	ghes_estatus_pool_shrink(len);
}


static void ghes_nmi_init_cxt(void)
{
	init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
}
#else /* CONFIG_HAVE_ACPI_APEI_NMI */
static inline void ghes_nmi_add(struct ghes *ghes)
{
	pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n",
	       ghes->generic->header.source_id);
	BUG();
}

static inline void ghes_nmi_remove(struct ghes *ghes)
{
	pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n",
	       ghes->generic->header.source_id);
	BUG();
}

static inline void ghes_nmi_init_cxt(void)
{
}
#endif /* CONFIG_HAVE_ACPI_APEI_NMI */

static int ghes_probe(struct platform_device *ghes_dev)
{
	struct acpi_hest_generic *generic;
	struct ghes *ghes = NULL;

	int rc = -EINVAL;

	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
	if (!generic->enabled)
		return -ENODEV;

	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
	case ACPI_HEST_NOTIFY_EXTERNAL:
	case ACPI_HEST_NOTIFY_SCI:
		break;
	case ACPI_HEST_NOTIFY_NMI:
		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
			pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
				generic->header.source_id);
			goto err;
		}
		break;
	case ACPI_HEST_NOTIFY_LOCAL:
		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
			   generic->header.source_id);
		goto err;
	default:
		pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n",
			   generic->notify.type, generic->header.source_id);
		goto err;
	}

	rc = -EIO;
	if (generic->error_block_length <
	    sizeof(struct acpi_hest_generic_status)) {
		pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n",
			   generic->error_block_length,
			   generic->header.source_id);
		goto err;
	}
	ghes = ghes_new(generic);
	if (IS_ERR(ghes)) {
		rc = PTR_ERR(ghes);
		ghes = NULL;
		goto err;
	}

	rc = ghes_edac_register(ghes, &ghes_dev->dev);
	if (rc < 0)
		goto err;

	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
		ghes->timer.function = ghes_poll_func;
		ghes->timer.data = (unsigned long)ghes;
		init_timer_deferrable(&ghes->timer);
		ghes_add_timer(ghes);
		break;
	case ACPI_HEST_NOTIFY_EXTERNAL:
		/* External interrupt vector is GSI */
		rc = acpi_gsi_to_irq(generic->notify.vector, &ghes->irq);
		if (rc) {
			pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err_edac_unreg;
		}
		rc = request_irq(ghes->irq, ghes_irq_func, 0, "GHES IRQ", ghes);
		if (rc) {
			pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err_edac_unreg;
		}
		break;
	case ACPI_HEST_NOTIFY_SCI:
		mutex_lock(&ghes_list_mutex);
		if (list_empty(&ghes_sci))
			register_acpi_hed_notifier(&ghes_notifier_sci);
		list_add_rcu(&ghes->list, &ghes_sci);
		mutex_unlock(&ghes_list_mutex);
		break;
	case ACPI_HEST_NOTIFY_NMI:
		ghes_nmi_add(ghes);
		break;
	default:
		BUG();
	}
	platform_set_drvdata(ghes_dev, ghes);

	return 0;
err_edac_unreg:
	ghes_edac_unregister(ghes);
err:
	if (ghes) {
		ghes_fini(ghes);
		kfree(ghes);
	}
	return rc;
}

static int ghes_remove(struct platform_device *ghes_dev)
{
	struct ghes *ghes;
	struct acpi_hest_generic *generic;

	ghes = platform_get_drvdata(ghes_dev);
	generic = ghes->generic;

	ghes->flags |= GHES_EXITING;
	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
		del_timer_sync(&ghes->timer);
		break;
	case ACPI_HEST_NOTIFY_EXTERNAL:
		free_irq(ghes->irq, ghes);
		break;
	case ACPI_HEST_NOTIFY_SCI:
		mutex_lock(&ghes_list_mutex);
		list_del_rcu(&ghes->list);
		if (list_empty(&ghes_sci))
			unregister_acpi_hed_notifier(&ghes_notifier_sci);
		mutex_unlock(&ghes_list_mutex);
		break;
	case ACPI_HEST_NOTIFY_NMI:
		ghes_nmi_remove(ghes);
		break;
	default:
		BUG();
		break;
	}

	ghes_fini(ghes);

	ghes_edac_unregister(ghes);

	kfree(ghes);

	platform_set_drvdata(ghes_dev, NULL);

	return 0;
}

static struct platform_driver ghes_platform_driver = {
	.driver		= {
		.name	= "GHES",
	},
	.probe		= ghes_probe,
	.remove		= ghes_remove,
};

static int __init ghes_init(void)
{
	int rc;

	if (acpi_disabled)
		return -ENODEV;

	if (hest_disable) {
		pr_info(GHES_PFX "HEST is not enabled!\n");
		return -EINVAL;
	}

	if (ghes_disable) {
		pr_info(GHES_PFX "GHES is not enabled!\n");
		return -EINVAL;
	}

	ghes_nmi_init_cxt();

	rc = ghes_ioremap_init();
	if (rc)
		goto err;

	rc = ghes_estatus_pool_init();
	if (rc)
		goto err_ioremap_exit;

	rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
				      GHES_ESTATUS_CACHE_ALLOCED_MAX);
	if (rc)
		goto err_pool_exit;

	rc = platform_driver_register(&ghes_platform_driver);
	if (rc)
		goto err_pool_exit;

	rc = apei_osc_setup();
	if (rc == 0 && osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
	else if (rc == 0 && !osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n");
	else if (rc && osc_sb_apei_support_acked)
		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n");
	else
		pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");

	return 0;
err_pool_exit:
	ghes_estatus_pool_exit();
err_ioremap_exit:
	ghes_ioremap_exit();
err:
	return rc;
}

static void __exit ghes_exit(void)
{
	platform_driver_unregister(&ghes_platform_driver);
	ghes_estatus_pool_exit();
	ghes_ioremap_exit();
}

module_init(ghes_init);
module_exit(ghes_exit);

MODULE_AUTHOR("Huang Ying");
MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
MODULE_LICENSE("GPL");
MODULE_ALIAS("platform:GHES");