xref: /openbmc/linux/mm/memory-failure.c (revision d027122d8363e58cd8bc2fa6a16917f7f69b85bb)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2008, 2009 Intel Corporation
4  * Authors: Andi Kleen, Fengguang Wu
5  *
6  * High level machine check handler. Handles pages reported by the
7  * hardware as being corrupted usually due to a multi-bit ECC memory or cache
8  * failure.
9  *
10  * In addition there is a "soft offline" entry point that allows stopping the
11  * use of not-yet-corrupted but suspicious pages without killing anything.
12  *
13  * Handles page cache pages in various states.	The tricky part
14  * here is that we can access any page asynchronously in respect to
15  * other VM users, because memory failures could happen anytime and
16  * anywhere. This could violate some of their assumptions. This is why
17  * this code has to be extremely careful. Generally it tries to use
18  * normal locking rules, as in get the standard locks, even if that means
19  * the error handling takes potentially a long time.
20  *
21  * It can be very tempting to add handling for obscure cases here.
22  * In general any code for handling new cases should only be added iff:
23  * - You know how to test it.
24  * - You have a test that can be added to mce-test
25  *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
26  * - The case actually shows up as a frequent (top 10) page state in
27  *   tools/vm/page-types when running a real workload.
28  *
29  * There are several operations here with exponential complexity because
30  * of unsuitable VM data structures. For example the operation to map back
31  * from RMAP chains to processes has to walk the complete process list and
32  * has non-linear complexity in the number of processes. But since memory corruptions
33  * are rare we hope to get away with this. This avoids impacting the core
34  * VM.
35  */
36 
37 #define pr_fmt(fmt) "Memory failure: " fmt
38 
39 #include <linux/kernel.h>
40 #include <linux/mm.h>
41 #include <linux/page-flags.h>
42 #include <linux/kernel-page-flags.h>
43 #include <linux/sched/signal.h>
44 #include <linux/sched/task.h>
45 #include <linux/dax.h>
46 #include <linux/ksm.h>
47 #include <linux/rmap.h>
48 #include <linux/export.h>
49 #include <linux/pagemap.h>
50 #include <linux/swap.h>
51 #include <linux/backing-dev.h>
52 #include <linux/migrate.h>
53 #include <linux/suspend.h>
54 #include <linux/slab.h>
55 #include <linux/swapops.h>
56 #include <linux/hugetlb.h>
57 #include <linux/memory_hotplug.h>
58 #include <linux/mm_inline.h>
59 #include <linux/memremap.h>
60 #include <linux/kfifo.h>
61 #include <linux/ratelimit.h>
62 #include <linux/page-isolation.h>
63 #include <linux/pagewalk.h>
64 #include <linux/shmem_fs.h>
65 #include "swap.h"
66 #include "internal.h"
67 #include "ras/ras_event.h"
68 
69 int sysctl_memory_failure_early_kill __read_mostly = 0;
70 
71 int sysctl_memory_failure_recovery __read_mostly = 1;
72 
73 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
74 
75 static bool hw_memory_failure __read_mostly = false;
76 
77 inline void num_poisoned_pages_inc(void)
78 {
79 	atomic_long_inc(&num_poisoned_pages);
80 }
81 
82 static inline void num_poisoned_pages_sub(long i)
83 {
84 	atomic_long_sub(i, &num_poisoned_pages);
85 }
86 
87 /*
88  * Return values:
89  *   1:   the page is dissolved (if needed) and taken off from buddy,
90  *   0:   the page is dissolved (if needed) and not taken off from buddy,
91  *   < 0: failed to dissolve.
92  */
93 static int __page_handle_poison(struct page *page)
94 {
95 	int ret;
96 
97 	zone_pcp_disable(page_zone(page));
98 	ret = dissolve_free_huge_page(page);
99 	if (!ret)
100 		ret = take_page_off_buddy(page);
101 	zone_pcp_enable(page_zone(page));
102 
103 	return ret;
104 }
105 
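/*
 * Mark a page hwpoisoned for soft offline: for hugetlb/free pages first make
 * sure the page is dissolved and taken off the buddy lists, then set
 * PG_hwpoison and hold an extra refcount so the page is never handed out
 * again.  Returns false if the page could not be dissolved and taken off
 * the buddy lists.
 */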
106 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
107 {
108 	if (hugepage_or_freepage) {
109 		/*
110 		 * Doing this check for free pages is also fine since dissolve_free_huge_page
111 		 * returns 0 for non-hugetlb pages as well.
112 		 */
113 		if (__page_handle_poison(page) <= 0)
114 			/*
115 			 * We could fail to take off the target page from buddy
116 			 * for example due to racy page allocation, but that's
117 			 * acceptable because soft-offlined page is not broken
118 			 * and if someone really want to use it, they should
119 			 * take it.
120 			 */
121 			return false;
122 	}
123 
124 	SetPageHWPoison(page);
125 	if (release)
126 		put_page(page);
127 	page_ref_inc(page);
128 	num_poisoned_pages_inc();
129 
130 	return true;
131 }
132 
133 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
134 
135 u32 hwpoison_filter_enable = 0;
136 u32 hwpoison_filter_dev_major = ~0U;
137 u32 hwpoison_filter_dev_minor = ~0U;
138 u64 hwpoison_filter_flags_mask;
139 u64 hwpoison_filter_flags_value;
140 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
141 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
142 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
143 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
144 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
145 
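/*
 * Device filter: reject (return -EINVAL) pages whose backing block device
 * does not match the configured major/minor numbers; a value of ~0U for
 * either number means "don't care".
 */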
146 static int hwpoison_filter_dev(struct page *p)
147 {
148 	struct address_space *mapping;
149 	dev_t dev;
150 
151 	if (hwpoison_filter_dev_major == ~0U &&
152 	    hwpoison_filter_dev_minor == ~0U)
153 		return 0;
154 
155 	mapping = page_mapping(p);
156 	if (mapping == NULL || mapping->host == NULL)
157 		return -EINVAL;
158 
159 	dev = mapping->host->i_sb->s_dev;
160 	if (hwpoison_filter_dev_major != ~0U &&
161 	    hwpoison_filter_dev_major != MAJOR(dev))
162 		return -EINVAL;
163 	if (hwpoison_filter_dev_minor != ~0U &&
164 	    hwpoison_filter_dev_minor != MINOR(dev))
165 		return -EINVAL;
166 
167 	return 0;
168 }
169 
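/*
 * Page flag filter: pass the page only if its stable page flags, masked by
 * hwpoison_filter_flags_mask, equal hwpoison_filter_flags_value.
 */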
170 static int hwpoison_filter_flags(struct page *p)
171 {
172 	if (!hwpoison_filter_flags_mask)
173 		return 0;
174 
175 	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
176 				    hwpoison_filter_flags_value)
177 		return 0;
178 	else
179 		return -EINVAL;
180 }
181 
182 /*
183  * This allows stress tests to limit test scope to a collection of tasks
184  * by putting them under some memcg. This prevents killing unrelated/important
185  * processes such as /sbin/init. Note that the target task may share clean
186  * pages with init (e.g. libc text), which is harmless. If the target task
187  * shares _dirty_ pages with another task B, the test scheme must make sure B
188  * is also included in the memcg. Finally, due to race conditions this filter
189  * can only guarantee that the page either belongs to the memcg tasks, or is
190  * a freed page.
191  */
192 #ifdef CONFIG_MEMCG
193 u64 hwpoison_filter_memcg;
194 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
195 static int hwpoison_filter_task(struct page *p)
196 {
197 	if (!hwpoison_filter_memcg)
198 		return 0;
199 
200 	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
201 		return -EINVAL;
202 
203 	return 0;
204 }
205 #else
206 static int hwpoison_filter_task(struct page *p) { return 0; }
207 #endif
208 
209 int hwpoison_filter(struct page *p)
210 {
211 	if (!hwpoison_filter_enable)
212 		return 0;
213 
214 	if (hwpoison_filter_dev(p))
215 		return -EINVAL;
216 
217 	if (hwpoison_filter_flags(p))
218 		return -EINVAL;
219 
220 	if (hwpoison_filter_task(p))
221 		return -EINVAL;
222 
223 	return 0;
224 }
225 #else
226 int hwpoison_filter(struct page *p)
227 {
228 	return 0;
229 }
230 #endif
231 
232 EXPORT_SYMBOL_GPL(hwpoison_filter);
233 
234 /*
235  * Kill all processes that have a poisoned page mapped and then isolate
236  * the page.
237  *
238  * General strategy:
239  * Find all processes having the page mapped and kill them.
240  * But we keep a page reference around so that the page is not
241  * actually freed yet.
242  * Then stash the page away
243  *
244  * There's no convenient way to get back to mapped processes
245  * from the VMAs. So do a brute-force search over all
246  * running processes.
247  *
248  * Remember that machine checks are not common (or rather
249  * if they are common you have other problems), so this shouldn't
250  * be a performance issue.
251  *
252  * Also there are some races possible while we get from the
253  * error detection to actually handle it.
254  */
255 
256 struct to_kill {
257 	struct list_head nd;
258 	struct task_struct *tsk;
259 	unsigned long addr;
260 	short size_shift;
261 };
262 
263 /*
264  * Send all the processes who have the page mapped a signal.
265  * ``action optional'' if they are not immediately affected by the error
266  * ``action required'' if the error happened in the current execution context
267  */
268 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
269 {
270 	struct task_struct *t = tk->tsk;
271 	short addr_lsb = tk->size_shift;
272 	int ret = 0;
273 
274 	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
275 			pfn, t->comm, t->pid);
276 
277 	if ((flags & MF_ACTION_REQUIRED) && (t == current))
278 		ret = force_sig_mceerr(BUS_MCEERR_AR,
279 				 (void __user *)tk->addr, addr_lsb);
280 	else
281 		/*
282 		 * Signal other processes sharing the page if they have
283 		 * PF_MCE_EARLY set.
284 		 * Don't use force here, it's convenient if the signal
285 		 * can be temporarily blocked.
286 		 * This could cause a loop when the user sets SIGBUS
287 		 * to SIG_IGN, but hopefully no one will do that?
288 		 */
289 		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
290 				      addr_lsb, t);
291 	if (ret < 0)
292 		pr_info("Error sending signal to %s:%d: %d\n",
293 			t->comm, t->pid, ret);
294 	return ret;
295 }
296 
297 /*
298  * Unknown page type encountered. Try to check whether it can become PageLRU
299  * by draining the per-CPU LRU caches via lru_add_drain_all().
300  */
301 void shake_page(struct page *p)
302 {
303 	if (PageHuge(p))
304 		return;
305 
306 	if (!PageSlab(p)) {
307 		lru_add_drain_all();
308 		if (PageLRU(p) || is_free_buddy_page(p))
309 			return;
310 	}
311 
312 	/*
313 	 * TODO: Could shrink slab caches here if a lightweight range-based
314 	 * shrinker will be available.
315 	 */
316 }
317 EXPORT_SYMBOL_GPL(shake_page);
318 
319 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
320 		unsigned long address)
321 {
322 	unsigned long ret = 0;
323 	pgd_t *pgd;
324 	p4d_t *p4d;
325 	pud_t *pud;
326 	pmd_t *pmd;
327 	pte_t *pte;
328 
329 	VM_BUG_ON_VMA(address == -EFAULT, vma);
330 	pgd = pgd_offset(vma->vm_mm, address);
331 	if (!pgd_present(*pgd))
332 		return 0;
333 	p4d = p4d_offset(pgd, address);
334 	if (!p4d_present(*p4d))
335 		return 0;
336 	pud = pud_offset(p4d, address);
337 	if (!pud_present(*pud))
338 		return 0;
339 	if (pud_devmap(*pud))
340 		return PUD_SHIFT;
341 	pmd = pmd_offset(pud, address);
342 	if (!pmd_present(*pmd))
343 		return 0;
344 	if (pmd_devmap(*pmd))
345 		return PMD_SHIFT;
346 	pte = pte_offset_map(pmd, address);
347 	if (pte_present(*pte) && pte_devmap(*pte))
348 		ret = PAGE_SHIFT;
349 	pte_unmap(pte);
350 	return ret;
351 }
352 
353 /*
354  * Failure handling: if we can't find or can't kill a process there's
355  * not much we can do.  We just print a message and otherwise ignore it.
356  */
357 
358 #define FSDAX_INVALID_PGOFF ULONG_MAX
359 
360 /*
361  * Schedule a process for later kill.
362  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
363  *
364  * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
365  * filesystem with a memory failure handler has claimed the
366  * memory_failure event. In all other cases, page->index and
367  * page->mapping are sufficient for mapping the page back to its
368  * corresponding user virtual address.
369  */
370 static void add_to_kill(struct task_struct *tsk, struct page *p,
371 			pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
372 			struct list_head *to_kill)
373 {
374 	struct to_kill *tk;
375 
376 	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
377 	if (!tk) {
378 		pr_err("Out of memory while machine check handling\n");
379 		return;
380 	}
381 
382 	tk->addr = page_address_in_vma(p, vma);
383 	if (is_zone_device_page(p)) {
384 		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
385 			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
386 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
387 	} else
388 		tk->size_shift = page_shift(compound_head(p));
389 
390 	/*
391 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, since
392 	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
393 	 * "tk->size_shift == 0" effectively checks for no mapping on
394 	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
395 	 * to a process' address space, it's possible not all N VMAs
396 	 * contain mappings for the page, but at least one VMA does.
397 	 * Only deliver SIGBUS with payload derived from the VMA that
398 	 * has a mapping for the page.
399 	 */
400 	if (tk->addr == -EFAULT) {
401 		pr_info("Unable to find user space address %lx in %s\n",
402 			page_to_pfn(p), tsk->comm);
403 	} else if (tk->size_shift == 0) {
404 		kfree(tk);
405 		return;
406 	}
407 
408 	get_task_struct(tsk);
409 	tk->tsk = tsk;
410 	list_add_tail(&tk->nd, to_kill);
411 }
412 
413 /*
414  * Kill the processes that have been collected earlier.
415  *
416  * Only do anything when FORCEKILL is set, otherwise just free the
417  * list (this is used for clean pages which do not need killing).
418  * Also, when FAIL is set, do a force kill because something went
419  * wrong earlier.
420  */
421 static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
422 		unsigned long pfn, int flags)
423 {
424 	struct to_kill *tk, *next;
425 
426 	list_for_each_entry_safe(tk, next, to_kill, nd) {
427 		if (forcekill) {
428 			/*
429 			 * In case something went wrong with munmapping
430 			 * make sure the process doesn't catch the
431 			 * signal and then access the memory. Just kill it.
432 			 */
433 			if (fail || tk->addr == -EFAULT) {
434 				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
435 				       pfn, tk->tsk->comm, tk->tsk->pid);
436 				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
437 						 tk->tsk, PIDTYPE_PID);
438 			}
439 
440 			/*
441 			 * In theory the process could have mapped
442 			 * something else at that address in the meantime. We could
443 			 * check for that, but we need to tell the
444 			 * process anyway.
445 			 */
446 			else if (kill_proc(tk, pfn, flags) < 0)
447 				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
448 				       pfn, tk->tsk->comm, tk->tsk->pid);
449 		}
450 		list_del(&tk->nd);
451 		put_task_struct(tk->tsk);
452 		kfree(tk);
453 	}
454 }
455 
456 /*
457  * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
458  * on behalf of the thread group. Return task_struct of the (first found)
459  * dedicated thread if found, and return NULL otherwise.
460  *
461  * We already hold read_lock(&tasklist_lock) in the caller, so we don't
462  * have to call rcu_read_lock/unlock() in this function.
463  */
464 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
465 {
466 	struct task_struct *t;
467 
468 	for_each_thread(tsk, t) {
469 		if (t->flags & PF_MCE_PROCESS) {
470 			if (t->flags & PF_MCE_EARLY)
471 				return t;
472 		} else {
473 			if (sysctl_memory_failure_early_kill)
474 				return t;
475 		}
476 	}
477 	return NULL;
478 }
479 
480 /*
481  * Determine whether a given process is an "early kill" process which expects
482  * to be signaled when some page under the process is hwpoisoned.
483  * Return the task_struct of the dedicated thread (main thread unless explicitly
484  * specified) if the process is "early kill", and NULL otherwise.
485  *
486  * Note that the above is true for the Action Optional case. In the Action
487  * Required case the error is only meaningful to the current thread, which
488  * needs to be signaled with SIGBUS; the error is Action Optional for the other,
489  * non-current processes sharing the same error page, and if such a process is
490  * "early kill" the task_struct of its dedicated thread is also returned.
491  */
492 static struct task_struct *task_early_kill(struct task_struct *tsk,
493 					   int force_early)
494 {
495 	if (!tsk->mm)
496 		return NULL;
497 	/*
498 	 * Comparing ->mm here because current task might represent
499 	 * a subthread, while tsk always points to the main thread.
500 	 */
501 	if (force_early && tsk->mm == current->mm)
502 		return current;
503 
504 	return find_early_kill_thread(tsk);
505 }
506 
507 /*
508  * Collect processes when the error hit an anonymous page.
509  */
510 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
511 				int force_early)
512 {
513 	struct folio *folio = page_folio(page);
514 	struct vm_area_struct *vma;
515 	struct task_struct *tsk;
516 	struct anon_vma *av;
517 	pgoff_t pgoff;
518 
519 	av = folio_lock_anon_vma_read(folio, NULL);
520 	if (av == NULL)	/* Not actually mapped anymore */
521 		return;
522 
523 	pgoff = page_to_pgoff(page);
524 	read_lock(&tasklist_lock);
525 	for_each_process (tsk) {
526 		struct anon_vma_chain *vmac;
527 		struct task_struct *t = task_early_kill(tsk, force_early);
528 
529 		if (!t)
530 			continue;
531 		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
532 					       pgoff, pgoff) {
533 			vma = vmac->vma;
534 			if (vma->vm_mm != t->mm)
535 				continue;
536 			if (!page_mapped_in_vma(page, vma))
537 				continue;
538 			add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill);
539 		}
540 	}
541 	read_unlock(&tasklist_lock);
542 	anon_vma_unlock_read(av);
543 }
544 
545 /*
546  * Collect processes when the error hit a file mapped page.
547  */
548 static void collect_procs_file(struct page *page, struct list_head *to_kill,
549 				int force_early)
550 {
551 	struct vm_area_struct *vma;
552 	struct task_struct *tsk;
553 	struct address_space *mapping = page->mapping;
554 	pgoff_t pgoff;
555 
556 	i_mmap_lock_read(mapping);
557 	read_lock(&tasklist_lock);
558 	pgoff = page_to_pgoff(page);
559 	for_each_process(tsk) {
560 		struct task_struct *t = task_early_kill(tsk, force_early);
561 
562 		if (!t)
563 			continue;
564 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
565 				      pgoff) {
566 			/*
567 			 * Send early kill signal to tasks where a vma covers
568 			 * the page but the corrupted page is not necessarily
569 			 * mapped in its pte.
570 			 * Assume applications who requested early kill want
571 			 * to be informed of all such data corruptions.
572 			 */
573 			if (vma->vm_mm == t->mm)
574 				add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
575 					    to_kill);
576 		}
577 	}
578 	read_unlock(&tasklist_lock);
579 	i_mmap_unlock_read(mapping);
580 }
581 
582 #ifdef CONFIG_FS_DAX
583 /*
584  * Collect processes when the error hit a fsdax page.
585  */
586 static void collect_procs_fsdax(struct page *page,
587 		struct address_space *mapping, pgoff_t pgoff,
588 		struct list_head *to_kill)
589 {
590 	struct vm_area_struct *vma;
591 	struct task_struct *tsk;
592 
593 	i_mmap_lock_read(mapping);
594 	read_lock(&tasklist_lock);
595 	for_each_process(tsk) {
596 		struct task_struct *t = task_early_kill(tsk, true);
597 
598 		if (!t)
599 			continue;
600 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
601 			if (vma->vm_mm == t->mm)
602 				add_to_kill(t, page, pgoff, vma, to_kill);
603 		}
604 	}
605 	read_unlock(&tasklist_lock);
606 	i_mmap_unlock_read(mapping);
607 }
608 #endif /* CONFIG_FS_DAX */
609 
610 /*
611  * Collect the processes who have the corrupted page mapped to kill.
612  */
613 static void collect_procs(struct page *page, struct list_head *tokill,
614 				int force_early)
615 {
616 	if (!page->mapping)
617 		return;
618 
619 	if (PageAnon(page))
620 		collect_procs_anon(page, tokill, force_early);
621 	else
622 		collect_procs_file(page, tokill, force_early);
623 }
624 
625 struct hwp_walk {
626 	struct to_kill tk;
627 	unsigned long pfn;
628 	int flags;
629 };
630 
631 static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
632 {
633 	tk->addr = addr;
634 	tk->size_shift = shift;
635 }
636 
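/*
 * Return 1 and record the mapping address/size in *tk if the pte (either a
 * present pte or a hwpoison swap entry) refers to @poisoned_pfn, 0 otherwise.
 */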
637 static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
638 				unsigned long poisoned_pfn, struct to_kill *tk)
639 {
640 	unsigned long pfn = 0;
641 
642 	if (pte_present(pte)) {
643 		pfn = pte_pfn(pte);
644 	} else {
645 		swp_entry_t swp = pte_to_swp_entry(pte);
646 
647 		if (is_hwpoison_entry(swp))
648 			pfn = swp_offset_pfn(swp);
649 	}
650 
651 	if (!pfn || pfn != poisoned_pfn)
652 		return 0;
653 
654 	set_to_kill(tk, addr, shift);
655 	return 1;
656 }
657 
658 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
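/*
 * Check whether a pmd-mapped THP covers the poisoned pfn and, if so, record
 * the exact virtual address of the poisoned subpage in hwp->tk.
 */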
659 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
660 				      struct hwp_walk *hwp)
661 {
662 	pmd_t pmd = *pmdp;
663 	unsigned long pfn;
664 	unsigned long hwpoison_vaddr;
665 
666 	if (!pmd_present(pmd))
667 		return 0;
668 	pfn = pmd_pfn(pmd);
669 	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
670 		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
671 		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
672 		return 1;
673 	}
674 	return 0;
675 }
676 #else
677 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
678 				      struct hwp_walk *hwp)
679 {
680 	return 0;
681 }
682 #endif
683 
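/*
 * Page table walk callback: scan a pmd range (either a huge pmd or its ptes)
 * for an entry mapping hwp->pfn and stash the virtual address in hwp->tk.
 * Returns 1 as soon as a match is found.
 */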
684 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
685 			      unsigned long end, struct mm_walk *walk)
686 {
687 	struct hwp_walk *hwp = walk->private;
688 	int ret = 0;
689 	pte_t *ptep, *mapped_pte;
690 	spinlock_t *ptl;
691 
692 	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
693 	if (ptl) {
694 		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
695 		spin_unlock(ptl);
696 		goto out;
697 	}
698 
699 	if (pmd_trans_unstable(pmdp))
700 		goto out;
701 
702 	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
703 						addr, &ptl);
704 	for (; addr != end; ptep++, addr += PAGE_SIZE) {
705 		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
706 					     hwp->pfn, &hwp->tk);
707 		if (ret == 1)
708 			break;
709 	}
710 	pte_unmap_unlock(mapped_pte, ptl);
711 out:
712 	cond_resched();
713 	return ret;
714 }
715 
716 #ifdef CONFIG_HUGETLB_PAGE
717 static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
718 			    unsigned long addr, unsigned long end,
719 			    struct mm_walk *walk)
720 {
721 	struct hwp_walk *hwp = walk->private;
722 	pte_t pte = huge_ptep_get(ptep);
723 	struct hstate *h = hstate_vma(walk->vma);
724 
725 	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
726 				      hwp->pfn, &hwp->tk);
727 }
728 #else
729 #define hwpoison_hugetlb_range	NULL
730 #endif
731 
732 static const struct mm_walk_ops hwp_walk_ops = {
733 	.pmd_entry = hwpoison_pte_range,
734 	.hugetlb_entry = hwpoison_hugetlb_range,
735 };
736 
737 /*
738  * Sends SIGBUS to the current process with error info.
739  *
740  * This function is intended to handle "Action Required" MCEs on already
741  * hardware poisoned pages. They could happen, for example, when
742  * memory_failure() failed to unmap the error page at the first call, or
743  * when multiple local machine checks happened on different CPUs.
744  *
745  * MCE handler currently has no easy access to the error virtual address,
746  * so this function walks page table to find it. The returned virtual address
747  * is proper in most cases, but it could be wrong when the application
748  * process has multiple entries mapping the error page.
749  */
750 static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
751 				  int flags)
752 {
753 	int ret;
754 	struct hwp_walk priv = {
755 		.pfn = pfn,
756 	};
757 	priv.tk.tsk = p;
758 
759 	if (!p->mm)
760 		return -EFAULT;
761 
762 	mmap_read_lock(p->mm);
763 	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
764 			      (void *)&priv);
765 	if (ret == 1 && priv.tk.addr)
766 		kill_proc(&priv.tk, pfn, flags);
767 	else
768 		ret = 0;
769 	mmap_read_unlock(p->mm);
770 	return ret > 0 ? -EHWPOISON : -EFAULT;
771 }
772 
773 static const char *action_name[] = {
774 	[MF_IGNORED] = "Ignored",
775 	[MF_FAILED] = "Failed",
776 	[MF_DELAYED] = "Delayed",
777 	[MF_RECOVERED] = "Recovered",
778 };
779 
780 static const char * const action_page_types[] = {
781 	[MF_MSG_KERNEL]			= "reserved kernel page",
782 	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
783 	[MF_MSG_SLAB]			= "kernel slab page",
784 	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
785 	[MF_MSG_HUGE]			= "huge page",
786 	[MF_MSG_FREE_HUGE]		= "free huge page",
787 	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
788 	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
789 	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
790 	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
791 	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
792 	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
793 	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
794 	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
795 	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
796 	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
797 	[MF_MSG_BUDDY]			= "free buddy page",
798 	[MF_MSG_DAX]			= "dax page",
799 	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
800 	[MF_MSG_UNKNOWN]		= "unknown page",
801 };
802 
803 /*
804  * XXX: It is possible that a page is isolated from LRU cache,
805  * and then kept in the swap cache or fails to be removed from the page cache.
806  * The page count will stop it from being freed by unpoison.
807  * Stress tests should be aware of this memory leak problem.
808  */
809 static int delete_from_lru_cache(struct page *p)
810 {
811 	if (!isolate_lru_page(p)) {
812 		/*
813 		 * Clear sensible page flags, so that the buddy system won't
814 		 * complain when the page is unpoison-and-freed.
815 		 */
816 		ClearPageActive(p);
817 		ClearPageUnevictable(p);
818 
819 		/*
820 		 * Poisoned page might never drop its ref count to 0 so we have
821 		 * to uncharge it manually from its memcg.
822 		 */
823 		mem_cgroup_uncharge(page_folio(p));
824 
825 		/*
826 		 * drop the page count elevated by isolate_lru_page()
827 		 */
828 		put_page(p);
829 		return 0;
830 	}
831 	return -EIO;
832 }
833 
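/*
 * Try to remove the error page from its mapping, preferably via the
 * filesystem's ->error_remove_page() and otherwise by invalidating it.
 * Returns MF_RECOVERED on success, MF_FAILED otherwise.
 */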
834 static int truncate_error_page(struct page *p, unsigned long pfn,
835 				struct address_space *mapping)
836 {
837 	int ret = MF_FAILED;
838 
839 	if (mapping->a_ops->error_remove_page) {
840 		int err = mapping->a_ops->error_remove_page(mapping, p);
841 
842 		if (err != 0) {
843 			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
844 		} else if (page_has_private(p) &&
845 			   !try_to_release_page(p, GFP_NOIO)) {
846 			pr_info("%#lx: failed to release buffers\n", pfn);
847 		} else {
848 			ret = MF_RECOVERED;
849 		}
850 	} else {
851 		/*
852 		 * If the file system doesn't support it just invalidate
853 		 * This fails on dirty or anything with private pages
854 		 */
855 		if (invalidate_inode_page(p))
856 			ret = MF_RECOVERED;
857 		else
858 			pr_info("%#lx: Failed to invalidate\n",	pfn);
859 	}
860 
861 	return ret;
862 }
863 
864 struct page_state {
865 	unsigned long mask;
866 	unsigned long res;
867 	enum mf_action_page_type type;
868 
869 	/* Callback ->action() has to unlock the relevant page inside it. */
870 	int (*action)(struct page_state *ps, struct page *p);
871 };
872 
873 /*
874  * Return true if page is still referenced by others, otherwise return
875  * false.
876  *
877  * The extra_pins is true when one extra refcount is expected.
878  */
879 static bool has_extra_refcount(struct page_state *ps, struct page *p,
880 			       bool extra_pins)
881 {
882 	int count = page_count(p) - 1;
883 
884 	if (extra_pins)
885 		count -= 1;
886 
887 	if (count > 0) {
888 		pr_err("%#lx: %s still referenced by %d users\n",
889 		       page_to_pfn(p), action_page_types[ps->type], count);
890 		return true;
891 	}
892 
893 	return false;
894 }
895 
896 /*
897  * Error hit kernel page.
898  * Do nothing; try to be lucky and just not touch it. For a few cases we
899  * could be more sophisticated.
900  */
901 static int me_kernel(struct page_state *ps, struct page *p)
902 {
903 	unlock_page(p);
904 	return MF_IGNORED;
905 }
906 
907 /*
908  * Page in unknown state. Do nothing.
909  */
910 static int me_unknown(struct page_state *ps, struct page *p)
911 {
912 	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
913 	unlock_page(p);
914 	return MF_FAILED;
915 }
916 
917 /*
918  * Clean (or cleaned) page cache page.
919  */
920 static int me_pagecache_clean(struct page_state *ps, struct page *p)
921 {
922 	int ret;
923 	struct address_space *mapping;
924 	bool extra_pins;
925 
926 	delete_from_lru_cache(p);
927 
928 	/*
929 	 * For anonymous pages we're done; the only reference left
930 	 * should be the one memory_failure() holds.
931 	 */
932 	if (PageAnon(p)) {
933 		ret = MF_RECOVERED;
934 		goto out;
935 	}
936 
937 	/*
938 	 * Now truncate the page in the page cache. This is really
939 	 * more like a "temporary hole punch".
940 	 * Don't do this for block devices when someone else
941 	 * has a reference, because it could be file system metadata
942 	 * and that's not safe to truncate.
943 	 */
944 	mapping = page_mapping(p);
945 	if (!mapping) {
946 		/*
947 		 * Page has been torn down in the meantime
948 		 */
949 		ret = MF_FAILED;
950 		goto out;
951 	}
952 
953 	/*
954 	 * A shmem page is kept in the page cache instead of being truncated,
955 	 * so it is expected to have an extra refcount after error-handling.
956 	 */
957 	extra_pins = shmem_mapping(mapping);
958 
959 	/*
960 	 * Truncation is a bit tricky. Enable it per file system for now.
961 	 *
962 	 * Open: to take i_rwsem or not for this? Right now we don't.
963 	 */
964 	ret = truncate_error_page(p, page_to_pfn(p), mapping);
965 	if (has_extra_refcount(ps, p, extra_pins))
966 		ret = MF_FAILED;
967 
968 out:
969 	unlock_page(p);
970 
971 	return ret;
972 }
973 
974 /*
975  * Dirty pagecache page
976  * Issues: when the error hit a hole page the error is not properly
977  * propagated.
978  */
979 static int me_pagecache_dirty(struct page_state *ps, struct page *p)
980 {
981 	struct address_space *mapping = page_mapping(p);
982 
983 	SetPageError(p);
984 	/* TBD: print more information about the file. */
985 	if (mapping) {
986 		/*
987 		 * The IO error will be reported by write(), fsync(), etc.,
988 		 * which check the mapping.
989 		 * This way the application knows that something went
990 		 * wrong with its dirty file data.
991 		 *
992 		 * There's one open issue:
993 		 *
994 		 * The EIO will be only reported on the next IO
995 		 * operation and then cleared through the IO map.
996 		 * Normally Linux has two mechanisms to pass IO error
997 		 * first through the AS_EIO flag in the address space
998 		 * and then through the PageError flag in the page.
999 		 * Since we drop pages on memory failure handling the
1000 		 * only mechanism open to use is through AS_EIO.
1001 		 *
1002 		 * This has the disadvantage that it gets cleared on
1003 		 * the first operation that returns an error, while
1004 		 * the PageError bit is more sticky and only cleared
1005 		 * when the page is reread or dropped.  If an
1006 		 * application assumes it will always get an error on
1007 		 * fsync, but does other operations on the fd before,
1008 		 * and the page is dropped in between, then the error
1009 		 * will not be properly reported.
1010 		 *
1011 		 * This can already happen even without hwpoisoned
1012 		 * pages: first on metadata IO errors (which only
1013 		 * report through AS_EIO) or when the page is dropped
1014 		 * at the wrong time.
1015 		 *
1016 		 * So right now we assume that the application DTRT on
1017 		 * the first EIO, but we're not worse than other parts
1018 		 * of the kernel.
1019 		 */
1020 		mapping_set_error(mapping, -EIO);
1021 	}
1022 
1023 	return me_pagecache_clean(ps, p);
1024 }
1025 
1026 /*
1027  * Clean and dirty swap cache.
1028  *
1029  * Dirty swap cache page is tricky to handle. The page could live both in page
1030  * cache and swap cache(ie. page is freshly swapped in). So it could be
1031  * referenced concurrently by 2 types of PTEs:
1032  * normal PTEs and swap PTEs. We try to handle them consistently by calling
1033  * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
1034  * and then
1035  *      - clear dirty bit to prevent IO
1036  *      - remove from LRU
1037  *      - but keep in the swap cache, so that when we return to it on
1038  *        a later page fault, we know the application is accessing
1039  *        corrupted data and shall be killed (we installed simple
1040  *        interception code in do_swap_page to catch it).
1041  *
1042  * Clean swap cache pages can be directly isolated. A later page fault will
1043  * bring in the known good data from disk.
1044  */
1045 static int me_swapcache_dirty(struct page_state *ps, struct page *p)
1046 {
1047 	int ret;
1048 	bool extra_pins = false;
1049 
1050 	ClearPageDirty(p);
1051 	/* Trigger EIO in shmem: */
1052 	ClearPageUptodate(p);
1053 
1054 	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
1055 	unlock_page(p);
1056 
1057 	if (ret == MF_DELAYED)
1058 		extra_pins = true;
1059 
1060 	if (has_extra_refcount(ps, p, extra_pins))
1061 		ret = MF_FAILED;
1062 
1063 	return ret;
1064 }
1065 
1066 static int me_swapcache_clean(struct page_state *ps, struct page *p)
1067 {
1068 	struct folio *folio = page_folio(p);
1069 	int ret;
1070 
1071 	delete_from_swap_cache(folio);
1072 
1073 	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
1074 	folio_unlock(folio);
1075 
1076 	if (has_extra_refcount(ps, p, false))
1077 		ret = MF_FAILED;
1078 
1079 	return ret;
1080 }
1081 
1082 /*
1083  * Huge pages. Needs work.
1084  * Issues:
1085  * - An error on a hugepage is contained in the hugepage unit (not in the raw page unit).
1086  *   To narrow the kill region down to one page, we need to break up the pmd.
1087  */
1088 static int me_huge_page(struct page_state *ps, struct page *p)
1089 {
1090 	int res;
1091 	struct page *hpage = compound_head(p);
1092 	struct address_space *mapping;
1093 
1094 	if (!PageHuge(hpage))
1095 		return MF_DELAYED;
1096 
1097 	mapping = page_mapping(hpage);
1098 	if (mapping) {
1099 		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
1100 		unlock_page(hpage);
1101 	} else {
1102 		unlock_page(hpage);
1103 		/*
1104 		 * migration entry prevents later access on error hugepage,
1105 		 * so we can free and dissolve it into buddy to save healthy
1106 		 * subpages.
1107 		 */
1108 		put_page(hpage);
1109 		if (__page_handle_poison(p) >= 0) {
1110 			page_ref_inc(p);
1111 			res = MF_RECOVERED;
1112 		} else {
1113 			res = MF_FAILED;
1114 		}
1115 	}
1116 
1117 	if (has_extra_refcount(ps, p, false))
1118 		res = MF_FAILED;
1119 
1120 	return res;
1121 }
1122 
1123 /*
1124  * Various page states we can handle.
1125  *
1126  * A page state is defined by its current page->flags bits.
1127  * The table matches them in order and calls the right handler.
1128  *
1129  * This is quite tricky because we can access the page at any time
1130  * in its life cycle, so all accesses have to be extremely careful.
1131  *
1132  * This is not complete. More states could be added.
1133  * For any missing state don't attempt recovery.
1134  */
1135 
1136 #define dirty		(1UL << PG_dirty)
1137 #define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
1138 #define unevict		(1UL << PG_unevictable)
1139 #define mlock		(1UL << PG_mlocked)
1140 #define lru		(1UL << PG_lru)
1141 #define head		(1UL << PG_head)
1142 #define slab		(1UL << PG_slab)
1143 #define reserved	(1UL << PG_reserved)
1144 
1145 static struct page_state error_states[] = {
1146 	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
1147 	/*
1148 	 * free pages are specially detected outside this table:
1149 	 * PG_buddy pages make up only a small fraction of all free pages.
1150 	 */
1151 
1152 	/*
1153 	 * Could in theory check if the slab page is free or if we can drop
1154 	 * currently unused objects without touching them. But just
1155 	 * treat it as a standard kernel page for now.
1156 	 */
1157 	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
1158 
1159 	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
1160 
1161 	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
1162 	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
1163 
1164 	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
1165 	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
1166 
1167 	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
1168 	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
1169 
1170 	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
1171 	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
1172 
1173 	/*
1174 	 * Catchall entry: must be at end.
1175 	 */
1176 	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
1177 };
1178 
1179 #undef dirty
1180 #undef sc
1181 #undef unevict
1182 #undef mlock
1183 #undef lru
1184 #undef head
1185 #undef slab
1186 #undef reserved
1187 
1188 /*
1189  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
1190  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
1191  */
1192 static int action_result(unsigned long pfn, enum mf_action_page_type type,
1193 			 enum mf_result result)
1194 {
1195 	trace_memory_failure_event(pfn, type, result);
1196 
1197 	num_poisoned_pages_inc();
1198 	pr_err("%#lx: recovery action for %s: %s\n",
1199 		pfn, action_page_types[type], action_name[result]);
1200 
1201 	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
1202 }
1203 
1204 static int page_action(struct page_state *ps, struct page *p,
1205 			unsigned long pfn)
1206 {
1207 	int result;
1208 
1209 	/* page p should be unlocked after returning from ps->action().  */
1210 	result = ps->action(ps, p);
1211 
1212 	/* Could do more checks here if page looks ok */
1213 	/*
1214 	 * Could adjust zone counters here to correct for the missing page.
1215 	 */
1216 
1217 	return action_result(pfn, ps->type, result);
1218 }
1219 
1220 static inline bool PageHWPoisonTakenOff(struct page *page)
1221 {
1222 	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
1223 }
1224 
1225 void SetPageHWPoisonTakenOff(struct page *page)
1226 {
1227 	set_page_private(page, MAGIC_HWPOISON);
1228 }
1229 
1230 void ClearPageHWPoisonTakenOff(struct page *page)
1231 {
1232 	if (PageHWPoison(page))
1233 		set_page_private(page, 0);
1234 }
1235 
1236 /*
1237  * Return true if the page type of a given page is supported by the hwpoison
1238  * mechanism (though handling may still fail), otherwise false.  This function
1239  * does not return true for hugetlb or device memory pages, so it's assumed
1240  * to be called only in the context where we never have such pages.
1241  */
1242 static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
1243 {
1244 	/* Soft offline could migrate non-LRU movable pages */
1245 	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
1246 		return true;
1247 
1248 	return PageLRU(page) || is_free_buddy_page(page);
1249 }
1250 
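/*
 * Try to pin the head page of @page if it is in a state hwpoison can handle.
 * Returns 1 with a refcount held, 0 if no refcount was taken, or -EBUSY for
 * an unsupported page type.
 */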
1251 static int __get_hwpoison_page(struct page *page, unsigned long flags)
1252 {
1253 	struct page *head = compound_head(page);
1254 	int ret = 0;
1255 	bool hugetlb = false;
1256 
1257 	ret = get_hwpoison_huge_page(head, &hugetlb, false);
1258 	if (hugetlb)
1259 		return ret;
1260 
1261 	/*
1262 	 * This check prevents calling get_page_unless_zero() for any
1263 	 * unsupported type of page in order to reduce the risk of unexpected
1264 	 * races caused by taking a page refcount.
1265 	 */
1266 	if (!HWPoisonHandlable(head, flags))
1267 		return -EBUSY;
1268 
1269 	if (get_page_unless_zero(head)) {
1270 		if (head == compound_head(page))
1271 			return 1;
1272 
1273 		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
1274 		put_page(head);
1275 	}
1276 
1277 	return 0;
1278 }
1279 
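/*
 * Try to grab a refcount on a page in a handlable state, retrying a few
 * times (and calling shake_page()) to ride out races with page allocation
 * and freeing.  Returns 1 with a refcount held, 0 when no refcount was
 * taken (e.g. a free page), or -EBUSY/-EIO on failure.
 */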
1280 static int get_any_page(struct page *p, unsigned long flags)
1281 {
1282 	int ret = 0, pass = 0;
1283 	bool count_increased = false;
1284 
1285 	if (flags & MF_COUNT_INCREASED)
1286 		count_increased = true;
1287 
1288 try_again:
1289 	if (!count_increased) {
1290 		ret = __get_hwpoison_page(p, flags);
1291 		if (!ret) {
1292 			if (page_count(p)) {
1293 				/* We raced with an allocation, retry. */
1294 				if (pass++ < 3)
1295 					goto try_again;
1296 				ret = -EBUSY;
1297 			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1298 				/* We raced with put_page, retry. */
1299 				if (pass++ < 3)
1300 					goto try_again;
1301 				ret = -EIO;
1302 			}
1303 			goto out;
1304 		} else if (ret == -EBUSY) {
1305 			/*
1306 			 * We raced with (possibly temporary) unhandlable
1307 			 * page, retry.
1308 			 */
1309 			if (pass++ < 3) {
1310 				shake_page(p);
1311 				goto try_again;
1312 			}
1313 			ret = -EIO;
1314 			goto out;
1315 		}
1316 	}
1317 
1318 	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1319 		ret = 1;
1320 	} else {
1321 		/*
1322 		 * A page we cannot handle. Check whether we can turn
1323 		 * it into something we can handle.
1324 		 */
1325 		if (pass++ < 3) {
1326 			put_page(p);
1327 			shake_page(p);
1328 			count_increased = false;
1329 			goto try_again;
1330 		}
1331 		put_page(p);
1332 		ret = -EIO;
1333 	}
1334 out:
1335 	if (ret == -EIO)
1336 		pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
1337 
1338 	return ret;
1339 }
1340 
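/*
 * Refcounting for unpoisoning: hugetlb pages go through
 * get_hwpoison_huge_page(), and pages that were taken off the buddy list
 * return -EHWPOISON so the caller can undo both the poisoning and the
 * isolation.
 */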
1341 static int __get_unpoison_page(struct page *page)
1342 {
1343 	struct page *head = compound_head(page);
1344 	int ret = 0;
1345 	bool hugetlb = false;
1346 
1347 	ret = get_hwpoison_huge_page(head, &hugetlb, true);
1348 	if (hugetlb)
1349 		return ret;
1350 
1351 	/*
1352 	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
1353 	 * but also isolated from the buddy freelist, so we need to identify
1354 	 * this state and cancel both operations to unpoison.
1355 	 */
1356 	if (PageHWPoisonTakenOff(page))
1357 		return -EHWPOISON;
1358 
1359 	return get_page_unless_zero(page) ? 1 : 0;
1360 }
1361 
1362 /**
1363  * get_hwpoison_page() - Get refcount for memory error handling
1364  * @p:		Raw error page (hit by memory error)
1365  * @flags:	Flags controlling behavior of error handling
1366  *
1367  * get_hwpoison_page() takes a page refcount of an error page to handle memory
1368  * error on it, after checking that the error page is in a well-defined state
1369  * (defined as a page type on which we can successfully handle the memory error,
1370  * such as an LRU page or a hugetlb page).
1371  *
1372  * Memory error handling could be triggered at any time on any type of page,
1373  * so it's prone to race with typical memory management lifecycle (like
1374  * allocation and free).  So to avoid such races, get_hwpoison_page() takes
1375  * extra care for the error page's state (as done in __get_hwpoison_page()),
1376  * and has some retry logic in get_any_page().
1377  *
1378  * When called from unpoison_memory(), the caller should already ensure that
1379  * the given page has PG_hwpoison. So it's never reused for other page
1380  * allocations, and __get_unpoison_page() never races with them.
1381  *
1382  * Return: 0 on failure,
1383  *         1 on success for in-use pages in a well-defined state,
1384  *         -EIO for pages on which we can not handle memory errors,
1385  *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
1386  *         operations like allocation and free,
1387  *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
1388  */
1389 static int get_hwpoison_page(struct page *p, unsigned long flags)
1390 {
1391 	int ret;
1392 
1393 	zone_pcp_disable(page_zone(p));
1394 	if (flags & MF_UNPOISON)
1395 		ret = __get_unpoison_page(p);
1396 	else
1397 		ret = get_any_page(p, flags);
1398 	zone_pcp_enable(page_zone(p));
1399 
1400 	return ret;
1401 }
1402 
1403 /*
1404  * Do all that is necessary to remove user space mappings. Unmap
1405  * the pages and send SIGBUS to the processes if the data was dirty.
1406  */
1407 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1408 				  int flags, struct page *hpage)
1409 {
1410 	struct folio *folio = page_folio(hpage);
1411 	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
1412 	struct address_space *mapping;
1413 	LIST_HEAD(tokill);
1414 	bool unmap_success;
1415 	int forcekill;
1416 	bool mlocked = PageMlocked(hpage);
1417 
1418 	/*
1419 	 * Here we are interested only in user-mapped pages, so skip any
1420 	 * other types of pages.
1421 	 */
1422 	if (PageReserved(p) || PageSlab(p) || PageTable(p))
1423 		return true;
1424 	if (!(PageLRU(hpage) || PageHuge(p)))
1425 		return true;
1426 
1427 	/*
1428 	 * This check implies we don't kill processes if their pages
1429 	 * are in the swap cache early. Those are always late kills.
1430 	 */
1431 	if (!page_mapped(hpage))
1432 		return true;
1433 
1434 	if (PageKsm(p)) {
1435 		pr_err("%#lx: can't handle KSM pages.\n", pfn);
1436 		return false;
1437 	}
1438 
1439 	if (PageSwapCache(p)) {
1440 		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
1441 		ttu |= TTU_IGNORE_HWPOISON;
1442 	}
1443 
1444 	/*
1445 	 * Propagate the dirty bit from PTEs to struct page first, because we
1446 	 * need this to decide if we should kill or just drop the page.
1447 	 * XXX: the dirty test could be racy: set_page_dirty() may not always
1448 	 * be called inside page lock (it's recommended but not enforced).
1449 	 */
1450 	mapping = page_mapping(hpage);
1451 	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
1452 	    mapping_can_writeback(mapping)) {
1453 		if (page_mkclean(hpage)) {
1454 			SetPageDirty(hpage);
1455 		} else {
1456 			ttu |= TTU_IGNORE_HWPOISON;
1457 			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
1458 				pfn);
1459 		}
1460 	}
1461 
1462 	/*
1463 	 * First collect all the processes that have the page
1464 	 * mapped in dirty form.  This has to be done before try_to_unmap,
1465 	 * because ttu takes the rmap data structures down.
1466 	 */
1467 	collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1468 
1469 	if (PageHuge(hpage) && !PageAnon(hpage)) {
1470 		/*
1471 		 * For hugetlb pages in shared mappings, try_to_unmap
1472 		 * could potentially call huge_pmd_unshare.  Because of
1473 		 * this, take semaphore in write mode here and set
1474 		 * TTU_RMAP_LOCKED to indicate we have taken the lock
1475 		 * at this higher level.
1476 		 */
1477 		mapping = hugetlb_page_mapping_lock_write(hpage);
1478 		if (mapping) {
1479 			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
1480 			i_mmap_unlock_write(mapping);
1481 		} else
1482 			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
1483 	} else {
1484 		try_to_unmap(folio, ttu);
1485 	}
1486 
1487 	unmap_success = !page_mapped(hpage);
1488 	if (!unmap_success)
1489 		pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
1490 		       pfn, page_mapcount(hpage));
1491 
1492 	/*
1493 	 * try_to_unmap() might put mlocked page in lru cache, so call
1494 	 * shake_page() again to ensure that it's flushed.
1495 	 */
1496 	if (mlocked)
1497 		shake_page(hpage);
1498 
1499 	/*
1500 	 * Now that the dirty bit has been propagated to the
1501 	 * struct page and all unmaps done we can decide if
1502 	 * killing is needed or not.  Only kill when the page
1503 	 * was dirty or the process is not restartable,
1504 	 * otherwise the tokill list is merely
1505 	 * freed.  When there was a problem unmapping earlier
1506 	 * use a more forceful, uncatchable kill to prevent
1507 	 * any accesses to the poisoned memory.
1508 	 */
1509 	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
1510 		    !unmap_success;
1511 	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1512 
1513 	return unmap_success;
1514 }
1515 
1516 static int identify_page_state(unsigned long pfn, struct page *p,
1517 				unsigned long page_flags)
1518 {
1519 	struct page_state *ps;
1520 
1521 	/*
1522 	 * The first check uses the current page flags which may not have any
1523 	 * relevant information. The second check with the saved page flags is
1524 	 * carried out only if the first check can't determine the page status.
1525 	 */
1526 	for (ps = error_states;; ps++)
1527 		if ((p->flags & ps->mask) == ps->res)
1528 			break;
1529 
1530 	page_flags |= (p->flags & (1UL << PG_dirty));
1531 
1532 	if (!ps->mask)
1533 		for (ps = error_states;; ps++)
1534 			if ((page_flags & ps->mask) == ps->res)
1535 				break;
1536 	return page_action(ps, p, pfn);
1537 }
1538 
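/*
 * Split a THP so that only the raw error page has to be taken offline.
 * On failure the caller's page reference is dropped.
 */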
1539 static int try_to_split_thp_page(struct page *page)
1540 {
1541 	int ret;
1542 
1543 	lock_page(page);
1544 	ret = split_huge_page(page);
1545 	unlock_page(page);
1546 
1547 	if (unlikely(ret))
1548 		put_page(page);
1549 
1550 	return ret;
1551 }
1552 
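/*
 * Unmap the poisoned range from every address space sharing @mapping, using
 * the largest mapping size recorded in @to_kill, then signal the collected
 * processes.
 */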
1553 static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
1554 		struct address_space *mapping, pgoff_t index, int flags)
1555 {
1556 	struct to_kill *tk;
1557 	unsigned long size = 0;
1558 
1559 	list_for_each_entry(tk, to_kill, nd)
1560 		if (tk->size_shift)
1561 			size = max(size, 1UL << tk->size_shift);
1562 
1563 	if (size) {
1564 		/*
1565 		 * Unmap the largest mapping to avoid breaking up device-dax
1566 		 * mappings which are constant size. The actual size of the
1567 		 * mapping being torn down is communicated in siginfo, see
1568 		 * kill_proc()
1569 		 */
1570 		loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
1571 
1572 		unmap_mapping_range(mapping, start, size, 0);
1573 	}
1574 
1575 	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
1576 }
1577 
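/*
 * Generic memory failure handling for ZONE_DEVICE (dax) pages: mark the page
 * hwpoisoned, unmap it from all users and send SIGBUS to every process that
 * has it mapped.
 */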
1578 static int mf_generic_kill_procs(unsigned long long pfn, int flags,
1579 		struct dev_pagemap *pgmap)
1580 {
1581 	struct page *page = pfn_to_page(pfn);
1582 	LIST_HEAD(to_kill);
1583 	dax_entry_t cookie;
1584 	int rc = 0;
1585 
1586 	/*
1587 	 * Pages instantiated by device-dax (not filesystem-dax)
1588 	 * may be compound pages.
1589 	 */
1590 	page = compound_head(page);
1591 
1592 	/*
1593 	 * Prevent the inode from being freed while we are interrogating
1594 	 * the address_space, typically this would be handled by
1595 	 * lock_page(), but dax pages do not use the page lock. This
1596 	 * also prevents changes to the mapping of this pfn until
1597 	 * poison signaling is complete.
1598 	 */
1599 	cookie = dax_lock_page(page);
1600 	if (!cookie)
1601 		return -EBUSY;
1602 
1603 	if (hwpoison_filter(page)) {
1604 		rc = -EOPNOTSUPP;
1605 		goto unlock;
1606 	}
1607 
1608 	switch (pgmap->type) {
1609 	case MEMORY_DEVICE_PRIVATE:
1610 	case MEMORY_DEVICE_COHERENT:
1611 		/*
1612 		 * TODO: Handle device pages which may need coordination
1613 		 * with device-side memory.
1614 		 */
1615 		rc = -ENXIO;
1616 		goto unlock;
1617 	default:
1618 		break;
1619 	}
1620 
1621 	/*
1622 	 * Use this flag as an indication that the dax page has been
1623 	 * remapped UC to prevent speculative consumption of poison.
1624 	 */
1625 	SetPageHWPoison(page);
1626 
1627 	/*
1628 	 * Unlike System-RAM there is no possibility to swap in a
1629 	 * different physical page at a given virtual address, so all
1630 	 * userspace consumption of ZONE_DEVICE memory necessitates
1631 	 * SIGBUS (i.e. MF_MUST_KILL)
1632 	 */
1633 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1634 	collect_procs(page, &to_kill, true);
1635 
1636 	unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
1637 unlock:
1638 	dax_unlock_page(page, cookie);
1639 	return rc;
1640 }
1641 
1642 #ifdef CONFIG_FS_DAX
1643 /**
1644  * mf_dax_kill_procs - Collect and kill processes who are using this file range
1645  * @mapping:	address_space of the file in use
1646  * @index:	start pgoff of the range within the file
1647  * @count:	length of the range, in unit of PAGE_SIZE
1648  * @mf_flags:	memory failure flags
1649  */
1650 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
1651 		unsigned long count, int mf_flags)
1652 {
1653 	LIST_HEAD(to_kill);
1654 	dax_entry_t cookie;
1655 	struct page *page;
1656 	size_t end = index + count;
1657 
1658 	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1659 
1660 	for (; index < end; index++) {
1661 		page = NULL;
1662 		cookie = dax_lock_mapping_entry(mapping, index, &page);
1663 		if (!cookie)
1664 			return -EBUSY;
1665 		if (!page)
1666 			goto unlock;
1667 
1668 		SetPageHWPoison(page);
1669 
1670 		collect_procs_fsdax(page, mapping, index, &to_kill);
1671 		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
1672 				index, mf_flags);
1673 unlock:
1674 		dax_unlock_mapping_entry(mapping, index, cookie);
1675 	}
1676 	return 0;
1677 }
1678 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
1679 #endif /* CONFIG_FS_DAX */
1680 
1681 #ifdef CONFIG_HUGETLB_PAGE
1682 /*
1683  * Struct raw_hwp_page represents information about a "raw error page",
1684  * forming a singly linked list originating from the ->private field of
1685  * the SUBPAGE_INDEX_HWPOISON-th tail page.
1686  */
1687 struct raw_hwp_page {
1688 	struct llist_node node;
1689 	struct page *page;
1690 };
1691 
1692 static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
1693 {
1694 	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
1695 }
1696 
1697 static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
1698 {
1699 	struct llist_head *head;
1700 	struct llist_node *t, *tnode;
1701 	unsigned long count = 0;
1702 
1703 	head = raw_hwp_list_head(hpage);
1704 	llist_for_each_safe(tnode, t, head->first) {
1705 		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1706 
1707 		if (move_flag)
1708 			SetPageHWPoison(p->page);
1709 		kfree(p);
1710 		count++;
1711 	}
1712 	llist_del_all(head);
1713 	return count;
1714 }
1715 
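/*
 * Record @page in @hpage's raw_hwp list and set PG_hwpoison on the head page.
 * Returns 0 if this is the first error on the hugepage, -EHWPOISON otherwise.
 */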
1716 static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
1717 {
1718 	struct llist_head *head;
1719 	struct raw_hwp_page *raw_hwp;
1720 	struct llist_node *t, *tnode;
1721 	int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
1722 
1723 	/*
1724 	 * Once the hwpoison hugepage has lost reliable raw error info,
1725 	 * there is little point in keeping additional error info precisely,
1726 	 * so skip adding additional raw error info.
1727 	 */
1728 	if (HPageRawHwpUnreliable(hpage))
1729 		return -EHWPOISON;
1730 	head = raw_hwp_list_head(hpage);
1731 	llist_for_each_safe(tnode, t, head->first) {
1732 		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
1733 
1734 		if (p->page == page)
1735 			return -EHWPOISON;
1736 	}
1737 
1738 	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
1739 	if (raw_hwp) {
1740 		raw_hwp->page = page;
1741 		llist_add(&raw_hwp->node, head);
1742 		/* the first error event will be counted in action_result(). */
1743 		if (ret)
1744 			num_poisoned_pages_inc();
1745 	} else {
1746 		/*
1747 		 * Failed to save raw error info.  We no longer trace all
1748 		 * hwpoisoned subpages, and we need to refuse to free/dissolve
1749 		 * this hwpoisoned hugepage.
1750 		 */
1751 		SetHPageRawHwpUnreliable(hpage);
1752 		/*
1753 		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
1754 		 * used any more, so free it.
1755 		 */
1756 		__free_raw_hwp_pages(hpage, false);
1757 	}
1758 	return ret;
1759 }
1760 
1761 static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
1762 {
1763 	/*
1764 	 * HPageVmemmapOptimized hugepages can't be freed because struct
1765 	 * pages for tail pages are required but they don't exist.
1766 	 */
1767 	if (move_flag && HPageVmemmapOptimized(hpage))
1768 		return 0;
1769 
1770 	/*
1771 	 * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
1772 	 * definition.
1773 	 */
1774 	if (HPageRawHwpUnreliable(hpage))
1775 		return 0;
1776 
1777 	return __free_raw_hwp_pages(hpage, move_flag);
1778 }
1779 
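/*
 * Clear PG_hwpoison on the hugepage head and, where possible, move the
 * poison marks back onto the raw error subpages.
 */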
1780 void hugetlb_clear_page_hwpoison(struct page *hpage)
1781 {
1782 	if (HPageRawHwpUnreliable(hpage))
1783 		return;
1784 	ClearPageHWPoison(hpage);
1785 	free_raw_hwp_pages(hpage, true);
1786 }
1787 
1788 /*
1789  * Called from hugetlb code with hugetlb_lock held.
1790  *
1791  * Return values:
1792  *   0             - free hugepage
1793  *   1             - in-use hugepage
1794  *   2             - not a hugepage
1795  *   -EBUSY        - the hugepage is busy (try to retry)
1796  *   -EHWPOISON    - the hugepage is already hwpoisoned
1797  */
1798 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
1799 				 bool *migratable_cleared)
1800 {
1801 	struct page *page = pfn_to_page(pfn);
1802 	struct page *head = compound_head(page);
1803 	int ret = 2;	/* fallback to normal page handling */
1804 	bool count_increased = false;
1805 
1806 	if (!PageHeadHuge(head))
1807 		goto out;
1808 
1809 	if (flags & MF_COUNT_INCREASED) {
1810 		ret = 1;
1811 		count_increased = true;
1812 	} else if (HPageFreed(head)) {
1813 		ret = 0;
1814 	} else if (HPageMigratable(head)) {
1815 		ret = get_page_unless_zero(head);
1816 		if (ret)
1817 			count_increased = true;
1818 	} else {
1819 		ret = -EBUSY;
1820 		if (!(flags & MF_NO_RETRY))
1821 			goto out;
1822 	}
1823 
1824 	if (hugetlb_set_page_hwpoison(head, page)) {
1825 		ret = -EHWPOISON;
1826 		goto out;
1827 	}
1828 
1829 	/*
1830 	 * Clear HPageMigratable for hwpoisoned hugepages to prevent them
1831 	 * from being migrated by memory hotremove.
1832 	 */
1833 	if (count_increased && HPageMigratable(head)) {
1834 		ClearHPageMigratable(head);
1835 		*migratable_cleared = true;
1836 	}
1837 
1838 	return ret;
1839 out:
1840 	if (count_increased)
1841 		put_page(head);
1842 	return ret;
1843 }
1844 
1845 /*
1846  * Taking a refcount on a hugetlb page needs extra care about race
1847  * conditions with basic operations like hugepage allocation/free/demotion.
1848  * So some of the prechecks for hwpoison (pinning, and testing/setting
1849  * PageHWPoison) should be done within a single hugetlb_lock section.
1850  */
1851 static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1852 {
1853 	int res;
1854 	struct page *p = pfn_to_page(pfn);
1855 	struct page *head;
1856 	unsigned long page_flags;
1857 	bool migratable_cleared = false;
1858 
1859 	*hugetlb = 1;
1860 retry:
1861 	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
1862 	if (res == 2) { /* fallback to normal page handling */
1863 		*hugetlb = 0;
1864 		return 0;
1865 	} else if (res == -EHWPOISON) {
1866 		pr_err("%#lx: already hardware poisoned\n", pfn);
1867 		if (flags & MF_ACTION_REQUIRED) {
1868 			head = compound_head(p);
1869 			res = kill_accessing_process(current, page_to_pfn(head), flags);
1870 		}
1871 		return res;
1872 	} else if (res == -EBUSY) {
1873 		if (!(flags & MF_NO_RETRY)) {
1874 			flags |= MF_NO_RETRY;
1875 			goto retry;
1876 		}
1877 		return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
1878 	}
1879 
1880 	head = compound_head(p);
1881 	lock_page(head);
1882 
1883 	if (hwpoison_filter(p)) {
1884 		hugetlb_clear_page_hwpoison(head);
1885 		if (migratable_cleared)
1886 			SetHPageMigratable(head);
1887 		unlock_page(head);
1888 		if (res == 1)
1889 			put_page(head);
1890 		return -EOPNOTSUPP;
1891 	}
1892 
1893 	/*
1894 	 * Handle a free hugepage.  The possible race with hugepage allocation
1895 	 * or demotion can be prevented by the PageHWPoison flag.
1896 	 */
1897 	if (res == 0) {
1898 		unlock_page(head);
1899 		if (__page_handle_poison(p) >= 0) {
1900 			page_ref_inc(p);
1901 			res = MF_RECOVERED;
1902 		} else {
1903 			res = MF_FAILED;
1904 		}
1905 		return action_result(pfn, MF_MSG_FREE_HUGE, res);
1906 	}
1907 
1908 	page_flags = head->flags;
1909 
1910 	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
1911 		unlock_page(head);
1912 		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1913 	}
1914 
1915 	return identify_page_state(pfn, p, page_flags);
1916 }
1917 
1918 #else
1919 static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
1920 {
1921 	return 0;
1922 }
1923 
1924 static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
1925 {
1926 	return 0;
1927 }
1928 #endif	/* CONFIG_HUGETLB_PAGE */
1929 
1930 /* Drop the extra refcount in case we come from madvise() */
1931 static void put_ref_page(unsigned long pfn, int flags)
1932 {
1933 	struct page *page;
1934 
1935 	if (!(flags & MF_COUNT_INCREASED))
1936 		return;
1937 
1938 	page = pfn_to_page(pfn);
1939 	if (page)
1940 		put_page(page);
1941 }
1942 
1943 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1944 		struct dev_pagemap *pgmap)
1945 {
1946 	int rc = -ENXIO;
1947 
1948 	put_ref_page(pfn, flags);
1949 
1950 	/* device metadata space is not recoverable */
1951 	if (!pgmap_pfn_valid(pgmap, pfn))
1952 		goto out;
1953 
1954 	/*
1955 	 * Call the driver's implementation to handle the memory failure,
1956 	 * otherwise fall back to the generic handler.
1957 	 */
1958 	if (pgmap_has_memory_failure(pgmap)) {
1959 		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
1960 		/*
1961 		 * Fall back to generic handler too if operation is not
1962 		 * supported inside the driver/device/filesystem.
1963 		 */
1964 		if (rc != -EOPNOTSUPP)
1965 			goto out;
1966 	}
1967 
1968 	rc = mf_generic_kill_procs(pfn, flags, pgmap);
1969 out:
1970 	/* drop pgmap ref acquired in caller */
1971 	put_dev_pagemap(pgmap);
1972 	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1973 	return rc;
1974 }
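
/*
 * Illustrative sketch (not part of the original source): a ZONE_DEVICE
 * driver may supply its own handler through dev_pagemap_ops, which the
 * function above prefers over mf_generic_kill_procs().  The names below
 * are hypothetical; only the ->memory_failure() hook and its arguments
 * mirror the call made above.
 */
#if 0
static int example_pgmap_memory_failure(struct dev_pagemap *pgmap,
					unsigned long pfn,
					unsigned long nr_pages, int mf_flags)
{
	/*
	 * A real driver would map the pfn back to its device offset and
	 * notify users/filesystems.  Returning -EOPNOTSUPP makes
	 * memory_failure_dev_pagemap() fall back to the generic handler.
	 */
	return -EOPNOTSUPP;
}

static const struct dev_pagemap_ops example_pgmap_ops = {
	.memory_failure	= example_pgmap_memory_failure,
};
#endif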
1975 
1976 static DEFINE_MUTEX(mf_mutex);
1977 
1978 /**
1979  * memory_failure - Handle memory failure of a page.
1980  * @pfn: Page Number of the corrupted page
1981  * @flags: fine tune action taken
1982  *
1983  * This function is called by the low level machine check code
1984  * of an architecture when it detects hardware memory corruption
1985  * of a page. It tries its best to recover, which includes
1986  * dropping pages, killing processes etc.
1987  *
1988  * The function is primarily of use for corruptions that
1989  * happen outside the current execution context (e.g. when
1990  * detected by a background scrubber)
1991  *
1992  * Must run in process context (e.g. a work queue) with interrupts
1993  * enabled and no spinlocks held.
1994  *
1995  * Return: 0 if the memory error was successfully handled,
1996  *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
1997  *         < 0 (except -EOPNOTSUPP) on failure.
1998  */
1999 int memory_failure(unsigned long pfn, int flags)
2000 {
2001 	struct page *p;
2002 	struct page *hpage;
2003 	struct dev_pagemap *pgmap;
2004 	int res = 0;
2005 	unsigned long page_flags;
2006 	bool retry = true;
2007 	int hugetlb = 0;
2008 
2009 	if (!sysctl_memory_failure_recovery)
2010 		panic("Memory failure on page %lx", pfn);
2011 
2012 	mutex_lock(&mf_mutex);
2013 
2014 	if (!(flags & MF_SW_SIMULATED))
2015 		hw_memory_failure = true;
2016 
2017 	p = pfn_to_online_page(pfn);
2018 	if (!p) {
2019 		res = arch_memory_failure(pfn, flags);
2020 		if (res == 0)
2021 			goto unlock_mutex;
2022 
2023 		if (pfn_valid(pfn)) {
2024 			pgmap = get_dev_pagemap(pfn, NULL);
2025 			if (pgmap) {
2026 				res = memory_failure_dev_pagemap(pfn, flags,
2027 								 pgmap);
2028 				goto unlock_mutex;
2029 			}
2030 		}
2031 		pr_err("%#lx: memory outside kernel control\n", pfn);
2032 		res = -ENXIO;
2033 		goto unlock_mutex;
2034 	}
2035 
2036 try_again:
2037 	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
2038 	if (hugetlb)
2039 		goto unlock_mutex;
2040 
2041 	if (TestSetPageHWPoison(p)) {
2042 		pr_err("%#lx: already hardware poisoned\n", pfn);
2043 		res = -EHWPOISON;
2044 		if (flags & MF_ACTION_REQUIRED)
2045 			res = kill_accessing_process(current, pfn, flags);
2046 		if (flags & MF_COUNT_INCREASED)
2047 			put_page(p);
2048 		goto unlock_mutex;
2049 	}
2050 
2051 	hpage = compound_head(p);
2052 
2053 	/*
2054 	 * There is nothing we can (or need to) do about count=0 pages:
2055 	 * 1) it's a free page, and therefore in safe hands:
2056 	 *    check_new_page() will be the gate keeper.
2057 	 * 2) it's part of a non-compound high order page.
2058 	 *    This implies some kernel user: we cannot stop them from
2059 	 *    reading/writing the page; let's pray that the page has been
2060 	 *    used and will be freed some time later.
2061 	 * In fact it's dangerous to directly bump up the page count from 0,
2062 	 *    as that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
2063 	 */
2064 	if (!(flags & MF_COUNT_INCREASED)) {
2065 		res = get_hwpoison_page(p, flags);
2066 		if (!res) {
2067 			if (is_free_buddy_page(p)) {
2068 				if (take_page_off_buddy(p)) {
2069 					page_ref_inc(p);
2070 					res = MF_RECOVERED;
2071 				} else {
2072 					/* We lost the race, try again */
2073 					if (retry) {
2074 						ClearPageHWPoison(p);
2075 						retry = false;
2076 						goto try_again;
2077 					}
2078 					res = MF_FAILED;
2079 				}
2080 				res = action_result(pfn, MF_MSG_BUDDY, res);
2081 			} else {
2082 				res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
2083 			}
2084 			goto unlock_mutex;
2085 		} else if (res < 0) {
2086 			res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
2087 			goto unlock_mutex;
2088 		}
2089 	}
2090 
2091 	if (PageTransHuge(hpage)) {
2092 		/*
2093 		 * The flag must be set after the refcount is bumped,
2094 		 * otherwise it may race with THP split.
2095 		 * And the flag can't be set in get_hwpoison_page() since
2096 		 * it is called by soft offline too and it is only called
2097 		 * for !MF_COUNT_INCREASED.  So here seems to be the best
2098 		 * place.
2099 		 *
2100 		 * We don't need to care about the above error handling paths
2101 		 * of get_hwpoison_page() since they handle either a free page
2102 		 * or an unhandlable page.  The refcount is bumped iff the
2103 		 * page is a valid handlable page.
2104 		 */
2105 		SetPageHasHWPoisoned(hpage);
2106 		if (try_to_split_thp_page(p) < 0) {
2107 			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
2108 			goto unlock_mutex;
2109 		}
2110 		VM_BUG_ON_PAGE(!page_count(p), p);
2111 	}
2112 
2113 	/*
2114 	 * We ignore non-LRU pages for good reasons.
2115 	 * - PG_locked is only well defined for LRU pages and a few others
2116 	 * - to avoid races with __SetPageLocked()
2117 	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
2118 	 * The check (unnecessarily) ignores LRU pages being isolated and
2119 	 * walked by the page reclaim code, however that's not a big loss.
2120 	 */
2121 	shake_page(p);
2122 
2123 	lock_page(p);
2124 
2125 	/*
2126 	 * We only intend to deal with non-compound pages here.
2127 	 * However, the page could have become part of a compound page due
2128 	 * to a race window. If this happens, retry once in the hope of
2129 	 * handling the page in the next round.
2130 	 */
2131 	if (PageCompound(p)) {
2132 		if (retry) {
2133 			ClearPageHWPoison(p);
2134 			unlock_page(p);
2135 			put_page(p);
2136 			flags &= ~MF_COUNT_INCREASED;
2137 			retry = false;
2138 			goto try_again;
2139 		}
2140 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
2141 		goto unlock_page;
2142 	}
2143 
2144 	/*
2145 	 * We use page flags to determine what action should be taken, but
2146 	 * the flags can be modified by the error containment action.  One
2147 	 * example is an mlocked page, where PG_mlocked is cleared by
2148 	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
2149 	 * correctly, we save a copy of the page flags at this time.
2150 	 */
2151 	page_flags = p->flags;
2152 
2153 	if (hwpoison_filter(p)) {
2154 		ClearPageHWPoison(p);
2155 		unlock_page(p);
2156 		put_page(p);
2157 		res = -EOPNOTSUPP;
2158 		goto unlock_mutex;
2159 	}
2160 
2161 	/*
2162 	 * __munlock_pagevec may clear a writeback page's LRU flag without
2163 	 * page_lock. We need to wait for writeback completion on this page,
2164 	 * or it may trigger a VFS BUG while evicting the inode.
2165 	 */
2166 	if (!PageLRU(p) && !PageWriteback(p))
2167 		goto identify_page_state;
2168 
2169 	/*
2170 	 * It's very difficult to mess with pages currently under IO
2171 	 * and in many cases impossible, so we just avoid it here.
2172 	 */
2173 	wait_on_page_writeback(p);
2174 
2175 	/*
2176 	 * Now take care of user space mappings.
2177 	 * Abort on failure: __filemap_remove_folio() assumes an unmapped page.
2178 	 */
2179 	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
2180 		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
2181 		goto unlock_page;
2182 	}
2183 
2184 	/*
2185 	 * Torn down by someone else?
2186 	 */
2187 	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
2188 		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
2189 		goto unlock_page;
2190 	}
2191 
2192 identify_page_state:
2193 	res = identify_page_state(pfn, p, page_flags);
2194 	mutex_unlock(&mf_mutex);
2195 	return res;
2196 unlock_page:
2197 	unlock_page(p);
2198 unlock_mutex:
2199 	mutex_unlock(&mf_mutex);
2200 	return res;
2201 }
2202 EXPORT_SYMBOL_GPL(memory_failure);
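
/*
 * Illustrative sketch (not part of the original source): memory_failure()
 * must be called from process context.  A hypothetical handler that has
 * the physical address of a corrupted page in hand could report it like
 * this:
 */
#if 0
static void example_report_poison(u64 phys_addr)
{
	unsigned long pfn = PHYS_PFN(phys_addr);

	/* flags == 0: corruption found asynchronously (e.g. by a scrubber). */
	if (memory_failure(pfn, 0))
		pr_warn("recovery of pfn %#lx failed\n", pfn);
}
#endif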
2203 
2204 #define MEMORY_FAILURE_FIFO_ORDER	4
2205 #define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)
2206 
2207 struct memory_failure_entry {
2208 	unsigned long pfn;
2209 	int flags;
2210 };
2211 
2212 struct memory_failure_cpu {
2213 	DECLARE_KFIFO(fifo, struct memory_failure_entry,
2214 		      MEMORY_FAILURE_FIFO_SIZE);
2215 	spinlock_t lock;
2216 	struct work_struct work;
2217 };
2218 
2219 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
2220 
2221 /**
2222  * memory_failure_queue - Schedule handling memory failure of a page.
2223  * @pfn: Page Number of the corrupted page
2224  * @flags: Flags for memory failure handling
2225  *
2226  * This function is called by the low level hardware error handler
2227  * when it detects hardware memory corruption of a page. It schedules
2228  * the recovering of error page, including dropping pages, killing
2229  * recovery of the error page, including dropping pages, killing
2230  *
2231  * The function is primarily of use for corruptions that
2232  * happen outside the current execution context (e.g. when
2233  * detected by a background scrubber)
2234  *
2235  * Can run in IRQ context.
2236  */
2237 void memory_failure_queue(unsigned long pfn, int flags)
2238 {
2239 	struct memory_failure_cpu *mf_cpu;
2240 	unsigned long proc_flags;
2241 	struct memory_failure_entry entry = {
2242 		.pfn =		pfn,
2243 		.flags =	flags,
2244 	};
2245 
2246 	mf_cpu = &get_cpu_var(memory_failure_cpu);
2247 	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2248 	if (kfifo_put(&mf_cpu->fifo, entry))
2249 		schedule_work_on(smp_processor_id(), &mf_cpu->work);
2250 	else
2251 		pr_err("buffer overflow when queuing memory failure at %#lx\n",
2252 		       pfn);
2253 	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2254 	put_cpu_var(memory_failure_cpu);
2255 }
2256 EXPORT_SYMBOL_GPL(memory_failure_queue);
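
/*
 * Illustrative sketch (not part of the original source): unlike
 * memory_failure(), memory_failure_queue() may be called from IRQ context,
 * so a hypothetical interrupt-driven error handler (needs
 * <linux/interrupt.h>) could defer the heavy lifting like this;
 * example_read_error_pfn() is a made-up driver helper:
 */
#if 0
static irqreturn_t example_edac_isr(int irq, void *dev_id)
{
	unsigned long bad_pfn = example_read_error_pfn(dev_id);

	memory_failure_queue(bad_pfn, 0);	/* handled later in a work item */
	return IRQ_HANDLED;
}
#endif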
2257 
2258 static void memory_failure_work_func(struct work_struct *work)
2259 {
2260 	struct memory_failure_cpu *mf_cpu;
2261 	struct memory_failure_entry entry = { 0, };
2262 	unsigned long proc_flags;
2263 	int gotten;
2264 
2265 	mf_cpu = container_of(work, struct memory_failure_cpu, work);
2266 	for (;;) {
2267 		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2268 		gotten = kfifo_get(&mf_cpu->fifo, &entry);
2269 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2270 		if (!gotten)
2271 			break;
2272 		if (entry.flags & MF_SOFT_OFFLINE)
2273 			soft_offline_page(entry.pfn, entry.flags);
2274 		else
2275 			memory_failure(entry.pfn, entry.flags);
2276 	}
2277 }
2278 
2279 /*
2280  * Process memory_failure work queued on the specified CPU.
2281  * Used to avoid return-to-userspace racing with the memory_failure workqueue.
2282  */
2283 void memory_failure_queue_kick(int cpu)
2284 {
2285 	struct memory_failure_cpu *mf_cpu;
2286 
2287 	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2288 	cancel_work_sync(&mf_cpu->work);
2289 	memory_failure_work_func(&mf_cpu->work);
2290 }
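
/*
 * Illustrative sketch (not part of the original source): an architecture
 * that queued an error from its exception handler can drain the per-CPU
 * fifo before the task re-enters user space, e.g. from a task_work
 * callback.  This assumes the task is still on the CPU that queued the
 * error (or that the CPU was recorded by the arch code).
 */
#if 0
static void example_drain_memory_failure(struct callback_head *head)
{
	memory_failure_queue_kick(raw_smp_processor_id());
}
#endif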
2291 
2292 static int __init memory_failure_init(void)
2293 {
2294 	struct memory_failure_cpu *mf_cpu;
2295 	int cpu;
2296 
2297 	for_each_possible_cpu(cpu) {
2298 		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2299 		spin_lock_init(&mf_cpu->lock);
2300 		INIT_KFIFO(mf_cpu->fifo);
2301 		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
2302 	}
2303 
2304 	return 0;
2305 }
2306 core_initcall(memory_failure_init);
2307 
2308 #undef pr_fmt
2309 #define pr_fmt(fmt)	"" fmt
2310 #define unpoison_pr_info(fmt, pfn, rs)			\
2311 ({							\
2312 	if (__ratelimit(rs))				\
2313 		pr_info(fmt, pfn);			\
2314 })
2315 
2316 /**
2317  * unpoison_memory - Unpoison a previously poisoned page
2318  * @pfn: Page number of the to be unpoisoned page
2319  *
2320  * Software-unpoison a page that has been poisoned by
2321  * memory_failure() earlier.
2322  *
2323  * This is only done at the software level, so it only works
2324  * for Linux-injected failures, not real hardware failures.
2325  *
2326  * Returns 0 for success, otherwise -errno.
2327  */
2328 int unpoison_memory(unsigned long pfn)
2329 {
2330 	struct page *page;
2331 	struct page *p;
2332 	int ret = -EBUSY;
2333 	int freeit = 0;
2334 	unsigned long count = 1;
2335 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
2336 					DEFAULT_RATELIMIT_BURST);
2337 
2338 	if (!pfn_valid(pfn))
2339 		return -ENXIO;
2340 
2341 	p = pfn_to_page(pfn);
2342 	page = compound_head(p);
2343 
2344 	mutex_lock(&mf_mutex);
2345 
2346 	if (hw_memory_failure) {
2347 		unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
2348 				 pfn, &unpoison_rs);
2349 		ret = -EOPNOTSUPP;
2350 		goto unlock_mutex;
2351 	}
2352 
2353 	if (!PageHWPoison(p)) {
2354 		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
2355 				 pfn, &unpoison_rs);
2356 		goto unlock_mutex;
2357 	}
2358 
2359 	if (page_count(page) > 1) {
2360 		unpoison_pr_info("Unpoison: Someone has grabbed the hwpoison page %#lx\n",
2361 				 pfn, &unpoison_rs);
2362 		goto unlock_mutex;
2363 	}
2364 
2365 	if (page_mapped(page)) {
2366 		unpoison_pr_info("Unpoison: Someone has mapped the hwpoison page %#lx\n",
2367 				 pfn, &unpoison_rs);
2368 		goto unlock_mutex;
2369 	}
2370 
2371 	if (page_mapping(page)) {
2372 		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
2373 				 pfn, &unpoison_rs);
2374 		goto unlock_mutex;
2375 	}
2376 
2377 	if (PageSlab(page) || PageTable(page) || PageReserved(page))
2378 		goto unlock_mutex;
2379 
2380 	ret = get_hwpoison_page(p, MF_UNPOISON);
2381 	if (!ret) {
2382 		if (PageHuge(p)) {
2383 			count = free_raw_hwp_pages(page, false);
2384 			if (count == 0) {
2385 				ret = -EBUSY;
2386 				goto unlock_mutex;
2387 			}
2388 		}
2389 		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
2390 	} else if (ret < 0) {
2391 		if (ret == -EHWPOISON) {
2392 			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
2393 		} else
2394 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
2395 					 pfn, &unpoison_rs);
2396 	} else {
2397 		if (PageHuge(p)) {
2398 			count = free_raw_hwp_pages(page, false);
2399 			if (count == 0) {
2400 				ret = -EBUSY;
2401 				put_page(page);
2402 				goto unlock_mutex;
2403 			}
2404 		}
2405 		freeit = !!TestClearPageHWPoison(p);
2406 
2407 		put_page(page);
2408 		if (freeit) {
2409 			put_page(page);
2410 			ret = 0;
2411 		}
2412 	}
2413 
2414 unlock_mutex:
2415 	mutex_unlock(&mf_mutex);
2416 	if (!ret || freeit) {
2417 		num_poisoned_pages_sub(count);
2418 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
2419 				 page_to_pfn(p), &unpoison_rs);
2420 	}
2421 	return ret;
2422 }
2423 EXPORT_SYMBOL(unpoison_memory);
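
/*
 * Illustrative sketch (not part of the original source): unpoison_memory()
 * is normally driven from debugfs by the hwpoison injector.  A minimal
 * write handler in that style (needs <linux/debugfs.h>; names are
 * hypothetical) could look like:
 */
#if 0
static int example_unpoison_pfn_set(void *data, u64 val)
{
	return unpoison_memory((unsigned long)val);
}
DEFINE_DEBUGFS_ATTRIBUTE(example_unpoison_fops, NULL,
			 example_unpoison_pfn_set, "%llu\n");
#endif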
2424 
2425 static bool isolate_page(struct page *page, struct list_head *pagelist)
2426 {
2427 	bool isolated = false;
2428 
2429 	if (PageHuge(page)) {
2430 		isolated = !isolate_hugetlb(page, pagelist);
2431 	} else {
2432 		bool lru = !__PageMovable(page);
2433 
2434 		if (lru)
2435 			isolated = !isolate_lru_page(page);
2436 		else
2437 			isolated = !isolate_movable_page(page,
2438 							 ISOLATE_UNEVICTABLE);
2439 
2440 		if (isolated) {
2441 			list_add(&page->lru, pagelist);
2442 			if (lru)
2443 				inc_node_page_state(page, NR_ISOLATED_ANON +
2444 						    page_is_file_lru(page));
2445 		}
2446 	}
2447 
2448 	/*
2449 	 * If we succeeded in isolating the page, we grabbed another refcount
2450 	 * on the page, so we can safely drop the one we got from get_any_pages().
2451 	 * If we failed to isolate the page, it means that we cannot go further
2452 	 * and we will return an error, so drop the reference we got from
2453 	 * get_any_pages() as well.
2454 	 */
2455 	put_page(page);
2456 	return isolated;
2457 }
2458 
2459 /*
2460  * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
2461  * If the page is a non-dirty unmapped page-cache page, it is simply invalidated.
2462  * If the page is mapped, it migrates the contents over.
2463  */
2464 static int soft_offline_in_use_page(struct page *page)
2465 {
2466 	long ret = 0;
2467 	unsigned long pfn = page_to_pfn(page);
2468 	struct page *hpage = compound_head(page);
2469 	char const *msg_page[] = {"page", "hugepage"};
2470 	bool huge = PageHuge(page);
2471 	LIST_HEAD(pagelist);
2472 	struct migration_target_control mtc = {
2473 		.nid = NUMA_NO_NODE,
2474 		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
2475 	};
2476 
2477 	if (!huge && PageTransHuge(hpage)) {
2478 		if (try_to_split_thp_page(page)) {
2479 			pr_info("soft offline: %#lx: thp split failed\n", pfn);
2480 			return -EBUSY;
2481 		}
2482 		hpage = page;
2483 	}
2484 
2485 	lock_page(page);
2486 	if (!PageHuge(page))
2487 		wait_on_page_writeback(page);
2488 	if (PageHWPoison(page)) {
2489 		unlock_page(page);
2490 		put_page(page);
2491 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
2492 		return 0;
2493 	}
2494 
2495 	if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
2496 		/*
2497 		 * Try to invalidate first. This should work for
2498 		 * non-dirty unmapped page-cache pages.
2499 		 */
2500 		ret = invalidate_inode_page(page);
2501 	unlock_page(page);
2502 
2503 	if (ret) {
2504 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
2505 		page_handle_poison(page, false, true);
2506 		return 0;
2507 	}
2508 
2509 	if (isolate_page(hpage, &pagelist)) {
2510 		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
2511 			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
2512 		if (!ret) {
2513 			bool release = !huge;
2514 
2515 			if (!page_handle_poison(page, huge, release))
2516 				ret = -EBUSY;
2517 		} else {
2518 			if (!list_empty(&pagelist))
2519 				putback_movable_pages(&pagelist);
2520 
2521 			pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
2522 				pfn, msg_page[huge], ret, &page->flags);
2523 			if (ret > 0)
2524 				ret = -EBUSY;
2525 		}
2526 	} else {
2527 		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
2528 			pfn, msg_page[huge], page_count(page), &page->flags);
2529 		ret = -EBUSY;
2530 	}
2531 	return ret;
2532 }
2533 
2534 /**
2535  * soft_offline_page - Soft offline a page.
2536  * @pfn: pfn to soft-offline
2537  * @flags: flags. Same as memory_failure().
2538  *
2539  * Returns 0 on success,
2540  *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
2541  *         < 0 (a negated errno) otherwise.
2542  *
2543  * Soft offline a page, by migration or invalidation,
2544  * without killing anything. This is for the case when
2545  * a page is not corrupted yet (so it's still valid to access),
2546  * but has had a number of corrected errors and is better taken
2547  * out.
2548  *
2549  * The actual policy on when to do that is maintained by
2550  * user space.
2551  *
2552  * This should never impact any application or cause data loss,
2553  * however it might take some time.
2554  *
2555  * This is not a 100% solution for all memory, but tries to be
2556  * ``good enough'' for the majority of memory.
2557  */
2558 int soft_offline_page(unsigned long pfn, int flags)
2559 {
2560 	int ret;
2561 	bool try_again = true;
2562 	struct page *page;
2563 
2564 	if (!pfn_valid(pfn)) {
2565 		WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
2566 		return -ENXIO;
2567 	}
2568 
2569 	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
2570 	page = pfn_to_online_page(pfn);
2571 	if (!page) {
2572 		put_ref_page(pfn, flags);
2573 		return -EIO;
2574 	}
2575 
2576 	mutex_lock(&mf_mutex);
2577 
2578 	if (PageHWPoison(page)) {
2579 		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
2580 		put_ref_page(pfn, flags);
2581 		mutex_unlock(&mf_mutex);
2582 		return 0;
2583 	}
2584 
2585 retry:
2586 	get_online_mems();
2587 	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
2588 	put_online_mems();
2589 
2590 	if (hwpoison_filter(page)) {
2591 		if (ret > 0)
2592 			put_page(page);
2593 
2594 		mutex_unlock(&mf_mutex);
2595 		return -EOPNOTSUPP;
2596 	}
2597 
2598 	if (ret > 0) {
2599 		ret = soft_offline_in_use_page(page);
2600 	} else if (ret == 0) {
2601 		if (!page_handle_poison(page, true, false) && try_again) {
2602 			try_again = false;
2603 			flags &= ~MF_COUNT_INCREASED;
2604 			goto retry;
2605 		}
2606 	}
2607 
2608 	mutex_unlock(&mf_mutex);
2609 
2610 	return ret;
2611 }
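
/*
 * Illustrative sketch (not part of the original source): soft offline is
 * typically requested from user space, either via madvise(MADV_SOFT_OFFLINE)
 * on a mapped address or by writing a physical address to
 * /sys/devices/system/memory/soft_offline_page.  A sysfs-style store
 * handler (needs <linux/device.h>; names are hypothetical) would reduce to:
 */
#if 0
static ssize_t example_soft_offline_store(struct device *dev,
					  struct device_attribute *attr,
					  const char *buf, size_t count)
{
	u64 phys_addr;
	int ret;

	if (kstrtoull(buf, 0, &phys_addr))
		return -EINVAL;
	/* flags == 0: no extra page refcount was taken by the caller. */
	ret = soft_offline_page(PHYS_PFN(phys_addr), 0);
	return ret ? ret : count;
}
#endif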
2612 
2613 void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
2614 {
2615 	int i, total = 0;
2616 
2617 	/*
2618 	 * A further optimization is to have per section refcounted
2619 	 * num_poisoned_pages.  But that would need more space per memmap, so
2620 	 * for now just do a quick global check to speed up this routine in the
2621 	 * absence of bad pages.
2622 	 */
2623 	if (atomic_long_read(&num_poisoned_pages) == 0)
2624 		return;
2625 
2626 	for (i = 0; i < nr_pages; i++) {
2627 		if (PageHWPoison(&memmap[i])) {
2628 			total++;
2629 			ClearPageHWPoison(&memmap[i]);
2630 		}
2631 	}
2632 	if (total)
2633 		num_poisoned_pages_sub(total);
2634 }
2635