/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping the
 * use of suspicious, not-yet-corrupted pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity with the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */

/*
 * Notebook:
 * - hugetlb needs more code
 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
 * - pass bad pages to kdump next kernel
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Lastly, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	ino = cgroup_ino(css->cgroup);
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);	/* synchronous? */
	}
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}
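
/*
 * For reference only: a user space consumer of the SIGBUS sent above
 * typically installs a SA_SIGINFO handler and reads si_addr/si_addr_lsb to
 * learn which address range was poisoned. This is an illustrative sketch of
 * such a handler (user space code, not part of this file's interface; the
 * recovery action taken inside the handler is entirely up to the
 * application):
 *
 *	static void hwpoison_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == BUS_MCEERR_AO ||
 *		    si->si_code == BUS_MCEERR_AR) {
 *			void *bad = si->si_addr;
 *			size_t len = (size_t)1 << si->si_addr_lsb;
 *			// discard or rebuild the data in [bad, bad + len)
 *		}
 *	}
 *
 *	struct sigaction sa = {
 *		.sa_sigaction	= hwpoison_handler,
 *		.sa_flags	= SA_SIGINFO,
 *	};
 *	sigaction(SIGBUS, &sa, NULL);
 */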

/*
 * When an unknown page type is encountered drain as many buffers as possible
 * in the hope of turning the page into an LRU or free page, which we can
 * handle.
 */
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_node_slabs here (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access) {
		int nr;
		int nid = page_to_nid(p);
		do {
			nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmapped. But it could also be a mremap. Since that's
	 * likely very rare kill anyway just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the list
 * (this is used for clean pages which do not need killing)
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  int fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
					      pfn, page, flags) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill," and otherwise returns NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

/*
 * Error handlers for various types of pages.
 */

enum outcome {
	IGNORED,	/* Error: cannot be handled */
	FAILED,		/* Error: handling failed */
	DELAYED,	/* Will be handled later */
	RECOVERED,	/* Successfully recovered */
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch"
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile
		 */
		return FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate
		 * This fails on dirty or anything with private pages
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped. If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped between then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 * We assume that this function is called with page lock held,
	 * so there is no race between isolation and mapping/unmapping.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return RECOVERED;
	}
	return DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, char *msg, int result)
{
	pr_err("MCE %#lx: %s page recovery: %s\n",
		pfn, msg, action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}
	action_result(pfn, ps->msg, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	struct page *ppage;

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p)) {
		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
		return SWAP_FAIL;
	}

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * ppage: poisoned page
	 *   if p is a regular page (4k page)
	 *        ppage == real poisoned page;
	 *   else p is hugetlb or THP, ppage == head page.
	 */
	ppage = hpage;

	if (PageTransHuge(hpage)) {
		/*
		 * Verify that this isn't a hugetlbfs head page; the check for
		 * PageAnon is just to avoid tripping a split_huge_page
		 * internal debug check, as split_huge_page refuses to deal with
		 * anything that isn't an anon page. PageAnon can't go away from
		 * under us because we hold a refcount on the hpage. Without a
		 * refcount on the hpage, split_huge_page can't be safely called
		 * in the first place, and having a refcount on the tail isn't
		 * enough to be safe.
		 */
		if (!PageHuge(hpage) && PageAnon(hpage)) {
			if (unlikely(split_huge_page(hpage))) {
				/*
				 * FIXME: if splitting THP fails, it is
				 * better to stop the following operation rather
				 * than causing panic by unmapping. System might
				 * survive if the page is freed later.
				 */
				printk(KERN_INFO
					"MCE %#lx: failed to split THP\n", pfn);

				BUG_ON(!PageHWPoison(p));
				return SWAP_FAIL;
			}
			/*
			 * We pinned the head page for hwpoison handling,
			 * now we split the thp and we are interested in
			 * the hwpoisoned raw page, so move the refcount
			 * to it. Similarly, page lock is shifted.
			 */
			if (hpage != p) {
				if (!(flags & MF_COUNT_INCREASED)) {
					put_page(hpage);
					get_page(p);
				}
				lock_page(p);
				unlock_page(hpage);
				*hpagep = p;
			}
			/* THP is split, so ppage should be the real poisoned page. */
			ppage = p;
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form. This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);

	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(ppage));

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not. Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed. When there was a problem unmapping earlier
	 * use a more forceful uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage units,
	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
	 * transparent hugepages, they are supposed to be split and error
	 * measurement is done in normal page units.  So nr_pages should be one
	 * in this case.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else	/* normal page or thp */
		nr_pages = 1;
	atomic_long_add(nr_pages, &num_poisoned_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from hugepage freelist,
	 *    so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) &&
		!get_page_unless_zero(hpage)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					atomic_long_sub(nr_pages, &num_poisoned_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, "free huge",
				      res ? IGNORED : DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __set_page_locked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	if (!PageHuge(p) && !PageTransTail(p)) {
		if (!PageLRU(p))
			shake_page(p, 0);
		if (!PageLRU(p)) {
			/*
			 * shake_page could have turned it free.
			 */
			if (is_free_buddy_page(p)) {
				if (flags & MF_COUNT_INCREASED)
					action_result(pfn, "free buddy", DELAYED);
				else
					action_result(pfn, "free buddy, 2nd try", DELAYED);
				return 0;
			}
		}
	}

	lock_page(hpage);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (compound_head(p) != hpage) {
		action_result(pfn, "different compound page after locking", IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
	 * correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		put_page(hpage);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &num_poisoned_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * For an error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
				IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}
	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
		action_result(pfn, "unmapping failed", IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;
	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
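
/*
 * Illustrative sketch of the expected callers (an assumption about the
 * surrounding architecture code, not something defined in this file): a low
 * level machine check handler calls memory_failure() directly when it runs
 * in process context, or memory_failure_queue() below when the error is
 * detected in interrupt/exception context, e.g.:
 *
 *	// from a workqueue or other process context:
 *	memory_failure(pfn, trapno, 0);
 *
 *	// from the low level handler itself (IRQ safe, deferred to a work
 *	// item per CPU):
 *	memory_failure_queue(pfn, trapno, 0);
 */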

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovery of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the page to be unpoisoned
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for Linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such a case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_page_unless_zero(page)) {
		/*
		 * Since an HWPoisoned hugepage should have a non-zero refcount,
		 * a race between memory failure and unpoison seems to have
		 * happened. In such a case unpoison fails and memory failure
		 * runs to the end.
		 */
		if (PageHuge(page)) {
			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_dec(&num_poisoned_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise not.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * When the target page is a free hugepage, just remove it
	 * from free hugepage list.
	 */
	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (!PageLRU(page)) {
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	/* Keep page count to indicate a given hugepage is isolated. */
	list_move(&hpage->lru, &pagelist);
	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		/*
		 * We know that soft_offline_huge_page() tries to migrate
		 * only one hugepage pointed to by hpage, so we need not
		 * run through the pagelist here.
		 */
		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
	} else {
		/* overcommit hugetlb page will be freed to buddy */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		atomic_long_inc(&num_poisoned_pages);
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	/*
	 * Drop the page reference which came from get_any_page();
	 * a successful isolate_lru_page() already took another one.
	 */
	put_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist)) {
				list_del(&page->lru);
				dec_zone_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
				putback_lru_page(page);
			}

			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		} else {
			/*
			 * After page migration succeeds, the source page can
			 * be trapped in a pagevec and actual freeing is delayed.
			 * Freeing code works differently based on PG_hwpoison,
			 * so there's a race. We need to make sure that the
			 * source page is freed back to buddy before
			 * setting PG_hwpoison.
			 */
			if (!is_free_buddy_page(page))
				lru_add_drain_all();
			if (!is_free_buddy_page(page))
				drain_all_pages(page_zone(page));
			SetPageHWPoison(page);
			if (!is_free_buddy_page(page))
				pr_info("soft offline: %#lx: page leaked\n",
					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	if (!PageHuge(page) && PageTransHuge(hpage)) {
		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
			pr_info("soft offline: %#lx: failed to split THP\n",
				pfn);
			return -EBUSY;
		}
	}

	get_online_mems();

	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free. This flag should be kept set until the source page
	 * is freed and PG_hwpoison on it is set.
	 */
	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
		set_migratetype_isolate(page, true);

	ret = get_any_page(page, pfn, flags);
	put_online_mems();
	if (ret > 0) { /* for in-use pages */
		if (PageHuge(page))
			ret = soft_offline_huge_page(page, flags);
		else
			ret = __soft_offline_page(page, flags);
	} else if (ret == 0) { /* for free pages */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
	return ret;
}
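
/*
 * Illustrative sketch of how the paths above are commonly exercised from
 * user space during testing (an assumption about the surrounding tooling,
 * not part of this file): a privileged process (CAP_SYS_ADMIN, with
 * CONFIG_MEMORY_FAILURE enabled) can poison or soft offline one of its own
 * pages via madvise(), which ends up calling memory_failure() or
 * soft_offline_page() respectively:
 *
 *	// user space test program, needs <sys/mman.h> and <unistd.h>
 *	char *buf = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	buf[0] = 1;		// fault the page in first
 *	madvise(buf, getpagesize(), MADV_SOFT_OFFLINE);	// or MADV_HWPOISON
 */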