// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>

#include "internal.h"

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
 * undesirable, use migrate_prep_local().
 */
int migrate_prep(void)
{
    /*
     * Clear the LRU lists so pages can be isolated.
     * Note that pages may be moved off the LRU after we have
     * drained them. Those pages will fail to migrate like other
     * pages that may be busy.
     */
    lru_add_drain_all();

    return 0;
}

/* Do the necessary work of migrate_prep but not if it involves other CPUs */
int migrate_prep_local(void)
{
    lru_add_drain();

    return 0;
}
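/*
 * Illustrative sketch (not part of the original file): the typical caller
 * sequence described in the comment above migrate_prep(). It mirrors what
 * add_page_for_migration() and do_move_pages_to_node() do further down in
 * this file. The helper name and the alloc_new_node_page()/MR_SYSCALL
 * choices are assumptions made for the example only.
 */
#if 0
static int example_migrate_page_to_node(struct page *page, int node)
{
    LIST_HEAD(pagelist);
    struct page *head = compound_head(page);
    int err;

    /* Drain the per-cpu LRU caches so the page can be isolated. */
    migrate_prep();

    err = isolate_lru_page(head);
    if (err)
        return err;

    list_add_tail(&head->lru, &pagelist);
    mod_node_page_state(page_pgdat(head),
                NR_ISOLATED_ANON + page_is_file_cache(head),
                hpage_nr_pages(head));

    /* migrate_pages() returns the number of pages it could not migrate. */
    err = migrate_pages(&pagelist, alloc_new_node_page, NULL, node,
                MIGRATE_SYNC, MR_SYSCALL);
    if (err)
        putback_movable_pages(&pagelist);
    return err;
}
#endif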
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
    struct address_space *mapping;

    /*
     * Avoid burning cycles with pages that are still under __free_pages(),
     * or just got freed under us.
     *
     * In case we 'win' a race for a movable page being freed under us and
     * raise its refcount preventing __free_pages() from doing its job,
     * the put_page() at the end of this block will take care of
     * releasing this page, thus avoiding a nasty leakage.
     */
    if (unlikely(!get_page_unless_zero(page)))
        goto out;

    /*
     * Check PageMovable before holding a PG_lock because the page's owner
     * assumes that nobody touches the PG_lock of a newly allocated page,
     * so unconditionally grabbing the lock ruins the owner's side.
     */
    if (unlikely(!__PageMovable(page)))
        goto out_putpage;
    /*
     * As movable pages are not isolated from LRU lists, concurrent
     * compaction threads can race against page migration functions
     * as well as race against a page being released.
     *
     * In order to avoid having an already isolated movable page
     * being (wrongly) re-isolated while it is under migration,
     * or to avoid attempting to isolate pages being released,
     * let's be sure we have the page lock
     * before proceeding with the movable page isolation steps.
     */
    if (unlikely(!trylock_page(page)))
        goto out_putpage;

    if (!PageMovable(page) || PageIsolated(page))
        goto out_no_isolated;

    mapping = page_mapping(page);
    VM_BUG_ON_PAGE(!mapping, page);

    if (!mapping->a_ops->isolate_page(page, mode))
        goto out_no_isolated;

    /* Driver shouldn't use PG_isolated bit of page->flags */
    WARN_ON_ONCE(PageIsolated(page));
    __SetPageIsolated(page);
    unlock_page(page);

    return 0;

out_no_isolated:
    unlock_page(page);
out_putpage:
    put_page(page);
out:
    return -EBUSY;
}

/* It should be called on a page which is PG_movable */
void putback_movable_page(struct page *page)
{
    struct address_space *mapping;

    VM_BUG_ON_PAGE(!PageLocked(page), page);
    VM_BUG_ON_PAGE(!PageMovable(page), page);
    VM_BUG_ON_PAGE(!PageIsolated(page), page);

    mapping = page_mapping(page);
    mapping->a_ops->putback_page(page);
    __ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 * and isolate_huge_page().
 */
void putback_movable_pages(struct list_head *l)
{
    struct page *page;
    struct page *page2;

    list_for_each_entry_safe(page, page2, l, lru) {
        if (unlikely(PageHuge(page))) {
            putback_active_hugepage(page);
            continue;
        }
        list_del(&page->lru);
        /*
         * We isolated a non-lru movable page so here we can use
         * __PageMovable because an LRU page's mapping cannot have
         * PAGE_MAPPING_MOVABLE.
182 */ 183 if (unlikely(__PageMovable(page))) { 184 VM_BUG_ON_PAGE(!PageIsolated(page), page); 185 lock_page(page); 186 if (PageMovable(page)) 187 putback_movable_page(page); 188 else 189 __ClearPageIsolated(page); 190 unlock_page(page); 191 put_page(page); 192 } else { 193 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 194 page_is_file_cache(page), -hpage_nr_pages(page)); 195 putback_lru_page(page); 196 } 197 } 198 } 199 200 /* 201 * Restore a potential migration pte to a working pte entry 202 */ 203 static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 204 unsigned long addr, void *old) 205 { 206 struct page_vma_mapped_walk pvmw = { 207 .page = old, 208 .vma = vma, 209 .address = addr, 210 .flags = PVMW_SYNC | PVMW_MIGRATION, 211 }; 212 struct page *new; 213 pte_t pte; 214 swp_entry_t entry; 215 216 VM_BUG_ON_PAGE(PageTail(page), page); 217 while (page_vma_mapped_walk(&pvmw)) { 218 if (PageKsm(page)) 219 new = page; 220 else 221 new = page - pvmw.page->index + 222 linear_page_index(vma, pvmw.address); 223 224 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 225 /* PMD-mapped THP migration entry */ 226 if (!pvmw.pte) { 227 VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 228 remove_migration_pmd(&pvmw, new); 229 continue; 230 } 231 #endif 232 233 get_page(new); 234 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); 235 if (pte_swp_soft_dirty(*pvmw.pte)) 236 pte = pte_mksoft_dirty(pte); 237 238 /* 239 * Recheck VMA as permissions can change since migration started 240 */ 241 entry = pte_to_swp_entry(*pvmw.pte); 242 if (is_write_migration_entry(entry)) 243 pte = maybe_mkwrite(pte, vma); 244 245 if (unlikely(is_zone_device_page(new))) { 246 if (is_device_private_page(new)) { 247 entry = make_device_private_entry(new, pte_write(pte)); 248 pte = swp_entry_to_pte(entry); 249 } else if (is_device_public_page(new)) { 250 pte = pte_mkdevmap(pte); 251 flush_dcache_page(new); 252 } 253 } else 254 flush_dcache_page(new); 255 256 #ifdef CONFIG_HUGETLB_PAGE 257 if (PageHuge(new)) { 258 pte = pte_mkhuge(pte); 259 pte = arch_make_huge_pte(pte, vma, new, 0); 260 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 261 if (PageAnon(new)) 262 hugepage_add_anon_rmap(new, vma, pvmw.address); 263 else 264 page_dup_rmap(new, true); 265 } else 266 #endif 267 { 268 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 269 270 if (PageAnon(new)) 271 page_add_anon_rmap(new, vma, pvmw.address, false); 272 else 273 page_add_file_rmap(new, false); 274 } 275 if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 276 mlock_vma_page(new); 277 278 /* No need to invalidate - it was non-present before */ 279 update_mmu_cache(vma, pvmw.address, pvmw.pte); 280 } 281 282 return true; 283 } 284 285 /* 286 * Get rid of all migration entries and replace them by 287 * references to the indicated page. 288 */ 289 void remove_migration_ptes(struct page *old, struct page *new, bool locked) 290 { 291 struct rmap_walk_control rwc = { 292 .rmap_one = remove_migration_pte, 293 .arg = old, 294 }; 295 296 if (locked) 297 rmap_walk_locked(new, &rwc); 298 else 299 rmap_walk(new, &rwc); 300 } 301 302 /* 303 * Something used the pte of a page under migration. We need to 304 * get to the page and wait until migration is finished. 305 * When we return from this function the fault will be retried. 
306 */ 307 void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 308 spinlock_t *ptl) 309 { 310 pte_t pte; 311 swp_entry_t entry; 312 struct page *page; 313 314 spin_lock(ptl); 315 pte = *ptep; 316 if (!is_swap_pte(pte)) 317 goto out; 318 319 entry = pte_to_swp_entry(pte); 320 if (!is_migration_entry(entry)) 321 goto out; 322 323 page = migration_entry_to_page(entry); 324 325 /* 326 * Once radix-tree replacement of page migration started, page_count 327 * *must* be zero. And, we don't want to call wait_on_page_locked() 328 * against a page without get_page(). 329 * So, we use get_page_unless_zero(), here. Even failed, page fault 330 * will occur again. 331 */ 332 if (!get_page_unless_zero(page)) 333 goto out; 334 pte_unmap_unlock(ptep, ptl); 335 wait_on_page_locked(page); 336 put_page(page); 337 return; 338 out: 339 pte_unmap_unlock(ptep, ptl); 340 } 341 342 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 343 unsigned long address) 344 { 345 spinlock_t *ptl = pte_lockptr(mm, pmd); 346 pte_t *ptep = pte_offset_map(pmd, address); 347 __migration_entry_wait(mm, ptep, ptl); 348 } 349 350 void migration_entry_wait_huge(struct vm_area_struct *vma, 351 struct mm_struct *mm, pte_t *pte) 352 { 353 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); 354 __migration_entry_wait(mm, pte, ptl); 355 } 356 357 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 358 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) 359 { 360 spinlock_t *ptl; 361 struct page *page; 362 363 ptl = pmd_lock(mm, pmd); 364 if (!is_pmd_migration_entry(*pmd)) 365 goto unlock; 366 page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); 367 if (!get_page_unless_zero(page)) 368 goto unlock; 369 spin_unlock(ptl); 370 wait_on_page_locked(page); 371 put_page(page); 372 return; 373 unlock: 374 spin_unlock(ptl); 375 } 376 #endif 377 378 #ifdef CONFIG_BLOCK 379 /* Returns true if all buffers are successfully locked */ 380 static bool buffer_migrate_lock_buffers(struct buffer_head *head, 381 enum migrate_mode mode) 382 { 383 struct buffer_head *bh = head; 384 385 /* Simple case, sync compaction */ 386 if (mode != MIGRATE_ASYNC) { 387 do { 388 get_bh(bh); 389 lock_buffer(bh); 390 bh = bh->b_this_page; 391 392 } while (bh != head); 393 394 return true; 395 } 396 397 /* async case, we cannot block on lock_buffer so use trylock_buffer */ 398 do { 399 get_bh(bh); 400 if (!trylock_buffer(bh)) { 401 /* 402 * We failed to lock the buffer and cannot stall in 403 * async migration. Release the taken locks 404 */ 405 struct buffer_head *failed_bh = bh; 406 put_bh(failed_bh); 407 bh = head; 408 while (bh != failed_bh) { 409 unlock_buffer(bh); 410 put_bh(bh); 411 bh = bh->b_this_page; 412 } 413 return false; 414 } 415 416 bh = bh->b_this_page; 417 } while (bh != head); 418 return true; 419 } 420 #else 421 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, 422 enum migrate_mode mode) 423 { 424 return true; 425 } 426 #endif /* CONFIG_BLOCK */ 427 428 /* 429 * Replace the page in the mapping. 430 * 431 * The number of remaining references must be: 432 * 1 for anonymous pages without a mapping 433 * 2 for pages with a mapping 434 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
435 */ 436 int migrate_page_move_mapping(struct address_space *mapping, 437 struct page *newpage, struct page *page, 438 struct buffer_head *head, enum migrate_mode mode, 439 int extra_count) 440 { 441 struct zone *oldzone, *newzone; 442 int dirty; 443 int expected_count = 1 + extra_count; 444 void **pslot; 445 446 /* 447 * Device public or private pages have an extra refcount as they are 448 * ZONE_DEVICE pages. 449 */ 450 expected_count += is_device_private_page(page); 451 expected_count += is_device_public_page(page); 452 453 if (!mapping) { 454 /* Anonymous page without mapping */ 455 if (page_count(page) != expected_count) 456 return -EAGAIN; 457 458 /* No turning back from here */ 459 newpage->index = page->index; 460 newpage->mapping = page->mapping; 461 if (PageSwapBacked(page)) 462 __SetPageSwapBacked(newpage); 463 464 return MIGRATEPAGE_SUCCESS; 465 } 466 467 oldzone = page_zone(page); 468 newzone = page_zone(newpage); 469 470 xa_lock_irq(&mapping->i_pages); 471 472 pslot = radix_tree_lookup_slot(&mapping->i_pages, 473 page_index(page)); 474 475 expected_count += hpage_nr_pages(page) + page_has_private(page); 476 if (page_count(page) != expected_count || 477 radix_tree_deref_slot_protected(pslot, 478 &mapping->i_pages.xa_lock) != page) { 479 xa_unlock_irq(&mapping->i_pages); 480 return -EAGAIN; 481 } 482 483 if (!page_ref_freeze(page, expected_count)) { 484 xa_unlock_irq(&mapping->i_pages); 485 return -EAGAIN; 486 } 487 488 /* 489 * In the async migration case of moving a page with buffers, lock the 490 * buffers using trylock before the mapping is moved. If the mapping 491 * was moved, we later failed to lock the buffers and could not move 492 * the mapping back due to an elevated page count, we would have to 493 * block waiting on other references to be dropped. 494 */ 495 if (mode == MIGRATE_ASYNC && head && 496 !buffer_migrate_lock_buffers(head, mode)) { 497 page_ref_unfreeze(page, expected_count); 498 xa_unlock_irq(&mapping->i_pages); 499 return -EAGAIN; 500 } 501 502 /* 503 * Now we know that no one else is looking at the page: 504 * no turning back from here. 505 */ 506 newpage->index = page->index; 507 newpage->mapping = page->mapping; 508 page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ 509 if (PageSwapBacked(page)) { 510 __SetPageSwapBacked(newpage); 511 if (PageSwapCache(page)) { 512 SetPageSwapCache(newpage); 513 set_page_private(newpage, page_private(page)); 514 } 515 } else { 516 VM_BUG_ON_PAGE(PageSwapCache(page), page); 517 } 518 519 /* Move dirty while page refs frozen and newpage not yet exposed */ 520 dirty = PageDirty(page); 521 if (dirty) { 522 ClearPageDirty(page); 523 SetPageDirty(newpage); 524 } 525 526 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 527 if (PageTransHuge(page)) { 528 int i; 529 int index = page_index(page); 530 531 for (i = 1; i < HPAGE_PMD_NR; i++) { 532 pslot = radix_tree_lookup_slot(&mapping->i_pages, 533 index + i); 534 radix_tree_replace_slot(&mapping->i_pages, pslot, 535 newpage + i); 536 } 537 } 538 539 /* 540 * Drop cache reference from old page by unfreezing 541 * to one less reference. 542 * We know this isn't the last reference. 543 */ 544 page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); 545 546 xa_unlock(&mapping->i_pages); 547 /* Leave irq disabled to prevent preemption while updating stats */ 548 549 /* 550 * If moved to a different zone then also account 551 * the page for that zone. 
Other VM counters will be 552 * taken care of when we establish references to the 553 * new page and drop references to the old page. 554 * 555 * Note that anonymous pages are accounted for 556 * via NR_FILE_PAGES and NR_ANON_MAPPED if they 557 * are mapped to swap space. 558 */ 559 if (newzone != oldzone) { 560 __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); 561 __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); 562 if (PageSwapBacked(page) && !PageSwapCache(page)) { 563 __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); 564 __inc_node_state(newzone->zone_pgdat, NR_SHMEM); 565 } 566 if (dirty && mapping_cap_account_dirty(mapping)) { 567 __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); 568 __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); 569 __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); 570 __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); 571 } 572 } 573 local_irq_enable(); 574 575 return MIGRATEPAGE_SUCCESS; 576 } 577 EXPORT_SYMBOL(migrate_page_move_mapping); 578 579 /* 580 * The expected number of remaining references is the same as that 581 * of migrate_page_move_mapping(). 582 */ 583 int migrate_huge_page_move_mapping(struct address_space *mapping, 584 struct page *newpage, struct page *page) 585 { 586 int expected_count; 587 void **pslot; 588 589 xa_lock_irq(&mapping->i_pages); 590 591 pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); 592 593 expected_count = 2 + page_has_private(page); 594 if (page_count(page) != expected_count || 595 radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { 596 xa_unlock_irq(&mapping->i_pages); 597 return -EAGAIN; 598 } 599 600 if (!page_ref_freeze(page, expected_count)) { 601 xa_unlock_irq(&mapping->i_pages); 602 return -EAGAIN; 603 } 604 605 newpage->index = page->index; 606 newpage->mapping = page->mapping; 607 608 get_page(newpage); 609 610 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 611 612 page_ref_unfreeze(page, expected_count - 1); 613 614 xa_unlock_irq(&mapping->i_pages); 615 616 return MIGRATEPAGE_SUCCESS; 617 } 618 619 /* 620 * Gigantic pages are so large that we do not guarantee that page++ pointer 621 * arithmetic will work across the entire page. We need something more 622 * specialized. 
623 */ 624 static void __copy_gigantic_page(struct page *dst, struct page *src, 625 int nr_pages) 626 { 627 int i; 628 struct page *dst_base = dst; 629 struct page *src_base = src; 630 631 for (i = 0; i < nr_pages; ) { 632 cond_resched(); 633 copy_highpage(dst, src); 634 635 i++; 636 dst = mem_map_next(dst, dst_base, i); 637 src = mem_map_next(src, src_base, i); 638 } 639 } 640 641 static void copy_huge_page(struct page *dst, struct page *src) 642 { 643 int i; 644 int nr_pages; 645 646 if (PageHuge(src)) { 647 /* hugetlbfs page */ 648 struct hstate *h = page_hstate(src); 649 nr_pages = pages_per_huge_page(h); 650 651 if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { 652 __copy_gigantic_page(dst, src, nr_pages); 653 return; 654 } 655 } else { 656 /* thp page */ 657 BUG_ON(!PageTransHuge(src)); 658 nr_pages = hpage_nr_pages(src); 659 } 660 661 for (i = 0; i < nr_pages; i++) { 662 cond_resched(); 663 copy_highpage(dst + i, src + i); 664 } 665 } 666 667 /* 668 * Copy the page to its new location 669 */ 670 void migrate_page_states(struct page *newpage, struct page *page) 671 { 672 int cpupid; 673 674 if (PageError(page)) 675 SetPageError(newpage); 676 if (PageReferenced(page)) 677 SetPageReferenced(newpage); 678 if (PageUptodate(page)) 679 SetPageUptodate(newpage); 680 if (TestClearPageActive(page)) { 681 VM_BUG_ON_PAGE(PageUnevictable(page), page); 682 SetPageActive(newpage); 683 } else if (TestClearPageUnevictable(page)) 684 SetPageUnevictable(newpage); 685 if (PageChecked(page)) 686 SetPageChecked(newpage); 687 if (PageMappedToDisk(page)) 688 SetPageMappedToDisk(newpage); 689 690 /* Move dirty on pages not done by migrate_page_move_mapping() */ 691 if (PageDirty(page)) 692 SetPageDirty(newpage); 693 694 if (page_is_young(page)) 695 set_page_young(newpage); 696 if (page_is_idle(page)) 697 set_page_idle(newpage); 698 699 /* 700 * Copy NUMA information to the new page, to prevent over-eager 701 * future migrations of this same page. 702 */ 703 cpupid = page_cpupid_xchg_last(page, -1); 704 page_cpupid_xchg_last(newpage, cpupid); 705 706 ksm_migrate_page(newpage, page); 707 /* 708 * Please do not reorder this without considering how mm/ksm.c's 709 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 710 */ 711 if (PageSwapCache(page)) 712 ClearPageSwapCache(page); 713 ClearPagePrivate(page); 714 set_page_private(page, 0); 715 716 /* 717 * If any waiters have accumulated on the new page then 718 * wake them up. 719 */ 720 if (PageWriteback(newpage)) 721 end_page_writeback(newpage); 722 723 copy_page_owner(page, newpage); 724 725 mem_cgroup_migrate(page, newpage); 726 } 727 EXPORT_SYMBOL(migrate_page_states); 728 729 void migrate_page_copy(struct page *newpage, struct page *page) 730 { 731 if (PageHuge(page) || PageTransHuge(page)) 732 copy_huge_page(newpage, page); 733 else 734 copy_highpage(newpage, page); 735 736 migrate_page_states(newpage, page); 737 } 738 EXPORT_SYMBOL(migrate_page_copy); 739 740 /************************************************************ 741 * Migration functions 742 ***********************************************************/ 743 744 /* 745 * Common logic to directly migrate a single LRU page suitable for 746 * pages that do not use PagePrivate/PagePrivate2. 747 * 748 * Pages are locked upon entry and exit. 
749 */ 750 int migrate_page(struct address_space *mapping, 751 struct page *newpage, struct page *page, 752 enum migrate_mode mode) 753 { 754 int rc; 755 756 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 757 758 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); 759 760 if (rc != MIGRATEPAGE_SUCCESS) 761 return rc; 762 763 if (mode != MIGRATE_SYNC_NO_COPY) 764 migrate_page_copy(newpage, page); 765 else 766 migrate_page_states(newpage, page); 767 return MIGRATEPAGE_SUCCESS; 768 } 769 EXPORT_SYMBOL(migrate_page); 770 771 #ifdef CONFIG_BLOCK 772 /* 773 * Migration function for pages with buffers. This function can only be used 774 * if the underlying filesystem guarantees that no other references to "page" 775 * exist. 776 */ 777 int buffer_migrate_page(struct address_space *mapping, 778 struct page *newpage, struct page *page, enum migrate_mode mode) 779 { 780 struct buffer_head *bh, *head; 781 int rc; 782 783 if (!page_has_buffers(page)) 784 return migrate_page(mapping, newpage, page, mode); 785 786 head = page_buffers(page); 787 788 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); 789 790 if (rc != MIGRATEPAGE_SUCCESS) 791 return rc; 792 793 /* 794 * In the async case, migrate_page_move_mapping locked the buffers 795 * with an IRQ-safe spinlock held. In the sync case, the buffers 796 * need to be locked now 797 */ 798 if (mode != MIGRATE_ASYNC) 799 BUG_ON(!buffer_migrate_lock_buffers(head, mode)); 800 801 ClearPagePrivate(page); 802 set_page_private(newpage, page_private(page)); 803 set_page_private(page, 0); 804 put_page(page); 805 get_page(newpage); 806 807 bh = head; 808 do { 809 set_bh_page(bh, newpage, bh_offset(bh)); 810 bh = bh->b_this_page; 811 812 } while (bh != head); 813 814 SetPagePrivate(newpage); 815 816 if (mode != MIGRATE_SYNC_NO_COPY) 817 migrate_page_copy(newpage, page); 818 else 819 migrate_page_states(newpage, page); 820 821 bh = head; 822 do { 823 unlock_buffer(bh); 824 put_bh(bh); 825 bh = bh->b_this_page; 826 827 } while (bh != head); 828 829 return MIGRATEPAGE_SUCCESS; 830 } 831 EXPORT_SYMBOL(buffer_migrate_page); 832 #endif 833 834 /* 835 * Writeback a page to clean the dirty state 836 */ 837 static int writeout(struct address_space *mapping, struct page *page) 838 { 839 struct writeback_control wbc = { 840 .sync_mode = WB_SYNC_NONE, 841 .nr_to_write = 1, 842 .range_start = 0, 843 .range_end = LLONG_MAX, 844 .for_reclaim = 1 845 }; 846 int rc; 847 848 if (!mapping->a_ops->writepage) 849 /* No write method for the address space */ 850 return -EINVAL; 851 852 if (!clear_page_dirty_for_io(page)) 853 /* Someone else already triggered a write */ 854 return -EAGAIN; 855 856 /* 857 * A dirty page may imply that the underlying filesystem has 858 * the page on some queue. So the page must be clean for 859 * migration. Writeout may mean we loose the lock and the 860 * page state is no longer what we checked for earlier. 861 * At this point we know that the migration attempt cannot 862 * be successful. 863 */ 864 remove_migration_ptes(page, page, false); 865 866 rc = mapping->a_ops->writepage(page, &wbc); 867 868 if (rc != AOP_WRITEPAGE_ACTIVATE) 869 /* unlocked. Relock */ 870 lock_page(page); 871 872 return (rc < 0) ? -EIO : -EAGAIN; 873 } 874 875 /* 876 * Default handling if a filesystem does not provide a migration function. 
 */
static int fallback_migrate_page(struct address_space *mapping,
    struct page *newpage, struct page *page, enum migrate_mode mode)
{
    if (PageDirty(page)) {
        /* Only writeback pages in full synchronous migration */
        switch (mode) {
        case MIGRATE_SYNC:
        case MIGRATE_SYNC_NO_COPY:
            break;
        default:
            return -EBUSY;
        }
        return writeout(mapping, page);
    }

    /*
     * Buffers may be managed in a filesystem specific way.
     * We must have no buffers or drop them.
     */
    if (page_has_private(page) &&
        !try_to_release_page(page, GFP_KERNEL))
        return -EAGAIN;

    return migrate_page(mapping, newpage, page, mode);
}
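/*
 * For reference, a sketch (not from this file) of how the callbacks used by
 * move_to_new_page() below are typically wired up. A filesystem that needs no
 * special handling can point ->migratepage at the generic migrate_page()
 * helper above, while a driver exposing non-LRU movable pages provides the
 * three callbacks consumed by isolate_movable_page(), move_to_new_page() and
 * putback_movable_page(), and typically marks its pages with
 * __SetPageMovable() so that __PageMovable() recognizes them. All names
 * prefixed example_ are placeholders for this sketch only.
 */
#if 0
static const struct address_space_operations example_fs_aops = {
    .migratepage    = migrate_page,     /* no PagePrivate data to move */
};

static bool example_isolate(struct page *page, isolate_mode_t mode);
static int example_migratepage(struct address_space *mapping,
        struct page *newpage, struct page *page,
        enum migrate_mode mode);
static void example_putback(struct page *page);

static const struct address_space_operations example_driver_aops = {
    .isolate_page   = example_isolate,
    .migratepage    = example_migratepage,
    .putback_page   = example_putback,
};
#endif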
/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
    struct address_space *mapping;
    int rc = -EAGAIN;
    bool is_lru = !__PageMovable(page);

    VM_BUG_ON_PAGE(!PageLocked(page), page);
    VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

    mapping = page_mapping(page);

    if (likely(is_lru)) {
        if (!mapping)
            rc = migrate_page(mapping, newpage, page, mode);
        else if (mapping->a_ops->migratepage)
            /*
             * Most pages have a mapping and most filesystems
             * provide a migratepage callback. Anonymous pages
             * are part of swap space which also has its own
             * migratepage callback. This is the most common path
             * for page migration.
             */
            rc = mapping->a_ops->migratepage(mapping, newpage,
                            page, mode);
        else
            rc = fallback_migrate_page(mapping, newpage,
                            page, mode);
    } else {
        /*
         * In the case of a non-lru page, it could be released after
         * the isolation step. In that case, we shouldn't try migration.
         */
        VM_BUG_ON_PAGE(!PageIsolated(page), page);
        if (!PageMovable(page)) {
            rc = MIGRATEPAGE_SUCCESS;
            __ClearPageIsolated(page);
            goto out;
        }

        rc = mapping->a_ops->migratepage(mapping, newpage,
                        page, mode);
        WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
            !PageIsolated(page));
    }

    /*
     * When successful, old pagecache page->mapping must be cleared before
     * page is freed; but stats require that PageAnon be left as PageAnon.
     */
    if (rc == MIGRATEPAGE_SUCCESS) {
        if (__PageMovable(page)) {
            VM_BUG_ON_PAGE(!PageIsolated(page), page);

            /*
             * We clear PG_movable under page_lock so any compactor
             * cannot try to migrate this page.
             */
            __ClearPageIsolated(page);
        }

        /*
         * Anonymous and movable page->mapping will be cleared by
         * free_pages_prepare so don't reset it here, which keeps
         * checks such as PageAnon() working.
         */
        if (!PageMappingFlags(page))
            page->mapping = NULL;
    }
out:
    return rc;
}

static int __unmap_and_move(struct page *page, struct page *newpage,
                int force, enum migrate_mode mode)
{
    int rc = -EAGAIN;
    int page_was_mapped = 0;
    struct anon_vma *anon_vma = NULL;
    bool is_lru = !__PageMovable(page);

    if (!trylock_page(page)) {
        if (!force || mode == MIGRATE_ASYNC)
            goto out;

        /*
         * It's not safe for direct compaction to call lock_page.
         * For example, during page readahead pages are added locked
         * to the LRU. Later, when the IO completes the pages are
         * marked uptodate and unlocked. However, the queueing
         * could be merging multiple pages for one bio (e.g.
         * mpage_readpages). If an allocation happens for the
         * second or third page, the process can end up locking
         * the same page twice and deadlocking. Rather than
         * trying to be clever about what pages can be locked,
         * avoid the use of lock_page for direct compaction
         * altogether.
         */
        if (current->flags & PF_MEMALLOC)
            goto out;

        lock_page(page);
    }

    if (PageWriteback(page)) {
        /*
         * Only in the case of a full synchronous migration is it
         * necessary to wait for PageWriteback. In the async case,
         * the retry loop is too short and in the sync-light case,
         * the overhead of stalling is too much.
         */
        switch (mode) {
        case MIGRATE_SYNC:
        case MIGRATE_SYNC_NO_COPY:
            break;
        default:
            rc = -EBUSY;
            goto out_unlock;
        }
        if (!force)
            goto out_unlock;
        wait_on_page_writeback(page);
    }

    /*
     * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
     * we cannot notice that anon_vma is freed while we migrate a page.
     * This get_anon_vma() delays freeing the anon_vma pointer until the end
     * of migration. File cache pages are no problem because of page_lock():
     * file caches may use writepage() or lock_page() during migration, so
     * we only need to care about anon pages here.
     *
     * Only page_get_anon_vma() understands the subtleties of
     * getting a hold on an anon_vma from outside one of its mms.
     * But if we cannot get anon_vma, then we won't need it anyway,
     * because that implies that the anon page is no longer mapped
     * (and cannot be remapped so long as we hold the page lock).
     */
    if (PageAnon(page) && !PageKsm(page))
        anon_vma = page_get_anon_vma(page);

    /*
     * Block others from accessing the new page when we get around to
     * establishing additional references. We are usually the only one
     * holding a reference to newpage at this point. We used to have a BUG
     * here if trylock_page(newpage) fails, but would like to allow for
     * cases where there might be a race with the previous use of newpage.
     * This is much like races on refcount of oldpage: just don't BUG().
     */
    if (unlikely(!trylock_page(newpage)))
        goto out_unlock;

    if (unlikely(!is_lru)) {
        rc = move_to_new_page(newpage, page, mode);
        goto out_unlock_both;
    }

    /*
     * Corner case handling:
     * 1. When a new swap-cache page is read in, it is added to the LRU
     * and treated as swapcache but it has no rmap yet.
     * Calling try_to_unmap() against a page->mapping==NULL page will
     * trigger a BUG. So handle it here.
     * 2. An orphaned page (see truncate_complete_page) might have
     * fs-private metadata. The page can be picked up due to memory
     * offlining. Everywhere else except page reclaim, the page is
     * invisible to the vm, so the page can not be migrated. So try to
     * free the metadata, so the page can be freed.
     */
    if (!page->mapping) {
        VM_BUG_ON_PAGE(PageAnon(page), page);
        if (page_has_private(page)) {
            try_to_free_buffers(page);
            goto out_unlock_both;
        }
    } else if (page_mapped(page)) {
        /* Establish migration ptes */
        VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                page);
        try_to_unmap(page,
            TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
        page_was_mapped = 1;
    }

    if (!page_mapped(page))
        rc = move_to_new_page(newpage, page, mode);

    if (page_was_mapped)
        remove_migration_ptes(page,
            rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

out_unlock_both:
    unlock_page(newpage);
out_unlock:
    /* Drop an anon_vma reference if we took one */
    if (anon_vma)
        put_anon_vma(anon_vma);
    unlock_page(page);
out:
    /*
     * If migration is successful, decrease the refcount of the newpage,
     * which will not free the page because the new page owner has
     * increased the refcount. As well, if it is an LRU page, add the page
     * to the LRU list here.
     */
    if (rc == MIGRATEPAGE_SUCCESS) {
        if (unlikely(__PageMovable(newpage)))
            put_page(newpage);
        else
            putback_lru_page(newpage);
    }

    return rc;
}
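/*
 * try_to_unmap(..., TTU_MIGRATION) above replaces each mapping PTE with a
 * migration swap entry; __migration_entry_wait() earlier in this file is the
 * place that decodes such entries when a fault hits one. A minimal sketch of
 * that decode step, for illustration only (the helper name is an assumption
 * and nothing in this file calls it):
 */
#if 0
static struct page *example_migration_entry_page(pte_t pte)
{
    swp_entry_t entry;

    if (!is_swap_pte(pte))
        return NULL;        /* present or none: not a migration entry */

    entry = pte_to_swp_entry(pte);
    if (!is_migration_entry(entry))
        return NULL;        /* some other kind of swap entry */

    /* The page that is currently under migration. */
    return migration_entry_to_page(entry);
}
#endif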
/*
 * gcc 4.7 and 4.8 on arm get an ICE when inlining unmap_and_move(). Work
 * around it.
 */
#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
#define ICE_noinline noinline
#else
#define ICE_noinline
#endif

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                    free_page_t put_new_page,
                    unsigned long private, struct page *page,
                    int force, enum migrate_mode mode,
                    enum migrate_reason reason)
{
    int rc = MIGRATEPAGE_SUCCESS;
    struct page *newpage;

    if (!thp_migration_supported() && PageTransHuge(page))
        return -ENOMEM;

    newpage = get_new_page(page, private);
    if (!newpage)
        return -ENOMEM;

    if (page_count(page) == 1) {
        /* page was freed from under us. So we are done. */
        ClearPageActive(page);
        ClearPageUnevictable(page);
        if (unlikely(__PageMovable(page))) {
            lock_page(page);
            if (!PageMovable(page))
                __ClearPageIsolated(page);
            unlock_page(page);
        }
        if (put_new_page)
            put_new_page(newpage, private);
        else
            put_page(newpage);
        goto out;
    }

    rc = __unmap_and_move(page, newpage, force, mode);
    if (rc == MIGRATEPAGE_SUCCESS)
        set_page_owner_migrate_reason(newpage, reason);

out:
    if (rc != -EAGAIN) {
        /*
         * A page that has been migrated has all references
         * removed and will be freed. A page that has not been
         * migrated will have kept its references and be
         * restored.
         */
        list_del(&page->lru);

        /*
         * Compaction can also migrate non-LRU pages, which are
         * not accounted to NR_ISOLATED_*. They can be recognized
         * as __PageMovable.
         */
        if (likely(!__PageMovable(page)))
            mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
                    page_is_file_cache(page), -hpage_nr_pages(page));
    }

    /*
     * If migration is successful, release the reference grabbed during
     * isolation. Otherwise, restore the page to the right list unless
     * we want to retry.
     */
    if (rc == MIGRATEPAGE_SUCCESS) {
        put_page(page);
        if (reason == MR_MEMORY_FAILURE) {
            /*
             * Set PG_HWPoison on the just freed page
             * intentionally. Although it's rather weird,
             * it's how the HWPoison flag works at the moment.
             */
            if (!test_set_page_hwpoison(page))
                num_poisoned_pages_inc();
        }
    } else {
        if (rc != -EAGAIN) {
            if (likely(!__PageMovable(page))) {
                putback_lru_page(page);
                goto put_new;
            }

            lock_page(page);
            if (PageMovable(page))
                putback_movable_page(page);
            else
                __ClearPageIsolated(page);
            unlock_page(page);
            put_page(page);
        }
put_new:
        if (put_new_page)
            put_new_page(newpage, private);
        else
            put_page(newpage);
    }

    return rc;
}
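/*
 * unmap_and_move_huge_page() below is reached through the same
 * migrate_pages() loop as unmap_and_move(). For illustration only, a sketch
 * of how a hugetlb page typically enters that path, loosely mirroring callers
 * such as soft offline. The helper name and the MIGRATE_SYNC/MR_MEMORY_FAILURE
 * choices are assumptions for this example; isolate_huge_page() comes from
 * <linux/hugetlb.h>.
 */
#if 0
static int example_migrate_one_hugepage(struct page *hpage,
            new_page_t get_new_page, unsigned long private)
{
    LIST_HEAD(pagelist);
    int ret;

    /* Takes a reference and moves the hugepage onto the private list. */
    if (!isolate_huge_page(hpage, &pagelist))
        return -EBUSY;

    ret = migrate_pages(&pagelist, get_new_page, NULL, private,
                MIGRATE_SYNC, MR_MEMORY_FAILURE);
    /* On failure the page may still be on the list; put it back if so. */
    if (ret && !list_empty(&pagelist))
        putback_movable_pages(&pagelist);
    return ret;
}
#endif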
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function doesn't wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepages.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and the writeback status of all subpages is counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate a hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then the pte is replaced with a migration swap entry and direct I/O
 * code will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
                struct page *hpage, int force,
                enum migrate_mode mode, int reason)
{
    int rc = -EAGAIN;
    int page_was_mapped = 0;
    struct page *new_hpage;
    struct anon_vma *anon_vma = NULL;

    /*
     * Movability of hugepages depends on architectures and hugepage size.
     * This check is necessary because some callers of hugepage migration
     * like soft offline and memory hotremove don't walk through page
     * tables or check whether the hugepage is pmd-based or not before
     * kicking migration.
     */
    if (!hugepage_migration_supported(page_hstate(hpage))) {
        putback_active_hugepage(hpage);
        return -ENOSYS;
    }

    new_hpage = get_new_page(hpage, private);
    if (!new_hpage)
        return -ENOMEM;

    if (!trylock_page(hpage)) {
        if (!force)
            goto out;
        switch (mode) {
        case MIGRATE_SYNC:
        case MIGRATE_SYNC_NO_COPY:
            break;
        default:
            goto out;
        }
        lock_page(hpage);
    }

    if (PageAnon(hpage))
        anon_vma = page_get_anon_vma(hpage);

    if (unlikely(!trylock_page(new_hpage)))
        goto put_anon;

    if (page_mapped(hpage)) {
        try_to_unmap(hpage,
            TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
        page_was_mapped = 1;
    }

    if (!page_mapped(hpage))
        rc = move_to_new_page(new_hpage, hpage, mode);

    if (page_was_mapped)
        remove_migration_ptes(hpage,
            rc == MIGRATEPAGE_SUCCESS ?
new_hpage : hpage, false); 1317 1318 unlock_page(new_hpage); 1319 1320 put_anon: 1321 if (anon_vma) 1322 put_anon_vma(anon_vma); 1323 1324 if (rc == MIGRATEPAGE_SUCCESS) { 1325 move_hugetlb_state(hpage, new_hpage, reason); 1326 put_new_page = NULL; 1327 } 1328 1329 unlock_page(hpage); 1330 out: 1331 if (rc != -EAGAIN) 1332 putback_active_hugepage(hpage); 1333 if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) 1334 num_poisoned_pages_inc(); 1335 1336 /* 1337 * If migration was not successful and there's a freeing callback, use 1338 * it. Otherwise, put_page() will drop the reference grabbed during 1339 * isolation. 1340 */ 1341 if (put_new_page) 1342 put_new_page(new_hpage, private); 1343 else 1344 putback_active_hugepage(new_hpage); 1345 1346 return rc; 1347 } 1348 1349 /* 1350 * migrate_pages - migrate the pages specified in a list, to the free pages 1351 * supplied as the target for the page migration 1352 * 1353 * @from: The list of pages to be migrated. 1354 * @get_new_page: The function used to allocate free pages to be used 1355 * as the target of the page migration. 1356 * @put_new_page: The function used to free target pages if migration 1357 * fails, or NULL if no special handling is necessary. 1358 * @private: Private data to be passed on to get_new_page() 1359 * @mode: The migration mode that specifies the constraints for 1360 * page migration, if any. 1361 * @reason: The reason for page migration. 1362 * 1363 * The function returns after 10 attempts or if no pages are movable any more 1364 * because the list has become empty or no retryable pages exist any more. 1365 * The caller should call putback_movable_pages() to return pages to the LRU 1366 * or free list only if ret != 0. 1367 * 1368 * Returns the number of pages that were not migrated, or an error code. 1369 */ 1370 int migrate_pages(struct list_head *from, new_page_t get_new_page, 1371 free_page_t put_new_page, unsigned long private, 1372 enum migrate_mode mode, int reason) 1373 { 1374 int retry = 1; 1375 int nr_failed = 0; 1376 int nr_succeeded = 0; 1377 int pass = 0; 1378 struct page *page; 1379 struct page *page2; 1380 int swapwrite = current->flags & PF_SWAPWRITE; 1381 int rc; 1382 1383 if (!swapwrite) 1384 current->flags |= PF_SWAPWRITE; 1385 1386 for(pass = 0; pass < 10 && retry; pass++) { 1387 retry = 0; 1388 1389 list_for_each_entry_safe(page, page2, from, lru) { 1390 retry: 1391 cond_resched(); 1392 1393 if (PageHuge(page)) 1394 rc = unmap_and_move_huge_page(get_new_page, 1395 put_new_page, private, page, 1396 pass > 2, mode, reason); 1397 else 1398 rc = unmap_and_move(get_new_page, put_new_page, 1399 private, page, pass > 2, mode, 1400 reason); 1401 1402 switch(rc) { 1403 case -ENOMEM: 1404 /* 1405 * THP migration might be unsupported or the 1406 * allocation could've failed so we should 1407 * retry on the same page with the THP split 1408 * to base pages. 1409 * 1410 * Head page is retried immediately and tail 1411 * pages are added to the tail of the list so 1412 * we encounter them after the rest of the list 1413 * is processed. 
1414 */ 1415 if (PageTransHuge(page)) { 1416 lock_page(page); 1417 rc = split_huge_page_to_list(page, from); 1418 unlock_page(page); 1419 if (!rc) { 1420 list_safe_reset_next(page, page2, lru); 1421 goto retry; 1422 } 1423 } 1424 nr_failed++; 1425 goto out; 1426 case -EAGAIN: 1427 retry++; 1428 break; 1429 case MIGRATEPAGE_SUCCESS: 1430 nr_succeeded++; 1431 break; 1432 default: 1433 /* 1434 * Permanent failure (-EBUSY, -ENOSYS, etc.): 1435 * unlike -EAGAIN case, the failed page is 1436 * removed from migration page list and not 1437 * retried in the next outer loop. 1438 */ 1439 nr_failed++; 1440 break; 1441 } 1442 } 1443 } 1444 nr_failed += retry; 1445 rc = nr_failed; 1446 out: 1447 if (nr_succeeded) 1448 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); 1449 if (nr_failed) 1450 count_vm_events(PGMIGRATE_FAIL, nr_failed); 1451 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); 1452 1453 if (!swapwrite) 1454 current->flags &= ~PF_SWAPWRITE; 1455 1456 return rc; 1457 } 1458 1459 #ifdef CONFIG_NUMA 1460 1461 static int store_status(int __user *status, int start, int value, int nr) 1462 { 1463 while (nr-- > 0) { 1464 if (put_user(value, status + start)) 1465 return -EFAULT; 1466 start++; 1467 } 1468 1469 return 0; 1470 } 1471 1472 static int do_move_pages_to_node(struct mm_struct *mm, 1473 struct list_head *pagelist, int node) 1474 { 1475 int err; 1476 1477 if (list_empty(pagelist)) 1478 return 0; 1479 1480 err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, 1481 MIGRATE_SYNC, MR_SYSCALL); 1482 if (err) 1483 putback_movable_pages(pagelist); 1484 return err; 1485 } 1486 1487 /* 1488 * Resolves the given address to a struct page, isolates it from the LRU and 1489 * puts it to the given pagelist. 1490 * Returns -errno if the page cannot be found/isolated or 0 when it has been 1491 * queued or the page doesn't need to be migrated because it is already on 1492 * the target node 1493 */ 1494 static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, 1495 int node, struct list_head *pagelist, bool migrate_all) 1496 { 1497 struct vm_area_struct *vma; 1498 struct page *page; 1499 unsigned int follflags; 1500 int err; 1501 1502 down_read(&mm->mmap_sem); 1503 err = -EFAULT; 1504 vma = find_vma(mm, addr); 1505 if (!vma || addr < vma->vm_start || !vma_migratable(vma)) 1506 goto out; 1507 1508 /* FOLL_DUMP to ignore special (like zero) pages */ 1509 follflags = FOLL_GET | FOLL_DUMP; 1510 page = follow_page(vma, addr, follflags); 1511 1512 err = PTR_ERR(page); 1513 if (IS_ERR(page)) 1514 goto out; 1515 1516 err = -ENOENT; 1517 if (!page) 1518 goto out; 1519 1520 err = 0; 1521 if (page_to_nid(page) == node) 1522 goto out_putpage; 1523 1524 err = -EACCES; 1525 if (page_mapcount(page) > 1 && !migrate_all) 1526 goto out_putpage; 1527 1528 if (PageHuge(page)) { 1529 if (PageHead(page)) { 1530 isolate_huge_page(page, pagelist); 1531 err = 0; 1532 } 1533 } else { 1534 struct page *head; 1535 1536 head = compound_head(page); 1537 err = isolate_lru_page(head); 1538 if (err) 1539 goto out_putpage; 1540 1541 err = 0; 1542 list_add_tail(&head->lru, pagelist); 1543 mod_node_page_state(page_pgdat(head), 1544 NR_ISOLATED_ANON + page_is_file_cache(head), 1545 hpage_nr_pages(head)); 1546 } 1547 out_putpage: 1548 /* 1549 * Either remove the duplicate refcount from 1550 * isolate_lru_page() or drop the page ref if it was 1551 * not isolated. 
1552 */ 1553 put_page(page); 1554 out: 1555 up_read(&mm->mmap_sem); 1556 return err; 1557 } 1558 1559 /* 1560 * Migrate an array of page address onto an array of nodes and fill 1561 * the corresponding array of status. 1562 */ 1563 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, 1564 unsigned long nr_pages, 1565 const void __user * __user *pages, 1566 const int __user *nodes, 1567 int __user *status, int flags) 1568 { 1569 int current_node = NUMA_NO_NODE; 1570 LIST_HEAD(pagelist); 1571 int start, i; 1572 int err = 0, err1; 1573 1574 migrate_prep(); 1575 1576 for (i = start = 0; i < nr_pages; i++) { 1577 const void __user *p; 1578 unsigned long addr; 1579 int node; 1580 1581 err = -EFAULT; 1582 if (get_user(p, pages + i)) 1583 goto out_flush; 1584 if (get_user(node, nodes + i)) 1585 goto out_flush; 1586 addr = (unsigned long)p; 1587 1588 err = -ENODEV; 1589 if (node < 0 || node >= MAX_NUMNODES) 1590 goto out_flush; 1591 if (!node_state(node, N_MEMORY)) 1592 goto out_flush; 1593 1594 err = -EACCES; 1595 if (!node_isset(node, task_nodes)) 1596 goto out_flush; 1597 1598 if (current_node == NUMA_NO_NODE) { 1599 current_node = node; 1600 start = i; 1601 } else if (node != current_node) { 1602 err = do_move_pages_to_node(mm, &pagelist, current_node); 1603 if (err) 1604 goto out; 1605 err = store_status(status, start, current_node, i - start); 1606 if (err) 1607 goto out; 1608 start = i; 1609 current_node = node; 1610 } 1611 1612 /* 1613 * Errors in the page lookup or isolation are not fatal and we simply 1614 * report them via status 1615 */ 1616 err = add_page_for_migration(mm, addr, current_node, 1617 &pagelist, flags & MPOL_MF_MOVE_ALL); 1618 if (!err) 1619 continue; 1620 1621 err = store_status(status, i, err, 1); 1622 if (err) 1623 goto out_flush; 1624 1625 err = do_move_pages_to_node(mm, &pagelist, current_node); 1626 if (err) 1627 goto out; 1628 if (i > start) { 1629 err = store_status(status, start, current_node, i - start); 1630 if (err) 1631 goto out; 1632 } 1633 current_node = NUMA_NO_NODE; 1634 } 1635 out_flush: 1636 if (list_empty(&pagelist)) 1637 return err; 1638 1639 /* Make sure we do not overwrite the existing error */ 1640 err1 = do_move_pages_to_node(mm, &pagelist, current_node); 1641 if (!err1) 1642 err1 = store_status(status, start, current_node, i - start); 1643 if (!err) 1644 err = err1; 1645 out: 1646 return err; 1647 } 1648 1649 /* 1650 * Determine the nodes of an array of pages and store it in an array of status. 1651 */ 1652 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, 1653 const void __user **pages, int *status) 1654 { 1655 unsigned long i; 1656 1657 down_read(&mm->mmap_sem); 1658 1659 for (i = 0; i < nr_pages; i++) { 1660 unsigned long addr = (unsigned long)(*pages); 1661 struct vm_area_struct *vma; 1662 struct page *page; 1663 int err = -EFAULT; 1664 1665 vma = find_vma(mm, addr); 1666 if (!vma || addr < vma->vm_start) 1667 goto set_status; 1668 1669 /* FOLL_DUMP to ignore special (like zero) pages */ 1670 page = follow_page(vma, addr, FOLL_DUMP); 1671 1672 err = PTR_ERR(page); 1673 if (IS_ERR(page)) 1674 goto set_status; 1675 1676 err = page ? page_to_nid(page) : -ENOENT; 1677 set_status: 1678 *status = err; 1679 1680 pages++; 1681 status++; 1682 } 1683 1684 up_read(&mm->mmap_sem); 1685 } 1686 1687 /* 1688 * Determine the nodes of a user array of pages and store it in 1689 * a user array of status. 
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
             const void __user * __user *pages,
             int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
    const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
    int chunk_status[DO_PAGES_STAT_CHUNK_NR];

    while (nr_pages) {
        unsigned long chunk_nr;

        chunk_nr = nr_pages;
        if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
            chunk_nr = DO_PAGES_STAT_CHUNK_NR;

        if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
            break;

        do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

        if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
            break;

        pages += chunk_nr;
        status += chunk_nr;
        nr_pages -= chunk_nr;
    }
    return nr_pages ? -EFAULT : 0;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
                 const void __user * __user *pages,
                 const int __user *nodes,
                 int __user *status, int flags)
{
    struct task_struct *task;
    struct mm_struct *mm;
    int err;
    nodemask_t task_nodes;

    /* Check flags */
    if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
        return -EINVAL;

    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
        return -EPERM;

    /* Find the mm_struct */
    rcu_read_lock();
    task = pid ? find_task_by_vpid(pid) : current;
    if (!task) {
        rcu_read_unlock();
        return -ESRCH;
    }
    get_task_struct(task);

    /*
     * Check if this process has the right to modify the specified
     * process. Use the regular "ptrace_may_access()" checks.
     */
    if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
        rcu_read_unlock();
        err = -EPERM;
        goto out;
    }
    rcu_read_unlock();

    err = security_task_movememory(task);
    if (err)
        goto out;

    task_nodes = cpuset_mems_allowed(task);
    mm = get_task_mm(task);
    put_task_struct(task);

    if (!mm)
        return -EINVAL;

    if (nodes)
        err = do_pages_move(mm, task_nodes, nr_pages, pages,
                    nodes, status, flags);
    else
        err = do_pages_stat(mm, nr_pages, pages, status);

    mmput(mm);
    return err;

out:
    put_task_struct(task);
    return err;
}

SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        const void __user * __user *, pages,
        const int __user *, nodes,
        int __user *, status, int, flags)
{
    return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
               compat_uptr_t __user *, pages32,
               const int __user *, nodes,
               int __user *, status,
               int, flags)
{
    const void __user * __user *pages;
    int i;

    pages = compat_alloc_user_space(nr_pages * sizeof(void *));
    for (i = 0; i < nr_pages; i++) {
        compat_uptr_t p;

        if (get_user(p, pages32 + i) ||
            put_user(compat_ptr(p), pages + i))
            return -EFAULT;
    }
    return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
#endif /* CONFIG_COMPAT */
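/*
 * Illustrative sketch (not part of the original file): how the move_pages()
 * system call defined above is typically driven from userspace through the
 * libnuma <numaif.h> wrapper. The buffer sizes and target node below are
 * assumptions made for the example only.
 *
 *    #include <numaif.h>
 *
 *    void *pages[1] = { some_addr };   // address inside the page to move
 *    int nodes[1]   = { 1 };           // desired target node
 *    int status[1];
 *
 *    // Move the page of the calling process (pid 0) to node 1.
 *    long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 *    // Query only: with nodes == NULL the kernel takes the
 *    // do_pages_stat() path above and fills status[] with the node
 *    // each page currently resides on (or a negative errno).
 *    rc = move_pages(0, 1, pages, NULL, status, 0);
 *
 * On return, status[i] holds the node the page was moved to (or currently
 * resides on), or a negative error code such as -ENOENT when the address is
 * not backed by a page.
 */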
#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks, which is a crude check.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
                   unsigned long nr_migrate_pages)
{
    int z;

    for (z = pgdat->nr_zones - 1; z >= 0; z--) {
        struct zone *zone = pgdat->node_zones + z;

        if (!populated_zone(zone))
            continue;

        /* Avoid waking kswapd by allocating pages_to_migrate pages. */
        if (!zone_watermark_ok(zone, 0,
                       high_wmark_pages(zone) +
                       nr_migrate_pages,
                       0, 0))
            continue;
        return true;
    }
    return false;
}

static struct page *alloc_misplaced_dst_page(struct page *page,
                       unsigned long data)
{
    int nid = (int) data;
    struct page *newpage;

    newpage = __alloc_pages_node(nid,
                     (GFP_HIGHUSER_MOVABLE |
                      __GFP_THISNODE | __GFP_NOMEMALLOC |
                      __GFP_NORETRY | __GFP_NOWARN) &
                     ~__GFP_RECLAIM, 0);

    return newpage;
}

/*
 * page migration rate limiting control.
 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
 * window of time. Default here says do not migrate more than 1280M per second.
 */
static unsigned int migrate_interval_millisecs __read_mostly = 100;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);

/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
                    unsigned long nr_pages)
{
    /*
     * Rate-limit the amount of data that is being migrated to a node.
     * Optimal placement is no good if the memory bus is saturated and
     * all the time is being spent migrating!
     */
    if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
        spin_lock(&pgdat->numabalancing_migrate_lock);
        pgdat->numabalancing_migrate_nr_pages = 0;
        pgdat->numabalancing_migrate_next_window = jiffies +
            msecs_to_jiffies(migrate_interval_millisecs);
        spin_unlock(&pgdat->numabalancing_migrate_lock);
    }
    if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
        trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
                                nr_pages);
        return true;
    }

    /*
     * This is an unlocked non-atomic update so errors are possible.
     * The consequence is failing to migrate when we potentially should
     * have, which is not severe enough to warrant locking. If it is ever
     * a problem, it can be converted to a per-cpu counter.
     */
    pgdat->numabalancing_migrate_nr_pages += nr_pages;
    return false;
}

static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
    int page_lru;

    VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);

    /* Avoid migrating to a node that is nearly full */
    if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
        return 0;

    if (isolate_lru_page(page))
        return 0;

    /*
     * migrate_misplaced_transhuge_page() skips page migration's usual
     * check on page_count(), so we must do it here, now that the page
     * has been isolated: a GUP pin, or any other pin, prevents migration.
     * The expected page count is 3: 1 for page's mapcount and 1 for the
     * caller's pin and 1 for the reference taken by isolate_lru_page().
1918 */ 1919 if (PageTransHuge(page) && page_count(page) != 3) { 1920 putback_lru_page(page); 1921 return 0; 1922 } 1923 1924 page_lru = page_is_file_cache(page); 1925 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, 1926 hpage_nr_pages(page)); 1927 1928 /* 1929 * Isolating the page has taken another reference, so the 1930 * caller's reference can be safely dropped without the page 1931 * disappearing underneath us during migration. 1932 */ 1933 put_page(page); 1934 return 1; 1935 } 1936 1937 bool pmd_trans_migrating(pmd_t pmd) 1938 { 1939 struct page *page = pmd_page(pmd); 1940 return PageLocked(page); 1941 } 1942 1943 /* 1944 * Attempt to migrate a misplaced page to the specified destination 1945 * node. Caller is expected to have an elevated reference count on 1946 * the page that will be dropped by this function before returning. 1947 */ 1948 int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 1949 int node) 1950 { 1951 pg_data_t *pgdat = NODE_DATA(node); 1952 int isolated; 1953 int nr_remaining; 1954 LIST_HEAD(migratepages); 1955 1956 /* 1957 * Don't migrate file pages that are mapped in multiple processes 1958 * with execute permissions as they are probably shared libraries. 1959 */ 1960 if (page_mapcount(page) != 1 && page_is_file_cache(page) && 1961 (vma->vm_flags & VM_EXEC)) 1962 goto out; 1963 1964 /* 1965 * Also do not migrate dirty pages as not all filesystems can move 1966 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 1967 */ 1968 if (page_is_file_cache(page) && PageDirty(page)) 1969 goto out; 1970 1971 /* 1972 * Rate-limit the amount of data that is being migrated to a node. 1973 * Optimal placement is no good if the memory bus is saturated and 1974 * all the time is being spent migrating! 1975 */ 1976 if (numamigrate_update_ratelimit(pgdat, 1)) 1977 goto out; 1978 1979 isolated = numamigrate_isolate_page(pgdat, page); 1980 if (!isolated) 1981 goto out; 1982 1983 list_add(&page->lru, &migratepages); 1984 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1985 NULL, node, MIGRATE_ASYNC, 1986 MR_NUMA_MISPLACED); 1987 if (nr_remaining) { 1988 if (!list_empty(&migratepages)) { 1989 list_del(&page->lru); 1990 dec_node_page_state(page, NR_ISOLATED_ANON + 1991 page_is_file_cache(page)); 1992 putback_lru_page(page); 1993 } 1994 isolated = 0; 1995 } else 1996 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1997 BUG_ON(!list_empty(&migratepages)); 1998 return isolated; 1999 2000 out: 2001 put_page(page); 2002 return 0; 2003 } 2004 #endif /* CONFIG_NUMA_BALANCING */ 2005 2006 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 2007 /* 2008 * Migrates a THP to a given target node. page must be locked and is unlocked 2009 * before returning. 2010 */ 2011 int migrate_misplaced_transhuge_page(struct mm_struct *mm, 2012 struct vm_area_struct *vma, 2013 pmd_t *pmd, pmd_t entry, 2014 unsigned long address, 2015 struct page *page, int node) 2016 { 2017 spinlock_t *ptl; 2018 pg_data_t *pgdat = NODE_DATA(node); 2019 int isolated = 0; 2020 struct page *new_page = NULL; 2021 int page_lru = page_is_file_cache(page); 2022 unsigned long mmun_start = address & HPAGE_PMD_MASK; 2023 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 2024 2025 /* 2026 * Rate-limit the amount of data that is being migrated to a node. 2027 * Optimal placement is no good if the memory bus is saturated and 2028 * all the time is being spent migrating! 
2029 */ 2030 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) 2031 goto out_dropref; 2032 2033 new_page = alloc_pages_node(node, 2034 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 2035 HPAGE_PMD_ORDER); 2036 if (!new_page) 2037 goto out_fail; 2038 prep_transhuge_page(new_page); 2039 2040 isolated = numamigrate_isolate_page(pgdat, page); 2041 if (!isolated) { 2042 put_page(new_page); 2043 goto out_fail; 2044 } 2045 2046 /* Prepare a page as a migration target */ 2047 __SetPageLocked(new_page); 2048 if (PageSwapBacked(page)) 2049 __SetPageSwapBacked(new_page); 2050 2051 /* anon mapping, we can simply copy page->mapping to the new page: */ 2052 new_page->mapping = page->mapping; 2053 new_page->index = page->index; 2054 migrate_page_copy(new_page, page); 2055 WARN_ON(PageLRU(new_page)); 2056 2057 /* Recheck the target PMD */ 2058 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2059 ptl = pmd_lock(mm, pmd); 2060 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2061 spin_unlock(ptl); 2062 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2063 2064 /* Reverse changes made by migrate_page_copy() */ 2065 if (TestClearPageActive(new_page)) 2066 SetPageActive(page); 2067 if (TestClearPageUnevictable(new_page)) 2068 SetPageUnevictable(page); 2069 2070 unlock_page(new_page); 2071 put_page(new_page); /* Free it */ 2072 2073 /* Retake the callers reference and putback on LRU */ 2074 get_page(page); 2075 putback_lru_page(page); 2076 mod_node_page_state(page_pgdat(page), 2077 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 2078 2079 goto out_unlock; 2080 } 2081 2082 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 2083 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2084 2085 /* 2086 * Clear the old entry under pagetable lock and establish the new PTE. 2087 * Any parallel GUP will either observe the old page blocking on the 2088 * page lock, block on the page table lock or observe the new page. 2089 * The SetPageUptodate on the new page and page_add_new_anon_rmap 2090 * guarantee the copy is visible before the pagetable update. 2091 */ 2092 flush_cache_range(vma, mmun_start, mmun_end); 2093 page_add_anon_rmap(new_page, vma, mmun_start, true); 2094 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 2095 set_pmd_at(mm, mmun_start, pmd, entry); 2096 update_mmu_cache_pmd(vma, address, &entry); 2097 2098 page_ref_unfreeze(page, 2); 2099 mlock_migrate_page(new_page, page); 2100 page_remove_rmap(page, true); 2101 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2102 2103 spin_unlock(ptl); 2104 /* 2105 * No need to double call mmu_notifier->invalidate_range() callback as 2106 * the above pmdp_huge_clear_flush_notify() did already call it. 2107 */ 2108 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); 2109 2110 /* Take an "isolate" reference and put new page on the LRU. 
*/ 2111 get_page(new_page); 2112 putback_lru_page(new_page); 2113 2114 unlock_page(new_page); 2115 unlock_page(page); 2116 put_page(page); /* Drop the rmap reference */ 2117 put_page(page); /* Drop the LRU isolation reference */ 2118 2119 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2120 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2121 2122 mod_node_page_state(page_pgdat(page), 2123 NR_ISOLATED_ANON + page_lru, 2124 -HPAGE_PMD_NR); 2125 return isolated; 2126 2127 out_fail: 2128 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2129 out_dropref: 2130 ptl = pmd_lock(mm, pmd); 2131 if (pmd_same(*pmd, entry)) { 2132 entry = pmd_modify(entry, vma->vm_page_prot); 2133 set_pmd_at(mm, mmun_start, pmd, entry); 2134 update_mmu_cache_pmd(vma, address, &entry); 2135 } 2136 spin_unlock(ptl); 2137 2138 out_unlock: 2139 unlock_page(page); 2140 put_page(page); 2141 return 0; 2142 } 2143 #endif /* CONFIG_NUMA_BALANCING */ 2144 2145 #endif /* CONFIG_NUMA */ 2146 2147 #if defined(CONFIG_MIGRATE_VMA_HELPER) 2148 struct migrate_vma { 2149 struct vm_area_struct *vma; 2150 unsigned long *dst; 2151 unsigned long *src; 2152 unsigned long cpages; 2153 unsigned long npages; 2154 unsigned long start; 2155 unsigned long end; 2156 }; 2157 2158 static int migrate_vma_collect_hole(unsigned long start, 2159 unsigned long end, 2160 struct mm_walk *walk) 2161 { 2162 struct migrate_vma *migrate = walk->private; 2163 unsigned long addr; 2164 2165 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2166 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 2167 migrate->dst[migrate->npages] = 0; 2168 migrate->npages++; 2169 migrate->cpages++; 2170 } 2171 2172 return 0; 2173 } 2174 2175 static int migrate_vma_collect_skip(unsigned long start, 2176 unsigned long end, 2177 struct mm_walk *walk) 2178 { 2179 struct migrate_vma *migrate = walk->private; 2180 unsigned long addr; 2181 2182 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2183 migrate->dst[migrate->npages] = 0; 2184 migrate->src[migrate->npages++] = 0; 2185 } 2186 2187 return 0; 2188 } 2189 2190 static int migrate_vma_collect_pmd(pmd_t *pmdp, 2191 unsigned long start, 2192 unsigned long end, 2193 struct mm_walk *walk) 2194 { 2195 struct migrate_vma *migrate = walk->private; 2196 struct vm_area_struct *vma = walk->vma; 2197 struct mm_struct *mm = vma->vm_mm; 2198 unsigned long addr = start, unmapped = 0; 2199 spinlock_t *ptl; 2200 pte_t *ptep; 2201 2202 again: 2203 if (pmd_none(*pmdp)) 2204 return migrate_vma_collect_hole(start, end, walk); 2205 2206 if (pmd_trans_huge(*pmdp)) { 2207 struct page *page; 2208 2209 ptl = pmd_lock(mm, pmdp); 2210 if (unlikely(!pmd_trans_huge(*pmdp))) { 2211 spin_unlock(ptl); 2212 goto again; 2213 } 2214 2215 page = pmd_page(*pmdp); 2216 if (is_huge_zero_page(page)) { 2217 spin_unlock(ptl); 2218 split_huge_pmd(vma, pmdp, addr); 2219 if (pmd_trans_unstable(pmdp)) 2220 return migrate_vma_collect_skip(start, end, 2221 walk); 2222 } else { 2223 int ret; 2224 2225 get_page(page); 2226 spin_unlock(ptl); 2227 if (unlikely(!trylock_page(page))) 2228 return migrate_vma_collect_skip(start, end, 2229 walk); 2230 ret = split_huge_page(page); 2231 unlock_page(page); 2232 put_page(page); 2233 if (ret) 2234 return migrate_vma_collect_skip(start, end, 2235 walk); 2236 if (pmd_none(*pmdp)) 2237 return migrate_vma_collect_hole(start, end, 2238 walk); 2239 } 2240 } 2241 2242 if (unlikely(pmd_bad(*pmdp))) 2243 return migrate_vma_collect_skip(start, end, walk); 2244 2245 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 
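	/*
	 * The loop below inspects every pte in the range under the page
	 * table lock. Whenever it can take the page lock right away it also
	 * clears the pte and installs the migration entry immediately,
	 * tagging the src entry with MIGRATE_PFN_LOCKED so that
	 * migrate_vma_prepare() and migrate_vma_unmap() can skip that work
	 * later for this page.
	 */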
2246 arch_enter_lazy_mmu_mode(); 2247 2248 for (; addr < end; addr += PAGE_SIZE, ptep++) { 2249 unsigned long mpfn, pfn; 2250 struct page *page; 2251 swp_entry_t entry; 2252 pte_t pte; 2253 2254 pte = *ptep; 2255 pfn = pte_pfn(pte); 2256 2257 if (pte_none(pte)) { 2258 mpfn = MIGRATE_PFN_MIGRATE; 2259 migrate->cpages++; 2260 pfn = 0; 2261 goto next; 2262 } 2263 2264 if (!pte_present(pte)) { 2265 mpfn = pfn = 0; 2266 2267 /* 2268 * Only care about unaddressable device page special 2269 * page table entry. Other special swap entries are not 2270 * migratable, and we ignore regular swapped page. 2271 */ 2272 entry = pte_to_swp_entry(pte); 2273 if (!is_device_private_entry(entry)) 2274 goto next; 2275 2276 page = device_private_entry_to_page(entry); 2277 mpfn = migrate_pfn(page_to_pfn(page))| 2278 MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; 2279 if (is_write_device_private_entry(entry)) 2280 mpfn |= MIGRATE_PFN_WRITE; 2281 } else { 2282 if (is_zero_pfn(pfn)) { 2283 mpfn = MIGRATE_PFN_MIGRATE; 2284 migrate->cpages++; 2285 pfn = 0; 2286 goto next; 2287 } 2288 page = _vm_normal_page(migrate->vma, addr, pte, true); 2289 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2290 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2291 } 2292 2293 /* FIXME support THP */ 2294 if (!page || !page->mapping || PageTransCompound(page)) { 2295 mpfn = pfn = 0; 2296 goto next; 2297 } 2298 pfn = page_to_pfn(page); 2299 2300 /* 2301 * By getting a reference on the page we pin it and that blocks 2302 * any kind of migration. Side effect is that it "freezes" the 2303 * pte. 2304 * 2305 * We drop this reference after isolating the page from the lru 2306 * for non device page (device page are not on the lru and thus 2307 * can't be dropped from it). 2308 */ 2309 get_page(page); 2310 migrate->cpages++; 2311 2312 /* 2313 * Optimize for the common case where page is only mapped once 2314 * in one process. If we can lock the page, then we can safely 2315 * set up a special migration page table entry now. 2316 */ 2317 if (trylock_page(page)) { 2318 pte_t swp_pte; 2319 2320 mpfn |= MIGRATE_PFN_LOCKED; 2321 ptep_get_and_clear(mm, addr, ptep); 2322 2323 /* Setup special migration page table entry */ 2324 entry = make_migration_entry(page, mpfn & 2325 MIGRATE_PFN_WRITE); 2326 swp_pte = swp_entry_to_pte(entry); 2327 if (pte_soft_dirty(pte)) 2328 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2329 set_pte_at(mm, addr, ptep, swp_pte); 2330 2331 /* 2332 * This is like regular unmap: we remove the rmap and 2333 * drop page refcount. Page won't be freed, as we took 2334 * a reference just above. 2335 */ 2336 page_remove_rmap(page, false); 2337 put_page(page); 2338 2339 if (pte_present(pte)) 2340 unmapped++; 2341 } 2342 2343 next: 2344 migrate->dst[migrate->npages] = 0; 2345 migrate->src[migrate->npages++] = mpfn; 2346 } 2347 arch_leave_lazy_mmu_mode(); 2348 pte_unmap_unlock(ptep - 1, ptl); 2349 2350 /* Only flush the TLB if we actually modified any entries */ 2351 if (unmapped) 2352 flush_tlb_range(walk->vma, start, end); 2353 2354 return 0; 2355 } 2356 2357 /* 2358 * migrate_vma_collect() - collect pages over a range of virtual addresses 2359 * @migrate: migrate struct containing all migration information 2360 * 2361 * This will walk the CPU page table. For each virtual address backed by a 2362 * valid page, it updates the src array and takes a reference on the page, in 2363 * order to pin the page until we lock it and unmap it. 
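 *
 * A hedged example of the src encoding (the pfn value is made up purely for
 * illustration): a present, writable page at pfn 0x1234 whose lock was also
 * taken by migrate_vma_collect_pmd() is recorded as
 *
 *	migrate_pfn(0x1234) | MIGRATE_PFN_MIGRATE | MIGRATE_PFN_WRITE |
 *	MIGRATE_PFN_LOCKED
 *
 * while an empty pte is recorded as just MIGRATE_PFN_MIGRATE (so a brand new
 * page may be inserted for it later) and anything that cannot be migrated is
 * recorded as 0.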
2364 */ 2365 static void migrate_vma_collect(struct migrate_vma *migrate) 2366 { 2367 struct mm_walk mm_walk; 2368 2369 mm_walk.pmd_entry = migrate_vma_collect_pmd; 2370 mm_walk.pte_entry = NULL; 2371 mm_walk.pte_hole = migrate_vma_collect_hole; 2372 mm_walk.hugetlb_entry = NULL; 2373 mm_walk.test_walk = NULL; 2374 mm_walk.vma = migrate->vma; 2375 mm_walk.mm = migrate->vma->vm_mm; 2376 mm_walk.private = migrate; 2377 2378 mmu_notifier_invalidate_range_start(mm_walk.mm, 2379 migrate->start, 2380 migrate->end); 2381 walk_page_range(migrate->start, migrate->end, &mm_walk); 2382 mmu_notifier_invalidate_range_end(mm_walk.mm, 2383 migrate->start, 2384 migrate->end); 2385 2386 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2387 } 2388 2389 /* 2390 * migrate_vma_check_page() - check if page is pinned or not 2391 * @page: struct page to check 2392 * 2393 * Pinned pages cannot be migrated. This is the same test as in 2394 * migrate_page_move_mapping(), except that here we allow migration of a 2395 * ZONE_DEVICE page. 2396 */ 2397 static bool migrate_vma_check_page(struct page *page) 2398 { 2399 /* 2400 * One extra ref because the caller holds an extra reference, either from 2401 * isolate_lru_page() for a regular page, or migrate_vma_collect() for 2402 * a device page. 2403 */ 2404 int extra = 1; 2405 2406 /* 2407 * FIXME: support THP (transparent huge pages); they are a bit more complex 2408 * to check than regular pages, because they can be mapped with a pmd 2409 * or with a pte (split pte mapping). 2410 */ 2411 if (PageCompound(page)) 2412 return false; 2413 2414 /* Pages from ZONE_DEVICE have one extra reference */ 2415 if (is_zone_device_page(page)) { 2416 /* 2417 * Private pages can never be pinned as they have no valid pte and 2418 * GUP will fail for them. Yet if there is a pending migration, 2419 * a thread might try to wait on the pte migration entry and 2420 * will bump the page reference count. Sadly there is no way to 2421 * differentiate a regular pin from a migration wait. Hence, to 2422 * avoid two racing threads trying to migrate back to the CPU and entering 2423 * an infinite loop (one stopping the migration because the other is 2424 * waiting on the pte migration entry), we always return true here. 2425 * 2426 * FIXME: the proper solution is to rework migration_entry_wait() so 2427 * it does not need to take a reference on the page. 2428 */ 2429 if (is_device_private_page(page)) 2430 return true; 2431 2432 /* 2433 * Only allow device public pages to be migrated and account for 2434 * the extra reference count implied by ZONE_DEVICE pages. 2435 */ 2436 if (!is_device_public_page(page)) 2437 return false; 2438 extra++; 2439 } 2440 2441 /* For file-backed pages */ 2442 if (page_mapping(page)) 2443 extra += 1 + page_has_private(page); 2444 2445 if ((page_count(page) - extra) > page_mapcount(page)) 2446 return false; 2447 2448 return true; 2449 } 2450 2451 /* 2452 * migrate_vma_prepare() - lock pages and isolate them from the lru 2453 * @migrate: migrate struct containing all migration information 2454 * 2455 * This locks pages that have been collected by migrate_vma_collect(). Once each 2456 * page is locked it is isolated from the lru (for non-device pages). Finally, 2457 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be 2458 * migrated by concurrent kernel threads.
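 *
 * As a worked example of the pin check performed here (derived from
 * migrate_vma_check_page(), not an additional rule): for a clean anonymous
 * page mapped in a single process we expect
 * page_count() == page_mapcount() + 1 at this point, the +1 being the
 * reference this code still holds; anything above that is treated as a pin
 * and the page is restored instead of migrated.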
2459 */ 2460 static void migrate_vma_prepare(struct migrate_vma *migrate) 2461 { 2462 const unsigned long npages = migrate->npages; 2463 const unsigned long start = migrate->start; 2464 unsigned long addr, i, restore = 0; 2465 bool allow_drain = true; 2466 2467 lru_add_drain(); 2468 2469 for (i = 0; (i < npages) && migrate->cpages; i++) { 2470 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2471 bool remap = true; 2472 2473 if (!page) 2474 continue; 2475 2476 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { 2477 /* 2478 * Because we are migrating several pages there can be 2479 * a deadlock between two concurrent migrations where each 2480 * is waiting on the other's page lock. 2481 * 2482 * Make migrate_vma() a best-effort thing and back off 2483 * for any page we cannot lock right away. 2484 */ 2485 if (!trylock_page(page)) { 2486 migrate->src[i] = 0; 2487 migrate->cpages--; 2488 put_page(page); 2489 continue; 2490 } 2491 remap = false; 2492 migrate->src[i] |= MIGRATE_PFN_LOCKED; 2493 } 2494 2495 /* ZONE_DEVICE pages are not on the LRU */ 2496 if (!is_zone_device_page(page)) { 2497 if (!PageLRU(page) && allow_drain) { 2498 /* Drain the CPU's pagevec */ 2499 lru_add_drain_all(); 2500 allow_drain = false; 2501 } 2502 2503 if (isolate_lru_page(page)) { 2504 if (remap) { 2505 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2506 migrate->cpages--; 2507 restore++; 2508 } else { 2509 migrate->src[i] = 0; 2510 unlock_page(page); 2511 migrate->cpages--; 2512 put_page(page); 2513 } 2514 continue; 2515 } 2516 2517 /* Drop the reference we took in migrate_vma_collect() */ 2518 put_page(page); 2519 } 2520 2521 if (!migrate_vma_check_page(page)) { 2522 if (remap) { 2523 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2524 migrate->cpages--; 2525 restore++; 2526 2527 if (!is_zone_device_page(page)) { 2528 get_page(page); 2529 putback_lru_page(page); 2530 } 2531 } else { 2532 migrate->src[i] = 0; 2533 unlock_page(page); 2534 migrate->cpages--; 2535 2536 if (!is_zone_device_page(page)) 2537 putback_lru_page(page); 2538 else 2539 put_page(page); 2540 } 2541 } 2542 } 2543 2544 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { 2545 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2546 2547 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2548 continue; 2549 2550 remove_migration_pte(page, migrate->vma, addr, page); 2551 2552 migrate->src[i] = 0; 2553 unlock_page(page); 2554 put_page(page); 2555 restore--; 2556 } 2557 } 2558 2559 /* 2560 * migrate_vma_unmap() - replace page mapping with special migration pte entry 2561 * @migrate: migrate struct containing all migration information 2562 * 2563 * Replace page mapping (CPU page table pte) with a special migration pte entry 2564 * and check again if it has been pinned. Pinned pages are restored because we 2565 * cannot migrate them. 2566 * 2567 * This is the last step before we call the device driver callback to allocate 2568 * destination memory and copy the contents of the original page over to the new page.
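 *
 * Concretely, summarizing the code below rather than adding behaviour:
 * try_to_unmap() is called with TTU_MIGRATION | TTU_IGNORE_MLOCK |
 * TTU_IGNORE_ACCESS so each pte mapping the page is replaced by a special
 * migration entry; pages that remain mapped afterwards, or that fail the
 * pin check, have their ptes restored by remove_migration_ptes() and lose
 * their MIGRATE_PFN_MIGRATE flag.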
2569 */ 2570 static void migrate_vma_unmap(struct migrate_vma *migrate) 2571 { 2572 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 2573 const unsigned long npages = migrate->npages; 2574 const unsigned long start = migrate->start; 2575 unsigned long addr, i, restore = 0; 2576 2577 for (i = 0; i < npages; i++) { 2578 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2579 2580 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2581 continue; 2582 2583 if (page_mapped(page)) { 2584 try_to_unmap(page, flags); 2585 if (page_mapped(page)) 2586 goto restore; 2587 } 2588 2589 if (migrate_vma_check_page(page)) 2590 continue; 2591 2592 restore: 2593 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2594 migrate->cpages--; 2595 restore++; 2596 } 2597 2598 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { 2599 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2600 2601 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2602 continue; 2603 2604 remove_migration_ptes(page, page, false); 2605 2606 migrate->src[i] = 0; 2607 unlock_page(page); 2608 restore--; 2609 2610 if (is_zone_device_page(page)) 2611 put_page(page); 2612 else 2613 putback_lru_page(page); 2614 } 2615 } 2616 2617 static void migrate_vma_insert_page(struct migrate_vma *migrate, 2618 unsigned long addr, 2619 struct page *page, 2620 unsigned long *src, 2621 unsigned long *dst) 2622 { 2623 struct vm_area_struct *vma = migrate->vma; 2624 struct mm_struct *mm = vma->vm_mm; 2625 struct mem_cgroup *memcg; 2626 bool flush = false; 2627 spinlock_t *ptl; 2628 pte_t entry; 2629 pgd_t *pgdp; 2630 p4d_t *p4dp; 2631 pud_t *pudp; 2632 pmd_t *pmdp; 2633 pte_t *ptep; 2634 2635 /* Only allow populating anonymous memory */ 2636 if (!vma_is_anonymous(vma)) 2637 goto abort; 2638 2639 pgdp = pgd_offset(mm, addr); 2640 p4dp = p4d_alloc(mm, pgdp, addr); 2641 if (!p4dp) 2642 goto abort; 2643 pudp = pud_alloc(mm, p4dp, addr); 2644 if (!pudp) 2645 goto abort; 2646 pmdp = pmd_alloc(mm, pudp, addr); 2647 if (!pmdp) 2648 goto abort; 2649 2650 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 2651 goto abort; 2652 2653 /* 2654 * Use pte_alloc() instead of pte_alloc_map(). We can't run 2655 * pte_offset_map() on pmds where a huge pmd might be created 2656 * from a different thread. 2657 * 2658 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when 2659 * parallel threads are excluded by other means. 2660 * 2661 * Here we only have down_read(mmap_sem). 2662 */ 2663 if (pte_alloc(mm, pmdp, addr)) 2664 goto abort; 2665 2666 /* See the comment in pte_alloc_one_map() */ 2667 if (unlikely(pmd_trans_unstable(pmdp))) 2668 goto abort; 2669 2670 if (unlikely(anon_vma_prepare(vma))) 2671 goto abort; 2672 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) 2673 goto abort; 2674 2675 /* 2676 * The memory barrier inside __SetPageUptodate makes sure that 2677 * preceding stores to the page contents become visible before 2678 * the set_pte_at() write. 
2679 */ 2680 __SetPageUptodate(page); 2681 2682 if (is_zone_device_page(page)) { 2683 if (is_device_private_page(page)) { 2684 swp_entry_t swp_entry; 2685 2686 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); 2687 entry = swp_entry_to_pte(swp_entry); 2688 } else if (is_device_public_page(page)) { 2689 entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); 2690 if (vma->vm_flags & VM_WRITE) 2691 entry = pte_mkwrite(pte_mkdirty(entry)); 2692 entry = pte_mkdevmap(entry); 2693 } 2694 } else { 2695 entry = mk_pte(page, vma->vm_page_prot); 2696 if (vma->vm_flags & VM_WRITE) 2697 entry = pte_mkwrite(pte_mkdirty(entry)); 2698 } 2699 2700 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2701 2702 if (pte_present(*ptep)) { 2703 unsigned long pfn = pte_pfn(*ptep); 2704 2705 if (!is_zero_pfn(pfn)) { 2706 pte_unmap_unlock(ptep, ptl); 2707 mem_cgroup_cancel_charge(page, memcg, false); 2708 goto abort; 2709 } 2710 flush = true; 2711 } else if (!pte_none(*ptep)) { 2712 pte_unmap_unlock(ptep, ptl); 2713 mem_cgroup_cancel_charge(page, memcg, false); 2714 goto abort; 2715 } 2716 2717 /* 2718 * Check for userfaultfd but do not deliver the fault. Instead, 2719 * just back off. 2720 */ 2721 if (userfaultfd_missing(vma)) { 2722 pte_unmap_unlock(ptep, ptl); 2723 mem_cgroup_cancel_charge(page, memcg, false); 2724 goto abort; 2725 } 2726 2727 inc_mm_counter(mm, MM_ANONPAGES); 2728 page_add_new_anon_rmap(page, vma, addr, false); 2729 mem_cgroup_commit_charge(page, memcg, false, false); 2730 if (!is_zone_device_page(page)) 2731 lru_cache_add_active_or_unevictable(page, vma); 2732 get_page(page); 2733 2734 if (flush) { 2735 flush_cache_page(vma, addr, pte_pfn(*ptep)); 2736 ptep_clear_flush_notify(vma, addr, ptep); 2737 set_pte_at_notify(mm, addr, ptep, entry); 2738 update_mmu_cache(vma, addr, ptep); 2739 } else { 2740 /* No need to invalidate - it was non-present before */ 2741 set_pte_at(mm, addr, ptep, entry); 2742 update_mmu_cache(vma, addr, ptep); 2743 } 2744 2745 pte_unmap_unlock(ptep, ptl); 2746 *src = MIGRATE_PFN_MIGRATE; 2747 return; 2748 2749 abort: 2750 *src &= ~MIGRATE_PFN_MIGRATE; 2751 } 2752 2753 /* 2754 * migrate_vma_pages() - migrate meta-data from src page to dst page 2755 * @migrate: migrate struct containing all migration information 2756 * 2757 * This migrates struct page meta-data from the source struct page to the destination 2758 * struct page. This effectively finishes the migration from the source page to the 2759 * destination page.
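 *
 * Note that the data copy itself is not done here: the driver's
 * alloc_and_copy() callback has already copied the page contents, which is
 * why migrate_page() below is called with the MIGRATE_SYNC_NO_COPY mode.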
2760 */ 2761 static void migrate_vma_pages(struct migrate_vma *migrate) 2762 { 2763 const unsigned long npages = migrate->npages; 2764 const unsigned long start = migrate->start; 2765 struct vm_area_struct *vma = migrate->vma; 2766 struct mm_struct *mm = vma->vm_mm; 2767 unsigned long addr, i, mmu_start; 2768 bool notified = false; 2769 2770 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2771 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2772 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2773 struct address_space *mapping; 2774 int r; 2775 2776 if (!newpage) { 2777 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2778 continue; 2779 } 2780 2781 if (!page) { 2782 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { 2783 continue; 2784 } 2785 if (!notified) { 2786 mmu_start = addr; 2787 notified = true; 2788 mmu_notifier_invalidate_range_start(mm, 2789 mmu_start, 2790 migrate->end); 2791 } 2792 migrate_vma_insert_page(migrate, addr, newpage, 2793 &migrate->src[i], 2794 &migrate->dst[i]); 2795 continue; 2796 } 2797 2798 mapping = page_mapping(page); 2799 2800 if (is_zone_device_page(newpage)) { 2801 if (is_device_private_page(newpage)) { 2802 /* 2803 * For now only support private anonymous when 2804 * migrating to un-addressable device memory. 2805 */ 2806 if (mapping) { 2807 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2808 continue; 2809 } 2810 } else if (!is_device_public_page(newpage)) { 2811 /* 2812 * Other types of ZONE_DEVICE page are not 2813 * supported. 2814 */ 2815 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2816 continue; 2817 } 2818 } 2819 2820 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 2821 if (r != MIGRATEPAGE_SUCCESS) 2822 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2823 } 2824 2825 /* 2826 * No need to double call mmu_notifier->invalidate_range() callback as 2827 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 2828 * did already call it. 2829 */ 2830 if (notified) 2831 mmu_notifier_invalidate_range_only_end(mm, mmu_start, 2832 migrate->end); 2833 } 2834 2835 /* 2836 * migrate_vma_finalize() - restore CPU page table entry 2837 * @migrate: migrate struct containing all migration information 2838 * 2839 * This replaces the special migration pte entry with either a mapping to the 2840 * new page if migration was successful for that page, or to the original page 2841 * otherwise. 2842 * 2843 * This also unlocks the pages and puts them back on the lru, or drops the extra 2844 * refcount, for device pages. 
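 *
 * After this step each entry ends up in one of two states: on success the
 * CPU page table maps the new (dst) page and the source page is released;
 * on failure the original source page is mapped again and any destination
 * page the driver allocated is unlocked and released with put_page().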
2845 */ 2846 static void migrate_vma_finalize(struct migrate_vma *migrate) 2847 { 2848 const unsigned long npages = migrate->npages; 2849 unsigned long i; 2850 2851 for (i = 0; i < npages; i++) { 2852 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2853 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2854 2855 if (!page) { 2856 if (newpage) { 2857 unlock_page(newpage); 2858 put_page(newpage); 2859 } 2860 continue; 2861 } 2862 2863 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 2864 if (newpage) { 2865 unlock_page(newpage); 2866 put_page(newpage); 2867 } 2868 newpage = page; 2869 } 2870 2871 remove_migration_ptes(page, newpage, false); 2872 unlock_page(page); 2873 migrate->cpages--; 2874 2875 if (is_zone_device_page(page)) 2876 put_page(page); 2877 else 2878 putback_lru_page(page); 2879 2880 if (newpage != page) { 2881 unlock_page(newpage); 2882 if (is_zone_device_page(newpage)) 2883 put_page(newpage); 2884 else 2885 putback_lru_page(newpage); 2886 } 2887 } 2888 } 2889 2890 /* 2891 * migrate_vma() - migrate a range of memory inside vma 2892 * 2893 * @ops: migration callback for allocating destination memory and copying 2894 * @vma: virtual memory area containing the range to be migrated 2895 * @start: start address of the range to migrate (inclusive) 2896 * @end: end address of the range to migrate (exclusive) 2897 * @src: array of unsigned long containing the source pfns 2898 * @dst: array of unsigned long containing the destination pfns 2899 * @private: pointer passed back to each of the callbacks 2900 * Returns: 0 on success, error code otherwise 2901 * 2902 * This function tries to migrate a range of virtual addresses, using 2903 * callbacks to allocate and copy memory from source to destination. First it 2904 * collects all the pages backing each virtual address in the range, saving this 2905 * inside the src array. Then it locks those pages and unmaps them. Once the pages 2906 * are locked and unmapped, it checks whether each page is pinned or not. Pages 2907 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) 2908 * in the corresponding src array entry. It then restores any pages that are 2909 * pinned, by remapping and unlocking those pages. 2910 * 2911 * At this point it calls the alloc_and_copy() callback. For documentation on 2912 * what is expected from that callback, see the struct migrate_vma_ops comments in 2913 * include/linux/migrate.h. 2914 * 2915 * After the alloc_and_copy() callback, this function goes over each entry in 2916 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags 2917 * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID flag set, 2918 * then the function tries to migrate struct page information from the source 2919 * struct page to the destination struct page. If it fails to migrate the struct 2920 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src 2921 * array. 2922 * 2923 * At this point all successfully migrated pages have an entry in the src 2924 * array with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags set and the 2925 * corresponding dst array entry has the MIGRATE_PFN_VALID flag set. 2926 * 2927 * It then calls the finalize_and_map() callback. See the comments for "struct 2928 * migrate_vma_ops", in include/linux/migrate.h for details about 2929 * finalize_and_map() behavior.
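 *
 * A minimal usage sketch follows. It is illustrative only: the callback and
 * variable names (dummy_alloc_and_copy, dummy_finalize_and_map, private)
 * are invented here, while migrate_vma(), struct migrate_vma_ops and the
 * expectation that mmap_sem is held for read come from this file and from
 * include/linux/migrate.h:
 *
 *	static const struct migrate_vma_ops dummy_migrate_ops = {
 *		.alloc_and_copy		= dummy_alloc_and_copy,
 *		.finalize_and_map	= dummy_finalize_and_map,
 *	};
 *
 *	unsigned long src[64];
 *	unsigned long dst[64];
 *	int ret;
 *
 *	ret = migrate_vma(&dummy_migrate_ops, vma, start,
 *			  start + 64 * PAGE_SIZE, src, dst, private);
 *
 * called with the mmap_sem held for read (the caller normally already holds
 * it in order to have looked up @vma), where [start, start + 64 * PAGE_SIZE)
 * must lie inside @vma and both arrays provide one unsigned long entry per
 * page, as required below.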
2930 * 2931 * After the finalize_and_map() callback, for successfully migrated pages, this 2932 * function updates the CPU page table to point to the new pages; otherwise it 2933 * restores the CPU page table to point to the original source pages. 2934 * 2935 * The function returns 0 after the above steps, even if no pages were migrated 2936 * (it only returns an error if any of the arguments are invalid). 2937 * 2938 * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT 2939 * unsigned long entries. 2940 */ 2941 int migrate_vma(const struct migrate_vma_ops *ops, 2942 struct vm_area_struct *vma, 2943 unsigned long start, 2944 unsigned long end, 2945 unsigned long *src, 2946 unsigned long *dst, 2947 void *private) 2948 { 2949 struct migrate_vma migrate; 2950 2951 /* Sanity check the arguments */ 2952 start &= PAGE_MASK; 2953 end &= PAGE_MASK; 2954 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) 2955 return -EINVAL; 2956 if (start < vma->vm_start || start >= vma->vm_end) 2957 return -EINVAL; 2958 if (end <= vma->vm_start || end > vma->vm_end) 2959 return -EINVAL; 2960 if (!ops || !src || !dst || start >= end) 2961 return -EINVAL; 2962 2963 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); 2964 migrate.src = src; 2965 migrate.dst = dst; 2966 migrate.start = start; 2967 migrate.npages = 0; 2968 migrate.cpages = 0; 2969 migrate.end = end; 2970 migrate.vma = vma; 2971 2972 /* Collect and try to unmap source pages */ 2973 migrate_vma_collect(&migrate); 2974 if (!migrate.cpages) 2975 return 0; 2976 2977 /* Lock and isolate pages */ 2978 migrate_vma_prepare(&migrate); 2979 if (!migrate.cpages) 2980 return 0; 2981 2982 /* Unmap pages */ 2983 migrate_vma_unmap(&migrate); 2984 if (!migrate.cpages) 2985 return 0; 2986 2987 /* 2988 * At this point pages are locked and unmapped, and thus they have 2989 * stable content and can safely be copied to destination memory that 2990 * is allocated by the callback. 2991 * 2992 * Note that migration can fail in migrate_vma_pages() for each 2993 * individual page. 2994 */ 2995 ops->alloc_and_copy(vma, src, dst, start, end, private); 2996 2997 /* This does the real migration of struct page */ 2998 migrate_vma_pages(&migrate); 2999 3000 ops->finalize_and_map(vma, src, dst, start, end, private); 3001 3002 /* Unlock and remap pages */ 3003 migrate_vma_finalize(&migrate); 3004 3005 return 0; 3006 } 3007 EXPORT_SYMBOL(migrate_vma); 3008 #endif /* defined(CONFIG_MIGRATE_VMA_HELPER) */ 3009