1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Memory Migration functionality - linux/mm/migrate.c 4 * 5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 6 * 7 * Page migration was first developed in the context of the memory hotplug 8 * project. The main authors of the migration code are: 9 * 10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 11 * Hirokazu Takahashi <taka@valinux.co.jp> 12 * Dave Hansen <haveblue@us.ibm.com> 13 * Christoph Lameter 14 */ 15 16 #include <linux/migrate.h> 17 #include <linux/export.h> 18 #include <linux/swap.h> 19 #include <linux/swapops.h> 20 #include <linux/pagemap.h> 21 #include <linux/buffer_head.h> 22 #include <linux/mm_inline.h> 23 #include <linux/nsproxy.h> 24 #include <linux/pagevec.h> 25 #include <linux/ksm.h> 26 #include <linux/rmap.h> 27 #include <linux/topology.h> 28 #include <linux/cpu.h> 29 #include <linux/cpuset.h> 30 #include <linux/writeback.h> 31 #include <linux/mempolicy.h> 32 #include <linux/vmalloc.h> 33 #include <linux/security.h> 34 #include <linux/backing-dev.h> 35 #include <linux/compaction.h> 36 #include <linux/syscalls.h> 37 #include <linux/compat.h> 38 #include <linux/hugetlb.h> 39 #include <linux/hugetlb_cgroup.h> 40 #include <linux/gfp.h> 41 #include <linux/pfn_t.h> 42 #include <linux/memremap.h> 43 #include <linux/userfaultfd_k.h> 44 #include <linux/balloon_compaction.h> 45 #include <linux/mmu_notifier.h> 46 #include <linux/page_idle.h> 47 #include <linux/page_owner.h> 48 #include <linux/sched/mm.h> 49 #include <linux/ptrace.h> 50 51 #include <asm/tlbflush.h> 52 53 #define CREATE_TRACE_POINTS 54 #include <trace/events/migrate.h> 55 56 #include "internal.h" 57 58 /* 59 * migrate_prep() needs to be called before we start compiling a list of pages 60 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is 61 * undesirable, use migrate_prep_local() 62 */ 63 int migrate_prep(void) 64 { 65 /* 66 * Clear the LRU lists so pages can be isolated. 67 * Note that pages may be moved off the LRU after we have 68 * drained them. Those pages will fail to migrate like other 69 * pages that may be busy. 70 */ 71 lru_add_drain_all(); 72 73 return 0; 74 } 75 76 /* Do the necessary work of migrate_prep but not if it involves other CPUs */ 77 int migrate_prep_local(void) 78 { 79 lru_add_drain(); 80 81 return 0; 82 } 83 84 int isolate_movable_page(struct page *page, isolate_mode_t mode) 85 { 86 struct address_space *mapping; 87 88 /* 89 * Avoid burning cycles with pages that are yet under __free_pages(), 90 * or just got freed under us. 91 * 92 * In case we 'win' a race for a movable page being freed under us and 93 * raise its refcount preventing __free_pages() from doing its job 94 * the put_page() at the end of this block will take care of 95 * release this page, thus avoiding a nasty leakage. 96 */ 97 if (unlikely(!get_page_unless_zero(page))) 98 goto out; 99 100 /* 101 * Check PageMovable before holding a PG_lock because page's owner 102 * assumes anybody doesn't touch PG_lock of newly allocated page 103 * so unconditionally grapping the lock ruins page's owner side. 104 */ 105 if (unlikely(!__PageMovable(page))) 106 goto out_putpage; 107 /* 108 * As movable pages are not isolated from LRU lists, concurrent 109 * compaction threads can race against page migration functions 110 * as well as race against the releasing a page. 
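 *
 * (Aside, a hedged illustration rather than code from this file: a
 * non-LRU movable page is one whose driver published migration hooks
 * and tagged the page with __SetPageMovable(); the drv_* names below
 * are made up, only the three a_ops callbacks are real:
 *
 *	static const struct address_space_operations drv_migrate_aops = {
 *		.isolate_page	= drv_isolate_page,
 *		.migratepage	= drv_migratepage,
 *		.putback_page	= drv_putback_page,
 *	};
 *
 *	__SetPageMovable(page, mapping);	// mapping->a_ops = &drv_migrate_aops
 *
 * which is what the mapping->a_ops->isolate_page() call further down
 * resolves to.)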
111 * 112 * In order to avoid having an already isolated movable page 113 * being (wrongly) re-isolated while it is under migration, 114 * or to avoid attempting to isolate pages being released, 115 * lets be sure we have the page lock 116 * before proceeding with the movable page isolation steps. 117 */ 118 if (unlikely(!trylock_page(page))) 119 goto out_putpage; 120 121 if (!PageMovable(page) || PageIsolated(page)) 122 goto out_no_isolated; 123 124 mapping = page_mapping(page); 125 VM_BUG_ON_PAGE(!mapping, page); 126 127 if (!mapping->a_ops->isolate_page(page, mode)) 128 goto out_no_isolated; 129 130 /* Driver shouldn't use PG_isolated bit of page->flags */ 131 WARN_ON_ONCE(PageIsolated(page)); 132 __SetPageIsolated(page); 133 unlock_page(page); 134 135 return 0; 136 137 out_no_isolated: 138 unlock_page(page); 139 out_putpage: 140 put_page(page); 141 out: 142 return -EBUSY; 143 } 144 145 /* It should be called on page which is PG_movable */ 146 void putback_movable_page(struct page *page) 147 { 148 struct address_space *mapping; 149 150 VM_BUG_ON_PAGE(!PageLocked(page), page); 151 VM_BUG_ON_PAGE(!PageMovable(page), page); 152 VM_BUG_ON_PAGE(!PageIsolated(page), page); 153 154 mapping = page_mapping(page); 155 mapping->a_ops->putback_page(page); 156 __ClearPageIsolated(page); 157 } 158 159 /* 160 * Put previously isolated pages back onto the appropriate lists 161 * from where they were once taken off for compaction/migration. 162 * 163 * This function shall be used whenever the isolated pageset has been 164 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() 165 * and isolate_huge_page(). 166 */ 167 void putback_movable_pages(struct list_head *l) 168 { 169 struct page *page; 170 struct page *page2; 171 172 list_for_each_entry_safe(page, page2, l, lru) { 173 if (unlikely(PageHuge(page))) { 174 putback_active_hugepage(page); 175 continue; 176 } 177 list_del(&page->lru); 178 /* 179 * We isolated non-lru movable page so here we can use 180 * __PageMovable because LRU page's mapping cannot have 181 * PAGE_MAPPING_MOVABLE. 
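 *
 * (Hedged aside on the encoding this relies on: the low bits of
 * page->mapping serve as a type tag, so the two helpers boil down to
 * roughly
 *
 *	// __PageMovable(): tag says "movable", owner may already be gone
 *	((unsigned long)page->mapping & PAGE_MAPPING_FLAGS)
 *						== PAGE_MAPPING_MOVABLE
 *	// PageMovable(): additionally checks, under the page lock, that
 *	// a live mapping with a_ops->isolate_page is still attached
 *
 * which is why the lock_page()/PageMovable() recheck below is needed
 * before calling back into the driver.)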
182 */ 183 if (unlikely(__PageMovable(page))) { 184 VM_BUG_ON_PAGE(!PageIsolated(page), page); 185 lock_page(page); 186 if (PageMovable(page)) 187 putback_movable_page(page); 188 else 189 __ClearPageIsolated(page); 190 unlock_page(page); 191 put_page(page); 192 } else { 193 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 194 page_is_file_cache(page), -hpage_nr_pages(page)); 195 putback_lru_page(page); 196 } 197 } 198 } 199 200 /* 201 * Restore a potential migration pte to a working pte entry 202 */ 203 static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 204 unsigned long addr, void *old) 205 { 206 struct page_vma_mapped_walk pvmw = { 207 .page = old, 208 .vma = vma, 209 .address = addr, 210 .flags = PVMW_SYNC | PVMW_MIGRATION, 211 }; 212 struct page *new; 213 pte_t pte; 214 swp_entry_t entry; 215 216 VM_BUG_ON_PAGE(PageTail(page), page); 217 while (page_vma_mapped_walk(&pvmw)) { 218 if (PageKsm(page)) 219 new = page; 220 else 221 new = page - pvmw.page->index + 222 linear_page_index(vma, pvmw.address); 223 224 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 225 /* PMD-mapped THP migration entry */ 226 if (!pvmw.pte) { 227 VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 228 remove_migration_pmd(&pvmw, new); 229 continue; 230 } 231 #endif 232 233 get_page(new); 234 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); 235 if (pte_swp_soft_dirty(*pvmw.pte)) 236 pte = pte_mksoft_dirty(pte); 237 238 /* 239 * Recheck VMA as permissions can change since migration started 240 */ 241 entry = pte_to_swp_entry(*pvmw.pte); 242 if (is_write_migration_entry(entry)) 243 pte = maybe_mkwrite(pte, vma); 244 245 if (unlikely(is_zone_device_page(new))) { 246 if (is_device_private_page(new)) { 247 entry = make_device_private_entry(new, pte_write(pte)); 248 pte = swp_entry_to_pte(entry); 249 } else if (is_device_public_page(new)) { 250 pte = pte_mkdevmap(pte); 251 flush_dcache_page(new); 252 } 253 } else 254 flush_dcache_page(new); 255 256 #ifdef CONFIG_HUGETLB_PAGE 257 if (PageHuge(new)) { 258 pte = pte_mkhuge(pte); 259 pte = arch_make_huge_pte(pte, vma, new, 0); 260 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 261 if (PageAnon(new)) 262 hugepage_add_anon_rmap(new, vma, pvmw.address); 263 else 264 page_dup_rmap(new, true); 265 } else 266 #endif 267 { 268 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 269 270 if (PageAnon(new)) 271 page_add_anon_rmap(new, vma, pvmw.address, false); 272 else 273 page_add_file_rmap(new, false); 274 } 275 if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 276 mlock_vma_page(new); 277 278 /* No need to invalidate - it was non-present before */ 279 update_mmu_cache(vma, pvmw.address, pvmw.pte); 280 } 281 282 return true; 283 } 284 285 /* 286 * Get rid of all migration entries and replace them by 287 * references to the indicated page. 288 */ 289 void remove_migration_ptes(struct page *old, struct page *new, bool locked) 290 { 291 struct rmap_walk_control rwc = { 292 .rmap_one = remove_migration_pte, 293 .arg = old, 294 }; 295 296 if (locked) 297 rmap_walk_locked(new, &rwc); 298 else 299 rmap_walk(new, &rwc); 300 } 301 302 /* 303 * Something used the pte of a page under migration. We need to 304 * get to the page and wait until migration is finished. 305 * When we return from this function the fault will be retried. 
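 *
 * (Sketch of the typical caller, condensed and hedged rather than
 * quoted: the fault path spots the special swap entry and parks here
 * until the migration entry has been removed,
 *
 *	entry = pte_to_swp_entry(vmf->orig_pte);
 *	if (unlikely(non_swap_entry(entry))) {
 *		if (is_migration_entry(entry))
 *			migration_entry_wait(vma->vm_mm, vmf->pmd,
 *					     vmf->address);
 *		...
 *	}
 *
 * so by the time the fault is retried the pte normally points at the
 * new page installed by remove_migration_pte().)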
306 */ 307 void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 308 spinlock_t *ptl) 309 { 310 pte_t pte; 311 swp_entry_t entry; 312 struct page *page; 313 314 spin_lock(ptl); 315 pte = *ptep; 316 if (!is_swap_pte(pte)) 317 goto out; 318 319 entry = pte_to_swp_entry(pte); 320 if (!is_migration_entry(entry)) 321 goto out; 322 323 page = migration_entry_to_page(entry); 324 325 /* 326 * Once radix-tree replacement of page migration started, page_count 327 * *must* be zero. And, we don't want to call wait_on_page_locked() 328 * against a page without get_page(). 329 * So, we use get_page_unless_zero(), here. Even failed, page fault 330 * will occur again. 331 */ 332 if (!get_page_unless_zero(page)) 333 goto out; 334 pte_unmap_unlock(ptep, ptl); 335 wait_on_page_locked(page); 336 put_page(page); 337 return; 338 out: 339 pte_unmap_unlock(ptep, ptl); 340 } 341 342 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 343 unsigned long address) 344 { 345 spinlock_t *ptl = pte_lockptr(mm, pmd); 346 pte_t *ptep = pte_offset_map(pmd, address); 347 __migration_entry_wait(mm, ptep, ptl); 348 } 349 350 void migration_entry_wait_huge(struct vm_area_struct *vma, 351 struct mm_struct *mm, pte_t *pte) 352 { 353 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); 354 __migration_entry_wait(mm, pte, ptl); 355 } 356 357 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 358 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) 359 { 360 spinlock_t *ptl; 361 struct page *page; 362 363 ptl = pmd_lock(mm, pmd); 364 if (!is_pmd_migration_entry(*pmd)) 365 goto unlock; 366 page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); 367 if (!get_page_unless_zero(page)) 368 goto unlock; 369 spin_unlock(ptl); 370 wait_on_page_locked(page); 371 put_page(page); 372 return; 373 unlock: 374 spin_unlock(ptl); 375 } 376 #endif 377 378 #ifdef CONFIG_BLOCK 379 /* Returns true if all buffers are successfully locked */ 380 static bool buffer_migrate_lock_buffers(struct buffer_head *head, 381 enum migrate_mode mode) 382 { 383 struct buffer_head *bh = head; 384 385 /* Simple case, sync compaction */ 386 if (mode != MIGRATE_ASYNC) { 387 do { 388 get_bh(bh); 389 lock_buffer(bh); 390 bh = bh->b_this_page; 391 392 } while (bh != head); 393 394 return true; 395 } 396 397 /* async case, we cannot block on lock_buffer so use trylock_buffer */ 398 do { 399 get_bh(bh); 400 if (!trylock_buffer(bh)) { 401 /* 402 * We failed to lock the buffer and cannot stall in 403 * async migration. Release the taken locks 404 */ 405 struct buffer_head *failed_bh = bh; 406 put_bh(failed_bh); 407 bh = head; 408 while (bh != failed_bh) { 409 unlock_buffer(bh); 410 put_bh(bh); 411 bh = bh->b_this_page; 412 } 413 return false; 414 } 415 416 bh = bh->b_this_page; 417 } while (bh != head); 418 return true; 419 } 420 #else 421 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, 422 enum migrate_mode mode) 423 { 424 return true; 425 } 426 #endif /* CONFIG_BLOCK */ 427 428 /* 429 * Replace the page in the mapping. 430 * 431 * The number of remaining references must be: 432 * 1 for anonymous pages without a mapping 433 * 2 for pages with a mapping 434 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
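 *
 * (Worked example, hedged, matching the arithmetic in the body below
 * for an order-0 page and extra_count == 0:
 *
 *	expected_count = 1 + extra_count	// base reference
 *		       + hpage_nr_pages(page)	// page cache reference
 *		       + page_has_private(page);// buffer heads, if any
 *
 * giving 2 for a plain mapped-file page and 3 once PagePrivate is set,
 * which is exactly the table above; ZONE_DEVICE private/public pages
 * add one more.)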
435 */ 436 int migrate_page_move_mapping(struct address_space *mapping, 437 struct page *newpage, struct page *page, 438 struct buffer_head *head, enum migrate_mode mode, 439 int extra_count) 440 { 441 struct zone *oldzone, *newzone; 442 int dirty; 443 int expected_count = 1 + extra_count; 444 void **pslot; 445 446 /* 447 * Device public or private pages have an extra refcount as they are 448 * ZONE_DEVICE pages. 449 */ 450 expected_count += is_device_private_page(page); 451 expected_count += is_device_public_page(page); 452 453 if (!mapping) { 454 /* Anonymous page without mapping */ 455 if (page_count(page) != expected_count) 456 return -EAGAIN; 457 458 /* No turning back from here */ 459 newpage->index = page->index; 460 newpage->mapping = page->mapping; 461 if (PageSwapBacked(page)) 462 __SetPageSwapBacked(newpage); 463 464 return MIGRATEPAGE_SUCCESS; 465 } 466 467 oldzone = page_zone(page); 468 newzone = page_zone(newpage); 469 470 xa_lock_irq(&mapping->i_pages); 471 472 pslot = radix_tree_lookup_slot(&mapping->i_pages, 473 page_index(page)); 474 475 expected_count += hpage_nr_pages(page) + page_has_private(page); 476 if (page_count(page) != expected_count || 477 radix_tree_deref_slot_protected(pslot, 478 &mapping->i_pages.xa_lock) != page) { 479 xa_unlock_irq(&mapping->i_pages); 480 return -EAGAIN; 481 } 482 483 if (!page_ref_freeze(page, expected_count)) { 484 xa_unlock_irq(&mapping->i_pages); 485 return -EAGAIN; 486 } 487 488 /* 489 * In the async migration case of moving a page with buffers, lock the 490 * buffers using trylock before the mapping is moved. If the mapping 491 * was moved, we later failed to lock the buffers and could not move 492 * the mapping back due to an elevated page count, we would have to 493 * block waiting on other references to be dropped. 494 */ 495 if (mode == MIGRATE_ASYNC && head && 496 !buffer_migrate_lock_buffers(head, mode)) { 497 page_ref_unfreeze(page, expected_count); 498 xa_unlock_irq(&mapping->i_pages); 499 return -EAGAIN; 500 } 501 502 /* 503 * Now we know that no one else is looking at the page: 504 * no turning back from here. 505 */ 506 newpage->index = page->index; 507 newpage->mapping = page->mapping; 508 page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ 509 if (PageSwapBacked(page)) { 510 __SetPageSwapBacked(newpage); 511 if (PageSwapCache(page)) { 512 SetPageSwapCache(newpage); 513 set_page_private(newpage, page_private(page)); 514 } 515 } else { 516 VM_BUG_ON_PAGE(PageSwapCache(page), page); 517 } 518 519 /* Move dirty while page refs frozen and newpage not yet exposed */ 520 dirty = PageDirty(page); 521 if (dirty) { 522 ClearPageDirty(page); 523 SetPageDirty(newpage); 524 } 525 526 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 527 if (PageTransHuge(page)) { 528 int i; 529 int index = page_index(page); 530 531 for (i = 1; i < HPAGE_PMD_NR; i++) { 532 pslot = radix_tree_lookup_slot(&mapping->i_pages, 533 index + i); 534 radix_tree_replace_slot(&mapping->i_pages, pslot, 535 newpage + i); 536 } 537 } 538 539 /* 540 * Drop cache reference from old page by unfreezing 541 * to one less reference. 542 * We know this isn't the last reference. 543 */ 544 page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); 545 546 xa_unlock(&mapping->i_pages); 547 /* Leave irq disabled to prevent preemption while updating stats */ 548 549 /* 550 * If moved to a different zone then also account 551 * the page for that zone. 
Other VM counters will be 552 * taken care of when we establish references to the 553 * new page and drop references to the old page. 554 * 555 * Note that anonymous pages are accounted for 556 * via NR_FILE_PAGES and NR_ANON_MAPPED if they 557 * are mapped to swap space. 558 */ 559 if (newzone != oldzone) { 560 __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); 561 __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); 562 if (PageSwapBacked(page) && !PageSwapCache(page)) { 563 __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); 564 __inc_node_state(newzone->zone_pgdat, NR_SHMEM); 565 } 566 if (dirty && mapping_cap_account_dirty(mapping)) { 567 __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); 568 __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); 569 __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); 570 __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); 571 } 572 } 573 local_irq_enable(); 574 575 return MIGRATEPAGE_SUCCESS; 576 } 577 EXPORT_SYMBOL(migrate_page_move_mapping); 578 579 /* 580 * The expected number of remaining references is the same as that 581 * of migrate_page_move_mapping(). 582 */ 583 int migrate_huge_page_move_mapping(struct address_space *mapping, 584 struct page *newpage, struct page *page) 585 { 586 int expected_count; 587 void **pslot; 588 589 xa_lock_irq(&mapping->i_pages); 590 591 pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); 592 593 expected_count = 2 + page_has_private(page); 594 if (page_count(page) != expected_count || 595 radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { 596 xa_unlock_irq(&mapping->i_pages); 597 return -EAGAIN; 598 } 599 600 if (!page_ref_freeze(page, expected_count)) { 601 xa_unlock_irq(&mapping->i_pages); 602 return -EAGAIN; 603 } 604 605 newpage->index = page->index; 606 newpage->mapping = page->mapping; 607 608 get_page(newpage); 609 610 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 611 612 page_ref_unfreeze(page, expected_count - 1); 613 614 xa_unlock_irq(&mapping->i_pages); 615 616 return MIGRATEPAGE_SUCCESS; 617 } 618 619 /* 620 * Gigantic pages are so large that we do not guarantee that page++ pointer 621 * arithmetic will work across the entire page. We need something more 622 * specialized. 
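 *
 * (Hedged note on why: with sparse memory models the struct page array
 * is only guaranteed virtually contiguous within a section, so the
 * copy loop below advances via mem_map_next(), which is roughly
 *
 *	if ((i & (MAX_ORDER_NR_PAGES - 1)) == 0)
 *		dst = pfn_to_page(page_to_pfn(dst_base) + i);	// recompute
 *	else
 *		dst++;				// safe within the block
 *
 * instead of trusting dst++ across the whole gigantic page.)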
623 */ 624 static void __copy_gigantic_page(struct page *dst, struct page *src, 625 int nr_pages) 626 { 627 int i; 628 struct page *dst_base = dst; 629 struct page *src_base = src; 630 631 for (i = 0; i < nr_pages; ) { 632 cond_resched(); 633 copy_highpage(dst, src); 634 635 i++; 636 dst = mem_map_next(dst, dst_base, i); 637 src = mem_map_next(src, src_base, i); 638 } 639 } 640 641 static void copy_huge_page(struct page *dst, struct page *src) 642 { 643 int i; 644 int nr_pages; 645 646 if (PageHuge(src)) { 647 /* hugetlbfs page */ 648 struct hstate *h = page_hstate(src); 649 nr_pages = pages_per_huge_page(h); 650 651 if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { 652 __copy_gigantic_page(dst, src, nr_pages); 653 return; 654 } 655 } else { 656 /* thp page */ 657 BUG_ON(!PageTransHuge(src)); 658 nr_pages = hpage_nr_pages(src); 659 } 660 661 for (i = 0; i < nr_pages; i++) { 662 cond_resched(); 663 copy_highpage(dst + i, src + i); 664 } 665 } 666 667 /* 668 * Copy the page to its new location 669 */ 670 void migrate_page_states(struct page *newpage, struct page *page) 671 { 672 int cpupid; 673 674 if (PageError(page)) 675 SetPageError(newpage); 676 if (PageReferenced(page)) 677 SetPageReferenced(newpage); 678 if (PageUptodate(page)) 679 SetPageUptodate(newpage); 680 if (TestClearPageActive(page)) { 681 VM_BUG_ON_PAGE(PageUnevictable(page), page); 682 SetPageActive(newpage); 683 } else if (TestClearPageUnevictable(page)) 684 SetPageUnevictable(newpage); 685 if (PageChecked(page)) 686 SetPageChecked(newpage); 687 if (PageMappedToDisk(page)) 688 SetPageMappedToDisk(newpage); 689 690 /* Move dirty on pages not done by migrate_page_move_mapping() */ 691 if (PageDirty(page)) 692 SetPageDirty(newpage); 693 694 if (page_is_young(page)) 695 set_page_young(newpage); 696 if (page_is_idle(page)) 697 set_page_idle(newpage); 698 699 /* 700 * Copy NUMA information to the new page, to prevent over-eager 701 * future migrations of this same page. 702 */ 703 cpupid = page_cpupid_xchg_last(page, -1); 704 page_cpupid_xchg_last(newpage, cpupid); 705 706 ksm_migrate_page(newpage, page); 707 /* 708 * Please do not reorder this without considering how mm/ksm.c's 709 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 710 */ 711 if (PageSwapCache(page)) 712 ClearPageSwapCache(page); 713 ClearPagePrivate(page); 714 set_page_private(page, 0); 715 716 /* 717 * If any waiters have accumulated on the new page then 718 * wake them up. 719 */ 720 if (PageWriteback(newpage)) 721 end_page_writeback(newpage); 722 723 copy_page_owner(page, newpage); 724 725 mem_cgroup_migrate(page, newpage); 726 } 727 EXPORT_SYMBOL(migrate_page_states); 728 729 void migrate_page_copy(struct page *newpage, struct page *page) 730 { 731 if (PageHuge(page) || PageTransHuge(page)) 732 copy_huge_page(newpage, page); 733 else 734 copy_highpage(newpage, page); 735 736 migrate_page_states(newpage, page); 737 } 738 EXPORT_SYMBOL(migrate_page_copy); 739 740 /************************************************************ 741 * Migration functions 742 ***********************************************************/ 743 744 /* 745 * Common logic to directly migrate a single LRU page suitable for 746 * pages that do not use PagePrivate/PagePrivate2. 747 * 748 * Pages are locked upon entry and exit. 
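 *
 * (Illustrative wiring, with made-up foo_* names: an address space that
 * keeps no private data can point its hook straight at this helper,
 *
 *	static const struct address_space_operations foo_aops = {
 *		.readpage	= foo_readpage,
 *		.writepage	= foo_writepage,
 *		.migratepage	= migrate_page,
 *	};
 *
 * which is also effectively what move_to_new_page() falls back to for
 * pages without a mapping.)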
749 */ 750 int migrate_page(struct address_space *mapping, 751 struct page *newpage, struct page *page, 752 enum migrate_mode mode) 753 { 754 int rc; 755 756 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 757 758 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); 759 760 if (rc != MIGRATEPAGE_SUCCESS) 761 return rc; 762 763 if (mode != MIGRATE_SYNC_NO_COPY) 764 migrate_page_copy(newpage, page); 765 else 766 migrate_page_states(newpage, page); 767 return MIGRATEPAGE_SUCCESS; 768 } 769 EXPORT_SYMBOL(migrate_page); 770 771 #ifdef CONFIG_BLOCK 772 /* 773 * Migration function for pages with buffers. This function can only be used 774 * if the underlying filesystem guarantees that no other references to "page" 775 * exist. 776 */ 777 int buffer_migrate_page(struct address_space *mapping, 778 struct page *newpage, struct page *page, enum migrate_mode mode) 779 { 780 struct buffer_head *bh, *head; 781 int rc; 782 783 if (!page_has_buffers(page)) 784 return migrate_page(mapping, newpage, page, mode); 785 786 head = page_buffers(page); 787 788 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); 789 790 if (rc != MIGRATEPAGE_SUCCESS) 791 return rc; 792 793 /* 794 * In the async case, migrate_page_move_mapping locked the buffers 795 * with an IRQ-safe spinlock held. In the sync case, the buffers 796 * need to be locked now 797 */ 798 if (mode != MIGRATE_ASYNC) 799 BUG_ON(!buffer_migrate_lock_buffers(head, mode)); 800 801 ClearPagePrivate(page); 802 set_page_private(newpage, page_private(page)); 803 set_page_private(page, 0); 804 put_page(page); 805 get_page(newpage); 806 807 bh = head; 808 do { 809 set_bh_page(bh, newpage, bh_offset(bh)); 810 bh = bh->b_this_page; 811 812 } while (bh != head); 813 814 SetPagePrivate(newpage); 815 816 if (mode != MIGRATE_SYNC_NO_COPY) 817 migrate_page_copy(newpage, page); 818 else 819 migrate_page_states(newpage, page); 820 821 bh = head; 822 do { 823 unlock_buffer(bh); 824 put_bh(bh); 825 bh = bh->b_this_page; 826 827 } while (bh != head); 828 829 return MIGRATEPAGE_SUCCESS; 830 } 831 EXPORT_SYMBOL(buffer_migrate_page); 832 #endif 833 834 /* 835 * Writeback a page to clean the dirty state 836 */ 837 static int writeout(struct address_space *mapping, struct page *page) 838 { 839 struct writeback_control wbc = { 840 .sync_mode = WB_SYNC_NONE, 841 .nr_to_write = 1, 842 .range_start = 0, 843 .range_end = LLONG_MAX, 844 .for_reclaim = 1 845 }; 846 int rc; 847 848 if (!mapping->a_ops->writepage) 849 /* No write method for the address space */ 850 return -EINVAL; 851 852 if (!clear_page_dirty_for_io(page)) 853 /* Someone else already triggered a write */ 854 return -EAGAIN; 855 856 /* 857 * A dirty page may imply that the underlying filesystem has 858 * the page on some queue. So the page must be clean for 859 * migration. Writeout may mean we loose the lock and the 860 * page state is no longer what we checked for earlier. 861 * At this point we know that the migration attempt cannot 862 * be successful. 863 */ 864 remove_migration_ptes(page, page, false); 865 866 rc = mapping->a_ops->writepage(page, &wbc); 867 868 if (rc != AOP_WRITEPAGE_ACTIVATE) 869 /* unlocked. Relock */ 870 lock_page(page); 871 872 return (rc < 0) ? -EIO : -EAGAIN; 873 } 874 875 /* 876 * Default handling if a filesystem does not provide a migration function. 
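 *
 * (Summary of the policy below, for orientation: a dirty page is only
 * written out under MIGRATE_SYNC/MIGRATE_SYNC_NO_COPY and otherwise
 * fails with -EBUSY; private buffers must be released via
 * try_to_release_page() or the attempt fails with -EAGAIN; anything
 * that survives both checks is handed to the generic migrate_page().)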
877 */ 878 static int fallback_migrate_page(struct address_space *mapping, 879 struct page *newpage, struct page *page, enum migrate_mode mode) 880 { 881 if (PageDirty(page)) { 882 /* Only writeback pages in full synchronous migration */ 883 switch (mode) { 884 case MIGRATE_SYNC: 885 case MIGRATE_SYNC_NO_COPY: 886 break; 887 default: 888 return -EBUSY; 889 } 890 return writeout(mapping, page); 891 } 892 893 /* 894 * Buffers may be managed in a filesystem specific way. 895 * We must have no buffers or drop them. 896 */ 897 if (page_has_private(page) && 898 !try_to_release_page(page, GFP_KERNEL)) 899 return -EAGAIN; 900 901 return migrate_page(mapping, newpage, page, mode); 902 } 903 904 /* 905 * Move a page to a newly allocated page 906 * The page is locked and all ptes have been successfully removed. 907 * 908 * The new page will have replaced the old page if this function 909 * is successful. 910 * 911 * Return value: 912 * < 0 - error code 913 * MIGRATEPAGE_SUCCESS - success 914 */ 915 static int move_to_new_page(struct page *newpage, struct page *page, 916 enum migrate_mode mode) 917 { 918 struct address_space *mapping; 919 int rc = -EAGAIN; 920 bool is_lru = !__PageMovable(page); 921 922 VM_BUG_ON_PAGE(!PageLocked(page), page); 923 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 924 925 mapping = page_mapping(page); 926 927 if (likely(is_lru)) { 928 if (!mapping) 929 rc = migrate_page(mapping, newpage, page, mode); 930 else if (mapping->a_ops->migratepage) 931 /* 932 * Most pages have a mapping and most filesystems 933 * provide a migratepage callback. Anonymous pages 934 * are part of swap space which also has its own 935 * migratepage callback. This is the most common path 936 * for page migration. 937 */ 938 rc = mapping->a_ops->migratepage(mapping, newpage, 939 page, mode); 940 else 941 rc = fallback_migrate_page(mapping, newpage, 942 page, mode); 943 } else { 944 /* 945 * In case of non-lru page, it could be released after 946 * isolation step. In that case, we shouldn't try migration. 947 */ 948 VM_BUG_ON_PAGE(!PageIsolated(page), page); 949 if (!PageMovable(page)) { 950 rc = MIGRATEPAGE_SUCCESS; 951 __ClearPageIsolated(page); 952 goto out; 953 } 954 955 rc = mapping->a_ops->migratepage(mapping, newpage, 956 page, mode); 957 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && 958 !PageIsolated(page)); 959 } 960 961 /* 962 * When successful, old pagecache page->mapping must be cleared before 963 * page is freed; but stats require that PageAnon be left as PageAnon. 964 */ 965 if (rc == MIGRATEPAGE_SUCCESS) { 966 if (__PageMovable(page)) { 967 VM_BUG_ON_PAGE(!PageIsolated(page), page); 968 969 /* 970 * We clear PG_movable under page_lock so any compactor 971 * cannot try to migrate this page. 972 */ 973 __ClearPageIsolated(page); 974 } 975 976 /* 977 * Anonymous and movable page->mapping will be cleard by 978 * free_pages_prepare so don't reset it here for keeping 979 * the type to work PageAnon, for example. 980 */ 981 if (!PageMappingFlags(page)) 982 page->mapping = NULL; 983 } 984 out: 985 return rc; 986 } 987 988 static int __unmap_and_move(struct page *page, struct page *newpage, 989 int force, enum migrate_mode mode) 990 { 991 int rc = -EAGAIN; 992 int page_was_mapped = 0; 993 struct anon_vma *anon_vma = NULL; 994 bool is_lru = !__PageMovable(page); 995 996 if (!trylock_page(page)) { 997 if (!force || mode == MIGRATE_ASYNC) 998 goto out; 999 1000 /* 1001 * It's not safe for direct compaction to call lock_page. 
1002 * For example, during page readahead pages are added locked 1003 * to the LRU. Later, when the IO completes the pages are 1004 * marked uptodate and unlocked. However, the queueing 1005 * could be merging multiple pages for one bio (e.g. 1006 * mpage_readpages). If an allocation happens for the 1007 * second or third page, the process can end up locking 1008 * the same page twice and deadlocking. Rather than 1009 * trying to be clever about what pages can be locked, 1010 * avoid the use of lock_page for direct compaction 1011 * altogether. 1012 */ 1013 if (current->flags & PF_MEMALLOC) 1014 goto out; 1015 1016 lock_page(page); 1017 } 1018 1019 if (PageWriteback(page)) { 1020 /* 1021 * Only in the case of a full synchronous migration is it 1022 * necessary to wait for PageWriteback. In the async case, 1023 * the retry loop is too short and in the sync-light case, 1024 * the overhead of stalling is too much 1025 */ 1026 switch (mode) { 1027 case MIGRATE_SYNC: 1028 case MIGRATE_SYNC_NO_COPY: 1029 break; 1030 default: 1031 rc = -EBUSY; 1032 goto out_unlock; 1033 } 1034 if (!force) 1035 goto out_unlock; 1036 wait_on_page_writeback(page); 1037 } 1038 1039 /* 1040 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 1041 * we cannot notice that anon_vma is freed while we migrates a page. 1042 * This get_anon_vma() delays freeing anon_vma pointer until the end 1043 * of migration. File cache pages are no problem because of page_lock() 1044 * File Caches may use write_page() or lock_page() in migration, then, 1045 * just care Anon page here. 1046 * 1047 * Only page_get_anon_vma() understands the subtleties of 1048 * getting a hold on an anon_vma from outside one of its mms. 1049 * But if we cannot get anon_vma, then we won't need it anyway, 1050 * because that implies that the anon page is no longer mapped 1051 * (and cannot be remapped so long as we hold the page lock). 1052 */ 1053 if (PageAnon(page) && !PageKsm(page)) 1054 anon_vma = page_get_anon_vma(page); 1055 1056 /* 1057 * Block others from accessing the new page when we get around to 1058 * establishing additional references. We are usually the only one 1059 * holding a reference to newpage at this point. We used to have a BUG 1060 * here if trylock_page(newpage) fails, but would like to allow for 1061 * cases where there might be a race with the previous use of newpage. 1062 * This is much like races on refcount of oldpage: just don't BUG(). 1063 */ 1064 if (unlikely(!trylock_page(newpage))) 1065 goto out_unlock; 1066 1067 if (unlikely(!is_lru)) { 1068 rc = move_to_new_page(newpage, page, mode); 1069 goto out_unlock_both; 1070 } 1071 1072 /* 1073 * Corner case handling: 1074 * 1. When a new swap-cache page is read into, it is added to the LRU 1075 * and treated as swapcache but it has no rmap yet. 1076 * Calling try_to_unmap() against a page->mapping==NULL page will 1077 * trigger a BUG. So handle it here. 1078 * 2. An orphaned page (see truncate_complete_page) might have 1079 * fs-private metadata. The page can be picked up due to memory 1080 * offlining. Everywhere else except page reclaim, the page is 1081 * invisible to the vm, so the page can not be migrated. So try to 1082 * free the metadata, so the page can be freed. 
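 *
 * (Hedged aside on the "establish migration ptes" step just below:
 * for every mapping of the page, try_to_unmap(TTU_MIGRATION) swaps the
 * present pte for a swap-format entry encoding the page and its write
 * permission, conceptually
 *
 *	entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, pvmw.pte, swp_entry_to_pte(entry));
 *
 * and remove_migration_ptes() performs the inverse once the outcome of
 * the migration is known.)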
1083 */ 1084 if (!page->mapping) { 1085 VM_BUG_ON_PAGE(PageAnon(page), page); 1086 if (page_has_private(page)) { 1087 try_to_free_buffers(page); 1088 goto out_unlock_both; 1089 } 1090 } else if (page_mapped(page)) { 1091 /* Establish migration ptes */ 1092 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, 1093 page); 1094 try_to_unmap(page, 1095 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1096 page_was_mapped = 1; 1097 } 1098 1099 if (!page_mapped(page)) 1100 rc = move_to_new_page(newpage, page, mode); 1101 1102 if (page_was_mapped) 1103 remove_migration_ptes(page, 1104 rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); 1105 1106 out_unlock_both: 1107 unlock_page(newpage); 1108 out_unlock: 1109 /* Drop an anon_vma reference if we took one */ 1110 if (anon_vma) 1111 put_anon_vma(anon_vma); 1112 unlock_page(page); 1113 out: 1114 /* 1115 * If migration is successful, decrease refcount of the newpage 1116 * which will not free the page because new page owner increased 1117 * refcounter. As well, if it is LRU page, add the page to LRU 1118 * list in here. 1119 */ 1120 if (rc == MIGRATEPAGE_SUCCESS) { 1121 if (unlikely(__PageMovable(newpage))) 1122 put_page(newpage); 1123 else 1124 putback_lru_page(newpage); 1125 } 1126 1127 return rc; 1128 } 1129 1130 /* 1131 * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work 1132 * around it. 1133 */ 1134 #if defined(CONFIG_ARM) && \ 1135 defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 1136 #define ICE_noinline noinline 1137 #else 1138 #define ICE_noinline 1139 #endif 1140 1141 /* 1142 * Obtain the lock on page, remove all ptes and migrate the page 1143 * to the newly allocated page in newpage. 1144 */ 1145 static ICE_noinline int unmap_and_move(new_page_t get_new_page, 1146 free_page_t put_new_page, 1147 unsigned long private, struct page *page, 1148 int force, enum migrate_mode mode, 1149 enum migrate_reason reason) 1150 { 1151 int rc = MIGRATEPAGE_SUCCESS; 1152 struct page *newpage; 1153 1154 if (!thp_migration_supported() && PageTransHuge(page)) 1155 return -ENOMEM; 1156 1157 newpage = get_new_page(page, private); 1158 if (!newpage) 1159 return -ENOMEM; 1160 1161 if (page_count(page) == 1) { 1162 /* page was freed from under us. So we are done. */ 1163 ClearPageActive(page); 1164 ClearPageUnevictable(page); 1165 if (unlikely(__PageMovable(page))) { 1166 lock_page(page); 1167 if (!PageMovable(page)) 1168 __ClearPageIsolated(page); 1169 unlock_page(page); 1170 } 1171 if (put_new_page) 1172 put_new_page(newpage, private); 1173 else 1174 put_page(newpage); 1175 goto out; 1176 } 1177 1178 rc = __unmap_and_move(page, newpage, force, mode); 1179 if (rc == MIGRATEPAGE_SUCCESS) 1180 set_page_owner_migrate_reason(newpage, reason); 1181 1182 out: 1183 if (rc != -EAGAIN) { 1184 /* 1185 * A page that has been migrated has all references 1186 * removed and will be freed. A page that has not been 1187 * migrated will have kepts its references and be 1188 * restored. 1189 */ 1190 list_del(&page->lru); 1191 1192 /* 1193 * Compaction can migrate also non-LRU pages which are 1194 * not accounted to NR_ISOLATED_*. They can be recognized 1195 * as __PageMovable 1196 */ 1197 if (likely(!__PageMovable(page))) 1198 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 1199 page_is_file_cache(page), -hpage_nr_pages(page)); 1200 } 1201 1202 /* 1203 * If migration is successful, releases reference grabbed during 1204 * isolation. Otherwise, restore the page to right list unless 1205 * we want to retry. 
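 *
 * (Reference-count sketch for the common LRU case, assuming no extra
 * pins: isolate_lru_page() left this path holding +1 on the old page
 * and get_new_page() returned the new page with +1.  On success the
 * old page's +1 is dropped here and the new page lives on through the
 * references installed by remove_migration_ptes(); on a permanent
 * failure the old page is put back on the LRU and the new page goes to
 * put_new_page() or put_page().)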
1206 */ 1207 if (rc == MIGRATEPAGE_SUCCESS) { 1208 put_page(page); 1209 if (reason == MR_MEMORY_FAILURE) { 1210 /* 1211 * Set PG_HWPoison on just freed page 1212 * intentionally. Although it's rather weird, 1213 * it's how HWPoison flag works at the moment. 1214 */ 1215 if (set_hwpoison_free_buddy_page(page)) 1216 num_poisoned_pages_inc(); 1217 } 1218 } else { 1219 if (rc != -EAGAIN) { 1220 if (likely(!__PageMovable(page))) { 1221 putback_lru_page(page); 1222 goto put_new; 1223 } 1224 1225 lock_page(page); 1226 if (PageMovable(page)) 1227 putback_movable_page(page); 1228 else 1229 __ClearPageIsolated(page); 1230 unlock_page(page); 1231 put_page(page); 1232 } 1233 put_new: 1234 if (put_new_page) 1235 put_new_page(newpage, private); 1236 else 1237 put_page(newpage); 1238 } 1239 1240 return rc; 1241 } 1242 1243 /* 1244 * Counterpart of unmap_and_move_page() for hugepage migration. 1245 * 1246 * This function doesn't wait the completion of hugepage I/O 1247 * because there is no race between I/O and migration for hugepage. 1248 * Note that currently hugepage I/O occurs only in direct I/O 1249 * where no lock is held and PG_writeback is irrelevant, 1250 * and writeback status of all subpages are counted in the reference 1251 * count of the head page (i.e. if all subpages of a 2MB hugepage are 1252 * under direct I/O, the reference of the head page is 512 and a bit more.) 1253 * This means that when we try to migrate hugepage whose subpages are 1254 * doing direct I/O, some references remain after try_to_unmap() and 1255 * hugepage migration fails without data corruption. 1256 * 1257 * There is also no race when direct I/O is issued on the page under migration, 1258 * because then pte is replaced with migration swap entry and direct I/O code 1259 * will wait in the page fault for migration to complete. 1260 */ 1261 static int unmap_and_move_huge_page(new_page_t get_new_page, 1262 free_page_t put_new_page, unsigned long private, 1263 struct page *hpage, int force, 1264 enum migrate_mode mode, int reason) 1265 { 1266 int rc = -EAGAIN; 1267 int page_was_mapped = 0; 1268 struct page *new_hpage; 1269 struct anon_vma *anon_vma = NULL; 1270 1271 /* 1272 * Movability of hugepages depends on architectures and hugepage size. 1273 * This check is necessary because some callers of hugepage migration 1274 * like soft offline and memory hotremove don't walk through page 1275 * tables or check whether the hugepage is pmd-based or not before 1276 * kicking migration. 1277 */ 1278 if (!hugepage_migration_supported(page_hstate(hpage))) { 1279 putback_active_hugepage(hpage); 1280 return -ENOSYS; 1281 } 1282 1283 new_hpage = get_new_page(hpage, private); 1284 if (!new_hpage) 1285 return -ENOMEM; 1286 1287 if (!trylock_page(hpage)) { 1288 if (!force) 1289 goto out; 1290 switch (mode) { 1291 case MIGRATE_SYNC: 1292 case MIGRATE_SYNC_NO_COPY: 1293 break; 1294 default: 1295 goto out; 1296 } 1297 lock_page(hpage); 1298 } 1299 1300 if (PageAnon(hpage)) 1301 anon_vma = page_get_anon_vma(hpage); 1302 1303 if (unlikely(!trylock_page(new_hpage))) 1304 goto put_anon; 1305 1306 if (page_mapped(hpage)) { 1307 try_to_unmap(hpage, 1308 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1309 page_was_mapped = 1; 1310 } 1311 1312 if (!page_mapped(hpage)) 1313 rc = move_to_new_page(new_hpage, hpage, mode); 1314 1315 if (page_was_mapped) 1316 remove_migration_ptes(hpage, 1317 rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage, false); 1318 1319 unlock_page(new_hpage); 1320 1321 put_anon: 1322 if (anon_vma) 1323 put_anon_vma(anon_vma); 1324 1325 if (rc == MIGRATEPAGE_SUCCESS) { 1326 move_hugetlb_state(hpage, new_hpage, reason); 1327 put_new_page = NULL; 1328 } 1329 1330 unlock_page(hpage); 1331 out: 1332 if (rc != -EAGAIN) 1333 putback_active_hugepage(hpage); 1334 1335 /* 1336 * If migration was not successful and there's a freeing callback, use 1337 * it. Otherwise, put_page() will drop the reference grabbed during 1338 * isolation. 1339 */ 1340 if (put_new_page) 1341 put_new_page(new_hpage, private); 1342 else 1343 putback_active_hugepage(new_hpage); 1344 1345 return rc; 1346 } 1347 1348 /* 1349 * migrate_pages - migrate the pages specified in a list, to the free pages 1350 * supplied as the target for the page migration 1351 * 1352 * @from: The list of pages to be migrated. 1353 * @get_new_page: The function used to allocate free pages to be used 1354 * as the target of the page migration. 1355 * @put_new_page: The function used to free target pages if migration 1356 * fails, or NULL if no special handling is necessary. 1357 * @private: Private data to be passed on to get_new_page() 1358 * @mode: The migration mode that specifies the constraints for 1359 * page migration, if any. 1360 * @reason: The reason for page migration. 1361 * 1362 * The function returns after 10 attempts or if no pages are movable any more 1363 * because the list has become empty or no retryable pages exist any more. 1364 * The caller should call putback_movable_pages() to return pages to the LRU 1365 * or free list only if ret != 0. 1366 * 1367 * Returns the number of pages that were not migrated, or an error code. 1368 */ 1369 int migrate_pages(struct list_head *from, new_page_t get_new_page, 1370 free_page_t put_new_page, unsigned long private, 1371 enum migrate_mode mode, int reason) 1372 { 1373 int retry = 1; 1374 int nr_failed = 0; 1375 int nr_succeeded = 0; 1376 int pass = 0; 1377 struct page *page; 1378 struct page *page2; 1379 int swapwrite = current->flags & PF_SWAPWRITE; 1380 int rc; 1381 1382 if (!swapwrite) 1383 current->flags |= PF_SWAPWRITE; 1384 1385 for(pass = 0; pass < 10 && retry; pass++) { 1386 retry = 0; 1387 1388 list_for_each_entry_safe(page, page2, from, lru) { 1389 retry: 1390 cond_resched(); 1391 1392 if (PageHuge(page)) 1393 rc = unmap_and_move_huge_page(get_new_page, 1394 put_new_page, private, page, 1395 pass > 2, mode, reason); 1396 else 1397 rc = unmap_and_move(get_new_page, put_new_page, 1398 private, page, pass > 2, mode, 1399 reason); 1400 1401 switch(rc) { 1402 case -ENOMEM: 1403 /* 1404 * THP migration might be unsupported or the 1405 * allocation could've failed so we should 1406 * retry on the same page with the THP split 1407 * to base pages. 1408 * 1409 * Head page is retried immediately and tail 1410 * pages are added to the tail of the list so 1411 * we encounter them after the rest of the list 1412 * is processed. 1413 */ 1414 if (PageTransHuge(page)) { 1415 lock_page(page); 1416 rc = split_huge_page_to_list(page, from); 1417 unlock_page(page); 1418 if (!rc) { 1419 list_safe_reset_next(page, page2, lru); 1420 goto retry; 1421 } 1422 } 1423 nr_failed++; 1424 goto out; 1425 case -EAGAIN: 1426 retry++; 1427 break; 1428 case MIGRATEPAGE_SUCCESS: 1429 nr_succeeded++; 1430 break; 1431 default: 1432 /* 1433 * Permanent failure (-EBUSY, -ENOSYS, etc.): 1434 * unlike -EAGAIN case, the failed page is 1435 * removed from migration page list and not 1436 * retried in the next outer loop. 
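 *
 * (Hedged caller-side sketch, tying into this: alloc_cb, free_cb and
 * the hotplug reason are placeholders, only the calling convention is
 * the point.  Whoever invoked migrate_pages() must put back whatever
 * is still on the list afterwards,
 *
 *	LIST_HEAD(pages);
 *	// ... fill via isolate_lru_page()/isolate_movable_page() ...
 *	err = migrate_pages(&pages, alloc_cb, free_cb, private,
 *			    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 *	if (err)
 *		putback_movable_pages(&pages);
 *
 * since pages that fail permanently are delisted and restored right
 * here, while anything still pending (retries exhausted, or an aborted
 * pass) stays on the list for the caller.)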
1437 */ 1438 nr_failed++; 1439 break; 1440 } 1441 } 1442 } 1443 nr_failed += retry; 1444 rc = nr_failed; 1445 out: 1446 if (nr_succeeded) 1447 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); 1448 if (nr_failed) 1449 count_vm_events(PGMIGRATE_FAIL, nr_failed); 1450 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); 1451 1452 if (!swapwrite) 1453 current->flags &= ~PF_SWAPWRITE; 1454 1455 return rc; 1456 } 1457 1458 #ifdef CONFIG_NUMA 1459 1460 static int store_status(int __user *status, int start, int value, int nr) 1461 { 1462 while (nr-- > 0) { 1463 if (put_user(value, status + start)) 1464 return -EFAULT; 1465 start++; 1466 } 1467 1468 return 0; 1469 } 1470 1471 static int do_move_pages_to_node(struct mm_struct *mm, 1472 struct list_head *pagelist, int node) 1473 { 1474 int err; 1475 1476 if (list_empty(pagelist)) 1477 return 0; 1478 1479 err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, 1480 MIGRATE_SYNC, MR_SYSCALL); 1481 if (err) 1482 putback_movable_pages(pagelist); 1483 return err; 1484 } 1485 1486 /* 1487 * Resolves the given address to a struct page, isolates it from the LRU and 1488 * puts it to the given pagelist. 1489 * Returns -errno if the page cannot be found/isolated or 0 when it has been 1490 * queued or the page doesn't need to be migrated because it is already on 1491 * the target node 1492 */ 1493 static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, 1494 int node, struct list_head *pagelist, bool migrate_all) 1495 { 1496 struct vm_area_struct *vma; 1497 struct page *page; 1498 unsigned int follflags; 1499 int err; 1500 1501 down_read(&mm->mmap_sem); 1502 err = -EFAULT; 1503 vma = find_vma(mm, addr); 1504 if (!vma || addr < vma->vm_start || !vma_migratable(vma)) 1505 goto out; 1506 1507 /* FOLL_DUMP to ignore special (like zero) pages */ 1508 follflags = FOLL_GET | FOLL_DUMP; 1509 page = follow_page(vma, addr, follflags); 1510 1511 err = PTR_ERR(page); 1512 if (IS_ERR(page)) 1513 goto out; 1514 1515 err = -ENOENT; 1516 if (!page) 1517 goto out; 1518 1519 err = 0; 1520 if (page_to_nid(page) == node) 1521 goto out_putpage; 1522 1523 err = -EACCES; 1524 if (page_mapcount(page) > 1 && !migrate_all) 1525 goto out_putpage; 1526 1527 if (PageHuge(page)) { 1528 if (PageHead(page)) { 1529 isolate_huge_page(page, pagelist); 1530 err = 0; 1531 } 1532 } else { 1533 struct page *head; 1534 1535 head = compound_head(page); 1536 err = isolate_lru_page(head); 1537 if (err) 1538 goto out_putpage; 1539 1540 err = 0; 1541 list_add_tail(&head->lru, pagelist); 1542 mod_node_page_state(page_pgdat(head), 1543 NR_ISOLATED_ANON + page_is_file_cache(head), 1544 hpage_nr_pages(head)); 1545 } 1546 out_putpage: 1547 /* 1548 * Either remove the duplicate refcount from 1549 * isolate_lru_page() or drop the page ref if it was 1550 * not isolated. 1551 */ 1552 put_page(page); 1553 out: 1554 up_read(&mm->mmap_sem); 1555 return err; 1556 } 1557 1558 /* 1559 * Migrate an array of page address onto an array of nodes and fill 1560 * the corresponding array of status. 
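 *
 * (Hedged userspace-side illustration, not kernel code: this is the
 * path behind move_pages(2) when a nodes[] array is supplied, e.g.
 *
 *	void *pages[2] = { a, b };
 *	int nodes[2] = { 1, 1 };
 *	int status[2];
 *	long rc = move_pages(0, 2, pages, nodes, status, MPOL_MF_MOVE);
 *	// per-page results, node id or negative errno, land in status[]
 *
 * Requests for one node are batched onto a single pagelist and flushed
 * through do_move_pages_to_node() whenever the target node changes.)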
1561 */ 1562 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, 1563 unsigned long nr_pages, 1564 const void __user * __user *pages, 1565 const int __user *nodes, 1566 int __user *status, int flags) 1567 { 1568 int current_node = NUMA_NO_NODE; 1569 LIST_HEAD(pagelist); 1570 int start, i; 1571 int err = 0, err1; 1572 1573 migrate_prep(); 1574 1575 for (i = start = 0; i < nr_pages; i++) { 1576 const void __user *p; 1577 unsigned long addr; 1578 int node; 1579 1580 err = -EFAULT; 1581 if (get_user(p, pages + i)) 1582 goto out_flush; 1583 if (get_user(node, nodes + i)) 1584 goto out_flush; 1585 addr = (unsigned long)p; 1586 1587 err = -ENODEV; 1588 if (node < 0 || node >= MAX_NUMNODES) 1589 goto out_flush; 1590 if (!node_state(node, N_MEMORY)) 1591 goto out_flush; 1592 1593 err = -EACCES; 1594 if (!node_isset(node, task_nodes)) 1595 goto out_flush; 1596 1597 if (current_node == NUMA_NO_NODE) { 1598 current_node = node; 1599 start = i; 1600 } else if (node != current_node) { 1601 err = do_move_pages_to_node(mm, &pagelist, current_node); 1602 if (err) 1603 goto out; 1604 err = store_status(status, start, current_node, i - start); 1605 if (err) 1606 goto out; 1607 start = i; 1608 current_node = node; 1609 } 1610 1611 /* 1612 * Errors in the page lookup or isolation are not fatal and we simply 1613 * report them via status 1614 */ 1615 err = add_page_for_migration(mm, addr, current_node, 1616 &pagelist, flags & MPOL_MF_MOVE_ALL); 1617 if (!err) 1618 continue; 1619 1620 err = store_status(status, i, err, 1); 1621 if (err) 1622 goto out_flush; 1623 1624 err = do_move_pages_to_node(mm, &pagelist, current_node); 1625 if (err) 1626 goto out; 1627 if (i > start) { 1628 err = store_status(status, start, current_node, i - start); 1629 if (err) 1630 goto out; 1631 } 1632 current_node = NUMA_NO_NODE; 1633 } 1634 out_flush: 1635 if (list_empty(&pagelist)) 1636 return err; 1637 1638 /* Make sure we do not overwrite the existing error */ 1639 err1 = do_move_pages_to_node(mm, &pagelist, current_node); 1640 if (!err1) 1641 err1 = store_status(status, start, current_node, i - start); 1642 if (!err) 1643 err = err1; 1644 out: 1645 return err; 1646 } 1647 1648 /* 1649 * Determine the nodes of an array of pages and store it in an array of status. 1650 */ 1651 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, 1652 const void __user **pages, int *status) 1653 { 1654 unsigned long i; 1655 1656 down_read(&mm->mmap_sem); 1657 1658 for (i = 0; i < nr_pages; i++) { 1659 unsigned long addr = (unsigned long)(*pages); 1660 struct vm_area_struct *vma; 1661 struct page *page; 1662 int err = -EFAULT; 1663 1664 vma = find_vma(mm, addr); 1665 if (!vma || addr < vma->vm_start) 1666 goto set_status; 1667 1668 /* FOLL_DUMP to ignore special (like zero) pages */ 1669 page = follow_page(vma, addr, FOLL_DUMP); 1670 1671 err = PTR_ERR(page); 1672 if (IS_ERR(page)) 1673 goto set_status; 1674 1675 err = page ? page_to_nid(page) : -ENOENT; 1676 set_status: 1677 *status = err; 1678 1679 pages++; 1680 status++; 1681 } 1682 1683 up_read(&mm->mmap_sem); 1684 } 1685 1686 /* 1687 * Determine the nodes of a user array of pages and store it in 1688 * a user array of status. 
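 *
 * (Hedged usage note: passing nodes == NULL to move_pages(2) reaches
 * this path and turns the call into a pure placement query,
 *
 *	int status;
 *	void *page = buf;		// buf: some mapped address
 *	if (move_pages(0, 1, &page, NULL, &status, 0) == 0)
 *		printf("resident on node %d\n", status);
 *
 * and the loop below handles at most DO_PAGES_STAT_CHUNK_NR user
 * pointers per iteration to keep the on-stack scratch buffers small.)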
1689 */ 1690 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, 1691 const void __user * __user *pages, 1692 int __user *status) 1693 { 1694 #define DO_PAGES_STAT_CHUNK_NR 16 1695 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1696 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1697 1698 while (nr_pages) { 1699 unsigned long chunk_nr; 1700 1701 chunk_nr = nr_pages; 1702 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) 1703 chunk_nr = DO_PAGES_STAT_CHUNK_NR; 1704 1705 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) 1706 break; 1707 1708 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1709 1710 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) 1711 break; 1712 1713 pages += chunk_nr; 1714 status += chunk_nr; 1715 nr_pages -= chunk_nr; 1716 } 1717 return nr_pages ? -EFAULT : 0; 1718 } 1719 1720 /* 1721 * Move a list of pages in the address space of the currently executing 1722 * process. 1723 */ 1724 static int kernel_move_pages(pid_t pid, unsigned long nr_pages, 1725 const void __user * __user *pages, 1726 const int __user *nodes, 1727 int __user *status, int flags) 1728 { 1729 struct task_struct *task; 1730 struct mm_struct *mm; 1731 int err; 1732 nodemask_t task_nodes; 1733 1734 /* Check flags */ 1735 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1736 return -EINVAL; 1737 1738 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1739 return -EPERM; 1740 1741 /* Find the mm_struct */ 1742 rcu_read_lock(); 1743 task = pid ? find_task_by_vpid(pid) : current; 1744 if (!task) { 1745 rcu_read_unlock(); 1746 return -ESRCH; 1747 } 1748 get_task_struct(task); 1749 1750 /* 1751 * Check if this process has the right to modify the specified 1752 * process. Use the regular "ptrace_may_access()" checks. 1753 */ 1754 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1755 rcu_read_unlock(); 1756 err = -EPERM; 1757 goto out; 1758 } 1759 rcu_read_unlock(); 1760 1761 err = security_task_movememory(task); 1762 if (err) 1763 goto out; 1764 1765 task_nodes = cpuset_mems_allowed(task); 1766 mm = get_task_mm(task); 1767 put_task_struct(task); 1768 1769 if (!mm) 1770 return -EINVAL; 1771 1772 if (nodes) 1773 err = do_pages_move(mm, task_nodes, nr_pages, pages, 1774 nodes, status, flags); 1775 else 1776 err = do_pages_stat(mm, nr_pages, pages, status); 1777 1778 mmput(mm); 1779 return err; 1780 1781 out: 1782 put_task_struct(task); 1783 return err; 1784 } 1785 1786 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, 1787 const void __user * __user *, pages, 1788 const int __user *, nodes, 1789 int __user *, status, int, flags) 1790 { 1791 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 1792 } 1793 1794 #ifdef CONFIG_COMPAT 1795 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, 1796 compat_uptr_t __user *, pages32, 1797 const int __user *, nodes, 1798 int __user *, status, 1799 int, flags) 1800 { 1801 const void __user * __user *pages; 1802 int i; 1803 1804 pages = compat_alloc_user_space(nr_pages * sizeof(void *)); 1805 for (i = 0; i < nr_pages; i++) { 1806 compat_uptr_t p; 1807 1808 if (get_user(p, pages32 + i) || 1809 put_user(compat_ptr(p), pages + i)) 1810 return -EFAULT; 1811 } 1812 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 1813 } 1814 #endif /* CONFIG_COMPAT */ 1815 1816 #ifdef CONFIG_NUMA_BALANCING 1817 /* 1818 * Returns true if this is a safe migration target node for misplaced NUMA 1819 * pages. 
Currently it only checks the watermarks which crude 1820 */ 1821 static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1822 unsigned long nr_migrate_pages) 1823 { 1824 int z; 1825 1826 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1827 struct zone *zone = pgdat->node_zones + z; 1828 1829 if (!populated_zone(zone)) 1830 continue; 1831 1832 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1833 if (!zone_watermark_ok(zone, 0, 1834 high_wmark_pages(zone) + 1835 nr_migrate_pages, 1836 0, 0)) 1837 continue; 1838 return true; 1839 } 1840 return false; 1841 } 1842 1843 static struct page *alloc_misplaced_dst_page(struct page *page, 1844 unsigned long data) 1845 { 1846 int nid = (int) data; 1847 struct page *newpage; 1848 1849 newpage = __alloc_pages_node(nid, 1850 (GFP_HIGHUSER_MOVABLE | 1851 __GFP_THISNODE | __GFP_NOMEMALLOC | 1852 __GFP_NORETRY | __GFP_NOWARN) & 1853 ~__GFP_RECLAIM, 0); 1854 1855 return newpage; 1856 } 1857 1858 /* 1859 * page migration rate limiting control. 1860 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1861 * window of time. Default here says do not migrate more than 1280M per second. 1862 */ 1863 static unsigned int migrate_interval_millisecs __read_mostly = 100; 1864 static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1865 1866 /* Returns true if the node is migrate rate-limited after the update */ 1867 static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1868 unsigned long nr_pages) 1869 { 1870 /* 1871 * Rate-limit the amount of data that is being migrated to a node. 1872 * Optimal placement is no good if the memory bus is saturated and 1873 * all the time is being spent migrating! 1874 */ 1875 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1876 spin_lock(&pgdat->numabalancing_migrate_lock); 1877 pgdat->numabalancing_migrate_nr_pages = 0; 1878 pgdat->numabalancing_migrate_next_window = jiffies + 1879 msecs_to_jiffies(migrate_interval_millisecs); 1880 spin_unlock(&pgdat->numabalancing_migrate_lock); 1881 } 1882 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { 1883 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, 1884 nr_pages); 1885 return true; 1886 } 1887 1888 /* 1889 * This is an unlocked non-atomic update so errors are possible. 1890 * The consequences are failing to migrate when we potentiall should 1891 * have which is not severe enough to warrant locking. If it is ever 1892 * a problem, it can be converted to a per-cpu counter. 1893 */ 1894 pgdat->numabalancing_migrate_nr_pages += nr_pages; 1895 return false; 1896 } 1897 1898 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1899 { 1900 int page_lru; 1901 1902 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); 1903 1904 /* Avoid migrating to a node that is nearly full */ 1905 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1906 return 0; 1907 1908 if (isolate_lru_page(page)) 1909 return 0; 1910 1911 /* 1912 * migrate_misplaced_transhuge_page() skips page migration's usual 1913 * check on page_count(), so we must do it here, now that the page 1914 * has been isolated: a GUP pin, or any other pin, prevents migration. 1915 * The expected page count is 3: 1 for page's mapcount and 1 for the 1916 * caller's pin and 1 for the reference taken by isolate_lru_page(). 
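 *
 * (Arithmetic spelled out, hedged: 1 for the mapcount + 1 for the
 * caller's pin + 1 from isolate_lru_page() = 3, so any further
 * reference, e.g. a get_user_pages() pin held by a driver, pushes
 * page_count() past 3 and the THP is put back below rather than
 * migrated out from under that pin.)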
1917 */ 1918 if (PageTransHuge(page) && page_count(page) != 3) { 1919 putback_lru_page(page); 1920 return 0; 1921 } 1922 1923 page_lru = page_is_file_cache(page); 1924 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, 1925 hpage_nr_pages(page)); 1926 1927 /* 1928 * Isolating the page has taken another reference, so the 1929 * caller's reference can be safely dropped without the page 1930 * disappearing underneath us during migration. 1931 */ 1932 put_page(page); 1933 return 1; 1934 } 1935 1936 bool pmd_trans_migrating(pmd_t pmd) 1937 { 1938 struct page *page = pmd_page(pmd); 1939 return PageLocked(page); 1940 } 1941 1942 /* 1943 * Attempt to migrate a misplaced page to the specified destination 1944 * node. Caller is expected to have an elevated reference count on 1945 * the page that will be dropped by this function before returning. 1946 */ 1947 int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 1948 int node) 1949 { 1950 pg_data_t *pgdat = NODE_DATA(node); 1951 int isolated; 1952 int nr_remaining; 1953 LIST_HEAD(migratepages); 1954 1955 /* 1956 * Don't migrate file pages that are mapped in multiple processes 1957 * with execute permissions as they are probably shared libraries. 1958 */ 1959 if (page_mapcount(page) != 1 && page_is_file_cache(page) && 1960 (vma->vm_flags & VM_EXEC)) 1961 goto out; 1962 1963 /* 1964 * Also do not migrate dirty pages as not all filesystems can move 1965 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 1966 */ 1967 if (page_is_file_cache(page) && PageDirty(page)) 1968 goto out; 1969 1970 /* 1971 * Rate-limit the amount of data that is being migrated to a node. 1972 * Optimal placement is no good if the memory bus is saturated and 1973 * all the time is being spent migrating! 1974 */ 1975 if (numamigrate_update_ratelimit(pgdat, 1)) 1976 goto out; 1977 1978 isolated = numamigrate_isolate_page(pgdat, page); 1979 if (!isolated) 1980 goto out; 1981 1982 list_add(&page->lru, &migratepages); 1983 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1984 NULL, node, MIGRATE_ASYNC, 1985 MR_NUMA_MISPLACED); 1986 if (nr_remaining) { 1987 if (!list_empty(&migratepages)) { 1988 list_del(&page->lru); 1989 dec_node_page_state(page, NR_ISOLATED_ANON + 1990 page_is_file_cache(page)); 1991 putback_lru_page(page); 1992 } 1993 isolated = 0; 1994 } else 1995 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1996 BUG_ON(!list_empty(&migratepages)); 1997 return isolated; 1998 1999 out: 2000 put_page(page); 2001 return 0; 2002 } 2003 #endif /* CONFIG_NUMA_BALANCING */ 2004 2005 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 2006 /* 2007 * Migrates a THP to a given target node. page must be locked and is unlocked 2008 * before returning. 2009 */ 2010 int migrate_misplaced_transhuge_page(struct mm_struct *mm, 2011 struct vm_area_struct *vma, 2012 pmd_t *pmd, pmd_t entry, 2013 unsigned long address, 2014 struct page *page, int node) 2015 { 2016 spinlock_t *ptl; 2017 pg_data_t *pgdat = NODE_DATA(node); 2018 int isolated = 0; 2019 struct page *new_page = NULL; 2020 int page_lru = page_is_file_cache(page); 2021 unsigned long mmun_start = address & HPAGE_PMD_MASK; 2022 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 2023 2024 /* 2025 * Rate-limit the amount of data that is being migrated to a node. 2026 * Optimal placement is no good if the memory bus is saturated and 2027 * all the time is being spent migrating! 
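 *
 * (Worked numbers from the defaults above, hedged: ratelimit_pages is
 * 128 << (20 - PAGE_SHIFT), i.e. 32768 pages or 128MB with 4KB pages,
 * per migrate_interval_millisecs = 100ms window, matching the "no more
 * than 1280M per second" note next to those knobs; a single THP charges
 * HPAGE_PMD_NR (512 on x86-64) pages against that budget.)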
2028 */ 2029 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) 2030 goto out_dropref; 2031 2032 new_page = alloc_pages_node(node, 2033 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 2034 HPAGE_PMD_ORDER); 2035 if (!new_page) 2036 goto out_fail; 2037 prep_transhuge_page(new_page); 2038 2039 isolated = numamigrate_isolate_page(pgdat, page); 2040 if (!isolated) { 2041 put_page(new_page); 2042 goto out_fail; 2043 } 2044 2045 /* Prepare a page as a migration target */ 2046 __SetPageLocked(new_page); 2047 if (PageSwapBacked(page)) 2048 __SetPageSwapBacked(new_page); 2049 2050 /* anon mapping, we can simply copy page->mapping to the new page: */ 2051 new_page->mapping = page->mapping; 2052 new_page->index = page->index; 2053 migrate_page_copy(new_page, page); 2054 WARN_ON(PageLRU(new_page)); 2055 2056 /* Recheck the target PMD */ 2057 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2058 ptl = pmd_lock(mm, pmd); 2059 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2060 spin_unlock(ptl); 2061 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2062 2063 /* Reverse changes made by migrate_page_copy() */ 2064 if (TestClearPageActive(new_page)) 2065 SetPageActive(page); 2066 if (TestClearPageUnevictable(new_page)) 2067 SetPageUnevictable(page); 2068 2069 unlock_page(new_page); 2070 put_page(new_page); /* Free it */ 2071 2072 /* Retake the callers reference and putback on LRU */ 2073 get_page(page); 2074 putback_lru_page(page); 2075 mod_node_page_state(page_pgdat(page), 2076 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 2077 2078 goto out_unlock; 2079 } 2080 2081 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 2082 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2083 2084 /* 2085 * Clear the old entry under pagetable lock and establish the new PTE. 2086 * Any parallel GUP will either observe the old page blocking on the 2087 * page lock, block on the page table lock or observe the new page. 2088 * The SetPageUptodate on the new page and page_add_new_anon_rmap 2089 * guarantee the copy is visible before the pagetable update. 2090 */ 2091 flush_cache_range(vma, mmun_start, mmun_end); 2092 page_add_anon_rmap(new_page, vma, mmun_start, true); 2093 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 2094 set_pmd_at(mm, mmun_start, pmd, entry); 2095 update_mmu_cache_pmd(vma, address, &entry); 2096 2097 page_ref_unfreeze(page, 2); 2098 mlock_migrate_page(new_page, page); 2099 page_remove_rmap(page, true); 2100 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2101 2102 spin_unlock(ptl); 2103 /* 2104 * No need to double call mmu_notifier->invalidate_range() callback as 2105 * the above pmdp_huge_clear_flush_notify() did already call it. 2106 */ 2107 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); 2108 2109 /* Take an "isolate" reference and put new page on the LRU. 
*/ 2110 get_page(new_page); 2111 putback_lru_page(new_page); 2112 2113 unlock_page(new_page); 2114 unlock_page(page); 2115 put_page(page); /* Drop the rmap reference */ 2116 put_page(page); /* Drop the LRU isolation reference */ 2117 2118 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2119 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2120 2121 mod_node_page_state(page_pgdat(page), 2122 NR_ISOLATED_ANON + page_lru, 2123 -HPAGE_PMD_NR); 2124 return isolated; 2125 2126 out_fail: 2127 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2128 out_dropref: 2129 ptl = pmd_lock(mm, pmd); 2130 if (pmd_same(*pmd, entry)) { 2131 entry = pmd_modify(entry, vma->vm_page_prot); 2132 set_pmd_at(mm, mmun_start, pmd, entry); 2133 update_mmu_cache_pmd(vma, address, &entry); 2134 } 2135 spin_unlock(ptl); 2136 2137 out_unlock: 2138 unlock_page(page); 2139 put_page(page); 2140 return 0; 2141 } 2142 #endif /* CONFIG_NUMA_BALANCING */ 2143 2144 #endif /* CONFIG_NUMA */ 2145 2146 #if defined(CONFIG_MIGRATE_VMA_HELPER) 2147 struct migrate_vma { 2148 struct vm_area_struct *vma; 2149 unsigned long *dst; 2150 unsigned long *src; 2151 unsigned long cpages; 2152 unsigned long npages; 2153 unsigned long start; 2154 unsigned long end; 2155 }; 2156 2157 static int migrate_vma_collect_hole(unsigned long start, 2158 unsigned long end, 2159 struct mm_walk *walk) 2160 { 2161 struct migrate_vma *migrate = walk->private; 2162 unsigned long addr; 2163 2164 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2165 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 2166 migrate->dst[migrate->npages] = 0; 2167 migrate->npages++; 2168 migrate->cpages++; 2169 } 2170 2171 return 0; 2172 } 2173 2174 static int migrate_vma_collect_skip(unsigned long start, 2175 unsigned long end, 2176 struct mm_walk *walk) 2177 { 2178 struct migrate_vma *migrate = walk->private; 2179 unsigned long addr; 2180 2181 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2182 migrate->dst[migrate->npages] = 0; 2183 migrate->src[migrate->npages++] = 0; 2184 } 2185 2186 return 0; 2187 } 2188 2189 static int migrate_vma_collect_pmd(pmd_t *pmdp, 2190 unsigned long start, 2191 unsigned long end, 2192 struct mm_walk *walk) 2193 { 2194 struct migrate_vma *migrate = walk->private; 2195 struct vm_area_struct *vma = walk->vma; 2196 struct mm_struct *mm = vma->vm_mm; 2197 unsigned long addr = start, unmapped = 0; 2198 spinlock_t *ptl; 2199 pte_t *ptep; 2200 2201 again: 2202 if (pmd_none(*pmdp)) 2203 return migrate_vma_collect_hole(start, end, walk); 2204 2205 if (pmd_trans_huge(*pmdp)) { 2206 struct page *page; 2207 2208 ptl = pmd_lock(mm, pmdp); 2209 if (unlikely(!pmd_trans_huge(*pmdp))) { 2210 spin_unlock(ptl); 2211 goto again; 2212 } 2213 2214 page = pmd_page(*pmdp); 2215 if (is_huge_zero_page(page)) { 2216 spin_unlock(ptl); 2217 split_huge_pmd(vma, pmdp, addr); 2218 if (pmd_trans_unstable(pmdp)) 2219 return migrate_vma_collect_skip(start, end, 2220 walk); 2221 } else { 2222 int ret; 2223 2224 get_page(page); 2225 spin_unlock(ptl); 2226 if (unlikely(!trylock_page(page))) 2227 return migrate_vma_collect_skip(start, end, 2228 walk); 2229 ret = split_huge_page(page); 2230 unlock_page(page); 2231 put_page(page); 2232 if (ret) 2233 return migrate_vma_collect_skip(start, end, 2234 walk); 2235 if (pmd_none(*pmdp)) 2236 return migrate_vma_collect_hole(start, end, 2237 walk); 2238 } 2239 } 2240 2241 if (unlikely(pmd_bad(*pmdp))) 2242 return migrate_vma_collect_skip(start, end, walk); 2243 2244 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 
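	/*
	 * The loop below records one value per pte in migrate->src[]:
	 * MIGRATE_PFN_MIGRATE alone for empty entries, or
	 * migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE (plus MIGRATE_PFN_DEVICE,
	 * MIGRATE_PFN_WRITE and/or MIGRATE_PFN_LOCKED where applicable) for
	 * pages that can be migrated, and 0 for entries that must be skipped.
	 */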
2245 arch_enter_lazy_mmu_mode(); 2246 2247 for (; addr < end; addr += PAGE_SIZE, ptep++) { 2248 unsigned long mpfn, pfn; 2249 struct page *page; 2250 swp_entry_t entry; 2251 pte_t pte; 2252 2253 pte = *ptep; 2254 pfn = pte_pfn(pte); 2255 2256 if (pte_none(pte)) { 2257 mpfn = MIGRATE_PFN_MIGRATE; 2258 migrate->cpages++; 2259 pfn = 0; 2260 goto next; 2261 } 2262 2263 if (!pte_present(pte)) { 2264 mpfn = pfn = 0; 2265 2266 /* 2267 * Only care about unaddressable device page special 2268 * page table entry. Other special swap entries are not 2269 * migratable, and we ignore regular swapped page. 2270 */ 2271 entry = pte_to_swp_entry(pte); 2272 if (!is_device_private_entry(entry)) 2273 goto next; 2274 2275 page = device_private_entry_to_page(entry); 2276 mpfn = migrate_pfn(page_to_pfn(page))| 2277 MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; 2278 if (is_write_device_private_entry(entry)) 2279 mpfn |= MIGRATE_PFN_WRITE; 2280 } else { 2281 if (is_zero_pfn(pfn)) { 2282 mpfn = MIGRATE_PFN_MIGRATE; 2283 migrate->cpages++; 2284 pfn = 0; 2285 goto next; 2286 } 2287 page = _vm_normal_page(migrate->vma, addr, pte, true); 2288 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2289 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2290 } 2291 2292 /* FIXME support THP */ 2293 if (!page || !page->mapping || PageTransCompound(page)) { 2294 mpfn = pfn = 0; 2295 goto next; 2296 } 2297 pfn = page_to_pfn(page); 2298 2299 /* 2300 * By getting a reference on the page we pin it and that blocks 2301 * any kind of migration. Side effect is that it "freezes" the 2302 * pte. 2303 * 2304 * We drop this reference after isolating the page from the lru 2305 * for non device page (device page are not on the lru and thus 2306 * can't be dropped from it). 2307 */ 2308 get_page(page); 2309 migrate->cpages++; 2310 2311 /* 2312 * Optimize for the common case where page is only mapped once 2313 * in one process. If we can lock the page, then we can safely 2314 * set up a special migration page table entry now. 2315 */ 2316 if (trylock_page(page)) { 2317 pte_t swp_pte; 2318 2319 mpfn |= MIGRATE_PFN_LOCKED; 2320 ptep_get_and_clear(mm, addr, ptep); 2321 2322 /* Setup special migration page table entry */ 2323 entry = make_migration_entry(page, mpfn & 2324 MIGRATE_PFN_WRITE); 2325 swp_pte = swp_entry_to_pte(entry); 2326 if (pte_soft_dirty(pte)) 2327 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2328 set_pte_at(mm, addr, ptep, swp_pte); 2329 2330 /* 2331 * This is like regular unmap: we remove the rmap and 2332 * drop page refcount. Page won't be freed, as we took 2333 * a reference just above. 2334 */ 2335 page_remove_rmap(page, false); 2336 put_page(page); 2337 2338 if (pte_present(pte)) 2339 unmapped++; 2340 } 2341 2342 next: 2343 migrate->dst[migrate->npages] = 0; 2344 migrate->src[migrate->npages++] = mpfn; 2345 } 2346 arch_leave_lazy_mmu_mode(); 2347 pte_unmap_unlock(ptep - 1, ptl); 2348 2349 /* Only flush the TLB if we actually modified any entries */ 2350 if (unmapped) 2351 flush_tlb_range(walk->vma, start, end); 2352 2353 return 0; 2354 } 2355 2356 /* 2357 * migrate_vma_collect() - collect pages over a range of virtual addresses 2358 * @migrate: migrate struct containing all migration information 2359 * 2360 * This will walk the CPU page table. For each virtual address backed by a 2361 * valid page, it updates the src array and takes a reference on the page, in 2362 * order to pin the page until we lock it and unmap it. 
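 *
 * The walk itself is done by migrate_vma_collect_pmd() (pmd_entry) and
 * migrate_vma_collect_hole() (pte_hole) below, and is bracketed by
 * mmu_notifier_invalidate_range_start()/end() so that secondary MMUs are
 * invalidated for the range.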
2363 */ 2364 static void migrate_vma_collect(struct migrate_vma *migrate) 2365 { 2366 struct mm_walk mm_walk; 2367 2368 mm_walk.pmd_entry = migrate_vma_collect_pmd; 2369 mm_walk.pte_entry = NULL; 2370 mm_walk.pte_hole = migrate_vma_collect_hole; 2371 mm_walk.hugetlb_entry = NULL; 2372 mm_walk.test_walk = NULL; 2373 mm_walk.vma = migrate->vma; 2374 mm_walk.mm = migrate->vma->vm_mm; 2375 mm_walk.private = migrate; 2376 2377 mmu_notifier_invalidate_range_start(mm_walk.mm, 2378 migrate->start, 2379 migrate->end); 2380 walk_page_range(migrate->start, migrate->end, &mm_walk); 2381 mmu_notifier_invalidate_range_end(mm_walk.mm, 2382 migrate->start, 2383 migrate->end); 2384 2385 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2386 } 2387 2388 /* 2389 * migrate_vma_check_page() - check if page is pinned or not 2390 * @page: struct page to check 2391 * 2392 * Pinned pages cannot be migrated. This is the same test as in 2393 * migrate_page_move_mapping(), except that here we allow migration of a 2394 * ZONE_DEVICE page. 2395 */ 2396 static bool migrate_vma_check_page(struct page *page) 2397 { 2398 /* 2399 * One extra ref because caller holds an extra reference, either from 2400 * isolate_lru_page() for a regular page, or migrate_vma_collect() for 2401 * a device page. 2402 */ 2403 int extra = 1; 2404 2405 /* 2406 * FIXME support THP (transparent huge page), it is bit more complex to 2407 * check them than regular pages, because they can be mapped with a pmd 2408 * or with a pte (split pte mapping). 2409 */ 2410 if (PageCompound(page)) 2411 return false; 2412 2413 /* Page from ZONE_DEVICE have one extra reference */ 2414 if (is_zone_device_page(page)) { 2415 /* 2416 * Private page can never be pin as they have no valid pte and 2417 * GUP will fail for those. Yet if there is a pending migration 2418 * a thread might try to wait on the pte migration entry and 2419 * will bump the page reference count. Sadly there is no way to 2420 * differentiate a regular pin from migration wait. Hence to 2421 * avoid 2 racing thread trying to migrate back to CPU to enter 2422 * infinite loop (one stoping migration because the other is 2423 * waiting on pte migration entry). We always return true here. 2424 * 2425 * FIXME proper solution is to rework migration_entry_wait() so 2426 * it does not need to take a reference on page. 2427 */ 2428 if (is_device_private_page(page)) 2429 return true; 2430 2431 /* 2432 * Only allow device public page to be migrated and account for 2433 * the extra reference count imply by ZONE_DEVICE pages. 2434 */ 2435 if (!is_device_public_page(page)) 2436 return false; 2437 extra++; 2438 } 2439 2440 /* For file back page */ 2441 if (page_mapping(page)) 2442 extra += 1 + page_has_private(page); 2443 2444 if ((page_count(page) - extra) > page_mapcount(page)) 2445 return false; 2446 2447 return true; 2448 } 2449 2450 /* 2451 * migrate_vma_prepare() - lock pages and isolate them from the lru 2452 * @migrate: migrate struct containing all migration information 2453 * 2454 * This locks pages that have been collected by migrate_vma_collect(). Once each 2455 * page is locked it is isolated from the lru (for non-device pages). Finally, 2456 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be 2457 * migrated by concurrent kernel threads. 
2458 */ 2459 static void migrate_vma_prepare(struct migrate_vma *migrate) 2460 { 2461 const unsigned long npages = migrate->npages; 2462 const unsigned long start = migrate->start; 2463 unsigned long addr, i, restore = 0; 2464 bool allow_drain = true; 2465 2466 lru_add_drain(); 2467 2468 for (i = 0; (i < npages) && migrate->cpages; i++) { 2469 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2470 bool remap = true; 2471 2472 if (!page) 2473 continue; 2474 2475 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { 2476 /* 2477 * Because we are migrating several pages there can be 2478 * a deadlock between 2 concurrent migration where each 2479 * are waiting on each other page lock. 2480 * 2481 * Make migrate_vma() a best effort thing and backoff 2482 * for any page we can not lock right away. 2483 */ 2484 if (!trylock_page(page)) { 2485 migrate->src[i] = 0; 2486 migrate->cpages--; 2487 put_page(page); 2488 continue; 2489 } 2490 remap = false; 2491 migrate->src[i] |= MIGRATE_PFN_LOCKED; 2492 } 2493 2494 /* ZONE_DEVICE pages are not on LRU */ 2495 if (!is_zone_device_page(page)) { 2496 if (!PageLRU(page) && allow_drain) { 2497 /* Drain CPU's pagevec */ 2498 lru_add_drain_all(); 2499 allow_drain = false; 2500 } 2501 2502 if (isolate_lru_page(page)) { 2503 if (remap) { 2504 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2505 migrate->cpages--; 2506 restore++; 2507 } else { 2508 migrate->src[i] = 0; 2509 unlock_page(page); 2510 migrate->cpages--; 2511 put_page(page); 2512 } 2513 continue; 2514 } 2515 2516 /* Drop the reference we took in collect */ 2517 put_page(page); 2518 } 2519 2520 if (!migrate_vma_check_page(page)) { 2521 if (remap) { 2522 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2523 migrate->cpages--; 2524 restore++; 2525 2526 if (!is_zone_device_page(page)) { 2527 get_page(page); 2528 putback_lru_page(page); 2529 } 2530 } else { 2531 migrate->src[i] = 0; 2532 unlock_page(page); 2533 migrate->cpages--; 2534 2535 if (!is_zone_device_page(page)) 2536 putback_lru_page(page); 2537 else 2538 put_page(page); 2539 } 2540 } 2541 } 2542 2543 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { 2544 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2545 2546 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2547 continue; 2548 2549 remove_migration_pte(page, migrate->vma, addr, page); 2550 2551 migrate->src[i] = 0; 2552 unlock_page(page); 2553 put_page(page); 2554 restore--; 2555 } 2556 } 2557 2558 /* 2559 * migrate_vma_unmap() - replace page mapping with special migration pte entry 2560 * @migrate: migrate struct containing all migration information 2561 * 2562 * Replace page mapping (CPU page table pte) with a special migration pte entry 2563 * and check again if it has been pinned. Pinned pages are restored because we 2564 * cannot migrate them. 2565 * 2566 * This is the last step before we call the device driver callback to allocate 2567 * destination memory and copy contents of original page over to new page. 
2568 */ 2569 static void migrate_vma_unmap(struct migrate_vma *migrate) 2570 { 2571 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 2572 const unsigned long npages = migrate->npages; 2573 const unsigned long start = migrate->start; 2574 unsigned long addr, i, restore = 0; 2575 2576 for (i = 0; i < npages; i++) { 2577 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2578 2579 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2580 continue; 2581 2582 if (page_mapped(page)) { 2583 try_to_unmap(page, flags); 2584 if (page_mapped(page)) 2585 goto restore; 2586 } 2587 2588 if (migrate_vma_check_page(page)) 2589 continue; 2590 2591 restore: 2592 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2593 migrate->cpages--; 2594 restore++; 2595 } 2596 2597 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { 2598 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2599 2600 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2601 continue; 2602 2603 remove_migration_ptes(page, page, false); 2604 2605 migrate->src[i] = 0; 2606 unlock_page(page); 2607 restore--; 2608 2609 if (is_zone_device_page(page)) 2610 put_page(page); 2611 else 2612 putback_lru_page(page); 2613 } 2614 } 2615 2616 static void migrate_vma_insert_page(struct migrate_vma *migrate, 2617 unsigned long addr, 2618 struct page *page, 2619 unsigned long *src, 2620 unsigned long *dst) 2621 { 2622 struct vm_area_struct *vma = migrate->vma; 2623 struct mm_struct *mm = vma->vm_mm; 2624 struct mem_cgroup *memcg; 2625 bool flush = false; 2626 spinlock_t *ptl; 2627 pte_t entry; 2628 pgd_t *pgdp; 2629 p4d_t *p4dp; 2630 pud_t *pudp; 2631 pmd_t *pmdp; 2632 pte_t *ptep; 2633 2634 /* Only allow populating anonymous memory */ 2635 if (!vma_is_anonymous(vma)) 2636 goto abort; 2637 2638 pgdp = pgd_offset(mm, addr); 2639 p4dp = p4d_alloc(mm, pgdp, addr); 2640 if (!p4dp) 2641 goto abort; 2642 pudp = pud_alloc(mm, p4dp, addr); 2643 if (!pudp) 2644 goto abort; 2645 pmdp = pmd_alloc(mm, pudp, addr); 2646 if (!pmdp) 2647 goto abort; 2648 2649 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 2650 goto abort; 2651 2652 /* 2653 * Use pte_alloc() instead of pte_alloc_map(). We can't run 2654 * pte_offset_map() on pmds where a huge pmd might be created 2655 * from a different thread. 2656 * 2657 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when 2658 * parallel threads are excluded by other means. 2659 * 2660 * Here we only have down_read(mmap_sem). 2661 */ 2662 if (pte_alloc(mm, pmdp, addr)) 2663 goto abort; 2664 2665 /* See the comment in pte_alloc_one_map() */ 2666 if (unlikely(pmd_trans_unstable(pmdp))) 2667 goto abort; 2668 2669 if (unlikely(anon_vma_prepare(vma))) 2670 goto abort; 2671 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) 2672 goto abort; 2673 2674 /* 2675 * The memory barrier inside __SetPageUptodate makes sure that 2676 * preceding stores to the page contents become visible before 2677 * the set_pte_at() write. 
2678 */ 2679 __SetPageUptodate(page); 2680 2681 if (is_zone_device_page(page)) { 2682 if (is_device_private_page(page)) { 2683 swp_entry_t swp_entry; 2684 2685 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); 2686 entry = swp_entry_to_pte(swp_entry); 2687 } else if (is_device_public_page(page)) { 2688 entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); 2689 if (vma->vm_flags & VM_WRITE) 2690 entry = pte_mkwrite(pte_mkdirty(entry)); 2691 entry = pte_mkdevmap(entry); 2692 } 2693 } else { 2694 entry = mk_pte(page, vma->vm_page_prot); 2695 if (vma->vm_flags & VM_WRITE) 2696 entry = pte_mkwrite(pte_mkdirty(entry)); 2697 } 2698 2699 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2700 2701 if (pte_present(*ptep)) { 2702 unsigned long pfn = pte_pfn(*ptep); 2703 2704 if (!is_zero_pfn(pfn)) { 2705 pte_unmap_unlock(ptep, ptl); 2706 mem_cgroup_cancel_charge(page, memcg, false); 2707 goto abort; 2708 } 2709 flush = true; 2710 } else if (!pte_none(*ptep)) { 2711 pte_unmap_unlock(ptep, ptl); 2712 mem_cgroup_cancel_charge(page, memcg, false); 2713 goto abort; 2714 } 2715 2716 /* 2717 * Check for usefaultfd but do not deliver the fault. Instead, 2718 * just back off. 2719 */ 2720 if (userfaultfd_missing(vma)) { 2721 pte_unmap_unlock(ptep, ptl); 2722 mem_cgroup_cancel_charge(page, memcg, false); 2723 goto abort; 2724 } 2725 2726 inc_mm_counter(mm, MM_ANONPAGES); 2727 page_add_new_anon_rmap(page, vma, addr, false); 2728 mem_cgroup_commit_charge(page, memcg, false, false); 2729 if (!is_zone_device_page(page)) 2730 lru_cache_add_active_or_unevictable(page, vma); 2731 get_page(page); 2732 2733 if (flush) { 2734 flush_cache_page(vma, addr, pte_pfn(*ptep)); 2735 ptep_clear_flush_notify(vma, addr, ptep); 2736 set_pte_at_notify(mm, addr, ptep, entry); 2737 update_mmu_cache(vma, addr, ptep); 2738 } else { 2739 /* No need to invalidate - it was non-present before */ 2740 set_pte_at(mm, addr, ptep, entry); 2741 update_mmu_cache(vma, addr, ptep); 2742 } 2743 2744 pte_unmap_unlock(ptep, ptl); 2745 *src = MIGRATE_PFN_MIGRATE; 2746 return; 2747 2748 abort: 2749 *src &= ~MIGRATE_PFN_MIGRATE; 2750 } 2751 2752 /* 2753 * migrate_vma_pages() - migrate meta-data from src page to dst page 2754 * @migrate: migrate struct containing all migration information 2755 * 2756 * This migrates struct page meta-data from source struct page to destination 2757 * struct page. This effectively finishes the migration from source page to the 2758 * destination page. 
2759 */ 2760 static void migrate_vma_pages(struct migrate_vma *migrate) 2761 { 2762 const unsigned long npages = migrate->npages; 2763 const unsigned long start = migrate->start; 2764 struct vm_area_struct *vma = migrate->vma; 2765 struct mm_struct *mm = vma->vm_mm; 2766 unsigned long addr, i, mmu_start; 2767 bool notified = false; 2768 2769 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2770 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2771 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2772 struct address_space *mapping; 2773 int r; 2774 2775 if (!newpage) { 2776 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2777 continue; 2778 } 2779 2780 if (!page) { 2781 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { 2782 continue; 2783 } 2784 if (!notified) { 2785 mmu_start = addr; 2786 notified = true; 2787 mmu_notifier_invalidate_range_start(mm, 2788 mmu_start, 2789 migrate->end); 2790 } 2791 migrate_vma_insert_page(migrate, addr, newpage, 2792 &migrate->src[i], 2793 &migrate->dst[i]); 2794 continue; 2795 } 2796 2797 mapping = page_mapping(page); 2798 2799 if (is_zone_device_page(newpage)) { 2800 if (is_device_private_page(newpage)) { 2801 /* 2802 * For now only support private anonymous when 2803 * migrating to un-addressable device memory. 2804 */ 2805 if (mapping) { 2806 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2807 continue; 2808 } 2809 } else if (!is_device_public_page(newpage)) { 2810 /* 2811 * Other types of ZONE_DEVICE page are not 2812 * supported. 2813 */ 2814 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2815 continue; 2816 } 2817 } 2818 2819 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 2820 if (r != MIGRATEPAGE_SUCCESS) 2821 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2822 } 2823 2824 /* 2825 * No need to double call mmu_notifier->invalidate_range() callback as 2826 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 2827 * did already call it. 2828 */ 2829 if (notified) 2830 mmu_notifier_invalidate_range_only_end(mm, mmu_start, 2831 migrate->end); 2832 } 2833 2834 /* 2835 * migrate_vma_finalize() - restore CPU page table entry 2836 * @migrate: migrate struct containing all migration information 2837 * 2838 * This replaces the special migration pte entry with either a mapping to the 2839 * new page if migration was successful for that page, or to the original page 2840 * otherwise. 2841 * 2842 * This also unlocks the pages and puts them back on the lru, or drops the extra 2843 * refcount, for device pages. 
2844 */ 2845 static void migrate_vma_finalize(struct migrate_vma *migrate) 2846 { 2847 const unsigned long npages = migrate->npages; 2848 unsigned long i; 2849 2850 for (i = 0; i < npages; i++) { 2851 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2852 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2853 2854 if (!page) { 2855 if (newpage) { 2856 unlock_page(newpage); 2857 put_page(newpage); 2858 } 2859 continue; 2860 } 2861 2862 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 2863 if (newpage) { 2864 unlock_page(newpage); 2865 put_page(newpage); 2866 } 2867 newpage = page; 2868 } 2869 2870 remove_migration_ptes(page, newpage, false); 2871 unlock_page(page); 2872 migrate->cpages--; 2873 2874 if (is_zone_device_page(page)) 2875 put_page(page); 2876 else 2877 putback_lru_page(page); 2878 2879 if (newpage != page) { 2880 unlock_page(newpage); 2881 if (is_zone_device_page(newpage)) 2882 put_page(newpage); 2883 else 2884 putback_lru_page(newpage); 2885 } 2886 } 2887 } 2888 2889 /* 2890 * migrate_vma() - migrate a range of memory inside vma 2891 * 2892 * @ops: migration callback for allocating destination memory and copying 2893 * @vma: virtual memory area containing the range to be migrated 2894 * @start: start address of the range to migrate (inclusive) 2895 * @end: end address of the range to migrate (exclusive) 2896 * @src: array of hmm_pfn_t containing source pfns 2897 * @dst: array of hmm_pfn_t containing destination pfns 2898 * @private: pointer passed back to each of the callback 2899 * Returns: 0 on success, error code otherwise 2900 * 2901 * This function tries to migrate a range of memory virtual address range, using 2902 * callbacks to allocate and copy memory from source to destination. First it 2903 * collects all the pages backing each virtual address in the range, saving this 2904 * inside the src array. Then it locks those pages and unmaps them. Once the pages 2905 * are locked and unmapped, it checks whether each page is pinned or not. Pages 2906 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) 2907 * in the corresponding src array entry. It then restores any pages that are 2908 * pinned, by remapping and unlocking those pages. 2909 * 2910 * At this point it calls the alloc_and_copy() callback. For documentation on 2911 * what is expected from that callback, see struct migrate_vma_ops comments in 2912 * include/linux/migrate.h 2913 * 2914 * After the alloc_and_copy() callback, this function goes over each entry in 2915 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 2916 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 2917 * then the function tries to migrate struct page information from the source 2918 * struct page to the destination struct page. If it fails to migrate the struct 2919 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src 2920 * array. 2921 * 2922 * At this point all successfully migrated pages have an entry in the src 2923 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 2924 * array entry with MIGRATE_PFN_VALID flag set. 2925 * 2926 * It then calls the finalize_and_map() callback. See comments for "struct 2927 * migrate_vma_ops", in include/linux/migrate.h for details about 2928 * finalize_and_map() behavior. 
2929 * 2930 * After the finalize_and_map() callback, for successfully migrated pages, this 2931 * function updates the CPU page table to point to new pages, otherwise it 2932 * restores the CPU page table to point to the original source pages. 2933 * 2934 * Function returns 0 after the above steps, even if no pages were migrated 2935 * (The function only returns an error if any of the arguments are invalid.) 2936 * 2937 * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT 2938 * unsigned long entries. 2939 */ 2940 int migrate_vma(const struct migrate_vma_ops *ops, 2941 struct vm_area_struct *vma, 2942 unsigned long start, 2943 unsigned long end, 2944 unsigned long *src, 2945 unsigned long *dst, 2946 void *private) 2947 { 2948 struct migrate_vma migrate; 2949 2950 /* Sanity check the arguments */ 2951 start &= PAGE_MASK; 2952 end &= PAGE_MASK; 2953 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || 2954 vma_is_dax(vma)) 2955 return -EINVAL; 2956 if (start < vma->vm_start || start >= vma->vm_end) 2957 return -EINVAL; 2958 if (end <= vma->vm_start || end > vma->vm_end) 2959 return -EINVAL; 2960 if (!ops || !src || !dst || start >= end) 2961 return -EINVAL; 2962 2963 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); 2964 migrate.src = src; 2965 migrate.dst = dst; 2966 migrate.start = start; 2967 migrate.npages = 0; 2968 migrate.cpages = 0; 2969 migrate.end = end; 2970 migrate.vma = vma; 2971 2972 /* Collect, and try to unmap source pages */ 2973 migrate_vma_collect(&migrate); 2974 if (!migrate.cpages) 2975 return 0; 2976 2977 /* Lock and isolate page */ 2978 migrate_vma_prepare(&migrate); 2979 if (!migrate.cpages) 2980 return 0; 2981 2982 /* Unmap pages */ 2983 migrate_vma_unmap(&migrate); 2984 if (!migrate.cpages) 2985 return 0; 2986 2987 /* 2988 * At this point pages are locked and unmapped, and thus they have 2989 * stable content and can safely be copied to destination memory that 2990 * is allocated by the callback. 2991 * 2992 * Note that migration can fail in migrate_vma_struct_page() for each 2993 * individual page. 2994 */ 2995 ops->alloc_and_copy(vma, src, dst, start, end, private); 2996 2997 /* This does the real migration of struct page */ 2998 migrate_vma_pages(&migrate); 2999 3000 ops->finalize_and_map(vma, src, dst, start, end, private); 3001 3002 /* Unlock and remap pages */ 3003 migrate_vma_finalize(&migrate); 3004 3005 return 0; 3006 } 3007 EXPORT_SYMBOL(migrate_vma); 3008 #endif /* defined(MIGRATE_VMA_HELPER) */ 3009