1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Memory Migration functionality - linux/mm/migrate.c 4 * 5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 6 * 7 * Page migration was first developed in the context of the memory hotplug 8 * project. The main authors of the migration code are: 9 * 10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 11 * Hirokazu Takahashi <taka@valinux.co.jp> 12 * Dave Hansen <haveblue@us.ibm.com> 13 * Christoph Lameter 14 */ 15 16 #include <linux/migrate.h> 17 #include <linux/export.h> 18 #include <linux/swap.h> 19 #include <linux/swapops.h> 20 #include <linux/pagemap.h> 21 #include <linux/buffer_head.h> 22 #include <linux/mm_inline.h> 23 #include <linux/nsproxy.h> 24 #include <linux/pagevec.h> 25 #include <linux/ksm.h> 26 #include <linux/rmap.h> 27 #include <linux/topology.h> 28 #include <linux/cpu.h> 29 #include <linux/cpuset.h> 30 #include <linux/writeback.h> 31 #include <linux/mempolicy.h> 32 #include <linux/vmalloc.h> 33 #include <linux/security.h> 34 #include <linux/backing-dev.h> 35 #include <linux/compaction.h> 36 #include <linux/syscalls.h> 37 #include <linux/compat.h> 38 #include <linux/hugetlb.h> 39 #include <linux/hugetlb_cgroup.h> 40 #include <linux/gfp.h> 41 #include <linux/pfn_t.h> 42 #include <linux/memremap.h> 43 #include <linux/userfaultfd_k.h> 44 #include <linux/balloon_compaction.h> 45 #include <linux/mmu_notifier.h> 46 #include <linux/page_idle.h> 47 #include <linux/page_owner.h> 48 #include <linux/sched/mm.h> 49 #include <linux/ptrace.h> 50 51 #include <asm/tlbflush.h> 52 53 #define CREATE_TRACE_POINTS 54 #include <trace/events/migrate.h> 55 56 #include "internal.h" 57 58 /* 59 * migrate_prep() needs to be called before we start compiling a list of pages 60 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is 61 * undesirable, use migrate_prep_local() 62 */ 63 int migrate_prep(void) 64 { 65 /* 66 * Clear the LRU lists so pages can be isolated. 67 * Note that pages may be moved off the LRU after we have 68 * drained them. Those pages will fail to migrate like other 69 * pages that may be busy. 70 */ 71 lru_add_drain_all(); 72 73 return 0; 74 } 75 76 /* Do the necessary work of migrate_prep but not if it involves other CPUs */ 77 int migrate_prep_local(void) 78 { 79 lru_add_drain(); 80 81 return 0; 82 } 83 84 int isolate_movable_page(struct page *page, isolate_mode_t mode) 85 { 86 struct address_space *mapping; 87 88 /* 89 * Avoid burning cycles with pages that are yet under __free_pages(), 90 * or just got freed under us. 91 * 92 * In case we 'win' a race for a movable page being freed under us and 93 * raise its refcount preventing __free_pages() from doing its job 94 * the put_page() at the end of this block will take care of 95 * release this page, thus avoiding a nasty leakage. 96 */ 97 if (unlikely(!get_page_unless_zero(page))) 98 goto out; 99 100 /* 101 * Check PageMovable before holding a PG_lock because page's owner 102 * assumes anybody doesn't touch PG_lock of newly allocated page 103 * so unconditionally grapping the lock ruins page's owner side. 104 */ 105 if (unlikely(!__PageMovable(page))) 106 goto out_putpage; 107 /* 108 * As movable pages are not isolated from LRU lists, concurrent 109 * compaction threads can race against page migration functions 110 * as well as race against the releasing a page. 111 * 112 * In order to avoid having an already isolated movable page 113 * being (wrongly) re-isolated while it is under migration, 114 * or to avoid attempting to isolate pages being released, 115 * lets be sure we have the page lock 116 * before proceeding with the movable page isolation steps. 117 */ 118 if (unlikely(!trylock_page(page))) 119 goto out_putpage; 120 121 if (!PageMovable(page) || PageIsolated(page)) 122 goto out_no_isolated; 123 124 mapping = page_mapping(page); 125 VM_BUG_ON_PAGE(!mapping, page); 126 127 if (!mapping->a_ops->isolate_page(page, mode)) 128 goto out_no_isolated; 129 130 /* Driver shouldn't use PG_isolated bit of page->flags */ 131 WARN_ON_ONCE(PageIsolated(page)); 132 __SetPageIsolated(page); 133 unlock_page(page); 134 135 return 0; 136 137 out_no_isolated: 138 unlock_page(page); 139 out_putpage: 140 put_page(page); 141 out: 142 return -EBUSY; 143 } 144 145 /* It should be called on page which is PG_movable */ 146 void putback_movable_page(struct page *page) 147 { 148 struct address_space *mapping; 149 150 VM_BUG_ON_PAGE(!PageLocked(page), page); 151 VM_BUG_ON_PAGE(!PageMovable(page), page); 152 VM_BUG_ON_PAGE(!PageIsolated(page), page); 153 154 mapping = page_mapping(page); 155 mapping->a_ops->putback_page(page); 156 __ClearPageIsolated(page); 157 } 158 159 /* 160 * Put previously isolated pages back onto the appropriate lists 161 * from where they were once taken off for compaction/migration. 162 * 163 * This function shall be used whenever the isolated pageset has been 164 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() 165 * and isolate_huge_page(). 166 */ 167 void putback_movable_pages(struct list_head *l) 168 { 169 struct page *page; 170 struct page *page2; 171 172 list_for_each_entry_safe(page, page2, l, lru) { 173 if (unlikely(PageHuge(page))) { 174 putback_active_hugepage(page); 175 continue; 176 } 177 list_del(&page->lru); 178 /* 179 * We isolated non-lru movable page so here we can use 180 * __PageMovable because LRU page's mapping cannot have 181 * PAGE_MAPPING_MOVABLE. 182 */ 183 if (unlikely(__PageMovable(page))) { 184 VM_BUG_ON_PAGE(!PageIsolated(page), page); 185 lock_page(page); 186 if (PageMovable(page)) 187 putback_movable_page(page); 188 else 189 __ClearPageIsolated(page); 190 unlock_page(page); 191 put_page(page); 192 } else { 193 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 194 page_is_file_cache(page), -hpage_nr_pages(page)); 195 putback_lru_page(page); 196 } 197 } 198 } 199 200 /* 201 * Restore a potential migration pte to a working pte entry 202 */ 203 static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 204 unsigned long addr, void *old) 205 { 206 struct page_vma_mapped_walk pvmw = { 207 .page = old, 208 .vma = vma, 209 .address = addr, 210 .flags = PVMW_SYNC | PVMW_MIGRATION, 211 }; 212 struct page *new; 213 pte_t pte; 214 swp_entry_t entry; 215 216 VM_BUG_ON_PAGE(PageTail(page), page); 217 while (page_vma_mapped_walk(&pvmw)) { 218 if (PageKsm(page)) 219 new = page; 220 else 221 new = page - pvmw.page->index + 222 linear_page_index(vma, pvmw.address); 223 224 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 225 /* PMD-mapped THP migration entry */ 226 if (!pvmw.pte) { 227 VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 228 remove_migration_pmd(&pvmw, new); 229 continue; 230 } 231 #endif 232 233 get_page(new); 234 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); 235 if (pte_swp_soft_dirty(*pvmw.pte)) 236 pte = pte_mksoft_dirty(pte); 237 238 /* 239 * Recheck VMA as permissions can change since migration started 240 */ 241 entry = pte_to_swp_entry(*pvmw.pte); 242 if (is_write_migration_entry(entry)) 243 pte = maybe_mkwrite(pte, vma); 244 245 if (unlikely(is_zone_device_page(new))) { 246 if (is_device_private_page(new)) { 247 entry = make_device_private_entry(new, pte_write(pte)); 248 pte = swp_entry_to_pte(entry); 249 } else if (is_device_public_page(new)) { 250 pte = pte_mkdevmap(pte); 251 flush_dcache_page(new); 252 } 253 } else 254 flush_dcache_page(new); 255 256 #ifdef CONFIG_HUGETLB_PAGE 257 if (PageHuge(new)) { 258 pte = pte_mkhuge(pte); 259 pte = arch_make_huge_pte(pte, vma, new, 0); 260 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 261 if (PageAnon(new)) 262 hugepage_add_anon_rmap(new, vma, pvmw.address); 263 else 264 page_dup_rmap(new, true); 265 } else 266 #endif 267 { 268 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 269 270 if (PageAnon(new)) 271 page_add_anon_rmap(new, vma, pvmw.address, false); 272 else 273 page_add_file_rmap(new, false); 274 } 275 if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 276 mlock_vma_page(new); 277 278 if (PageTransHuge(page) && PageMlocked(page)) 279 clear_page_mlock(page); 280 281 /* No need to invalidate - it was non-present before */ 282 update_mmu_cache(vma, pvmw.address, pvmw.pte); 283 } 284 285 return true; 286 } 287 288 /* 289 * Get rid of all migration entries and replace them by 290 * references to the indicated page. 291 */ 292 void remove_migration_ptes(struct page *old, struct page *new, bool locked) 293 { 294 struct rmap_walk_control rwc = { 295 .rmap_one = remove_migration_pte, 296 .arg = old, 297 }; 298 299 if (locked) 300 rmap_walk_locked(new, &rwc); 301 else 302 rmap_walk(new, &rwc); 303 } 304 305 /* 306 * Something used the pte of a page under migration. We need to 307 * get to the page and wait until migration is finished. 308 * When we return from this function the fault will be retried. 309 */ 310 void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 311 spinlock_t *ptl) 312 { 313 pte_t pte; 314 swp_entry_t entry; 315 struct page *page; 316 317 spin_lock(ptl); 318 pte = *ptep; 319 if (!is_swap_pte(pte)) 320 goto out; 321 322 entry = pte_to_swp_entry(pte); 323 if (!is_migration_entry(entry)) 324 goto out; 325 326 page = migration_entry_to_page(entry); 327 328 /* 329 * Once radix-tree replacement of page migration started, page_count 330 * *must* be zero. And, we don't want to call wait_on_page_locked() 331 * against a page without get_page(). 332 * So, we use get_page_unless_zero(), here. Even failed, page fault 333 * will occur again. 334 */ 335 if (!get_page_unless_zero(page)) 336 goto out; 337 pte_unmap_unlock(ptep, ptl); 338 wait_on_page_locked(page); 339 put_page(page); 340 return; 341 out: 342 pte_unmap_unlock(ptep, ptl); 343 } 344 345 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 346 unsigned long address) 347 { 348 spinlock_t *ptl = pte_lockptr(mm, pmd); 349 pte_t *ptep = pte_offset_map(pmd, address); 350 __migration_entry_wait(mm, ptep, ptl); 351 } 352 353 void migration_entry_wait_huge(struct vm_area_struct *vma, 354 struct mm_struct *mm, pte_t *pte) 355 { 356 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); 357 __migration_entry_wait(mm, pte, ptl); 358 } 359 360 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 361 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) 362 { 363 spinlock_t *ptl; 364 struct page *page; 365 366 ptl = pmd_lock(mm, pmd); 367 if (!is_pmd_migration_entry(*pmd)) 368 goto unlock; 369 page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); 370 if (!get_page_unless_zero(page)) 371 goto unlock; 372 spin_unlock(ptl); 373 wait_on_page_locked(page); 374 put_page(page); 375 return; 376 unlock: 377 spin_unlock(ptl); 378 } 379 #endif 380 381 #ifdef CONFIG_BLOCK 382 /* Returns true if all buffers are successfully locked */ 383 static bool buffer_migrate_lock_buffers(struct buffer_head *head, 384 enum migrate_mode mode) 385 { 386 struct buffer_head *bh = head; 387 388 /* Simple case, sync compaction */ 389 if (mode != MIGRATE_ASYNC) { 390 do { 391 get_bh(bh); 392 lock_buffer(bh); 393 bh = bh->b_this_page; 394 395 } while (bh != head); 396 397 return true; 398 } 399 400 /* async case, we cannot block on lock_buffer so use trylock_buffer */ 401 do { 402 get_bh(bh); 403 if (!trylock_buffer(bh)) { 404 /* 405 * We failed to lock the buffer and cannot stall in 406 * async migration. Release the taken locks 407 */ 408 struct buffer_head *failed_bh = bh; 409 put_bh(failed_bh); 410 bh = head; 411 while (bh != failed_bh) { 412 unlock_buffer(bh); 413 put_bh(bh); 414 bh = bh->b_this_page; 415 } 416 return false; 417 } 418 419 bh = bh->b_this_page; 420 } while (bh != head); 421 return true; 422 } 423 #else 424 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, 425 enum migrate_mode mode) 426 { 427 return true; 428 } 429 #endif /* CONFIG_BLOCK */ 430 431 /* 432 * Replace the page in the mapping. 433 * 434 * The number of remaining references must be: 435 * 1 for anonymous pages without a mapping 436 * 2 for pages with a mapping 437 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 438 */ 439 int migrate_page_move_mapping(struct address_space *mapping, 440 struct page *newpage, struct page *page, 441 struct buffer_head *head, enum migrate_mode mode, 442 int extra_count) 443 { 444 struct zone *oldzone, *newzone; 445 int dirty; 446 int expected_count = 1 + extra_count; 447 void **pslot; 448 449 /* 450 * Device public or private pages have an extra refcount as they are 451 * ZONE_DEVICE pages. 452 */ 453 expected_count += is_device_private_page(page); 454 expected_count += is_device_public_page(page); 455 456 if (!mapping) { 457 /* Anonymous page without mapping */ 458 if (page_count(page) != expected_count) 459 return -EAGAIN; 460 461 /* No turning back from here */ 462 newpage->index = page->index; 463 newpage->mapping = page->mapping; 464 if (PageSwapBacked(page)) 465 __SetPageSwapBacked(newpage); 466 467 return MIGRATEPAGE_SUCCESS; 468 } 469 470 oldzone = page_zone(page); 471 newzone = page_zone(newpage); 472 473 xa_lock_irq(&mapping->i_pages); 474 475 pslot = radix_tree_lookup_slot(&mapping->i_pages, 476 page_index(page)); 477 478 expected_count += hpage_nr_pages(page) + page_has_private(page); 479 if (page_count(page) != expected_count || 480 radix_tree_deref_slot_protected(pslot, 481 &mapping->i_pages.xa_lock) != page) { 482 xa_unlock_irq(&mapping->i_pages); 483 return -EAGAIN; 484 } 485 486 if (!page_ref_freeze(page, expected_count)) { 487 xa_unlock_irq(&mapping->i_pages); 488 return -EAGAIN; 489 } 490 491 /* 492 * In the async migration case of moving a page with buffers, lock the 493 * buffers using trylock before the mapping is moved. If the mapping 494 * was moved, we later failed to lock the buffers and could not move 495 * the mapping back due to an elevated page count, we would have to 496 * block waiting on other references to be dropped. 497 */ 498 if (mode == MIGRATE_ASYNC && head && 499 !buffer_migrate_lock_buffers(head, mode)) { 500 page_ref_unfreeze(page, expected_count); 501 xa_unlock_irq(&mapping->i_pages); 502 return -EAGAIN; 503 } 504 505 /* 506 * Now we know that no one else is looking at the page: 507 * no turning back from here. 508 */ 509 newpage->index = page->index; 510 newpage->mapping = page->mapping; 511 page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ 512 if (PageSwapBacked(page)) { 513 __SetPageSwapBacked(newpage); 514 if (PageSwapCache(page)) { 515 SetPageSwapCache(newpage); 516 set_page_private(newpage, page_private(page)); 517 } 518 } else { 519 VM_BUG_ON_PAGE(PageSwapCache(page), page); 520 } 521 522 /* Move dirty while page refs frozen and newpage not yet exposed */ 523 dirty = PageDirty(page); 524 if (dirty) { 525 ClearPageDirty(page); 526 SetPageDirty(newpage); 527 } 528 529 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 530 if (PageTransHuge(page)) { 531 int i; 532 int index = page_index(page); 533 534 for (i = 1; i < HPAGE_PMD_NR; i++) { 535 pslot = radix_tree_lookup_slot(&mapping->i_pages, 536 index + i); 537 radix_tree_replace_slot(&mapping->i_pages, pslot, 538 newpage + i); 539 } 540 } 541 542 /* 543 * Drop cache reference from old page by unfreezing 544 * to one less reference. 545 * We know this isn't the last reference. 546 */ 547 page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); 548 549 xa_unlock(&mapping->i_pages); 550 /* Leave irq disabled to prevent preemption while updating stats */ 551 552 /* 553 * If moved to a different zone then also account 554 * the page for that zone. Other VM counters will be 555 * taken care of when we establish references to the 556 * new page and drop references to the old page. 557 * 558 * Note that anonymous pages are accounted for 559 * via NR_FILE_PAGES and NR_ANON_MAPPED if they 560 * are mapped to swap space. 561 */ 562 if (newzone != oldzone) { 563 __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); 564 __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); 565 if (PageSwapBacked(page) && !PageSwapCache(page)) { 566 __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); 567 __inc_node_state(newzone->zone_pgdat, NR_SHMEM); 568 } 569 if (dirty && mapping_cap_account_dirty(mapping)) { 570 __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); 571 __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); 572 __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); 573 __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); 574 } 575 } 576 local_irq_enable(); 577 578 return MIGRATEPAGE_SUCCESS; 579 } 580 EXPORT_SYMBOL(migrate_page_move_mapping); 581 582 /* 583 * The expected number of remaining references is the same as that 584 * of migrate_page_move_mapping(). 585 */ 586 int migrate_huge_page_move_mapping(struct address_space *mapping, 587 struct page *newpage, struct page *page) 588 { 589 int expected_count; 590 void **pslot; 591 592 xa_lock_irq(&mapping->i_pages); 593 594 pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); 595 596 expected_count = 2 + page_has_private(page); 597 if (page_count(page) != expected_count || 598 radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { 599 xa_unlock_irq(&mapping->i_pages); 600 return -EAGAIN; 601 } 602 603 if (!page_ref_freeze(page, expected_count)) { 604 xa_unlock_irq(&mapping->i_pages); 605 return -EAGAIN; 606 } 607 608 newpage->index = page->index; 609 newpage->mapping = page->mapping; 610 611 get_page(newpage); 612 613 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 614 615 page_ref_unfreeze(page, expected_count - 1); 616 617 xa_unlock_irq(&mapping->i_pages); 618 619 return MIGRATEPAGE_SUCCESS; 620 } 621 622 /* 623 * Gigantic pages are so large that we do not guarantee that page++ pointer 624 * arithmetic will work across the entire page. We need something more 625 * specialized. 626 */ 627 static void __copy_gigantic_page(struct page *dst, struct page *src, 628 int nr_pages) 629 { 630 int i; 631 struct page *dst_base = dst; 632 struct page *src_base = src; 633 634 for (i = 0; i < nr_pages; ) { 635 cond_resched(); 636 copy_highpage(dst, src); 637 638 i++; 639 dst = mem_map_next(dst, dst_base, i); 640 src = mem_map_next(src, src_base, i); 641 } 642 } 643 644 static void copy_huge_page(struct page *dst, struct page *src) 645 { 646 int i; 647 int nr_pages; 648 649 if (PageHuge(src)) { 650 /* hugetlbfs page */ 651 struct hstate *h = page_hstate(src); 652 nr_pages = pages_per_huge_page(h); 653 654 if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { 655 __copy_gigantic_page(dst, src, nr_pages); 656 return; 657 } 658 } else { 659 /* thp page */ 660 BUG_ON(!PageTransHuge(src)); 661 nr_pages = hpage_nr_pages(src); 662 } 663 664 for (i = 0; i < nr_pages; i++) { 665 cond_resched(); 666 copy_highpage(dst + i, src + i); 667 } 668 } 669 670 /* 671 * Copy the page to its new location 672 */ 673 void migrate_page_states(struct page *newpage, struct page *page) 674 { 675 int cpupid; 676 677 if (PageError(page)) 678 SetPageError(newpage); 679 if (PageReferenced(page)) 680 SetPageReferenced(newpage); 681 if (PageUptodate(page)) 682 SetPageUptodate(newpage); 683 if (TestClearPageActive(page)) { 684 VM_BUG_ON_PAGE(PageUnevictable(page), page); 685 SetPageActive(newpage); 686 } else if (TestClearPageUnevictable(page)) 687 SetPageUnevictable(newpage); 688 if (PageChecked(page)) 689 SetPageChecked(newpage); 690 if (PageMappedToDisk(page)) 691 SetPageMappedToDisk(newpage); 692 693 /* Move dirty on pages not done by migrate_page_move_mapping() */ 694 if (PageDirty(page)) 695 SetPageDirty(newpage); 696 697 if (page_is_young(page)) 698 set_page_young(newpage); 699 if (page_is_idle(page)) 700 set_page_idle(newpage); 701 702 /* 703 * Copy NUMA information to the new page, to prevent over-eager 704 * future migrations of this same page. 705 */ 706 cpupid = page_cpupid_xchg_last(page, -1); 707 page_cpupid_xchg_last(newpage, cpupid); 708 709 ksm_migrate_page(newpage, page); 710 /* 711 * Please do not reorder this without considering how mm/ksm.c's 712 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 713 */ 714 if (PageSwapCache(page)) 715 ClearPageSwapCache(page); 716 ClearPagePrivate(page); 717 set_page_private(page, 0); 718 719 /* 720 * If any waiters have accumulated on the new page then 721 * wake them up. 722 */ 723 if (PageWriteback(newpage)) 724 end_page_writeback(newpage); 725 726 copy_page_owner(page, newpage); 727 728 mem_cgroup_migrate(page, newpage); 729 } 730 EXPORT_SYMBOL(migrate_page_states); 731 732 void migrate_page_copy(struct page *newpage, struct page *page) 733 { 734 if (PageHuge(page) || PageTransHuge(page)) 735 copy_huge_page(newpage, page); 736 else 737 copy_highpage(newpage, page); 738 739 migrate_page_states(newpage, page); 740 } 741 EXPORT_SYMBOL(migrate_page_copy); 742 743 /************************************************************ 744 * Migration functions 745 ***********************************************************/ 746 747 /* 748 * Common logic to directly migrate a single LRU page suitable for 749 * pages that do not use PagePrivate/PagePrivate2. 750 * 751 * Pages are locked upon entry and exit. 752 */ 753 int migrate_page(struct address_space *mapping, 754 struct page *newpage, struct page *page, 755 enum migrate_mode mode) 756 { 757 int rc; 758 759 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 760 761 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); 762 763 if (rc != MIGRATEPAGE_SUCCESS) 764 return rc; 765 766 if (mode != MIGRATE_SYNC_NO_COPY) 767 migrate_page_copy(newpage, page); 768 else 769 migrate_page_states(newpage, page); 770 return MIGRATEPAGE_SUCCESS; 771 } 772 EXPORT_SYMBOL(migrate_page); 773 774 #ifdef CONFIG_BLOCK 775 /* 776 * Migration function for pages with buffers. This function can only be used 777 * if the underlying filesystem guarantees that no other references to "page" 778 * exist. 779 */ 780 int buffer_migrate_page(struct address_space *mapping, 781 struct page *newpage, struct page *page, enum migrate_mode mode) 782 { 783 struct buffer_head *bh, *head; 784 int rc; 785 786 if (!page_has_buffers(page)) 787 return migrate_page(mapping, newpage, page, mode); 788 789 head = page_buffers(page); 790 791 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); 792 793 if (rc != MIGRATEPAGE_SUCCESS) 794 return rc; 795 796 /* 797 * In the async case, migrate_page_move_mapping locked the buffers 798 * with an IRQ-safe spinlock held. In the sync case, the buffers 799 * need to be locked now 800 */ 801 if (mode != MIGRATE_ASYNC) 802 BUG_ON(!buffer_migrate_lock_buffers(head, mode)); 803 804 ClearPagePrivate(page); 805 set_page_private(newpage, page_private(page)); 806 set_page_private(page, 0); 807 put_page(page); 808 get_page(newpage); 809 810 bh = head; 811 do { 812 set_bh_page(bh, newpage, bh_offset(bh)); 813 bh = bh->b_this_page; 814 815 } while (bh != head); 816 817 SetPagePrivate(newpage); 818 819 if (mode != MIGRATE_SYNC_NO_COPY) 820 migrate_page_copy(newpage, page); 821 else 822 migrate_page_states(newpage, page); 823 824 bh = head; 825 do { 826 unlock_buffer(bh); 827 put_bh(bh); 828 bh = bh->b_this_page; 829 830 } while (bh != head); 831 832 return MIGRATEPAGE_SUCCESS; 833 } 834 EXPORT_SYMBOL(buffer_migrate_page); 835 #endif 836 837 /* 838 * Writeback a page to clean the dirty state 839 */ 840 static int writeout(struct address_space *mapping, struct page *page) 841 { 842 struct writeback_control wbc = { 843 .sync_mode = WB_SYNC_NONE, 844 .nr_to_write = 1, 845 .range_start = 0, 846 .range_end = LLONG_MAX, 847 .for_reclaim = 1 848 }; 849 int rc; 850 851 if (!mapping->a_ops->writepage) 852 /* No write method for the address space */ 853 return -EINVAL; 854 855 if (!clear_page_dirty_for_io(page)) 856 /* Someone else already triggered a write */ 857 return -EAGAIN; 858 859 /* 860 * A dirty page may imply that the underlying filesystem has 861 * the page on some queue. So the page must be clean for 862 * migration. Writeout may mean we loose the lock and the 863 * page state is no longer what we checked for earlier. 864 * At this point we know that the migration attempt cannot 865 * be successful. 866 */ 867 remove_migration_ptes(page, page, false); 868 869 rc = mapping->a_ops->writepage(page, &wbc); 870 871 if (rc != AOP_WRITEPAGE_ACTIVATE) 872 /* unlocked. Relock */ 873 lock_page(page); 874 875 return (rc < 0) ? -EIO : -EAGAIN; 876 } 877 878 /* 879 * Default handling if a filesystem does not provide a migration function. 880 */ 881 static int fallback_migrate_page(struct address_space *mapping, 882 struct page *newpage, struct page *page, enum migrate_mode mode) 883 { 884 if (PageDirty(page)) { 885 /* Only writeback pages in full synchronous migration */ 886 switch (mode) { 887 case MIGRATE_SYNC: 888 case MIGRATE_SYNC_NO_COPY: 889 break; 890 default: 891 return -EBUSY; 892 } 893 return writeout(mapping, page); 894 } 895 896 /* 897 * Buffers may be managed in a filesystem specific way. 898 * We must have no buffers or drop them. 899 */ 900 if (page_has_private(page) && 901 !try_to_release_page(page, GFP_KERNEL)) 902 return -EAGAIN; 903 904 return migrate_page(mapping, newpage, page, mode); 905 } 906 907 /* 908 * Move a page to a newly allocated page 909 * The page is locked and all ptes have been successfully removed. 910 * 911 * The new page will have replaced the old page if this function 912 * is successful. 913 * 914 * Return value: 915 * < 0 - error code 916 * MIGRATEPAGE_SUCCESS - success 917 */ 918 static int move_to_new_page(struct page *newpage, struct page *page, 919 enum migrate_mode mode) 920 { 921 struct address_space *mapping; 922 int rc = -EAGAIN; 923 bool is_lru = !__PageMovable(page); 924 925 VM_BUG_ON_PAGE(!PageLocked(page), page); 926 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 927 928 mapping = page_mapping(page); 929 930 if (likely(is_lru)) { 931 if (!mapping) 932 rc = migrate_page(mapping, newpage, page, mode); 933 else if (mapping->a_ops->migratepage) 934 /* 935 * Most pages have a mapping and most filesystems 936 * provide a migratepage callback. Anonymous pages 937 * are part of swap space which also has its own 938 * migratepage callback. This is the most common path 939 * for page migration. 940 */ 941 rc = mapping->a_ops->migratepage(mapping, newpage, 942 page, mode); 943 else 944 rc = fallback_migrate_page(mapping, newpage, 945 page, mode); 946 } else { 947 /* 948 * In case of non-lru page, it could be released after 949 * isolation step. In that case, we shouldn't try migration. 950 */ 951 VM_BUG_ON_PAGE(!PageIsolated(page), page); 952 if (!PageMovable(page)) { 953 rc = MIGRATEPAGE_SUCCESS; 954 __ClearPageIsolated(page); 955 goto out; 956 } 957 958 rc = mapping->a_ops->migratepage(mapping, newpage, 959 page, mode); 960 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && 961 !PageIsolated(page)); 962 } 963 964 /* 965 * When successful, old pagecache page->mapping must be cleared before 966 * page is freed; but stats require that PageAnon be left as PageAnon. 967 */ 968 if (rc == MIGRATEPAGE_SUCCESS) { 969 if (__PageMovable(page)) { 970 VM_BUG_ON_PAGE(!PageIsolated(page), page); 971 972 /* 973 * We clear PG_movable under page_lock so any compactor 974 * cannot try to migrate this page. 975 */ 976 __ClearPageIsolated(page); 977 } 978 979 /* 980 * Anonymous and movable page->mapping will be cleard by 981 * free_pages_prepare so don't reset it here for keeping 982 * the type to work PageAnon, for example. 983 */ 984 if (!PageMappingFlags(page)) 985 page->mapping = NULL; 986 } 987 out: 988 return rc; 989 } 990 991 static int __unmap_and_move(struct page *page, struct page *newpage, 992 int force, enum migrate_mode mode) 993 { 994 int rc = -EAGAIN; 995 int page_was_mapped = 0; 996 struct anon_vma *anon_vma = NULL; 997 bool is_lru = !__PageMovable(page); 998 999 if (!trylock_page(page)) { 1000 if (!force || mode == MIGRATE_ASYNC) 1001 goto out; 1002 1003 /* 1004 * It's not safe for direct compaction to call lock_page. 1005 * For example, during page readahead pages are added locked 1006 * to the LRU. Later, when the IO completes the pages are 1007 * marked uptodate and unlocked. However, the queueing 1008 * could be merging multiple pages for one bio (e.g. 1009 * mpage_readpages). If an allocation happens for the 1010 * second or third page, the process can end up locking 1011 * the same page twice and deadlocking. Rather than 1012 * trying to be clever about what pages can be locked, 1013 * avoid the use of lock_page for direct compaction 1014 * altogether. 1015 */ 1016 if (current->flags & PF_MEMALLOC) 1017 goto out; 1018 1019 lock_page(page); 1020 } 1021 1022 if (PageWriteback(page)) { 1023 /* 1024 * Only in the case of a full synchronous migration is it 1025 * necessary to wait for PageWriteback. In the async case, 1026 * the retry loop is too short and in the sync-light case, 1027 * the overhead of stalling is too much 1028 */ 1029 switch (mode) { 1030 case MIGRATE_SYNC: 1031 case MIGRATE_SYNC_NO_COPY: 1032 break; 1033 default: 1034 rc = -EBUSY; 1035 goto out_unlock; 1036 } 1037 if (!force) 1038 goto out_unlock; 1039 wait_on_page_writeback(page); 1040 } 1041 1042 /* 1043 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 1044 * we cannot notice that anon_vma is freed while we migrates a page. 1045 * This get_anon_vma() delays freeing anon_vma pointer until the end 1046 * of migration. File cache pages are no problem because of page_lock() 1047 * File Caches may use write_page() or lock_page() in migration, then, 1048 * just care Anon page here. 1049 * 1050 * Only page_get_anon_vma() understands the subtleties of 1051 * getting a hold on an anon_vma from outside one of its mms. 1052 * But if we cannot get anon_vma, then we won't need it anyway, 1053 * because that implies that the anon page is no longer mapped 1054 * (and cannot be remapped so long as we hold the page lock). 1055 */ 1056 if (PageAnon(page) && !PageKsm(page)) 1057 anon_vma = page_get_anon_vma(page); 1058 1059 /* 1060 * Block others from accessing the new page when we get around to 1061 * establishing additional references. We are usually the only one 1062 * holding a reference to newpage at this point. We used to have a BUG 1063 * here if trylock_page(newpage) fails, but would like to allow for 1064 * cases where there might be a race with the previous use of newpage. 1065 * This is much like races on refcount of oldpage: just don't BUG(). 1066 */ 1067 if (unlikely(!trylock_page(newpage))) 1068 goto out_unlock; 1069 1070 if (unlikely(!is_lru)) { 1071 rc = move_to_new_page(newpage, page, mode); 1072 goto out_unlock_both; 1073 } 1074 1075 /* 1076 * Corner case handling: 1077 * 1. When a new swap-cache page is read into, it is added to the LRU 1078 * and treated as swapcache but it has no rmap yet. 1079 * Calling try_to_unmap() against a page->mapping==NULL page will 1080 * trigger a BUG. So handle it here. 1081 * 2. An orphaned page (see truncate_complete_page) might have 1082 * fs-private metadata. The page can be picked up due to memory 1083 * offlining. Everywhere else except page reclaim, the page is 1084 * invisible to the vm, so the page can not be migrated. So try to 1085 * free the metadata, so the page can be freed. 1086 */ 1087 if (!page->mapping) { 1088 VM_BUG_ON_PAGE(PageAnon(page), page); 1089 if (page_has_private(page)) { 1090 try_to_free_buffers(page); 1091 goto out_unlock_both; 1092 } 1093 } else if (page_mapped(page)) { 1094 /* Establish migration ptes */ 1095 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, 1096 page); 1097 try_to_unmap(page, 1098 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1099 page_was_mapped = 1; 1100 } 1101 1102 if (!page_mapped(page)) 1103 rc = move_to_new_page(newpage, page, mode); 1104 1105 if (page_was_mapped) 1106 remove_migration_ptes(page, 1107 rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); 1108 1109 out_unlock_both: 1110 unlock_page(newpage); 1111 out_unlock: 1112 /* Drop an anon_vma reference if we took one */ 1113 if (anon_vma) 1114 put_anon_vma(anon_vma); 1115 unlock_page(page); 1116 out: 1117 /* 1118 * If migration is successful, decrease refcount of the newpage 1119 * which will not free the page because new page owner increased 1120 * refcounter. As well, if it is LRU page, add the page to LRU 1121 * list in here. 1122 */ 1123 if (rc == MIGRATEPAGE_SUCCESS) { 1124 if (unlikely(__PageMovable(newpage))) 1125 put_page(newpage); 1126 else 1127 putback_lru_page(newpage); 1128 } 1129 1130 return rc; 1131 } 1132 1133 /* 1134 * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work 1135 * around it. 1136 */ 1137 #if defined(CONFIG_ARM) && \ 1138 defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 1139 #define ICE_noinline noinline 1140 #else 1141 #define ICE_noinline 1142 #endif 1143 1144 /* 1145 * Obtain the lock on page, remove all ptes and migrate the page 1146 * to the newly allocated page in newpage. 1147 */ 1148 static ICE_noinline int unmap_and_move(new_page_t get_new_page, 1149 free_page_t put_new_page, 1150 unsigned long private, struct page *page, 1151 int force, enum migrate_mode mode, 1152 enum migrate_reason reason) 1153 { 1154 int rc = MIGRATEPAGE_SUCCESS; 1155 struct page *newpage; 1156 1157 if (!thp_migration_supported() && PageTransHuge(page)) 1158 return -ENOMEM; 1159 1160 newpage = get_new_page(page, private); 1161 if (!newpage) 1162 return -ENOMEM; 1163 1164 if (page_count(page) == 1) { 1165 /* page was freed from under us. So we are done. */ 1166 ClearPageActive(page); 1167 ClearPageUnevictable(page); 1168 if (unlikely(__PageMovable(page))) { 1169 lock_page(page); 1170 if (!PageMovable(page)) 1171 __ClearPageIsolated(page); 1172 unlock_page(page); 1173 } 1174 if (put_new_page) 1175 put_new_page(newpage, private); 1176 else 1177 put_page(newpage); 1178 goto out; 1179 } 1180 1181 rc = __unmap_and_move(page, newpage, force, mode); 1182 if (rc == MIGRATEPAGE_SUCCESS) 1183 set_page_owner_migrate_reason(newpage, reason); 1184 1185 out: 1186 if (rc != -EAGAIN) { 1187 /* 1188 * A page that has been migrated has all references 1189 * removed and will be freed. A page that has not been 1190 * migrated will have kepts its references and be 1191 * restored. 1192 */ 1193 list_del(&page->lru); 1194 1195 /* 1196 * Compaction can migrate also non-LRU pages which are 1197 * not accounted to NR_ISOLATED_*. They can be recognized 1198 * as __PageMovable 1199 */ 1200 if (likely(!__PageMovable(page))) 1201 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 1202 page_is_file_cache(page), -hpage_nr_pages(page)); 1203 } 1204 1205 /* 1206 * If migration is successful, releases reference grabbed during 1207 * isolation. Otherwise, restore the page to right list unless 1208 * we want to retry. 1209 */ 1210 if (rc == MIGRATEPAGE_SUCCESS) { 1211 put_page(page); 1212 if (reason == MR_MEMORY_FAILURE) { 1213 /* 1214 * Set PG_HWPoison on just freed page 1215 * intentionally. Although it's rather weird, 1216 * it's how HWPoison flag works at the moment. 1217 */ 1218 if (set_hwpoison_free_buddy_page(page)) 1219 num_poisoned_pages_inc(); 1220 } 1221 } else { 1222 if (rc != -EAGAIN) { 1223 if (likely(!__PageMovable(page))) { 1224 putback_lru_page(page); 1225 goto put_new; 1226 } 1227 1228 lock_page(page); 1229 if (PageMovable(page)) 1230 putback_movable_page(page); 1231 else 1232 __ClearPageIsolated(page); 1233 unlock_page(page); 1234 put_page(page); 1235 } 1236 put_new: 1237 if (put_new_page) 1238 put_new_page(newpage, private); 1239 else 1240 put_page(newpage); 1241 } 1242 1243 return rc; 1244 } 1245 1246 /* 1247 * Counterpart of unmap_and_move_page() for hugepage migration. 1248 * 1249 * This function doesn't wait the completion of hugepage I/O 1250 * because there is no race between I/O and migration for hugepage. 1251 * Note that currently hugepage I/O occurs only in direct I/O 1252 * where no lock is held and PG_writeback is irrelevant, 1253 * and writeback status of all subpages are counted in the reference 1254 * count of the head page (i.e. if all subpages of a 2MB hugepage are 1255 * under direct I/O, the reference of the head page is 512 and a bit more.) 1256 * This means that when we try to migrate hugepage whose subpages are 1257 * doing direct I/O, some references remain after try_to_unmap() and 1258 * hugepage migration fails without data corruption. 1259 * 1260 * There is also no race when direct I/O is issued on the page under migration, 1261 * because then pte is replaced with migration swap entry and direct I/O code 1262 * will wait in the page fault for migration to complete. 1263 */ 1264 static int unmap_and_move_huge_page(new_page_t get_new_page, 1265 free_page_t put_new_page, unsigned long private, 1266 struct page *hpage, int force, 1267 enum migrate_mode mode, int reason) 1268 { 1269 int rc = -EAGAIN; 1270 int page_was_mapped = 0; 1271 struct page *new_hpage; 1272 struct anon_vma *anon_vma = NULL; 1273 1274 /* 1275 * Movability of hugepages depends on architectures and hugepage size. 1276 * This check is necessary because some callers of hugepage migration 1277 * like soft offline and memory hotremove don't walk through page 1278 * tables or check whether the hugepage is pmd-based or not before 1279 * kicking migration. 1280 */ 1281 if (!hugepage_migration_supported(page_hstate(hpage))) { 1282 putback_active_hugepage(hpage); 1283 return -ENOSYS; 1284 } 1285 1286 new_hpage = get_new_page(hpage, private); 1287 if (!new_hpage) 1288 return -ENOMEM; 1289 1290 if (!trylock_page(hpage)) { 1291 if (!force) 1292 goto out; 1293 switch (mode) { 1294 case MIGRATE_SYNC: 1295 case MIGRATE_SYNC_NO_COPY: 1296 break; 1297 default: 1298 goto out; 1299 } 1300 lock_page(hpage); 1301 } 1302 1303 if (PageAnon(hpage)) 1304 anon_vma = page_get_anon_vma(hpage); 1305 1306 if (unlikely(!trylock_page(new_hpage))) 1307 goto put_anon; 1308 1309 if (page_mapped(hpage)) { 1310 try_to_unmap(hpage, 1311 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1312 page_was_mapped = 1; 1313 } 1314 1315 if (!page_mapped(hpage)) 1316 rc = move_to_new_page(new_hpage, hpage, mode); 1317 1318 if (page_was_mapped) 1319 remove_migration_ptes(hpage, 1320 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); 1321 1322 unlock_page(new_hpage); 1323 1324 put_anon: 1325 if (anon_vma) 1326 put_anon_vma(anon_vma); 1327 1328 if (rc == MIGRATEPAGE_SUCCESS) { 1329 move_hugetlb_state(hpage, new_hpage, reason); 1330 put_new_page = NULL; 1331 } 1332 1333 unlock_page(hpage); 1334 out: 1335 if (rc != -EAGAIN) 1336 putback_active_hugepage(hpage); 1337 1338 /* 1339 * If migration was not successful and there's a freeing callback, use 1340 * it. Otherwise, put_page() will drop the reference grabbed during 1341 * isolation. 1342 */ 1343 if (put_new_page) 1344 put_new_page(new_hpage, private); 1345 else 1346 putback_active_hugepage(new_hpage); 1347 1348 return rc; 1349 } 1350 1351 /* 1352 * migrate_pages - migrate the pages specified in a list, to the free pages 1353 * supplied as the target for the page migration 1354 * 1355 * @from: The list of pages to be migrated. 1356 * @get_new_page: The function used to allocate free pages to be used 1357 * as the target of the page migration. 1358 * @put_new_page: The function used to free target pages if migration 1359 * fails, or NULL if no special handling is necessary. 1360 * @private: Private data to be passed on to get_new_page() 1361 * @mode: The migration mode that specifies the constraints for 1362 * page migration, if any. 1363 * @reason: The reason for page migration. 1364 * 1365 * The function returns after 10 attempts or if no pages are movable any more 1366 * because the list has become empty or no retryable pages exist any more. 1367 * The caller should call putback_movable_pages() to return pages to the LRU 1368 * or free list only if ret != 0. 1369 * 1370 * Returns the number of pages that were not migrated, or an error code. 1371 */ 1372 int migrate_pages(struct list_head *from, new_page_t get_new_page, 1373 free_page_t put_new_page, unsigned long private, 1374 enum migrate_mode mode, int reason) 1375 { 1376 int retry = 1; 1377 int nr_failed = 0; 1378 int nr_succeeded = 0; 1379 int pass = 0; 1380 struct page *page; 1381 struct page *page2; 1382 int swapwrite = current->flags & PF_SWAPWRITE; 1383 int rc; 1384 1385 if (!swapwrite) 1386 current->flags |= PF_SWAPWRITE; 1387 1388 for(pass = 0; pass < 10 && retry; pass++) { 1389 retry = 0; 1390 1391 list_for_each_entry_safe(page, page2, from, lru) { 1392 retry: 1393 cond_resched(); 1394 1395 if (PageHuge(page)) 1396 rc = unmap_and_move_huge_page(get_new_page, 1397 put_new_page, private, page, 1398 pass > 2, mode, reason); 1399 else 1400 rc = unmap_and_move(get_new_page, put_new_page, 1401 private, page, pass > 2, mode, 1402 reason); 1403 1404 switch(rc) { 1405 case -ENOMEM: 1406 /* 1407 * THP migration might be unsupported or the 1408 * allocation could've failed so we should 1409 * retry on the same page with the THP split 1410 * to base pages. 1411 * 1412 * Head page is retried immediately and tail 1413 * pages are added to the tail of the list so 1414 * we encounter them after the rest of the list 1415 * is processed. 1416 */ 1417 if (PageTransHuge(page) && !PageHuge(page)) { 1418 lock_page(page); 1419 rc = split_huge_page_to_list(page, from); 1420 unlock_page(page); 1421 if (!rc) { 1422 list_safe_reset_next(page, page2, lru); 1423 goto retry; 1424 } 1425 } 1426 nr_failed++; 1427 goto out; 1428 case -EAGAIN: 1429 retry++; 1430 break; 1431 case MIGRATEPAGE_SUCCESS: 1432 nr_succeeded++; 1433 break; 1434 default: 1435 /* 1436 * Permanent failure (-EBUSY, -ENOSYS, etc.): 1437 * unlike -EAGAIN case, the failed page is 1438 * removed from migration page list and not 1439 * retried in the next outer loop. 1440 */ 1441 nr_failed++; 1442 break; 1443 } 1444 } 1445 } 1446 nr_failed += retry; 1447 rc = nr_failed; 1448 out: 1449 if (nr_succeeded) 1450 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); 1451 if (nr_failed) 1452 count_vm_events(PGMIGRATE_FAIL, nr_failed); 1453 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); 1454 1455 if (!swapwrite) 1456 current->flags &= ~PF_SWAPWRITE; 1457 1458 return rc; 1459 } 1460 1461 #ifdef CONFIG_NUMA 1462 1463 static int store_status(int __user *status, int start, int value, int nr) 1464 { 1465 while (nr-- > 0) { 1466 if (put_user(value, status + start)) 1467 return -EFAULT; 1468 start++; 1469 } 1470 1471 return 0; 1472 } 1473 1474 static int do_move_pages_to_node(struct mm_struct *mm, 1475 struct list_head *pagelist, int node) 1476 { 1477 int err; 1478 1479 if (list_empty(pagelist)) 1480 return 0; 1481 1482 err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, 1483 MIGRATE_SYNC, MR_SYSCALL); 1484 if (err) 1485 putback_movable_pages(pagelist); 1486 return err; 1487 } 1488 1489 /* 1490 * Resolves the given address to a struct page, isolates it from the LRU and 1491 * puts it to the given pagelist. 1492 * Returns -errno if the page cannot be found/isolated or 0 when it has been 1493 * queued or the page doesn't need to be migrated because it is already on 1494 * the target node 1495 */ 1496 static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, 1497 int node, struct list_head *pagelist, bool migrate_all) 1498 { 1499 struct vm_area_struct *vma; 1500 struct page *page; 1501 unsigned int follflags; 1502 int err; 1503 1504 down_read(&mm->mmap_sem); 1505 err = -EFAULT; 1506 vma = find_vma(mm, addr); 1507 if (!vma || addr < vma->vm_start || !vma_migratable(vma)) 1508 goto out; 1509 1510 /* FOLL_DUMP to ignore special (like zero) pages */ 1511 follflags = FOLL_GET | FOLL_DUMP; 1512 page = follow_page(vma, addr, follflags); 1513 1514 err = PTR_ERR(page); 1515 if (IS_ERR(page)) 1516 goto out; 1517 1518 err = -ENOENT; 1519 if (!page) 1520 goto out; 1521 1522 err = 0; 1523 if (page_to_nid(page) == node) 1524 goto out_putpage; 1525 1526 err = -EACCES; 1527 if (page_mapcount(page) > 1 && !migrate_all) 1528 goto out_putpage; 1529 1530 if (PageHuge(page)) { 1531 if (PageHead(page)) { 1532 isolate_huge_page(page, pagelist); 1533 err = 0; 1534 } 1535 } else { 1536 struct page *head; 1537 1538 head = compound_head(page); 1539 err = isolate_lru_page(head); 1540 if (err) 1541 goto out_putpage; 1542 1543 err = 0; 1544 list_add_tail(&head->lru, pagelist); 1545 mod_node_page_state(page_pgdat(head), 1546 NR_ISOLATED_ANON + page_is_file_cache(head), 1547 hpage_nr_pages(head)); 1548 } 1549 out_putpage: 1550 /* 1551 * Either remove the duplicate refcount from 1552 * isolate_lru_page() or drop the page ref if it was 1553 * not isolated. 1554 */ 1555 put_page(page); 1556 out: 1557 up_read(&mm->mmap_sem); 1558 return err; 1559 } 1560 1561 /* 1562 * Migrate an array of page address onto an array of nodes and fill 1563 * the corresponding array of status. 1564 */ 1565 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, 1566 unsigned long nr_pages, 1567 const void __user * __user *pages, 1568 const int __user *nodes, 1569 int __user *status, int flags) 1570 { 1571 int current_node = NUMA_NO_NODE; 1572 LIST_HEAD(pagelist); 1573 int start, i; 1574 int err = 0, err1; 1575 1576 migrate_prep(); 1577 1578 for (i = start = 0; i < nr_pages; i++) { 1579 const void __user *p; 1580 unsigned long addr; 1581 int node; 1582 1583 err = -EFAULT; 1584 if (get_user(p, pages + i)) 1585 goto out_flush; 1586 if (get_user(node, nodes + i)) 1587 goto out_flush; 1588 addr = (unsigned long)p; 1589 1590 err = -ENODEV; 1591 if (node < 0 || node >= MAX_NUMNODES) 1592 goto out_flush; 1593 if (!node_state(node, N_MEMORY)) 1594 goto out_flush; 1595 1596 err = -EACCES; 1597 if (!node_isset(node, task_nodes)) 1598 goto out_flush; 1599 1600 if (current_node == NUMA_NO_NODE) { 1601 current_node = node; 1602 start = i; 1603 } else if (node != current_node) { 1604 err = do_move_pages_to_node(mm, &pagelist, current_node); 1605 if (err) 1606 goto out; 1607 err = store_status(status, start, current_node, i - start); 1608 if (err) 1609 goto out; 1610 start = i; 1611 current_node = node; 1612 } 1613 1614 /* 1615 * Errors in the page lookup or isolation are not fatal and we simply 1616 * report them via status 1617 */ 1618 err = add_page_for_migration(mm, addr, current_node, 1619 &pagelist, flags & MPOL_MF_MOVE_ALL); 1620 if (!err) 1621 continue; 1622 1623 err = store_status(status, i, err, 1); 1624 if (err) 1625 goto out_flush; 1626 1627 err = do_move_pages_to_node(mm, &pagelist, current_node); 1628 if (err) 1629 goto out; 1630 if (i > start) { 1631 err = store_status(status, start, current_node, i - start); 1632 if (err) 1633 goto out; 1634 } 1635 current_node = NUMA_NO_NODE; 1636 } 1637 out_flush: 1638 if (list_empty(&pagelist)) 1639 return err; 1640 1641 /* Make sure we do not overwrite the existing error */ 1642 err1 = do_move_pages_to_node(mm, &pagelist, current_node); 1643 if (!err1) 1644 err1 = store_status(status, start, current_node, i - start); 1645 if (!err) 1646 err = err1; 1647 out: 1648 return err; 1649 } 1650 1651 /* 1652 * Determine the nodes of an array of pages and store it in an array of status. 1653 */ 1654 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, 1655 const void __user **pages, int *status) 1656 { 1657 unsigned long i; 1658 1659 down_read(&mm->mmap_sem); 1660 1661 for (i = 0; i < nr_pages; i++) { 1662 unsigned long addr = (unsigned long)(*pages); 1663 struct vm_area_struct *vma; 1664 struct page *page; 1665 int err = -EFAULT; 1666 1667 vma = find_vma(mm, addr); 1668 if (!vma || addr < vma->vm_start) 1669 goto set_status; 1670 1671 /* FOLL_DUMP to ignore special (like zero) pages */ 1672 page = follow_page(vma, addr, FOLL_DUMP); 1673 1674 err = PTR_ERR(page); 1675 if (IS_ERR(page)) 1676 goto set_status; 1677 1678 err = page ? page_to_nid(page) : -ENOENT; 1679 set_status: 1680 *status = err; 1681 1682 pages++; 1683 status++; 1684 } 1685 1686 up_read(&mm->mmap_sem); 1687 } 1688 1689 /* 1690 * Determine the nodes of a user array of pages and store it in 1691 * a user array of status. 1692 */ 1693 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, 1694 const void __user * __user *pages, 1695 int __user *status) 1696 { 1697 #define DO_PAGES_STAT_CHUNK_NR 16 1698 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1699 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1700 1701 while (nr_pages) { 1702 unsigned long chunk_nr; 1703 1704 chunk_nr = nr_pages; 1705 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) 1706 chunk_nr = DO_PAGES_STAT_CHUNK_NR; 1707 1708 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) 1709 break; 1710 1711 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1712 1713 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) 1714 break; 1715 1716 pages += chunk_nr; 1717 status += chunk_nr; 1718 nr_pages -= chunk_nr; 1719 } 1720 return nr_pages ? -EFAULT : 0; 1721 } 1722 1723 /* 1724 * Move a list of pages in the address space of the currently executing 1725 * process. 1726 */ 1727 static int kernel_move_pages(pid_t pid, unsigned long nr_pages, 1728 const void __user * __user *pages, 1729 const int __user *nodes, 1730 int __user *status, int flags) 1731 { 1732 struct task_struct *task; 1733 struct mm_struct *mm; 1734 int err; 1735 nodemask_t task_nodes; 1736 1737 /* Check flags */ 1738 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1739 return -EINVAL; 1740 1741 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1742 return -EPERM; 1743 1744 /* Find the mm_struct */ 1745 rcu_read_lock(); 1746 task = pid ? find_task_by_vpid(pid) : current; 1747 if (!task) { 1748 rcu_read_unlock(); 1749 return -ESRCH; 1750 } 1751 get_task_struct(task); 1752 1753 /* 1754 * Check if this process has the right to modify the specified 1755 * process. Use the regular "ptrace_may_access()" checks. 1756 */ 1757 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1758 rcu_read_unlock(); 1759 err = -EPERM; 1760 goto out; 1761 } 1762 rcu_read_unlock(); 1763 1764 err = security_task_movememory(task); 1765 if (err) 1766 goto out; 1767 1768 task_nodes = cpuset_mems_allowed(task); 1769 mm = get_task_mm(task); 1770 put_task_struct(task); 1771 1772 if (!mm) 1773 return -EINVAL; 1774 1775 if (nodes) 1776 err = do_pages_move(mm, task_nodes, nr_pages, pages, 1777 nodes, status, flags); 1778 else 1779 err = do_pages_stat(mm, nr_pages, pages, status); 1780 1781 mmput(mm); 1782 return err; 1783 1784 out: 1785 put_task_struct(task); 1786 return err; 1787 } 1788 1789 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, 1790 const void __user * __user *, pages, 1791 const int __user *, nodes, 1792 int __user *, status, int, flags) 1793 { 1794 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 1795 } 1796 1797 #ifdef CONFIG_COMPAT 1798 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, 1799 compat_uptr_t __user *, pages32, 1800 const int __user *, nodes, 1801 int __user *, status, 1802 int, flags) 1803 { 1804 const void __user * __user *pages; 1805 int i; 1806 1807 pages = compat_alloc_user_space(nr_pages * sizeof(void *)); 1808 for (i = 0; i < nr_pages; i++) { 1809 compat_uptr_t p; 1810 1811 if (get_user(p, pages32 + i) || 1812 put_user(compat_ptr(p), pages + i)) 1813 return -EFAULT; 1814 } 1815 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 1816 } 1817 #endif /* CONFIG_COMPAT */ 1818 1819 #ifdef CONFIG_NUMA_BALANCING 1820 /* 1821 * Returns true if this is a safe migration target node for misplaced NUMA 1822 * pages. Currently it only checks the watermarks which crude 1823 */ 1824 static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1825 unsigned long nr_migrate_pages) 1826 { 1827 int z; 1828 1829 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1830 struct zone *zone = pgdat->node_zones + z; 1831 1832 if (!populated_zone(zone)) 1833 continue; 1834 1835 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1836 if (!zone_watermark_ok(zone, 0, 1837 high_wmark_pages(zone) + 1838 nr_migrate_pages, 1839 0, 0)) 1840 continue; 1841 return true; 1842 } 1843 return false; 1844 } 1845 1846 static struct page *alloc_misplaced_dst_page(struct page *page, 1847 unsigned long data) 1848 { 1849 int nid = (int) data; 1850 struct page *newpage; 1851 1852 newpage = __alloc_pages_node(nid, 1853 (GFP_HIGHUSER_MOVABLE | 1854 __GFP_THISNODE | __GFP_NOMEMALLOC | 1855 __GFP_NORETRY | __GFP_NOWARN) & 1856 ~__GFP_RECLAIM, 0); 1857 1858 return newpage; 1859 } 1860 1861 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1862 { 1863 int page_lru; 1864 1865 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); 1866 1867 /* Avoid migrating to a node that is nearly full */ 1868 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1869 return 0; 1870 1871 if (isolate_lru_page(page)) 1872 return 0; 1873 1874 /* 1875 * migrate_misplaced_transhuge_page() skips page migration's usual 1876 * check on page_count(), so we must do it here, now that the page 1877 * has been isolated: a GUP pin, or any other pin, prevents migration. 1878 * The expected page count is 3: 1 for page's mapcount and 1 for the 1879 * caller's pin and 1 for the reference taken by isolate_lru_page(). 1880 */ 1881 if (PageTransHuge(page) && page_count(page) != 3) { 1882 putback_lru_page(page); 1883 return 0; 1884 } 1885 1886 page_lru = page_is_file_cache(page); 1887 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, 1888 hpage_nr_pages(page)); 1889 1890 /* 1891 * Isolating the page has taken another reference, so the 1892 * caller's reference can be safely dropped without the page 1893 * disappearing underneath us during migration. 1894 */ 1895 put_page(page); 1896 return 1; 1897 } 1898 1899 bool pmd_trans_migrating(pmd_t pmd) 1900 { 1901 struct page *page = pmd_page(pmd); 1902 return PageLocked(page); 1903 } 1904 1905 /* 1906 * Attempt to migrate a misplaced page to the specified destination 1907 * node. Caller is expected to have an elevated reference count on 1908 * the page that will be dropped by this function before returning. 1909 */ 1910 int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 1911 int node) 1912 { 1913 pg_data_t *pgdat = NODE_DATA(node); 1914 int isolated; 1915 int nr_remaining; 1916 LIST_HEAD(migratepages); 1917 1918 /* 1919 * Don't migrate file pages that are mapped in multiple processes 1920 * with execute permissions as they are probably shared libraries. 1921 */ 1922 if (page_mapcount(page) != 1 && page_is_file_cache(page) && 1923 (vma->vm_flags & VM_EXEC)) 1924 goto out; 1925 1926 /* 1927 * Also do not migrate dirty pages as not all filesystems can move 1928 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 1929 */ 1930 if (page_is_file_cache(page) && PageDirty(page)) 1931 goto out; 1932 1933 isolated = numamigrate_isolate_page(pgdat, page); 1934 if (!isolated) 1935 goto out; 1936 1937 list_add(&page->lru, &migratepages); 1938 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1939 NULL, node, MIGRATE_ASYNC, 1940 MR_NUMA_MISPLACED); 1941 if (nr_remaining) { 1942 if (!list_empty(&migratepages)) { 1943 list_del(&page->lru); 1944 dec_node_page_state(page, NR_ISOLATED_ANON + 1945 page_is_file_cache(page)); 1946 putback_lru_page(page); 1947 } 1948 isolated = 0; 1949 } else 1950 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1951 BUG_ON(!list_empty(&migratepages)); 1952 return isolated; 1953 1954 out: 1955 put_page(page); 1956 return 0; 1957 } 1958 #endif /* CONFIG_NUMA_BALANCING */ 1959 1960 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1961 /* 1962 * Migrates a THP to a given target node. page must be locked and is unlocked 1963 * before returning. 1964 */ 1965 int migrate_misplaced_transhuge_page(struct mm_struct *mm, 1966 struct vm_area_struct *vma, 1967 pmd_t *pmd, pmd_t entry, 1968 unsigned long address, 1969 struct page *page, int node) 1970 { 1971 spinlock_t *ptl; 1972 pg_data_t *pgdat = NODE_DATA(node); 1973 int isolated = 0; 1974 struct page *new_page = NULL; 1975 int page_lru = page_is_file_cache(page); 1976 unsigned long mmun_start = address & HPAGE_PMD_MASK; 1977 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 1978 1979 new_page = alloc_pages_node(node, 1980 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 1981 HPAGE_PMD_ORDER); 1982 if (!new_page) 1983 goto out_fail; 1984 prep_transhuge_page(new_page); 1985 1986 isolated = numamigrate_isolate_page(pgdat, page); 1987 if (!isolated) { 1988 put_page(new_page); 1989 goto out_fail; 1990 } 1991 1992 /* Prepare a page as a migration target */ 1993 __SetPageLocked(new_page); 1994 if (PageSwapBacked(page)) 1995 __SetPageSwapBacked(new_page); 1996 1997 /* anon mapping, we can simply copy page->mapping to the new page: */ 1998 new_page->mapping = page->mapping; 1999 new_page->index = page->index; 2000 migrate_page_copy(new_page, page); 2001 WARN_ON(PageLRU(new_page)); 2002 2003 /* Recheck the target PMD */ 2004 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2005 ptl = pmd_lock(mm, pmd); 2006 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2007 spin_unlock(ptl); 2008 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2009 2010 /* Reverse changes made by migrate_page_copy() */ 2011 if (TestClearPageActive(new_page)) 2012 SetPageActive(page); 2013 if (TestClearPageUnevictable(new_page)) 2014 SetPageUnevictable(page); 2015 2016 unlock_page(new_page); 2017 put_page(new_page); /* Free it */ 2018 2019 /* Retake the callers reference and putback on LRU */ 2020 get_page(page); 2021 putback_lru_page(page); 2022 mod_node_page_state(page_pgdat(page), 2023 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 2024 2025 goto out_unlock; 2026 } 2027 2028 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 2029 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2030 2031 /* 2032 * Clear the old entry under pagetable lock and establish the new PTE. 2033 * Any parallel GUP will either observe the old page blocking on the 2034 * page lock, block on the page table lock or observe the new page. 2035 * The SetPageUptodate on the new page and page_add_new_anon_rmap 2036 * guarantee the copy is visible before the pagetable update. 2037 */ 2038 flush_cache_range(vma, mmun_start, mmun_end); 2039 page_add_anon_rmap(new_page, vma, mmun_start, true); 2040 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 2041 set_pmd_at(mm, mmun_start, pmd, entry); 2042 update_mmu_cache_pmd(vma, address, &entry); 2043 2044 page_ref_unfreeze(page, 2); 2045 mlock_migrate_page(new_page, page); 2046 page_remove_rmap(page, true); 2047 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2048 2049 spin_unlock(ptl); 2050 /* 2051 * No need to double call mmu_notifier->invalidate_range() callback as 2052 * the above pmdp_huge_clear_flush_notify() did already call it. 2053 */ 2054 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); 2055 2056 /* Take an "isolate" reference and put new page on the LRU. */ 2057 get_page(new_page); 2058 putback_lru_page(new_page); 2059 2060 unlock_page(new_page); 2061 unlock_page(page); 2062 put_page(page); /* Drop the rmap reference */ 2063 put_page(page); /* Drop the LRU isolation reference */ 2064 2065 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2066 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2067 2068 mod_node_page_state(page_pgdat(page), 2069 NR_ISOLATED_ANON + page_lru, 2070 -HPAGE_PMD_NR); 2071 return isolated; 2072 2073 out_fail: 2074 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2075 ptl = pmd_lock(mm, pmd); 2076 if (pmd_same(*pmd, entry)) { 2077 entry = pmd_modify(entry, vma->vm_page_prot); 2078 set_pmd_at(mm, mmun_start, pmd, entry); 2079 update_mmu_cache_pmd(vma, address, &entry); 2080 } 2081 spin_unlock(ptl); 2082 2083 out_unlock: 2084 unlock_page(page); 2085 put_page(page); 2086 return 0; 2087 } 2088 #endif /* CONFIG_NUMA_BALANCING */ 2089 2090 #endif /* CONFIG_NUMA */ 2091 2092 #if defined(CONFIG_MIGRATE_VMA_HELPER) 2093 struct migrate_vma { 2094 struct vm_area_struct *vma; 2095 unsigned long *dst; 2096 unsigned long *src; 2097 unsigned long cpages; 2098 unsigned long npages; 2099 unsigned long start; 2100 unsigned long end; 2101 }; 2102 2103 static int migrate_vma_collect_hole(unsigned long start, 2104 unsigned long end, 2105 struct mm_walk *walk) 2106 { 2107 struct migrate_vma *migrate = walk->private; 2108 unsigned long addr; 2109 2110 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2111 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 2112 migrate->dst[migrate->npages] = 0; 2113 migrate->npages++; 2114 migrate->cpages++; 2115 } 2116 2117 return 0; 2118 } 2119 2120 static int migrate_vma_collect_skip(unsigned long start, 2121 unsigned long end, 2122 struct mm_walk *walk) 2123 { 2124 struct migrate_vma *migrate = walk->private; 2125 unsigned long addr; 2126 2127 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 2128 migrate->dst[migrate->npages] = 0; 2129 migrate->src[migrate->npages++] = 0; 2130 } 2131 2132 return 0; 2133 } 2134 2135 static int migrate_vma_collect_pmd(pmd_t *pmdp, 2136 unsigned long start, 2137 unsigned long end, 2138 struct mm_walk *walk) 2139 { 2140 struct migrate_vma *migrate = walk->private; 2141 struct vm_area_struct *vma = walk->vma; 2142 struct mm_struct *mm = vma->vm_mm; 2143 unsigned long addr = start, unmapped = 0; 2144 spinlock_t *ptl; 2145 pte_t *ptep; 2146 2147 again: 2148 if (pmd_none(*pmdp)) 2149 return migrate_vma_collect_hole(start, end, walk); 2150 2151 if (pmd_trans_huge(*pmdp)) { 2152 struct page *page; 2153 2154 ptl = pmd_lock(mm, pmdp); 2155 if (unlikely(!pmd_trans_huge(*pmdp))) { 2156 spin_unlock(ptl); 2157 goto again; 2158 } 2159 2160 page = pmd_page(*pmdp); 2161 if (is_huge_zero_page(page)) { 2162 spin_unlock(ptl); 2163 split_huge_pmd(vma, pmdp, addr); 2164 if (pmd_trans_unstable(pmdp)) 2165 return migrate_vma_collect_skip(start, end, 2166 walk); 2167 } else { 2168 int ret; 2169 2170 get_page(page); 2171 spin_unlock(ptl); 2172 if (unlikely(!trylock_page(page))) 2173 return migrate_vma_collect_skip(start, end, 2174 walk); 2175 ret = split_huge_page(page); 2176 unlock_page(page); 2177 put_page(page); 2178 if (ret) 2179 return migrate_vma_collect_skip(start, end, 2180 walk); 2181 if (pmd_none(*pmdp)) 2182 return migrate_vma_collect_hole(start, end, 2183 walk); 2184 } 2185 } 2186 2187 if (unlikely(pmd_bad(*pmdp))) 2188 return migrate_vma_collect_skip(start, end, walk); 2189 2190 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2191 arch_enter_lazy_mmu_mode(); 2192 2193 for (; addr < end; addr += PAGE_SIZE, ptep++) { 2194 unsigned long mpfn, pfn; 2195 struct page *page; 2196 swp_entry_t entry; 2197 pte_t pte; 2198 2199 pte = *ptep; 2200 pfn = pte_pfn(pte); 2201 2202 if (pte_none(pte)) { 2203 mpfn = MIGRATE_PFN_MIGRATE; 2204 migrate->cpages++; 2205 pfn = 0; 2206 goto next; 2207 } 2208 2209 if (!pte_present(pte)) { 2210 mpfn = pfn = 0; 2211 2212 /* 2213 * Only care about unaddressable device page special 2214 * page table entry. Other special swap entries are not 2215 * migratable, and we ignore regular swapped page. 2216 */ 2217 entry = pte_to_swp_entry(pte); 2218 if (!is_device_private_entry(entry)) 2219 goto next; 2220 2221 page = device_private_entry_to_page(entry); 2222 mpfn = migrate_pfn(page_to_pfn(page))| 2223 MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; 2224 if (is_write_device_private_entry(entry)) 2225 mpfn |= MIGRATE_PFN_WRITE; 2226 } else { 2227 if (is_zero_pfn(pfn)) { 2228 mpfn = MIGRATE_PFN_MIGRATE; 2229 migrate->cpages++; 2230 pfn = 0; 2231 goto next; 2232 } 2233 page = _vm_normal_page(migrate->vma, addr, pte, true); 2234 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2235 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2236 } 2237 2238 /* FIXME support THP */ 2239 if (!page || !page->mapping || PageTransCompound(page)) { 2240 mpfn = pfn = 0; 2241 goto next; 2242 } 2243 pfn = page_to_pfn(page); 2244 2245 /* 2246 * By getting a reference on the page we pin it and that blocks 2247 * any kind of migration. Side effect is that it "freezes" the 2248 * pte. 2249 * 2250 * We drop this reference after isolating the page from the lru 2251 * for non device page (device page are not on the lru and thus 2252 * can't be dropped from it). 2253 */ 2254 get_page(page); 2255 migrate->cpages++; 2256 2257 /* 2258 * Optimize for the common case where page is only mapped once 2259 * in one process. If we can lock the page, then we can safely 2260 * set up a special migration page table entry now. 2261 */ 2262 if (trylock_page(page)) { 2263 pte_t swp_pte; 2264 2265 mpfn |= MIGRATE_PFN_LOCKED; 2266 ptep_get_and_clear(mm, addr, ptep); 2267 2268 /* Setup special migration page table entry */ 2269 entry = make_migration_entry(page, mpfn & 2270 MIGRATE_PFN_WRITE); 2271 swp_pte = swp_entry_to_pte(entry); 2272 if (pte_soft_dirty(pte)) 2273 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2274 set_pte_at(mm, addr, ptep, swp_pte); 2275 2276 /* 2277 * This is like regular unmap: we remove the rmap and 2278 * drop page refcount. Page won't be freed, as we took 2279 * a reference just above. 2280 */ 2281 page_remove_rmap(page, false); 2282 put_page(page); 2283 2284 if (pte_present(pte)) 2285 unmapped++; 2286 } 2287 2288 next: 2289 migrate->dst[migrate->npages] = 0; 2290 migrate->src[migrate->npages++] = mpfn; 2291 } 2292 arch_leave_lazy_mmu_mode(); 2293 pte_unmap_unlock(ptep - 1, ptl); 2294 2295 /* Only flush the TLB if we actually modified any entries */ 2296 if (unmapped) 2297 flush_tlb_range(walk->vma, start, end); 2298 2299 return 0; 2300 } 2301 2302 /* 2303 * migrate_vma_collect() - collect pages over a range of virtual addresses 2304 * @migrate: migrate struct containing all migration information 2305 * 2306 * This will walk the CPU page table. For each virtual address backed by a 2307 * valid page, it updates the src array and takes a reference on the page, in 2308 * order to pin the page until we lock it and unmap it. 2309 */ 2310 static void migrate_vma_collect(struct migrate_vma *migrate) 2311 { 2312 struct mm_walk mm_walk; 2313 2314 mm_walk.pmd_entry = migrate_vma_collect_pmd; 2315 mm_walk.pte_entry = NULL; 2316 mm_walk.pte_hole = migrate_vma_collect_hole; 2317 mm_walk.hugetlb_entry = NULL; 2318 mm_walk.test_walk = NULL; 2319 mm_walk.vma = migrate->vma; 2320 mm_walk.mm = migrate->vma->vm_mm; 2321 mm_walk.private = migrate; 2322 2323 mmu_notifier_invalidate_range_start(mm_walk.mm, 2324 migrate->start, 2325 migrate->end); 2326 walk_page_range(migrate->start, migrate->end, &mm_walk); 2327 mmu_notifier_invalidate_range_end(mm_walk.mm, 2328 migrate->start, 2329 migrate->end); 2330 2331 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2332 } 2333 2334 /* 2335 * migrate_vma_check_page() - check if page is pinned or not 2336 * @page: struct page to check 2337 * 2338 * Pinned pages cannot be migrated. This is the same test as in 2339 * migrate_page_move_mapping(), except that here we allow migration of a 2340 * ZONE_DEVICE page. 2341 */ 2342 static bool migrate_vma_check_page(struct page *page) 2343 { 2344 /* 2345 * One extra ref because caller holds an extra reference, either from 2346 * isolate_lru_page() for a regular page, or migrate_vma_collect() for 2347 * a device page. 2348 */ 2349 int extra = 1; 2350 2351 /* 2352 * FIXME support THP (transparent huge page), it is bit more complex to 2353 * check them than regular pages, because they can be mapped with a pmd 2354 * or with a pte (split pte mapping). 2355 */ 2356 if (PageCompound(page)) 2357 return false; 2358 2359 /* Page from ZONE_DEVICE have one extra reference */ 2360 if (is_zone_device_page(page)) { 2361 /* 2362 * Private page can never be pin as they have no valid pte and 2363 * GUP will fail for those. Yet if there is a pending migration 2364 * a thread might try to wait on the pte migration entry and 2365 * will bump the page reference count. Sadly there is no way to 2366 * differentiate a regular pin from migration wait. Hence to 2367 * avoid 2 racing thread trying to migrate back to CPU to enter 2368 * infinite loop (one stoping migration because the other is 2369 * waiting on pte migration entry). We always return true here. 2370 * 2371 * FIXME proper solution is to rework migration_entry_wait() so 2372 * it does not need to take a reference on page. 2373 */ 2374 if (is_device_private_page(page)) 2375 return true; 2376 2377 /* 2378 * Only allow device public page to be migrated and account for 2379 * the extra reference count imply by ZONE_DEVICE pages. 2380 */ 2381 if (!is_device_public_page(page)) 2382 return false; 2383 extra++; 2384 } 2385 2386 /* For file back page */ 2387 if (page_mapping(page)) 2388 extra += 1 + page_has_private(page); 2389 2390 if ((page_count(page) - extra) > page_mapcount(page)) 2391 return false; 2392 2393 return true; 2394 } 2395 2396 /* 2397 * migrate_vma_prepare() - lock pages and isolate them from the lru 2398 * @migrate: migrate struct containing all migration information 2399 * 2400 * This locks pages that have been collected by migrate_vma_collect(). Once each 2401 * page is locked it is isolated from the lru (for non-device pages). Finally, 2402 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be 2403 * migrated by concurrent kernel threads. 2404 */ 2405 static void migrate_vma_prepare(struct migrate_vma *migrate) 2406 { 2407 const unsigned long npages = migrate->npages; 2408 const unsigned long start = migrate->start; 2409 unsigned long addr, i, restore = 0; 2410 bool allow_drain = true; 2411 2412 lru_add_drain(); 2413 2414 for (i = 0; (i < npages) && migrate->cpages; i++) { 2415 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2416 bool remap = true; 2417 2418 if (!page) 2419 continue; 2420 2421 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { 2422 /* 2423 * Because we are migrating several pages there can be 2424 * a deadlock between 2 concurrent migration where each 2425 * are waiting on each other page lock. 2426 * 2427 * Make migrate_vma() a best effort thing and backoff 2428 * for any page we can not lock right away. 2429 */ 2430 if (!trylock_page(page)) { 2431 migrate->src[i] = 0; 2432 migrate->cpages--; 2433 put_page(page); 2434 continue; 2435 } 2436 remap = false; 2437 migrate->src[i] |= MIGRATE_PFN_LOCKED; 2438 } 2439 2440 /* ZONE_DEVICE pages are not on LRU */ 2441 if (!is_zone_device_page(page)) { 2442 if (!PageLRU(page) && allow_drain) { 2443 /* Drain CPU's pagevec */ 2444 lru_add_drain_all(); 2445 allow_drain = false; 2446 } 2447 2448 if (isolate_lru_page(page)) { 2449 if (remap) { 2450 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2451 migrate->cpages--; 2452 restore++; 2453 } else { 2454 migrate->src[i] = 0; 2455 unlock_page(page); 2456 migrate->cpages--; 2457 put_page(page); 2458 } 2459 continue; 2460 } 2461 2462 /* Drop the reference we took in collect */ 2463 put_page(page); 2464 } 2465 2466 if (!migrate_vma_check_page(page)) { 2467 if (remap) { 2468 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2469 migrate->cpages--; 2470 restore++; 2471 2472 if (!is_zone_device_page(page)) { 2473 get_page(page); 2474 putback_lru_page(page); 2475 } 2476 } else { 2477 migrate->src[i] = 0; 2478 unlock_page(page); 2479 migrate->cpages--; 2480 2481 if (!is_zone_device_page(page)) 2482 putback_lru_page(page); 2483 else 2484 put_page(page); 2485 } 2486 } 2487 } 2488 2489 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { 2490 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2491 2492 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2493 continue; 2494 2495 remove_migration_pte(page, migrate->vma, addr, page); 2496 2497 migrate->src[i] = 0; 2498 unlock_page(page); 2499 put_page(page); 2500 restore--; 2501 } 2502 } 2503 2504 /* 2505 * migrate_vma_unmap() - replace page mapping with special migration pte entry 2506 * @migrate: migrate struct containing all migration information 2507 * 2508 * Replace page mapping (CPU page table pte) with a special migration pte entry 2509 * and check again if it has been pinned. Pinned pages are restored because we 2510 * cannot migrate them. 2511 * 2512 * This is the last step before we call the device driver callback to allocate 2513 * destination memory and copy contents of original page over to new page. 2514 */ 2515 static void migrate_vma_unmap(struct migrate_vma *migrate) 2516 { 2517 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 2518 const unsigned long npages = migrate->npages; 2519 const unsigned long start = migrate->start; 2520 unsigned long addr, i, restore = 0; 2521 2522 for (i = 0; i < npages; i++) { 2523 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2524 2525 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2526 continue; 2527 2528 if (page_mapped(page)) { 2529 try_to_unmap(page, flags); 2530 if (page_mapped(page)) 2531 goto restore; 2532 } 2533 2534 if (migrate_vma_check_page(page)) 2535 continue; 2536 2537 restore: 2538 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2539 migrate->cpages--; 2540 restore++; 2541 } 2542 2543 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { 2544 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2545 2546 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2547 continue; 2548 2549 remove_migration_ptes(page, page, false); 2550 2551 migrate->src[i] = 0; 2552 unlock_page(page); 2553 restore--; 2554 2555 if (is_zone_device_page(page)) 2556 put_page(page); 2557 else 2558 putback_lru_page(page); 2559 } 2560 } 2561 2562 static void migrate_vma_insert_page(struct migrate_vma *migrate, 2563 unsigned long addr, 2564 struct page *page, 2565 unsigned long *src, 2566 unsigned long *dst) 2567 { 2568 struct vm_area_struct *vma = migrate->vma; 2569 struct mm_struct *mm = vma->vm_mm; 2570 struct mem_cgroup *memcg; 2571 bool flush = false; 2572 spinlock_t *ptl; 2573 pte_t entry; 2574 pgd_t *pgdp; 2575 p4d_t *p4dp; 2576 pud_t *pudp; 2577 pmd_t *pmdp; 2578 pte_t *ptep; 2579 2580 /* Only allow populating anonymous memory */ 2581 if (!vma_is_anonymous(vma)) 2582 goto abort; 2583 2584 pgdp = pgd_offset(mm, addr); 2585 p4dp = p4d_alloc(mm, pgdp, addr); 2586 if (!p4dp) 2587 goto abort; 2588 pudp = pud_alloc(mm, p4dp, addr); 2589 if (!pudp) 2590 goto abort; 2591 pmdp = pmd_alloc(mm, pudp, addr); 2592 if (!pmdp) 2593 goto abort; 2594 2595 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 2596 goto abort; 2597 2598 /* 2599 * Use pte_alloc() instead of pte_alloc_map(). We can't run 2600 * pte_offset_map() on pmds where a huge pmd might be created 2601 * from a different thread. 2602 * 2603 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when 2604 * parallel threads are excluded by other means. 2605 * 2606 * Here we only have down_read(mmap_sem). 2607 */ 2608 if (pte_alloc(mm, pmdp, addr)) 2609 goto abort; 2610 2611 /* See the comment in pte_alloc_one_map() */ 2612 if (unlikely(pmd_trans_unstable(pmdp))) 2613 goto abort; 2614 2615 if (unlikely(anon_vma_prepare(vma))) 2616 goto abort; 2617 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) 2618 goto abort; 2619 2620 /* 2621 * The memory barrier inside __SetPageUptodate makes sure that 2622 * preceding stores to the page contents become visible before 2623 * the set_pte_at() write. 2624 */ 2625 __SetPageUptodate(page); 2626 2627 if (is_zone_device_page(page)) { 2628 if (is_device_private_page(page)) { 2629 swp_entry_t swp_entry; 2630 2631 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); 2632 entry = swp_entry_to_pte(swp_entry); 2633 } else if (is_device_public_page(page)) { 2634 entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); 2635 if (vma->vm_flags & VM_WRITE) 2636 entry = pte_mkwrite(pte_mkdirty(entry)); 2637 entry = pte_mkdevmap(entry); 2638 } 2639 } else { 2640 entry = mk_pte(page, vma->vm_page_prot); 2641 if (vma->vm_flags & VM_WRITE) 2642 entry = pte_mkwrite(pte_mkdirty(entry)); 2643 } 2644 2645 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2646 2647 if (pte_present(*ptep)) { 2648 unsigned long pfn = pte_pfn(*ptep); 2649 2650 if (!is_zero_pfn(pfn)) { 2651 pte_unmap_unlock(ptep, ptl); 2652 mem_cgroup_cancel_charge(page, memcg, false); 2653 goto abort; 2654 } 2655 flush = true; 2656 } else if (!pte_none(*ptep)) { 2657 pte_unmap_unlock(ptep, ptl); 2658 mem_cgroup_cancel_charge(page, memcg, false); 2659 goto abort; 2660 } 2661 2662 /* 2663 * Check for usefaultfd but do not deliver the fault. Instead, 2664 * just back off. 2665 */ 2666 if (userfaultfd_missing(vma)) { 2667 pte_unmap_unlock(ptep, ptl); 2668 mem_cgroup_cancel_charge(page, memcg, false); 2669 goto abort; 2670 } 2671 2672 inc_mm_counter(mm, MM_ANONPAGES); 2673 page_add_new_anon_rmap(page, vma, addr, false); 2674 mem_cgroup_commit_charge(page, memcg, false, false); 2675 if (!is_zone_device_page(page)) 2676 lru_cache_add_active_or_unevictable(page, vma); 2677 get_page(page); 2678 2679 if (flush) { 2680 flush_cache_page(vma, addr, pte_pfn(*ptep)); 2681 ptep_clear_flush_notify(vma, addr, ptep); 2682 set_pte_at_notify(mm, addr, ptep, entry); 2683 update_mmu_cache(vma, addr, ptep); 2684 } else { 2685 /* No need to invalidate - it was non-present before */ 2686 set_pte_at(mm, addr, ptep, entry); 2687 update_mmu_cache(vma, addr, ptep); 2688 } 2689 2690 pte_unmap_unlock(ptep, ptl); 2691 *src = MIGRATE_PFN_MIGRATE; 2692 return; 2693 2694 abort: 2695 *src &= ~MIGRATE_PFN_MIGRATE; 2696 } 2697 2698 /* 2699 * migrate_vma_pages() - migrate meta-data from src page to dst page 2700 * @migrate: migrate struct containing all migration information 2701 * 2702 * This migrates struct page meta-data from source struct page to destination 2703 * struct page. This effectively finishes the migration from source page to the 2704 * destination page. 2705 */ 2706 static void migrate_vma_pages(struct migrate_vma *migrate) 2707 { 2708 const unsigned long npages = migrate->npages; 2709 const unsigned long start = migrate->start; 2710 struct vm_area_struct *vma = migrate->vma; 2711 struct mm_struct *mm = vma->vm_mm; 2712 unsigned long addr, i, mmu_start; 2713 bool notified = false; 2714 2715 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2716 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2717 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2718 struct address_space *mapping; 2719 int r; 2720 2721 if (!newpage) { 2722 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2723 continue; 2724 } 2725 2726 if (!page) { 2727 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { 2728 continue; 2729 } 2730 if (!notified) { 2731 mmu_start = addr; 2732 notified = true; 2733 mmu_notifier_invalidate_range_start(mm, 2734 mmu_start, 2735 migrate->end); 2736 } 2737 migrate_vma_insert_page(migrate, addr, newpage, 2738 &migrate->src[i], 2739 &migrate->dst[i]); 2740 continue; 2741 } 2742 2743 mapping = page_mapping(page); 2744 2745 if (is_zone_device_page(newpage)) { 2746 if (is_device_private_page(newpage)) { 2747 /* 2748 * For now only support private anonymous when 2749 * migrating to un-addressable device memory. 2750 */ 2751 if (mapping) { 2752 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2753 continue; 2754 } 2755 } else if (!is_device_public_page(newpage)) { 2756 /* 2757 * Other types of ZONE_DEVICE page are not 2758 * supported. 2759 */ 2760 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2761 continue; 2762 } 2763 } 2764 2765 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 2766 if (r != MIGRATEPAGE_SUCCESS) 2767 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2768 } 2769 2770 /* 2771 * No need to double call mmu_notifier->invalidate_range() callback as 2772 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 2773 * did already call it. 2774 */ 2775 if (notified) 2776 mmu_notifier_invalidate_range_only_end(mm, mmu_start, 2777 migrate->end); 2778 } 2779 2780 /* 2781 * migrate_vma_finalize() - restore CPU page table entry 2782 * @migrate: migrate struct containing all migration information 2783 * 2784 * This replaces the special migration pte entry with either a mapping to the 2785 * new page if migration was successful for that page, or to the original page 2786 * otherwise. 2787 * 2788 * This also unlocks the pages and puts them back on the lru, or drops the extra 2789 * refcount, for device pages. 2790 */ 2791 static void migrate_vma_finalize(struct migrate_vma *migrate) 2792 { 2793 const unsigned long npages = migrate->npages; 2794 unsigned long i; 2795 2796 for (i = 0; i < npages; i++) { 2797 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2798 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2799 2800 if (!page) { 2801 if (newpage) { 2802 unlock_page(newpage); 2803 put_page(newpage); 2804 } 2805 continue; 2806 } 2807 2808 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 2809 if (newpage) { 2810 unlock_page(newpage); 2811 put_page(newpage); 2812 } 2813 newpage = page; 2814 } 2815 2816 remove_migration_ptes(page, newpage, false); 2817 unlock_page(page); 2818 migrate->cpages--; 2819 2820 if (is_zone_device_page(page)) 2821 put_page(page); 2822 else 2823 putback_lru_page(page); 2824 2825 if (newpage != page) { 2826 unlock_page(newpage); 2827 if (is_zone_device_page(newpage)) 2828 put_page(newpage); 2829 else 2830 putback_lru_page(newpage); 2831 } 2832 } 2833 } 2834 2835 /* 2836 * migrate_vma() - migrate a range of memory inside vma 2837 * 2838 * @ops: migration callback for allocating destination memory and copying 2839 * @vma: virtual memory area containing the range to be migrated 2840 * @start: start address of the range to migrate (inclusive) 2841 * @end: end address of the range to migrate (exclusive) 2842 * @src: array of hmm_pfn_t containing source pfns 2843 * @dst: array of hmm_pfn_t containing destination pfns 2844 * @private: pointer passed back to each of the callback 2845 * Returns: 0 on success, error code otherwise 2846 * 2847 * This function tries to migrate a range of memory virtual address range, using 2848 * callbacks to allocate and copy memory from source to destination. First it 2849 * collects all the pages backing each virtual address in the range, saving this 2850 * inside the src array. Then it locks those pages and unmaps them. Once the pages 2851 * are locked and unmapped, it checks whether each page is pinned or not. Pages 2852 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) 2853 * in the corresponding src array entry. It then restores any pages that are 2854 * pinned, by remapping and unlocking those pages. 2855 * 2856 * At this point it calls the alloc_and_copy() callback. For documentation on 2857 * what is expected from that callback, see struct migrate_vma_ops comments in 2858 * include/linux/migrate.h 2859 * 2860 * After the alloc_and_copy() callback, this function goes over each entry in 2861 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 2862 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 2863 * then the function tries to migrate struct page information from the source 2864 * struct page to the destination struct page. If it fails to migrate the struct 2865 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src 2866 * array. 2867 * 2868 * At this point all successfully migrated pages have an entry in the src 2869 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 2870 * array entry with MIGRATE_PFN_VALID flag set. 2871 * 2872 * It then calls the finalize_and_map() callback. See comments for "struct 2873 * migrate_vma_ops", in include/linux/migrate.h for details about 2874 * finalize_and_map() behavior. 2875 * 2876 * After the finalize_and_map() callback, for successfully migrated pages, this 2877 * function updates the CPU page table to point to new pages, otherwise it 2878 * restores the CPU page table to point to the original source pages. 2879 * 2880 * Function returns 0 after the above steps, even if no pages were migrated 2881 * (The function only returns an error if any of the arguments are invalid.) 2882 * 2883 * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT 2884 * unsigned long entries. 2885 */ 2886 int migrate_vma(const struct migrate_vma_ops *ops, 2887 struct vm_area_struct *vma, 2888 unsigned long start, 2889 unsigned long end, 2890 unsigned long *src, 2891 unsigned long *dst, 2892 void *private) 2893 { 2894 struct migrate_vma migrate; 2895 2896 /* Sanity check the arguments */ 2897 start &= PAGE_MASK; 2898 end &= PAGE_MASK; 2899 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || 2900 vma_is_dax(vma)) 2901 return -EINVAL; 2902 if (start < vma->vm_start || start >= vma->vm_end) 2903 return -EINVAL; 2904 if (end <= vma->vm_start || end > vma->vm_end) 2905 return -EINVAL; 2906 if (!ops || !src || !dst || start >= end) 2907 return -EINVAL; 2908 2909 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); 2910 migrate.src = src; 2911 migrate.dst = dst; 2912 migrate.start = start; 2913 migrate.npages = 0; 2914 migrate.cpages = 0; 2915 migrate.end = end; 2916 migrate.vma = vma; 2917 2918 /* Collect, and try to unmap source pages */ 2919 migrate_vma_collect(&migrate); 2920 if (!migrate.cpages) 2921 return 0; 2922 2923 /* Lock and isolate page */ 2924 migrate_vma_prepare(&migrate); 2925 if (!migrate.cpages) 2926 return 0; 2927 2928 /* Unmap pages */ 2929 migrate_vma_unmap(&migrate); 2930 if (!migrate.cpages) 2931 return 0; 2932 2933 /* 2934 * At this point pages are locked and unmapped, and thus they have 2935 * stable content and can safely be copied to destination memory that 2936 * is allocated by the callback. 2937 * 2938 * Note that migration can fail in migrate_vma_struct_page() for each 2939 * individual page. 2940 */ 2941 ops->alloc_and_copy(vma, src, dst, start, end, private); 2942 2943 /* This does the real migration of struct page */ 2944 migrate_vma_pages(&migrate); 2945 2946 ops->finalize_and_map(vma, src, dst, start, end, private); 2947 2948 /* Unlock and remap pages */ 2949 migrate_vma_finalize(&migrate); 2950 2951 return 0; 2952 } 2953 EXPORT_SYMBOL(migrate_vma); 2954 #endif /* defined(MIGRATE_VMA_HELPER) */ 2955