/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>

#include "internal.h"

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
 * undesirable, use migrate_prep_local()
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/* Do the necessary work of migrate_prep but not if it involves other CPUs */
int migrate_prep_local(void)
{
	lru_add_drain();

	return 0;
}

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto unevictable list.
 */
void putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}
}
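
/*
 * A note on usage (informal sketch, mirroring do_move_page_to_node_array()
 * later in this file): callers first isolate pages onto a private list, hand
 * the list to migrate_pages(), and use the putback helpers to return whatever
 * could not be migrated, e.g.
 *
 *	LIST_HEAD(pagelist);
 *	migrate_prep();
 *	... isolate_lru_page() each page and list_add_tail() it ...
 *	err = migrate_pages(&pagelist, alloc_func, private, false,
 *			    MIGRATE_SYNC, MR_SYSCALL);
 *	if (err)
 *		putback_lru_pages(&pagelist);
 *
 * alloc_func here stands for whatever new_page_t allocator the caller
 * supplies (new_page_node() below is one example).
 */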

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used instead of putback_lru_pages(),
 * whenever the isolated pageset has been built by isolate_migratepages_range()
 */
void putback_movable_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		if (unlikely(balloon_page_movable(page)))
			balloon_page_putback(page);
		else
			putback_lru_page(page);
	}
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
				unsigned long addr, void *old)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (unlikely(PageHuge(new))) {
		ptep = huge_pte_offset(mm, addr);
		if (!ptep)
			goto out;
		ptl = &mm->page_table_lock;
	} else {
		pmd = mm_find_pmd(mm, addr);
		if (!pmd)
			goto out;
		if (pmd_trans_huge(*pmd))
			goto out;

		ptep = pte_offset_map(pmd, addr);

		/*
		 * Peek to check is_swap_pte() before taking ptlock?  No, we
		 * can race mremap's move_ptes(), which skips anon_vma lock.
		 */

		ptl = pte_lockptr(mm, pmd);
	}

	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto unlock;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) ||
	    migration_entry_to_page(entry) != old)
		goto unlock;

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
#ifdef CONFIG_HUGETLB_PAGE
	if (PageHuge(new)) {
		pte = pte_mkhuge(pte);
		pte = arch_make_huge_pte(pte, vma, new, 0);
	}
#endif
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageHuge(new)) {
		if (PageAnon(new))
			hugepage_add_anon_rmap(new, vma, addr);
		else
			page_dup_rmap(new);
	} else if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, ptep);
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return SWAP_AGAIN;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	rmap_walk(new, remove_migration_pte, old);
}
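
/*
 * Informal overview of the round trip handled above and below: while a page
 * is being migrated, try_to_unmap(..., TTU_MIGRATION) replaces its ptes with
 * migration swap entries that point back at the old page.  Any thread that
 * faults on such a pte ends up in migration_entry_wait() below and sleeps on
 * the old page's lock until migration finishes.  remove_migration_ptes()
 * above then walks the rmap and rewrites every remaining migration entry to
 * a present pte for whichever page survived (the new page on success, the
 * old one if migration had to be undone).
 */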

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once radix-tree replacement of page migration started, page_count
	 * *must* be zero. And, we don't want to call wait_on_page_locked()
	 * against a page without get_page().
	 * So, we use get_page_unless_zero(), here. Even if it fails, the
	 * page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	struct buffer_head *bh = head;

	/* Simple case, sync compaction */
	if (mode != MIGRATE_ASYNC) {
		do {
			get_bh(bh);
			lock_buffer(bh);
			bh = bh->b_this_page;

		} while (bh != head);

		return true;
	}

	/* async case, we cannot block on lock_buffer so use trylock_buffer */
	do {
		get_bh(bh);
		if (!trylock_buffer(bh)) {
			/*
			 * We failed to lock the buffer and cannot stall in
			 * async migration. Release the taken locks
			 */
			struct buffer_head *failed_bh = bh;
			put_bh(failed_bh);
			bh = head;
			while (bh != failed_bh) {
				unlock_buffer(bh);
				put_bh(bh);
				bh = bh->b_this_page;
			}
			return false;
		}

		bh = bh->b_this_page;
	} while (bh != head);
	return true;
}
#else
static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	return true;
}
#endif /* CONFIG_BLOCK */
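
/*
 * Reference accounting, informally: by the time migrate_page_move_mapping()
 * below runs, the old page is already unmapped, so the only references left
 * are the page cache (radix tree) reference, the reference held by the
 * migration caller that isolated the page, and possibly one more for buffer
 * heads / PagePrivate.  Hence expected_count = 2 + page_has_private(page);
 * e.g. a page cache page with buffers should sit at exactly 3, and any
 * mismatch (a GUP pin, a speculative lookup, a lost reference) makes the
 * move bail out with -EAGAIN so the caller can retry.
 */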

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page,
		struct buffer_head *head, enum migrate_mode mode)
{
	int expected_count = 0;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return MIGRATEPAGE_SUCCESS;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count ||
		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * In the async migration case of moving a page with buffers, lock the
	 * buffers using trylock before the mapping is moved. Otherwise, if
	 * the mapping were moved first and we then failed to lock the
	 * buffers, we could not move the mapping back due to an elevated
	 * page count and would have to block waiting on other references to
	 * be dropped.
	 */
	if (mode == MIGRATE_ASYNC && head &&
			!buffer_migrate_lock_buffers(head, mode)) {
		page_unfreeze_refs(page, expected_count);
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	radix_tree_replace_slot(pslot, newpage);

	/*
	 * Drop cache reference from old page by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	page_unfreeze_refs(page, expected_count - 1);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);
	if (!PageSwapCache(page) && PageSwapBacked(page)) {
		__dec_zone_page_state(page, NR_SHMEM);
		__inc_zone_page_state(newpage, NR_SHMEM);
	}
	spin_unlock_irq(&mapping->tree_lock);

	return MIGRATEPAGE_SUCCESS;
}

/*
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		if (page_count(page) != 1)
			return -EAGAIN;
		return MIGRATEPAGE_SUCCESS;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count ||
		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	get_page(newpage);

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count - 1);

	spin_unlock_irq(&mapping->tree_lock);
	return MIGRATEPAGE_SUCCESS;
}

/*
 * Copy the page to its new location
 */
void migrate_page_copy(struct page *newpage, struct page *page)
{
	if (PageHuge(page) || PageTransHuge(page))
		copy_huge_page(newpage, page);
	else
		copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else if (TestClearPageUnevictable(page))
		SetPageUnevictable(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty.
		 * Whereas only part of our page may be dirty.
		 */
		if (PageSwapBacked(page))
			SetPageDirty(newpage);
		else
			__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);
	ksm_migrate_page(newpage, page);

	ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	migrate_page_copy(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page, mode);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/*
	 * In the async case, migrate_page_move_mapping locked the buffers
	 * with an IRQ-safe spinlock held. In the sync case, the buffers
	 * need to be locked now
	 */
	if (mode != MIGRATE_ASYNC)
		BUG_ON(!buffer_migrate_lock_buffers(head, mode));

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
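
/*
 * A short note on enum migrate_mode as used throughout this file (see
 * include/linux/migrate_mode.h for the authoritative definitions):
 * MIGRATE_ASYNC never blocks - buffer locks are only trylocked and pages
 * under writeback are skipped; MIGRATE_SYNC_LIGHT ("sync-light") may block
 * on most locks but still does not wait for writeback to complete; and
 * MIGRATE_SYNC is fully synchronous and may even write dirty pages out via
 * writeout() below.  The mode checks scattered through the helpers above
 * and below all implement this policy.
 */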

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page, enum migrate_mode mode)
{
	if (PageDirty(page)) {
		/* Only writeback pages in full synchronous migration */
		if (mode != MIGRATE_SYNC)
			return -EBUSY;
		return writeout(mapping, page);
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
				int remap_swapcache, enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page.*/
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page, mode);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems provide a
		 * migratepage callback. Anonymous pages are part of swap
		 * space which also has its own migratepage callback. This
		 * is the most common path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page, mode);
	else
		rc = fallback_migrate_page(mapping, newpage, page, mode);

	if (rc != MIGRATEPAGE_SUCCESS) {
		newpage->mapping = NULL;
	} else {
		if (remap_swapcache)
			remove_migration_ptes(page, newpage);
		page->mapping = NULL;
	}

	unlock_page(newpage);

	return rc;
}

static int __unmap_and_move(struct page *page, struct page *newpage,
			int force, bool offlining, enum migrate_mode mode)
{
	int rc = -EAGAIN;
	int remap_swapcache = 1;
	struct mem_cgroup *mem;
	struct anon_vma *anon_vma = NULL;

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readpages). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}

	/*
	 * Only memory hotplug's offline_pages() caller has locked out KSM,
	 * and can safely migrate a KSM page.  The other cases have skipped
	 * PageKsm along with PageReserved - but it is only now when we have
	 * the page lock that we can be certain it will not go KSM beneath us
	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
	 * its pagecount raised, but only here do we take the page lock which
	 * serializes that).
	 */
	if (PageKsm(page) && !offlining) {
		rc = -EBUSY;
		goto unlock;
	}

	/* charge against new page */
	mem_cgroup_prepare_migration(page, newpage, &mem);

	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		if (mode != MIGRATE_SYNC) {
			rc = -EBUSY;
			goto uncharge;
		}
		if (!force)
			goto uncharge;
		wait_on_page_writeback(page);
	}
	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that the anon_vma is freed while we migrate a page.
	 * This get_anon_vma() delays freeing the anon_vma pointer until the
	 * end of migration. File cache pages are no problem because of
	 * page_lock(): file caches may use writepage() or lock_page() during
	 * migration, so only anon pages need care here.
	 */
	if (PageAnon(page)) {
		/*
		 * Only page_lock_anon_vma_read() understands the subtleties of
		 * getting a hold on an anon_vma from outside one of its mms.
		 */
		anon_vma = page_get_anon_vma(page);
		if (anon_vma) {
			/*
			 * Anon page
			 */
		} else if (PageSwapCache(page)) {
			/*
			 * We cannot be sure that the anon_vma of an unmapped
			 * swapcache page is safe to use because we don't
			 * know in advance if the VMA that this page belonged
			 * to still exists. If the VMA and others sharing the
			 * data have been freed, then the anon_vma could
			 * already be invalid.
			 *
			 * To avoid this possibility, swapcache pages get
			 * migrated but are not remapped when migration
			 * completes
			 */
			remap_swapcache = 0;
		} else {
			goto uncharge;
		}
	}

	if (unlikely(balloon_page_movable(page))) {
		/*
		 * A ballooned page does not need any special attention from
		 * physical to virtual reverse mapping procedures.
		 * Skip any attempt to unmap PTEs or to remap swap cache,
		 * in order to avoid burning cycles at rmap level, and perform
		 * the page migration right away (protected by page lock).
		 */
		rc = balloon_page_migrate(newpage, page, mode);
		goto uncharge;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON(PageAnon(page));
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto uncharge;
		}
		goto skip_unmap;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, remap_swapcache, mode);

	if (rc && remap_swapcache)
		remove_migration_ptes(page, page);

	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);

uncharge:
	mem_cgroup_end_migration(mem, page, newpage,
				 (rc == MIGRATEPAGE_SUCCESS ||
				  rc == MIGRATEPAGE_BALLOON_SUCCESS));
unlock:
	unlock_page(page);
out:
	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force, bool offlining,
			enum migrate_mode mode)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto out;
	}

	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page)))
			goto out;

	rc = __unmap_and_move(page, newpage, force, offlining, mode);

	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
		/*
		 * A ballooned page has been migrated already.
		 * Now, it's the time to wrap up counters,
		 * hand the page back to Buddy and return.
		 */
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				    page_is_file_cache(page));
		balloon_page_free(page);
		return MIGRATEPAGE_SUCCESS;
	}
out:
	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}
	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);
	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}
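
/*
 * Note on huge pages (informal): transparent huge pages reaching
 * unmap_and_move() above are simply split by split_huge_page() and migrated
 * as base pages, whereas hugetlbfs pages keep their size and go through
 * unmap_and_move_huge_page()/migrate_huge_page() below.
 */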

/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function doesn't wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and the writeback status of all subpages is counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate a hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then the pte is replaced with a migration swap entry and direct I/O
 * code will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				unsigned long private, struct page *hpage,
				int force, bool offlining,
				enum migrate_mode mode)
{
	int rc = 0;
	int *result = NULL;
	struct page *new_hpage = get_new_page(hpage, private, &result);
	struct anon_vma *anon_vma = NULL;

	if (!new_hpage)
		return -ENOMEM;

	rc = -EAGAIN;

	if (!trylock_page(hpage)) {
		if (!force || mode != MIGRATE_SYNC)
			goto out;
		lock_page(hpage);
	}

	if (PageAnon(hpage))
		anon_vma = page_get_anon_vma(hpage);

	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, 1, mode);

	if (rc)
		remove_migration_ptes(hpage, hpage);

	if (anon_vma)
		put_anon_vma(anon_vma);

	if (!rc)
		hugetlb_cgroup_migrate(hpage, new_hpage);

	unlock_page(hpage);
out:
	put_page(new_hpage);
	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(new_hpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a function
 * that determines from the page to be migrated and the private data
 * the target of the move and allocates the page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
 * or no retryable pages exist anymore.
 * Caller should call putback_lru_pages to return pages to the LRU
 * or free list only if ret != 0.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, bool offlining,
		enum migrate_mode mode, int reason)
{
	int retry = 1;
	int nr_failed = 0;
	int nr_succeeded = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for(pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2, offlining,
						mode);

			switch(rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case MIGRATEPAGE_SUCCESS:
				nr_succeeded++;
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = nr_failed + retry;
out:
	if (nr_succeeded)
		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	if (nr_failed)
		count_vm_events(PGMIGRATE_FAIL, nr_failed);
	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);

	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	return rc;
}

int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
		      unsigned long private, bool offlining,
		      enum migrate_mode mode)
{
	int pass, rc;

	for (pass = 0; pass < 10; pass++) {
		rc = unmap_and_move_huge_page(get_new_page,
					      private, hpage, pass > 2, offlining,
					      mode);
		switch (rc) {
		case -ENOMEM:
			goto out;
		case -EAGAIN:
			/* try again */
			cond_resched();
			break;
		case MIGRATEPAGE_SUCCESS:
			goto out;
		default:
			rc = -EIO;
			goto out;
		}
	}
out:
	return rc;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_exact_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
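
/*
 * A brief note on the allocation above (informal): new_page_node() scans the
 * private page_to_node array linearly until it finds the entry describing
 * page p, records where the per-page status should be written via *result,
 * and allocates the destination page with GFP_THISNODE so that the request
 * fails outright rather than falling back to another node; a full target
 * node therefore shows up as a migration failure instead of a silent
 * misplacement.
 */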

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				      struct page_to_node *pm,
				      int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		/* Use PageReserved to check for zero page */
		if (PageReserved(page) || PageKsm(page))
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err) {
			list_add_tail(&page->lru, &pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm, 0, MIGRATE_SYNC,
				MR_SYSCALL);
		if (err)
			putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return err;
}
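
/*
 * Sizing note for do_pages_move() below (informal, assuming a 64-bit build
 * where struct page_to_node is 24 bytes and PAGE_SIZE is 4096): one page
 * holds 4096 / 24 = 170 entries, and since the last slot is reserved for the
 * MAX_NUMNODES end marker, user requests are processed in chunks of 169
 * pages at a time.
 */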

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;

	migrate_prep();

	/*
	 * Store a chunk of page_to_node array in a page,
	 * but keep the last one as a marker
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user-space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (node < 0 || node >= MAX_NUMNODES)
				goto out_pm;

			if (!node_state(node, N_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}
	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}

/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page) || PageKsm(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}

/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr;

		chunk_nr = nr_pages;
		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
			chunk_nr = DO_PAGES_STAT_CHUNK_NR;

		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
			break;

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}
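
/*
 * Userspace view (sketch): the move_pages(2) system call defined below is
 * usually reached through the libnuma/numaif.h wrapper of the same name.
 * Something along these lines queries and then moves one page, assuming
 * node 1 exists and the caller's cpuset allows it:
 *
 *	void *pages[1] = { some_addr };	// some_addr: any mapped address
 *	int nodes[1]   = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *		;	// status[0] is now the node the page landed on,
 *			// or a negative errno for that page
 *
 * Passing nodes == NULL only reports the current node of each page via
 * do_pages_stat().
 */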

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;
	nodemask_t task_nodes;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(task);

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	tcred = __task_cred(task);
	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	task_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm)
		return -EINVAL;

	if (nodes)
		err = do_pages_move(mm, task_nodes, nr_pages, pages,
				    nodes, status, flags);
	else
		err = do_pages_stat(mm, nr_pages, pages, status);

	mmput(mm);
	return err;

out:
	put_task_struct(task);
	return err;
}

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks, which is a crude check.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				   int nr_migrate_pages)
{
	int z;
	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		if (zone->all_unreclaimable)
			continue;

		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
		if (!zone_watermark_ok(zone, 0,
				       high_wmark_pages(zone) +
				       nr_migrate_pages,
				       0, 0))
			continue;
		return true;
	}
	return false;
}

static struct page *alloc_misplaced_dst_page(struct page *page,
					   unsigned long data,
					   int **result)
{
	int nid = (int) data;
	struct page *newpage;

	newpage = alloc_pages_exact_node(nid,
					 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
					  __GFP_NOMEMALLOC | __GFP_NORETRY |
					  __GFP_NOWARN) &
					 ~GFP_IOFS, 0);
	if (newpage)
		page_xchg_last_nid(newpage, page_last_nid(page));

	return newpage;
}

/*
 * page migration rate limiting control.
 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
 * window of time. Default here says do not migrate more than 1280M per second.
 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
 * as it is faults that reset the window, pte updates will happen unconditionally
 * if there has not been a fault since @pteupdate_interval_millisecs after the
 * throttle window closed.
 */
static unsigned int migrate_interval_millisecs __read_mostly = 100;
static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);

/* Returns true if NUMA migration is currently rate limited */
bool migrate_ratelimited(int node)
{
	pg_data_t *pgdat = NODE_DATA(node);

	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
				msecs_to_jiffies(pteupdate_interval_millisecs)))
		return false;

	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
		return false;

	return true;
}
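
/*
 * Worked example of the defaults above (informal): ratelimit_pages is
 * 128 << (20 - PAGE_SHIFT) pages, i.e. 128MB worth of pages regardless of
 * page size, allowed per migrate_interval_millisecs = 100ms window.  That is
 * where the "1280M per second" figure in the comment above comes from:
 * 128MB * (1000ms / 100ms) = 1280MB/s per node.
 */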

/* Returns true if the node is migrate rate-limited after the update */
bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
{
	bool rate_limited = false;

	/*
	 * Rate-limit the amount of data that is being migrated to a node.
	 * Optimal placement is no good if the memory bus is saturated and
	 * all the time is being spent migrating!
	 */
	spin_lock(&pgdat->numabalancing_migrate_lock);
	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
		pgdat->numabalancing_migrate_nr_pages = 0;
		pgdat->numabalancing_migrate_next_window = jiffies +
			msecs_to_jiffies(migrate_interval_millisecs);
	}
	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
		rate_limited = true;
	else
		pgdat->numabalancing_migrate_nr_pages += nr_pages;
	spin_unlock(&pgdat->numabalancing_migrate_lock);

	return rate_limited;
}

int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
	int ret = 0;

	/* Avoid migrating to a node that is nearly full */
	if (migrate_balanced_pgdat(pgdat, 1)) {
		int page_lru;

		if (isolate_lru_page(page)) {
			put_page(page);
			return 0;
		}

		/* Page is isolated */
		ret = 1;
		page_lru = page_is_file_cache(page);
		if (!PageTransHuge(page))
			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
		else
			mod_zone_page_state(page_zone(page),
					NR_ISOLATED_ANON + page_lru,
					HPAGE_PMD_NR);
	}

	/*
	 * Page is either isolated or there is not enough space on the target
	 * node. If isolated, then it has taken a reference count and the
	 * caller's reference can be safely dropped without the page
	 * disappearing underneath us during migration. Otherwise the page is
	 * not to be migrated but the caller's reference should still be
	 * dropped so it does not leak.
	 */
	put_page(page);

	return ret;
}

/*
 * Attempt to migrate a misplaced page to the specified destination
 * node. Caller is expected to have an elevated reference count on
 * the page that will be dropped by this function before returning.
 */
int migrate_misplaced_page(struct page *page, int node)
{
	pg_data_t *pgdat = NODE_DATA(node);
	int isolated = 0;
	int nr_remaining;
	LIST_HEAD(migratepages);

	/*
	 * Don't migrate pages that are mapped in multiple processes.
	 * TODO: Handle false sharing detection instead of this hammer
	 */
	if (page_mapcount(page) != 1) {
		put_page(page);
		goto out;
	}

	/*
	 * Rate-limit the amount of data that is being migrated to a node.
	 * Optimal placement is no good if the memory bus is saturated and
	 * all the time is being spent migrating!
	 */
	if (numamigrate_update_ratelimit(pgdat, 1)) {
		put_page(page);
		goto out;
	}

	isolated = numamigrate_isolate_page(pgdat, page);
	if (!isolated)
		goto out;

	list_add(&page->lru, &migratepages);
	nr_remaining = migrate_pages(&migratepages,
			alloc_misplaced_dst_page,
			node, false, MIGRATE_ASYNC,
			MR_NUMA_MISPLACED);
	if (nr_remaining) {
		putback_lru_pages(&migratepages);
		isolated = 0;
	} else
		count_vm_numa_event(NUMA_PAGE_MIGRATE);
	BUG_ON(!list_empty(&migratepages));
out:
	return isolated;
}
#endif /* CONFIG_NUMA_BALANCING */

#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
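/*
 * Informal note on the THP variant below: unlike the base-page path, which
 * unmaps the page and leaves migration entries behind,
 * migrate_misplaced_transhuge_page() copies the huge page first and then,
 * under mm->page_table_lock, rechecks that the pmd is unchanged (pmd_same)
 * before swapping in a pmd for the new page.  If the pmd changed in the
 * meantime, every effect of the copy is rolled back and the old page goes
 * back on the LRU.
 */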
int migrate_misplaced_transhuge_page(struct mm_struct *mm,
				struct vm_area_struct *vma,
				pmd_t *pmd, pmd_t entry,
				unsigned long address,
				struct page *page, int node)
{
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pg_data_t *pgdat = NODE_DATA(node);
	int isolated = 0;
	struct page *new_page = NULL;
	struct mem_cgroup *memcg = NULL;
	int page_lru = page_is_file_cache(page);

	/*
	 * Don't migrate pages that are mapped in multiple processes.
	 * TODO: Handle false sharing detection instead of this hammer
	 */
	if (page_mapcount(page) != 1)
		goto out_dropref;

	/*
	 * Rate-limit the amount of data that is being migrated to a node.
	 * Optimal placement is no good if the memory bus is saturated and
	 * all the time is being spent migrating!
	 */
	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
		goto out_dropref;

	new_page = alloc_pages_node(node,
		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
	if (!new_page) {
		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
		goto out_dropref;
	}
	page_xchg_last_nid(new_page, page_last_nid(page));

	isolated = numamigrate_isolate_page(pgdat, page);

	/*
	 * Failing to isolate or a GUP pin prevents migration. The expected
	 * page count is 2: 1 for anonymous pages without a mapping and 1
	 * for the caller's pin. If the page was isolated, the page will
	 * need to be put back on the LRU.
	 */
	if (!isolated || page_count(page) != 2) {
		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
		put_page(new_page);
		if (isolated) {
			putback_lru_page(page);
			isolated = 0;
			goto out;
		}
		goto out_keep_locked;
	}

	/* Prepare a page as a migration target */
	__set_page_locked(new_page);
	SetPageSwapBacked(new_page);

	/* anon mapping, we can simply copy page->mapping to the new page: */
	new_page->mapping = page->mapping;
	new_page->index = page->index;
	migrate_page_copy(new_page, page);
	WARN_ON(PageLRU(new_page));

	/* Recheck the target PMD */
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, entry))) {
		spin_unlock(&mm->page_table_lock);

		/* Reverse changes made by migrate_page_copy() */
		if (TestClearPageActive(new_page))
			SetPageActive(page);
		if (TestClearPageUnevictable(new_page))
			SetPageUnevictable(page);
		mlock_migrate_page(page, new_page);

		unlock_page(new_page);
		put_page(new_page);		/* Free it */

		unlock_page(page);
		putback_lru_page(page);

		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
		goto out;
	}

	/*
	 * Traditional migration needs to prepare the memcg charge
	 * transaction early to prevent the old page from being
	 * uncharged when installing migration entries.  Here we can
	 * save the potential rollback and start the charge transfer
	 * only when migration is already known to end successfully.
	 */
	mem_cgroup_prepare_migration(page, new_page, &memcg);

	entry = mk_pmd(new_page, vma->vm_page_prot);
	entry = pmd_mknonnuma(entry);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	entry = pmd_mkhuge(entry);

	page_add_new_anon_rmap(new_page, vma, haddr);

	set_pmd_at(mm, haddr, pmd, entry);
	update_mmu_cache_pmd(vma, address, &entry);
	page_remove_rmap(page);
	/*
	 * Finish the charge transaction under the page table lock to
	 * prevent split_huge_page() from dividing up the charge
	 * before it's fully transferred to the new page.
	 */
	mem_cgroup_end_migration(memcg, page, new_page, true);
	spin_unlock(&mm->page_table_lock);

	unlock_page(new_page);
	unlock_page(page);
	put_page(page);			/* Drop the rmap reference */
	put_page(page);			/* Drop the LRU isolation reference */

	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);

out:
	mod_zone_page_state(page_zone(page),
			NR_ISOLATED_ANON + page_lru,
			-HPAGE_PMD_NR);
	return isolated;

out_dropref:
	put_page(page);
out_keep_locked:
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_NUMA */