/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
        /*
         * Clear the LRU lists so pages can be isolated.
         * Note that pages may be moved off the LRU after we have
         * drained them. Those pages will fail to migrate like other
         * pages that may be busy.
         */
        lru_add_drain_all();

        return 0;
}

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto unevictable list.
 *
 * returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
        struct page *page;
        struct page *page2;
        int count = 0;

        list_for_each_entry_safe(page, page2, l, lru) {
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                putback_lru_page(page);
                count++;
        }
        return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                                 unsigned long addr, void *old)
{
        struct mm_struct *mm = vma->vm_mm;
        swp_entry_t entry;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;

        pgd = pgd_offset(mm, addr);
        if (!pgd_present(*pgd))
                goto out;

        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud))
                goto out;

        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
                goto out;

        ptep = pte_offset_map(pmd, addr);

        if (!is_swap_pte(*ptep)) {
                pte_unmap(ptep);
                goto out;
        }

        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto unlock;

        entry = pte_to_swp_entry(pte);

        if (!is_migration_entry(entry) ||
            migration_entry_to_page(entry) != old)
                goto unlock;

        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
        flush_cache_page(vma, addr, pte_pfn(pte));
        set_pte_at(mm, addr, ptep, pte);

        if (PageAnon(new))
                page_add_anon_rmap(new, vma, addr);
        else
                page_add_file_rmap(new);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, pte);
unlock:
        pte_unmap_unlock(ptep, ptl);
out:
        return SWAP_AGAIN;
}
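/*
 * For orientation (illustrative sketch, not part of the original file): the
 * counterpart of remove_migration_pte() is the unmap step.  When
 * try_to_unmap(page, TTU_MIGRATION|...) removes a pte it installs a
 * migration entry in its place, roughly:
 *
 *	swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 *
 * The real code lives in try_to_unmap_one() in mm/rmap.c; the two lines
 * above are only a paraphrase of that path.
 */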
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
        rmap_walk(new, remove_migration_pte, old);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long address)
{
        pte_t *ptep, pte;
        spinlock_t *ptl;
        swp_entry_t entry;
        struct page *page;

        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;

        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
                goto out;

        page = migration_entry_to_page(entry);

        /*
         * Once the radix-tree replacement step of page migration has
         * started, page_count *must* be zero. And we don't want to call
         * wait_on_page_locked() against a page without get_page().
         * So we use get_page_unless_zero() here. Even if it fails, the
         * page fault will simply occur again.
         */
        if (!get_page_unless_zero(page))
                goto out;
        pte_unmap_unlock(ptep, ptl);
        wait_on_page_locked(page);
        put_page(page);
        return;
out:
        pte_unmap_unlock(ptep, ptl);
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        int expected_count;
        void **pslot;

        if (!mapping) {
                /* Anonymous page without mapping */
                if (page_count(page) != 1)
                        return -EAGAIN;
                return 0;
        }

        spin_lock_irq(&mapping->tree_lock);

        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));

        expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }

        if (!page_freeze_refs(page, expected_count)) {
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }

        /*
         * Now we know that no one else is looking at the page.
         */
        get_page(newpage);      /* add cache reference */
        if (PageSwapCache(page)) {
                SetPageSwapCache(newpage);
                set_page_private(newpage, page_private(page));
        }

        radix_tree_replace_slot(pslot, newpage);

        page_unfreeze_refs(page, expected_count);
        /*
         * Drop cache reference from old page.
         * We know this isn't the last reference.
         */
        __put_page(page);

        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
         * taken care of when we establish references to the
         * new page and drop references to the old page.
         *
         * Note that anonymous pages are accounted for
         * via NR_FILE_PAGES and NR_ANON_PAGES if they
         * are mapped to swap space.
         */
        __dec_zone_page_state(page, NR_FILE_PAGES);
        __inc_zone_page_state(newpage, NR_FILE_PAGES);
        if (PageSwapBacked(page)) {
                __dec_zone_page_state(page, NR_SHMEM);
                __inc_zone_page_state(newpage, NR_SHMEM);
        }
        spin_unlock_irq(&mapping->tree_lock);

        return 0;
}
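/*
 * Worked example for the reference rule above (added for clarity, not from
 * the original source): consider an isolated page-cache page that still has
 * buffer_heads attached, after try_to_unmap() has replaced all of its ptes
 * with migration entries.  The surviving references are the page-cache
 * (radix tree) reference, the reference held by the isolating caller, and
 * the PagePrivate reference pinning the buffers, so page_count(page) must
 * equal 2 + page_has_private(page) == 3 before the slot may be frozen and
 * replaced.  Any extra reference (e.g. a racing lookup) makes the function
 * bail out with -EAGAIN.
 */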
/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
        int anon;

        copy_highpage(newpage, page);

        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (TestClearPageActive(page)) {
                VM_BUG_ON(PageUnevictable(page));
                SetPageActive(newpage);
        } else if (TestClearPageUnevictable(page))
                SetPageUnevictable(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        if (PageDirty(page)) {
                clear_page_dirty_for_io(page);
                /*
                 * Want to mark the page and the radix tree as dirty, and
                 * redo the accounting that clear_page_dirty_for_io undid,
                 * but we can't use set_page_dirty because that function
                 * is actually a signal that all of the page has become
                 * dirty. Whereas only part of our page may be dirty.
                 */
                __set_page_dirty_nobuffers(newpage);
        }

        mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);

        ClearPageSwapCache(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
        /* page->mapping contains a flag for PageAnon() */
        anon = PageAnon(page);
        page->mapping = NULL;

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
                        struct page *newpage, struct page *page)
{
        return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        int rc;

        BUG_ON(PageWriteback(page));    /* Writeback must be complete */

        rc = migrate_page_move_mapping(mapping, newpage, page);

        if (rc)
                return rc;

        migrate_page_copy(newpage, page);
        return 0;
}
EXPORT_SYMBOL(migrate_page);
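/*
 * Illustrative sketch (not part of the original file): a filesystem whose
 * pages carry no fs-private state can plug migrate_page() straight into its
 * address_space_operations.  The names example_readpage/example_writepage
 * below are hypothetical; only the .migratepage hook and migrate_page()
 * itself are real kernel interfaces:
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.migratepage	= migrate_page,
 *	};
 */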
#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        struct buffer_head *bh, *head;
        int rc;

        if (!page_has_buffers(page))
                return migrate_page(mapping, newpage, page);

        head = page_buffers(page);

        rc = migrate_page_move_mapping(mapping, newpage, page);

        if (rc)
                return rc;

        bh = head;
        do {
                get_bh(bh);
                lock_buffer(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        ClearPagePrivate(page);
        set_page_private(newpage, page_private(page));
        set_page_private(page, 0);
        put_page(page);
        get_page(newpage);

        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;

        } while (bh != head);

        SetPagePrivate(newpage);

        migrate_page_copy(newpage, page);

        bh = head;
        do {
                unlock_buffer(bh);
                put_bh(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = 1,
                .range_start = 0,
                .range_end = LLONG_MAX,
                .nonblocking = 1,
                .for_reclaim = 1
        };
        int rc;

        if (!mapping->a_ops->writepage)
                /* No write method for the address space */
                return -EINVAL;

        if (!clear_page_dirty_for_io(page))
                /* Someone else already triggered a write */
                return -EAGAIN;

        /*
         * A dirty page may imply that the underlying filesystem has
         * the page on some queue. So the page must be clean for
         * migration. Writeout may mean we lose the lock and the
         * page state is no longer what we checked for earlier.
         * At this point we know that the migration attempt cannot
         * be successful.
         */
        remove_migration_ptes(page, page);

        rc = mapping->a_ops->writepage(page, &wbc);

        if (rc != AOP_WRITEPAGE_ACTIVATE)
                /* unlocked. Relock */
                lock_page(page);

        return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page)
{
        if (PageDirty(page))
                return writeout(mapping, page);

        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;

        return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
        struct address_space *mapping;
        int rc;

        /*
         * Block others from accessing the page when we get around to
         * establishing additional references. We are the only one
         * holding a reference to the new page at this point.
         */
        if (!trylock_page(newpage))
                BUG();

        /* Prepare mapping for the new page. */
        newpage->index = page->index;
        newpage->mapping = page->mapping;
        if (PageSwapBacked(page))
                SetPageSwapBacked(newpage);

        mapping = page_mapping(page);
        if (!mapping)
                rc = migrate_page(mapping, newpage, page);
        else if (mapping->a_ops->migratepage)
                /*
                 * Most pages have a mapping and most filesystems
                 * should provide a migration function. Anonymous
                 * pages are part of swap space which also has its
                 * own migration function. This is the most common
                 * path for page migration.
                 */
                rc = mapping->a_ops->migratepage(mapping,
                                                newpage, page);
        else
                rc = fallback_migrate_page(mapping, newpage, page);

        if (!rc)
                remove_migration_ptes(page, newpage);
        else
                newpage->mapping = NULL;

        unlock_page(newpage);

        return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                        struct page *page, int force, int offlining)
{
        int rc = 0;
        int *result = NULL;
        struct page *newpage = get_new_page(page, private, &result);
        int rcu_locked = 0;
        int charge = 0;
        struct mem_cgroup *mem = NULL;

        if (!newpage)
                return -ENOMEM;

        if (page_count(page) == 1) {
                /* page was freed from under us. So we are done. */
                goto move_newpage;
        }

        /* prepare cgroup just returns 0 or -ENOMEM */
        rc = -EAGAIN;

        if (!trylock_page(page)) {
                if (!force)
                        goto move_newpage;
                lock_page(page);
        }

        /*
         * Only memory hotplug's offline_pages() caller has locked out KSM,
         * and can safely migrate a KSM page.  The other cases have skipped
         * PageKsm along with PageReserved - but it is only now when we have
         * the page lock that we can be certain it will not go KSM beneath us
         * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
         * its pagecount raised, but only here do we take the page lock which
         * serializes that).
         */
        if (PageKsm(page) && !offlining) {
                rc = -EBUSY;
                goto unlock;
        }

        /* charge against new page */
        charge = mem_cgroup_prepare_migration(page, &mem);
        if (charge == -ENOMEM) {
                rc = -ENOMEM;
                goto unlock;
        }
        BUG_ON(charge);

        if (PageWriteback(page)) {
                if (!force)
                        goto uncharge;
                wait_on_page_writeback(page);
        }
        /*
         * Through try_to_unmap(), page->mapcount drops to 0 here. In that
         * case we cannot notice the anon_vma being freed while we migrate
         * the page. This rcu_read_lock() delays freeing of the anon_vma
         * pointer until the end of migration. File cache pages are no
         * problem because of the page lock: file caches may use
         * write_page() or lock_page() during migration, so only anon
         * pages need this care.
         */
        if (PageAnon(page)) {
                rcu_read_lock();
                rcu_locked = 1;
        }

        /*
         * Corner case handling:
         * 1. When a new swap-cache page is read into, it is added to the LRU
         * and treated as swapcache but it has no rmap yet.
         * Calling try_to_unmap() against a page->mapping==NULL page will
         * trigger a BUG.  So handle it here.
         * 2. An orphaned page (see truncate_complete_page) might have
         * fs-private metadata. The page can be picked up due to memory
         * offlining.  Everywhere else except page reclaim, the page is
         * invisible to the vm, so the page can not be migrated. So try to
         * free the metadata, so the page can be freed.
         */
        if (!page->mapping) {
                if (!PageAnon(page) && page_has_private(page)) {
                        /*
                         * Go direct to try_to_free_buffers() here because
                         * a) that's what try_to_release_page() would do anyway
                         * b) we may be under rcu_read_lock() here, so we can't
                         *    use GFP_KERNEL which is what try_to_release_page()
                         *    needs to be effective.
                         */
                        try_to_free_buffers(page);
                        goto rcu_unlock;
                }
                goto skip_unmap;
        }

        /* Establish migration ptes or remove ptes */
        try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page);

        if (rc)
                remove_migration_ptes(page, page);
rcu_unlock:
        if (rcu_locked)
                rcu_read_unlock();
uncharge:
        if (!charge)
                mem_cgroup_end_migration(mem, page, newpage);
unlock:
        unlock_page(page);

        if (rc != -EAGAIN) {
                /*
                 * A page that has been migrated has all references
                 * removed and will be freed. A page that has not been
                 * migrated will have kept its references and be
                 * restored.
                 */
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                putback_lru_page(page);
        }

move_newpage:

        /*
         * Move the new page to the LRU. If migration was not successful
         * then this will free the page.
         */
        putback_lru_page(newpage);

        if (result) {
                if (rc)
                        *result = rc;
                else
                        *result = page_to_nid(newpage);
        }
        return rc;
}
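/*
 * Illustrative usage sketch (not part of the original file): a typical
 * caller of migrate_pages() below drains the pagevecs, isolates the pages
 * it wants to move onto a private list and supplies an allocation callback
 * with the new_page_t signature.  my_new_page, my_pagelist and target_nid
 * are made-up names; the functions they use are the real interfaces used
 * elsewhere in this file.  Note that migrate_pages() itself puts any pages
 * it could not migrate back on the LRU before returning.
 *
 *	static struct page *my_new_page(struct page *page, unsigned long nid,
 *					int **result)
 *	{
 *		*result = NULL;
 *		return alloc_pages_exact_node((int)nid,
 *					GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	LIST_HEAD(my_pagelist);
 *
 *	migrate_prep();
 *	if (!isolate_lru_page(page)) {
 *		list_add_tail(&page->lru, &my_pagelist);
 *		inc_zone_page_state(page, NR_ISOLATED_ANON +
 *					page_is_file_cache(page));
 *	}
 *	err = migrate_pages(&my_pagelist, my_new_page, target_nid, 0);
 */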
/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a callback that,
 * given a page to be migrated and the private data, determines the target
 * of the move and allocates the new page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
 * or no retryable pages exist anymore. All pages will be
 * returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
                new_page_t get_new_page, unsigned long private, int offlining)
{
        int retry = 1;
        int nr_failed = 0;
        int pass = 0;
        struct page *page;
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;

        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;

        for (pass = 0; pass < 10 && retry; pass++) {
                retry = 0;

                list_for_each_entry_safe(page, page2, from, lru) {
                        cond_resched();

                        rc = unmap_and_move(get_new_page, private,
                                                page, pass > 2, offlining);

                        switch (rc) {
                        case -ENOMEM:
                                goto out;
                        case -EAGAIN:
                                retry++;
                                break;
                        case 0:
                                break;
                        default:
                                /* Permanent failure */
                                nr_failed++;
                                break;
                        }
                }
        }
        rc = 0;
out:
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;

        putback_lru_pages(from);

        if (rc)
                return rc;

        return nr_failed + retry;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
        unsigned long addr;
        struct page *page;
        int node;
        int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
                int **result)
{
        struct page_to_node *pm = (struct page_to_node *)private;

        while (pm->node != MAX_NUMNODES && pm->page != p)
                pm++;

        if (pm->node == MAX_NUMNODES)
                return NULL;

        *result = &pm->status;

        return alloc_pages_exact_node(pm->node,
                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
                                      struct page_to_node *pm,
                                      int migrate_all)
{
        int err;
        struct page_to_node *pp;
        LIST_HEAD(pagelist);

        down_read(&mm->mmap_sem);

        /*
         * Build a list of pages to migrate
         */
        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
                struct vm_area_struct *vma;
                struct page *page;

                err = -EFAULT;
                vma = find_vma(mm, pp->addr);
                if (!vma || !vma_migratable(vma))
                        goto set_status;

                page = follow_page(vma, pp->addr, FOLL_GET);

                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto set_status;

                err = -ENOENT;
                if (!page)
                        goto set_status;

                /* Use PageReserved to check for zero page */
                if (PageReserved(page) || PageKsm(page))
                        goto put_and_set;

                pp->page = page;
                err = page_to_nid(page);

                if (err == pp->node)
                        /*
                         * Node already in the right place
                         */
                        goto put_and_set;

                err = -EACCES;
                if (page_mapcount(page) > 1 &&
                                !migrate_all)
                        goto put_and_set;

                err = isolate_lru_page(page);
                if (!err) {
                        list_add_tail(&page->lru, &pagelist);
                        inc_zone_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));
                }
put_and_set:
                /*
                 * Either remove the duplicate refcount from
                 * isolate_lru_page() or drop the page ref if it was
                 * not isolated.
                 */
                put_page(page);
set_status:
                pp->status = err;
        }

        err = 0;
        if (!list_empty(&pagelist))
                err = migrate_pages(&pagelist, new_page_node,
                                (unsigned long)pm, 0);

        up_read(&mm->mmap_sem);
        return err;
}

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * in the corresponding array of status values.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
                         unsigned long nr_pages,
                         const void __user * __user *pages,
                         const int __user *nodes,
                         int __user *status, int flags)
{
        struct page_to_node *pm;
        nodemask_t task_nodes;
        unsigned long chunk_nr_pages;
        unsigned long chunk_start;
        int err;

        task_nodes = cpuset_mems_allowed(task);

        err = -ENOMEM;
        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
        if (!pm)
                goto out;

        migrate_prep();

        /*
         * Store a chunk of the page_to_node array in a page,
         * but keep the last entry as an end marker
         */
        chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

        for (chunk_start = 0;
             chunk_start < nr_pages;
             chunk_start += chunk_nr_pages) {
                int j;

                if (chunk_start + chunk_nr_pages > nr_pages)
                        chunk_nr_pages = nr_pages - chunk_start;

                /* fill the chunk pm with addrs and nodes from user-space */
                for (j = 0; j < chunk_nr_pages; j++) {
                        const void __user *p;
                        int node;

                        err = -EFAULT;
                        if (get_user(p, pages + j + chunk_start))
                                goto out_pm;
                        pm[j].addr = (unsigned long) p;

                        if (get_user(node, nodes + j + chunk_start))
                                goto out_pm;

                        err = -ENODEV;
                        if (!node_state(node, N_HIGH_MEMORY))
                                goto out_pm;

                        err = -EACCES;
                        if (!node_isset(node, task_nodes))
                                goto out_pm;

                        pm[j].node = node;
                }

                /* End marker for this chunk */
                pm[chunk_nr_pages].node = MAX_NUMNODES;

                /* Migrate this chunk */
                err = do_move_page_to_node_array(mm, pm,
                                                 flags & MPOL_MF_MOVE_ALL);
                if (err < 0)
                        goto out_pm;

                /* Return status information */
                for (j = 0; j < chunk_nr_pages; j++)
                        if (put_user(pm[j].status, status + j + chunk_start)) {
                                err = -EFAULT;
                                goto out_pm;
                        }
        }
        err = 0;

out_pm:
        free_page((unsigned long)pm);
out:
        return err;
}

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status values.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                                const void __user **pages, int *status)
{
        unsigned long i;

        down_read(&mm->mmap_sem);

        for (i = 0; i < nr_pages; i++) {
                unsigned long addr = (unsigned long)(*pages);
                struct vm_area_struct *vma;
                struct page *page;
                int err = -EFAULT;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto set_status;

                page = follow_page(vma, addr, 0);

                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto set_status;

                err = -ENOENT;
                /* Use PageReserved to check for zero page */
                if (!page || PageReserved(page) || PageKsm(page))
                        goto set_status;

                err = page_to_nid(page);
set_status:
                *status = err;

                pages++;
                status++;
        }

        up_read(&mm->mmap_sem);
}

/*
 * Determine the nodes of a user array of pages and store them in
 * a user array of status values.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
                         const void __user * __user *pages,
                         int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
        unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
        int err;

        for (i = 0; i < nr_pages; i += chunk_nr) {
                if (chunk_nr > nr_pages - i)
                        chunk_nr = nr_pages - i;

                err = copy_from_user(chunk_pages, &pages[i],
                                     chunk_nr * sizeof(*chunk_pages));
                if (err) {
                        err = -EFAULT;
                        goto out;
                }

                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

                err = copy_to_user(&status[i], chunk_status,
                                   chunk_nr * sizeof(*chunk_status));
                if (err) {
                        err = -EFAULT;
                        goto out;
                }
        }
        err = 0;

out:
        return err;
}

/*
 * Move a list of pages in the address space of the process identified by
 * pid (or of the calling process when pid is 0).
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
                const void __user * __user *, pages,
                const int __user *, nodes,
                int __user *, status, int, flags)
{
        const struct cred *cred = current_cred(), *tcred;
        struct task_struct *task;
        struct mm_struct *mm;
        int err;

        /* Check flags */
        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
                return -EINVAL;

        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        /* Find the mm_struct */
        read_lock(&tasklist_lock);
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                read_unlock(&tasklist_lock);
                return -ESRCH;
        }
        mm = get_task_mm(task);
        read_unlock(&tasklist_lock);

        if (!mm)
                return -EINVAL;

        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        rcu_read_lock();
        tcred = __task_cred(task);
        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
            !capable(CAP_SYS_NICE)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out;
        }
        rcu_read_unlock();

        err = security_task_movememory(task);
        if (err)
                goto out;

        if (nodes) {
                err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
                                    flags);
        } else {
                err = do_pages_stat(mm, nr_pages, pages, status);
        }

out:
        mmput(mm);
        return err;
}

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
        const nodemask_t *from, unsigned long flags)
{
        struct vm_area_struct *vma;
        int err = 0;

        for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
                if (vma->vm_ops && vma->vm_ops->migrate) {
                        err = vma->vm_ops->migrate(vma, to, from, flags);
                        if (err)
                                break;
                }
        }
        return err;
}
#endif
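/*
 * Userspace view (illustrative sketch, not part of the original file): the
 * sys_move_pages() entry point above is normally reached through the
 * move_pages(2) wrapper provided by libnuma (<numaif.h>).  Assuming a
 * process that wants to move one of its own pages to node 1:
 *
 *	#include <numaif.h>
 *
 *	void *pages[1] = { some_addr };
 *	int nodes[1]   = { 1 };
 *	int status[1];
 *
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * On success status[0] holds the node the page ended up on, or a negative
 * errno for that particular page.  some_addr is a made-up name for a
 * page-aligned address in the process; passing nodes == NULL instead only
 * queries the current node of each page (the do_pages_stat() path above).
 */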