/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated from.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
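
/*
 * Illustrative user-space sketch (not part of the kernel build) of how the
 * policies above are requested through set_mempolicy() and mbind().  It
 * assumes the libnuma <numaif.h> syscall wrappers and a machine with at
 * least two online nodes; the function and variable names are hypothetical.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	static void mempolicy_example(void)
 *	{
 *		unsigned long nodes01 = 0x3;	(bit mask: nodes 0 and 1)
 *		unsigned long node0 = 0x1;	(bit mask: node 0 only)
 *		size_t len = 1UL << 20;
 *		void *p;
 *
 *		(interleave all future allocations of the calling task)
 *		set_mempolicy(MPOL_INTERLEAVE, &nodes01, sizeof(nodes01) * 8);
 *
 *		(restrict a single mapping to node 0 with a per-VMA policy)
 *		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 *	}
 */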

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal MPOL_MF_xxx flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
		struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
			    shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		WARN_ON(rc == 0);
	}
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (!node_isset(nid, *nodes)) {
			if (pagelist)
				migrate_page_add(vma, page, pagelist, flags);
			else
				break;
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, pagelist))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, pagelist))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, pagelist))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
					      flags, pagelist);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
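
/*
 * Worked example for mbind_range() below (hypothetical addresses): given a
 * single VMA covering 0x1000-0x9000 and an mbind() request for
 * 0x3000-0x6000, the loop first calls split_vma(mm, vma, 0x3000, 1) because
 * vma->vm_start < start, then split_vma(mm, vma, 0x6000, 0) because
 * vma->vm_end > end, and finally policy_vma() installs the new policy only
 * on the middle VMA 0x3000-0x6000; the outer pieces keep their old policy.
 */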

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}

long do_mbind(unsigned long start, unsigned long len,
	      unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask, flags,
			  (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ?
			  &pagelist : NULL);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
			   pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
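
/*
 * Illustrative user-space sketch for do_get_mempolicy() above (again
 * assuming the libnuma <numaif.h> wrappers, and assuming MAX_NUMNODES fits
 * in a single unsigned long so the mask buffer below is large enough;
 * "buf" is a hypothetical mapped address):
 *
 *	int node, mode;
 *	unsigned long mask = 0;
 *
 *	(MPOL_F_NODE | MPOL_F_ADDR: "node" receives the id of the node
 *	 backing "buf", faulting the page in if needed, see lookup_node())
 *	get_mempolicy(&node, NULL, 0, buf, MPOL_F_NODE | MPOL_F_ADDR);
 *
 *	(no flags: "mode" receives the calling task's policy, e.g.
 *	 MPOL_INTERLEAVE, and "mask" the associated node mask)
 *	get_mempolicy(&mode, &mask, sizeof(mask) * 8, NULL, 0);
 */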

/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);
	nodes_complement(nodes, nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
		    flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ?
			-EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
				  const unsigned long __user *old_nodes,
				  const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				  compat_ulong_t __user *nmask,
				  compat_ulong_t maxnode,
				  compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				  compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
				 compat_ulong_t mode, compat_ulong_t __user *nmask,
				 compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
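
/*
 * A minimal sketch (not an actual call site in this file) of how the two
 * helpers around this point are combined by allocation paths: resolve the
 * effective policy for a faulting address, then map it to a zonelist.
 * "vma" and "address" are assumed to come from a fault handler.
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, address);
 *	struct zonelist *zl = zonelist_policy(GFP_HIGHUSER, pol);
 *	struct page *page = __alloc_pages(GFP_HIGHUSER, 0, zl);
 *
 * This is essentially what alloc_page_vma() below does, with an extra
 * special case for MPOL_INTERLEAVE.
 */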

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		off = vma->vm_pgoff;
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}
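
/*
 * Worked example for interleave_nid()/offset_il_node() above (hypothetical
 * numbers): with pol->v.nodes = {0, 2, 3}, a VMA with vm_pgoff = 5, and a
 * fault 3 pages into the VMA at PAGE_SHIFT granularity, off = 5 + 3 = 8,
 * nnodes = 3, target = 8 % 3 = 2, and the loop returns the third set node,
 * i.e. node 3.  Because the node is derived from the offset rather than a
 * running counter, the same page of a backing object always interleaves to
 * the same node regardless of which process faults it in.
 */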
1060 * 1061 * This function allocates a page from the kernel page pool and applies 1062 * a NUMA policy associated with the VMA or the current process. 1063 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 1064 * mm_struct of the VMA to prevent it from going away. Should be used for 1065 * all allocations for pages that will be mapped into 1066 * user space. Returns NULL when no page can be allocated. 1067 * 1068 * Should be called with the mm_sem of the vma hold. 1069 */ 1070 struct page * 1071 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1072 { 1073 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1074 1075 cpuset_update_current_mems_allowed(); 1076 1077 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1078 unsigned nid; 1079 1080 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1081 return alloc_page_interleave(gfp, 0, nid); 1082 } 1083 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 1084 } 1085 1086 /** 1087 * alloc_pages_current - Allocate pages. 1088 * 1089 * @gfp: 1090 * %GFP_USER user allocation, 1091 * %GFP_KERNEL kernel allocation, 1092 * %GFP_HIGHMEM highmem allocation, 1093 * %GFP_FS don't call back into a file system. 1094 * %GFP_ATOMIC don't sleep. 1095 * @order: Power of two of allocation size in pages. 0 is a single page. 1096 * 1097 * Allocate a page from the kernel page pool. When not in 1098 * interrupt context and apply the current process NUMA policy. 1099 * Returns NULL when no page can be allocated. 1100 * 1101 * Don't call cpuset_update_current_mems_allowed() unless 1102 * 1) it's ok to take cpuset_sem (can WAIT), and 1103 * 2) allocating for current task (not interrupt). 1104 */ 1105 struct page *alloc_pages_current(gfp_t gfp, unsigned order) 1106 { 1107 struct mempolicy *pol = current->mempolicy; 1108 1109 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1110 cpuset_update_current_mems_allowed(); 1111 if (!pol || in_interrupt()) 1112 pol = &default_policy; 1113 if (pol->policy == MPOL_INTERLEAVE) 1114 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1115 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 1116 } 1117 EXPORT_SYMBOL(alloc_pages_current); 1118 1119 /* Slow path of a mempolicy copy */ 1120 struct mempolicy *__mpol_copy(struct mempolicy *old) 1121 { 1122 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 1123 1124 if (!new) 1125 return ERR_PTR(-ENOMEM); 1126 *new = *old; 1127 atomic_set(&new->refcnt, 1); 1128 if (new->policy == MPOL_BIND) { 1129 int sz = ksize(old->v.zonelist); 1130 new->v.zonelist = kmalloc(sz, SLAB_KERNEL); 1131 if (!new->v.zonelist) { 1132 kmem_cache_free(policy_cache, new); 1133 return ERR_PTR(-ENOMEM); 1134 } 1135 memcpy(new->v.zonelist, old->v.zonelist, sz); 1136 } 1137 return new; 1138 } 1139 1140 /* Slow path of a mempolicy comparison */ 1141 int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1142 { 1143 if (!a || !b) 1144 return 0; 1145 if (a->policy != b->policy) 1146 return 0; 1147 switch (a->policy) { 1148 case MPOL_DEFAULT: 1149 return 1; 1150 case MPOL_INTERLEAVE: 1151 return nodes_equal(a->v.nodes, b->v.nodes); 1152 case MPOL_PREFERRED: 1153 return a->v.preferred_node == b->v.preferred_node; 1154 case MPOL_BIND: { 1155 int i; 1156 for (i = 0; a->v.zonelist->zones[i]; i++) 1157 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) 1158 return 0; 1159 return b->v.zonelist->zones[i] == NULL; 1160 } 1161 default: 1162 BUG(); 1163 return 0; 1164 } 1165 } 1166 1167 /* Slow path of a mpol destructor. 
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}
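
/*
 * Worked example for rebind_policy() above (hypothetical node numbers):
 * if a cpuset move changes mems_allowed from {0,1} to {2,3}, an
 * MPOL_INTERLEAVE policy over {0,1} is remapped by nodes_remap() to {2,3},
 * an MPOL_PREFERRED node 1 becomes node 3, and an MPOL_BIND zonelist built
 * for {0,1} is rebuilt via bind_zonelist() for {2,3}.  il_next is remapped
 * the same way so dynamic interleaving continues from the corresponding
 * node in the new set.
 */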