xref: /openbmc/linux/mm/mempolicy.c (revision d498471133ff1f9586a06820beaeebc575fe2814)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU's node. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA-aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. The process policy
45  * is not remembered across swap out/swap in.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
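/*
 * Illustrative user space sketch (not part of this file): these policies
 * are normally installed through the set_mempolicy()/mbind() system calls,
 * e.g. via the <numaif.h> wrappers.  Assuming nodes 0 and 1 exist:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 *
 * How maxnode is interpreted is defined by get_nodes() below.
 */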
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 
88 #include <asm/tlbflush.h>
89 #include <asm/uaccess.h>
90 
91 /* Internal MPOL_MF_xxx flags */
92 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous VMAs */
93 
94 static kmem_cache_t *policy_cache;
95 static kmem_cache_t *sn_cache;
96 
97 #define PDprintk(fmt...)
98 
99 /* Highest zone. A specific allocation for a zone below that is not
100    policied. */
101 int policy_zone = ZONE_DMA;
102 
103 struct mempolicy default_policy = {
104 	.refcnt = ATOMIC_INIT(1), /* never free it */
105 	.policy = MPOL_DEFAULT,
106 };
107 
108 /* Do sanity checking on a policy */
109 static int mpol_check_policy(int mode, nodemask_t *nodes)
110 {
111 	int empty = nodes_empty(*nodes);
112 
113 	switch (mode) {
114 	case MPOL_DEFAULT:
115 		if (!empty)
116 			return -EINVAL;
117 		break;
118 	case MPOL_BIND:
119 	case MPOL_INTERLEAVE:
120 		/* Preferred only uses the first set node, but allow
121 		   more bits for now. */
122 		if (empty)
123 			return -EINVAL;
124 		break;
125 	}
126 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
127 }
128 /* Generate a custom zonelist for the BIND policy. */
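/*
 * Example: a node mask of {0, 2} yields zl->zones = { node 0 policy_zone,
 * node 2 policy_zone, NULL }, i.e. only the policy_zone zone of each bound
 * node, in ascending node order, NULL terminated.
 */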
129 static struct zonelist *bind_zonelist(nodemask_t *nodes)
130 {
131 	struct zonelist *zl;
132 	int num, max, nd;
133 
134 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
135 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
136 	if (!zl)
137 		return NULL;
138 	num = 0;
139 	for_each_node_mask(nd, *nodes)
140 		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
141 	zl->zones[num] = NULL;
142 	return zl;
143 }
144 
145 /* Create a new policy */
146 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
147 {
148 	struct mempolicy *policy;
149 
150 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
151 	if (mode == MPOL_DEFAULT)
152 		return NULL;
153 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
154 	if (!policy)
155 		return ERR_PTR(-ENOMEM);
156 	atomic_set(&policy->refcnt, 1);
157 	switch (mode) {
158 	case MPOL_INTERLEAVE:
159 		policy->v.nodes = *nodes;
160 		if (nodes_weight(*nodes) == 0) {
161 			kmem_cache_free(policy_cache, policy);
162 			return ERR_PTR(-EINVAL);
163 		}
164 		break;
165 	case MPOL_PREFERRED:
166 		policy->v.preferred_node = first_node(*nodes);
167 		if (policy->v.preferred_node >= MAX_NUMNODES)
168 			policy->v.preferred_node = -1;
169 		break;
170 	case MPOL_BIND:
171 		policy->v.zonelist = bind_zonelist(nodes);
172 		if (policy->v.zonelist == NULL) {
173 			kmem_cache_free(policy_cache, policy);
174 			return ERR_PTR(-ENOMEM);
175 		}
176 		break;
177 	}
178 	policy->policy = mode;
179 	return policy;
180 }
181 
182 /* Check if we are the only process mapping the page in question */
183 static inline int single_mm_mapping(struct mm_struct *mm,
184 			struct address_space *mapping)
185 {
186 	struct vm_area_struct *vma;
187 	struct prio_tree_iter iter;
188 	int rc = 1;
189 
190 	spin_lock(&mapping->i_mmap_lock);
191 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
192 		if (mm != vma->vm_mm) {
193 			rc = 0;
194 			goto out;
195 		}
196 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
197 		if (mm != vma->vm_mm) {
198 			rc = 0;
199 			goto out;
200 		}
201 out:
202 	spin_unlock(&mapping->i_mmap_lock);
203 	return rc;
204 }
205 
206 /*
207  * Add a page to be migrated to the pagelist
208  */
209 static void migrate_page_add(struct vm_area_struct *vma,
210 	struct page *page, struct list_head *pagelist, unsigned long flags)
211 {
212 	/*
213 	 * Avoid migrating a page that is shared by others and not writable.
214 	 */
215 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
216 	    mapping_writably_mapped(page->mapping) ||
217 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
218 		int rc = isolate_lru_page(page);
219 
220 		if (rc == 1)
221 			list_add(&page->lru, pagelist);
222 		/*
223 		 * If the isolate attempt was not successful then we just
224 		 * encountered an unswappable page. Something must be wrong.
225 		 */
226 		WARN_ON(rc == 0);
227 	}
228 }
229 
230 /* Ensure all existing pages follow the policy. */
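/*
 * check_pte_range() is the leaf of a page table walk driven by
 * check_pgd_range() -> check_pud_range() -> check_pmd_range().  For every
 * present, normal page whose node is not in *nodes it either adds the page
 * to the pagelist (when one was passed, i.e. MPOL_MF_MOVE*) or stops early,
 * which the callers turn into -EIO (the MPOL_MF_STRICT case).
 */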
231 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
232 		unsigned long addr, unsigned long end,
233 		const nodemask_t *nodes, unsigned long flags,
234 		struct list_head *pagelist)
235 {
236 	pte_t *orig_pte;
237 	pte_t *pte;
238 	spinlock_t *ptl;
239 
240 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
241 	do {
242 		struct page *page;
243 		unsigned int nid;
244 
245 		if (!pte_present(*pte))
246 			continue;
247 		page = vm_normal_page(vma, addr, *pte);
248 		if (!page)
249 			continue;
250 		nid = page_to_nid(page);
251 		if (!node_isset(nid, *nodes)) {
252 			if (pagelist)
253 				migrate_page_add(vma, page, pagelist, flags);
254 			else
255 				break;
256 		}
257 	} while (pte++, addr += PAGE_SIZE, addr != end);
258 	pte_unmap_unlock(orig_pte, ptl);
259 	return addr != end;
260 }
261 
262 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
263 		unsigned long addr, unsigned long end,
264 		const nodemask_t *nodes, unsigned long flags,
265 		struct list_head *pagelist)
266 {
267 	pmd_t *pmd;
268 	unsigned long next;
269 
270 	pmd = pmd_offset(pud, addr);
271 	do {
272 		next = pmd_addr_end(addr, end);
273 		if (pmd_none_or_clear_bad(pmd))
274 			continue;
275 		if (check_pte_range(vma, pmd, addr, next, nodes,
276 				    flags, pagelist))
277 			return -EIO;
278 	} while (pmd++, addr = next, addr != end);
279 	return 0;
280 }
281 
282 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
283 		unsigned long addr, unsigned long end,
284 		const nodemask_t *nodes, unsigned long flags,
285 		struct list_head *pagelist)
286 {
287 	pud_t *pud;
288 	unsigned long next;
289 
290 	pud = pud_offset(pgd, addr);
291 	do {
292 		next = pud_addr_end(addr, end);
293 		if (pud_none_or_clear_bad(pud))
294 			continue;
295 		if (check_pmd_range(vma, pud, addr, next, nodes,
296 				    flags, pagelist))
297 			return -EIO;
298 	} while (pud++, addr = next, addr != end);
299 	return 0;
300 }
301 
302 static inline int check_pgd_range(struct vm_area_struct *vma,
303 		unsigned long addr, unsigned long end,
304 		const nodemask_t *nodes, unsigned long flags,
305 		struct list_head *pagelist)
306 {
307 	pgd_t *pgd;
308 	unsigned long next;
309 
310 	pgd = pgd_offset(vma->vm_mm, addr);
311 	do {
312 		next = pgd_addr_end(addr, end);
313 		if (pgd_none_or_clear_bad(pgd))
314 			continue;
315 		if (check_pud_range(vma, pgd, addr, next, nodes,
316 				    flags, pagelist))
317 			return -EIO;
318 	} while (pgd++, addr = next, addr != end);
319 	return 0;
320 }
321 
322 /* Check if a vma is migratable */
323 static inline int vma_migratable(struct vm_area_struct *vma)
324 {
325 	if (vma->vm_flags & (
326 		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
327 		return 0;
328 	return 1;
329 }
330 
331 /*
332  * Check if all pages in a range are on a set of nodes.
333  * If pagelist != NULL then isolate pages from the LRU and
334  * put them on the pagelist.
335  */
336 static struct vm_area_struct *
337 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
338 		const nodemask_t *nodes, unsigned long flags,
339 		struct list_head *pagelist)
340 {
341 	int err;
342 	struct vm_area_struct *first, *vma, *prev;
343 
344 	first = find_vma(mm, start);
345 	if (!first)
346 		return ERR_PTR(-EFAULT);
347 	prev = NULL;
348 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
349 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
350 			if (!vma->vm_next && vma->vm_end < end)
351 				return ERR_PTR(-EFAULT);
352 			if (prev && prev->vm_end < vma->vm_start)
353 				return ERR_PTR(-EFAULT);
354 		}
355 		if (!is_vm_hugetlb_page(vma) &&
356 		    ((flags & MPOL_MF_STRICT) ||
357 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
358 				vma_migratable(vma)))) {
359 			unsigned long endvma = vma->vm_end;
360 
361 			if (endvma > end)
362 				endvma = end;
363 			if (vma->vm_start > start)
364 				start = vma->vm_start;
365 			err = check_pgd_range(vma, start, endvma, nodes,
366 						flags, pagelist);
367 			if (err) {
368 				first = ERR_PTR(err);
369 				break;
370 			}
371 		}
372 		prev = vma;
373 	}
374 	return first;
375 }
376 
377 /* Apply policy to a single VMA */
378 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
379 {
380 	int err = 0;
381 	struct mempolicy *old = vma->vm_policy;
382 
383 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
384 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
385 		 vma->vm_ops, vma->vm_file,
386 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
387 
388 	if (vma->vm_ops && vma->vm_ops->set_policy)
389 		err = vma->vm_ops->set_policy(vma, new);
390 	if (!err) {
391 		mpol_get(new);
392 		vma->vm_policy = new;
393 		mpol_free(old);
394 	}
395 	return err;
396 }
397 
398 /* Step 2: apply policy to a range and do splits. */
399 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
400 		       unsigned long end, struct mempolicy *new)
401 {
402 	struct vm_area_struct *next;
403 	int err;
404 
405 	err = 0;
406 	for (; vma && vma->vm_start < end; vma = next) {
407 		next = vma->vm_next;
408 		if (vma->vm_start < start)
409 			err = split_vma(vma->vm_mm, vma, start, 1);
410 		if (!err && vma->vm_end > end)
411 			err = split_vma(vma->vm_mm, vma, end, 0);
412 		if (!err)
413 			err = policy_vma(vma, new);
414 		if (err)
415 			break;
416 	}
417 	return err;
418 }
419 
420 static int contextualize_policy(int mode, nodemask_t *nodes)
421 {
422 	if (!nodes)
423 		return 0;
424 
425 	/* Update current mems_allowed */
426 	cpuset_update_current_mems_allowed();
427 	/* Ignore nodes not set in current->mems_allowed */
428 	cpuset_restrict_to_mems_allowed(nodes->bits);
429 	return mpol_check_policy(mode, nodes);
430 }
431 
432 static int swap_pages(struct list_head *pagelist)
433 {
434 	LIST_HEAD(moved);
435 	LIST_HEAD(failed);
436 	int n;
437 
438 	n = migrate_pages(pagelist, NULL, &moved, &failed);
439 	putback_lru_pages(&failed);
440 	putback_lru_pages(&moved);
441 
442 	return n;
443 }
444 
445 long do_mbind(unsigned long start, unsigned long len,
446 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
447 {
448 	struct vm_area_struct *vma;
449 	struct mm_struct *mm = current->mm;
450 	struct mempolicy *new;
451 	unsigned long end;
452 	int err;
453 	LIST_HEAD(pagelist);
454 
455 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
456 	    || mode > MPOL_MAX)
457 		return -EINVAL;
458 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
459 		return -EPERM;
460 
461 	if (start & ~PAGE_MASK)
462 		return -EINVAL;
463 
464 	if (mode == MPOL_DEFAULT)
465 		flags &= ~MPOL_MF_STRICT;
466 
467 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
468 	end = start + len;
469 
470 	if (end < start)
471 		return -EINVAL;
472 	if (end == start)
473 		return 0;
474 
475 	if (mpol_check_policy(mode, nmask))
476 		return -EINVAL;
477 
478 	new = mpol_new(mode, nmask);
479 	if (IS_ERR(new))
480 		return PTR_ERR(new);
481 
482 	/*
483 	 * If we are using the default policy then operations
484 	 * on discontinuous address spaces are okay after all
485 	 */
486 	if (!new)
487 		flags |= MPOL_MF_DISCONTIG_OK;
488 
489 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
490 			mode, nodes_addr(*nmask)[0]);
491 
492 	down_write(&mm->mmap_sem);
493 	vma = check_range(mm, start, end, nmask, flags,
494 	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
495 	err = PTR_ERR(vma);
496 	if (!IS_ERR(vma)) {
497 		int nr_failed = 0;
498 
499 		err = mbind_range(vma, start, end, new);
500 		if (!list_empty(&pagelist))
501 			nr_failed = swap_pages(&pagelist);
502 
503 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
504 			err = -EIO;
505 	}
506 	if (!list_empty(&pagelist))
507 		putback_lru_pages(&pagelist);
508 
509 	up_write(&mm->mmap_sem);
510 	mpol_free(new);
511 	return err;
512 }
513 
514 /* Set the process memory policy */
515 long do_set_mempolicy(int mode, nodemask_t *nodes)
516 {
517 	struct mempolicy *new;
518 
519 	if (contextualize_policy(mode, nodes))
520 		return -EINVAL;
521 	new = mpol_new(mode, nodes);
522 	if (IS_ERR(new))
523 		return PTR_ERR(new);
524 	mpol_free(current->mempolicy);
525 	current->mempolicy = new;
526 	if (new && new->policy == MPOL_INTERLEAVE)
527 		current->il_next = first_node(new->v.nodes);
528 	return 0;
529 }
530 
531 /* Fill a zone bitmap for a policy */
532 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
533 {
534 	int i;
535 
536 	nodes_clear(*nodes);
537 	switch (p->policy) {
538 	case MPOL_BIND:
539 		for (i = 0; p->v.zonelist->zones[i]; i++)
540 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
541 				*nodes);
542 		break;
543 	case MPOL_DEFAULT:
544 		break;
545 	case MPOL_INTERLEAVE:
546 		*nodes = p->v.nodes;
547 		break;
548 	case MPOL_PREFERRED:
549 		/* or use current node instead of online map? */
550 		if (p->v.preferred_node < 0)
551 			*nodes = node_online_map;
552 		else
553 			node_set(p->v.preferred_node, *nodes);
554 		break;
555 	default:
556 		BUG();
557 	}
558 }
559 
560 static int lookup_node(struct mm_struct *mm, unsigned long addr)
561 {
562 	struct page *p;
563 	int err;
564 
565 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
566 	if (err >= 0) {
567 		err = page_to_nid(p);
568 		put_page(p);
569 	}
570 	return err;
571 }
572 
573 /* Retrieve NUMA policy */
574 long do_get_mempolicy(int *policy, nodemask_t *nmask,
575 			unsigned long addr, unsigned long flags)
576 {
577 	int err;
578 	struct mm_struct *mm = current->mm;
579 	struct vm_area_struct *vma = NULL;
580 	struct mempolicy *pol = current->mempolicy;
581 
582 	cpuset_update_current_mems_allowed();
583 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
584 		return -EINVAL;
585 	if (flags & MPOL_F_ADDR) {
586 		down_read(&mm->mmap_sem);
587 		vma = find_vma_intersection(mm, addr, addr+1);
588 		if (!vma) {
589 			up_read(&mm->mmap_sem);
590 			return -EFAULT;
591 		}
592 		if (vma->vm_ops && vma->vm_ops->get_policy)
593 			pol = vma->vm_ops->get_policy(vma, addr);
594 		else
595 			pol = vma->vm_policy;
596 	} else if (addr)
597 		return -EINVAL;
598 
599 	if (!pol)
600 		pol = &default_policy;
601 
602 	if (flags & MPOL_F_NODE) {
603 		if (flags & MPOL_F_ADDR) {
604 			err = lookup_node(mm, addr);
605 			if (err < 0)
606 				goto out;
607 			*policy = err;
608 		} else if (pol == current->mempolicy &&
609 				pol->policy == MPOL_INTERLEAVE) {
610 			*policy = current->il_next;
611 		} else {
612 			err = -EINVAL;
613 			goto out;
614 		}
615 	} else
616 		*policy = pol->policy;
617 
618 	if (vma) {
619 		up_read(&current->mm->mmap_sem);
620 		vma = NULL;
621 	}
622 
623 	err = 0;
624 	if (nmask)
625 		get_zonemask(pol, nmask);
626 
627  out:
628 	if (vma)
629 		up_read(&current->mm->mmap_sem);
630 	return err;
631 }
632 
633 /*
634  * For now migrate_pages simply swaps out the pages from nodes that are in
635  * the source set but not in the target set. In the future, we would
636  * want a function that moves pages between the two nodesets in such
637  * a way as to preserve the physical layout as much as possible.
638  *
639  * Returns the number of pages that could not be moved.
640  */
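/*
 * Example: from_nodes = {0,1}, to_nodes = {1} gives nodes = {0} and, after
 * nodes_complement(), "every node but 0".  check_range() then isolates
 * exactly the pages that are NOT on that set, i.e. the pages still on
 * node 0, and swap_pages() pushes them out.
 */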
641 int do_migrate_pages(struct mm_struct *mm,
642 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
643 {
644 	LIST_HEAD(pagelist);
645 	int count = 0;
646 	nodemask_t nodes;
647 
648 	nodes_andnot(nodes, *from_nodes, *to_nodes);
649 	nodes_complement(nodes, nodes);
650 
651 	down_read(&mm->mmap_sem);
652 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
653 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
654 
655 	if (!list_empty(&pagelist)) {
656 		count = swap_pages(&pagelist);
657 		putback_lru_pages(&pagelist);
658 	}
659 
660 	up_read(&mm->mmap_sem);
661 	return count;
662 }
663 
664 /*
665  * User space interface with variable sized bitmaps for nodelists.
666  */
667 
668 /* Copy a node mask from user space. */
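/*
 * Example: a caller passing maxnode == 9 describes an 8 bit mask (maxnode
 * is decremented below), so nlongs == 1 and endmask == (1UL << 8) - 1;
 * only bits 0-7 of the copied word are kept.  Any bits beyond what the
 * kernel supports (MAX_NUMNODES) must be zero or -EINVAL is returned.
 */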
669 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
670 		     unsigned long maxnode)
671 {
672 	unsigned long k;
673 	unsigned long nlongs;
674 	unsigned long endmask;
675 
676 	--maxnode;
677 	nodes_clear(*nodes);
678 	if (maxnode == 0 || !nmask)
679 		return 0;
680 
681 	nlongs = BITS_TO_LONGS(maxnode);
682 	if ((maxnode % BITS_PER_LONG) == 0)
683 		endmask = ~0UL;
684 	else
685 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
686 
687 	/* When the user specified more nodes than supported just check
688 	   if the unsupported part is all zero. */
689 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
690 		if (nlongs > PAGE_SIZE/sizeof(long))
691 			return -EINVAL;
692 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
693 			unsigned long t;
694 			if (get_user(t, nmask + k))
695 				return -EFAULT;
696 			if (k == nlongs - 1) {
697 				if (t & endmask)
698 					return -EINVAL;
699 			} else if (t)
700 				return -EINVAL;
701 		}
702 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
703 		endmask = ~0UL;
704 	}
705 
706 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
707 		return -EFAULT;
708 	nodes_addr(*nodes)[nlongs-1] &= endmask;
709 	return 0;
710 }
711 
712 /* Copy a kernel node mask to user space */
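/*
 * Example on a 64 bit kernel with MAX_NUMNODES == 64: nbytes == 8.  A
 * caller asking for maxnode == 1024 needs copy == ALIGN(1023, 64)/8 ==
 * 128 bytes, so the 120 bytes the kernel has no nodes for are cleared in
 * user space and only the first 8 bytes are actually copied.
 */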
713 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
714 			      nodemask_t *nodes)
715 {
716 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
717 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
718 
719 	if (copy > nbytes) {
720 		if (copy > PAGE_SIZE)
721 			return -EINVAL;
722 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
723 			return -EFAULT;
724 		copy = nbytes;
725 	}
726 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
727 }
728 
729 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
730 			unsigned long mode,
731 			unsigned long __user *nmask, unsigned long maxnode,
732 			unsigned flags)
733 {
734 	nodemask_t nodes;
735 	int err;
736 
737 	err = get_nodes(&nodes, nmask, maxnode);
738 	if (err)
739 		return err;
740 	return do_mbind(start, len, mode, &nodes, flags);
741 }
742 
743 /* Set the process memory policy */
744 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
745 		unsigned long maxnode)
746 {
747 	int err;
748 	nodemask_t nodes;
749 
750 	if (mode < 0 || mode > MPOL_MAX)
751 		return -EINVAL;
752 	err = get_nodes(&nodes, nmask, maxnode);
753 	if (err)
754 		return err;
755 	return do_set_mempolicy(mode, &nodes);
756 }
757 
758 /* Macro needed until Paul implements this function in kernel/cpuset.c */
759 #define cpuset_mems_allowed(task) node_online_map
760 
761 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
762 		const unsigned long __user *old_nodes,
763 		const unsigned long __user *new_nodes)
764 {
765 	struct mm_struct *mm;
766 	struct task_struct *task;
767 	nodemask_t old;
768 	nodemask_t new;
769 	nodemask_t task_nodes;
770 	int err;
771 
772 	err = get_nodes(&old, old_nodes, maxnode);
773 	if (err)
774 		return err;
775 
776 	err = get_nodes(&new, new_nodes, maxnode);
777 	if (err)
778 		return err;
779 
780 	/* Find the mm_struct */
781 	read_lock(&tasklist_lock);
782 	task = pid ? find_task_by_pid(pid) : current;
783 	if (!task) {
784 		read_unlock(&tasklist_lock);
785 		return -ESRCH;
786 	}
787 	mm = get_task_mm(task);
788 	read_unlock(&tasklist_lock);
789 
790 	if (!mm)
791 		return -EINVAL;
792 
793 	/*
794 	 * Check if this process has the right to modify the specified
795 	 * process. The right exists if the process has administrative
796 	 * capabilities, superuser privileges or the same
797 	 * userid as the target process.
798 	 */
799 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
800 	    (current->uid != task->suid) && (current->uid != task->uid) &&
801 	    !capable(CAP_SYS_ADMIN)) {
802 		err = -EPERM;
803 		goto out;
804 	}
805 
806 	task_nodes = cpuset_mems_allowed(task);
807 	/* Is the user allowed to access the target nodes? */
808 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
809 		err = -EPERM;
810 		goto out;
811 	}
812 
813 	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
814 out:
815 	mmput(mm);
816 	return err;
817 }
818 
819 
820 /* Retrieve NUMA policy */
821 asmlinkage long sys_get_mempolicy(int __user *policy,
822 				unsigned long __user *nmask,
823 				unsigned long maxnode,
824 				unsigned long addr, unsigned long flags)
825 {
826 	int err, pval;
827 	nodemask_t nodes;
828 
829 	if (nmask != NULL && maxnode < MAX_NUMNODES)
830 		return -EINVAL;
831 
832 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
833 
834 	if (err)
835 		return err;
836 
837 	if (policy && put_user(pval, policy))
838 		return -EFAULT;
839 
840 	if (nmask)
841 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
842 
843 	return err;
844 }
845 
846 #ifdef CONFIG_COMPAT
847 
848 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
849 				     compat_ulong_t __user *nmask,
850 				     compat_ulong_t maxnode,
851 				     compat_ulong_t addr, compat_ulong_t flags)
852 {
853 	long err;
854 	unsigned long __user *nm = NULL;
855 	unsigned long nr_bits, alloc_size;
856 	DECLARE_BITMAP(bm, MAX_NUMNODES);
857 
858 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
859 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
860 
861 	if (nmask)
862 		nm = compat_alloc_user_space(alloc_size);
863 
864 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
865 
866 	if (!err && nmask) {
867 		err = copy_from_user(bm, nm, alloc_size);
868 		/* ensure entire bitmap is zeroed */
869 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
870 		err |= compat_put_bitmap(nmask, bm, nr_bits);
871 	}
872 
873 	return err;
874 }
875 
876 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
877 				     compat_ulong_t maxnode)
878 {
879 	long err = 0;
880 	unsigned long __user *nm = NULL;
881 	unsigned long nr_bits, alloc_size;
882 	DECLARE_BITMAP(bm, MAX_NUMNODES);
883 
884 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
885 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
886 
887 	if (nmask) {
888 		err = compat_get_bitmap(bm, nmask, nr_bits);
889 		nm = compat_alloc_user_space(alloc_size);
890 		err |= copy_to_user(nm, bm, alloc_size);
891 	}
892 
893 	if (err)
894 		return -EFAULT;
895 
896 	return sys_set_mempolicy(mode, nm, nr_bits+1);
897 }
898 
899 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
900 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
901 			     compat_ulong_t maxnode, compat_ulong_t flags)
902 {
903 	long err = 0;
904 	unsigned long __user *nm = NULL;
905 	unsigned long nr_bits, alloc_size;
906 	nodemask_t bm;
907 
908 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
909 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
910 
911 	if (nmask) {
912 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
913 		nm = compat_alloc_user_space(alloc_size);
914 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
915 	}
916 
917 	if (err)
918 		return -EFAULT;
919 
920 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
921 }
922 
923 #endif
924 
925 /* Return effective policy for a VMA */
926 struct mempolicy *
927 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
928 {
929 	struct mempolicy *pol = task->mempolicy;
930 
931 	if (vma) {
932 		if (vma->vm_ops && vma->vm_ops->get_policy)
933 			pol = vma->vm_ops->get_policy(vma, addr);
934 		else if (vma->vm_policy &&
935 				vma->vm_policy->policy != MPOL_DEFAULT)
936 			pol = vma->vm_policy;
937 	}
938 	if (!pol)
939 		pol = &default_policy;
940 	return pol;
941 }
942 
943 /* Return a zonelist representing a mempolicy */
944 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
945 {
946 	int nd;
947 
948 	switch (policy->policy) {
949 	case MPOL_PREFERRED:
950 		nd = policy->v.preferred_node;
951 		if (nd < 0)
952 			nd = numa_node_id();
953 		break;
954 	case MPOL_BIND:
955 		/* Lower zones don't get a policy applied */
956 		/* Careful: current->mems_allowed might have moved */
957 		if (gfp_zone(gfp) >= policy_zone)
958 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
959 				return policy->v.zonelist;
960 		/* FALL THROUGH */
961 	case MPOL_INTERLEAVE: /* should not happen */
962 	case MPOL_DEFAULT:
963 		nd = numa_node_id();
964 		break;
965 	default:
966 		nd = 0;
967 		BUG();
968 	}
969 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
970 }
971 
972 /* Do dynamic interleaving for a process */
973 static unsigned interleave_nodes(struct mempolicy *policy)
974 {
975 	unsigned nid, next;
976 	struct task_struct *me = current;
977 
978 	nid = me->il_next;
979 	next = next_node(nid, policy->v.nodes);
980 	if (next >= MAX_NUMNODES)
981 		next = first_node(policy->v.nodes);
982 	me->il_next = next;
983 	return nid;
984 }
985 
986 /* Do static interleaving for a VMA with known offset. */
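/*
 * Example: with pol->v.nodes = {0,2,3} and off == 7, nnodes == 3 and
 * target == 7 % 3 == 1, so the loop below returns the second set node,
 * i.e. node 2.  A given offset thus always maps to the same node for a
 * given interleave mask.
 */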
987 static unsigned offset_il_node(struct mempolicy *pol,
988 		struct vm_area_struct *vma, unsigned long off)
989 {
990 	unsigned nnodes = nodes_weight(pol->v.nodes);
991 	unsigned target = (unsigned)off % nnodes;
992 	int c;
993 	int nid = -1;
994 
995 	c = 0;
996 	do {
997 		nid = next_node(nid, pol->v.nodes);
998 		c++;
999 	} while (c <= target);
1000 	return nid;
1001 }
1002 
1003 /* Determine a node number for interleave */
1004 static inline unsigned interleave_nid(struct mempolicy *pol,
1005 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1006 {
1007 	if (vma) {
1008 		unsigned long off;
1009 
1010 		off = vma->vm_pgoff;
1011 		off += (addr - vma->vm_start) >> shift;
1012 		return offset_il_node(pol, vma, off);
1013 	} else
1014 		return interleave_nodes(pol);
1015 }
1016 
1017 /* Return a zonelist suitable for a huge page allocation. */
1018 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1019 {
1020 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1021 
1022 	if (pol->policy == MPOL_INTERLEAVE) {
1023 		unsigned nid;
1024 
1025 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1026 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1027 	}
1028 	return zonelist_policy(GFP_HIGHUSER, pol);
1029 }
1030 
1031 /* Allocate a page under the interleave policy.
1032    Own path because it needs to do special accounting. */
1033 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1034 					unsigned nid)
1035 {
1036 	struct zonelist *zl;
1037 	struct page *page;
1038 
1039 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1040 	page = __alloc_pages(gfp, order, zl);
1041 	if (page && page_zone(page) == zl->zones[0]) {
1042 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1043 		put_cpu();
1044 	}
1045 	return page;
1046 }
1047 
1048 /**
1049  * 	alloc_page_vma	- Allocate a page for a VMA.
1050  *
1051  * 	@gfp:
1052  *      %GFP_USER    user allocation.
1053  *      %GFP_KERNEL  kernel allocations,
1054  *      %GFP_HIGHMEM highmem/user allocations,
1055  *      %GFP_FS      allocation should not call back into a file system.
1056  *      %GFP_ATOMIC  don't sleep.
1057  *
1058  * 	@vma:  Pointer to VMA or NULL if not available.
1059  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1060  *
1061  * 	This function allocates a page from the kernel page pool and applies
1062  *	a NUMA policy associated with the VMA or the current process.
1063  *	When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1064  *	mm_struct of the VMA to prevent it from going away. Should be used for
1065  *	all allocations for pages that will be mapped into
1066  * 	user space. Returns NULL when no page can be allocated.
1067  *
1068  *	Should be called with the mmap_sem of the vma held.
1069  */
1070 struct page *
1071 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1072 {
1073 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1074 
1075 	cpuset_update_current_mems_allowed();
1076 
1077 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1078 		unsigned nid;
1079 
1080 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1081 		return alloc_page_interleave(gfp, 0, nid);
1082 	}
1083 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1084 }
1085 
1086 /**
1087  * 	alloc_pages_current - Allocate pages.
1088  *
1089  *	@gfp:
1090  *		%GFP_USER   user allocation,
1091  *      	%GFP_KERNEL kernel allocation,
1092  *      	%GFP_HIGHMEM highmem allocation,
1093  *      	%GFP_FS     don't call back into a file system.
1094  *      	%GFP_ATOMIC don't sleep.
1095  *	@order: Power of two of allocation size in pages. 0 is a single page.
1096  *
1097  *	Allocate a page from the kernel page pool.  When not in
1098  *	interrupt context, apply the current process' NUMA policy.
1099  *	Returns NULL when no page can be allocated.
1100  *
1101  *	Don't call cpuset_update_current_mems_allowed() unless
1102  *	1) it's ok to take cpuset_sem (can WAIT), and
1103  *	2) allocating for current task (not interrupt).
1104  */
1105 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1106 {
1107 	struct mempolicy *pol = current->mempolicy;
1108 
1109 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1110 		cpuset_update_current_mems_allowed();
1111 	if (!pol || in_interrupt())
1112 		pol = &default_policy;
1113 	if (pol->policy == MPOL_INTERLEAVE)
1114 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1115 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1116 }
1117 EXPORT_SYMBOL(alloc_pages_current);
1118 
1119 /* Slow path of a mempolicy copy */
1120 struct mempolicy *__mpol_copy(struct mempolicy *old)
1121 {
1122 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1123 
1124 	if (!new)
1125 		return ERR_PTR(-ENOMEM);
1126 	*new = *old;
1127 	atomic_set(&new->refcnt, 1);
1128 	if (new->policy == MPOL_BIND) {
1129 		int sz = ksize(old->v.zonelist);
1130 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1131 		if (!new->v.zonelist) {
1132 			kmem_cache_free(policy_cache, new);
1133 			return ERR_PTR(-ENOMEM);
1134 		}
1135 		memcpy(new->v.zonelist, old->v.zonelist, sz);
1136 	}
1137 	return new;
1138 }
1139 
1140 /* Slow path of a mempolicy comparison */
1141 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1142 {
1143 	if (!a || !b)
1144 		return 0;
1145 	if (a->policy != b->policy)
1146 		return 0;
1147 	switch (a->policy) {
1148 	case MPOL_DEFAULT:
1149 		return 1;
1150 	case MPOL_INTERLEAVE:
1151 		return nodes_equal(a->v.nodes, b->v.nodes);
1152 	case MPOL_PREFERRED:
1153 		return a->v.preferred_node == b->v.preferred_node;
1154 	case MPOL_BIND: {
1155 		int i;
1156 		for (i = 0; a->v.zonelist->zones[i]; i++)
1157 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1158 				return 0;
1159 		return b->v.zonelist->zones[i] == NULL;
1160 	}
1161 	default:
1162 		BUG();
1163 		return 0;
1164 	}
1165 }
1166 
1167 /* Slow path of a mpol destructor. */
1168 void __mpol_free(struct mempolicy *p)
1169 {
1170 	if (!atomic_dec_and_test(&p->refcnt))
1171 		return;
1172 	if (p->policy == MPOL_BIND)
1173 		kfree(p->v.zonelist);
1174 	p->policy = MPOL_DEFAULT;
1175 	kmem_cache_free(policy_cache, p);
1176 }
1177 
1178 /*
1179  * Shared memory backing store policy support.
1180  *
1181  * Remember policies even when nobody has shared memory mapped.
1182  * The policies are kept in a red-black tree linked from the inode.
1183  * They are protected by the sp->lock spinlock, which should be held
1184  * for any accesses to the tree.
1185  */
1186 
1187 /* lookup first element intersecting start-end */
1188 /* Caller holds sp->lock */
1189 static struct sp_node *
1190 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1191 {
1192 	struct rb_node *n = sp->root.rb_node;
1193 
1194 	while (n) {
1195 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1196 
1197 		if (start >= p->end)
1198 			n = n->rb_right;
1199 		else if (end <= p->start)
1200 			n = n->rb_left;
1201 		else
1202 			break;
1203 	}
1204 	if (!n)
1205 		return NULL;
1206 	for (;;) {
1207 		struct sp_node *w = NULL;
1208 		struct rb_node *prev = rb_prev(n);
1209 		if (!prev)
1210 			break;
1211 		w = rb_entry(prev, struct sp_node, nd);
1212 		if (w->end <= start)
1213 			break;
1214 		n = prev;
1215 	}
1216 	return rb_entry(n, struct sp_node, nd);
1217 }
1218 
1219 /* Insert a new shared policy into the list. */
1220 /* Caller holds sp->lock */
1221 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1222 {
1223 	struct rb_node **p = &sp->root.rb_node;
1224 	struct rb_node *parent = NULL;
1225 	struct sp_node *nd;
1226 
1227 	while (*p) {
1228 		parent = *p;
1229 		nd = rb_entry(parent, struct sp_node, nd);
1230 		if (new->start < nd->start)
1231 			p = &(*p)->rb_left;
1232 		else if (new->end > nd->end)
1233 			p = &(*p)->rb_right;
1234 		else
1235 			BUG();
1236 	}
1237 	rb_link_node(&new->nd, parent, p);
1238 	rb_insert_color(&new->nd, &sp->root);
1239 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1240 		 new->policy ? new->policy->policy : 0);
1241 }
1242 
1243 /* Find shared policy intersecting idx */
1244 struct mempolicy *
1245 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1246 {
1247 	struct mempolicy *pol = NULL;
1248 	struct sp_node *sn;
1249 
1250 	if (!sp->root.rb_node)
1251 		return NULL;
1252 	spin_lock(&sp->lock);
1253 	sn = sp_lookup(sp, idx, idx+1);
1254 	if (sn) {
1255 		mpol_get(sn->policy);
1256 		pol = sn->policy;
1257 	}
1258 	spin_unlock(&sp->lock);
1259 	return pol;
1260 }
1261 
1262 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1263 {
1264 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1265 	rb_erase(&n->nd, &sp->root);
1266 	mpol_free(n->policy);
1267 	kmem_cache_free(sn_cache, n);
1268 }
1269 
1270 struct sp_node *
1271 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1272 {
1273 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1274 
1275 	if (!n)
1276 		return NULL;
1277 	n->start = start;
1278 	n->end = end;
1279 	mpol_get(pol);
1280 	n->policy = pol;
1281 	return n;
1282 }
1283 
1284 /* Replace a policy range. */
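/*
 * For each old entry overlapping [start, end) there are three cases: it
 * lies entirely inside the new range and is deleted; it overlaps only one
 * end and is truncated; or it spans the whole new range, in which case it
 * is split and its tail reinserted as new2 (allocated with sp->lock
 * dropped, hence the restart label).
 */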
1285 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1286 				 unsigned long end, struct sp_node *new)
1287 {
1288 	struct sp_node *n, *new2 = NULL;
1289 
1290 restart:
1291 	spin_lock(&sp->lock);
1292 	n = sp_lookup(sp, start, end);
1293 	/* Take care of old policies in the same range. */
1294 	while (n && n->start < end) {
1295 		struct rb_node *next = rb_next(&n->nd);
1296 		if (n->start >= start) {
1297 			if (n->end <= end)
1298 				sp_delete(sp, n);
1299 			else
1300 				n->start = end;
1301 		} else {
1302 			/* Old policy spanning whole new range. */
1303 			if (n->end > end) {
1304 				if (!new2) {
1305 					spin_unlock(&sp->lock);
1306 					new2 = sp_alloc(end, n->end, n->policy);
1307 					if (!new2)
1308 						return -ENOMEM;
1309 					goto restart;
1310 				}
1311 				n->end = start;
1312 				sp_insert(sp, new2);
1313 				new2 = NULL;
1314 				break;
1315 			} else
1316 				n->end = start;
1317 		}
1318 		if (!next)
1319 			break;
1320 		n = rb_entry(next, struct sp_node, nd);
1321 	}
1322 	if (new)
1323 		sp_insert(sp, new);
1324 	spin_unlock(&sp->lock);
1325 	if (new2) {
1326 		mpol_free(new2->policy);
1327 		kmem_cache_free(sn_cache, new2);
1328 	}
1329 	return 0;
1330 }
1331 
1332 int mpol_set_shared_policy(struct shared_policy *info,
1333 			struct vm_area_struct *vma, struct mempolicy *npol)
1334 {
1335 	int err;
1336 	struct sp_node *new = NULL;
1337 	unsigned long sz = vma_pages(vma);
1338 
1339 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1340 		 vma->vm_pgoff,
1341 		 sz, npol ? npol->policy : -1,
1342 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1343 
1344 	if (npol) {
1345 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1346 		if (!new)
1347 			return -ENOMEM;
1348 	}
1349 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1350 	if (err && new)
1351 		kmem_cache_free(sn_cache, new);
1352 	return err;
1353 }
1354 
1355 /* Free a backing policy store on inode delete. */
1356 void mpol_free_shared_policy(struct shared_policy *p)
1357 {
1358 	struct sp_node *n;
1359 	struct rb_node *next;
1360 
1361 	if (!p->root.rb_node)
1362 		return;
1363 	spin_lock(&p->lock);
1364 	next = rb_first(&p->root);
1365 	while (next) {
1366 		n = rb_entry(next, struct sp_node, nd);
1367 		next = rb_next(&n->nd);
1368 		rb_erase(&n->nd, &p->root);
1369 		mpol_free(n->policy);
1370 		kmem_cache_free(sn_cache, n);
1371 	}
1372 	spin_unlock(&p->lock);
1373 }
1374 
1375 /* assumes fs == KERNEL_DS */
1376 void __init numa_policy_init(void)
1377 {
1378 	policy_cache = kmem_cache_create("numa_policy",
1379 					 sizeof(struct mempolicy),
1380 					 0, SLAB_PANIC, NULL, NULL);
1381 
1382 	sn_cache = kmem_cache_create("shared_policy_node",
1383 				     sizeof(struct sp_node),
1384 				     0, SLAB_PANIC, NULL, NULL);
1385 
1386 	/* Set interleaving policy for system init. This way not all
1387 	   the data structures allocated at system boot end up in node zero. */
1388 
1389 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1390 		printk("numa_policy_init: interleaving failed\n");
1391 }
1392 
1393 /* Reset policy of current process to default */
1394 void numa_default_policy(void)
1395 {
1396 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1397 }
1398 
1399 /* Migrate a policy to a different set of nodes */
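/*
 * Example: an MPOL_INTERLEAVE policy over {0,1} being rebound from
 * old = {0,1} to new = {4,5} ends up interleaving over {4,5}; each node
 * keeps its relative position in the mask (see nodes_remap/node_remap).
 */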
1400 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1401 							const nodemask_t *new)
1402 {
1403 	nodemask_t tmp;
1404 
1405 	if (!pol)
1406 		return;
1407 
1408 	switch (pol->policy) {
1409 	case MPOL_DEFAULT:
1410 		break;
1411 	case MPOL_INTERLEAVE:
1412 		nodes_remap(tmp, pol->v.nodes, *old, *new);
1413 		pol->v.nodes = tmp;
1414 		current->il_next = node_remap(current->il_next, *old, *new);
1415 		break;
1416 	case MPOL_PREFERRED:
1417 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1418 								*old, *new);
1419 		break;
1420 	case MPOL_BIND: {
1421 		nodemask_t nodes;
1422 		struct zone **z;
1423 		struct zonelist *zonelist;
1424 
1425 		nodes_clear(nodes);
1426 		for (z = pol->v.zonelist->zones; *z; z++)
1427 			node_set((*z)->zone_pgdat->node_id, nodes);
1428 		nodes_remap(tmp, nodes, *old, *new);
1429 		nodes = tmp;
1430 
1431 		zonelist = bind_zonelist(&nodes);
1432 
1433 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1434 		 * If that old zonelist has no remaining mems_allowed nodes,
1435 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1436 		 */
1437 
1438 		if (zonelist) {
1439 			/* Good - got mem - substitute new zonelist */
1440 			kfree(pol->v.zonelist);
1441 			pol->v.zonelist = zonelist;
1442 		}
1443 		break;
1444 	}
1445 	default:
1446 		BUG();
1447 		break;
1448 	}
1449 }
1450 
1451 /*
1452  * Someone moved this task to different nodes.  Fixup mempolicies.
1453  *
1454  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1455  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1456  */
1457 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1458 {
1459 	rebind_policy(current->mempolicy, old, new);
1460 }
1461