xref: /openbmc/linux/mm/mempolicy.c (revision 38e35860dbe6197a4b42eb6e8b47da940b7695dd)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same goes for GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
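
/*
 * Illustrative userspace sketch (not part of this file): a process asks
 * for one of the policies above through set_mempolicy(2)/mbind(2). This
 * assumes the libnuma syscall wrappers declared in <numaif.h> and linking
 * with -lnuma.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8) < 0)
 *		perror("set_mempolicy");
 *
 * After this, future anonymous allocations of the task are spread across
 * nodes 0 and 1 by the interleave code below.
 */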
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful with that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/mm.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/mempolicy.h>
86 #include <linux/swap.h>
87 
88 #include <asm/tlbflush.h>
89 #include <asm/uaccess.h>
90 
91 /* Internal flags */
92 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
93 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
94 
95 static kmem_cache_t *policy_cache;
96 static kmem_cache_t *sn_cache;
97 
98 #define PDprintk(fmt...)
99 
100 /* Highest zone. A specific allocation for a zone below that is not
101    policied. */
102 int policy_zone = ZONE_DMA;
103 
104 struct mempolicy default_policy = {
105 	.refcnt = ATOMIC_INIT(1), /* never free it */
106 	.policy = MPOL_DEFAULT,
107 };
108 
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, nodemask_t *nodes)
111 {
112 	int empty = nodes_empty(*nodes);
113 
114 	switch (mode) {
115 	case MPOL_DEFAULT:
116 		if (!empty)
117 			return -EINVAL;
118 		break;
119 	case MPOL_BIND:
120 	case MPOL_INTERLEAVE:
121 		/* Preferred will only use the first bit, but allow
122 		   more for now. */
123 		if (empty)
124 			return -EINVAL;
125 		break;
126 	}
127 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
128 }
129 /* Generate a custom zonelist for the BIND policy. */
130 static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 {
132 	struct zonelist *zl;
133 	int num, max, nd;
134 
135 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
136 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
137 	if (!zl)
138 		return NULL;
139 	num = 0;
140 	for_each_node_mask(nd, *nodes)
141 		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
142 	zl->zones[num] = NULL;
143 	return zl;
144 }
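
/*
 * For example (assuming policy_zone has been raised to ZONE_NORMAL and
 * *nodes == {1,3}), the list built above is
 *
 *	zl->zones[0] = &NODE_DATA(1)->node_zones[ZONE_NORMAL]
 *	zl->zones[1] = &NODE_DATA(3)->node_zones[ZONE_NORMAL]
 *	zl->zones[2] = NULL
 *
 * i.e. one zone per bound node, in node order, with no fallback entries.
 */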
145 
146 /* Create a new policy */
147 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
148 {
149 	struct mempolicy *policy;
150 
151 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
152 	if (mode == MPOL_DEFAULT)
153 		return NULL;
154 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
155 	if (!policy)
156 		return ERR_PTR(-ENOMEM);
157 	atomic_set(&policy->refcnt, 1);
158 	switch (mode) {
159 	case MPOL_INTERLEAVE:
160 		policy->v.nodes = *nodes;
161 		if (nodes_weight(*nodes) == 0) {
162 			kmem_cache_free(policy_cache, policy);
163 			return ERR_PTR(-EINVAL);
164 		}
165 		break;
166 	case MPOL_PREFERRED:
167 		policy->v.preferred_node = first_node(*nodes);
168 		if (policy->v.preferred_node >= MAX_NUMNODES)
169 			policy->v.preferred_node = -1;
170 		break;
171 	case MPOL_BIND:
172 		policy->v.zonelist = bind_zonelist(nodes);
173 		if (policy->v.zonelist == NULL) {
174 			kmem_cache_free(policy_cache, policy);
175 			return ERR_PTR(-ENOMEM);
176 		}
177 		break;
178 	}
179 	policy->policy = mode;
180 	return policy;
181 }
182 
183 /* Check if we are the only process mapping the page in question */
184 static inline int single_mm_mapping(struct mm_struct *mm,
185 			struct address_space *mapping)
186 {
187 	struct vm_area_struct *vma;
188 	struct prio_tree_iter iter;
189 	int rc = 1;
190 
191 	spin_lock(&mapping->i_mmap_lock);
192 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
193 		if (mm != vma->vm_mm) {
194 			rc = 0;
195 			goto out;
196 		}
197 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
198 		if (mm != vma->vm_mm) {
199 			rc = 0;
200 			goto out;
201 		}
202 out:
203 	spin_unlock(&mapping->i_mmap_lock);
204 	return rc;
205 }
206 
207 /*
208  * Add a page to be migrated to the pagelist
209  */
210 static void migrate_page_add(struct vm_area_struct *vma,
211 	struct page *page, struct list_head *pagelist, unsigned long flags)
212 {
213 	/*
214 	 * Avoid migrating a page that is shared by others and not writable.
215 	 */
216 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
217 	    mapping_writably_mapped(page->mapping) ||
218 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
219 		int rc = isolate_lru_page(page);
220 
221 		if (rc == 1)
222 			list_add(&page->lru, pagelist);
223 		/*
224 		 * If the isolate attempt was not successful then we just
225 		 * encountered an unswappable page. Something must be wrong.
226 	 	 */
227 		WARN_ON(rc == 0);
228 	}
229 }
230 
231 /* Scan through pages checking if pages follow certain conditions. */
232 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
233 		unsigned long addr, unsigned long end,
234 		const nodemask_t *nodes, unsigned long flags,
235 		void *private)
236 {
237 	pte_t *orig_pte;
238 	pte_t *pte;
239 	spinlock_t *ptl;
240 
241 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
242 	do {
243 		struct page *page;
244 		unsigned int nid;
245 
246 		if (!pte_present(*pte))
247 			continue;
248 		page = vm_normal_page(vma, addr, *pte);
249 		if (!page)
250 			continue;
251 		nid = page_to_nid(page);
252 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
253 			continue;
254 
255 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 			migrate_page_add(vma, page, private, flags);
257 		else
258 			break;
259 	} while (pte++, addr += PAGE_SIZE, addr != end);
260 	pte_unmap_unlock(orig_pte, ptl);
261 	return addr != end;
262 }
263 
264 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
265 		unsigned long addr, unsigned long end,
266 		const nodemask_t *nodes, unsigned long flags,
267 		void *private)
268 {
269 	pmd_t *pmd;
270 	unsigned long next;
271 
272 	pmd = pmd_offset(pud, addr);
273 	do {
274 		next = pmd_addr_end(addr, end);
275 		if (pmd_none_or_clear_bad(pmd))
276 			continue;
277 		if (check_pte_range(vma, pmd, addr, next, nodes,
278 				    flags, private))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 		unsigned long addr, unsigned long end,
286 		const nodemask_t *nodes, unsigned long flags,
287 		void *private)
288 {
289 	pud_t *pud;
290 	unsigned long next;
291 
292 	pud = pud_offset(pgd, addr);
293 	do {
294 		next = pud_addr_end(addr, end);
295 		if (pud_none_or_clear_bad(pud))
296 			continue;
297 		if (check_pmd_range(vma, pud, addr, next, nodes,
298 				    flags, private))
299 			return -EIO;
300 	} while (pud++, addr = next, addr != end);
301 	return 0;
302 }
303 
304 static inline int check_pgd_range(struct vm_area_struct *vma,
305 		unsigned long addr, unsigned long end,
306 		const nodemask_t *nodes, unsigned long flags,
307 		void *private)
308 {
309 	pgd_t *pgd;
310 	unsigned long next;
311 
312 	pgd = pgd_offset(vma->vm_mm, addr);
313 	do {
314 		next = pgd_addr_end(addr, end);
315 		if (pgd_none_or_clear_bad(pgd))
316 			continue;
317 		if (check_pud_range(vma, pgd, addr, next, nodes,
318 				    flags, private))
319 			return -EIO;
320 	} while (pgd++, addr = next, addr != end);
321 	return 0;
322 }
323 
324 /* Check if a vma is migratable */
325 static inline int vma_migratable(struct vm_area_struct *vma)
326 {
327 	if (vma->vm_flags & (
328 		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
329 		return 0;
330 	return 1;
331 }
332 
333 /*
334  * Check if all pages in a range are on a set of nodes.
335  * If pagelist != NULL then isolate pages from the LRU and
336  * put them on the pagelist.
337  */
338 static struct vm_area_struct *
339 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
340 		const nodemask_t *nodes, unsigned long flags, void *private)
341 {
342 	int err;
343 	struct vm_area_struct *first, *vma, *prev;
344 
345 	first = find_vma(mm, start);
346 	if (!first)
347 		return ERR_PTR(-EFAULT);
348 	prev = NULL;
349 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
350 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
351 			if (!vma->vm_next && vma->vm_end < end)
352 				return ERR_PTR(-EFAULT);
353 			if (prev && prev->vm_end < vma->vm_start)
354 				return ERR_PTR(-EFAULT);
355 		}
356 		if (!is_vm_hugetlb_page(vma) &&
357 		    ((flags & MPOL_MF_STRICT) ||
358 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
359 				vma_migratable(vma)))) {
360 			unsigned long endvma = vma->vm_end;
361 
362 			if (endvma > end)
363 				endvma = end;
364 			if (vma->vm_start > start)
365 				start = vma->vm_start;
366 			err = check_pgd_range(vma, start, endvma, nodes,
367 						flags, private);
368 			if (err) {
369 				first = ERR_PTR(err);
370 				break;
371 			}
372 		}
373 		prev = vma;
374 	}
375 	return first;
376 }
377 
378 /* Apply policy to a single VMA */
379 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
380 {
381 	int err = 0;
382 	struct mempolicy *old = vma->vm_policy;
383 
384 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
385 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
386 		 vma->vm_ops, vma->vm_file,
387 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
388 
389 	if (vma->vm_ops && vma->vm_ops->set_policy)
390 		err = vma->vm_ops->set_policy(vma, new);
391 	if (!err) {
392 		mpol_get(new);
393 		vma->vm_policy = new;
394 		mpol_free(old);
395 	}
396 	return err;
397 }
398 
399 /* Step 2: apply policy to a range and do splits. */
400 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
401 		       unsigned long end, struct mempolicy *new)
402 {
403 	struct vm_area_struct *next;
404 	int err;
405 
406 	err = 0;
407 	for (; vma && vma->vm_start < end; vma = next) {
408 		next = vma->vm_next;
409 		if (vma->vm_start < start)
410 			err = split_vma(vma->vm_mm, vma, start, 1);
411 		if (!err && vma->vm_end > end)
412 			err = split_vma(vma->vm_mm, vma, end, 0);
413 		if (!err)
414 			err = policy_vma(vma, new);
415 		if (err)
416 			break;
417 	}
418 	return err;
419 }
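
/*
 * Example: if a single VMA spans 0x1000-0x9000 and the range being bound
 * is 0x3000-0x5000, the loop above splits the VMA at 0x3000 and again at
 * 0x5000 and installs the new policy only on the middle piece, leaving
 * the outer two pieces with their old policy.
 */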
420 
421 static int contextualize_policy(int mode, nodemask_t *nodes)
422 {
423 	if (!nodes)
424 		return 0;
425 
426 	/* Update current mems_allowed */
427 	cpuset_update_current_mems_allowed();
428 	/* Ignore nodes not set in current->mems_allowed */
429 	cpuset_restrict_to_mems_allowed(nodes->bits);
430 	return mpol_check_policy(mode, nodes);
431 }
432 
433 static int swap_pages(struct list_head *pagelist)
434 {
435 	LIST_HEAD(moved);
436 	LIST_HEAD(failed);
437 	int n;
438 
439 	n = migrate_pages(pagelist, NULL, &moved, &failed);
440 	putback_lru_pages(&failed);
441 	putback_lru_pages(&moved);
442 
443 	return n;
444 }
445 
446 long do_mbind(unsigned long start, unsigned long len,
447 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
448 {
449 	struct vm_area_struct *vma;
450 	struct mm_struct *mm = current->mm;
451 	struct mempolicy *new;
452 	unsigned long end;
453 	int err;
454 	LIST_HEAD(pagelist);
455 
456 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
457 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
458 	    || mode > MPOL_MAX)
459 		return -EINVAL;
460 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
461 		return -EPERM;
462 
463 	if (start & ~PAGE_MASK)
464 		return -EINVAL;
465 
466 	if (mode == MPOL_DEFAULT)
467 		flags &= ~MPOL_MF_STRICT;
468 
469 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
470 	end = start + len;
471 
472 	if (end < start)
473 		return -EINVAL;
474 	if (end == start)
475 		return 0;
476 
477 	if (mpol_check_policy(mode, nmask))
478 		return -EINVAL;
479 
480 	new = mpol_new(mode, nmask);
481 	if (IS_ERR(new))
482 		return PTR_ERR(new);
483 
484 	/*
485 	 * If we are using the default policy then operating
486 	 * on discontinuous address spaces is okay after all.
487 	 */
488 	if (!new)
489 		flags |= MPOL_MF_DISCONTIG_OK;
490 
491 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
492 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
493 
494 	down_write(&mm->mmap_sem);
495 	vma = check_range(mm, start, end, nmask,
496 			  flags | MPOL_MF_INVERT, &pagelist);
497 
498 	err = PTR_ERR(vma);
499 	if (!IS_ERR(vma)) {
500 		int nr_failed = 0;
501 
502 		err = mbind_range(vma, start, end, new);
503 		if (!list_empty(&pagelist))
504 			nr_failed = swap_pages(&pagelist);
505 
506 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
507 			err = -EIO;
508 	}
509 	if (!list_empty(&pagelist))
510 		putback_lru_pages(&pagelist);
511 
512 	up_write(&mm->mmap_sem);
513 	mpol_free(new);
514 	return err;
515 }
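
/*
 * Illustrative userspace counterpart (not part of this file; assumes
 * <sys/mman.h> plus the <numaif.h> wrappers, link with -lnuma): bind a
 * 1 MB anonymous mapping to node 2 and ask that pages which already
 * violate the policy be dealt with (in this revision they are swapped
 * out by swap_pages() above rather than physically moved):
 *
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 2;
 *
 *	if (mbind(p, 1 << 20, MPOL_BIND, &mask, sizeof(mask) * 8,
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT) < 0)
 *		perror("mbind");
 */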
516 
517 /* Set the process memory policy */
518 long do_set_mempolicy(int mode, nodemask_t *nodes)
519 {
520 	struct mempolicy *new;
521 
522 	if (contextualize_policy(mode, nodes))
523 		return -EINVAL;
524 	new = mpol_new(mode, nodes);
525 	if (IS_ERR(new))
526 		return PTR_ERR(new);
527 	mpol_free(current->mempolicy);
528 	current->mempolicy = new;
529 	if (new && new->policy == MPOL_INTERLEAVE)
530 		current->il_next = first_node(new->v.nodes);
531 	return 0;
532 }
533 
534 /* Fill a node mask for a policy */
535 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
536 {
537 	int i;
538 
539 	nodes_clear(*nodes);
540 	switch (p->policy) {
541 	case MPOL_BIND:
542 		for (i = 0; p->v.zonelist->zones[i]; i++)
543 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
544 				*nodes);
545 		break;
546 	case MPOL_DEFAULT:
547 		break;
548 	case MPOL_INTERLEAVE:
549 		*nodes = p->v.nodes;
550 		break;
551 	case MPOL_PREFERRED:
552 		/* or use current node instead of online map? */
553 		if (p->v.preferred_node < 0)
554 			*nodes = node_online_map;
555 		else
556 			node_set(p->v.preferred_node, *nodes);
557 		break;
558 	default:
559 		BUG();
560 	}
561 }
562 
563 static int lookup_node(struct mm_struct *mm, unsigned long addr)
564 {
565 	struct page *p;
566 	int err;
567 
568 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
569 	if (err >= 0) {
570 		err = page_to_nid(p);
571 		put_page(p);
572 	}
573 	return err;
574 }
575 
576 /* Retrieve NUMA policy */
577 long do_get_mempolicy(int *policy, nodemask_t *nmask,
578 			unsigned long addr, unsigned long flags)
579 {
580 	int err;
581 	struct mm_struct *mm = current->mm;
582 	struct vm_area_struct *vma = NULL;
583 	struct mempolicy *pol = current->mempolicy;
584 
585 	cpuset_update_current_mems_allowed();
586 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
587 		return -EINVAL;
588 	if (flags & MPOL_F_ADDR) {
589 		down_read(&mm->mmap_sem);
590 		vma = find_vma_intersection(mm, addr, addr+1);
591 		if (!vma) {
592 			up_read(&mm->mmap_sem);
593 			return -EFAULT;
594 		}
595 		if (vma->vm_ops && vma->vm_ops->get_policy)
596 			pol = vma->vm_ops->get_policy(vma, addr);
597 		else
598 			pol = vma->vm_policy;
599 	} else if (addr)
600 		return -EINVAL;
601 
602 	if (!pol)
603 		pol = &default_policy;
604 
605 	if (flags & MPOL_F_NODE) {
606 		if (flags & MPOL_F_ADDR) {
607 			err = lookup_node(mm, addr);
608 			if (err < 0)
609 				goto out;
610 			*policy = err;
611 		} else if (pol == current->mempolicy &&
612 				pol->policy == MPOL_INTERLEAVE) {
613 			*policy = current->il_next;
614 		} else {
615 			err = -EINVAL;
616 			goto out;
617 		}
618 	} else
619 		*policy = pol->policy;
620 
621 	if (vma) {
622 		up_read(&current->mm->mmap_sem);
623 		vma = NULL;
624 	}
625 
626 	err = 0;
627 	if (nmask)
628 		get_zonemask(pol, nmask);
629 
630  out:
631 	if (vma)
632 		up_read(&current->mm->mmap_sem);
633 	return err;
634 }
635 
636 /*
637  * For now migrate_pages simply swaps out the pages from nodes that are in
638  * the source set but not in the target set. In the future, we would
639  * want a function that moves pages between the two nodesets in such
640  * a way as to preserve the physical layout as much as possible.
641  *
642  * Returns the number of pages that could not be moved.
643  */
644 int do_migrate_pages(struct mm_struct *mm,
645 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
646 {
647 	LIST_HEAD(pagelist);
648 	int count = 0;
649 	nodemask_t nodes;
650 
651 	nodes_andnot(nodes, *from_nodes, *to_nodes);
652 
653 	down_read(&mm->mmap_sem);
654 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
655 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
656 
657 	if (!list_empty(&pagelist)) {
658 		count = swap_pages(&pagelist);
659 		putback_lru_pages(&pagelist);
660 	}
661 
662 	up_read(&mm->mmap_sem);
663 	return count;
664 }
665 
666 /*
667  * User space interface with variable sized bitmaps for nodelists.
668  */
669 
670 /* Copy a node mask from user space. */
671 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
672 		     unsigned long maxnode)
673 {
674 	unsigned long k;
675 	unsigned long nlongs;
676 	unsigned long endmask;
677 
678 	--maxnode;
679 	nodes_clear(*nodes);
680 	if (maxnode == 0 || !nmask)
681 		return 0;
682 
683 	nlongs = BITS_TO_LONGS(maxnode);
684 	if ((maxnode % BITS_PER_LONG) == 0)
685 		endmask = ~0UL;
686 	else
687 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
688 
689 	/* When the user specified more nodes than supported, just check
690 	   that the unsupported part is all zero. */
691 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
692 		if (nlongs > PAGE_SIZE/sizeof(long))
693 			return -EINVAL;
694 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
695 			unsigned long t;
696 			if (get_user(t, nmask + k))
697 				return -EFAULT;
698 			if (k == nlongs - 1) {
699 				if (t & endmask)
700 					return -EINVAL;
701 			} else if (t)
702 				return -EINVAL;
703 		}
704 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
705 		endmask = ~0UL;
706 	}
707 
708 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
709 		return -EFAULT;
710 	nodes_addr(*nodes)[nlongs-1] &= endmask;
711 	return 0;
712 }
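
/*
 * Worked example: on a 64-bit kernel a call with maxnode == 65 describes
 * node bits 0..63. After the --maxnode above, nlongs = BITS_TO_LONGS(64)
 * = 1 and endmask = ~0UL, so exactly one unsigned long is copied from
 * user space and no bits are masked off.
 */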
713 
714 /* Copy a kernel node mask to user space */
715 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
716 			      nodemask_t *nodes)
717 {
718 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
719 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
720 
721 	if (copy > nbytes) {
722 		if (copy > PAGE_SIZE)
723 			return -EINVAL;
724 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
725 			return -EFAULT;
726 		copy = nbytes;
727 	}
728 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
729 }
730 
731 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
732 			unsigned long mode,
733 			unsigned long __user *nmask, unsigned long maxnode,
734 			unsigned flags)
735 {
736 	nodemask_t nodes;
737 	int err;
738 
739 	err = get_nodes(&nodes, nmask, maxnode);
740 	if (err)
741 		return err;
742 	return do_mbind(start, len, mode, &nodes, flags);
743 }
744 
745 /* Set the process memory policy */
746 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
747 		unsigned long maxnode)
748 {
749 	int err;
750 	nodemask_t nodes;
751 
752 	if (mode < 0 || mode > MPOL_MAX)
753 		return -EINVAL;
754 	err = get_nodes(&nodes, nmask, maxnode);
755 	if (err)
756 		return err;
757 	return do_set_mempolicy(mode, &nodes);
758 }
759 
760 /* Macro needed until Paul implements this function in kernel/cpusets.c */
761 #define cpuset_mems_allowed(task) node_online_map
762 
763 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
764 		const unsigned long __user *old_nodes,
765 		const unsigned long __user *new_nodes)
766 {
767 	struct mm_struct *mm;
768 	struct task_struct *task;
769 	nodemask_t old;
770 	nodemask_t new;
771 	nodemask_t task_nodes;
772 	int err;
773 
774 	err = get_nodes(&old, old_nodes, maxnode);
775 	if (err)
776 		return err;
777 
778 	err = get_nodes(&new, new_nodes, maxnode);
779 	if (err)
780 		return err;
781 
782 	/* Find the mm_struct */
783 	read_lock(&tasklist_lock);
784 	task = pid ? find_task_by_pid(pid) : current;
785 	if (!task) {
786 		read_unlock(&tasklist_lock);
787 		return -ESRCH;
788 	}
789 	mm = get_task_mm(task);
790 	read_unlock(&tasklist_lock);
791 
792 	if (!mm)
793 		return -EINVAL;
794 
795 	/*
796 	 * Check if this process has the right to modify the specified
797 	 * process. The right exists if the process has administrative
798  * capabilities, superuser privileges or the same
799 	 * userid as the target process.
800 	 */
801 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
802 	    (current->uid != task->suid) && (current->uid != task->uid) &&
803 	    !capable(CAP_SYS_ADMIN)) {
804 		err = -EPERM;
805 		goto out;
806 	}
807 
808 	task_nodes = cpuset_mems_allowed(task);
809 	/* Is the user allowed to access the target nodes? */
810 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
811 		err = -EPERM;
812 		goto out;
813 	}
814 
815 	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
816 out:
817 	mmput(mm);
818 	return err;
819 }
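
/*
 * Illustrative userspace call (not part of this file; assumes the
 * <numaif.h> wrapper, link with -lnuma): migrate the caller's own pages
 * (pid == 0) that sit on node 0 but not on node 1. In this revision those
 * pages are swapped out rather than copied, as the comment above
 * do_migrate_pages() notes.
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *
 *	if (migrate_pages(0, sizeof(from) * 8, &from, &to) < 0)
 *		perror("migrate_pages");
 */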
820 
821 
822 /* Retrieve NUMA policy */
823 asmlinkage long sys_get_mempolicy(int __user *policy,
824 				unsigned long __user *nmask,
825 				unsigned long maxnode,
826 				unsigned long addr, unsigned long flags)
827 {
828 	int err, pval;
829 	nodemask_t nodes;
830 
831 	if (nmask != NULL && maxnode < MAX_NUMNODES)
832 		return -EINVAL;
833 
834 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
835 
836 	if (err)
837 		return err;
838 
839 	if (policy && put_user(pval, policy))
840 		return -EFAULT;
841 
842 	if (nmask)
843 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
844 
845 	return err;
846 }
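
/*
 * Illustrative userspace query (not part of this file; assumes the
 * <numaif.h> wrapper, link with -lnuma): find out which node currently
 * backs the page at a mapped address p. lookup_node() above will fault
 * the page in via get_user_pages() if it is not present yet.
 *
 *	int node = -1;
 *
 *	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR) < 0)
 *		perror("get_mempolicy");
 */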
847 
848 #ifdef CONFIG_COMPAT
849 
850 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
851 				     compat_ulong_t __user *nmask,
852 				     compat_ulong_t maxnode,
853 				     compat_ulong_t addr, compat_ulong_t flags)
854 {
855 	long err;
856 	unsigned long __user *nm = NULL;
857 	unsigned long nr_bits, alloc_size;
858 	DECLARE_BITMAP(bm, MAX_NUMNODES);
859 
860 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
861 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
862 
863 	if (nmask)
864 		nm = compat_alloc_user_space(alloc_size);
865 
866 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
867 
868 	if (!err && nmask) {
869 		err = copy_from_user(bm, nm, alloc_size);
870 		/* ensure entire bitmap is zeroed */
871 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
872 		err |= compat_put_bitmap(nmask, bm, nr_bits);
873 	}
874 
875 	return err;
876 }
877 
878 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
879 				     compat_ulong_t maxnode)
880 {
881 	long err = 0;
882 	unsigned long __user *nm = NULL;
883 	unsigned long nr_bits, alloc_size;
884 	DECLARE_BITMAP(bm, MAX_NUMNODES);
885 
886 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
887 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
888 
889 	if (nmask) {
890 		err = compat_get_bitmap(bm, nmask, nr_bits);
891 		nm = compat_alloc_user_space(alloc_size);
892 		err |= copy_to_user(nm, bm, alloc_size);
893 	}
894 
895 	if (err)
896 		return -EFAULT;
897 
898 	return sys_set_mempolicy(mode, nm, nr_bits+1);
899 }
900 
901 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
902 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
903 			     compat_ulong_t maxnode, compat_ulong_t flags)
904 {
905 	long err = 0;
906 	unsigned long __user *nm = NULL;
907 	unsigned long nr_bits, alloc_size;
908 	nodemask_t bm;
909 
910 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
911 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
912 
913 	if (nmask) {
914 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
915 		nm = compat_alloc_user_space(alloc_size);
916 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
917 	}
918 
919 	if (err)
920 		return -EFAULT;
921 
922 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
923 }
924 
925 #endif
926 
927 /* Return effective policy for a VMA */
928 struct mempolicy *
929 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
930 {
931 	struct mempolicy *pol = task->mempolicy;
932 
933 	if (vma) {
934 		if (vma->vm_ops && vma->vm_ops->get_policy)
935 			pol = vma->vm_ops->get_policy(vma, addr);
936 		else if (vma->vm_policy &&
937 				vma->vm_policy->policy != MPOL_DEFAULT)
938 			pol = vma->vm_policy;
939 	}
940 	if (!pol)
941 		pol = &default_policy;
942 	return pol;
943 }
944 
945 /* Return a zonelist representing a mempolicy */
946 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
947 {
948 	int nd;
949 
950 	switch (policy->policy) {
951 	case MPOL_PREFERRED:
952 		nd = policy->v.preferred_node;
953 		if (nd < 0)
954 			nd = numa_node_id();
955 		break;
956 	case MPOL_BIND:
957 		/* Lower zones don't get a policy applied */
958 		/* Careful: current->mems_allowed might have moved */
959 		if (gfp_zone(gfp) >= policy_zone)
960 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
961 				return policy->v.zonelist;
962 		/*FALL THROUGH*/
963 	case MPOL_INTERLEAVE: /* should not happen */
964 	case MPOL_DEFAULT:
965 		nd = numa_node_id();
966 		break;
967 	default:
968 		nd = 0;
969 		BUG();
970 	}
971 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
972 }
973 
974 /* Do dynamic interleaving for a process */
975 static unsigned interleave_nodes(struct mempolicy *policy)
976 {
977 	unsigned nid, next;
978 	struct task_struct *me = current;
979 
980 	nid = me->il_next;
981 	next = next_node(nid, policy->v.nodes);
982 	if (next >= MAX_NUMNODES)
983 		next = first_node(policy->v.nodes);
984 	me->il_next = next;
985 	return nid;
986 }
987 
988 /* Do static interleaving for a VMA with known offset. */
989 static unsigned offset_il_node(struct mempolicy *pol,
990 		struct vm_area_struct *vma, unsigned long off)
991 {
992 	unsigned nnodes = nodes_weight(pol->v.nodes);
993 	unsigned target = (unsigned)off % nnodes;
994 	int c;
995 	int nid = -1;
996 
997 	c = 0;
998 	do {
999 		nid = next_node(nid, pol->v.nodes);
1000 		c++;
1001 	} while (c <= target);
1002 	return nid;
1003 }
1004 
1005 /* Determine a node number for interleave */
1006 static inline unsigned interleave_nid(struct mempolicy *pol,
1007 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1008 {
1009 	if (vma) {
1010 		unsigned long off;
1011 
1012 		off = vma->vm_pgoff;
1013 		off += (addr - vma->vm_start) >> shift;
1014 		return offset_il_node(pol, vma, off);
1015 	} else
1016 		return interleave_nodes(pol);
1017 }
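
/*
 * Worked example: with an interleave mask of {0,1,2,3}, vm_pgoff == 0 and
 * shift == PAGE_SHIFT, the page at vma->vm_start + 5 * PAGE_SIZE gives
 * off == 5, target == 5 % 4 == 1, and offset_il_node() steps twice
 * through the mask, ending on node 1. Because the result depends only on
 * the offset, the same page always interleaves to the same node.
 */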
1018 
1019 /* Return a zonelist suitable for a huge page allocation. */
1020 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1021 {
1022 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1023 
1024 	if (pol->policy == MPOL_INTERLEAVE) {
1025 		unsigned nid;
1026 
1027 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1028 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1029 	}
1030 	return zonelist_policy(GFP_HIGHUSER, pol);
1031 }
1032 
1033 /* Allocate a page in interleaved policy.
1034    Own path because it needs to do special accounting. */
1035 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1036 					unsigned nid)
1037 {
1038 	struct zonelist *zl;
1039 	struct page *page;
1040 
1041 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1042 	page = __alloc_pages(gfp, order, zl);
1043 	if (page && page_zone(page) == zl->zones[0]) {
1044 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1045 		put_cpu();
1046 	}
1047 	return page;
1048 }
1049 
1050 /**
1051  * 	alloc_page_vma	- Allocate a page for a VMA.
1052  *
1053  * 	@gfp:
1054  *      %GFP_USER    user allocation.
1055  *      %GFP_KERNEL  kernel allocations,
1056  *      %GFP_HIGHMEM highmem/user allocations,
1057  *      %GFP_FS      allocation should not call back into a file system.
1058  *      %GFP_ATOMIC  don't sleep.
1059  *
1060  * 	@vma:  Pointer to VMA or NULL if not available.
1061  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1062  *
1063  * 	This function allocates a page from the kernel page pool and applies
1064  *	a NUMA policy associated with the VMA or the current process.
1065  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1066  *	mm_struct of the VMA to prevent it from going away. Should be used for
1067  *	all allocations for pages that will be mapped into
1068  * 	user space. Returns NULL when no page can be allocated.
1069  *
1070  *	Should be called with the mmap_sem of the vma's mm held.
1071  */
1072 struct page *
1073 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1074 {
1075 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1076 
1077 	cpuset_update_current_mems_allowed();
1078 
1079 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1080 		unsigned nid;
1081 
1082 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1083 		return alloc_page_interleave(gfp, 0, nid);
1084 	}
1085 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1086 }
1087 
1088 /**
1089  * 	alloc_pages_current - Allocate pages.
1090  *
1091  *	@gfp:
1092  *		%GFP_USER   user allocation,
1093  *      	%GFP_KERNEL kernel allocation,
1094  *      	%GFP_HIGHMEM highmem allocation,
1095  *      	%GFP_FS     don't call back into a file system.
1096  *      	%GFP_ATOMIC don't sleep.
1097  *	@order: Power of two of allocation size in pages. 0 is a single page.
1098  *
1099  *	Allocate a page from the kernel page pool. When not in interrupt
1100  *	context, the current process' NUMA policy is applied.
1101  *	Returns NULL when no page can be allocated.
1102  *
1103  *	Don't call cpuset_update_current_mems_allowed() unless
1104  *	1) it's ok to take cpuset_sem (can WAIT), and
1105  *	2) allocating for current task (not interrupt).
1106  */
1107 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1108 {
1109 	struct mempolicy *pol = current->mempolicy;
1110 
1111 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1112 		cpuset_update_current_mems_allowed();
1113 	if (!pol || in_interrupt())
1114 		pol = &default_policy;
1115 	if (pol->policy == MPOL_INTERLEAVE)
1116 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1117 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1118 }
1119 EXPORT_SYMBOL(alloc_pages_current);
1120 
1121 /* Slow path of a mempolicy copy */
1122 struct mempolicy *__mpol_copy(struct mempolicy *old)
1123 {
1124 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1125 
1126 	if (!new)
1127 		return ERR_PTR(-ENOMEM);
1128 	*new = *old;
1129 	atomic_set(&new->refcnt, 1);
1130 	if (new->policy == MPOL_BIND) {
1131 		int sz = ksize(old->v.zonelist);
1132 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1133 		if (!new->v.zonelist) {
1134 			kmem_cache_free(policy_cache, new);
1135 			return ERR_PTR(-ENOMEM);
1136 		}
1137 		memcpy(new->v.zonelist, old->v.zonelist, sz);
1138 	}
1139 	return new;
1140 }
1141 
1142 /* Slow path of a mempolicy comparison */
1143 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1144 {
1145 	if (!a || !b)
1146 		return 0;
1147 	if (a->policy != b->policy)
1148 		return 0;
1149 	switch (a->policy) {
1150 	case MPOL_DEFAULT:
1151 		return 1;
1152 	case MPOL_INTERLEAVE:
1153 		return nodes_equal(a->v.nodes, b->v.nodes);
1154 	case MPOL_PREFERRED:
1155 		return a->v.preferred_node == b->v.preferred_node;
1156 	case MPOL_BIND: {
1157 		int i;
1158 		for (i = 0; a->v.zonelist->zones[i]; i++)
1159 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1160 				return 0;
1161 		return b->v.zonelist->zones[i] == NULL;
1162 	}
1163 	default:
1164 		BUG();
1165 		return 0;
1166 	}
1167 }
1168 
1169 /* Slow path of a mpol destructor. */
1170 void __mpol_free(struct mempolicy *p)
1171 {
1172 	if (!atomic_dec_and_test(&p->refcnt))
1173 		return;
1174 	if (p->policy == MPOL_BIND)
1175 		kfree(p->v.zonelist);
1176 	p->policy = MPOL_DEFAULT;
1177 	kmem_cache_free(policy_cache, p);
1178 }
1179 
1180 /*
1181  * Shared memory backing store policy support.
1182  *
1183  * Remember policies even when nobody has shared memory mapped.
1184  * The policies are kept in a red-black tree linked from the inode.
1185  * They are protected by the sp->lock spinlock, which should be held
1186  * for any accesses to the tree.
1187  */
1188 
1189 /* lookup first element intersecting start-end */
1190 /* Caller holds sp->lock */
1191 static struct sp_node *
1192 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1193 {
1194 	struct rb_node *n = sp->root.rb_node;
1195 
1196 	while (n) {
1197 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1198 
1199 		if (start >= p->end)
1200 			n = n->rb_right;
1201 		else if (end <= p->start)
1202 			n = n->rb_left;
1203 		else
1204 			break;
1205 	}
1206 	if (!n)
1207 		return NULL;
1208 	for (;;) {
1209 		struct sp_node *w = NULL;
1210 		struct rb_node *prev = rb_prev(n);
1211 		if (!prev)
1212 			break;
1213 		w = rb_entry(prev, struct sp_node, nd);
1214 		if (w->end <= start)
1215 			break;
1216 		n = prev;
1217 	}
1218 	return rb_entry(n, struct sp_node, nd);
1219 }
1220 
1221 /* Insert a new shared policy into the list. */
1222 /* Caller holds sp->lock */
1223 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1224 {
1225 	struct rb_node **p = &sp->root.rb_node;
1226 	struct rb_node *parent = NULL;
1227 	struct sp_node *nd;
1228 
1229 	while (*p) {
1230 		parent = *p;
1231 		nd = rb_entry(parent, struct sp_node, nd);
1232 		if (new->start < nd->start)
1233 			p = &(*p)->rb_left;
1234 		else if (new->end > nd->end)
1235 			p = &(*p)->rb_right;
1236 		else
1237 			BUG();
1238 	}
1239 	rb_link_node(&new->nd, parent, p);
1240 	rb_insert_color(&new->nd, &sp->root);
1241 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1242 		 new->policy ? new->policy->policy : 0);
1243 }
1244 
1245 /* Find shared policy intersecting idx */
1246 struct mempolicy *
1247 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1248 {
1249 	struct mempolicy *pol = NULL;
1250 	struct sp_node *sn;
1251 
1252 	if (!sp->root.rb_node)
1253 		return NULL;
1254 	spin_lock(&sp->lock);
1255 	sn = sp_lookup(sp, idx, idx+1);
1256 	if (sn) {
1257 		mpol_get(sn->policy);
1258 		pol = sn->policy;
1259 	}
1260 	spin_unlock(&sp->lock);
1261 	return pol;
1262 }
1263 
1264 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1265 {
1266 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1267 	rb_erase(&n->nd, &sp->root);
1268 	mpol_free(n->policy);
1269 	kmem_cache_free(sn_cache, n);
1270 }
1271 
1272 struct sp_node *
1273 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1274 {
1275 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1276 
1277 	if (!n)
1278 		return NULL;
1279 	n->start = start;
1280 	n->end = end;
1281 	mpol_get(pol);
1282 	n->policy = pol;
1283 	return n;
1284 }
1285 
1286 /* Replace a policy range. */
1287 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1288 				 unsigned long end, struct sp_node *new)
1289 {
1290 	struct sp_node *n, *new2 = NULL;
1291 
1292 restart:
1293 	spin_lock(&sp->lock);
1294 	n = sp_lookup(sp, start, end);
1295 	/* Take care of old policies in the same range. */
1296 	while (n && n->start < end) {
1297 		struct rb_node *next = rb_next(&n->nd);
1298 		if (n->start >= start) {
1299 			if (n->end <= end)
1300 				sp_delete(sp, n);
1301 			else
1302 				n->start = end;
1303 		} else {
1304 			/* Old policy spanning whole new range. */
1305 			if (n->end > end) {
1306 				if (!new2) {
1307 					spin_unlock(&sp->lock);
1308 					new2 = sp_alloc(end, n->end, n->policy);
1309 					if (!new2)
1310 						return -ENOMEM;
1311 					goto restart;
1312 				}
1313 				n->end = start;
1314 				sp_insert(sp, new2);
1315 				new2 = NULL;
1316 				break;
1317 			} else
1318 				n->end = start;
1319 		}
1320 		if (!next)
1321 			break;
1322 		n = rb_entry(next, struct sp_node, nd);
1323 	}
1324 	if (new)
1325 		sp_insert(sp, new);
1326 	spin_unlock(&sp->lock);
1327 	if (new2) {
1328 		mpol_free(new2->policy);
1329 		kmem_cache_free(sn_cache, new2);
1330 	}
1331 	return 0;
1332 }
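
/*
 * Example: if the tree holds one node covering pgoff [0,100) with policy
 * A and a new node for [40,60) with policy B is installed, the old node
 * is truncated to [0,40), the preallocated copy (new2) is inserted for
 * [60,100), and B ends up covering exactly [40,60).
 */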
1333 
1334 int mpol_set_shared_policy(struct shared_policy *info,
1335 			struct vm_area_struct *vma, struct mempolicy *npol)
1336 {
1337 	int err;
1338 	struct sp_node *new = NULL;
1339 	unsigned long sz = vma_pages(vma);
1340 
1341 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1342 		 vma->vm_pgoff,
1343 		 sz, npol? npol->policy : -1,
1344 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1345 
1346 	if (npol) {
1347 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1348 		if (!new)
1349 			return -ENOMEM;
1350 	}
1351 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1352 	if (err && new)
1353 		kmem_cache_free(sn_cache, new);
1354 	return err;
1355 }
1356 
1357 /* Free a backing policy store on inode delete. */
1358 void mpol_free_shared_policy(struct shared_policy *p)
1359 {
1360 	struct sp_node *n;
1361 	struct rb_node *next;
1362 
1363 	if (!p->root.rb_node)
1364 		return;
1365 	spin_lock(&p->lock);
1366 	next = rb_first(&p->root);
1367 	while (next) {
1368 		n = rb_entry(next, struct sp_node, nd);
1369 		next = rb_next(&n->nd);
1370 		rb_erase(&n->nd, &p->root);
1371 		mpol_free(n->policy);
1372 		kmem_cache_free(sn_cache, n);
1373 	}
1374 	spin_unlock(&p->lock);
1375 }
1376 
1377 /* assumes fs == KERNEL_DS */
1378 void __init numa_policy_init(void)
1379 {
1380 	policy_cache = kmem_cache_create("numa_policy",
1381 					 sizeof(struct mempolicy),
1382 					 0, SLAB_PANIC, NULL, NULL);
1383 
1384 	sn_cache = kmem_cache_create("shared_policy_node",
1385 				     sizeof(struct sp_node),
1386 				     0, SLAB_PANIC, NULL, NULL);
1387 
1388 	/* Set interleaving policy for system init. This way not all
1389 	   the data structures allocated at system boot end up on node zero. */
1390 
1391 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1392 		printk("numa_policy_init: interleaving failed\n");
1393 }
1394 
1395 /* Reset policy of current process to default */
1396 void numa_default_policy(void)
1397 {
1398 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1399 }
1400 
1401 /* Migrate a policy to a different set of nodes */
1402 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1403 							const nodemask_t *new)
1404 {
1405 	nodemask_t tmp;
1406 
1407 	if (!pol)
1408 		return;
1409 
1410 	switch (pol->policy) {
1411 	case MPOL_DEFAULT:
1412 		break;
1413 	case MPOL_INTERLEAVE:
1414 		nodes_remap(tmp, pol->v.nodes, *old, *new);
1415 		pol->v.nodes = tmp;
1416 		current->il_next = node_remap(current->il_next, *old, *new);
1417 		break;
1418 	case MPOL_PREFERRED:
1419 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1420 								*old, *new);
1421 		break;
1422 	case MPOL_BIND: {
1423 		nodemask_t nodes;
1424 		struct zone **z;
1425 		struct zonelist *zonelist;
1426 
1427 		nodes_clear(nodes);
1428 		for (z = pol->v.zonelist->zones; *z; z++)
1429 			node_set((*z)->zone_pgdat->node_id, nodes);
1430 		nodes_remap(tmp, nodes, *old, *new);
1431 		nodes = tmp;
1432 
1433 		zonelist = bind_zonelist(&nodes);
1434 
1435 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1436 		 * If that old zonelist has no remaining mems_allowed nodes,
1437 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1438 		 */
1439 
1440 		if (zonelist) {
1441 			/* Good - got mem - substitute new zonelist */
1442 			kfree(pol->v.zonelist);
1443 			pol->v.zonelist = zonelist;
1444 		}
1445 		break;
1446 	}
1447 	default:
1448 		BUG();
1449 		break;
1450 	}
1451 }
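
/*
 * Example: rebinding from *old == {0,1} to *new == {2,3} turns an
 * interleave mask of {0,1} into {2,3} and a preferred node of 1 into 3,
 * since node_remap() maps the n-th set bit of the old mask onto the n-th
 * set bit of the new one.
 */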
1452 
1453 /*
1454  * Someone moved this task to different nodes.  Fixup mempolicies.
1455  *
1456  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1457  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1458  */
1459 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1460 {
1461 	rebind_policy(current->mempolicy, old, new);
1462 }
1463