1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
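/*
 * For illustration only, a minimal userspace sketch of how these policies
 * are requested (assumes libnuma's <numaif.h> syscall wrappers and a
 * hypothetical mapping at addr/length; not code from this file):
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave all future process allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	// restrict one existing mapping to node 0 only
 *	unsigned long bind = 1UL << 0;
 *	mbind(addr, length, MPOL_BIND, &bind, 8 * sizeof(bind), 0);
 */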
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/gfp.h>
77 #include <linux/slab.h>
78 #include <linux/string.h>
79 #include <linux/module.h>
80 #include <linux/nsproxy.h>
81 #include <linux/interrupt.h>
82 #include <linux/init.h>
83 #include <linux/compat.h>
84 #include <linux/swap.h>
85 #include <linux/seq_file.h>
86 #include <linux/proc_fs.h>
87 #include <linux/migrate.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 #include "internal.h"
97 
98 /* Internal flags */
99 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
100 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
101 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
102 
103 static struct kmem_cache *policy_cache;
104 static struct kmem_cache *sn_cache;
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 enum zone_type policy_zone = 0;
109 
110 /*
111  * run-time system-wide default policy => local allocation
112  */
113 struct mempolicy default_policy = {
114 	.refcnt = ATOMIC_INIT(1), /* never free it */
115 	.mode = MPOL_PREFERRED,
116 	.flags = MPOL_F_LOCAL,
117 };
118 
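/*
 * Per-mode operations, filled in by the mpol_ops[] definition further down:
 * ->create() validates and installs the nodemask for a newly built policy,
 * and ->rebind() remaps an existing policy when the set of allowed nodes
 * changes (e.g. on a cpuset update).
 */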
119 static const struct mempolicy_operations {
120 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122 } mpol_ops[MPOL_MAX];
123 
124 /* Check that the nodemask contains at least one populated zone */
125 static int is_valid_nodemask(const nodemask_t *nodemask)
126 {
127 	int nd, k;
128 
129 	/* Check that there is something useful in this mask */
130 	k = policy_zone;
131 
132 	for_each_node_mask(nd, *nodemask) {
133 		struct zone *z;
134 
135 		for (k = 0; k <= policy_zone; k++) {
136 			z = &NODE_DATA(nd)->node_zones[k];
137 			if (z->present_pages > 0)
138 				return 1;
139 		}
140 	}
141 
142 	return 0;
143 }
144 
145 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146 {
147 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148 }
149 
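/*
 * For MPOL_F_RELATIVE_NODES the user's nodemask is interpreted relative to
 * the currently allowed nodes: nodes_fold() wraps the user's bits modulo the
 * number of allowed nodes and nodes_onto() maps those positions onto the
 * actual allowed node IDs.  E.g. (illustrative) a user mask of {0,2}
 * relative to allowed nodes {4,5,6} yields {4,6}.
 */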
150 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151 				   const nodemask_t *rel)
152 {
153 	nodemask_t tmp;
154 	nodes_fold(tmp, *orig, nodes_weight(*rel));
155 	nodes_onto(*ret, tmp, *rel);
156 }
157 
158 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159 {
160 	if (nodes_empty(*nodes))
161 		return -EINVAL;
162 	pol->v.nodes = *nodes;
163 	return 0;
164 }
165 
166 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167 {
168 	if (!nodes)
169 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
170 	else if (nodes_empty(*nodes))
171 		return -EINVAL;			/*  no allowed nodes */
172 	else
173 		pol->v.preferred_node = first_node(*nodes);
174 	return 0;
175 }
176 
177 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (!is_valid_nodemask(nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 /*
186  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187  * any, for the new policy.  mpol_new() has already validated the nodes
188  * parameter with respect to the policy mode and flags.  But, we need to
189  * handle an empty nodemask with MPOL_PREFERRED here.
190  *
191  * Must be called holding task's alloc_lock to protect task's mems_allowed
192  * and mempolicy.  May also be called holding the mmap_semaphore for write.
193  */
194 static int mpol_set_nodemask(struct mempolicy *pol,
195 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
196 {
197 	int ret;
198 
199 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 	if (pol == NULL)
201 		return 0;
202 	/* Check N_HIGH_MEMORY */
203 	nodes_and(nsc->mask1,
204 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
205 
206 	VM_BUG_ON(!nodes);
207 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
208 		nodes = NULL;	/* explicit local allocation */
209 	else {
210 		if (pol->flags & MPOL_F_RELATIVE_NODES)
211 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
212 		else
213 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
214 
215 		if (mpol_store_user_nodemask(pol))
216 			pol->w.user_nodemask = *nodes;
217 		else
218 			pol->w.cpuset_mems_allowed =
219 						cpuset_current_mems_allowed;
220 	}
221 
222 	if (nodes)
223 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 	else
225 		ret = mpol_ops[pol->mode].create(pol, NULL);
226 	return ret;
227 }
228 
229 /*
230  * This function just creates a new policy, does some checks and simple
231  * initialization. You must invoke mpol_set_nodemask() to set nodes.
232  */
233 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
234 				  nodemask_t *nodes)
235 {
236 	struct mempolicy *policy;
237 
238 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
239 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
240 
241 	if (mode == MPOL_DEFAULT) {
242 		if (nodes && !nodes_empty(*nodes))
243 			return ERR_PTR(-EINVAL);
244 		return NULL;	/* simply delete any existing policy */
245 	}
246 	VM_BUG_ON(!nodes);
247 
248 	/*
249 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
250 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
251 	 * All other modes require a valid pointer to a non-empty nodemask.
252 	 */
253 	if (mode == MPOL_PREFERRED) {
254 		if (nodes_empty(*nodes)) {
255 			if (((flags & MPOL_F_STATIC_NODES) ||
256 			     (flags & MPOL_F_RELATIVE_NODES)))
257 				return ERR_PTR(-EINVAL);
258 		}
259 	} else if (nodes_empty(*nodes))
260 		return ERR_PTR(-EINVAL);
261 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
262 	if (!policy)
263 		return ERR_PTR(-ENOMEM);
264 	atomic_set(&policy->refcnt, 1);
265 	policy->mode = mode;
266 	policy->flags = flags;
267 
268 	return policy;
269 }
270 
271 /* Slow path of a mpol destructor. */
272 void __mpol_put(struct mempolicy *p)
273 {
274 	if (!atomic_dec_and_test(&p->refcnt))
275 		return;
276 	kmem_cache_free(policy_cache, p);
277 }
278 
279 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
280 {
281 }
282 
283 static void mpol_rebind_nodemask(struct mempolicy *pol,
284 				 const nodemask_t *nodes)
285 {
286 	nodemask_t tmp;
287 
288 	if (pol->flags & MPOL_F_STATIC_NODES)
289 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
290 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
291 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
292 	else {
293 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
294 			    *nodes);
295 		pol->w.cpuset_mems_allowed = *nodes;
296 	}
297 
298 	pol->v.nodes = tmp;
299 	if (!node_isset(current->il_next, tmp)) {
300 		current->il_next = next_node(current->il_next, tmp);
301 		if (current->il_next >= MAX_NUMNODES)
302 			current->il_next = first_node(tmp);
303 		if (current->il_next >= MAX_NUMNODES)
304 			current->il_next = numa_node_id();
305 	}
306 }
307 
308 static void mpol_rebind_preferred(struct mempolicy *pol,
309 				  const nodemask_t *nodes)
310 {
311 	nodemask_t tmp;
312 
313 	if (pol->flags & MPOL_F_STATIC_NODES) {
314 		int node = first_node(pol->w.user_nodemask);
315 
316 		if (node_isset(node, *nodes)) {
317 			pol->v.preferred_node = node;
318 			pol->flags &= ~MPOL_F_LOCAL;
319 		} else
320 			pol->flags |= MPOL_F_LOCAL;
321 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 		pol->v.preferred_node = first_node(tmp);
324 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
325 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
326 						   pol->w.cpuset_mems_allowed,
327 						   *nodes);
328 		pol->w.cpuset_mems_allowed = *nodes;
329 	}
330 }
331 
332 /* Migrate a policy to a different set of nodes */
333 static void mpol_rebind_policy(struct mempolicy *pol,
334 			       const nodemask_t *newmask)
335 {
336 	if (!pol)
337 		return;
338 	if (!mpol_store_user_nodemask(pol) &&
339 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
340 		return;
341 	mpol_ops[pol->mode].rebind(pol, newmask);
342 }
343 
344 /*
345  * Wrapper for mpol_rebind_policy() that just requires task
346  * pointer, and updates task mempolicy.
347  *
348  * Called with task's alloc_lock held.
349  */
350 
351 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
352 {
353 	mpol_rebind_policy(tsk->mempolicy, new);
354 }
355 
356 /*
357  * Rebind each vma in mm to new nodemask.
358  *
359  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
360  */
361 
362 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
363 {
364 	struct vm_area_struct *vma;
365 
366 	down_write(&mm->mmap_sem);
367 	for (vma = mm->mmap; vma; vma = vma->vm_next)
368 		mpol_rebind_policy(vma->vm_policy, new);
369 	up_write(&mm->mmap_sem);
370 }
371 
372 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
373 	[MPOL_DEFAULT] = {
374 		.rebind = mpol_rebind_default,
375 	},
376 	[MPOL_INTERLEAVE] = {
377 		.create = mpol_new_interleave,
378 		.rebind = mpol_rebind_nodemask,
379 	},
380 	[MPOL_PREFERRED] = {
381 		.create = mpol_new_preferred,
382 		.rebind = mpol_rebind_preferred,
383 	},
384 	[MPOL_BIND] = {
385 		.create = mpol_new_bind,
386 		.rebind = mpol_rebind_nodemask,
387 	},
388 };
389 
390 static void gather_stats(struct page *, void *, int pte_dirty);
391 static void migrate_page_add(struct page *page, struct list_head *pagelist,
392 				unsigned long flags);
393 
394 /* Scan the ptes in a range, checking whether the pages satisfy the given conditions. */
395 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
396 		unsigned long addr, unsigned long end,
397 		const nodemask_t *nodes, unsigned long flags,
398 		void *private)
399 {
400 	pte_t *orig_pte;
401 	pte_t *pte;
402 	spinlock_t *ptl;
403 
404 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
405 	do {
406 		struct page *page;
407 		int nid;
408 
409 		if (!pte_present(*pte))
410 			continue;
411 		page = vm_normal_page(vma, addr, *pte);
412 		if (!page)
413 			continue;
414 		/*
415 		 * The check for PageReserved here is important to avoid
416 		 * handling zero pages and other pages that may have been
417 		 * marked special by the system.
418 		 *
419 		 * If the PageReserved would not be checked here then f.e.
420 		 * the location of the zero page could have an influence
421 		 * on MPOL_MF_STRICT, zero pages would be counted for
422 		 * the per node stats, and there would be useless attempts
423 		 * to put zero pages on the migration list.
424 		 */
425 		if (PageReserved(page))
426 			continue;
427 		nid = page_to_nid(page);
428 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
429 			continue;
430 
431 		if (flags & MPOL_MF_STATS)
432 			gather_stats(page, private, pte_dirty(*pte));
433 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
434 			migrate_page_add(page, private, flags);
435 		else
436 			break;
437 	} while (pte++, addr += PAGE_SIZE, addr != end);
438 	pte_unmap_unlock(orig_pte, ptl);
439 	return addr != end;
440 }
441 
442 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
443 		unsigned long addr, unsigned long end,
444 		const nodemask_t *nodes, unsigned long flags,
445 		void *private)
446 {
447 	pmd_t *pmd;
448 	unsigned long next;
449 
450 	pmd = pmd_offset(pud, addr);
451 	do {
452 		next = pmd_addr_end(addr, end);
453 		if (pmd_none_or_clear_bad(pmd))
454 			continue;
455 		if (check_pte_range(vma, pmd, addr, next, nodes,
456 				    flags, private))
457 			return -EIO;
458 	} while (pmd++, addr = next, addr != end);
459 	return 0;
460 }
461 
462 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
463 		unsigned long addr, unsigned long end,
464 		const nodemask_t *nodes, unsigned long flags,
465 		void *private)
466 {
467 	pud_t *pud;
468 	unsigned long next;
469 
470 	pud = pud_offset(pgd, addr);
471 	do {
472 		next = pud_addr_end(addr, end);
473 		if (pud_none_or_clear_bad(pud))
474 			continue;
475 		if (check_pmd_range(vma, pud, addr, next, nodes,
476 				    flags, private))
477 			return -EIO;
478 	} while (pud++, addr = next, addr != end);
479 	return 0;
480 }
481 
482 static inline int check_pgd_range(struct vm_area_struct *vma,
483 		unsigned long addr, unsigned long end,
484 		const nodemask_t *nodes, unsigned long flags,
485 		void *private)
486 {
487 	pgd_t *pgd;
488 	unsigned long next;
489 
490 	pgd = pgd_offset(vma->vm_mm, addr);
491 	do {
492 		next = pgd_addr_end(addr, end);
493 		if (pgd_none_or_clear_bad(pgd))
494 			continue;
495 		if (check_pud_range(vma, pgd, addr, next, nodes,
496 				    flags, private))
497 			return -EIO;
498 	} while (pgd++, addr = next, addr != end);
499 	return 0;
500 }
501 
502 /*
503  * Check if all pages in a range are on a set of nodes.
504  * If pagelist != NULL then isolate pages from the LRU and
505  * put them on the pagelist.
506  */
507 static struct vm_area_struct *
508 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
509 		const nodemask_t *nodes, unsigned long flags, void *private)
510 {
511 	int err;
512 	struct vm_area_struct *first, *vma, *prev;
513 
514 
515 	first = find_vma(mm, start);
516 	if (!first)
517 		return ERR_PTR(-EFAULT);
518 	prev = NULL;
519 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
520 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
521 			if (!vma->vm_next && vma->vm_end < end)
522 				return ERR_PTR(-EFAULT);
523 			if (prev && prev->vm_end < vma->vm_start)
524 				return ERR_PTR(-EFAULT);
525 		}
526 		if (!is_vm_hugetlb_page(vma) &&
527 		    ((flags & MPOL_MF_STRICT) ||
528 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
529 				vma_migratable(vma)))) {
530 			unsigned long endvma = vma->vm_end;
531 
532 			if (endvma > end)
533 				endvma = end;
534 			if (vma->vm_start > start)
535 				start = vma->vm_start;
536 			err = check_pgd_range(vma, start, endvma, nodes,
537 						flags, private);
538 			if (err) {
539 				first = ERR_PTR(err);
540 				break;
541 			}
542 		}
543 		prev = vma;
544 	}
545 	return first;
546 }
547 
548 /* Apply policy to a single VMA */
549 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
550 {
551 	int err = 0;
552 	struct mempolicy *old = vma->vm_policy;
553 
554 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
555 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
556 		 vma->vm_ops, vma->vm_file,
557 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
558 
559 	if (vma->vm_ops && vma->vm_ops->set_policy)
560 		err = vma->vm_ops->set_policy(vma, new);
561 	if (!err) {
562 		mpol_get(new);
563 		vma->vm_policy = new;
564 		mpol_put(old);
565 	}
566 	return err;
567 }
568 
569 /* Step 2: apply policy to a range and do splits. */
570 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
571 		       unsigned long end, struct mempolicy *new)
572 {
573 	struct vm_area_struct *next;
574 	int err;
575 
576 	err = 0;
577 	for (; vma && vma->vm_start < end; vma = next) {
578 		next = vma->vm_next;
579 		if (vma->vm_start < start)
580 			err = split_vma(vma->vm_mm, vma, start, 1);
581 		if (!err && vma->vm_end > end)
582 			err = split_vma(vma->vm_mm, vma, end, 0);
583 		if (!err)
584 			err = policy_vma(vma, new);
585 		if (err)
586 			break;
587 	}
588 	return err;
589 }
590 
591 /*
592  * Update task->flags PF_MEMPOLICY bit: set iff non-default
593  * mempolicy.  Allows more rapid checking of this (combined perhaps
594  * with other PF_* flag bits) on memory allocation hot code paths.
595  *
596  * If called from outside this file, the task 'p' should -only- be
597  * a newly forked child not yet visible on the task list, because
598  * manipulating the task flags of a visible task is not safe.
599  *
600  * The above limitation is why this routine has the funny name
601  * mpol_fix_fork_child_flag().
602  *
603  * It is also safe to call this with a task pointer of current,
604  * which the static wrapper mpol_set_task_struct_flag() does,
605  * for use within this file.
606  */
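/*
 * Illustrative only -- the kind of fast-path test this flag enables in an
 * allocator hot path (a sketch, not code from this file):
 *
 *	if (unlikely(current->flags & PF_MEMPOLICY))
 *		node = slab_node(current->mempolicy);	// policy-aware path
 */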
607 
608 void mpol_fix_fork_child_flag(struct task_struct *p)
609 {
610 	if (p->mempolicy)
611 		p->flags |= PF_MEMPOLICY;
612 	else
613 		p->flags &= ~PF_MEMPOLICY;
614 }
615 
616 static void mpol_set_task_struct_flag(void)
617 {
618 	mpol_fix_fork_child_flag(current);
619 }
620 
621 /* Set the process memory policy */
622 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
623 			     nodemask_t *nodes)
624 {
625 	struct mempolicy *new, *old;
626 	struct mm_struct *mm = current->mm;
627 	NODEMASK_SCRATCH(scratch);
628 	int ret;
629 
630 	if (!scratch)
631 		return -ENOMEM;
632 
633 	new = mpol_new(mode, flags, nodes);
634 	if (IS_ERR(new)) {
635 		ret = PTR_ERR(new);
636 		goto out;
637 	}
638 	/*
639 	 * prevent changing our mempolicy while show_numa_maps()
640 	 * is using it.
641 	 * Note:  do_set_mempolicy() can be called at init time
642 	 * with no 'mm'.
643 	 */
644 	if (mm)
645 		down_write(&mm->mmap_sem);
646 	task_lock(current);
647 	ret = mpol_set_nodemask(new, nodes, scratch);
648 	if (ret) {
649 		task_unlock(current);
650 		if (mm)
651 			up_write(&mm->mmap_sem);
652 		mpol_put(new);
653 		goto out;
654 	}
655 	old = current->mempolicy;
656 	current->mempolicy = new;
657 	mpol_set_task_struct_flag();
658 	if (new && new->mode == MPOL_INTERLEAVE &&
659 	    nodes_weight(new->v.nodes))
660 		current->il_next = first_node(new->v.nodes);
661 	task_unlock(current);
662 	if (mm)
663 		up_write(&mm->mmap_sem);
664 
665 	mpol_put(old);
666 	ret = 0;
667 out:
668 	NODEMASK_SCRATCH_FREE(scratch);
669 	return ret;
670 }
671 
672 /*
673  * Return nodemask for policy for get_mempolicy() query
674  *
675  * Called with task's alloc_lock held
676  */
677 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
678 {
679 	nodes_clear(*nodes);
680 	if (p == &default_policy)
681 		return;
682 
683 	switch (p->mode) {
684 	case MPOL_BIND:
685 		/* Fall through */
686 	case MPOL_INTERLEAVE:
687 		*nodes = p->v.nodes;
688 		break;
689 	case MPOL_PREFERRED:
690 		if (!(p->flags & MPOL_F_LOCAL))
691 			node_set(p->v.preferred_node, *nodes);
692 		/* else return empty node mask for local allocation */
693 		break;
694 	default:
695 		BUG();
696 	}
697 }
698 
699 static int lookup_node(struct mm_struct *mm, unsigned long addr)
700 {
701 	struct page *p;
702 	int err;
703 
704 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
705 	if (err >= 0) {
706 		err = page_to_nid(p);
707 		put_page(p);
708 	}
709 	return err;
710 }
711 
712 /* Retrieve NUMA policy */
713 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
714 			     unsigned long addr, unsigned long flags)
715 {
716 	int err;
717 	struct mm_struct *mm = current->mm;
718 	struct vm_area_struct *vma = NULL;
719 	struct mempolicy *pol = current->mempolicy;
720 
721 	if (flags &
722 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
723 		return -EINVAL;
724 
725 	if (flags & MPOL_F_MEMS_ALLOWED) {
726 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
727 			return -EINVAL;
728 		*policy = 0;	/* just so it's initialized */
729 		task_lock(current);
730 		*nmask  = cpuset_current_mems_allowed;
731 		task_unlock(current);
732 		return 0;
733 	}
734 
735 	if (flags & MPOL_F_ADDR) {
736 		/*
737 		 * Do NOT fall back to task policy if the
738 		 * vma/shared policy at addr is NULL.  We
739 		 * want to return MPOL_DEFAULT in this case.
740 		 */
741 		down_read(&mm->mmap_sem);
742 		vma = find_vma_intersection(mm, addr, addr+1);
743 		if (!vma) {
744 			up_read(&mm->mmap_sem);
745 			return -EFAULT;
746 		}
747 		if (vma->vm_ops && vma->vm_ops->get_policy)
748 			pol = vma->vm_ops->get_policy(vma, addr);
749 		else
750 			pol = vma->vm_policy;
751 	} else if (addr)
752 		return -EINVAL;
753 
754 	if (!pol)
755 		pol = &default_policy;	/* indicates default behavior */
756 
757 	if (flags & MPOL_F_NODE) {
758 		if (flags & MPOL_F_ADDR) {
759 			err = lookup_node(mm, addr);
760 			if (err < 0)
761 				goto out;
762 			*policy = err;
763 		} else if (pol == current->mempolicy &&
764 				pol->mode == MPOL_INTERLEAVE) {
765 			*policy = current->il_next;
766 		} else {
767 			err = -EINVAL;
768 			goto out;
769 		}
770 	} else {
771 		*policy = pol == &default_policy ? MPOL_DEFAULT :
772 						pol->mode;
773 		/*
774 		 * Internal mempolicy flags must be masked off before exposing
775 		 * the policy to userspace.
776 		 */
777 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
778 	}
779 
780 	if (vma) {
781 		up_read(&current->mm->mmap_sem);
782 		vma = NULL;
783 	}
784 
785 	err = 0;
786 	if (nmask) {
787 		task_lock(current);
788 		get_policy_nodemask(pol, nmask);
789 		task_unlock(current);
790 	}
791 
792  out:
793 	mpol_cond_put(pol);
794 	if (vma)
795 		up_read(&current->mm->mmap_sem);
796 	return err;
797 }
798 
799 #ifdef CONFIG_MIGRATION
800 /*
801  * page migration
802  */
803 static void migrate_page_add(struct page *page, struct list_head *pagelist,
804 				unsigned long flags)
805 {
806 	/*
807 	 * Avoid migrating a page that is shared with others.
808 	 */
809 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810 		if (!isolate_lru_page(page)) {
811 			list_add_tail(&page->lru, pagelist);
812 		}
813 	}
814 }
815 
816 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
817 {
818 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
819 }
820 
821 /*
822  * Migrate pages from one node to a target node.
823  * Returns error or the number of pages not migrated.
824  */
825 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
826 			   int flags)
827 {
828 	nodemask_t nmask;
829 	LIST_HEAD(pagelist);
830 	int err = 0;
831 
832 	nodes_clear(nmask);
833 	node_set(source, nmask);
834 
835 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
836 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837 
838 	if (!list_empty(&pagelist))
839 		err = migrate_pages(&pagelist, new_node_page, dest);
840 
841 	return err;
842 }
843 
844 /*
845  * Move pages between the two nodesets so as to preserve the physical
846  * layout as much as possible.
847  *
848  * Returns the number of pages that could not be moved.
849  */
850 int do_migrate_pages(struct mm_struct *mm,
851 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
852 {
853 	int busy = 0;
854 	int err;
855 	nodemask_t tmp;
856 
857 	err = migrate_prep();
858 	if (err)
859 		return err;
860 
861 	down_read(&mm->mmap_sem);
862 
863 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
864 	if (err)
865 		goto out;
866 
867 /*
868  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
869  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
870  * bit in 'tmp', and return that <source, dest> pair for migration.
871  * The pair of nodemasks 'to' and 'from' define the map.
872  *
873  * If no pair of bits is found that way, fallback to picking some
874  * pair of 'source' and 'dest' bits that are not the same.  If the
875  * 'source' and 'dest' bits are the same, this represents a node
876  * that will be migrating to itself, so no pages need move.
877  *
878  * If no bits are left in 'tmp', or if all remaining bits left
879  * in 'tmp' correspond to the same bit in 'to', return false
880  * (nothing left to migrate).
881  *
882  * This lets us pick a pair of nodes to migrate between, such that
883  * if possible the dest node is not already occupied by some other
884  * source node, minimizing the risk of overloading the memory on a
885  * node that would happen if we migrated incoming memory to a node
886  * before migrating outgoing memory source that same node.
887  *
888  * A single scan of tmp is sufficient.  As we go, we remember the
889  * most recent <s, d> pair that moved (s != d).  If we find a pair
890  * that not only moved, but what's better, moved to an empty slot
891  * (d is not set in tmp), then we break out then, with that pair.
892  * Otherwise when we finish scanning 'tmp', we at least have the most
893  * most recent <s, d> pair that moved.  If we get all the way through
894  * the scan of tmp without finding any node that moved, much less
895  * moved to an empty node, then there is nothing left worth migrating.
896  */
897 
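	/*
	 * Worked example (illustrative): from = {0,1}, to = {1,2}.
	 * Scanning tmp = {0,1}: s=0 maps to d=1, but node 1 is still in
	 * tmp, so remember <0,1> and keep looking; s=1 maps to d=2, which
	 * is not in tmp, so node 1 is emptied onto node 2 first.  Then
	 * tmp = {0} and node 0 is moved onto the now-vacated node 1, so
	 * incoming and outgoing pages never pile up on the same node.
	 */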
898 	tmp = *from_nodes;
899 	while (!nodes_empty(tmp)) {
900 		int s,d;
901 		int source = -1;
902 		int dest = 0;
903 
904 		for_each_node_mask(s, tmp) {
905 			d = node_remap(s, *from_nodes, *to_nodes);
906 			if (s == d)
907 				continue;
908 
909 			source = s;	/* Node moved. Memorize */
910 			dest = d;
911 
912 			/* dest not in remaining from nodes? */
913 			if (!node_isset(dest, tmp))
914 				break;
915 		}
916 		if (source == -1)
917 			break;
918 
919 		node_clear(source, tmp);
920 		err = migrate_to_node(mm, source, dest, flags);
921 		if (err > 0)
922 			busy += err;
923 		if (err < 0)
924 			break;
925 	}
926 out:
927 	up_read(&mm->mmap_sem);
928 	if (err < 0)
929 		return err;
930 	return busy;
931 
932 }
933 
934 /*
935  * Allocate a new page for page migration based on vma policy.
936  * Start assuming that page is mapped by vma pointed to by @private.
937  * Search forward from there, if not.  N.B., this assumes that the
938  * list of pages handed to migrate_pages()--which is how we get here--
939  * is in virtual address order.
940  */
941 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
942 {
943 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
944 	unsigned long uninitialized_var(address);
945 
946 	while (vma) {
947 		address = page_address_in_vma(page, vma);
948 		if (address != -EFAULT)
949 			break;
950 		vma = vma->vm_next;
951 	}
952 
953 	/*
954 	 * if !vma, alloc_page_vma() will use task or system default policy
955 	 */
956 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
957 }
958 #else
959 
960 static void migrate_page_add(struct page *page, struct list_head *pagelist,
961 				unsigned long flags)
962 {
963 }
964 
965 int do_migrate_pages(struct mm_struct *mm,
966 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
967 {
968 	return -ENOSYS;
969 }
970 
971 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
972 {
973 	return NULL;
974 }
975 #endif
976 
977 static long do_mbind(unsigned long start, unsigned long len,
978 		     unsigned short mode, unsigned short mode_flags,
979 		     nodemask_t *nmask, unsigned long flags)
980 {
981 	struct vm_area_struct *vma;
982 	struct mm_struct *mm = current->mm;
983 	struct mempolicy *new;
984 	unsigned long end;
985 	int err;
986 	LIST_HEAD(pagelist);
987 
988 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
989 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
990 		return -EINVAL;
991 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
992 		return -EPERM;
993 
994 	if (start & ~PAGE_MASK)
995 		return -EINVAL;
996 
997 	if (mode == MPOL_DEFAULT)
998 		flags &= ~MPOL_MF_STRICT;
999 
1000 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1001 	end = start + len;
1002 
1003 	if (end < start)
1004 		return -EINVAL;
1005 	if (end == start)
1006 		return 0;
1007 
1008 	new = mpol_new(mode, mode_flags, nmask);
1009 	if (IS_ERR(new))
1010 		return PTR_ERR(new);
1011 
1012 	/*
1013 	 * If we are using the default policy then operation
1014 	 * on discontinuous address spaces is okay after all
1015 	 */
1016 	if (!new)
1017 		flags |= MPOL_MF_DISCONTIG_OK;
1018 
1019 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1020 		 start, start + len, mode, mode_flags,
1021 		 nmask ? nodes_addr(*nmask)[0] : -1);
1022 
1023 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1024 
1025 		err = migrate_prep();
1026 		if (err)
1027 			return err;
1028 	}
1029 	{
1030 		NODEMASK_SCRATCH(scratch);
1031 		if (scratch) {
1032 			down_write(&mm->mmap_sem);
1033 			task_lock(current);
1034 			err = mpol_set_nodemask(new, nmask, scratch);
1035 			task_unlock(current);
1036 			if (err)
1037 				up_write(&mm->mmap_sem);
1038 		} else
1039 			err = -ENOMEM;
1040 		NODEMASK_SCRATCH_FREE(scratch);
1041 	}
1042 	if (err) {
1043 		mpol_put(new);
1044 		return err;
1045 	}
1046 	vma = check_range(mm, start, end, nmask,
1047 			  flags | MPOL_MF_INVERT, &pagelist);
1048 
1049 	err = PTR_ERR(vma);
1050 	if (!IS_ERR(vma)) {
1051 		int nr_failed = 0;
1052 
1053 		err = mbind_range(vma, start, end, new);
1054 
1055 		if (!list_empty(&pagelist))
1056 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1057 						(unsigned long)vma);
1058 
1059 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1060 			err = -EIO;
1061 	}
1062 
1063 	up_write(&mm->mmap_sem);
1064 	mpol_put(new);
1065 	return err;
1066 }
1067 
1068 /*
1069  * User space interface with variable sized bitmaps for nodelists.
1070  */
1071 
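/*
 * The convention, roughly: userspace passes an array of unsigned longs plus
 * a bit count 'maxnode'.  E.g. (illustrative, 64-bit userspace):
 *
 *	unsigned long mask = 0x5;		// nodes 0 and 2
 *	set_mempolicy(MPOL_BIND, &mask, 65);
 *
 * get_nodes() below copies maxnode - 1 = 64 bits; if userspace passes more
 * bits than the kernel supports, the excess must be all zero or the call
 * fails with -EINVAL.
 */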
1072 /* Copy a node mask from user space. */
1073 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1074 		     unsigned long maxnode)
1075 {
1076 	unsigned long k;
1077 	unsigned long nlongs;
1078 	unsigned long endmask;
1079 
1080 	--maxnode;
1081 	nodes_clear(*nodes);
1082 	if (maxnode == 0 || !nmask)
1083 		return 0;
1084 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1085 		return -EINVAL;
1086 
1087 	nlongs = BITS_TO_LONGS(maxnode);
1088 	if ((maxnode % BITS_PER_LONG) == 0)
1089 		endmask = ~0UL;
1090 	else
1091 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1092 
1093 	/* When the user specified more nodes than supported, just check
1094 	   that the unsupported part is all zero. */
1095 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1096 		if (nlongs > PAGE_SIZE/sizeof(long))
1097 			return -EINVAL;
1098 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1099 			unsigned long t;
1100 			if (get_user(t, nmask + k))
1101 				return -EFAULT;
1102 			if (k == nlongs - 1) {
1103 				if (t & endmask)
1104 					return -EINVAL;
1105 			} else if (t)
1106 				return -EINVAL;
1107 		}
1108 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1109 		endmask = ~0UL;
1110 	}
1111 
1112 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1113 		return -EFAULT;
1114 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1115 	return 0;
1116 }
1117 
1118 /* Copy a kernel node mask to user space */
1119 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1120 			      nodemask_t *nodes)
1121 {
1122 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1123 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1124 
1125 	if (copy > nbytes) {
1126 		if (copy > PAGE_SIZE)
1127 			return -EINVAL;
1128 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1129 			return -EFAULT;
1130 		copy = nbytes;
1131 	}
1132 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1133 }
1134 
1135 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1136 		unsigned long, mode, unsigned long __user *, nmask,
1137 		unsigned long, maxnode, unsigned, flags)
1138 {
1139 	nodemask_t nodes;
1140 	int err;
1141 	unsigned short mode_flags;
1142 
1143 	mode_flags = mode & MPOL_MODE_FLAGS;
1144 	mode &= ~MPOL_MODE_FLAGS;
1145 	if (mode >= MPOL_MAX)
1146 		return -EINVAL;
1147 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1148 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1149 		return -EINVAL;
1150 	err = get_nodes(&nodes, nmask, maxnode);
1151 	if (err)
1152 		return err;
1153 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1154 }
1155 
1156 /* Set the process memory policy */
1157 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1158 		unsigned long, maxnode)
1159 {
1160 	int err;
1161 	nodemask_t nodes;
1162 	unsigned short flags;
1163 
1164 	flags = mode & MPOL_MODE_FLAGS;
1165 	mode &= ~MPOL_MODE_FLAGS;
1166 	if ((unsigned int)mode >= MPOL_MAX)
1167 		return -EINVAL;
1168 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1169 		return -EINVAL;
1170 	err = get_nodes(&nodes, nmask, maxnode);
1171 	if (err)
1172 		return err;
1173 	return do_set_mempolicy(mode, flags, &nodes);
1174 }
1175 
1176 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1177 		const unsigned long __user *, old_nodes,
1178 		const unsigned long __user *, new_nodes)
1179 {
1180 	const struct cred *cred = current_cred(), *tcred;
1181 	struct mm_struct *mm;
1182 	struct task_struct *task;
1183 	nodemask_t old;
1184 	nodemask_t new;
1185 	nodemask_t task_nodes;
1186 	int err;
1187 
1188 	err = get_nodes(&old, old_nodes, maxnode);
1189 	if (err)
1190 		return err;
1191 
1192 	err = get_nodes(&new, new_nodes, maxnode);
1193 	if (err)
1194 		return err;
1195 
1196 	/* Find the mm_struct */
1197 	read_lock(&tasklist_lock);
1198 	task = pid ? find_task_by_vpid(pid) : current;
1199 	if (!task) {
1200 		read_unlock(&tasklist_lock);
1201 		return -ESRCH;
1202 	}
1203 	mm = get_task_mm(task);
1204 	read_unlock(&tasklist_lock);
1205 
1206 	if (!mm)
1207 		return -EINVAL;
1208 
1209 	/*
1210 	 * Check if this process has the right to modify the specified
1211 	 * process. The right exists if the process has administrative
1212 	 * capabilities, superuser privileges or the same
1213 	 * userid as the target process.
1214 	 */
1215 	rcu_read_lock();
1216 	tcred = __task_cred(task);
1217 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1218 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1219 	    !capable(CAP_SYS_NICE)) {
1220 		rcu_read_unlock();
1221 		err = -EPERM;
1222 		goto out;
1223 	}
1224 	rcu_read_unlock();
1225 
1226 	task_nodes = cpuset_mems_allowed(task);
1227 	/* Is the user allowed to access the target nodes? */
1228 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1229 		err = -EPERM;
1230 		goto out;
1231 	}
1232 
1233 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1234 		err = -EINVAL;
1235 		goto out;
1236 	}
1237 
1238 	err = security_task_movememory(task);
1239 	if (err)
1240 		goto out;
1241 
1242 	err = do_migrate_pages(mm, &old, &new,
1243 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1244 out:
1245 	mmput(mm);
1246 	return err;
1247 }
1248 
1249 
1250 /* Retrieve NUMA policy */
1251 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1252 		unsigned long __user *, nmask, unsigned long, maxnode,
1253 		unsigned long, addr, unsigned long, flags)
1254 {
1255 	int err;
1256 	int uninitialized_var(pval);
1257 	nodemask_t nodes;
1258 
1259 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1260 		return -EINVAL;
1261 
1262 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1263 
1264 	if (err)
1265 		return err;
1266 
1267 	if (policy && put_user(pval, policy))
1268 		return -EFAULT;
1269 
1270 	if (nmask)
1271 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1272 
1273 	return err;
1274 }
1275 
1276 #ifdef CONFIG_COMPAT
1277 
1278 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1279 				     compat_ulong_t __user *nmask,
1280 				     compat_ulong_t maxnode,
1281 				     compat_ulong_t addr, compat_ulong_t flags)
1282 {
1283 	long err;
1284 	unsigned long __user *nm = NULL;
1285 	unsigned long nr_bits, alloc_size;
1286 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1287 
1288 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1289 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1290 
1291 	if (nmask)
1292 		nm = compat_alloc_user_space(alloc_size);
1293 
1294 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1295 
1296 	if (!err && nmask) {
1297 		err = copy_from_user(bm, nm, alloc_size);
1298 		/* ensure entire bitmap is zeroed */
1299 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1300 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1301 	}
1302 
1303 	return err;
1304 }
1305 
1306 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1307 				     compat_ulong_t maxnode)
1308 {
1309 	long err = 0;
1310 	unsigned long __user *nm = NULL;
1311 	unsigned long nr_bits, alloc_size;
1312 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1313 
1314 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1315 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1316 
1317 	if (nmask) {
1318 		err = compat_get_bitmap(bm, nmask, nr_bits);
1319 		nm = compat_alloc_user_space(alloc_size);
1320 		err |= copy_to_user(nm, bm, alloc_size);
1321 	}
1322 
1323 	if (err)
1324 		return -EFAULT;
1325 
1326 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1327 }
1328 
1329 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1330 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1331 			     compat_ulong_t maxnode, compat_ulong_t flags)
1332 {
1333 	long err = 0;
1334 	unsigned long __user *nm = NULL;
1335 	unsigned long nr_bits, alloc_size;
1336 	nodemask_t bm;
1337 
1338 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1339 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1340 
1341 	if (nmask) {
1342 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1343 		nm = compat_alloc_user_space(alloc_size);
1344 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1345 	}
1346 
1347 	if (err)
1348 		return -EFAULT;
1349 
1350 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1351 }
1352 
1353 #endif
1354 
1355 /*
1356  * get_vma_policy(@task, @vma, @addr)
1357  * @task - task for fallback if vma policy == default
1358  * @vma   - virtual memory area whose policy is sought
1359  * @addr  - address in @vma for shared policy lookup
1360  *
1361  * Returns effective policy for a VMA at specified address.
1362  * Falls back to @task or system default policy, as necessary.
1363  * Current or other task's task mempolicy and non-shared vma policies
1364  * are protected by the task's mmap_sem, which must be held for read by
1365  * the caller.
1366  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1367  * count--added by the get_policy() vm_op, as appropriate--to protect against
1368  * freeing by another task.  It is the caller's responsibility to free the
1369  * extra reference for shared policies.
1370  */
1371 static struct mempolicy *get_vma_policy(struct task_struct *task,
1372 		struct vm_area_struct *vma, unsigned long addr)
1373 {
1374 	struct mempolicy *pol = task->mempolicy;
1375 
1376 	if (vma) {
1377 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1378 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1379 									addr);
1380 			if (vpol)
1381 				pol = vpol;
1382 		} else if (vma->vm_policy)
1383 			pol = vma->vm_policy;
1384 	}
1385 	if (!pol)
1386 		pol = &default_policy;
1387 	return pol;
1388 }
1389 
1390 /*
1391  * Return a nodemask representing a mempolicy for filtering nodes for
1392  * page allocation
1393  */
1394 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1395 {
1396 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1397 	if (unlikely(policy->mode == MPOL_BIND) &&
1398 			gfp_zone(gfp) >= policy_zone &&
1399 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1400 		return &policy->v.nodes;
1401 
1402 	return NULL;
1403 }
1404 
1405 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1406 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1407 {
1408 	int nd = numa_node_id();
1409 
1410 	switch (policy->mode) {
1411 	case MPOL_PREFERRED:
1412 		if (!(policy->flags & MPOL_F_LOCAL))
1413 			nd = policy->v.preferred_node;
1414 		break;
1415 	case MPOL_BIND:
1416 		/*
1417 		 * Normally, MPOL_BIND allocations are node-local within the
1418 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1419 		 * current node is part of the mask, we use the zonelist for
1420 		 * the first node in the mask instead.
1421 		 */
1422 		if (unlikely(gfp & __GFP_THISNODE) &&
1423 				unlikely(!node_isset(nd, policy->v.nodes)))
1424 			nd = first_node(policy->v.nodes);
1425 		break;
1426 	case MPOL_INTERLEAVE: /* should not happen */
1427 		break;
1428 	default:
1429 		BUG();
1430 	}
1431 	return node_zonelist(nd, gfp);
1432 }
1433 
1434 /* Do dynamic interleaving for a process */
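/* E.g. (illustrative): with an interleave mask of {0,2,3} and il_next
   starting at 0, successive calls return 0, 2, 3, 0, 2, ... advancing
   current->il_next each time. */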
1435 static unsigned interleave_nodes(struct mempolicy *policy)
1436 {
1437 	unsigned nid, next;
1438 	struct task_struct *me = current;
1439 
1440 	nid = me->il_next;
1441 	next = next_node(nid, policy->v.nodes);
1442 	if (next >= MAX_NUMNODES)
1443 		next = first_node(policy->v.nodes);
1444 	if (next < MAX_NUMNODES)
1445 		me->il_next = next;
1446 	return nid;
1447 }
1448 
1449 /*
1450  * Depending on the memory policy, provide a node from which to allocate the
1451  * next slab entry.
1452  * @policy must be protected against freeing by the caller.  If @policy is
1453  * the current task's mempolicy, this protection is implicit, as only the
1454  * task can change its policy.  The system default policy requires no
1455  * such protection.
1456  */
1457 unsigned slab_node(struct mempolicy *policy)
1458 {
1459 	if (!policy || policy->flags & MPOL_F_LOCAL)
1460 		return numa_node_id();
1461 
1462 	switch (policy->mode) {
1463 	case MPOL_PREFERRED:
1464 		/*
1465 		 * handled MPOL_F_LOCAL above
1466 		 */
1467 		return policy->v.preferred_node;
1468 
1469 	case MPOL_INTERLEAVE:
1470 		return interleave_nodes(policy);
1471 
1472 	case MPOL_BIND: {
1473 		/*
1474 		 * Follow bind policy behavior and start allocation at the
1475 		 * first node.
1476 		 */
1477 		struct zonelist *zonelist;
1478 		struct zone *zone;
1479 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1480 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1481 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1482 							&policy->v.nodes,
1483 							&zone);
1484 		return zone->node;
1485 	}
1486 
1487 	default:
1488 		BUG();
1489 	}
1490 }
1491 
1492 /* Do static interleaving for a VMA with known offset. */
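/* E.g. (illustrative): off = 5 with an interleave mask of {0,2,3} gives
   target = 5 % 3 = 2, i.e. the third set bit, so node 3 is used. */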
1493 static unsigned offset_il_node(struct mempolicy *pol,
1494 		struct vm_area_struct *vma, unsigned long off)
1495 {
1496 	unsigned nnodes = nodes_weight(pol->v.nodes);
1497 	unsigned target;
1498 	int c;
1499 	int nid = -1;
1500 
1501 	if (!nnodes)
1502 		return numa_node_id();
1503 	target = (unsigned int)off % nnodes;
1504 	c = 0;
1505 	do {
1506 		nid = next_node(nid, pol->v.nodes);
1507 		c++;
1508 	} while (c <= target);
1509 	return nid;
1510 }
1511 
1512 /* Determine a node number for interleave */
1513 static inline unsigned interleave_nid(struct mempolicy *pol,
1514 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1515 {
1516 	if (vma) {
1517 		unsigned long off;
1518 
1519 		/*
1520 		 * for small pages, there is no difference between
1521 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1522 		 * for huge pages, since vm_pgoff is in units of small
1523 		 * pages, we need to shift off the always 0 bits to get
1524 		 * a useful offset.
1525 		 */
1526 		BUG_ON(shift < PAGE_SHIFT);
1527 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1528 		off += (addr - vma->vm_start) >> shift;
1529 		return offset_il_node(pol, vma, off);
1530 	} else
1531 		return interleave_nodes(pol);
1532 }
1533 
1534 #ifdef CONFIG_HUGETLBFS
1535 /*
1536  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1537  * @vma = virtual memory area whose policy is sought
1538  * @addr = address in @vma for shared policy lookup and interleave policy
1539  * @gfp_flags = for requested zone
1540  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1541  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1542  *
1543  * Returns a zonelist suitable for a huge page allocation and a pointer
1544  * to the struct mempolicy for conditional unref after allocation.
1545  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1546  * @nodemask for filtering the zonelist.
1547  */
1548 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1549 				gfp_t gfp_flags, struct mempolicy **mpol,
1550 				nodemask_t **nodemask)
1551 {
1552 	struct zonelist *zl;
1553 
1554 	*mpol = get_vma_policy(current, vma, addr);
1555 	*nodemask = NULL;	/* assume !MPOL_BIND */
1556 
1557 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1558 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1559 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1560 	} else {
1561 		zl = policy_zonelist(gfp_flags, *mpol);
1562 		if ((*mpol)->mode == MPOL_BIND)
1563 			*nodemask = &(*mpol)->v.nodes;
1564 	}
1565 	return zl;
1566 }
1567 #endif
1568 
1569 /* Allocate a page in interleaved policy.
1570    Own path because it needs to do special accounting. */
1571 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1572 					unsigned nid)
1573 {
1574 	struct zonelist *zl;
1575 	struct page *page;
1576 
1577 	zl = node_zonelist(nid, gfp);
1578 	page = __alloc_pages(gfp, order, zl);
1579 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1580 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1581 	return page;
1582 }
1583 
1584 /**
1585  * 	alloc_page_vma	- Allocate a page for a VMA.
1586  *
1587  * 	@gfp:
1588  *      %GFP_USER    user allocation.
1589  *      %GFP_KERNEL  kernel allocations,
1590  *      %GFP_HIGHMEM highmem/user allocations,
1591  *      %GFP_FS      allocation should not call back into a file system.
1592  *      %GFP_ATOMIC  don't sleep.
1593  *
1594  * 	@vma:  Pointer to VMA or NULL if not available.
1595  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1596  *
1597  * 	This function allocates a page from the kernel page pool and applies
1598  *	a NUMA policy associated with the VMA or the current process.
1599  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1600  *	mm_struct of the VMA to prevent it from going away. Should be used for
1601  *	all allocations for pages that will be mapped into
1602  * 	user space. Returns NULL when no page can be allocated.
1603  *
1604  *	Should be called with the mmap_sem of the vma held.
1605  */
1606 struct page *
1607 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1608 {
1609 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1610 	struct zonelist *zl;
1611 
1612 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1613 		unsigned nid;
1614 
1615 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1616 		mpol_cond_put(pol);
1617 		return alloc_page_interleave(gfp, 0, nid);
1618 	}
1619 	zl = policy_zonelist(gfp, pol);
1620 	if (unlikely(mpol_needs_cond_ref(pol))) {
1621 		/*
1622 		 * slow path: ref counted shared policy
1623 		 */
1624 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1625 						zl, policy_nodemask(gfp, pol));
1626 		__mpol_put(pol);
1627 		return page;
1628 	}
1629 	/*
1630 	 * fast path:  default or task policy
1631 	 */
1632 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1633 }
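/*
 * Typical use (sketch): the migration path in this file, and fault paths
 * elsewhere, allocate with
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * while holding the mmap_sem for read, so the VMA's policy is honoured.
 */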
1634 
1635 /**
1636  * 	alloc_pages_current - Allocate pages.
1637  *
1638  *	@gfp:
1639  *		%GFP_USER   user allocation,
1640  *      	%GFP_KERNEL kernel allocation,
1641  *      	%GFP_HIGHMEM highmem allocation,
1642  *      	%GFP_FS     don't call back into a file system.
1643  *      	%GFP_ATOMIC don't sleep.
1644  *	@order: Power of two of allocation size in pages. 0 is a single page.
1645  *
1646  *	Allocate a page from the kernel page pool.  When not in
1647  *	interrupt context, apply the current process' NUMA policy.
1648  *	Returns NULL when no page can be allocated.
1649  *
1650  *	Don't call cpuset_update_task_memory_state() unless
1651  *	1) it's ok to take cpuset_sem (can WAIT), and
1652  *	2) allocating for current task (not interrupt).
1653  */
1654 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1655 {
1656 	struct mempolicy *pol = current->mempolicy;
1657 
1658 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1659 		pol = &default_policy;
1660 
1661 	/*
1662 	 * No reference counting needed for current->mempolicy
1663 	 * nor system default_policy
1664 	 */
1665 	if (pol->mode == MPOL_INTERLEAVE)
1666 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1667 	return __alloc_pages_nodemask(gfp, order,
1668 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1669 }
1670 EXPORT_SYMBOL(alloc_pages_current);
1671 
1672 /*
1673  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1674  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1675  * with the mems_allowed returned by cpuset_mems_allowed().  This
1676  * keeps mempolicies cpuset relative after its cpuset moves.  See
1677  * further kernel/cpuset.c update_nodemask().
1678  */
1679 
1680 /* Slow path of a mempolicy duplicate */
1681 struct mempolicy *__mpol_dup(struct mempolicy *old)
1682 {
1683 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1684 
1685 	if (!new)
1686 		return ERR_PTR(-ENOMEM);
1687 	if (current_cpuset_is_being_rebound()) {
1688 		nodemask_t mems = cpuset_mems_allowed(current);
1689 		mpol_rebind_policy(old, &mems);
1690 	}
1691 	*new = *old;
1692 	atomic_set(&new->refcnt, 1);
1693 	return new;
1694 }
1695 
1696 /*
1697  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1698  * eliminate the MPOL_F_* flags that require conditional ref and
1699  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1700  * after return.  Use the returned value.
1701  *
1702  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1703  * policy lookup, even if the policy needs/has extra ref on lookup.
1704  * shmem_readahead needs this.
1705  */
1706 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1707 						struct mempolicy *frompol)
1708 {
1709 	if (!mpol_needs_cond_ref(frompol))
1710 		return frompol;
1711 
1712 	*tompol = *frompol;
1713 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1714 	__mpol_put(frompol);
1715 	return tompol;
1716 }
1717 
1718 static int mpol_match_intent(const struct mempolicy *a,
1719 			     const struct mempolicy *b)
1720 {
1721 	if (a->flags != b->flags)
1722 		return 0;
1723 	if (!mpol_store_user_nodemask(a))
1724 		return 1;
1725 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1726 }
1727 
1728 /* Slow path of a mempolicy comparison */
1729 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1730 {
1731 	if (!a || !b)
1732 		return 0;
1733 	if (a->mode != b->mode)
1734 		return 0;
1735 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1736 		return 0;
1737 	switch (a->mode) {
1738 	case MPOL_BIND:
1739 		/* Fall through */
1740 	case MPOL_INTERLEAVE:
1741 		return nodes_equal(a->v.nodes, b->v.nodes);
1742 	case MPOL_PREFERRED:
1743 		return a->v.preferred_node == b->v.preferred_node &&
1744 			a->flags == b->flags;
1745 	default:
1746 		BUG();
1747 		return 0;
1748 	}
1749 }
1750 
1751 /*
1752  * Shared memory backing store policy support.
1753  *
1754  * Remember policies even when nobody has shared memory mapped.
1755  * The policies are kept in Red-Black tree linked from the inode.
1756  * They are protected by the sp->lock spinlock, which should be held
1757  * for any accesses to the tree.
1758  */
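/*
 * The tree is keyed by [start, end) page offsets into the backing object;
 * mpol_set_shared_policy() turns a pseudo-vma into such a range and
 * shared_policy_replace() splices it in, splitting any overlapping entries.
 */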
1759 
1760 /* lookup first element intersecting start-end */
1761 /* Caller holds sp->lock */
1762 static struct sp_node *
1763 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1764 {
1765 	struct rb_node *n = sp->root.rb_node;
1766 
1767 	while (n) {
1768 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1769 
1770 		if (start >= p->end)
1771 			n = n->rb_right;
1772 		else if (end <= p->start)
1773 			n = n->rb_left;
1774 		else
1775 			break;
1776 	}
1777 	if (!n)
1778 		return NULL;
1779 	for (;;) {
1780 		struct sp_node *w = NULL;
1781 		struct rb_node *prev = rb_prev(n);
1782 		if (!prev)
1783 			break;
1784 		w = rb_entry(prev, struct sp_node, nd);
1785 		if (w->end <= start)
1786 			break;
1787 		n = prev;
1788 	}
1789 	return rb_entry(n, struct sp_node, nd);
1790 }
1791 
1792 /* Insert a new shared policy into the list. */
1793 /* Caller holds sp->lock */
1794 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1795 {
1796 	struct rb_node **p = &sp->root.rb_node;
1797 	struct rb_node *parent = NULL;
1798 	struct sp_node *nd;
1799 
1800 	while (*p) {
1801 		parent = *p;
1802 		nd = rb_entry(parent, struct sp_node, nd);
1803 		if (new->start < nd->start)
1804 			p = &(*p)->rb_left;
1805 		else if (new->end > nd->end)
1806 			p = &(*p)->rb_right;
1807 		else
1808 			BUG();
1809 	}
1810 	rb_link_node(&new->nd, parent, p);
1811 	rb_insert_color(&new->nd, &sp->root);
1812 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1813 		 new->policy ? new->policy->mode : 0);
1814 }
1815 
1816 /* Find shared policy intersecting idx */
1817 struct mempolicy *
1818 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1819 {
1820 	struct mempolicy *pol = NULL;
1821 	struct sp_node *sn;
1822 
1823 	if (!sp->root.rb_node)
1824 		return NULL;
1825 	spin_lock(&sp->lock);
1826 	sn = sp_lookup(sp, idx, idx+1);
1827 	if (sn) {
1828 		mpol_get(sn->policy);
1829 		pol = sn->policy;
1830 	}
1831 	spin_unlock(&sp->lock);
1832 	return pol;
1833 }
1834 
1835 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1836 {
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1838 	rb_erase(&n->nd, &sp->root);
1839 	mpol_put(n->policy);
1840 	kmem_cache_free(sn_cache, n);
1841 }
1842 
1843 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1844 				struct mempolicy *pol)
1845 {
1846 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1847 
1848 	if (!n)
1849 		return NULL;
1850 	n->start = start;
1851 	n->end = end;
1852 	mpol_get(pol);
1853 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1854 	n->policy = pol;
1855 	return n;
1856 }
1857 
1858 /* Replace a policy range. */
1859 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1860 				 unsigned long end, struct sp_node *new)
1861 {
1862 	struct sp_node *n, *new2 = NULL;
1863 
1864 restart:
1865 	spin_lock(&sp->lock);
1866 	n = sp_lookup(sp, start, end);
1867 	/* Take care of old policies in the same range. */
1868 	while (n && n->start < end) {
1869 		struct rb_node *next = rb_next(&n->nd);
1870 		if (n->start >= start) {
1871 			if (n->end <= end)
1872 				sp_delete(sp, n);
1873 			else
1874 				n->start = end;
1875 		} else {
1876 			/* Old policy spanning whole new range. */
1877 			if (n->end > end) {
1878 				if (!new2) {
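					/*
					 * sp_alloc() uses GFP_KERNEL and may
					 * sleep, so drop sp->lock around the
					 * allocation; the tree may change in
					 * the meantime, hence the restart.
					 */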
1879 					spin_unlock(&sp->lock);
1880 					new2 = sp_alloc(end, n->end, n->policy);
1881 					if (!new2)
1882 						return -ENOMEM;
1883 					goto restart;
1884 				}
1885 				n->end = start;
1886 				sp_insert(sp, new2);
1887 				new2 = NULL;
1888 				break;
1889 			} else
1890 				n->end = start;
1891 		}
1892 		if (!next)
1893 			break;
1894 		n = rb_entry(next, struct sp_node, nd);
1895 	}
1896 	if (new)
1897 		sp_insert(sp, new);
1898 	spin_unlock(&sp->lock);
1899 	if (new2) {
1900 		mpol_put(new2->policy);
1901 		kmem_cache_free(sn_cache, new2);
1902 	}
1903 	return 0;
1904 }
1905 
1906 /**
1907  * mpol_shared_policy_init - initialize shared policy for inode
1908  * @sp: pointer to inode shared policy
1909  * @mpol:  struct mempolicy to install
1910  *
1911  * Install non-NULL @mpol in inode's shared policy rb-tree.
1912  * On entry, the current task has a reference on a non-NULL @mpol.
1913  * This must be released on exit.
 * This is called at get_inode() time, so GFP_KERNEL allocations can be used.
1915  */
1916 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1917 {
1918 	int ret;
1919 
1920 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1921 	spin_lock_init(&sp->lock);
1922 
1923 	if (mpol) {
1924 		struct vm_area_struct pvma;
1925 		struct mempolicy *new;
1926 		NODEMASK_SCRATCH(scratch);
1927 
1928 		if (!scratch)
1929 			return;
1930 		/* contextualize the tmpfs mount point mempolicy */
1931 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1932 		if (IS_ERR(new)) {
1933 			mpol_put(mpol);	/* drop our ref on sb mpol */
1934 			NODEMASK_SCRATCH_FREE(scratch);
1935 			return;		/* no valid nodemask intersection */
1936 		}
1937 
1938 		task_lock(current);
1939 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1940 		task_unlock(current);
1941 		mpol_put(mpol);	/* drop our ref on sb mpol */
1942 		if (ret) {
1943 			NODEMASK_SCRATCH_FREE(scratch);
1944 			mpol_put(new);
1945 			return;
1946 		}
1947 
1948 		/* Create pseudo-vma that contains just the policy */
1949 		memset(&pvma, 0, sizeof(struct vm_area_struct));
1950 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1951 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1952 		mpol_put(new);			/* drop initial ref */
1953 		NODEMASK_SCRATCH_FREE(scratch);
1954 	}
1955 }
1956 
1957 int mpol_set_shared_policy(struct shared_policy *info,
1958 			struct vm_area_struct *vma, struct mempolicy *npol)
1959 {
1960 	int err;
1961 	struct sp_node *new = NULL;
1962 	unsigned long sz = vma_pages(vma);
1963 
1964 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1965 		 vma->vm_pgoff,
1966 		 sz, npol ? npol->mode : -1,
1967 		 npol ? npol->flags : -1,
1968 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1969 
1970 	if (npol) {
1971 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1972 		if (!new)
1973 			return -ENOMEM;
1974 	}
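	/* Remove or trim old ranges in [vm_pgoff, vm_pgoff + sz), then insert new, if any. */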
1975 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1976 	if (err && new)
1977 		kmem_cache_free(sn_cache, new);
1978 	return err;
1979 }
1980 
1981 /* Free a backing policy store on inode delete. */
1982 void mpol_free_shared_policy(struct shared_policy *p)
1983 {
1984 	struct sp_node *n;
1985 	struct rb_node *next;
1986 
1987 	if (!p->root.rb_node)
1988 		return;
1989 	spin_lock(&p->lock);
1990 	next = rb_first(&p->root);
1991 	while (next) {
1992 		n = rb_entry(next, struct sp_node, nd);
1993 		next = rb_next(&n->nd);
1994 		rb_erase(&n->nd, &p->root);
1995 		mpol_put(n->policy);
1996 		kmem_cache_free(sn_cache, n);
1997 	}
1998 	spin_unlock(&p->lock);
1999 }
2000 
2001 /* assumes fs == KERNEL_DS */
2002 void __init numa_policy_init(void)
2003 {
2004 	nodemask_t interleave_nodes;
2005 	unsigned long largest = 0;
2006 	int nid, prefer = 0;
2007 
2008 	policy_cache = kmem_cache_create("numa_policy",
2009 					 sizeof(struct mempolicy),
2010 					 0, SLAB_PANIC, NULL);
2011 
2012 	sn_cache = kmem_cache_create("shared_policy_node",
2013 				     sizeof(struct sp_node),
2014 				     0, SLAB_PANIC, NULL);
2015 
2016 	/*
2017 	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB); if all
	 * nodes are smaller, fall back to the largest node.
2020 	 */
2021 	nodes_clear(interleave_nodes);
2022 	for_each_node_state(nid, N_HIGH_MEMORY) {
2023 		unsigned long total_pages = node_present_pages(nid);
2024 
2025 		/* Preserve the largest node */
2026 		if (largest < total_pages) {
2027 			largest = total_pages;
2028 			prefer = nid;
2029 		}
2030 
2031 		/* Interleave this node? */
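		/* (total_pages << PAGE_SHIFT is the node size in bytes; 16 << 20 is 16MB) */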
2032 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2033 			node_set(nid, interleave_nodes);
2034 	}
2035 
2036 	/* All too small, use the largest */
2037 	if (unlikely(nodes_empty(interleave_nodes)))
2038 		node_set(prefer, interleave_nodes);
2039 
2040 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2042 }
2043 
2044 /* Reset policy of current process to default */
2045 void numa_default_policy(void)
2046 {
2047 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2048 }
2049 
2050 /*
2051  * Parse and format mempolicy from/to strings
2052  */
2053 
2054 /*
 * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag
2056  * Used only for mpol_parse_str() and mpol_to_str()
2057  */
2058 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2059 static const char * const policy_types[] =
2060 	{ "default", "prefer", "bind", "interleave", "local" };
2061 
2062 
2063 #ifdef CONFIG_TMPFS
2064 /**
2065  * mpol_parse_str - parse string to mempolicy
2066  * @str:  string containing mempolicy to parse
2067  * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  if true, save the input nodemask for later "contextualization"
2069  *
2070  * Format of input:
2071  *	<mode>[=<flags>][:<nodelist>]
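 *	e.g. "bind:0-3", "interleave=static:0,2" or "local"
 *	(illustrative examples of the accepted syntax)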
2072  *
2073  * if @no_context is true, save the input nodemask in w.user_nodemask in
2074  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2075  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2076  * mount option.  Note that if 'static' or 'relative' mode flags were
2077  * specified, the input nodemask will already have been saved.  Saving
2078  * it again is redundant, but safe.
2079  *
2080  * On success, returns 0, else 1
2081  */
2082 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2083 {
2084 	struct mempolicy *new = NULL;
2085 	unsigned short uninitialized_var(mode);
2086 	unsigned short uninitialized_var(mode_flags);
2087 	nodemask_t nodes;
2088 	char *nodelist = strchr(str, ':');
2089 	char *flags = strchr(str, '=');
2090 	int i;
2091 	int err = 1;
2092 
2093 	if (nodelist) {
2094 		/* NUL-terminate mode or flags string */
2095 		*nodelist++ = '\0';
2096 		if (nodelist_parse(nodelist, nodes))
2097 			goto out;
2098 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2099 			goto out;
2100 	} else
2101 		nodes_clear(nodes);
2102 
2103 	if (flags)
2104 		*flags++ = '\0';	/* terminate mode string */
2105 
2106 	for (i = 0; i <= MPOL_LOCAL; i++) {
2107 		if (!strcmp(str, policy_types[i])) {
2108 			mode = i;
2109 			break;
2110 		}
2111 	}
2112 	if (i > MPOL_LOCAL)
2113 		goto out;
2114 
2115 	switch (mode) {
2116 	case MPOL_PREFERRED:
2117 		/*
2118 		 * Insist on a nodelist of one node only
2119 		 */
2120 		if (nodelist) {
2121 			char *rest = nodelist;
2122 			while (isdigit(*rest))
2123 				rest++;
2124 			if (!*rest)
2125 				err = 0;
2126 		}
2127 		break;
2128 	case MPOL_INTERLEAVE:
2129 		/*
2130 		 * Default to online nodes with memory if no nodelist
2131 		 */
2132 		if (!nodelist)
2133 			nodes = node_states[N_HIGH_MEMORY];
2134 		err = 0;
2135 		break;
2136 	case MPOL_LOCAL:
2137 		/*
2138 		 * Don't allow a nodelist;  mpol_new() checks flags
2139 		 */
2140 		if (nodelist)
2141 			goto out;
		mode = MPOL_PREFERRED;
		err = 0;
		break;
2144 
	case MPOL_BIND:
		/* mpol_new() enforces a non-empty nodemask. */
		err = 0;
		break;
	case MPOL_DEFAULT:
		/*
		 * MPOL_DEFAULT is represented by a NULL mempolicy; insist
		 * on an empty nodelist, ignore flags, and skip mpol_new().
		 */
		if (!nodelist)
			err = 0;
		goto out;
	}
2150 
2151 	mode_flags = 0;
2152 	if (flags) {
2153 		/*
2154 		 * Currently, we only support two mutually exclusive
2155 		 * mode flags.
2156 		 */
2157 		if (!strcmp(flags, "static"))
2158 			mode_flags |= MPOL_F_STATIC_NODES;
2159 		else if (!strcmp(flags, "relative"))
2160 			mode_flags |= MPOL_F_RELATIVE_NODES;
2161 		else
2162 			err = 1;
2163 	}
2164 
2165 	new = mpol_new(mode, mode_flags, &nodes);
2166 	if (IS_ERR(new))
2167 		err = 1;
2168 	else {
2169 		int ret;
2170 		NODEMASK_SCRATCH(scratch);
2171 		if (scratch) {
2172 			task_lock(current);
2173 			ret = mpol_set_nodemask(new, &nodes, scratch);
2174 			task_unlock(current);
2175 		} else
2176 			ret = -ENOMEM;
2177 		NODEMASK_SCRATCH_FREE(scratch);
2178 		if (ret) {
2179 			err = 1;
2180 			mpol_put(new);
2181 		} else if (no_context) {
2182 			/* save for contextualization */
2183 			new->w.user_nodemask = nodes;
2184 		}
2185 	}
2186 
2187 out:
2188 	/* Restore string for error message */
2189 	if (nodelist)
2190 		*--nodelist = ':';
2191 	if (flags)
2192 		*--flags = '=';
2193 	if (!err)
2194 		*mpol = new;
2195 	return err;
2196 }
2197 #endif /* CONFIG_TMPFS */
2198 
2199 /**
2200  * mpol_to_str - format a mempolicy structure for printing
2201  * @buffer:  to contain formatted mempolicy string
2202  * @maxlen:  length of @buffer
2203  * @pol:  pointer to mempolicy to be formatted
2204  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2205  *
2206  * Convert a mempolicy into a string.
2207  * Returns the number of characters in buffer (if positive)
2208  * or an error (negative)
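 *
 * The output has the same <mode>[=<flags>][:<nodelist>] form that
 * mpol_parse_str() accepts, e.g. "interleave:0-3" (illustrative).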
2209  */
2210 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2211 {
2212 	char *p = buffer;
2213 	int l;
2214 	nodemask_t nodes;
2215 	unsigned short mode;
2216 	unsigned short flags = pol ? pol->flags : 0;
2217 
2218 	/*
2219 	 * Sanity check:  room for longest mode, flag and some nodes
2220 	 */
2221 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2222 
2223 	if (!pol || pol == &default_policy)
2224 		mode = MPOL_DEFAULT;
2225 	else
2226 		mode = pol->mode;
2227 
2228 	switch (mode) {
2229 	case MPOL_DEFAULT:
2230 		nodes_clear(nodes);
2231 		break;
2232 
2233 	case MPOL_PREFERRED:
2234 		nodes_clear(nodes);
2235 		if (flags & MPOL_F_LOCAL)
2236 			mode = MPOL_LOCAL;	/* pseudo-policy */
2237 		else
2238 			node_set(pol->v.preferred_node, nodes);
2239 		break;
2240 
2241 	case MPOL_BIND:
2242 		/* Fall through */
2243 	case MPOL_INTERLEAVE:
2244 		if (no_context)
2245 			nodes = pol->w.user_nodemask;
2246 		else
2247 			nodes = pol->v.nodes;
2248 		break;
2249 
2250 	default:
2251 		BUG();
2252 	}
2253 
2254 	l = strlen(policy_types[mode]);
2255 	if (buffer + maxlen < p + l + 1)
2256 		return -ENOSPC;
2257 
2258 	strcpy(p, policy_types[mode]);
2259 	p += l;
2260 
2261 	if (flags & MPOL_MODE_FLAGS) {
2262 		if (buffer + maxlen < p + 2)
2263 			return -ENOSPC;
2264 		*p++ = '=';
2265 
2266 		/*
2267 		 * Currently, the only defined flags are mutually exclusive
2268 		 */
2269 		if (flags & MPOL_F_STATIC_NODES)
2270 			p += snprintf(p, buffer + maxlen - p, "static");
2271 		else if (flags & MPOL_F_RELATIVE_NODES)
2272 			p += snprintf(p, buffer + maxlen - p, "relative");
2273 	}
2274 
2275 	if (!nodes_empty(nodes)) {
2276 		if (buffer + maxlen < p + 2)
2277 			return -ENOSPC;
2278 		*p++ = ':';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2280 	}
2281 	return p - buffer;
2282 }
2283 
2284 struct numa_maps {
2285 	unsigned long pages;
2286 	unsigned long anon;
2287 	unsigned long active;
2288 	unsigned long writeback;
2289 	unsigned long mapcount_max;
2290 	unsigned long dirty;
2291 	unsigned long swapcache;
2292 	unsigned long node[MAX_NUMNODES];
2293 };
2294 
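/*
 * Accumulate the counters for one mapped page into the struct numa_maps
 * passed via @private (used by show_numa_map() below).
 */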
2295 static void gather_stats(struct page *page, void *private, int pte_dirty)
2296 {
2297 	struct numa_maps *md = private;
2298 	int count = page_mapcount(page);
2299 
2300 	md->pages++;
2301 	if (pte_dirty || PageDirty(page))
2302 		md->dirty++;
2303 
2304 	if (PageSwapCache(page))
2305 		md->swapcache++;
2306 
2307 	if (PageActive(page) || PageUnevictable(page))
2308 		md->active++;
2309 
2310 	if (PageWriteback(page))
2311 		md->writeback++;
2312 
2313 	if (PageAnon(page))
2314 		md->anon++;
2315 
2316 	if (count > md->mapcount_max)
2317 		md->mapcount_max = count;
2318 
2319 	md->node[page_to_nid(page)]++;
2320 }
2321 
2322 #ifdef CONFIG_HUGETLB_PAGE
2323 static void check_huge_range(struct vm_area_struct *vma,
2324 		unsigned long start, unsigned long end,
2325 		struct numa_maps *md)
2326 {
2327 	unsigned long addr;
2328 	struct page *page;
2329 	struct hstate *h = hstate_vma(vma);
2330 	unsigned long sz = huge_page_size(h);
2331 
2332 	for (addr = start; addr < end; addr += sz) {
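	/* Walk the range one huge page at a time and gather per-page stats. */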
2333 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2334 						addr & huge_page_mask(h));
2335 		pte_t pte;
2336 
2337 		if (!ptep)
2338 			continue;
2339 
2340 		pte = *ptep;
2341 		if (pte_none(pte))
2342 			continue;
2343 
2344 		page = pte_page(pte);
2345 		if (!page)
2346 			continue;
2347 
2348 		gather_stats(page, md, pte_dirty(*ptep));
2349 	}
2350 }
2351 #else
2352 static inline void check_huge_range(struct vm_area_struct *vma,
2353 		unsigned long start, unsigned long end,
2354 		struct numa_maps *md)
2355 {
2356 }
2357 #endif
2358 
2359 /*
2360  * Display pages allocated per node and memory policy via /proc.
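 *
 * Each /proc/<pid>/numa_maps line has the form (illustrative):
 *	00400000 default file=/bin/sh mapped=1 mapmax=2 N0=1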
2361  */
2362 int show_numa_map(struct seq_file *m, void *v)
2363 {
2364 	struct proc_maps_private *priv = m->private;
2365 	struct vm_area_struct *vma = v;
2366 	struct numa_maps *md;
2367 	struct file *file = vma->vm_file;
2368 	struct mm_struct *mm = vma->vm_mm;
2369 	struct mempolicy *pol;
2370 	int n;
2371 	char buffer[50];
2372 
2373 	if (!mm)
2374 		return 0;
2375 
2376 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2377 	if (!md)
2378 		return 0;
2379 
2380 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2381 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2382 	mpol_cond_put(pol);
2383 
2384 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2385 
2386 	if (file) {
2387 		seq_printf(m, " file=");
2388 		seq_path(m, &file->f_path, "\n\t= ");
2389 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2390 		seq_printf(m, " heap");
2391 	} else if (vma->vm_start <= mm->start_stack &&
2392 			vma->vm_end >= mm->start_stack) {
2393 		seq_printf(m, " stack");
2394 	}
2395 
2396 	if (is_vm_hugetlb_page(vma)) {
2397 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2398 		seq_printf(m, " huge");
2399 	} else {
2400 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2401 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2402 	}
2403 
2404 	if (!md->pages)
2405 		goto out;
2406 
2407 	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);
2409 
2410 	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);
2412 
2413 	if (md->pages != md->anon && md->pages != md->dirty)
2414 		seq_printf(m, " mapped=%lu", md->pages);
2415 
2416 	if (md->mapcount_max > 1)
2417 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2418 
2419 	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);
2421 
2422 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);
2424 
2425 	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);
2427 
2428 	for_each_node_state(n, N_HIGH_MEMORY)
2429 		if (md->node[n])
2430 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2431 out:
2432 	seq_putc(m, '\n');
2433 	kfree(md);
2434 
2435 	if (m->count < m->size)
2436 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2437 	return 0;
2438 }
2439