xref: /openbmc/linux/mm/mempolicy.c (revision 7dd65feb)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                for anonymous memory. For process policy a process counter
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
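/*
 * Illustrative userspace sketch (not part of this file; names and values
 * are examples only) of how the policies above are requested through the
 * set_mempolicy()/mbind() syscalls, here via libnuma's <numaif.h> wrappers:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = 0x3;	// nodes 0 and 1
 *	// interleave all future allocations of this process over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	// bind an existing mapping to node 0 only, moving misplaced pages
 *	unsigned long node0 = 0x1;
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0), MPOL_MF_MOVE);
 */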
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/gfp.h>
77 #include <linux/slab.h>
78 #include <linux/string.h>
79 #include <linux/module.h>
80 #include <linux/nsproxy.h>
81 #include <linux/interrupt.h>
82 #include <linux/init.h>
83 #include <linux/compat.h>
84 #include <linux/swap.h>
85 #include <linux/seq_file.h>
86 #include <linux/proc_fs.h>
87 #include <linux/migrate.h>
88 #include <linux/ksm.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 #include <linux/ctype.h>
93 #include <linux/mm_inline.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 
98 #include "internal.h"
99 
100 /* Internal flags */
101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
103 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static const struct mempolicy_operations {
122 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
123 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
124 } mpol_ops[MPOL_MAX];
125 
126 /* Check that the nodemask contains at least one populated zone */
127 static int is_valid_nodemask(const nodemask_t *nodemask)
128 {
129 	int nd, k;
130 
131 	/* Check that there is something useful in this mask */
132 	k = policy_zone;
133 
134 	for_each_node_mask(nd, *nodemask) {
135 		struct zone *z;
136 
137 		for (k = 0; k <= policy_zone; k++) {
138 			z = &NODE_DATA(nd)->node_zones[k];
139 			if (z->present_pages > 0)
140 				return 1;
141 		}
142 	}
143 
144 	return 0;
145 }
146 
147 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
148 {
149 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
150 }
151 
152 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
153 				   const nodemask_t *rel)
154 {
155 	nodemask_t tmp;
156 	nodes_fold(tmp, *orig, nodes_weight(*rel));
157 	nodes_onto(*ret, tmp, *rel);
158 }
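/*
 * Worked example (values assumed for illustration): with *orig = {0,2}
 * (a user-supplied relative mask) and *rel = {1,3,5} (weight 3),
 * nodes_fold() leaves {0,2} untouched since both bits are below 3, and
 * nodes_onto() then maps relative position 0 onto node 1 and relative
 * position 2 onto node 5, so *ret ends up as {1,5}.
 */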
159 
160 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
161 {
162 	if (nodes_empty(*nodes))
163 		return -EINVAL;
164 	pol->v.nodes = *nodes;
165 	return 0;
166 }
167 
168 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
169 {
170 	if (!nodes)
171 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
172 	else if (nodes_empty(*nodes))
173 		return -EINVAL;			/*  no allowed nodes */
174 	else
175 		pol->v.preferred_node = first_node(*nodes);
176 	return 0;
177 }
178 
179 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (!is_valid_nodemask(nodes))
182 		return -EINVAL;
183 	pol->v.nodes = *nodes;
184 	return 0;
185 }
186 
187 /*
188  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
189  * any, for the new policy.  mpol_new() has already validated the nodes
190  * parameter with respect to the policy mode and flags.  But, we need to
191  * handle an empty nodemask with MPOL_PREFERRED here.
192  *
193  * Must be called holding task's alloc_lock to protect task's mems_allowed
194  * and mempolicy.  May also be called holding the mmap_semaphore for write.
195  */
196 static int mpol_set_nodemask(struct mempolicy *pol,
197 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
198 {
199 	int ret;
200 
201 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
202 	if (pol == NULL)
203 		return 0;
204 	/* Check N_HIGH_MEMORY */
205 	nodes_and(nsc->mask1,
206 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
207 
208 	VM_BUG_ON(!nodes);
209 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
210 		nodes = NULL;	/* explicit local allocation */
211 	else {
212 		if (pol->flags & MPOL_F_RELATIVE_NODES)
213 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
214 		else
215 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
216 
217 		if (mpol_store_user_nodemask(pol))
218 			pol->w.user_nodemask = *nodes;
219 		else
220 			pol->w.cpuset_mems_allowed =
221 						cpuset_current_mems_allowed;
222 	}
223 
224 	if (nodes)
225 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
226 	else
227 		ret = mpol_ops[pol->mode].create(pol, NULL);
228 	return ret;
229 }
230 
231 /*
232  * This function just creates a new policy, does some checks and simple
233  * initialization. You must invoke mpol_set_nodemask() to set nodes.
234  */
235 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
236 				  nodemask_t *nodes)
237 {
238 	struct mempolicy *policy;
239 
240 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
241 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
242 
243 	if (mode == MPOL_DEFAULT) {
244 		if (nodes && !nodes_empty(*nodes))
245 			return ERR_PTR(-EINVAL);
246 		return NULL;	/* simply delete any existing policy */
247 	}
248 	VM_BUG_ON(!nodes);
249 
250 	/*
251 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
252 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
253 	 * All other modes require a valid pointer to a non-empty nodemask.
254 	 */
255 	if (mode == MPOL_PREFERRED) {
256 		if (nodes_empty(*nodes)) {
257 			if (((flags & MPOL_F_STATIC_NODES) ||
258 			     (flags & MPOL_F_RELATIVE_NODES)))
259 				return ERR_PTR(-EINVAL);
260 		}
261 	} else if (nodes_empty(*nodes))
262 		return ERR_PTR(-EINVAL);
263 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
264 	if (!policy)
265 		return ERR_PTR(-ENOMEM);
266 	atomic_set(&policy->refcnt, 1);
267 	policy->mode = mode;
268 	policy->flags = flags;
269 
270 	return policy;
271 }
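/*
 * Typical usage (a condensed sketch of what do_set_mempolicy() below does;
 * locking shown, error handling omitted):
 *
 *	NODEMASK_SCRATCH(scratch);
 *	new = mpol_new(MPOL_INTERLEAVE, 0, &nodes);
 *	task_lock(current);
 *	err = mpol_set_nodemask(new, &nodes, scratch);
 *	task_unlock(current);
 *	NODEMASK_SCRATCH_FREE(scratch);
 */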
272 
273 /* Slow path of a mpol destructor. */
274 void __mpol_put(struct mempolicy *p)
275 {
276 	if (!atomic_dec_and_test(&p->refcnt))
277 		return;
278 	kmem_cache_free(policy_cache, p);
279 }
280 
281 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
282 {
283 }
284 
285 static void mpol_rebind_nodemask(struct mempolicy *pol,
286 				 const nodemask_t *nodes)
287 {
288 	nodemask_t tmp;
289 
290 	if (pol->flags & MPOL_F_STATIC_NODES)
291 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
292 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
293 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
294 	else {
295 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
296 			    *nodes);
297 		pol->w.cpuset_mems_allowed = *nodes;
298 	}
299 
300 	pol->v.nodes = tmp;
301 	if (!node_isset(current->il_next, tmp)) {
302 		current->il_next = next_node(current->il_next, tmp);
303 		if (current->il_next >= MAX_NUMNODES)
304 			current->il_next = first_node(tmp);
305 		if (current->il_next >= MAX_NUMNODES)
306 			current->il_next = numa_node_id();
307 	}
308 }
309 
310 static void mpol_rebind_preferred(struct mempolicy *pol,
311 				  const nodemask_t *nodes)
312 {
313 	nodemask_t tmp;
314 
315 	if (pol->flags & MPOL_F_STATIC_NODES) {
316 		int node = first_node(pol->w.user_nodemask);
317 
318 		if (node_isset(node, *nodes)) {
319 			pol->v.preferred_node = node;
320 			pol->flags &= ~MPOL_F_LOCAL;
321 		} else
322 			pol->flags |= MPOL_F_LOCAL;
323 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
324 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
325 		pol->v.preferred_node = first_node(tmp);
326 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
327 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
328 						   pol->w.cpuset_mems_allowed,
329 						   *nodes);
330 		pol->w.cpuset_mems_allowed = *nodes;
331 	}
332 }
333 
334 /* Migrate a policy to a different set of nodes */
335 static void mpol_rebind_policy(struct mempolicy *pol,
336 			       const nodemask_t *newmask)
337 {
338 	if (!pol)
339 		return;
340 	if (!mpol_store_user_nodemask(pol) &&
341 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
342 		return;
343 	mpol_ops[pol->mode].rebind(pol, newmask);
344 }
345 
346 /*
347  * Wrapper for mpol_rebind_policy() that just requires task
348  * pointer, and updates task mempolicy.
349  *
350  * Called with task's alloc_lock held.
351  */
352 
353 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
354 {
355 	mpol_rebind_policy(tsk->mempolicy, new);
356 }
357 
358 /*
359  * Rebind each vma in mm to new nodemask.
360  *
361  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
362  */
363 
364 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
365 {
366 	struct vm_area_struct *vma;
367 
368 	down_write(&mm->mmap_sem);
369 	for (vma = mm->mmap; vma; vma = vma->vm_next)
370 		mpol_rebind_policy(vma->vm_policy, new);
371 	up_write(&mm->mmap_sem);
372 }
373 
374 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
375 	[MPOL_DEFAULT] = {
376 		.rebind = mpol_rebind_default,
377 	},
378 	[MPOL_INTERLEAVE] = {
379 		.create = mpol_new_interleave,
380 		.rebind = mpol_rebind_nodemask,
381 	},
382 	[MPOL_PREFERRED] = {
383 		.create = mpol_new_preferred,
384 		.rebind = mpol_rebind_preferred,
385 	},
386 	[MPOL_BIND] = {
387 		.create = mpol_new_bind,
388 		.rebind = mpol_rebind_nodemask,
389 	},
390 };
391 
392 static void gather_stats(struct page *, void *, int pte_dirty);
393 static void migrate_page_add(struct page *page, struct list_head *pagelist,
394 				unsigned long flags);
395 
396 /* Scan through pages checking if pages follow certain conditions. */
397 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
398 		unsigned long addr, unsigned long end,
399 		const nodemask_t *nodes, unsigned long flags,
400 		void *private)
401 {
402 	pte_t *orig_pte;
403 	pte_t *pte;
404 	spinlock_t *ptl;
405 
406 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
407 	do {
408 		struct page *page;
409 		int nid;
410 
411 		if (!pte_present(*pte))
412 			continue;
413 		page = vm_normal_page(vma, addr, *pte);
414 		if (!page)
415 			continue;
416 		/*
417 		 * vm_normal_page() filters out zero pages, but there might
418 		 * still be PageReserved pages to skip, perhaps in a VDSO.
419 		 * And we cannot move PageKsm pages sensibly or safely yet.
420 		 */
421 		if (PageReserved(page) || PageKsm(page))
422 			continue;
423 		nid = page_to_nid(page);
424 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
425 			continue;
426 
427 		if (flags & MPOL_MF_STATS)
428 			gather_stats(page, private, pte_dirty(*pte));
429 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
430 			migrate_page_add(page, private, flags);
431 		else
432 			break;
433 	} while (pte++, addr += PAGE_SIZE, addr != end);
434 	pte_unmap_unlock(orig_pte, ptl);
435 	return addr != end;
436 }
437 
438 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
439 		unsigned long addr, unsigned long end,
440 		const nodemask_t *nodes, unsigned long flags,
441 		void *private)
442 {
443 	pmd_t *pmd;
444 	unsigned long next;
445 
446 	pmd = pmd_offset(pud, addr);
447 	do {
448 		next = pmd_addr_end(addr, end);
449 		if (pmd_none_or_clear_bad(pmd))
450 			continue;
451 		if (check_pte_range(vma, pmd, addr, next, nodes,
452 				    flags, private))
453 			return -EIO;
454 	} while (pmd++, addr = next, addr != end);
455 	return 0;
456 }
457 
458 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
459 		unsigned long addr, unsigned long end,
460 		const nodemask_t *nodes, unsigned long flags,
461 		void *private)
462 {
463 	pud_t *pud;
464 	unsigned long next;
465 
466 	pud = pud_offset(pgd, addr);
467 	do {
468 		next = pud_addr_end(addr, end);
469 		if (pud_none_or_clear_bad(pud))
470 			continue;
471 		if (check_pmd_range(vma, pud, addr, next, nodes,
472 				    flags, private))
473 			return -EIO;
474 	} while (pud++, addr = next, addr != end);
475 	return 0;
476 }
477 
478 static inline int check_pgd_range(struct vm_area_struct *vma,
479 		unsigned long addr, unsigned long end,
480 		const nodemask_t *nodes, unsigned long flags,
481 		void *private)
482 {
483 	pgd_t *pgd;
484 	unsigned long next;
485 
486 	pgd = pgd_offset(vma->vm_mm, addr);
487 	do {
488 		next = pgd_addr_end(addr, end);
489 		if (pgd_none_or_clear_bad(pgd))
490 			continue;
491 		if (check_pud_range(vma, pgd, addr, next, nodes,
492 				    flags, private))
493 			return -EIO;
494 	} while (pgd++, addr = next, addr != end);
495 	return 0;
496 }
497 
498 /*
499  * Check if all pages in a range are on a set of nodes.
500  * If pagelist != NULL then isolate pages from the LRU and
501  * put them on the pagelist.
502  */
503 static struct vm_area_struct *
504 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
505 		const nodemask_t *nodes, unsigned long flags, void *private)
506 {
507 	int err;
508 	struct vm_area_struct *first, *vma, *prev;
509 
510 
511 	first = find_vma(mm, start);
512 	if (!first)
513 		return ERR_PTR(-EFAULT);
514 	prev = NULL;
515 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
516 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
517 			if (!vma->vm_next && vma->vm_end < end)
518 				return ERR_PTR(-EFAULT);
519 			if (prev && prev->vm_end < vma->vm_start)
520 				return ERR_PTR(-EFAULT);
521 		}
522 		if (!is_vm_hugetlb_page(vma) &&
523 		    ((flags & MPOL_MF_STRICT) ||
524 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
525 				vma_migratable(vma)))) {
526 			unsigned long endvma = vma->vm_end;
527 
528 			if (endvma > end)
529 				endvma = end;
530 			if (vma->vm_start > start)
531 				start = vma->vm_start;
532 			err = check_pgd_range(vma, start, endvma, nodes,
533 						flags, private);
534 			if (err) {
535 				first = ERR_PTR(err);
536 				break;
537 			}
538 		}
539 		prev = vma;
540 	}
541 	return first;
542 }
543 
544 /* Apply policy to a single VMA */
545 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
546 {
547 	int err = 0;
548 	struct mempolicy *old = vma->vm_policy;
549 
550 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
551 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
552 		 vma->vm_ops, vma->vm_file,
553 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
554 
555 	if (vma->vm_ops && vma->vm_ops->set_policy)
556 		err = vma->vm_ops->set_policy(vma, new);
557 	if (!err) {
558 		mpol_get(new);
559 		vma->vm_policy = new;
560 		mpol_put(old);
561 	}
562 	return err;
563 }
564 
565 /* Step 2: apply policy to a range and do splits. */
566 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
567 		       unsigned long end, struct mempolicy *new)
568 {
569 	struct vm_area_struct *next;
570 	int err;
571 
572 	err = 0;
573 	for (; vma && vma->vm_start < end; vma = next) {
574 		next = vma->vm_next;
575 		if (vma->vm_start < start)
576 			err = split_vma(vma->vm_mm, vma, start, 1);
577 		if (!err && vma->vm_end > end)
578 			err = split_vma(vma->vm_mm, vma, end, 0);
579 		if (!err)
580 			err = policy_vma(vma, new);
581 		if (err)
582 			break;
583 	}
584 	return err;
585 }
586 
587 /*
588  * Update task->flags PF_MEMPOLICY bit: set iff non-default
589  * mempolicy.  Allows more rapid checking of this (combined perhaps
590  * with other PF_* flag bits) on memory allocation hot code paths.
591  *
592  * If called from outside this file, the task 'p' should -only- be
593  * a newly forked child not yet visible on the task list, because
594  * manipulating the task flags of a visible task is not safe.
595  *
596  * The above limitation is why this routine has the funny name
597  * mpol_fix_fork_child_flag().
598  *
599  * It is also safe to call this with a task pointer of current,
600  * which the static wrapper mpol_set_task_struct_flag() does,
601  * for use within this file.
602  */
603 
604 void mpol_fix_fork_child_flag(struct task_struct *p)
605 {
606 	if (p->mempolicy)
607 		p->flags |= PF_MEMPOLICY;
608 	else
609 		p->flags &= ~PF_MEMPOLICY;
610 }
611 
612 static void mpol_set_task_struct_flag(void)
613 {
614 	mpol_fix_fork_child_flag(current);
615 }
616 
617 /* Set the process memory policy */
618 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
619 			     nodemask_t *nodes)
620 {
621 	struct mempolicy *new, *old;
622 	struct mm_struct *mm = current->mm;
623 	NODEMASK_SCRATCH(scratch);
624 	int ret;
625 
626 	if (!scratch)
627 		return -ENOMEM;
628 
629 	new = mpol_new(mode, flags, nodes);
630 	if (IS_ERR(new)) {
631 		ret = PTR_ERR(new);
632 		goto out;
633 	}
634 	/*
635 	 * prevent changing our mempolicy while show_numa_maps()
636 	 * is using it.
637 	 * Note:  do_set_mempolicy() can be called at init time
638 	 * with no 'mm'.
639 	 */
640 	if (mm)
641 		down_write(&mm->mmap_sem);
642 	task_lock(current);
643 	ret = mpol_set_nodemask(new, nodes, scratch);
644 	if (ret) {
645 		task_unlock(current);
646 		if (mm)
647 			up_write(&mm->mmap_sem);
648 		mpol_put(new);
649 		goto out;
650 	}
651 	old = current->mempolicy;
652 	current->mempolicy = new;
653 	mpol_set_task_struct_flag();
654 	if (new && new->mode == MPOL_INTERLEAVE &&
655 	    nodes_weight(new->v.nodes))
656 		current->il_next = first_node(new->v.nodes);
657 	task_unlock(current);
658 	if (mm)
659 		up_write(&mm->mmap_sem);
660 
661 	mpol_put(old);
662 	ret = 0;
663 out:
664 	NODEMASK_SCRATCH_FREE(scratch);
665 	return ret;
666 }
667 
668 /*
669  * Return nodemask for policy for get_mempolicy() query
670  *
671  * Called with task's alloc_lock held
672  */
673 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
674 {
675 	nodes_clear(*nodes);
676 	if (p == &default_policy)
677 		return;
678 
679 	switch (p->mode) {
680 	case MPOL_BIND:
681 		/* Fall through */
682 	case MPOL_INTERLEAVE:
683 		*nodes = p->v.nodes;
684 		break;
685 	case MPOL_PREFERRED:
686 		if (!(p->flags & MPOL_F_LOCAL))
687 			node_set(p->v.preferred_node, *nodes);
688 		/* else return empty node mask for local allocation */
689 		break;
690 	default:
691 		BUG();
692 	}
693 }
694 
695 static int lookup_node(struct mm_struct *mm, unsigned long addr)
696 {
697 	struct page *p;
698 	int err;
699 
700 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
701 	if (err >= 0) {
702 		err = page_to_nid(p);
703 		put_page(p);
704 	}
705 	return err;
706 }
707 
708 /* Retrieve NUMA policy */
709 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
710 			     unsigned long addr, unsigned long flags)
711 {
712 	int err;
713 	struct mm_struct *mm = current->mm;
714 	struct vm_area_struct *vma = NULL;
715 	struct mempolicy *pol = current->mempolicy;
716 
717 	if (flags &
718 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
719 		return -EINVAL;
720 
721 	if (flags & MPOL_F_MEMS_ALLOWED) {
722 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
723 			return -EINVAL;
724 		*policy = 0;	/* just so it's initialized */
725 		task_lock(current);
726 		*nmask  = cpuset_current_mems_allowed;
727 		task_unlock(current);
728 		return 0;
729 	}
730 
731 	if (flags & MPOL_F_ADDR) {
732 		/*
733 		 * Do NOT fall back to task policy if the
734 		 * vma/shared policy at addr is NULL.  We
735 		 * want to return MPOL_DEFAULT in this case.
736 		 */
737 		down_read(&mm->mmap_sem);
738 		vma = find_vma_intersection(mm, addr, addr+1);
739 		if (!vma) {
740 			up_read(&mm->mmap_sem);
741 			return -EFAULT;
742 		}
743 		if (vma->vm_ops && vma->vm_ops->get_policy)
744 			pol = vma->vm_ops->get_policy(vma, addr);
745 		else
746 			pol = vma->vm_policy;
747 	} else if (addr)
748 		return -EINVAL;
749 
750 	if (!pol)
751 		pol = &default_policy;	/* indicates default behavior */
752 
753 	if (flags & MPOL_F_NODE) {
754 		if (flags & MPOL_F_ADDR) {
755 			err = lookup_node(mm, addr);
756 			if (err < 0)
757 				goto out;
758 			*policy = err;
759 		} else if (pol == current->mempolicy &&
760 				pol->mode == MPOL_INTERLEAVE) {
761 			*policy = current->il_next;
762 		} else {
763 			err = -EINVAL;
764 			goto out;
765 		}
766 	} else {
767 		*policy = pol == &default_policy ? MPOL_DEFAULT :
768 						pol->mode;
769 		/*
770 		 * Internal mempolicy flags must be masked off before exposing
771 		 * the policy to userspace.
772 		 */
773 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
774 	}
775 
776 	if (vma) {
777 		up_read(&current->mm->mmap_sem);
778 		vma = NULL;
779 	}
780 
781 	err = 0;
782 	if (nmask) {
783 		task_lock(current);
784 		get_policy_nodemask(pol, nmask);
785 		task_unlock(current);
786 	}
787 
788  out:
789 	mpol_cond_put(pol);
790 	if (vma)
791 		up_read(&current->mm->mmap_sem);
792 	return err;
793 }
794 
795 #ifdef CONFIG_MIGRATION
796 /*
797  * page migration
798  */
799 static void migrate_page_add(struct page *page, struct list_head *pagelist,
800 				unsigned long flags)
801 {
802 	/*
803 	 * Avoid migrating a page that is shared with others.
804 	 */
805 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
806 		if (!isolate_lru_page(page)) {
807 			list_add_tail(&page->lru, pagelist);
808 			inc_zone_page_state(page, NR_ISOLATED_ANON +
809 					    page_is_file_cache(page));
810 		}
811 	}
812 }
813 
814 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
815 {
816 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
817 }
818 
819 /*
820  * Migrate pages from one node to a target node.
821  * Returns error or the number of pages not migrated.
822  */
823 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
824 			   int flags)
825 {
826 	nodemask_t nmask;
827 	LIST_HEAD(pagelist);
828 	int err = 0;
829 
830 	nodes_clear(nmask);
831 	node_set(source, nmask);
832 
833 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
834 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
835 
836 	if (!list_empty(&pagelist))
837 		err = migrate_pages(&pagelist, new_node_page, dest, 0);
838 
839 	return err;
840 }
841 
842 /*
843  * Move pages between the two nodesets so as to preserve the physical
844  * layout as much as possible.
845  *
846  * Returns the number of pages that could not be moved.
847  */
848 int do_migrate_pages(struct mm_struct *mm,
849 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
850 {
851 	int busy = 0;
852 	int err;
853 	nodemask_t tmp;
854 
855 	err = migrate_prep();
856 	if (err)
857 		return err;
858 
859 	down_read(&mm->mmap_sem);
860 
861 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
862 	if (err)
863 		goto out;
864 
865 /*
866  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
867  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
868  * bit in 'tmp', and return that <source, dest> pair for migration.
869  * The pair of nodemasks 'to' and 'from' define the map.
870  *
871  * If no pair of bits is found that way, fallback to picking some
872  * pair of 'source' and 'dest' bits that are not the same.  If the
873  * 'source' and 'dest' bits are the same, this represents a node
874  * that will be migrating to itself, so no pages need move.
875  *
876  * If no bits are left in 'tmp', or if all remaining bits left
877  * in 'tmp' correspond to the same bit in 'to', return false
878  * (nothing left to migrate).
879  *
880  * This lets us pick a pair of nodes to migrate between, such that
881  * if possible the dest node is not already occupied by some other
882  * source node, minimizing the risk of overloading the memory on a
883  * node that would happen if we migrated incoming memory to a node
884  * before migrating outgoing memory off of that same node.
885  *
886  * A single scan of tmp is sufficient.  As we go, we remember the
887  * most recent <s, d> pair that moved (s != d).  If we find a pair
888  * that not only moved, but what's better, moved to an empty slot
889  * (d is not set in tmp), then we break out immediately with that pair.
890  * Otherwise, when we finish scanning tmp, we at least have the
891  * most recent <s, d> pair that moved.  If we get all the way through
892  * the scan of tmp without finding any node that moved, much less
893  * moved to an empty node, then there is nothing left worth migrating.
894  */
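/*
 * Worked example (masks assumed for illustration): from_nodes = {2,3,4},
 * to_nodes = {3,4,5}, so node_remap() gives 2->3, 3->4, 4->5.  Scanning
 * tmp = {2,3,4}: <2,3> and <3,4> both have their dest still set in tmp,
 * so they are only remembered; <4,5> has dest 5 clear in tmp, so node 4
 * is migrated to node 5 first, then cleared from tmp and the scan repeats.
 */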
895 
896 	tmp = *from_nodes;
897 	while (!nodes_empty(tmp)) {
898 		int s, d;
899 		int source = -1;
900 		int dest = 0;
901 
902 		for_each_node_mask(s, tmp) {
903 			d = node_remap(s, *from_nodes, *to_nodes);
904 			if (s == d)
905 				continue;
906 
907 			source = s;	/* Node moved. Memorize */
908 			dest = d;
909 
910 			/* dest not in remaining from nodes? */
911 			if (!node_isset(dest, tmp))
912 				break;
913 		}
914 		if (source == -1)
915 			break;
916 
917 		node_clear(source, tmp);
918 		err = migrate_to_node(mm, source, dest, flags);
919 		if (err > 0)
920 			busy += err;
921 		if (err < 0)
922 			break;
923 	}
924 out:
925 	up_read(&mm->mmap_sem);
926 	if (err < 0)
927 		return err;
928 	return busy;
929 
930 }
931 
932 /*
933  * Allocate a new page for page migration based on vma policy.
934  * Start assuming that page is mapped by vma pointed to by @private.
935  * Search forward from there, if not.  N.B., this assumes that the
936  * list of pages handed to migrate_pages()--which is how we get here--
937  * is in virtual address order.
938  */
939 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
940 {
941 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
942 	unsigned long uninitialized_var(address);
943 
944 	while (vma) {
945 		address = page_address_in_vma(page, vma);
946 		if (address != -EFAULT)
947 			break;
948 		vma = vma->vm_next;
949 	}
950 
951 	/*
952 	 * if !vma, alloc_page_vma() will use task or system default policy
953 	 */
954 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
955 }
956 #else
957 
958 static void migrate_page_add(struct page *page, struct list_head *pagelist,
959 				unsigned long flags)
960 {
961 }
962 
963 int do_migrate_pages(struct mm_struct *mm,
964 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
965 {
966 	return -ENOSYS;
967 }
968 
969 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
970 {
971 	return NULL;
972 }
973 #endif
974 
975 static long do_mbind(unsigned long start, unsigned long len,
976 		     unsigned short mode, unsigned short mode_flags,
977 		     nodemask_t *nmask, unsigned long flags)
978 {
979 	struct vm_area_struct *vma;
980 	struct mm_struct *mm = current->mm;
981 	struct mempolicy *new;
982 	unsigned long end;
983 	int err;
984 	LIST_HEAD(pagelist);
985 
986 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
987 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
988 		return -EINVAL;
989 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
990 		return -EPERM;
991 
992 	if (start & ~PAGE_MASK)
993 		return -EINVAL;
994 
995 	if (mode == MPOL_DEFAULT)
996 		flags &= ~MPOL_MF_STRICT;
997 
998 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
999 	end = start + len;
1000 
1001 	if (end < start)
1002 		return -EINVAL;
1003 	if (end == start)
1004 		return 0;
1005 
1006 	new = mpol_new(mode, mode_flags, nmask);
1007 	if (IS_ERR(new))
1008 		return PTR_ERR(new);
1009 
1010 	/*
1011 	 * If we are using the default policy then operation
1012 	 * on discontinuous address spaces is okay after all
1013 	 */
1014 	if (!new)
1015 		flags |= MPOL_MF_DISCONTIG_OK;
1016 
1017 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1018 		 start, start + len, mode, mode_flags,
1019 		 nmask ? nodes_addr(*nmask)[0] : -1);
1020 
1021 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1022 
1023 		err = migrate_prep();
1024 		if (err)
1025 			goto mpol_out;
1026 	}
1027 	{
1028 		NODEMASK_SCRATCH(scratch);
1029 		if (scratch) {
1030 			down_write(&mm->mmap_sem);
1031 			task_lock(current);
1032 			err = mpol_set_nodemask(new, nmask, scratch);
1033 			task_unlock(current);
1034 			if (err)
1035 				up_write(&mm->mmap_sem);
1036 		} else
1037 			err = -ENOMEM;
1038 		NODEMASK_SCRATCH_FREE(scratch);
1039 	}
1040 	if (err)
1041 		goto mpol_out;
1042 
1043 	vma = check_range(mm, start, end, nmask,
1044 			  flags | MPOL_MF_INVERT, &pagelist);
1045 
1046 	err = PTR_ERR(vma);
1047 	if (!IS_ERR(vma)) {
1048 		int nr_failed = 0;
1049 
1050 		err = mbind_range(vma, start, end, new);
1051 
1052 		if (!list_empty(&pagelist))
1053 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1054 						(unsigned long)vma, 0);
1055 
1056 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1057 			err = -EIO;
1058 	} else
1059 		putback_lru_pages(&pagelist);
1060 
1061 	up_write(&mm->mmap_sem);
1062  mpol_out:
1063 	mpol_put(new);
1064 	return err;
1065 }
1066 
1067 /*
1068  * User space interface with variable sized bitmaps for nodelists.
1069  */
1070 
1071 /* Copy a node mask from user space. */
1072 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1073 		     unsigned long maxnode)
1074 {
1075 	unsigned long k;
1076 	unsigned long nlongs;
1077 	unsigned long endmask;
1078 
1079 	--maxnode;
1080 	nodes_clear(*nodes);
1081 	if (maxnode == 0 || !nmask)
1082 		return 0;
1083 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1084 		return -EINVAL;
1085 
1086 	nlongs = BITS_TO_LONGS(maxnode);
1087 	if ((maxnode % BITS_PER_LONG) == 0)
1088 		endmask = ~0UL;
1089 	else
1090 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1091 
1092 	/* When the user specified more nodes than supported just check
1093 	   if the non supported part is all zero. */
1094 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1095 		if (nlongs > PAGE_SIZE/sizeof(long))
1096 			return -EINVAL;
1097 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1098 			unsigned long t;
1099 			if (get_user(t, nmask + k))
1100 				return -EFAULT;
1101 			if (k == nlongs - 1) {
1102 				if (t & endmask)
1103 					return -EINVAL;
1104 			} else if (t)
1105 				return -EINVAL;
1106 		}
1107 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1108 		endmask = ~0UL;
1109 	}
1110 
1111 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1112 		return -EFAULT;
1113 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1114 	return 0;
1115 }
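/*
 * Example of the bit arithmetic above (assuming 64-bit longs, for
 * illustration): a caller passing maxnode = 17 ends up with maxnode = 16
 * after the decrement, so nlongs = 1, endmask = (1UL << 16) - 1 and only
 * bits 0..15 of the copied word are kept.  With maxnode = 65 the
 * decremented value is a full word, 64 % BITS_PER_LONG == 0, and
 * endmask = ~0UL.
 */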
1116 
1117 /* Copy a kernel node mask to user space */
1118 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1119 			      nodemask_t *nodes)
1120 {
1121 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1122 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1123 
1124 	if (copy > nbytes) {
1125 		if (copy > PAGE_SIZE)
1126 			return -EINVAL;
1127 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1128 			return -EFAULT;
1129 		copy = nbytes;
1130 	}
1131 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1132 }
1133 
1134 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1135 		unsigned long, mode, unsigned long __user *, nmask,
1136 		unsigned long, maxnode, unsigned, flags)
1137 {
1138 	nodemask_t nodes;
1139 	int err;
1140 	unsigned short mode_flags;
1141 
1142 	mode_flags = mode & MPOL_MODE_FLAGS;
1143 	mode &= ~MPOL_MODE_FLAGS;
1144 	if (mode >= MPOL_MAX)
1145 		return -EINVAL;
1146 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1147 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1148 		return -EINVAL;
1149 	err = get_nodes(&nodes, nmask, maxnode);
1150 	if (err)
1151 		return err;
1152 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1153 }
1154 
1155 /* Set the process memory policy */
1156 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1157 		unsigned long, maxnode)
1158 {
1159 	int err;
1160 	nodemask_t nodes;
1161 	unsigned short flags;
1162 
1163 	flags = mode & MPOL_MODE_FLAGS;
1164 	mode &= ~MPOL_MODE_FLAGS;
1165 	if ((unsigned int)mode >= MPOL_MAX)
1166 		return -EINVAL;
1167 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1168 		return -EINVAL;
1169 	err = get_nodes(&nodes, nmask, maxnode);
1170 	if (err)
1171 		return err;
1172 	return do_set_mempolicy(mode, flags, &nodes);
1173 }
1174 
1175 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1176 		const unsigned long __user *, old_nodes,
1177 		const unsigned long __user *, new_nodes)
1178 {
1179 	const struct cred *cred = current_cred(), *tcred;
1180 	struct mm_struct *mm;
1181 	struct task_struct *task;
1182 	nodemask_t old;
1183 	nodemask_t new;
1184 	nodemask_t task_nodes;
1185 	int err;
1186 
1187 	err = get_nodes(&old, old_nodes, maxnode);
1188 	if (err)
1189 		return err;
1190 
1191 	err = get_nodes(&new, new_nodes, maxnode);
1192 	if (err)
1193 		return err;
1194 
1195 	/* Find the mm_struct */
1196 	read_lock(&tasklist_lock);
1197 	task = pid ? find_task_by_vpid(pid) : current;
1198 	if (!task) {
1199 		read_unlock(&tasklist_lock);
1200 		return -ESRCH;
1201 	}
1202 	mm = get_task_mm(task);
1203 	read_unlock(&tasklist_lock);
1204 
1205 	if (!mm)
1206 		return -EINVAL;
1207 
1208 	/*
1209 	 * Check if this process has the right to modify the specified
1210 	 * process. The right exists if the process has administrative
1211 	 * capabilities, superuser privileges or the same
1212 	 * userid as the target process.
1213 	 */
1214 	rcu_read_lock();
1215 	tcred = __task_cred(task);
1216 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1217 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1218 	    !capable(CAP_SYS_NICE)) {
1219 		rcu_read_unlock();
1220 		err = -EPERM;
1221 		goto out;
1222 	}
1223 	rcu_read_unlock();
1224 
1225 	task_nodes = cpuset_mems_allowed(task);
1226 	/* Is the user allowed to access the target nodes? */
1227 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1228 		err = -EPERM;
1229 		goto out;
1230 	}
1231 
1232 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1233 		err = -EINVAL;
1234 		goto out;
1235 	}
1236 
1237 	err = security_task_movememory(task);
1238 	if (err)
1239 		goto out;
1240 
1241 	err = do_migrate_pages(mm, &old, &new,
1242 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1243 out:
1244 	mmput(mm);
1245 	return err;
1246 }
1247 
1248 
1249 /* Retrieve NUMA policy */
1250 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1251 		unsigned long __user *, nmask, unsigned long, maxnode,
1252 		unsigned long, addr, unsigned long, flags)
1253 {
1254 	int err;
1255 	int uninitialized_var(pval);
1256 	nodemask_t nodes;
1257 
1258 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1259 		return -EINVAL;
1260 
1261 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1262 
1263 	if (err)
1264 		return err;
1265 
1266 	if (policy && put_user(pval, policy))
1267 		return -EFAULT;
1268 
1269 	if (nmask)
1270 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1271 
1272 	return err;
1273 }
1274 
1275 #ifdef CONFIG_COMPAT
1276 
1277 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1278 				     compat_ulong_t __user *nmask,
1279 				     compat_ulong_t maxnode,
1280 				     compat_ulong_t addr, compat_ulong_t flags)
1281 {
1282 	long err;
1283 	unsigned long __user *nm = NULL;
1284 	unsigned long nr_bits, alloc_size;
1285 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1286 
1287 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1288 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1289 
1290 	if (nmask)
1291 		nm = compat_alloc_user_space(alloc_size);
1292 
1293 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1294 
1295 	if (!err && nmask) {
1296 		err = copy_from_user(bm, nm, alloc_size);
1297 		/* ensure entire bitmap is zeroed */
1298 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1299 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1300 	}
1301 
1302 	return err;
1303 }
1304 
1305 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1306 				     compat_ulong_t maxnode)
1307 {
1308 	long err = 0;
1309 	unsigned long __user *nm = NULL;
1310 	unsigned long nr_bits, alloc_size;
1311 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1312 
1313 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1314 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1315 
1316 	if (nmask) {
1317 		err = compat_get_bitmap(bm, nmask, nr_bits);
1318 		nm = compat_alloc_user_space(alloc_size);
1319 		err |= copy_to_user(nm, bm, alloc_size);
1320 	}
1321 
1322 	if (err)
1323 		return -EFAULT;
1324 
1325 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1326 }
1327 
1328 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1329 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1330 			     compat_ulong_t maxnode, compat_ulong_t flags)
1331 {
1332 	long err = 0;
1333 	unsigned long __user *nm = NULL;
1334 	unsigned long nr_bits, alloc_size;
1335 	nodemask_t bm;
1336 
1337 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1338 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1339 
1340 	if (nmask) {
1341 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1342 		nm = compat_alloc_user_space(alloc_size);
1343 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1344 	}
1345 
1346 	if (err)
1347 		return -EFAULT;
1348 
1349 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1350 }
1351 
1352 #endif
1353 
1354 /*
1355  * get_vma_policy(@task, @vma, @addr)
1356  * @task - task for fallback if vma policy == default
1357  * @vma   - virtual memory area whose policy is sought
1358  * @addr  - address in @vma for shared policy lookup
1359  *
1360  * Returns effective policy for a VMA at specified address.
1361  * Falls back to @task or system default policy, as necessary.
1362  * Current or other task's task mempolicy and non-shared vma policies
1363  * are protected by the task's mmap_sem, which must be held for read by
1364  * the caller.
1365  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1366  * count--added by the get_policy() vm_op, as appropriate--to protect against
1367  * freeing by another task.  It is the caller's responsibility to free the
1368  * extra reference for shared policies.
1369  */
1370 static struct mempolicy *get_vma_policy(struct task_struct *task,
1371 		struct vm_area_struct *vma, unsigned long addr)
1372 {
1373 	struct mempolicy *pol = task->mempolicy;
1374 
1375 	if (vma) {
1376 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1377 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1378 									addr);
1379 			if (vpol)
1380 				pol = vpol;
1381 		} else if (vma->vm_policy)
1382 			pol = vma->vm_policy;
1383 	}
1384 	if (!pol)
1385 		pol = &default_policy;
1386 	return pol;
1387 }
1388 
1389 /*
1390  * Return a nodemask representing a mempolicy for filtering nodes for
1391  * page allocation
1392  */
1393 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1394 {
1395 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1396 	if (unlikely(policy->mode == MPOL_BIND) &&
1397 			gfp_zone(gfp) >= policy_zone &&
1398 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1399 		return &policy->v.nodes;
1400 
1401 	return NULL;
1402 }
1403 
1404 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1405 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1406 {
1407 	int nd = numa_node_id();
1408 
1409 	switch (policy->mode) {
1410 	case MPOL_PREFERRED:
1411 		if (!(policy->flags & MPOL_F_LOCAL))
1412 			nd = policy->v.preferred_node;
1413 		break;
1414 	case MPOL_BIND:
1415 		/*
1416 		 * Normally, MPOL_BIND allocations are node-local within the
1417 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1418 		 * current node is part of the mask, we use the zonelist for
1419 		 * the first node in the mask instead.
1420 		 */
1421 		if (unlikely(gfp & __GFP_THISNODE) &&
1422 				unlikely(!node_isset(nd, policy->v.nodes)))
1423 			nd = first_node(policy->v.nodes);
1424 		break;
1425 	case MPOL_INTERLEAVE: /* should not happen */
1426 		break;
1427 	default:
1428 		BUG();
1429 	}
1430 	return node_zonelist(nd, gfp);
1431 }
1432 
1433 /* Do dynamic interleaving for a process */
1434 static unsigned interleave_nodes(struct mempolicy *policy)
1435 {
1436 	unsigned nid, next;
1437 	struct task_struct *me = current;
1438 
1439 	nid = me->il_next;
1440 	next = next_node(nid, policy->v.nodes);
1441 	if (next >= MAX_NUMNODES)
1442 		next = first_node(policy->v.nodes);
1443 	if (next < MAX_NUMNODES)
1444 		me->il_next = next;
1445 	return nid;
1446 }
1447 
1448 /*
1449  * Depending on the memory policy provide a node from which to allocate the
1450  * next slab entry.
1451  * @policy must be protected from freeing by the caller.  If @policy is
1452  * the current task's mempolicy, this protection is implicit, as only the
1453  * task can change its policy.  The system default policy requires no
1454  * such protection.
1455  */
1456 unsigned slab_node(struct mempolicy *policy)
1457 {
1458 	if (!policy || policy->flags & MPOL_F_LOCAL)
1459 		return numa_node_id();
1460 
1461 	switch (policy->mode) {
1462 	case MPOL_PREFERRED:
1463 		/*
1464 		 * handled MPOL_F_LOCAL above
1465 		 */
1466 		return policy->v.preferred_node;
1467 
1468 	case MPOL_INTERLEAVE:
1469 		return interleave_nodes(policy);
1470 
1471 	case MPOL_BIND: {
1472 		/*
1473 		 * Follow bind policy behavior and start allocation at the
1474 		 * first node.
1475 		 */
1476 		struct zonelist *zonelist;
1477 		struct zone *zone;
1478 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1479 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1480 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1481 							&policy->v.nodes,
1482 							&zone);
1483 		return zone->node;
1484 	}
1485 
1486 	default:
1487 		BUG();
1488 	}
1489 }
1490 
1491 /* Do static interleaving for a VMA with known offset. */
1492 static unsigned offset_il_node(struct mempolicy *pol,
1493 		struct vm_area_struct *vma, unsigned long off)
1494 {
1495 	unsigned nnodes = nodes_weight(pol->v.nodes);
1496 	unsigned target;
1497 	int c;
1498 	int nid = -1;
1499 
1500 	if (!nnodes)
1501 		return numa_node_id();
1502 	target = (unsigned int)off % nnodes;
1503 	c = 0;
1504 	do {
1505 		nid = next_node(nid, pol->v.nodes);
1506 		c++;
1507 	} while (c <= target);
1508 	return nid;
1509 }
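/*
 * Worked example (values assumed for illustration): with
 * pol->v.nodes = {1,3,5} (nnodes = 3) and off = 7, target = 7 % 3 = 1,
 * so the loop advances next_node() twice (node 1, then node 3) and the
 * offset is interleaved onto node 3.
 */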
1510 
1511 /* Determine a node number for interleave */
1512 static inline unsigned interleave_nid(struct mempolicy *pol,
1513 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1514 {
1515 	if (vma) {
1516 		unsigned long off;
1517 
1518 		/*
1519 		 * for small pages, there is no difference between
1520 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1521 		 * for huge pages, since vm_pgoff is in units of small
1522 		 * pages, we need to shift off the always 0 bits to get
1523 		 * a useful offset.
1524 		 */
1525 		BUG_ON(shift < PAGE_SHIFT);
1526 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1527 		off += (addr - vma->vm_start) >> shift;
1528 		return offset_il_node(pol, vma, off);
1529 	} else
1530 		return interleave_nodes(pol);
1531 }
1532 
1533 #ifdef CONFIG_HUGETLBFS
1534 /*
1535  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1536  * @vma = virtual memory area whose policy is sought
1537  * @addr = address in @vma for shared policy lookup and interleave policy
1538  * @gfp_flags = for requested zone
1539  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1540  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1541  *
1542  * Returns a zonelist suitable for a huge page allocation and a pointer
1543  * to the struct mempolicy for conditional unref after allocation.
1544  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1545  * @nodemask for filtering the zonelist.
1546  */
1547 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1548 				gfp_t gfp_flags, struct mempolicy **mpol,
1549 				nodemask_t **nodemask)
1550 {
1551 	struct zonelist *zl;
1552 
1553 	*mpol = get_vma_policy(current, vma, addr);
1554 	*nodemask = NULL;	/* assume !MPOL_BIND */
1555 
1556 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1557 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1558 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1559 	} else {
1560 		zl = policy_zonelist(gfp_flags, *mpol);
1561 		if ((*mpol)->mode == MPOL_BIND)
1562 			*nodemask = &(*mpol)->v.nodes;
1563 	}
1564 	return zl;
1565 }
1566 
1567 /*
1568  * init_nodemask_of_mempolicy
1569  *
1570  * If the current task's mempolicy is "default" [NULL], return 'false'
1571  * to indicate default policy.  Otherwise, extract the policy nodemask
1572  * for 'bind' or 'interleave' policy into the argument nodemask, or
1573  * initialize the argument nodemask to contain the single node for
1574  * 'preferred' or 'local' policy and return 'true' to indicate presence
1575  * of non-default mempolicy.
1576  *
1577  * We don't bother with reference counting the mempolicy [mpol_get/put]
1578  * because the current task is examining its own mempolicy and a task's
1579  * mempolicy is only ever changed by the task itself.
1580  *
1581  * N.B., it is the caller's responsibility to free a returned nodemask.
1582  */
1583 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1584 {
1585 	struct mempolicy *mempolicy;
1586 	int nid;
1587 
1588 	if (!(mask && current->mempolicy))
1589 		return false;
1590 
1591 	mempolicy = current->mempolicy;
1592 	switch (mempolicy->mode) {
1593 	case MPOL_PREFERRED:
1594 		if (mempolicy->flags & MPOL_F_LOCAL)
1595 			nid = numa_node_id();
1596 		else
1597 			nid = mempolicy->v.preferred_node;
1598 		init_nodemask_of_node(mask, nid);
1599 		break;
1600 
1601 	case MPOL_BIND:
1602 		/* Fall through */
1603 	case MPOL_INTERLEAVE:
1604 		*mask =  mempolicy->v.nodes;
1605 		break;
1606 
1607 	default:
1608 		BUG();
1609 	}
1610 
1611 	return true;
1612 }
1613 #endif
1614 
1615 /* Allocate a page in interleaved policy.
1616    Own path because it needs to do special accounting. */
1617 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1618 					unsigned nid)
1619 {
1620 	struct zonelist *zl;
1621 	struct page *page;
1622 
1623 	zl = node_zonelist(nid, gfp);
1624 	page = __alloc_pages(gfp, order, zl);
1625 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1626 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1627 	return page;
1628 }
1629 
1630 /**
1631  * 	alloc_page_vma	- Allocate a page for a VMA.
1632  *
1633  * 	@gfp:
1634  *      %GFP_USER    user allocation.
1635  *      %GFP_KERNEL  kernel allocations,
1636  *      %GFP_HIGHMEM highmem/user allocations,
1637  *      %GFP_FS      allocation should not call back into a file system.
1638  *      %GFP_ATOMIC  don't sleep.
1639  *
1640  * 	@vma:  Pointer to VMA or NULL if not available.
1641  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1642  *
1643  * 	This function allocates a page from the kernel page pool and applies
1644  *	a NUMA policy associated with the VMA or the current process.
1645  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1646  *	mm_struct of the VMA to prevent it from going away. Should be used for
1647  *	all allocations for pages that will be mapped into
1648  * 	user space. Returns NULL when no page can be allocated.
1649  *
1650  *	Should be called with the mmap_sem of the vma held.
1651  */
1652 struct page *
1653 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1654 {
1655 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1656 	struct zonelist *zl;
1657 
1658 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1659 		unsigned nid;
1660 
1661 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1662 		mpol_cond_put(pol);
1663 		return alloc_page_interleave(gfp, 0, nid);
1664 	}
1665 	zl = policy_zonelist(gfp, pol);
1666 	if (unlikely(mpol_needs_cond_ref(pol))) {
1667 		/*
1668 		 * slow path: ref counted shared policy
1669 		 */
1670 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1671 						zl, policy_nodemask(gfp, pol));
1672 		__mpol_put(pol);
1673 		return page;
1674 	}
1675 	/*
1676 	 * fast path:  default or task policy
1677 	 */
1678 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1679 }
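/*
 * Example caller: page migration's new_vma_page() above allocates the
 * replacement page with
 *
 *	alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * so that the new page location still honours the VMA's interleave,
 * bind or preferred policy.
 */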
1680 
1681 /**
1682  * 	alloc_pages_current - Allocate pages.
1683  *
1684  *	@gfp:
1685  *		%GFP_USER   user allocation,
1686  *      	%GFP_KERNEL kernel allocation,
1687  *      	%GFP_HIGHMEM highmem allocation,
1688  *      	%GFP_FS     don't call back into a file system.
1689  *      	%GFP_ATOMIC don't sleep.
1690  *	@order: Power of two of allocation size in pages. 0 is a single page.
1691  *
1692  *	Allocate a page from the kernel page pool and, when not in
1693  *	interrupt context, apply the current process' NUMA policy.
1694  *	Returns NULL when no page can be allocated.
1695  *
1696  *	Don't call cpuset_update_task_memory_state() unless
1697  *	1) it's ok to take cpuset_sem (can WAIT), and
1698  *	2) allocating for current task (not interrupt).
1699  */
1700 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1701 {
1702 	struct mempolicy *pol = current->mempolicy;
1703 
1704 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1705 		pol = &default_policy;
1706 
1707 	/*
1708 	 * No reference counting needed for current->mempolicy
1709 	 * nor system default_policy
1710 	 */
1711 	if (pol->mode == MPOL_INTERLEAVE)
1712 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1713 	return __alloc_pages_nodemask(gfp, order,
1714 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1715 }
1716 EXPORT_SYMBOL(alloc_pages_current);
1717 
1718 /*
1719  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1720  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1721  * with the mems_allowed returned by cpuset_mems_allowed().  This
1722  * keeps mempolicies cpuset relative after its cpuset moves.  See
1723  * further kernel/cpuset.c update_nodemask().
1724  */
1725 
1726 /* Slow path of a mempolicy duplicate */
1727 struct mempolicy *__mpol_dup(struct mempolicy *old)
1728 {
1729 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1730 
1731 	if (!new)
1732 		return ERR_PTR(-ENOMEM);
1733 	if (current_cpuset_is_being_rebound()) {
1734 		nodemask_t mems = cpuset_mems_allowed(current);
1735 		mpol_rebind_policy(old, &mems);
1736 	}
1737 	*new = *old;
1738 	atomic_set(&new->refcnt, 1);
1739 	return new;
1740 }
1741 
1742 /*
1743  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1744  * eliminate the MPOL_F_* flags that require conditional ref and
1745  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1746  * after return.  Use the returned value.
1747  *
1748  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1749  * policy lookup, even if the policy needs/has extra ref on lookup.
1750  * shmem_readahead needs this.
1751  */
1752 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1753 						struct mempolicy *frompol)
1754 {
1755 	if (!mpol_needs_cond_ref(frompol))
1756 		return frompol;
1757 
1758 	*tompol = *frompol;
1759 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1760 	__mpol_put(frompol);
1761 	return tompol;
1762 }
1763 
1764 static int mpol_match_intent(const struct mempolicy *a,
1765 			     const struct mempolicy *b)
1766 {
1767 	if (a->flags != b->flags)
1768 		return 0;
1769 	if (!mpol_store_user_nodemask(a))
1770 		return 1;
1771 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1772 }
1773 
1774 /* Slow path of a mempolicy comparison */
1775 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1776 {
1777 	if (!a || !b)
1778 		return 0;
1779 	if (a->mode != b->mode)
1780 		return 0;
1781 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1782 		return 0;
1783 	switch (a->mode) {
1784 	case MPOL_BIND:
1785 		/* Fall through */
1786 	case MPOL_INTERLEAVE:
1787 		return nodes_equal(a->v.nodes, b->v.nodes);
1788 	case MPOL_PREFERRED:
1789 		return a->v.preferred_node == b->v.preferred_node &&
1790 			a->flags == b->flags;
1791 	default:
1792 		BUG();
1793 		return 0;
1794 	}
1795 }
1796 
1797 /*
1798  * Shared memory backing store policy support.
1799  *
1800  * Remember policies even when nobody has shared memory mapped.
1801  * The policies are kept in Red-Black tree linked from the inode.
1802  * They are protected by the sp->lock spinlock, which should be held
1803  * for any accesses to the tree.
1804  */
1805 
1806 /* lookup first element intersecting start-end */
1807 /* Caller holds sp->lock */
1808 static struct sp_node *
1809 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1810 {
1811 	struct rb_node *n = sp->root.rb_node;
1812 
1813 	while (n) {
1814 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1815 
1816 		if (start >= p->end)
1817 			n = n->rb_right;
1818 		else if (end <= p->start)
1819 			n = n->rb_left;
1820 		else
1821 			break;
1822 	}
1823 	if (!n)
1824 		return NULL;
1825 	for (;;) {
1826 		struct sp_node *w = NULL;
1827 		struct rb_node *prev = rb_prev(n);
1828 		if (!prev)
1829 			break;
1830 		w = rb_entry(prev, struct sp_node, nd);
1831 		if (w->end <= start)
1832 			break;
1833 		n = prev;
1834 	}
1835 	return rb_entry(n, struct sp_node, nd);
1836 }
1837 
1838 /* Insert a new shared policy into the list. */
1839 /* Caller holds sp->lock */
1840 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1841 {
1842 	struct rb_node **p = &sp->root.rb_node;
1843 	struct rb_node *parent = NULL;
1844 	struct sp_node *nd;
1845 
1846 	while (*p) {
1847 		parent = *p;
1848 		nd = rb_entry(parent, struct sp_node, nd);
1849 		if (new->start < nd->start)
1850 			p = &(*p)->rb_left;
1851 		else if (new->end > nd->end)
1852 			p = &(*p)->rb_right;
1853 		else
1854 			BUG();
1855 	}
1856 	rb_link_node(&new->nd, parent, p);
1857 	rb_insert_color(&new->nd, &sp->root);
1858 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1859 		 new->policy ? new->policy->mode : 0);
1860 }
1861 
1862 /* Find shared policy intersecting idx */
1863 struct mempolicy *
1864 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1865 {
1866 	struct mempolicy *pol = NULL;
1867 	struct sp_node *sn;
1868 
1869 	if (!sp->root.rb_node)
1870 		return NULL;
1871 	spin_lock(&sp->lock);
1872 	sn = sp_lookup(sp, idx, idx+1);
1873 	if (sn) {
1874 		mpol_get(sn->policy);
1875 		pol = sn->policy;
1876 	}
1877 	spin_unlock(&sp->lock);
1878 	return pol;
1879 }
1880 
1881 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1882 {
1883 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1884 	rb_erase(&n->nd, &sp->root);
1885 	mpol_put(n->policy);
1886 	kmem_cache_free(sn_cache, n);
1887 }
1888 
1889 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1890 				struct mempolicy *pol)
1891 {
1892 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1893 
1894 	if (!n)
1895 		return NULL;
1896 	n->start = start;
1897 	n->end = end;
1898 	mpol_get(pol);
1899 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1900 	n->policy = pol;
1901 	return n;
1902 }
1903 
1904 /* Replace a policy range. */
1905 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1906 				 unsigned long end, struct sp_node *new)
1907 {
1908 	struct sp_node *n, *new2 = NULL;
1909 
1910 restart:
1911 	spin_lock(&sp->lock);
1912 	n = sp_lookup(sp, start, end);
1913 	/* Take care of old policies in the same range. */
1914 	while (n && n->start < end) {
1915 		struct rb_node *next = rb_next(&n->nd);
1916 		if (n->start >= start) {
1917 			if (n->end <= end)
1918 				sp_delete(sp, n);
1919 			else
1920 				n->start = end;
1921 		} else {
1922 			/* Old policy spanning whole new range. */
1923 			if (n->end > end) {
1924 				if (!new2) {
1925 					spin_unlock(&sp->lock);
1926 					new2 = sp_alloc(end, n->end, n->policy);
1927 					if (!new2)
1928 						return -ENOMEM;
1929 					goto restart;
1930 				}
1931 				n->end = start;
1932 				sp_insert(sp, new2);
1933 				new2 = NULL;
1934 				break;
1935 			} else
1936 				n->end = start;
1937 		}
1938 		if (!next)
1939 			break;
1940 		n = rb_entry(next, struct sp_node, nd);
1941 	}
1942 	if (new)
1943 		sp_insert(sp, new);
1944 	spin_unlock(&sp->lock);
1945 	if (new2) {
1946 		mpol_put(new2->policy);
1947 		kmem_cache_free(sn_cache, new2);
1948 	}
1949 	return 0;
1950 }
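
/*
 * Worked example (illustrative only, not part of the original file):
 * if the tree maps [0, 8) to policy P and we install policy Q over
 * [2, 4), the loop above trims the old node to [0, 2), allocates a
 * "new2" node [4, 8) that keeps P, and then inserts the new [2, 4)
 * node carrying Q.  example_split_demo() is a hypothetical driver of
 * that sequence; its error handling mirrors mpol_set_shared_policy()
 * below.
 */
static int example_split_demo(struct shared_policy *sp,
			      struct mempolicy *p, struct mempolicy *q)
{
	struct sp_node *n;
	int err;

	n = sp_alloc(0, 8, p);
	if (!n)
		return -ENOMEM;
	err = shared_policy_replace(sp, 0, 8, n);	/* tree: [0,8)->P */
	if (err) {
		kmem_cache_free(sn_cache, n);
		return err;
	}

	n = sp_alloc(2, 4, q);
	if (!n)
		return -ENOMEM;
	err = shared_policy_replace(sp, 2, 4, n);	/* [0,2)->P [2,4)->Q [4,8)->P */
	if (err)
		kmem_cache_free(sn_cache, n);
	return err;
}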
1951 
1952 /**
1953  * mpol_shared_policy_init - initialize shared policy for inode
1954  * @sp: pointer to inode shared policy
1955  * @mpol:  struct mempolicy to install
1956  *
1957  * Install non-NULL @mpol in inode's shared policy rb-tree.
1958  * On entry, the current task has a reference on a non-NULL @mpol.
1959  * This must be released on exit.
1960  * This is called during get_inode() calls, so we can use GFP_KERNEL.
1961  */
1962 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1963 {
1964 	int ret;
1965 
1966 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1967 	spin_lock_init(&sp->lock);
1968 
1969 	if (mpol) {
1970 		struct vm_area_struct pvma;
1971 		struct mempolicy *new;
1972 		NODEMASK_SCRATCH(scratch);
1973 
1974 		if (!scratch)
1975 			return;
1976 		/* contextualize the tmpfs mount point mempolicy */
1977 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1978 		if (IS_ERR(new)) {
1979 			mpol_put(mpol);	/* drop our ref on sb mpol */
1980 			NODEMASK_SCRATCH_FREE(scratch);
1981 			return;		/* no valid nodemask intersection */
1982 		}
1983 
1984 		task_lock(current);
1985 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1986 		task_unlock(current);
1987 		mpol_put(mpol);	/* drop our ref on sb mpol */
1988 		if (ret) {
1989 			NODEMASK_SCRATCH_FREE(scratch);
1990 			mpol_put(new);
1991 			return;
1992 		}
1993 
1994 		/* Create pseudo-vma that contains just the policy */
1995 		memset(&pvma, 0, sizeof(struct vm_area_struct));
1996 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1997 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1998 		mpol_put(new);			/* drop initial ref */
1999 		NODEMASK_SCRATCH_FREE(scratch);
2000 	}
2001 }
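
/*
 * Illustrative sketch (not part of the original file), loosely modeled
 * on tmpfs inode creation: the caller takes a reference on the mount's
 * mempolicy and hands it to mpol_shared_policy_init(), which, as
 * documented above, releases that reference before returning.
 * example_init_inode_policy() and the sbmpol argument are hypothetical.
 */
static void example_init_inode_policy(struct shared_policy *sp,
				      struct mempolicy *sbmpol)
{
	mpol_get(sbmpol);	/* ref handed over; mpol_get(NULL) is a no-op */
	mpol_shared_policy_init(sp, sbmpol);
}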
2002 
2003 int mpol_set_shared_policy(struct shared_policy *info,
2004 			struct vm_area_struct *vma, struct mempolicy *npol)
2005 {
2006 	int err;
2007 	struct sp_node *new = NULL;
2008 	unsigned long sz = vma_pages(vma);
2009 
2010 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2011 		 vma->vm_pgoff,
2012 		 sz, npol ? npol->mode : -1,
2013 		 npol ? npol->flags : -1,
2014 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2015 
2016 	if (npol) {
2017 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2018 		if (!new)
2019 			return -ENOMEM;
2020 	}
2021 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2022 	if (err && new)
2023 		kmem_cache_free(sn_cache, new);
2024 	return err;
2025 }
2026 
2027 /* Free a backing policy store on inode delete. */
2028 void mpol_free_shared_policy(struct shared_policy *p)
2029 {
2030 	struct sp_node *n;
2031 	struct rb_node *next;
2032 
2033 	if (!p->root.rb_node)
2034 		return;
2035 	spin_lock(&p->lock);
2036 	next = rb_first(&p->root);
2037 	while (next) {
2038 		n = rb_entry(next, struct sp_node, nd);
2039 		next = rb_next(&n->nd);
2040 		rb_erase(&n->nd, &p->root);
2041 		mpol_put(n->policy);
2042 		kmem_cache_free(sn_cache, n);
2043 	}
2044 	spin_unlock(&p->lock);
2045 }
2046 
2047 /* assumes fs == KERNEL_DS */
2048 void __init numa_policy_init(void)
2049 {
2050 	nodemask_t interleave_nodes;
2051 	unsigned long largest = 0;
2052 	int nid, prefer = 0;
2053 
2054 	policy_cache = kmem_cache_create("numa_policy",
2055 					 sizeof(struct mempolicy),
2056 					 0, SLAB_PANIC, NULL);
2057 
2058 	sn_cache = kmem_cache_create("shared_policy_node",
2059 				     sizeof(struct sp_node),
2060 				     0, SLAB_PANIC, NULL);
2061 
2062 	/*
2063 	 * Set interleaving policy for system init. Interleaving is only
2064 	 * enabled across suitably sized nodes (default is >= 16MB), or
2065 	 * fall back to the largest node if they're all smaller.
2066 	 */
2067 	nodes_clear(interleave_nodes);
2068 	for_each_node_state(nid, N_HIGH_MEMORY) {
2069 		unsigned long total_pages = node_present_pages(nid);
2070 
2071 		/* Preserve the largest node */
2072 		if (largest < total_pages) {
2073 			largest = total_pages;
2074 			prefer = nid;
2075 		}
2076 
2077 		/* Interleave this node? */
2078 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2079 			node_set(nid, interleave_nodes);
2080 	}
2081 
2082 	/* All too small, use the largest */
2083 	if (unlikely(nodes_empty(interleave_nodes)))
2084 		node_set(prefer, interleave_nodes);
2085 
2086 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2087 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2088 }
2089 
2090 /* Reset policy of current process to default */
2091 void numa_default_policy(void)
2092 {
2093 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2094 }
2095 
2096 /*
2097  * Parse and format mempolicy from/to strings
2098  */
2099 
2100 /*
2101  * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag
2102  * Used only for mpol_parse_str() and mpol_to_str()
2103  */
2104 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2105 static const char * const policy_types[] =
2106 	{ "default", "prefer", "bind", "interleave", "local" };
2107 
2108 
2109 #ifdef CONFIG_TMPFS
2110 /**
2111  * mpol_parse_str - parse string to mempolicy
2112  * @str:  string containing mempolicy to parse
2113  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2114  * @no_context:  flag whether to "contextualize" the mempolicy
2115  *
2116  * Format of input:
2117  *	<mode>[=<flags>][:<nodelist>]
2118  *
2119  * If @no_context is true, save the input nodemask in w.user_nodemask in
2120  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2121  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2122  * mount option.  Note that if 'static' or 'relative' mode flags were
2123  * specified, the input nodemask will already have been saved.  Saving
2124  * it again is redundant, but safe.
2125  *
2126  * On success, returns 0, else 1
2127  */
2128 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2129 {
2130 	struct mempolicy *new = NULL;
2131 	unsigned short uninitialized_var(mode);
2132 	unsigned short uninitialized_var(mode_flags);
2133 	nodemask_t nodes;
2134 	char *nodelist = strchr(str, ':');
2135 	char *flags = strchr(str, '=');
2136 	int i;
2137 	int err = 1;
2138 
2139 	if (nodelist) {
2140 		/* NUL-terminate mode or flags string */
2141 		*nodelist++ = '\0';
2142 		if (nodelist_parse(nodelist, nodes))
2143 			goto out;
2144 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2145 			goto out;
2146 	} else
2147 		nodes_clear(nodes);
2148 
2149 	if (flags)
2150 		*flags++ = '\0';	/* terminate mode string */
2151 
2152 	for (i = 0; i <= MPOL_LOCAL; i++) {
2153 		if (!strcmp(str, policy_types[i])) {
2154 			mode = i;
2155 			break;
2156 		}
2157 	}
2158 	if (i > MPOL_LOCAL)
2159 		goto out;
2160 
2161 	switch (mode) {
2162 	case MPOL_PREFERRED:
2163 		/*
2164 		 * Insist on a nodelist of one node only
2165 		 */
2166 		if (nodelist) {
2167 			char *rest = nodelist;
2168 			while (isdigit(*rest))
2169 				rest++;
2170 			if (!*rest)
2171 				err = 0;
2172 		}
2173 		break;
2174 	case MPOL_INTERLEAVE:
2175 		/*
2176 		 * Default to online nodes with memory if no nodelist
2177 		 */
2178 		if (!nodelist)
2179 			nodes = node_states[N_HIGH_MEMORY];
2180 		err = 0;
2181 		break;
2182 	case MPOL_LOCAL:
2183 		/*
2184 		 * Don't allow a nodelist;  mpol_new() checks flags
2185 		 */
2186 		if (nodelist)
2187 			goto out;
2188 		mode = MPOL_PREFERRED;
2189 		break;
2190 
2191 	/*
2192 	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2193 	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2194 	 */
2195 	}
2196 
2197 	mode_flags = 0;
2198 	if (flags) {
2199 		/*
2200 		 * Currently, we only support two mutually exclusive
2201 		 * mode flags.
2202 		 */
2203 		if (!strcmp(flags, "static"))
2204 			mode_flags |= MPOL_F_STATIC_NODES;
2205 		else if (!strcmp(flags, "relative"))
2206 			mode_flags |= MPOL_F_RELATIVE_NODES;
2207 		else
2208 			err = 1;
2209 	}
2210 
2211 	new = mpol_new(mode, mode_flags, &nodes);
2212 	if (IS_ERR(new))
2213 		err = 1;
2214 	else {
2215 		int ret;
2216 		NODEMASK_SCRATCH(scratch);
2217 		if (scratch) {
2218 			task_lock(current);
2219 			ret = mpol_set_nodemask(new, &nodes, scratch);
2220 			task_unlock(current);
2221 		} else
2222 			ret = -ENOMEM;
2223 		NODEMASK_SCRATCH_FREE(scratch);
2224 		if (ret) {
2225 			err = 1;
2226 			mpol_put(new);
2227 		} else if (no_context) {
2228 			/* save for contextualization */
2229 			new->w.user_nodemask = nodes;
2230 		}
2231 	}
2232 
2233 out:
2234 	/* Restore string for error message */
2235 	if (nodelist)
2236 		*--nodelist = ':';
2237 	if (flags)
2238 		*--flags = '=';
2239 	if (!err)
2240 		*mpol = new;
2241 	return err;
2242 }
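
/*
 * Illustrative sketch (not part of the original file): parsing a
 * tmpfs-style mount option such as "interleave=static:0-3".  With
 * @no_context true, as in the tmpfs mount path, the result is an
 * MPOL_INTERLEAVE policy with MPOL_F_STATIC_NODES set and nodes 0-3
 * recorded in w.user_nodemask for later contextualization (assuming
 * nodes 0-3 exist, have memory and are allowed by the caller's cpuset).
 * example_parse_mount_opt() is a hypothetical name.
 */
static struct mempolicy *example_parse_mount_opt(void)
{
	char buf[] = "interleave=static:0-3";	/* parsed in place, then restored */
	struct mempolicy *pol = NULL;

	if (mpol_parse_str(buf, &pol, 1))
		return NULL;	/* parse error */
	return pol;		/* caller owns a reference; mpol_put() when done */
}
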
2243 #endif /* CONFIG_TMPFS */
2244 
2245 /**
2246  * mpol_to_str - format a mempolicy structure for printing
2247  * @buffer:  to contain formatted mempolicy string
2248  * @maxlen:  length of @buffer
2249  * @pol:  pointer to mempolicy to be formatted
2250  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2251  *
2252  * Convert a mempolicy into a string.
2253  * Returns the number of characters in buffer (if positive)
2254  * or an error (negative)
2255  */
2256 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2257 {
2258 	char *p = buffer;
2259 	int l;
2260 	nodemask_t nodes;
2261 	unsigned short mode;
2262 	unsigned short flags = pol ? pol->flags : 0;
2263 
2264 	/*
2265 	 * Sanity check:  room for longest mode, flag and some nodes
2266 	 */
2267 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2268 
2269 	if (!pol || pol == &default_policy)
2270 		mode = MPOL_DEFAULT;
2271 	else
2272 		mode = pol->mode;
2273 
2274 	switch (mode) {
2275 	case MPOL_DEFAULT:
2276 		nodes_clear(nodes);
2277 		break;
2278 
2279 	case MPOL_PREFERRED:
2280 		nodes_clear(nodes);
2281 		if (flags & MPOL_F_LOCAL)
2282 			mode = MPOL_LOCAL;	/* pseudo-policy */
2283 		else
2284 			node_set(pol->v.preferred_node, nodes);
2285 		break;
2286 
2287 	case MPOL_BIND:
2288 		/* Fall through */
2289 	case MPOL_INTERLEAVE:
2290 		if (no_context)
2291 			nodes = pol->w.user_nodemask;
2292 		else
2293 			nodes = pol->v.nodes;
2294 		break;
2295 
2296 	default:
2297 		BUG();
2298 	}
2299 
2300 	l = strlen(policy_types[mode]);
2301 	if (buffer + maxlen < p + l + 1)
2302 		return -ENOSPC;
2303 
2304 	strcpy(p, policy_types[mode]);
2305 	p += l;
2306 
2307 	if (flags & MPOL_MODE_FLAGS) {
2308 		if (buffer + maxlen < p + 2)
2309 			return -ENOSPC;
2310 		*p++ = '=';
2311 
2312 		/*
2313 		 * Currently, the only defined flags are mutually exclusive
2314 		 */
2315 		if (flags & MPOL_F_STATIC_NODES)
2316 			p += snprintf(p, buffer + maxlen - p, "static");
2317 		else if (flags & MPOL_F_RELATIVE_NODES)
2318 			p += snprintf(p, buffer + maxlen - p, "relative");
2319 	}
2320 
2321 	if (!nodes_empty(nodes)) {
2322 		if (buffer + maxlen < p + 2)
2323 			return -ENOSPC;
2324 		*p++ = ':';
2325 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2326 	}
2327 	return p - buffer;
2328 }
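
/*
 * Illustrative sketch (not part of the original file), loosely modeled
 * on how tmpfs prints its mount options: with @no_context true the
 * saved w.user_nodemask is formatted, so a policy parsed from
 * "interleave=static:0-3" round-trips back to the same string.
 * example_show_mpol() is a hypothetical name; the 64-byte buffer
 * satisfies the sanity check at the top of mpol_to_str().
 */
static void example_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* nothing worth showing */

	mpol_to_str(buffer, sizeof(buffer), mpol, 1);
	seq_printf(seq, ",mpol=%s", buffer);
}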
2329 
2330 struct numa_maps {
2331 	unsigned long pages;
2332 	unsigned long anon;
2333 	unsigned long active;
2334 	unsigned long writeback;
2335 	unsigned long mapcount_max;
2336 	unsigned long dirty;
2337 	unsigned long swapcache;
2338 	unsigned long node[MAX_NUMNODES];
2339 };
2340 
2341 static void gather_stats(struct page *page, void *private, int pte_dirty)
2342 {
2343 	struct numa_maps *md = private;
2344 	int count = page_mapcount(page);
2345 
2346 	md->pages++;
2347 	if (pte_dirty || PageDirty(page))
2348 		md->dirty++;
2349 
2350 	if (PageSwapCache(page))
2351 		md->swapcache++;
2352 
2353 	if (PageActive(page) || PageUnevictable(page))
2354 		md->active++;
2355 
2356 	if (PageWriteback(page))
2357 		md->writeback++;
2358 
2359 	if (PageAnon(page))
2360 		md->anon++;
2361 
2362 	if (count > md->mapcount_max)
2363 		md->mapcount_max = count;
2364 
2365 	md->node[page_to_nid(page)]++;
2366 }
2367 
2368 #ifdef CONFIG_HUGETLB_PAGE
2369 static void check_huge_range(struct vm_area_struct *vma,
2370 		unsigned long start, unsigned long end,
2371 		struct numa_maps *md)
2372 {
2373 	unsigned long addr;
2374 	struct page *page;
2375 	struct hstate *h = hstate_vma(vma);
2376 	unsigned long sz = huge_page_size(h);
2377 
2378 	for (addr = start; addr < end; addr += sz) {
2379 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2380 						addr & huge_page_mask(h));
2381 		pte_t pte;
2382 
2383 		if (!ptep)
2384 			continue;
2385 
2386 		pte = *ptep;
2387 		if (pte_none(pte))
2388 			continue;
2389 
2390 		page = pte_page(pte);
2391 		if (!page)
2392 			continue;
2393 
2394 		gather_stats(page, md, pte_dirty(*ptep));
2395 	}
2396 }
2397 #else
2398 static inline void check_huge_range(struct vm_area_struct *vma,
2399 		unsigned long start, unsigned long end,
2400 		struct numa_maps *md)
2401 {
2402 }
2403 #endif
2404 
2405 /*
2406  * Display pages allocated per node and memory policy via /proc.
2407  */
2408 int show_numa_map(struct seq_file *m, void *v)
2409 {
2410 	struct proc_maps_private *priv = m->private;
2411 	struct vm_area_struct *vma = v;
2412 	struct numa_maps *md;
2413 	struct file *file = vma->vm_file;
2414 	struct mm_struct *mm = vma->vm_mm;
2415 	struct mempolicy *pol;
2416 	int n;
2417 	char buffer[50];
2418 
2419 	if (!mm)
2420 		return 0;
2421 
2422 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2423 	if (!md)
2424 		return 0;
2425 
2426 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2427 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2428 	mpol_cond_put(pol);
2429 
2430 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2431 
2432 	if (file) {
2433 		seq_printf(m, " file=");
2434 		seq_path(m, &file->f_path, "\n\t= ");
2435 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2436 		seq_printf(m, " heap");
2437 	} else if (vma->vm_start <= mm->start_stack &&
2438 			vma->vm_end >= mm->start_stack) {
2439 		seq_printf(m, " stack");
2440 	}
2441 
2442 	if (is_vm_hugetlb_page(vma)) {
2443 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2444 		seq_printf(m, " huge");
2445 	} else {
2446 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2447 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2448 	}
2449 
2450 	if (!md->pages)
2451 		goto out;
2452 
2453 	if (md->anon)
2454 		seq_printf(m, " anon=%lu", md->anon);
2455 
2456 	if (md->dirty)
2457 		seq_printf(m, " dirty=%lu", md->dirty);
2458 
2459 	if (md->pages != md->anon && md->pages != md->dirty)
2460 		seq_printf(m, " mapped=%lu", md->pages);
2461 
2462 	if (md->mapcount_max > 1)
2463 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2464 
2465 	if (md->swapcache)
2466 		seq_printf(m, " swapcache=%lu", md->swapcache);
2467 
2468 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2469 		seq_printf(m, " active=%lu", md->active);
2470 
2471 	if (md->writeback)
2472 		seq_printf(m, " writeback=%lu", md->writeback);
2473 
2474 	for_each_node_state(n, N_HIGH_MEMORY)
2475 		if (md->node[n])
2476 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2477 out:
2478 	seq_putc(m, '\n');
2479 	kfree(md);
2480 
2481 	if (m->count < m->size)
2482 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2483 	return 0;
2484 }
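
/*
 * Example output (schematic; the values are invented for illustration):
 * each /proc/<pid>/numa_maps line produced above looks roughly like
 *
 *	00400000 default file=/bin/cat mapped=9 mapmax=2 N0=9
 *	7f3a2c000000 interleave=static:0-3 anon=16 dirty=16 N0=4 N1=4 N2=4 N3=4
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(), an
 * optional file/heap/stack/huge tag, whichever counters gathered above
 * are non-zero, and a per-node page count for every node holding pages.
 */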
2485