xref: /openbmc/linux/mm/mempolicy.c (revision f77f13e2)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about the node(s) on which
9  * memory should be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
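/*
 * Illustrative userspace sketch of the policies described above (editorial
 * example, not part of this file; assumes the set_mempolicy()/mbind()
 * wrappers from <numaif.h> are available):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	// interleave all future allocations of this task over nodes 0 and 1
 *	unsigned long il_mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &il_mask, sizeof(il_mask) * 8 + 1))
 *		perror("set_mempolicy");
 *
 *	// bind one anonymous mapping to node 0 only
 *	void *buf = mmap(NULL, 1UL << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long bind_mask = 1UL << 0;
 *	if (mbind(buf, 1UL << 20, MPOL_BIND, &bind_mask,
 *		  sizeof(bind_mask) * 8 + 1, 0))
 *		perror("mbind");
 */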
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/gfp.h>
77 #include <linux/slab.h>
78 #include <linux/string.h>
79 #include <linux/module.h>
80 #include <linux/nsproxy.h>
81 #include <linux/interrupt.h>
82 #include <linux/init.h>
83 #include <linux/compat.h>
84 #include <linux/swap.h>
85 #include <linux/seq_file.h>
86 #include <linux/proc_fs.h>
87 #include <linux/migrate.h>
88 #include <linux/ksm.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 #include <linux/ctype.h>
93 #include <linux/mm_inline.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 
98 #include "internal.h"
99 
100 /* Internal flags */
101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
103 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static const struct mempolicy_operations {
122 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
123 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
124 } mpol_ops[MPOL_MAX];
125 
126 /* Check that the nodemask contains at least one populated zone */
127 static int is_valid_nodemask(const nodemask_t *nodemask)
128 {
129 	int nd, k;
130 
131 	/* Check that there is something useful in this mask */
132 	k = policy_zone;
133 
134 	for_each_node_mask(nd, *nodemask) {
135 		struct zone *z;
136 
137 		for (k = 0; k <= policy_zone; k++) {
138 			z = &NODE_DATA(nd)->node_zones[k];
139 			if (z->present_pages > 0)
140 				return 1;
141 		}
142 	}
143 
144 	return 0;
145 }
146 
147 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
148 {
149 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
150 }
151 
152 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
153 				   const nodemask_t *rel)
154 {
155 	nodemask_t tmp;
156 	nodes_fold(tmp, *orig, nodes_weight(*rel));
157 	nodes_onto(*ret, tmp, *rel);
158 }
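/*
 * Worked example of the relative-nodes remapping above (editorial note,
 * based on the documented nodes_fold()/nodes_onto() semantics): if the user
 * asked for relative nodes {0, 2} and the allowed set is {4, 5, 6} (weight
 * 3), nodes_fold() leaves {0, 2} unchanged (both < 3) and nodes_onto() maps
 * bit n to the n-th set bit of the allowed set, giving {4, 6}.  A relative
 * node >= 3, e.g. 5, is first folded to 5 % 3 = 2 and so also lands on
 * node 6.
 */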
159 
160 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
161 {
162 	if (nodes_empty(*nodes))
163 		return -EINVAL;
164 	pol->v.nodes = *nodes;
165 	return 0;
166 }
167 
168 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
169 {
170 	if (!nodes)
171 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
172 	else if (nodes_empty(*nodes))
173 		return -EINVAL;			/*  no allowed nodes */
174 	else
175 		pol->v.preferred_node = first_node(*nodes);
176 	return 0;
177 }
178 
179 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (!is_valid_nodemask(nodes))
182 		return -EINVAL;
183 	pol->v.nodes = *nodes;
184 	return 0;
185 }
186 
187 /*
188  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
189  * any, for the new policy.  mpol_new() has already validated the nodes
190  * parameter with respect to the policy mode and flags.  But, we need to
191  * handle an empty nodemask with MPOL_PREFERRED here.
192  *
193  * Must be called holding task's alloc_lock to protect task's mems_allowed
194  * and mempolicy.  May also be called holding the mmap_semaphore for write.
195  */
196 static int mpol_set_nodemask(struct mempolicy *pol,
197 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
198 {
199 	int ret;
200 
201 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
202 	if (pol == NULL)
203 		return 0;
204 	/* Check N_HIGH_MEMORY */
205 	nodes_and(nsc->mask1,
206 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
207 
208 	VM_BUG_ON(!nodes);
209 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
210 		nodes = NULL;	/* explicit local allocation */
211 	else {
212 		if (pol->flags & MPOL_F_RELATIVE_NODES)
213 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
214 		else
215 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
216 
217 		if (mpol_store_user_nodemask(pol))
218 			pol->w.user_nodemask = *nodes;
219 		else
220 			pol->w.cpuset_mems_allowed =
221 						cpuset_current_mems_allowed;
222 	}
223 
224 	if (nodes)
225 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
226 	else
227 		ret = mpol_ops[pol->mode].create(pol, NULL);
228 	return ret;
229 }
230 
231 /*
232  * This function just creates a new policy, does some checks and simple
233  * initialization. You must invoke mpol_set_nodemask() to set nodes.
234  */
235 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
236 				  nodemask_t *nodes)
237 {
238 	struct mempolicy *policy;
239 
240 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
241 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
242 
243 	if (mode == MPOL_DEFAULT) {
244 		if (nodes && !nodes_empty(*nodes))
245 			return ERR_PTR(-EINVAL);
246 		return NULL;	/* simply delete any existing policy */
247 	}
248 	VM_BUG_ON(!nodes);
249 
250 	/*
251 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
252 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
253 	 * All other modes require a valid pointer to a non-empty nodemask.
254 	 */
255 	if (mode == MPOL_PREFERRED) {
256 		if (nodes_empty(*nodes)) {
257 			if (((flags & MPOL_F_STATIC_NODES) ||
258 			     (flags & MPOL_F_RELATIVE_NODES)))
259 				return ERR_PTR(-EINVAL);
260 		}
261 	} else if (nodes_empty(*nodes))
262 		return ERR_PTR(-EINVAL);
263 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
264 	if (!policy)
265 		return ERR_PTR(-ENOMEM);
266 	atomic_set(&policy->refcnt, 1);
267 	policy->mode = mode;
268 	policy->flags = flags;
269 
270 	return policy;
271 }
272 
273 /* Slow path of a mpol destructor. */
274 void __mpol_put(struct mempolicy *p)
275 {
276 	if (!atomic_dec_and_test(&p->refcnt))
277 		return;
278 	kmem_cache_free(policy_cache, p);
279 }
280 
281 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
282 {
283 }
284 
285 static void mpol_rebind_nodemask(struct mempolicy *pol,
286 				 const nodemask_t *nodes)
287 {
288 	nodemask_t tmp;
289 
290 	if (pol->flags & MPOL_F_STATIC_NODES)
291 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
292 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
293 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
294 	else {
295 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
296 			    *nodes);
297 		pol->w.cpuset_mems_allowed = *nodes;
298 	}
299 
300 	pol->v.nodes = tmp;
301 	if (!node_isset(current->il_next, tmp)) {
302 		current->il_next = next_node(current->il_next, tmp);
303 		if (current->il_next >= MAX_NUMNODES)
304 			current->il_next = first_node(tmp);
305 		if (current->il_next >= MAX_NUMNODES)
306 			current->il_next = numa_node_id();
307 	}
308 }
309 
310 static void mpol_rebind_preferred(struct mempolicy *pol,
311 				  const nodemask_t *nodes)
312 {
313 	nodemask_t tmp;
314 
315 	if (pol->flags & MPOL_F_STATIC_NODES) {
316 		int node = first_node(pol->w.user_nodemask);
317 
318 		if (node_isset(node, *nodes)) {
319 			pol->v.preferred_node = node;
320 			pol->flags &= ~MPOL_F_LOCAL;
321 		} else
322 			pol->flags |= MPOL_F_LOCAL;
323 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
324 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
325 		pol->v.preferred_node = first_node(tmp);
326 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
327 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
328 						   pol->w.cpuset_mems_allowed,
329 						   *nodes);
330 		pol->w.cpuset_mems_allowed = *nodes;
331 	}
332 }
333 
334 /* Migrate a policy to a different set of nodes */
335 static void mpol_rebind_policy(struct mempolicy *pol,
336 			       const nodemask_t *newmask)
337 {
338 	if (!pol)
339 		return;
340 	if (!mpol_store_user_nodemask(pol) &&
341 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
342 		return;
343 	mpol_ops[pol->mode].rebind(pol, newmask);
344 }
345 
346 /*
347  * Wrapper for mpol_rebind_policy() that just requires task
348  * pointer, and updates task mempolicy.
349  *
350  * Called with task's alloc_lock held.
351  */
352 
353 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
354 {
355 	mpol_rebind_policy(tsk->mempolicy, new);
356 }
357 
358 /*
359  * Rebind each vma in mm to new nodemask.
360  *
361  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
362  */
363 
364 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
365 {
366 	struct vm_area_struct *vma;
367 
368 	down_write(&mm->mmap_sem);
369 	for (vma = mm->mmap; vma; vma = vma->vm_next)
370 		mpol_rebind_policy(vma->vm_policy, new);
371 	up_write(&mm->mmap_sem);
372 }
373 
374 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
375 	[MPOL_DEFAULT] = {
376 		.rebind = mpol_rebind_default,
377 	},
378 	[MPOL_INTERLEAVE] = {
379 		.create = mpol_new_interleave,
380 		.rebind = mpol_rebind_nodemask,
381 	},
382 	[MPOL_PREFERRED] = {
383 		.create = mpol_new_preferred,
384 		.rebind = mpol_rebind_preferred,
385 	},
386 	[MPOL_BIND] = {
387 		.create = mpol_new_bind,
388 		.rebind = mpol_rebind_nodemask,
389 	},
390 };
391 
392 static void gather_stats(struct page *, void *, int pte_dirty);
393 static void migrate_page_add(struct page *page, struct list_head *pagelist,
394 				unsigned long flags);
395 
396 /* Scan through pages checking if pages follow certain conditions. */
397 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
398 		unsigned long addr, unsigned long end,
399 		const nodemask_t *nodes, unsigned long flags,
400 		void *private)
401 {
402 	pte_t *orig_pte;
403 	pte_t *pte;
404 	spinlock_t *ptl;
405 
406 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
407 	do {
408 		struct page *page;
409 		int nid;
410 
411 		if (!pte_present(*pte))
412 			continue;
413 		page = vm_normal_page(vma, addr, *pte);
414 		if (!page)
415 			continue;
416 		/*
417 		 * vm_normal_page() filters out zero pages, but there might
418 		 * still be PageReserved pages to skip, perhaps in a VDSO.
419 		 * And we cannot move PageKsm pages sensibly or safely yet.
420 		 */
421 		if (PageReserved(page) || PageKsm(page))
422 			continue;
423 		nid = page_to_nid(page);
424 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
425 			continue;
426 
427 		if (flags & MPOL_MF_STATS)
428 			gather_stats(page, private, pte_dirty(*pte));
429 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
430 			migrate_page_add(page, private, flags);
431 		else
432 			break;
433 	} while (pte++, addr += PAGE_SIZE, addr != end);
434 	pte_unmap_unlock(orig_pte, ptl);
435 	return addr != end;
436 }
437 
438 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
439 		unsigned long addr, unsigned long end,
440 		const nodemask_t *nodes, unsigned long flags,
441 		void *private)
442 {
443 	pmd_t *pmd;
444 	unsigned long next;
445 
446 	pmd = pmd_offset(pud, addr);
447 	do {
448 		next = pmd_addr_end(addr, end);
449 		if (pmd_none_or_clear_bad(pmd))
450 			continue;
451 		if (check_pte_range(vma, pmd, addr, next, nodes,
452 				    flags, private))
453 			return -EIO;
454 	} while (pmd++, addr = next, addr != end);
455 	return 0;
456 }
457 
458 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
459 		unsigned long addr, unsigned long end,
460 		const nodemask_t *nodes, unsigned long flags,
461 		void *private)
462 {
463 	pud_t *pud;
464 	unsigned long next;
465 
466 	pud = pud_offset(pgd, addr);
467 	do {
468 		next = pud_addr_end(addr, end);
469 		if (pud_none_or_clear_bad(pud))
470 			continue;
471 		if (check_pmd_range(vma, pud, addr, next, nodes,
472 				    flags, private))
473 			return -EIO;
474 	} while (pud++, addr = next, addr != end);
475 	return 0;
476 }
477 
478 static inline int check_pgd_range(struct vm_area_struct *vma,
479 		unsigned long addr, unsigned long end,
480 		const nodemask_t *nodes, unsigned long flags,
481 		void *private)
482 {
483 	pgd_t *pgd;
484 	unsigned long next;
485 
486 	pgd = pgd_offset(vma->vm_mm, addr);
487 	do {
488 		next = pgd_addr_end(addr, end);
489 		if (pgd_none_or_clear_bad(pgd))
490 			continue;
491 		if (check_pud_range(vma, pgd, addr, next, nodes,
492 				    flags, private))
493 			return -EIO;
494 	} while (pgd++, addr = next, addr != end);
495 	return 0;
496 }
497 
498 /*
499  * Check if all pages in a range are on a set of nodes.
500  * If pagelist != NULL then isolate pages from the LRU and
501  * put them on the pagelist.
502  */
503 static struct vm_area_struct *
504 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
505 		const nodemask_t *nodes, unsigned long flags, void *private)
506 {
507 	int err;
508 	struct vm_area_struct *first, *vma, *prev;
509 
510 
511 	first = find_vma(mm, start);
512 	if (!first)
513 		return ERR_PTR(-EFAULT);
514 	prev = NULL;
515 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
516 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
517 			if (!vma->vm_next && vma->vm_end < end)
518 				return ERR_PTR(-EFAULT);
519 			if (prev && prev->vm_end < vma->vm_start)
520 				return ERR_PTR(-EFAULT);
521 		}
522 		if (!is_vm_hugetlb_page(vma) &&
523 		    ((flags & MPOL_MF_STRICT) ||
524 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
525 				vma_migratable(vma)))) {
526 			unsigned long endvma = vma->vm_end;
527 
528 			if (endvma > end)
529 				endvma = end;
530 			if (vma->vm_start > start)
531 				start = vma->vm_start;
532 			err = check_pgd_range(vma, start, endvma, nodes,
533 						flags, private);
534 			if (err) {
535 				first = ERR_PTR(err);
536 				break;
537 			}
538 		}
539 		prev = vma;
540 	}
541 	return first;
542 }
543 
544 /* Apply policy to a single VMA */
545 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
546 {
547 	int err = 0;
548 	struct mempolicy *old = vma->vm_policy;
549 
550 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
551 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
552 		 vma->vm_ops, vma->vm_file,
553 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
554 
555 	if (vma->vm_ops && vma->vm_ops->set_policy)
556 		err = vma->vm_ops->set_policy(vma, new);
557 	if (!err) {
558 		mpol_get(new);
559 		vma->vm_policy = new;
560 		mpol_put(old);
561 	}
562 	return err;
563 }
564 
565 /* Step 2: apply policy to a range and do splits. */
566 static int mbind_range(struct mm_struct *mm, unsigned long start,
567 		       unsigned long end, struct mempolicy *new_pol)
568 {
569 	struct vm_area_struct *next;
570 	struct vm_area_struct *prev;
571 	struct vm_area_struct *vma;
572 	int err = 0;
573 	pgoff_t pgoff;
574 	unsigned long vmstart;
575 	unsigned long vmend;
576 
577 	vma = find_vma_prev(mm, start, &prev);
578 	if (!vma || vma->vm_start > start)
579 		return -EFAULT;
580 
581 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
582 		next = vma->vm_next;
583 		vmstart = max(start, vma->vm_start);
584 		vmend   = min(end, vma->vm_end);
585 
586 		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
587 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
588 				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
589 		if (prev) {
590 			vma = prev;
591 			next = vma->vm_next;
592 			continue;
593 		}
594 		if (vma->vm_start != vmstart) {
595 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
596 			if (err)
597 				goto out;
598 		}
599 		if (vma->vm_end != vmend) {
600 			err = split_vma(vma->vm_mm, vma, vmend, 0);
601 			if (err)
602 				goto out;
603 		}
604 		err = policy_vma(vma, new_pol);
605 		if (err)
606 			goto out;
607 	}
608 
609  out:
610 	return err;
611 }
612 
613 /*
614  * Update task->flags PF_MEMPOLICY bit: set iff non-default
615  * mempolicy.  Allows more rapid checking of this (combined perhaps
616  * with other PF_* flag bits) on memory allocation hot code paths.
617  *
618  * If called from outside this file, the task 'p' should -only- be
619  * a newly forked child not yet visible on the task list, because
620  * manipulating the task flags of a visible task is not safe.
621  *
622  * The above limitation is why this routine has the funny name
623  * mpol_fix_fork_child_flag().
624  *
625  * It is also safe to call this with a task pointer of current,
626  * which the static wrapper mpol_set_task_struct_flag() does,
627  * for use within this file.
628  */
629 
630 void mpol_fix_fork_child_flag(struct task_struct *p)
631 {
632 	if (p->mempolicy)
633 		p->flags |= PF_MEMPOLICY;
634 	else
635 		p->flags &= ~PF_MEMPOLICY;
636 }
637 
638 static void mpol_set_task_struct_flag(void)
639 {
640 	mpol_fix_fork_child_flag(current);
641 }
642 
643 /* Set the process memory policy */
644 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
645 			     nodemask_t *nodes)
646 {
647 	struct mempolicy *new, *old;
648 	struct mm_struct *mm = current->mm;
649 	NODEMASK_SCRATCH(scratch);
650 	int ret;
651 
652 	if (!scratch)
653 		return -ENOMEM;
654 
655 	new = mpol_new(mode, flags, nodes);
656 	if (IS_ERR(new)) {
657 		ret = PTR_ERR(new);
658 		goto out;
659 	}
660 	/*
661 	 * prevent changing our mempolicy while show_numa_maps()
662 	 * is using it.
663 	 * Note:  do_set_mempolicy() can be called at init time
664 	 * with no 'mm'.
665 	 */
666 	if (mm)
667 		down_write(&mm->mmap_sem);
668 	task_lock(current);
669 	ret = mpol_set_nodemask(new, nodes, scratch);
670 	if (ret) {
671 		task_unlock(current);
672 		if (mm)
673 			up_write(&mm->mmap_sem);
674 		mpol_put(new);
675 		goto out;
676 	}
677 	old = current->mempolicy;
678 	current->mempolicy = new;
679 	mpol_set_task_struct_flag();
680 	if (new && new->mode == MPOL_INTERLEAVE &&
681 	    nodes_weight(new->v.nodes))
682 		current->il_next = first_node(new->v.nodes);
683 	task_unlock(current);
684 	if (mm)
685 		up_write(&mm->mmap_sem);
686 
687 	mpol_put(old);
688 	ret = 0;
689 out:
690 	NODEMASK_SCRATCH_FREE(scratch);
691 	return ret;
692 }
693 
694 /*
695  * Return nodemask for policy for get_mempolicy() query
696  *
697  * Called with task's alloc_lock held
698  */
699 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
700 {
701 	nodes_clear(*nodes);
702 	if (p == &default_policy)
703 		return;
704 
705 	switch (p->mode) {
706 	case MPOL_BIND:
707 		/* Fall through */
708 	case MPOL_INTERLEAVE:
709 		*nodes = p->v.nodes;
710 		break;
711 	case MPOL_PREFERRED:
712 		if (!(p->flags & MPOL_F_LOCAL))
713 			node_set(p->v.preferred_node, *nodes);
714 		/* else return empty node mask for local allocation */
715 		break;
716 	default:
717 		BUG();
718 	}
719 }
720 
721 static int lookup_node(struct mm_struct *mm, unsigned long addr)
722 {
723 	struct page *p;
724 	int err;
725 
726 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
727 	if (err >= 0) {
728 		err = page_to_nid(p);
729 		put_page(p);
730 	}
731 	return err;
732 }
733 
734 /* Retrieve NUMA policy */
735 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
736 			     unsigned long addr, unsigned long flags)
737 {
738 	int err;
739 	struct mm_struct *mm = current->mm;
740 	struct vm_area_struct *vma = NULL;
741 	struct mempolicy *pol = current->mempolicy;
742 
743 	if (flags &
744 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
745 		return -EINVAL;
746 
747 	if (flags & MPOL_F_MEMS_ALLOWED) {
748 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
749 			return -EINVAL;
750 		*policy = 0;	/* just so it's initialized */
751 		task_lock(current);
752 		*nmask  = cpuset_current_mems_allowed;
753 		task_unlock(current);
754 		return 0;
755 	}
756 
757 	if (flags & MPOL_F_ADDR) {
758 		/*
759 		 * Do NOT fall back to task policy if the
760 		 * vma/shared policy at addr is NULL.  We
761 		 * want to return MPOL_DEFAULT in this case.
762 		 */
763 		down_read(&mm->mmap_sem);
764 		vma = find_vma_intersection(mm, addr, addr+1);
765 		if (!vma) {
766 			up_read(&mm->mmap_sem);
767 			return -EFAULT;
768 		}
769 		if (vma->vm_ops && vma->vm_ops->get_policy)
770 			pol = vma->vm_ops->get_policy(vma, addr);
771 		else
772 			pol = vma->vm_policy;
773 	} else if (addr)
774 		return -EINVAL;
775 
776 	if (!pol)
777 		pol = &default_policy;	/* indicates default behavior */
778 
779 	if (flags & MPOL_F_NODE) {
780 		if (flags & MPOL_F_ADDR) {
781 			err = lookup_node(mm, addr);
782 			if (err < 0)
783 				goto out;
784 			*policy = err;
785 		} else if (pol == current->mempolicy &&
786 				pol->mode == MPOL_INTERLEAVE) {
787 			*policy = current->il_next;
788 		} else {
789 			err = -EINVAL;
790 			goto out;
791 		}
792 	} else {
793 		*policy = pol == &default_policy ? MPOL_DEFAULT :
794 						pol->mode;
795 		/*
796 		 * Internal mempolicy flags must be masked off before exposing
797 		 * the policy to userspace.
798 		 */
799 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
800 	}
801 
802 	if (vma) {
803 		up_read(&current->mm->mmap_sem);
804 		vma = NULL;
805 	}
806 
807 	err = 0;
808 	if (nmask) {
809 		task_lock(current);
810 		get_policy_nodemask(pol, nmask);
811 		task_unlock(current);
812 	}
813 
814  out:
815 	mpol_cond_put(pol);
816 	if (vma)
817 		up_read(&current->mm->mmap_sem);
818 	return err;
819 }
820 
821 #ifdef CONFIG_MIGRATION
822 /*
823  * page migration
824  */
825 static void migrate_page_add(struct page *page, struct list_head *pagelist,
826 				unsigned long flags)
827 {
828 	/*
829 	 * Avoid migrating a page that is shared with others.
830 	 */
831 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
832 		if (!isolate_lru_page(page)) {
833 			list_add_tail(&page->lru, pagelist);
834 			inc_zone_page_state(page, NR_ISOLATED_ANON +
835 					    page_is_file_cache(page));
836 		}
837 	}
838 }
839 
840 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
841 {
842 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
843 }
844 
845 /*
846  * Migrate pages from one node to a target node.
847  * Returns error or the number of pages not migrated.
848  */
849 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
850 			   int flags)
851 {
852 	nodemask_t nmask;
853 	LIST_HEAD(pagelist);
854 	int err = 0;
855 
856 	nodes_clear(nmask);
857 	node_set(source, nmask);
858 
859 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
860 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
861 
862 	if (!list_empty(&pagelist))
863 		err = migrate_pages(&pagelist, new_node_page, dest, 0);
864 
865 	return err;
866 }
867 
868 /*
869  * Move pages between the two nodesets so as to preserve the physical
870  * layout as much as possible.
871  *
872  * Returns the number of pages that could not be moved.
873  */
874 int do_migrate_pages(struct mm_struct *mm,
875 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
876 {
877 	int busy = 0;
878 	int err;
879 	nodemask_t tmp;
880 
881 	err = migrate_prep();
882 	if (err)
883 		return err;
884 
885 	down_read(&mm->mmap_sem);
886 
887 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
888 	if (err)
889 		goto out;
890 
891 	/*
892 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
893 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
894 	 * bit in 'tmp', and return that <source, dest> pair for migration.
895 	 * The pair of nodemasks 'to' and 'from' define the map.
896 	 *
897 	 * If no pair of bits is found that way, fallback to picking some
898 	 * pair of 'source' and 'dest' bits that are not the same.  If the
899 	 * 'source' and 'dest' bits are the same, this represents a node
900 	 * that will be migrating to itself, so no pages need move.
901 	 *
902 	 * If no bits are left in 'tmp', or if all remaining bits left
903 	 * in 'tmp' correspond to the same bit in 'to', return false
904 	 * (nothing left to migrate).
905 	 *
906 	 * This lets us pick a pair of nodes to migrate between, such that
907 	 * if possible the dest node is not already occupied by some other
908 	 * source node, minimizing the risk of overloading the memory on a
909 	 * node that would happen if we migrated incoming memory to a node
910  * before migrating outgoing memory from that same node.
911 	 *
912 	 * A single scan of tmp is sufficient.  As we go, we remember the
913 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
914 	 * that not only moved, but what's better, moved to an empty slot
915 	 * (d is not set in tmp), then we break out then, with that pair.
916  * Otherwise when we finish scanning tmp, we at least have the
917 	 * most recent <s, d> pair that moved.  If we get all the way through
918 	 * the scan of tmp without finding any node that moved, much less
919 	 * moved to an empty node, then there is nothing left worth migrating.
920 	 */
921 
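	/*
	 * Worked example (editorial): with from = {0, 1} and to = {1, 2},
	 * the first scan of tmp = {0, 1} sees s = 0 -> d = 1 (dest still in
	 * tmp, keep looking) and then s = 1 -> d = 2 (dest not in tmp), so
	 * node 1 is drained into node 2 first.  The second pass then moves
	 * node 0 into the now-emptier node 1, matching the "empty slot
	 * first" preference described above.
	 */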
922 	tmp = *from_nodes;
923 	while (!nodes_empty(tmp)) {
924 		int s,d;
925 		int source = -1;
926 		int dest = 0;
927 
928 		for_each_node_mask(s, tmp) {
929 			d = node_remap(s, *from_nodes, *to_nodes);
930 			if (s == d)
931 				continue;
932 
933 			source = s;	/* Node moved. Memorize */
934 			dest = d;
935 
936 			/* dest not in remaining from nodes? */
937 			if (!node_isset(dest, tmp))
938 				break;
939 		}
940 		if (source == -1)
941 			break;
942 
943 		node_clear(source, tmp);
944 		err = migrate_to_node(mm, source, dest, flags);
945 		if (err > 0)
946 			busy += err;
947 		if (err < 0)
948 			break;
949 	}
950 out:
951 	up_read(&mm->mmap_sem);
952 	if (err < 0)
953 		return err;
954 	return busy;
955 
956 }
957 
958 /*
959  * Allocate a new page for page migration based on vma policy.
960  * Start assuming that page is mapped by vma pointed to by @private.
961  * Search forward from there, if not.  N.B., this assumes that the
962  * list of pages handed to migrate_pages()--which is how we get here--
963  * is in virtual address order.
964  */
965 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
966 {
967 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
968 	unsigned long uninitialized_var(address);
969 
970 	while (vma) {
971 		address = page_address_in_vma(page, vma);
972 		if (address != -EFAULT)
973 			break;
974 		vma = vma->vm_next;
975 	}
976 
977 	/*
978 	 * if !vma, alloc_page_vma() will use task or system default policy
979 	 */
980 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
981 }
982 #else
983 
984 static void migrate_page_add(struct page *page, struct list_head *pagelist,
985 				unsigned long flags)
986 {
987 }
988 
989 int do_migrate_pages(struct mm_struct *mm,
990 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
991 {
992 	return -ENOSYS;
993 }
994 
995 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
996 {
997 	return NULL;
998 }
999 #endif
1000 
1001 static long do_mbind(unsigned long start, unsigned long len,
1002 		     unsigned short mode, unsigned short mode_flags,
1003 		     nodemask_t *nmask, unsigned long flags)
1004 {
1005 	struct vm_area_struct *vma;
1006 	struct mm_struct *mm = current->mm;
1007 	struct mempolicy *new;
1008 	unsigned long end;
1009 	int err;
1010 	LIST_HEAD(pagelist);
1011 
1012 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1013 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1014 		return -EINVAL;
1015 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1016 		return -EPERM;
1017 
1018 	if (start & ~PAGE_MASK)
1019 		return -EINVAL;
1020 
1021 	if (mode == MPOL_DEFAULT)
1022 		flags &= ~MPOL_MF_STRICT;
1023 
1024 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1025 	end = start + len;
1026 
1027 	if (end < start)
1028 		return -EINVAL;
1029 	if (end == start)
1030 		return 0;
1031 
1032 	new = mpol_new(mode, mode_flags, nmask);
1033 	if (IS_ERR(new))
1034 		return PTR_ERR(new);
1035 
1036 	/*
1037 	 * If we are using the default policy then operation
1038 	 * on discontinuous address spaces is okay after all
1039 	 */
1040 	if (!new)
1041 		flags |= MPOL_MF_DISCONTIG_OK;
1042 
1043 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1044 		 start, start + len, mode, mode_flags,
1045 		 nmask ? nodes_addr(*nmask)[0] : -1);
1046 
1047 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1048 
1049 		err = migrate_prep();
1050 		if (err)
1051 			goto mpol_out;
1052 	}
1053 	{
1054 		NODEMASK_SCRATCH(scratch);
1055 		if (scratch) {
1056 			down_write(&mm->mmap_sem);
1057 			task_lock(current);
1058 			err = mpol_set_nodemask(new, nmask, scratch);
1059 			task_unlock(current);
1060 			if (err)
1061 				up_write(&mm->mmap_sem);
1062 		} else
1063 			err = -ENOMEM;
1064 		NODEMASK_SCRATCH_FREE(scratch);
1065 	}
1066 	if (err)
1067 		goto mpol_out;
1068 
1069 	vma = check_range(mm, start, end, nmask,
1070 			  flags | MPOL_MF_INVERT, &pagelist);
1071 
1072 	err = PTR_ERR(vma);
1073 	if (!IS_ERR(vma)) {
1074 		int nr_failed = 0;
1075 
1076 		err = mbind_range(mm, start, end, new);
1077 
1078 		if (!list_empty(&pagelist))
1079 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1080 						(unsigned long)vma, 0);
1081 
1082 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1083 			err = -EIO;
1084 	} else
1085 		putback_lru_pages(&pagelist);
1086 
1087 	up_write(&mm->mmap_sem);
1088  mpol_out:
1089 	mpol_put(new);
1090 	return err;
1091 }
1092 
1093 /*
1094  * User space interface with variable sized bitmaps for nodelists.
1095  */
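/*
 * Maxnode arithmetic of get_nodes() below, worked through for a 64-bit
 * kernel (editorial note): a caller passing maxnode = 65 with a single
 * unsigned long ends up, after the --maxnode, with nlongs = 1 and
 * endmask = ~0UL, so node bits 0-63 are all accepted.  Passing maxnode = 64
 * instead yields endmask = (1UL << 63) - 1 and silently masks off bit 63,
 * which is why the userspace sketch near the top of this file passes the
 * mask size in bits plus one.
 */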
1096 
1097 /* Copy a node mask from user space. */
1098 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1099 		     unsigned long maxnode)
1100 {
1101 	unsigned long k;
1102 	unsigned long nlongs;
1103 	unsigned long endmask;
1104 
1105 	--maxnode;
1106 	nodes_clear(*nodes);
1107 	if (maxnode == 0 || !nmask)
1108 		return 0;
1109 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1110 		return -EINVAL;
1111 
1112 	nlongs = BITS_TO_LONGS(maxnode);
1113 	if ((maxnode % BITS_PER_LONG) == 0)
1114 		endmask = ~0UL;
1115 	else
1116 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1117 
1118 	/* When the user specified more nodes than supported just check
1119 	   if the unsupported part is all zero. */
1120 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1121 		if (nlongs > PAGE_SIZE/sizeof(long))
1122 			return -EINVAL;
1123 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1124 			unsigned long t;
1125 			if (get_user(t, nmask + k))
1126 				return -EFAULT;
1127 			if (k == nlongs - 1) {
1128 				if (t & endmask)
1129 					return -EINVAL;
1130 			} else if (t)
1131 				return -EINVAL;
1132 		}
1133 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1134 		endmask = ~0UL;
1135 	}
1136 
1137 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1138 		return -EFAULT;
1139 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1140 	return 0;
1141 }
1142 
1143 /* Copy a kernel node mask to user space */
1144 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1145 			      nodemask_t *nodes)
1146 {
1147 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1148 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1149 
1150 	if (copy > nbytes) {
1151 		if (copy > PAGE_SIZE)
1152 			return -EINVAL;
1153 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1154 			return -EFAULT;
1155 		copy = nbytes;
1156 	}
1157 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1158 }
1159 
1160 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1161 		unsigned long, mode, unsigned long __user *, nmask,
1162 		unsigned long, maxnode, unsigned, flags)
1163 {
1164 	nodemask_t nodes;
1165 	int err;
1166 	unsigned short mode_flags;
1167 
1168 	mode_flags = mode & MPOL_MODE_FLAGS;
1169 	mode &= ~MPOL_MODE_FLAGS;
1170 	if (mode >= MPOL_MAX)
1171 		return -EINVAL;
1172 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1173 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1174 		return -EINVAL;
1175 	err = get_nodes(&nodes, nmask, maxnode);
1176 	if (err)
1177 		return err;
1178 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1179 }
1180 
1181 /* Set the process memory policy */
1182 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1183 		unsigned long, maxnode)
1184 {
1185 	int err;
1186 	nodemask_t nodes;
1187 	unsigned short flags;
1188 
1189 	flags = mode & MPOL_MODE_FLAGS;
1190 	mode &= ~MPOL_MODE_FLAGS;
1191 	if ((unsigned int)mode >= MPOL_MAX)
1192 		return -EINVAL;
1193 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1194 		return -EINVAL;
1195 	err = get_nodes(&nodes, nmask, maxnode);
1196 	if (err)
1197 		return err;
1198 	return do_set_mempolicy(mode, flags, &nodes);
1199 }
1200 
1201 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1202 		const unsigned long __user *, old_nodes,
1203 		const unsigned long __user *, new_nodes)
1204 {
1205 	const struct cred *cred = current_cred(), *tcred;
1206 	struct mm_struct *mm;
1207 	struct task_struct *task;
1208 	nodemask_t old;
1209 	nodemask_t new;
1210 	nodemask_t task_nodes;
1211 	int err;
1212 
1213 	err = get_nodes(&old, old_nodes, maxnode);
1214 	if (err)
1215 		return err;
1216 
1217 	err = get_nodes(&new, new_nodes, maxnode);
1218 	if (err)
1219 		return err;
1220 
1221 	/* Find the mm_struct */
1222 	read_lock(&tasklist_lock);
1223 	task = pid ? find_task_by_vpid(pid) : current;
1224 	if (!task) {
1225 		read_unlock(&tasklist_lock);
1226 		return -ESRCH;
1227 	}
1228 	mm = get_task_mm(task);
1229 	read_unlock(&tasklist_lock);
1230 
1231 	if (!mm)
1232 		return -EINVAL;
1233 
1234 	/*
1235 	 * Check if this process has the right to modify the specified
1236 	 * process. The right exists if the process has administrative
1237 	 * capabilities, superuser privileges or the same
1238 	 * userid as the target process.
1239 	 */
1240 	rcu_read_lock();
1241 	tcred = __task_cred(task);
1242 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1243 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1244 	    !capable(CAP_SYS_NICE)) {
1245 		rcu_read_unlock();
1246 		err = -EPERM;
1247 		goto out;
1248 	}
1249 	rcu_read_unlock();
1250 
1251 	task_nodes = cpuset_mems_allowed(task);
1252 	/* Is the user allowed to access the target nodes? */
1253 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1254 		err = -EPERM;
1255 		goto out;
1256 	}
1257 
1258 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1259 		err = -EINVAL;
1260 		goto out;
1261 	}
1262 
1263 	err = security_task_movememory(task);
1264 	if (err)
1265 		goto out;
1266 
1267 	err = do_migrate_pages(mm, &old, &new,
1268 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1269 out:
1270 	mmput(mm);
1271 	return err;
1272 }
1273 
1274 
1275 /* Retrieve NUMA policy */
1276 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1277 		unsigned long __user *, nmask, unsigned long, maxnode,
1278 		unsigned long, addr, unsigned long, flags)
1279 {
1280 	int err;
1281 	int uninitialized_var(pval);
1282 	nodemask_t nodes;
1283 
1284 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1285 		return -EINVAL;
1286 
1287 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1288 
1289 	if (err)
1290 		return err;
1291 
1292 	if (policy && put_user(pval, policy))
1293 		return -EFAULT;
1294 
1295 	if (nmask)
1296 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1297 
1298 	return err;
1299 }
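/*
 * Illustrative userspace queries against the syscall above (editorial
 * sketch, assuming the <numaif.h> wrapper; NODE_LONGS is a placeholder for
 * enough unsigned longs to cover the kernel's MAX_NUMNODES):
 *
 *	int mode, node;
 *	unsigned long mask[NODE_LONGS];
 *
 *	// current task policy and its nodemask
 *	get_mempolicy(&mode, mask, NODE_LONGS * sizeof(long) * 8, NULL, 0);
 *
 *	// the node that currently backs the page at addr
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */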
1300 
1301 #ifdef CONFIG_COMPAT
1302 
1303 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1304 				     compat_ulong_t __user *nmask,
1305 				     compat_ulong_t maxnode,
1306 				     compat_ulong_t addr, compat_ulong_t flags)
1307 {
1308 	long err;
1309 	unsigned long __user *nm = NULL;
1310 	unsigned long nr_bits, alloc_size;
1311 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1312 
1313 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1314 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1315 
1316 	if (nmask)
1317 		nm = compat_alloc_user_space(alloc_size);
1318 
1319 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1320 
1321 	if (!err && nmask) {
1322 		err = copy_from_user(bm, nm, alloc_size);
1323 		/* ensure entire bitmap is zeroed */
1324 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1325 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1326 	}
1327 
1328 	return err;
1329 }
1330 
1331 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1332 				     compat_ulong_t maxnode)
1333 {
1334 	long err = 0;
1335 	unsigned long __user *nm = NULL;
1336 	unsigned long nr_bits, alloc_size;
1337 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1338 
1339 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1340 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1341 
1342 	if (nmask) {
1343 		err = compat_get_bitmap(bm, nmask, nr_bits);
1344 		nm = compat_alloc_user_space(alloc_size);
1345 		err |= copy_to_user(nm, bm, alloc_size);
1346 	}
1347 
1348 	if (err)
1349 		return -EFAULT;
1350 
1351 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1352 }
1353 
1354 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1355 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1356 			     compat_ulong_t maxnode, compat_ulong_t flags)
1357 {
1358 	long err = 0;
1359 	unsigned long __user *nm = NULL;
1360 	unsigned long nr_bits, alloc_size;
1361 	nodemask_t bm;
1362 
1363 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1364 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1365 
1366 	if (nmask) {
1367 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1368 		nm = compat_alloc_user_space(alloc_size);
1369 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1370 	}
1371 
1372 	if (err)
1373 		return -EFAULT;
1374 
1375 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1376 }
1377 
1378 #endif
1379 
1380 /*
1381  * get_vma_policy(@task, @vma, @addr)
1382  * @task - task for fallback if vma policy == default
1383  * @vma   - virtual memory area whose policy is sought
1384  * @addr  - address in @vma for shared policy lookup
1385  *
1386  * Returns effective policy for a VMA at specified address.
1387  * Falls back to @task or system default policy, as necessary.
1388  * Current or other task's task mempolicy and non-shared vma policies
1389  * are protected by the task's mmap_sem, which must be held for read by
1390  * the caller.
1391  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1392  * count--added by the get_policy() vm_op, as appropriate--to protect against
1393  * freeing by another task.  It is the caller's responsibility to free the
1394  * extra reference for shared policies.
1395  */
1396 static struct mempolicy *get_vma_policy(struct task_struct *task,
1397 		struct vm_area_struct *vma, unsigned long addr)
1398 {
1399 	struct mempolicy *pol = task->mempolicy;
1400 
1401 	if (vma) {
1402 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1403 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1404 									addr);
1405 			if (vpol)
1406 				pol = vpol;
1407 		} else if (vma->vm_policy)
1408 			pol = vma->vm_policy;
1409 	}
1410 	if (!pol)
1411 		pol = &default_policy;
1412 	return pol;
1413 }
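/*
 * Typical calling pattern for the lookup above (editorial sketch of the
 * reference discipline described in the comment; compare alloc_page_vma()
 * and do_get_mempolicy() in this file):
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... consult pol ...
 *	mpol_cond_put(pol);	// drops the ref only for shared policies
 */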
1414 
1415 /*
1416  * Return a nodemask representing a mempolicy for filtering nodes for
1417  * page allocation
1418  */
1419 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1420 {
1421 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1422 	if (unlikely(policy->mode == MPOL_BIND) &&
1423 			gfp_zone(gfp) >= policy_zone &&
1424 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1425 		return &policy->v.nodes;
1426 
1427 	return NULL;
1428 }
1429 
1430 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1431 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1432 {
1433 	int nd = numa_node_id();
1434 
1435 	switch (policy->mode) {
1436 	case MPOL_PREFERRED:
1437 		if (!(policy->flags & MPOL_F_LOCAL))
1438 			nd = policy->v.preferred_node;
1439 		break;
1440 	case MPOL_BIND:
1441 		/*
1442 		 * Normally, MPOL_BIND allocations are node-local within the
1443 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1444 		 * current node is part of the mask, we use the zonelist for
1445 		 * the first node in the mask instead.
1446 		 */
1447 		if (unlikely(gfp & __GFP_THISNODE) &&
1448 				unlikely(!node_isset(nd, policy->v.nodes)))
1449 			nd = first_node(policy->v.nodes);
1450 		break;
1451 	case MPOL_INTERLEAVE: /* should not happen */
1452 		break;
1453 	default:
1454 		BUG();
1455 	}
1456 	return node_zonelist(nd, gfp);
1457 }
1458 
1459 /* Do dynamic interleaving for a process */
1460 static unsigned interleave_nodes(struct mempolicy *policy)
1461 {
1462 	unsigned nid, next;
1463 	struct task_struct *me = current;
1464 
1465 	nid = me->il_next;
1466 	next = next_node(nid, policy->v.nodes);
1467 	if (next >= MAX_NUMNODES)
1468 		next = first_node(policy->v.nodes);
1469 	if (next < MAX_NUMNODES)
1470 		me->il_next = next;
1471 	return nid;
1472 }
1473 
1474 /*
1475  * Depending on the memory policy provide a node from which to allocate the
1476  * next slab entry.
1477  * @policy must be protected from freeing by the caller.  If @policy is
1478  * the current task's mempolicy, this protection is implicit, as only the
1479  * task can change its policy.  The system default policy requires no
1480  * such protection.
1481  */
1482 unsigned slab_node(struct mempolicy *policy)
1483 {
1484 	if (!policy || policy->flags & MPOL_F_LOCAL)
1485 		return numa_node_id();
1486 
1487 	switch (policy->mode) {
1488 	case MPOL_PREFERRED:
1489 		/*
1490 		 * handled MPOL_F_LOCAL above
1491 		 */
1492 		return policy->v.preferred_node;
1493 
1494 	case MPOL_INTERLEAVE:
1495 		return interleave_nodes(policy);
1496 
1497 	case MPOL_BIND: {
1498 		/*
1499 		 * Follow bind policy behavior and start allocation at the
1500 		 * first node.
1501 		 */
1502 		struct zonelist *zonelist;
1503 		struct zone *zone;
1504 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1505 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1506 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1507 							&policy->v.nodes,
1508 							&zone);
1509 		return zone->node;
1510 	}
1511 
1512 	default:
1513 		BUG();
1514 	}
1515 }
1516 
1517 /* Do static interleaving for a VMA with known offset. */
1518 static unsigned offset_il_node(struct mempolicy *pol,
1519 		struct vm_area_struct *vma, unsigned long off)
1520 {
1521 	unsigned nnodes = nodes_weight(pol->v.nodes);
1522 	unsigned target;
1523 	int c;
1524 	int nid = -1;
1525 
1526 	if (!nnodes)
1527 		return numa_node_id();
1528 	target = (unsigned int)off % nnodes;
1529 	c = 0;
1530 	do {
1531 		nid = next_node(nid, pol->v.nodes);
1532 		c++;
1533 	} while (c <= target);
1534 	return nid;
1535 }
1536 
1537 /* Determine a node number for interleave */
1538 static inline unsigned interleave_nid(struct mempolicy *pol,
1539 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1540 {
1541 	if (vma) {
1542 		unsigned long off;
1543 
1544 		/*
1545 		 * for small pages, there is no difference between
1546 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1547 		 * for huge pages, since vm_pgoff is in units of small
1548 		 * pages, we need to shift off the always 0 bits to get
1549 		 * a useful offset.
1550 		 */
1551 		BUG_ON(shift < PAGE_SHIFT);
1552 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1553 		off += (addr - vma->vm_start) >> shift;
1554 		return offset_il_node(pol, vma, off);
1555 	} else
1556 		return interleave_nodes(pol);
1557 }
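/*
 * Worked example for the huge page case above (editorial note, assuming
 * 4KB base pages, PAGE_SHIFT == 12, and 2MB huge pages, shift == 21):
 * vm_pgoff is kept in 4KB units, so vm_pgoff >> 9 converts it to 2MB
 * units; adding (addr - vma->vm_start) >> 21 gives the huge page's linear
 * index, which offset_il_node() then reduces modulo the number of
 * interleave nodes.
 */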
1558 
1559 #ifdef CONFIG_HUGETLBFS
1560 /*
1561  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1562  * @vma = virtual memory area whose policy is sought
1563  * @addr = address in @vma for shared policy lookup and interleave policy
1564  * @gfp_flags = for requested zone
1565  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1566  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1567  *
1568  * Returns a zonelist suitable for a huge page allocation and a pointer
1569  * to the struct mempolicy for conditional unref after allocation.
1570  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1571  * @nodemask for filtering the zonelist.
1572  */
1573 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1574 				gfp_t gfp_flags, struct mempolicy **mpol,
1575 				nodemask_t **nodemask)
1576 {
1577 	struct zonelist *zl;
1578 
1579 	*mpol = get_vma_policy(current, vma, addr);
1580 	*nodemask = NULL;	/* assume !MPOL_BIND */
1581 
1582 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1583 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1584 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1585 	} else {
1586 		zl = policy_zonelist(gfp_flags, *mpol);
1587 		if ((*mpol)->mode == MPOL_BIND)
1588 			*nodemask = &(*mpol)->v.nodes;
1589 	}
1590 	return zl;
1591 }
1592 
1593 /*
1594  * init_nodemask_of_mempolicy
1595  *
1596  * If the current task's mempolicy is "default" [NULL], return 'false'
1597  * to indicate default policy.  Otherwise, extract the policy nodemask
1598  * for 'bind' or 'interleave' policy into the argument nodemask, or
1599  * initialize the argument nodemask to contain the single node for
1600  * 'preferred' or 'local' policy and return 'true' to indicate presence
1601  * of non-default mempolicy.
1602  *
1603  * We don't bother with reference counting the mempolicy [mpol_get/put]
1604  * because the current task is examining its own mempolicy and a task's
1605  * mempolicy is only ever changed by the task itself.
1606  *
1607  * N.B., it is the caller's responsibility to free a returned nodemask.
1608  */
1609 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1610 {
1611 	struct mempolicy *mempolicy;
1612 	int nid;
1613 
1614 	if (!(mask && current->mempolicy))
1615 		return false;
1616 
1617 	mempolicy = current->mempolicy;
1618 	switch (mempolicy->mode) {
1619 	case MPOL_PREFERRED:
1620 		if (mempolicy->flags & MPOL_F_LOCAL)
1621 			nid = numa_node_id();
1622 		else
1623 			nid = mempolicy->v.preferred_node;
1624 		init_nodemask_of_node(mask, nid);
1625 		break;
1626 
1627 	case MPOL_BIND:
1628 		/* Fall through */
1629 	case MPOL_INTERLEAVE:
1630 		*mask =  mempolicy->v.nodes;
1631 		break;
1632 
1633 	default:
1634 		BUG();
1635 	}
1636 
1637 	return true;
1638 }
1639 #endif
1640 
1641 /* Allocate a page in interleaved policy.
1642    Own path because it needs to do special accounting. */
1643 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1644 					unsigned nid)
1645 {
1646 	struct zonelist *zl;
1647 	struct page *page;
1648 
1649 	zl = node_zonelist(nid, gfp);
1650 	page = __alloc_pages(gfp, order, zl);
1651 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1652 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1653 	return page;
1654 }
1655 
1656 /**
1657  * 	alloc_page_vma	- Allocate a page for a VMA.
1658  *
1659  * 	@gfp:
1660  *      %GFP_USER    user allocation.
1661  *      %GFP_KERNEL  kernel allocations,
1662  *      %GFP_HIGHMEM highmem/user allocations,
1663  *      %GFP_FS      allocation should not call back into a file system.
1664  *      %GFP_ATOMIC  don't sleep.
1665  *
1666  * 	@vma:  Pointer to VMA or NULL if not available.
1667  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1668  *
1669  * 	This function allocates a page from the kernel page pool and applies
1670  *	a NUMA policy associated with the VMA or the current process.
1671  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1672  *	mm_struct of the VMA to prevent it from going away. Should be used for
1673  *	all allocations for pages that will be mapped into
1674  * 	user space. Returns NULL when no page can be allocated.
1675  *
1676  *	Should be called with the mmap_sem of the vma held.
1677  */
1678 struct page *
1679 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1680 {
1681 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1682 	struct zonelist *zl;
1683 
1684 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1685 		unsigned nid;
1686 
1687 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1688 		mpol_cond_put(pol);
1689 		return alloc_page_interleave(gfp, 0, nid);
1690 	}
1691 	zl = policy_zonelist(gfp, pol);
1692 	if (unlikely(mpol_needs_cond_ref(pol))) {
1693 		/*
1694 		 * slow path: ref counted shared policy
1695 		 */
1696 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1697 						zl, policy_nodemask(gfp, pol));
1698 		__mpol_put(pol);
1699 		return page;
1700 	}
1701 	/*
1702 	 * fast path:  default or task policy
1703 	 */
1704 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1705 }
1706 
1707 /**
1708  * 	alloc_pages_current - Allocate pages.
1709  *
1710  *	@gfp:
1711  *		%GFP_USER   user allocation,
1712  *      	%GFP_KERNEL kernel allocation,
1713  *      	%GFP_HIGHMEM highmem allocation,
1714  *      	%GFP_FS     don't call back into a file system.
1715  *      	%GFP_ATOMIC don't sleep.
1716  *	@order: Power of two of allocation size in pages. 0 is a single page.
1717  *
1718  *	Allocate a page from the kernel page pool and, when not in
1719  *	interrupt context, apply the current process' NUMA policy.
1720  *	Returns NULL when no page can be allocated.
1721  *
1722  *	Don't call cpuset_update_task_memory_state() unless
1723  *	1) it's ok to take cpuset_sem (can WAIT), and
1724  *	2) allocating for current task (not interrupt).
1725  */
1726 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1727 {
1728 	struct mempolicy *pol = current->mempolicy;
1729 
1730 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1731 		pol = &default_policy;
1732 
1733 	/*
1734 	 * No reference counting needed for current->mempolicy
1735 	 * nor system default_policy
1736 	 */
1737 	if (pol->mode == MPOL_INTERLEAVE)
1738 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1739 	return __alloc_pages_nodemask(gfp, order,
1740 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1741 }
1742 EXPORT_SYMBOL(alloc_pages_current);
1743 
1744 /*
1745  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1746  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1747  * with the mems_allowed returned by cpuset_mems_allowed().  This
1748  * keeps mempolicies cpuset relative after its cpuset moves.  See
1749  * further kernel/cpuset.c update_nodemask().
1750  */
1751 
1752 /* Slow path of a mempolicy duplicate */
1753 struct mempolicy *__mpol_dup(struct mempolicy *old)
1754 {
1755 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1756 
1757 	if (!new)
1758 		return ERR_PTR(-ENOMEM);
1759 	if (current_cpuset_is_being_rebound()) {
1760 		nodemask_t mems = cpuset_mems_allowed(current);
1761 		mpol_rebind_policy(old, &mems);
1762 	}
1763 	*new = *old;
1764 	atomic_set(&new->refcnt, 1);
1765 	return new;
1766 }
1767 
1768 /*
1769  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1770  * eliminate the MPOL_F_* flags that require conditional ref and
1771  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1772  * after return.  Use the returned value.
1773  *
1774  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1775  * policy lookup, even if the policy needs/has extra ref on lookup.
1776  * shmem_readahead needs this.
1777  */
1778 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1779 						struct mempolicy *frompol)
1780 {
1781 	if (!mpol_needs_cond_ref(frompol))
1782 		return frompol;
1783 
1784 	*tompol = *frompol;
1785 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1786 	__mpol_put(frompol);
1787 	return tompol;
1788 }
1789 
1790 static int mpol_match_intent(const struct mempolicy *a,
1791 			     const struct mempolicy *b)
1792 {
1793 	if (a->flags != b->flags)
1794 		return 0;
1795 	if (!mpol_store_user_nodemask(a))
1796 		return 1;
1797 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1798 }
1799 
1800 /* Slow path of a mempolicy comparison */
1801 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1802 {
1803 	if (!a || !b)
1804 		return 0;
1805 	if (a->mode != b->mode)
1806 		return 0;
1807 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1808 		return 0;
1809 	switch (a->mode) {
1810 	case MPOL_BIND:
1811 		/* Fall through */
1812 	case MPOL_INTERLEAVE:
1813 		return nodes_equal(a->v.nodes, b->v.nodes);
1814 	case MPOL_PREFERRED:
1815 		return a->v.preferred_node == b->v.preferred_node &&
1816 			a->flags == b->flags;
1817 	default:
1818 		BUG();
1819 		return 0;
1820 	}
1821 }
1822 
1823 /*
1824  * Shared memory backing store policy support.
1825  *
1826  * Remember policies even when nobody has shared memory mapped.
1827  * The policies are kept in Red-Black tree linked from the inode.
1828  * They are protected by the sp->lock spinlock, which should be held
1829  * for any accesses to the tree.
1830  */
1831 
1832 /* lookup first element intersecting start-end */
1833 /* Caller holds sp->lock */
1834 static struct sp_node *
1835 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1836 {
1837 	struct rb_node *n = sp->root.rb_node;
1838 
1839 	while (n) {
1840 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1841 
1842 		if (start >= p->end)
1843 			n = n->rb_right;
1844 		else if (end <= p->start)
1845 			n = n->rb_left;
1846 		else
1847 			break;
1848 	}
1849 	if (!n)
1850 		return NULL;
1851 	for (;;) {
1852 		struct sp_node *w = NULL;
1853 		struct rb_node *prev = rb_prev(n);
1854 		if (!prev)
1855 			break;
1856 		w = rb_entry(prev, struct sp_node, nd);
1857 		if (w->end <= start)
1858 			break;
1859 		n = prev;
1860 	}
1861 	return rb_entry(n, struct sp_node, nd);
1862 }
1863 
1864 /* Insert a new shared policy into the list. */
1865 /* Caller holds sp->lock */
1866 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1867 {
1868 	struct rb_node **p = &sp->root.rb_node;
1869 	struct rb_node *parent = NULL;
1870 	struct sp_node *nd;
1871 
1872 	while (*p) {
1873 		parent = *p;
1874 		nd = rb_entry(parent, struct sp_node, nd);
1875 		if (new->start < nd->start)
1876 			p = &(*p)->rb_left;
1877 		else if (new->end > nd->end)
1878 			p = &(*p)->rb_right;
1879 		else
1880 			BUG();
1881 	}
1882 	rb_link_node(&new->nd, parent, p);
1883 	rb_insert_color(&new->nd, &sp->root);
1884 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1885 		 new->policy ? new->policy->mode : 0);
1886 }
1887 
1888 /* Find shared policy intersecting idx */
1889 struct mempolicy *
1890 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1891 {
1892 	struct mempolicy *pol = NULL;
1893 	struct sp_node *sn;
1894 
1895 	if (!sp->root.rb_node)
1896 		return NULL;
1897 	spin_lock(&sp->lock);
1898 	sn = sp_lookup(sp, idx, idx+1);
1899 	if (sn) {
1900 		mpol_get(sn->policy);
1901 		pol = sn->policy;
1902 	}
1903 	spin_unlock(&sp->lock);
1904 	return pol;
1905 }
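
/*
 * Illustrative sketch, not part of mempolicy.c: a ->get_policy vma callback
 * in the shmem style, translating a faulting address into a file index
 * before consulting the shared tree.  A NULL return means "no shared policy
 * here; fall back to task/default policy".  The shared_policy is passed in
 * directly to keep the sketch self-contained.
 */
static struct mempolicy *example_get_policy(struct shared_policy *sp,
					    struct vm_area_struct *vma,
					    unsigned long addr)
{
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	return mpol_shared_policy_lookup(sp, idx);
}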
1906 
1907 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1908 {
1909 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1910 	rb_erase(&n->nd, &sp->root);
1911 	mpol_put(n->policy);
1912 	kmem_cache_free(sn_cache, n);
1913 }
1914 
1915 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1916 				struct mempolicy *pol)
1917 {
1918 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1919 
1920 	if (!n)
1921 		return NULL;
1922 	n->start = start;
1923 	n->end = end;
1924 	mpol_get(pol);
1925 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1926 	n->policy = pol;
1927 	return n;
1928 }
1929 
1930 /* Replace a policy range. */
1931 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1932 				 unsigned long end, struct sp_node *new)
1933 {
1934 	struct sp_node *n, *new2 = NULL;
1935 
1936 restart:
1937 	spin_lock(&sp->lock);
1938 	n = sp_lookup(sp, start, end);
1939 	/* Take care of old policies in the same range. */
1940 	while (n && n->start < end) {
1941 		struct rb_node *next = rb_next(&n->nd);
1942 		if (n->start >= start) {
1943 			if (n->end <= end)
1944 				sp_delete(sp, n);
1945 			else
1946 				n->start = end;
1947 		} else {
1948 			/* Old policy spanning whole new range. */
1949 			if (n->end > end) {
1950 				if (!new2) {
1951 					spin_unlock(&sp->lock);
1952 					new2 = sp_alloc(end, n->end, n->policy);
1953 					if (!new2)
1954 						return -ENOMEM;
1955 					goto restart;
1956 				}
1957 				n->end = start;
1958 				sp_insert(sp, new2);
1959 				new2 = NULL;
1960 				break;
1961 			} else
1962 				n->end = start;
1963 		}
1964 		if (!next)
1965 			break;
1966 		n = rb_entry(next, struct sp_node, nd);
1967 	}
1968 	if (new)
1969 		sp_insert(sp, new);
1970 	spin_unlock(&sp->lock);
1971 	if (new2) {
1972 		mpol_put(new2->policy);
1973 		kmem_cache_free(sn_cache, new2);
1974 	}
1975 	return 0;
1976 }
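
/*
 * Worked example of the splitting above (illustrative comment only):
 * with a single node covering pages [0, 100) under policy A, installing
 * policy B over [40, 60) proceeds as follows:
 *   - sp_lookup() finds the A node, which starts before and ends after the
 *     new range, so a second node new2 = [60, 100) with policy A is
 *     allocated (dropping sp->lock for GFP_KERNEL and restarting);
 *   - on the retry the old node is trimmed to [0, 40) and new2 is inserted;
 *   - finally the new [40, 60) B node is inserted, giving
 *     [0,40) A | [40,60) B | [60,100) A.
 */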
1977 
1978 /**
1979  * mpol_shared_policy_init - initialize shared policy for inode
1980  * @sp: pointer to inode shared policy
1981  * @mpol:  struct mempolicy to install
1982  *
1983  * Install non-NULL @mpol in inode's shared policy rb-tree.
1984  * On entry, the current task has a reference on a non-NULL @mpol.
1985  * That reference must be released on exit.
1986  * This is called during get_inode() calls, so we can use GFP_KERNEL.
1987  */
1988 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1989 {
1990 	int ret;
1991 
1992 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1993 	spin_lock_init(&sp->lock);
1994 
1995 	if (mpol) {
1996 		struct vm_area_struct pvma;
1997 		struct mempolicy *new;
1998 		NODEMASK_SCRATCH(scratch);
1999 
2000 		if (!scratch)
2001 			return;
2002 		/* contextualize the tmpfs mount point mempolicy */
2003 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2004 		if (IS_ERR(new)) {
2005 			mpol_put(mpol);	/* drop our ref on sb mpol */
2006 			NODEMASK_SCRATCH_FREE(scratch);
2007 			return;		/* no valid nodemask intersection */
2008 		}
2009 
2010 		task_lock(current);
2011 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2012 		task_unlock(current);
2013 		mpol_put(mpol);	/* drop our ref on sb mpol */
2014 		if (ret) {
2015 			NODEMASK_SCRATCH_FREE(scratch);
2016 			mpol_put(new);
2017 			return;
2018 		}
2019 
2020 		/* Create pseudo-vma that contains just the policy */
2021 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2022 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2023 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2024 		mpol_put(new);			/* drop initial ref */
2025 		NODEMASK_SCRATCH_FREE(scratch);
2026 	}
2027 }
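
/*
 * Illustrative sketch, not part of mempolicy.c: a filesystem that keeps a
 * per-mount mempolicy (as tmpfs does for its "mpol=" option) installs it on
 * every new inode roughly like this; the helper name and the way sb_mpol is
 * obtained are hypothetical.  mpol_shared_policy_init() consumes the
 * reference taken here.
 */
static void example_init_inode_policy(struct shared_policy *sp,
				      struct mempolicy *sb_mpol)
{
	if (sb_mpol)
		mpol_get(sb_mpol);	/* ref dropped inside the init call */
	mpol_shared_policy_init(sp, sb_mpol);
}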
2028 
2029 int mpol_set_shared_policy(struct shared_policy *info,
2030 			struct vm_area_struct *vma, struct mempolicy *npol)
2031 {
2032 	int err;
2033 	struct sp_node *new = NULL;
2034 	unsigned long sz = vma_pages(vma);
2035 
2036 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2037 		 vma->vm_pgoff,
2038 		 sz, npol ? npol->mode : -1,
2039 		 npol ? npol->flags : -1,
2040 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2041 
2042 	if (npol) {
2043 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2044 		if (!new)
2045 			return -ENOMEM;
2046 	}
2047 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2048 	if (err && new)
2049 		kmem_cache_free(sn_cache, new);
2050 	return err;
2051 }
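
/*
 * Illustrative sketch, not part of mempolicy.c: the ->set_policy vma
 * callback that mbind() invokes on a shared mapping simply forwards here,
 * as shmem does for its inodes.  Passing a NULL policy clears the range
 * back to "no shared policy".  The shared_policy argument is passed
 * directly to keep the sketch self-contained.
 */
static int example_set_policy(struct shared_policy *sp,
			      struct vm_area_struct *vma,
			      struct mempolicy *new)
{
	return mpol_set_shared_policy(sp, vma, new);
}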
2052 
2053 /* Free a backing policy store on inode delete. */
2054 void mpol_free_shared_policy(struct shared_policy *p)
2055 {
2056 	struct sp_node *n;
2057 	struct rb_node *next;
2058 
2059 	if (!p->root.rb_node)
2060 		return;
2061 	spin_lock(&p->lock);
2062 	next = rb_first(&p->root);
2063 	while (next) {
2064 		n = rb_entry(next, struct sp_node, nd);
2065 		next = rb_next(&n->nd);
2066 		rb_erase(&n->nd, &p->root);
2067 		mpol_put(n->policy);
2068 		kmem_cache_free(sn_cache, n);
2069 	}
2070 	spin_unlock(&p->lock);
2071 }
2072 
2073 /* assumes fs == KERNEL_DS */
2074 void __init numa_policy_init(void)
2075 {
2076 	nodemask_t interleave_nodes;
2077 	unsigned long largest = 0;
2078 	int nid, prefer = 0;
2079 
2080 	policy_cache = kmem_cache_create("numa_policy",
2081 					 sizeof(struct mempolicy),
2082 					 0, SLAB_PANIC, NULL);
2083 
2084 	sn_cache = kmem_cache_create("shared_policy_node",
2085 				     sizeof(struct sp_node),
2086 				     0, SLAB_PANIC, NULL);
2087 
2088 	/*
2089 	 * Set interleaving policy for system init. Interleaving is only
2090 	 * enabled across suitably sized nodes (default is >= 16MB), or
2091 	 * fall back to the largest node if they're all smaller.
2092 	 */
2093 	nodes_clear(interleave_nodes);
2094 	for_each_node_state(nid, N_HIGH_MEMORY) {
2095 		unsigned long total_pages = node_present_pages(nid);
2096 
2097 		/* Preserve the largest node */
2098 		if (largest < total_pages) {
2099 			largest = total_pages;
2100 			prefer = nid;
2101 		}
2102 
2103 		/* Interleave this node? */
2104 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2105 			node_set(nid, interleave_nodes);
2106 	}
2107 
2108 	/* All too small, use the largest */
2109 	if (unlikely(nodes_empty(interleave_nodes)))
2110 		node_set(prefer, interleave_nodes);
2111 
2112 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2113 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2114 }
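
/*
 * Worked example for the size check above (illustrative comment only):
 * with 4 KB pages (PAGE_SHIFT == 12), a node needs at least
 * 16 MB / 4 KB = 4096 present pages before it joins the boot-time
 * interleave mask, since 4096 << 12 == 16 << 20 == 16777216 bytes.
 * If every node is smaller than that, interleaving falls back to the
 * single largest node.
 */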
2115 
2116 /* Reset policy of current process to default */
2117 void numa_default_policy(void)
2118 {
2119 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2120 }
2121 
2122 /*
2123  * Parse and format mempolicy from/to strings
2124  */
2125 
2126 /*
2127  * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag
2128  * Used only for mpol_parse_str() and mpol_to_str()
2129  */
2130 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2131 static const char * const policy_types[] =
2132 	{ "default", "prefer", "bind", "interleave", "local" };
2133 
2134 
2135 #ifdef CONFIG_TMPFS
2136 /**
2137  * mpol_parse_str - parse string to mempolicy
2138  * @str:  string containing mempolicy to parse
2139  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2140  * @no_context:  flag whether to "contextualize" the mempolicy
2141  *
2142  * Format of input:
2143  *	<mode>[=<flags>][:<nodelist>]
2144  *
2145  * if @no_context is true, save the input nodemask in w.user_nodemask in
2146  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2147  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2148  * mount option.  Note that if 'static' or 'relative' mode flags were
2149  * specified, the input nodemask will already have been saved.  Saving
2150  * it again is redundant, but safe.
2151  *
2152  * On success, returns 0, else 1
2153  */
2154 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2155 {
2156 	struct mempolicy *new = NULL;
2157 	unsigned short uninitialized_var(mode);
2158 	unsigned short uninitialized_var(mode_flags);
2159 	nodemask_t nodes;
2160 	char *nodelist = strchr(str, ':');
2161 	char *flags = strchr(str, '=');
2162 	int i;
2163 	int err = 1;
2164 
2165 	if (nodelist) {
2166 		/* NUL-terminate mode or flags string */
2167 		*nodelist++ = '\0';
2168 		if (nodelist_parse(nodelist, nodes))
2169 			goto out;
2170 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2171 			goto out;
2172 	} else
2173 		nodes_clear(nodes);
2174 
2175 	if (flags)
2176 		*flags++ = '\0';	/* terminate mode string */
2177 
2178 	for (i = 0; i <= MPOL_LOCAL; i++) {
2179 		if (!strcmp(str, policy_types[i])) {
2180 			mode = i;
2181 			break;
2182 		}
2183 	}
2184 	if (i > MPOL_LOCAL)
2185 		goto out;
2186 
2187 	switch (mode) {
2188 	case MPOL_PREFERRED:
2189 		/*
2190 		 * Insist on a nodelist of one node only
2191 		 */
2192 		if (nodelist) {
2193 			char *rest = nodelist;
2194 			while (isdigit(*rest))
2195 				rest++;
2196 			if (!*rest)
2197 				err = 0;
2198 		}
2199 		break;
2200 	case MPOL_INTERLEAVE:
2201 		/*
2202 		 * Default to online nodes with memory if no nodelist
2203 		 */
2204 		if (!nodelist)
2205 			nodes = node_states[N_HIGH_MEMORY];
2206 		err = 0;
2207 		break;
2208 	case MPOL_LOCAL:
2209 		/*
2210 		 * Don't allow a nodelist;  mpol_new() checks flags
2211 		 */
2212 		if (nodelist)
2213 			goto out;
2214 		mode = MPOL_PREFERRED;
2215 		break;
2216 
2217 	/*
2218 	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2219 	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2220 	 */
2221 	}
2222 
2223 	mode_flags = 0;
2224 	if (flags) {
2225 		/*
2226 		 * Currently, we only support two mutually exclusive
2227 		 * mode flags.
2228 		 */
2229 		if (!strcmp(flags, "static"))
2230 			mode_flags |= MPOL_F_STATIC_NODES;
2231 		else if (!strcmp(flags, "relative"))
2232 			mode_flags |= MPOL_F_RELATIVE_NODES;
2233 		else
2234 			err = 1;
2235 	}
2236 
2237 	new = mpol_new(mode, mode_flags, &nodes);
2238 	if (IS_ERR(new))
2239 		err = 1;
2240 	else {
2241 		int ret;
2242 		NODEMASK_SCRATCH(scratch);
2243 		if (scratch) {
2244 			task_lock(current);
2245 			ret = mpol_set_nodemask(new, &nodes, scratch);
2246 			task_unlock(current);
2247 		} else
2248 			ret = -ENOMEM;
2249 		NODEMASK_SCRATCH_FREE(scratch);
2250 		if (ret) {
2251 			err = 1;
2252 			mpol_put(new);
2253 		} else if (no_context) {
2254 			/* save for contextualization */
2255 			new->w.user_nodemask = nodes;
2256 		}
2257 	}
2258 
2259 out:
2260 	/* Restore string for error message */
2261 	if (nodelist)
2262 		*--nodelist = ':';
2263 	if (flags)
2264 		*--flags = '=';
2265 	if (!err)
2266 		*mpol = new;
2267 	return err;
2268 }
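
/*
 * Illustrative examples of the <mode>[=<flags>][:<nodelist>] format
 * documented above (comment only, node numbers made up):
 *
 *	interleave:0-3		MPOL_INTERLEAVE over nodes 0-3
 *	prefer:1		MPOL_PREFERRED with preferred node 1
 *	bind=static:0,2		MPOL_BIND | MPOL_F_STATIC_NODES on nodes 0 and 2
 *	local			MPOL_PREFERRED with MPOL_F_LOCAL, no nodelist allowed
 *
 * These are the strings a tmpfs "mpol=" mount option hands in.
 */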
2269 #endif /* CONFIG_TMPFS */
2270 
2271 /**
2272  * mpol_to_str - format a mempolicy structure for printing
2273  * @buffer:  to contain formatted mempolicy string
2274  * @maxlen:  length of @buffer
2275  * @pol:  pointer to mempolicy to be formatted
2276  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2277  *
2278  * Convert a mempolicy into a string.
2279  * Returns the number of characters in buffer (if positive)
2280  * or an error (negative)
2281  */
2282 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2283 {
2284 	char *p = buffer;
2285 	int l;
2286 	nodemask_t nodes;
2287 	unsigned short mode;
2288 	unsigned short flags = pol ? pol->flags : 0;
2289 
2290 	/*
2291 	 * Sanity check:  room for longest mode, flag and some nodes
2292 	 */
2293 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2294 
2295 	if (!pol || pol == &default_policy)
2296 		mode = MPOL_DEFAULT;
2297 	else
2298 		mode = pol->mode;
2299 
2300 	switch (mode) {
2301 	case MPOL_DEFAULT:
2302 		nodes_clear(nodes);
2303 		break;
2304 
2305 	case MPOL_PREFERRED:
2306 		nodes_clear(nodes);
2307 		if (flags & MPOL_F_LOCAL)
2308 			mode = MPOL_LOCAL;	/* pseudo-policy */
2309 		else
2310 			node_set(pol->v.preferred_node, nodes);
2311 		break;
2312 
2313 	case MPOL_BIND:
2314 		/* Fall through */
2315 	case MPOL_INTERLEAVE:
2316 		if (no_context)
2317 			nodes = pol->w.user_nodemask;
2318 		else
2319 			nodes = pol->v.nodes;
2320 		break;
2321 
2322 	default:
2323 		BUG();
2324 	}
2325 
2326 	l = strlen(policy_types[mode]);
2327 	if (buffer + maxlen < p + l + 1)
2328 		return -ENOSPC;
2329 
2330 	strcpy(p, policy_types[mode]);
2331 	p += l;
2332 
2333 	if (flags & MPOL_MODE_FLAGS) {
2334 		if (buffer + maxlen < p + 2)
2335 			return -ENOSPC;
2336 		*p++ = '=';
2337 
2338 		/*
2339 		 * Currently, the only defined flags are mutually exclusive
2340 		 */
2341 		if (flags & MPOL_F_STATIC_NODES)
2342 			p += snprintf(p, buffer + maxlen - p, "static");
2343 		else if (flags & MPOL_F_RELATIVE_NODES)
2344 			p += snprintf(p, buffer + maxlen - p, "relative");
2345 	}
2346 
2347 	if (!nodes_empty(nodes)) {
2348 		if (buffer + maxlen < p + 2)
2349 			return -ENOSPC;
2350 		*p++ = ':';
2351 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2352 	}
2353 	return p - buffer;
2354 }
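
/*
 * Illustrative output (comment only): mpol_to_str() emits the same notation
 * that mpol_parse_str() reads, e.g. "default", "prefer:3",
 * "interleave=relative:0-7", or "local" for an MPOL_PREFERRED policy
 * carrying MPOL_F_LOCAL.  Node numbers here are made up.
 */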
2355 
2356 struct numa_maps {
2357 	unsigned long pages;
2358 	unsigned long anon;
2359 	unsigned long active;
2360 	unsigned long writeback;
2361 	unsigned long mapcount_max;
2362 	unsigned long dirty;
2363 	unsigned long swapcache;
2364 	unsigned long node[MAX_NUMNODES];
2365 };
2366 
2367 static void gather_stats(struct page *page, void *private, int pte_dirty)
2368 {
2369 	struct numa_maps *md = private;
2370 	int count = page_mapcount(page);
2371 
2372 	md->pages++;
2373 	if (pte_dirty || PageDirty(page))
2374 		md->dirty++;
2375 
2376 	if (PageSwapCache(page))
2377 		md->swapcache++;
2378 
2379 	if (PageActive(page) || PageUnevictable(page))
2380 		md->active++;
2381 
2382 	if (PageWriteback(page))
2383 		md->writeback++;
2384 
2385 	if (PageAnon(page))
2386 		md->anon++;
2387 
2388 	if (count > md->mapcount_max)
2389 		md->mapcount_max = count;
2390 
2391 	md->node[page_to_nid(page)]++;
2392 }
2393 
2394 #ifdef CONFIG_HUGETLB_PAGE
2395 static void check_huge_range(struct vm_area_struct *vma,
2396 		unsigned long start, unsigned long end,
2397 		struct numa_maps *md)
2398 {
2399 	unsigned long addr;
2400 	struct page *page;
2401 	struct hstate *h = hstate_vma(vma);
2402 	unsigned long sz = huge_page_size(h);
2403 
2404 	for (addr = start; addr < end; addr += sz) {
2405 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2406 						addr & huge_page_mask(h));
2407 		pte_t pte;
2408 
2409 		if (!ptep)
2410 			continue;
2411 
2412 		pte = *ptep;
2413 		if (pte_none(pte))
2414 			continue;
2415 
2416 		page = pte_page(pte);
2417 		if (!page)
2418 			continue;
2419 
2420 		gather_stats(page, md, pte_dirty(*ptep));
2421 	}
2422 }
2423 #else
2424 static inline void check_huge_range(struct vm_area_struct *vma,
2425 		unsigned long start, unsigned long end,
2426 		struct numa_maps *md)
2427 {
2428 }
2429 #endif
2430 
2431 /*
2432  * Display pages allocated per node and memory policy via /proc.
2433  */
2434 int show_numa_map(struct seq_file *m, void *v)
2435 {
2436 	struct proc_maps_private *priv = m->private;
2437 	struct vm_area_struct *vma = v;
2438 	struct numa_maps *md;
2439 	struct file *file = vma->vm_file;
2440 	struct mm_struct *mm = vma->vm_mm;
2441 	struct mempolicy *pol;
2442 	int n;
2443 	char buffer[50];
2444 
2445 	if (!mm)
2446 		return 0;
2447 
2448 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2449 	if (!md)
2450 		return 0;
2451 
2452 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2453 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2454 	mpol_cond_put(pol);
2455 
2456 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2457 
2458 	if (file) {
2459 		seq_printf(m, " file=");
2460 		seq_path(m, &file->f_path, "\n\t= ");
2461 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2462 		seq_printf(m, " heap");
2463 	} else if (vma->vm_start <= mm->start_stack &&
2464 			vma->vm_end >= mm->start_stack) {
2465 		seq_printf(m, " stack");
2466 	}
2467 
2468 	if (is_vm_hugetlb_page(vma)) {
2469 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2470 		seq_printf(m, " huge");
2471 	} else {
2472 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2473 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2474 	}
2475 
2476 	if (!md->pages)
2477 		goto out;
2478 
2479 	if (md->anon)
2480 		seq_printf(m, " anon=%lu", md->anon);
2481 
2482 	if (md->dirty)
2483 		seq_printf(m, " dirty=%lu", md->dirty);
2484 
2485 	if (md->pages != md->anon && md->pages != md->dirty)
2486 		seq_printf(m, " mapped=%lu", md->pages);
2487 
2488 	if (md->mapcount_max > 1)
2489 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2490 
2491 	if (md->swapcache)
2492 		seq_printf(m, " swapcache=%lu", md->swapcache);
2493 
2494 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2495 		seq_printf(m, " active=%lu", md->active);
2496 
2497 	if (md->writeback)
2498 		seq_printf(m, " writeback=%lu", md->writeback);
2499 
2500 	for_each_node_state(n, N_HIGH_MEMORY)
2501 		if (md->node[n])
2502 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2503 out:
2504 	seq_putc(m, '\n');
2505 	kfree(md);
2506 
2507 	if (m->count < m->size)
2508 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2509 	return 0;
2510 }
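
/*
 * Illustrative /proc/<pid>/numa_maps line as emitted above (all values and
 * the path are made up):
 *
 *	7f2c4a600000 interleave=static:0-3 file=/usr/lib/libexample.so mapped=42 mapmax=3 active=40 N0=11 N1=10 N2=11 N3=10
 *
 * i.e. "<vm_start> <policy>[ file=...| heap| stack][ counters][ N<nid>=<pages>...]".
 */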
2511