xref: /openbmc/linux/mm/mempolicy.c (revision 0d456bad)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the node of the local CPU. This is normally identical
31  *                to default, but useful to set in a VMA when you have a
32  *                non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
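/*
 * Illustrative sketch (not part of this file): how the policies above are
 * typically selected from user space via the mbind(2)/set_mempolicy(2)
 * system calls, here through the libnuma <numaif.h> wrappers (link with
 * -lnuma).  Assumes a machine with at least two memory nodes; 'addr' and
 * 'length' stand for an existing mapping.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	unsigned long node0 = 1UL << 0;
 *
 *	// Process policy: interleave future allocations over nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *
 *	// VMA policy: restrict an existing mapping to node 0 only.
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 *
 *	// Back to the default (local) policy for the process.
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */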
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 #include <linux/random.h>
98 
99 #include "internal.h"
100 
101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 	struct mempolicy *pol = p->mempolicy;
126 	int node;
127 
128 	if (!pol) {
129 		node = numa_node_id();
130 		if (node != -1)
131 			pol = &preferred_node_policy[node];
132 
133 		/* preferred_node_policy is not initialised early in boot */
134 		if (!pol->mode)
135 			pol = NULL;
136 	}
137 
138 	return pol;
139 }
140 
141 static const struct mempolicy_operations {
142 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
143 	/*
144 	 * If the read-side task has no lock to protect task->mempolicy, the
145 	 * write-side task will rebind task->mempolicy in two steps. The first
146 	 * step sets all the newly allowed nodes, and the second step clears
147 	 * all the disallowed nodes. This way we avoid ever being left with no
148 	 * node to allocate a page from.
149 	 * If we have a lock to protect task->mempolicy on the read side, we
150 	 * rebind directly.
151 	 *
152 	 * step:
153 	 * 	MPOL_REBIND_ONCE - do rebind work at once
154 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
155 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
156 	 */
157 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
158 			enum mpol_rebind_step step);
159 } mpol_ops[MPOL_MAX];
160 
161 /* Check that the nodemask contains at least one populated zone */
162 static int is_valid_nodemask(const nodemask_t *nodemask)
163 {
164 	int nd, k;
165 
166 	for_each_node_mask(nd, *nodemask) {
167 		struct zone *z;
168 
169 		for (k = 0; k <= policy_zone; k++) {
170 			z = &NODE_DATA(nd)->node_zones[k];
171 			if (z->present_pages > 0)
172 				return 1;
173 		}
174 	}
175 
176 	return 0;
177 }
178 
179 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
180 {
181 	return pol->flags & MPOL_MODE_FLAGS;
182 }
183 
184 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
185 				   const nodemask_t *rel)
186 {
187 	nodemask_t tmp;
188 	nodes_fold(tmp, *orig, nodes_weight(*rel));
189 	nodes_onto(*ret, tmp, *rel);
190 }
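/*
 * Worked example (illustrative): with *orig = {0,2} and *rel = {4,5,6},
 * nodes_weight(*rel) == 3, so nodes_fold() maps each bit of *orig modulo 3,
 * giving tmp = {0,2}; nodes_onto() then maps bit n of tmp onto the n-th set
 * bit of *rel, giving *ret = {4,6}.  An MPOL_F_RELATIVE_NODES mask is thus
 * re-expressed inside whatever nodes are currently allowed.
 */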
191 
192 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
193 {
194 	if (nodes_empty(*nodes))
195 		return -EINVAL;
196 	pol->v.nodes = *nodes;
197 	return 0;
198 }
199 
200 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
201 {
202 	if (!nodes)
203 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
204 	else if (nodes_empty(*nodes))
205 		return -EINVAL;			/*  no allowed nodes */
206 	else
207 		pol->v.preferred_node = first_node(*nodes);
208 	return 0;
209 }
210 
211 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
212 {
213 	if (!is_valid_nodemask(nodes))
214 		return -EINVAL;
215 	pol->v.nodes = *nodes;
216 	return 0;
217 }
218 
219 /*
220  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
221  * any, for the new policy.  mpol_new() has already validated the nodes
222  * parameter with respect to the policy mode and flags.  But, we need to
223  * handle an empty nodemask with MPOL_PREFERRED here.
224  *
225  * Must be called holding task's alloc_lock to protect task's mems_allowed
226  * and mempolicy.  May also be called holding the mmap_semaphore for write.
227  */
228 static int mpol_set_nodemask(struct mempolicy *pol,
229 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
230 {
231 	int ret;
232 
233 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
234 	if (pol == NULL)
235 		return 0;
236 	/* Check N_MEMORY */
237 	nodes_and(nsc->mask1,
238 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
239 
240 	VM_BUG_ON(!nodes);
241 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
242 		nodes = NULL;	/* explicit local allocation */
243 	else {
244 		if (pol->flags & MPOL_F_RELATIVE_NODES)
245 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
246 		else
247 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
248 
249 		if (mpol_store_user_nodemask(pol))
250 			pol->w.user_nodemask = *nodes;
251 		else
252 			pol->w.cpuset_mems_allowed =
253 						cpuset_current_mems_allowed;
254 	}
255 
256 	if (nodes)
257 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
258 	else
259 		ret = mpol_ops[pol->mode].create(pol, NULL);
260 	return ret;
261 }
262 
263 /*
264  * This function just creates a new policy, does some checks and simple
265  * initialization. You must invoke mpol_set_nodemask() to set nodes.
266  */
267 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
268 				  nodemask_t *nodes)
269 {
270 	struct mempolicy *policy;
271 
272 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
274 
275 	if (mode == MPOL_DEFAULT) {
276 		if (nodes && !nodes_empty(*nodes))
277 			return ERR_PTR(-EINVAL);
278 		return NULL;
279 	}
280 	VM_BUG_ON(!nodes);
281 
282 	/*
283 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
284 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
285 	 * All other modes require a valid pointer to a non-empty nodemask.
286 	 */
287 	if (mode == MPOL_PREFERRED) {
288 		if (nodes_empty(*nodes)) {
289 			if (((flags & MPOL_F_STATIC_NODES) ||
290 			     (flags & MPOL_F_RELATIVE_NODES)))
291 				return ERR_PTR(-EINVAL);
292 		}
293 	} else if (mode == MPOL_LOCAL) {
294 		if (!nodes_empty(*nodes))
295 			return ERR_PTR(-EINVAL);
296 		mode = MPOL_PREFERRED;
297 	} else if (nodes_empty(*nodes))
298 		return ERR_PTR(-EINVAL);
299 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
300 	if (!policy)
301 		return ERR_PTR(-ENOMEM);
302 	atomic_set(&policy->refcnt, 1);
303 	policy->mode = mode;
304 	policy->flags = flags;
305 
306 	return policy;
307 }
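/*
 * Typical call sequence (sketch, mirroring do_set_mempolicy() and do_mbind()
 * below; locking of mmap_sem and error handling trimmed for brevity):
 *
 *	NODEMASK_SCRATCH(scratch);
 *	struct mempolicy *new = mpol_new(mode, flags, nodes);
 *
 *	task_lock(current);
 *	err = mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 *	if (err)
 *		mpol_put(new);
 *	NODEMASK_SCRATCH_FREE(scratch);
 */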
308 
309 /* Slow path of an mpol destructor. */
310 void __mpol_put(struct mempolicy *p)
311 {
312 	if (!atomic_dec_and_test(&p->refcnt))
313 		return;
314 	kmem_cache_free(policy_cache, p);
315 }
316 
317 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
318 				enum mpol_rebind_step step)
319 {
320 }
321 
322 /*
323  * step:
324  * 	MPOL_REBIND_ONCE  - do rebind work at once
325  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
326  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
327  */
328 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
329 				 enum mpol_rebind_step step)
330 {
331 	nodemask_t tmp;
332 
333 	if (pol->flags & MPOL_F_STATIC_NODES)
334 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
335 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
336 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
337 	else {
338 		/*
339 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
340 		 * result
341 		 */
342 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
343 			nodes_remap(tmp, pol->v.nodes,
344 					pol->w.cpuset_mems_allowed, *nodes);
345 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
346 		} else if (step == MPOL_REBIND_STEP2) {
347 			tmp = pol->w.cpuset_mems_allowed;
348 			pol->w.cpuset_mems_allowed = *nodes;
349 		} else
350 			BUG();
351 	}
352 
353 	if (nodes_empty(tmp))
354 		tmp = *nodes;
355 
356 	if (step == MPOL_REBIND_STEP1)
357 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
358 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
359 		pol->v.nodes = tmp;
360 	else
361 		BUG();
362 
363 	if (!node_isset(current->il_next, tmp)) {
364 		current->il_next = next_node(current->il_next, tmp);
365 		if (current->il_next >= MAX_NUMNODES)
366 			current->il_next = first_node(tmp);
367 		if (current->il_next >= MAX_NUMNODES)
368 			current->il_next = numa_node_id();
369 	}
370 }
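/*
 * Worked example (illustrative) of the two-step rebind for an interleave
 * policy with neither MPOL_F_STATIC_NODES nor MPOL_F_RELATIVE_NODES set:
 * suppose v.nodes = {0,1}, w.cpuset_mems_allowed = {0,1} and the cpuset is
 * changed to {2,3}.  MPOL_REBIND_STEP1 remaps {0,1} onto {2,3} and ORs the
 * result in, leaving v.nodes = {0,1,2,3}; MPOL_REBIND_STEP2 then installs
 * the cached remap, leaving v.nodes = {2,3}.  At no point in between is
 * v.nodes empty, so concurrent lockless readers always find a node.
 */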
371 
372 static void mpol_rebind_preferred(struct mempolicy *pol,
373 				  const nodemask_t *nodes,
374 				  enum mpol_rebind_step step)
375 {
376 	nodemask_t tmp;
377 
378 	if (pol->flags & MPOL_F_STATIC_NODES) {
379 		int node = first_node(pol->w.user_nodemask);
380 
381 		if (node_isset(node, *nodes)) {
382 			pol->v.preferred_node = node;
383 			pol->flags &= ~MPOL_F_LOCAL;
384 		} else
385 			pol->flags |= MPOL_F_LOCAL;
386 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
387 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
388 		pol->v.preferred_node = first_node(tmp);
389 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
390 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
391 						   pol->w.cpuset_mems_allowed,
392 						   *nodes);
393 		pol->w.cpuset_mems_allowed = *nodes;
394 	}
395 }
396 
397 /*
398  * mpol_rebind_policy - Migrate a policy to a different set of nodes
399  *
400  * If the read-side task has no lock to protect task->mempolicy, the
401  * write-side task will rebind task->mempolicy in two steps. The first
402  * step sets all the newly allowed nodes, and the second step clears all
403  * the disallowed nodes. This way we avoid ever being left with no node
404  * to allocate a page from.
405  * If we have a lock to protect task->mempolicy on the read side, we
406  * rebind directly.
407  *
408  * step:
409  * 	MPOL_REBIND_ONCE  - do rebind work at once
410  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
411  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
412  */
413 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
414 				enum mpol_rebind_step step)
415 {
416 	if (!pol)
417 		return;
418 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
419 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
420 		return;
421 
422 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
423 		return;
424 
425 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
426 		BUG();
427 
428 	if (step == MPOL_REBIND_STEP1)
429 		pol->flags |= MPOL_F_REBINDING;
430 	else if (step == MPOL_REBIND_STEP2)
431 		pol->flags &= ~MPOL_F_REBINDING;
432 	else if (step >= MPOL_REBIND_NSTEP)
433 		BUG();
434 
435 	mpol_ops[pol->mode].rebind(pol, newmask, step);
436 }
437 
438 /*
439  * Wrapper for mpol_rebind_policy() that just requires task
440  * pointer, and updates task mempolicy.
441  *
442  * Called with task's alloc_lock held.
443  */
444 
445 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
446 			enum mpol_rebind_step step)
447 {
448 	mpol_rebind_policy(tsk->mempolicy, new, step);
449 }
450 
451 /*
452  * Rebind each vma in mm to new nodemask.
453  *
454  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
455  */
456 
457 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
458 {
459 	struct vm_area_struct *vma;
460 
461 	down_write(&mm->mmap_sem);
462 	for (vma = mm->mmap; vma; vma = vma->vm_next)
463 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
464 	up_write(&mm->mmap_sem);
465 }
466 
467 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
468 	[MPOL_DEFAULT] = {
469 		.rebind = mpol_rebind_default,
470 	},
471 	[MPOL_INTERLEAVE] = {
472 		.create = mpol_new_interleave,
473 		.rebind = mpol_rebind_nodemask,
474 	},
475 	[MPOL_PREFERRED] = {
476 		.create = mpol_new_preferred,
477 		.rebind = mpol_rebind_preferred,
478 	},
479 	[MPOL_BIND] = {
480 		.create = mpol_new_bind,
481 		.rebind = mpol_rebind_nodemask,
482 	},
483 };
484 
485 static void migrate_page_add(struct page *page, struct list_head *pagelist,
486 				unsigned long flags);
487 
488 /* Scan through the pages, checking whether they satisfy the given conditions. */
489 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
490 		unsigned long addr, unsigned long end,
491 		const nodemask_t *nodes, unsigned long flags,
492 		void *private)
493 {
494 	pte_t *orig_pte;
495 	pte_t *pte;
496 	spinlock_t *ptl;
497 
498 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
499 	do {
500 		struct page *page;
501 		int nid;
502 
503 		if (!pte_present(*pte))
504 			continue;
505 		page = vm_normal_page(vma, addr, *pte);
506 		if (!page)
507 			continue;
508 		/*
509 		 * vm_normal_page() filters out zero pages, but there might
510 		 * still be PageReserved pages to skip, perhaps in a VDSO.
511 		 * And we cannot move PageKsm pages sensibly or safely yet.
512 		 */
513 		if (PageReserved(page) || PageKsm(page))
514 			continue;
515 		nid = page_to_nid(page);
516 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
517 			continue;
518 
519 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
520 			migrate_page_add(page, private, flags);
521 		else
522 			break;
523 	} while (pte++, addr += PAGE_SIZE, addr != end);
524 	pte_unmap_unlock(orig_pte, ptl);
525 	return addr != end;
526 }
527 
528 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
529 		unsigned long addr, unsigned long end,
530 		const nodemask_t *nodes, unsigned long flags,
531 		void *private)
532 {
533 	pmd_t *pmd;
534 	unsigned long next;
535 
536 	pmd = pmd_offset(pud, addr);
537 	do {
538 		next = pmd_addr_end(addr, end);
539 		split_huge_page_pmd(vma, addr, pmd);
540 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
541 			continue;
542 		if (check_pte_range(vma, pmd, addr, next, nodes,
543 				    flags, private))
544 			return -EIO;
545 	} while (pmd++, addr = next, addr != end);
546 	return 0;
547 }
548 
549 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
550 		unsigned long addr, unsigned long end,
551 		const nodemask_t *nodes, unsigned long flags,
552 		void *private)
553 {
554 	pud_t *pud;
555 	unsigned long next;
556 
557 	pud = pud_offset(pgd, addr);
558 	do {
559 		next = pud_addr_end(addr, end);
560 		if (pud_none_or_clear_bad(pud))
561 			continue;
562 		if (check_pmd_range(vma, pud, addr, next, nodes,
563 				    flags, private))
564 			return -EIO;
565 	} while (pud++, addr = next, addr != end);
566 	return 0;
567 }
568 
569 static inline int check_pgd_range(struct vm_area_struct *vma,
570 		unsigned long addr, unsigned long end,
571 		const nodemask_t *nodes, unsigned long flags,
572 		void *private)
573 {
574 	pgd_t *pgd;
575 	unsigned long next;
576 
577 	pgd = pgd_offset(vma->vm_mm, addr);
578 	do {
579 		next = pgd_addr_end(addr, end);
580 		if (pgd_none_or_clear_bad(pgd))
581 			continue;
582 		if (check_pud_range(vma, pgd, addr, next, nodes,
583 				    flags, private))
584 			return -EIO;
585 	} while (pgd++, addr = next, addr != end);
586 	return 0;
587 }
588 
589 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590 /*
591  * This is used to mark a range of virtual addresses as inaccessible.
592  * These are later cleared by a NUMA hinting fault. Depending on these
593  * faults, pages may be migrated for better NUMA placement.
594  *
595  * This is assuming that NUMA faults are handled using PROT_NONE. If
596  * an architecture makes a different choice, it will need further
597  * changes to the core.
598  */
599 unsigned long change_prot_numa(struct vm_area_struct *vma,
600 			unsigned long addr, unsigned long end)
601 {
602 	int nr_updated;
603 	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604 
605 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 	if (nr_updated)
607 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608 
609 	return nr_updated;
610 }
611 #else
612 static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 			unsigned long addr, unsigned long end)
614 {
615 	return 0;
616 }
617 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
618 
619 /*
620  * Check if all pages in a range are on a set of nodes.
621  * If pagelist != NULL then isolate pages from the LRU and
622  * put them on the pagelist.
623  */
624 static struct vm_area_struct *
625 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
626 		const nodemask_t *nodes, unsigned long flags, void *private)
627 {
628 	int err;
629 	struct vm_area_struct *first, *vma, *prev;
630 
631 
632 	first = find_vma(mm, start);
633 	if (!first)
634 		return ERR_PTR(-EFAULT);
635 	prev = NULL;
636 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 		unsigned long endvma = vma->vm_end;
638 
639 		if (endvma > end)
640 			endvma = end;
641 		if (vma->vm_start > start)
642 			start = vma->vm_start;
643 
644 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
645 			if (!vma->vm_next && vma->vm_end < end)
646 				return ERR_PTR(-EFAULT);
647 			if (prev && prev->vm_end < vma->vm_start)
648 				return ERR_PTR(-EFAULT);
649 		}
650 
651 		if (is_vm_hugetlb_page(vma))
652 			goto next;
653 
654 		if (flags & MPOL_MF_LAZY) {
655 			change_prot_numa(vma, start, endvma);
656 			goto next;
657 		}
658 
659 		if ((flags & MPOL_MF_STRICT) ||
660 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
661 		      vma_migratable(vma))) {
662 
663 			err = check_pgd_range(vma, start, endvma, nodes,
664 						flags, private);
665 			if (err) {
666 				first = ERR_PTR(err);
667 				break;
668 			}
669 		}
670 next:
671 		prev = vma;
672 	}
673 	return first;
674 }
675 
676 /*
677  * Apply policy to a single VMA.
678  * This must be called with the mmap_sem held for writing.
679  */
680 static int vma_replace_policy(struct vm_area_struct *vma,
681 						struct mempolicy *pol)
682 {
683 	int err;
684 	struct mempolicy *old;
685 	struct mempolicy *new;
686 
687 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
688 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
689 		 vma->vm_ops, vma->vm_file,
690 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
691 
692 	new = mpol_dup(pol);
693 	if (IS_ERR(new))
694 		return PTR_ERR(new);
695 
696 	if (vma->vm_ops && vma->vm_ops->set_policy) {
697 		err = vma->vm_ops->set_policy(vma, new);
698 		if (err)
699 			goto err_out;
700 	}
701 
702 	old = vma->vm_policy;
703 	vma->vm_policy = new; /* protected by mmap_sem */
704 	mpol_put(old);
705 
706 	return 0;
707  err_out:
708 	mpol_put(new);
709 	return err;
710 }
711 
712 /* Step 2: apply policy to a range and do splits. */
713 static int mbind_range(struct mm_struct *mm, unsigned long start,
714 		       unsigned long end, struct mempolicy *new_pol)
715 {
716 	struct vm_area_struct *next;
717 	struct vm_area_struct *prev;
718 	struct vm_area_struct *vma;
719 	int err = 0;
720 	pgoff_t pgoff;
721 	unsigned long vmstart;
722 	unsigned long vmend;
723 
724 	vma = find_vma(mm, start);
725 	if (!vma || vma->vm_start > start)
726 		return -EFAULT;
727 
728 	prev = vma->vm_prev;
729 	if (start > vma->vm_start)
730 		prev = vma;
731 
732 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
733 		next = vma->vm_next;
734 		vmstart = max(start, vma->vm_start);
735 		vmend   = min(end, vma->vm_end);
736 
737 		if (mpol_equal(vma_policy(vma), new_pol))
738 			continue;
739 
740 		pgoff = vma->vm_pgoff +
741 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
742 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
743 				  vma->anon_vma, vma->vm_file, pgoff,
744 				  new_pol);
745 		if (prev) {
746 			vma = prev;
747 			next = vma->vm_next;
748 			continue;
749 		}
750 		if (vma->vm_start != vmstart) {
751 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
752 			if (err)
753 				goto out;
754 		}
755 		if (vma->vm_end != vmend) {
756 			err = split_vma(vma->vm_mm, vma, vmend, 0);
757 			if (err)
758 				goto out;
759 		}
760 		err = vma_replace_policy(vma, new_pol);
761 		if (err)
762 			goto out;
763 	}
764 
765  out:
766 	return err;
767 }
768 
769 /*
770  * Update task->flags PF_MEMPOLICY bit: set iff non-default
771  * mempolicy.  Allows more rapid checking of this (combined perhaps
772  * with other PF_* flag bits) on memory allocation hot code paths.
773  *
774  * If called from outside this file, the task 'p' should -only- be
775  * a newly forked child not yet visible on the task list, because
776  * manipulating the task flags of a visible task is not safe.
777  *
778  * The above limitation is why this routine has the funny name
779  * mpol_fix_fork_child_flag().
780  *
781  * It is also safe to call this with a task pointer of current,
782  * which the static wrapper mpol_set_task_struct_flag() does,
783  * for use within this file.
784  */
785 
786 void mpol_fix_fork_child_flag(struct task_struct *p)
787 {
788 	if (p->mempolicy)
789 		p->flags |= PF_MEMPOLICY;
790 	else
791 		p->flags &= ~PF_MEMPOLICY;
792 }
793 
794 static void mpol_set_task_struct_flag(void)
795 {
796 	mpol_fix_fork_child_flag(current);
797 }
798 
799 /* Set the process memory policy */
800 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
801 			     nodemask_t *nodes)
802 {
803 	struct mempolicy *new, *old;
804 	struct mm_struct *mm = current->mm;
805 	NODEMASK_SCRATCH(scratch);
806 	int ret;
807 
808 	if (!scratch)
809 		return -ENOMEM;
810 
811 	new = mpol_new(mode, flags, nodes);
812 	if (IS_ERR(new)) {
813 		ret = PTR_ERR(new);
814 		goto out;
815 	}
816 	/*
817 	 * prevent changing our mempolicy while show_numa_maps()
818 	 * is using it.
819 	 * Note:  do_set_mempolicy() can be called at init time
820 	 * with no 'mm'.
821 	 */
822 	if (mm)
823 		down_write(&mm->mmap_sem);
824 	task_lock(current);
825 	ret = mpol_set_nodemask(new, nodes, scratch);
826 	if (ret) {
827 		task_unlock(current);
828 		if (mm)
829 			up_write(&mm->mmap_sem);
830 		mpol_put(new);
831 		goto out;
832 	}
833 	old = current->mempolicy;
834 	current->mempolicy = new;
835 	mpol_set_task_struct_flag();
836 	if (new && new->mode == MPOL_INTERLEAVE &&
837 	    nodes_weight(new->v.nodes))
838 		current->il_next = first_node(new->v.nodes);
839 	task_unlock(current);
840 	if (mm)
841 		up_write(&mm->mmap_sem);
842 
843 	mpol_put(old);
844 	ret = 0;
845 out:
846 	NODEMASK_SCRATCH_FREE(scratch);
847 	return ret;
848 }
849 
850 /*
851  * Return nodemask for policy for get_mempolicy() query
852  *
853  * Called with task's alloc_lock held
854  */
855 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
856 {
857 	nodes_clear(*nodes);
858 	if (p == &default_policy)
859 		return;
860 
861 	switch (p->mode) {
862 	case MPOL_BIND:
863 		/* Fall through */
864 	case MPOL_INTERLEAVE:
865 		*nodes = p->v.nodes;
866 		break;
867 	case MPOL_PREFERRED:
868 		if (!(p->flags & MPOL_F_LOCAL))
869 			node_set(p->v.preferred_node, *nodes);
870 		/* else return empty node mask for local allocation */
871 		break;
872 	default:
873 		BUG();
874 	}
875 }
876 
877 static int lookup_node(struct mm_struct *mm, unsigned long addr)
878 {
879 	struct page *p;
880 	int err;
881 
882 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
883 	if (err >= 0) {
884 		err = page_to_nid(p);
885 		put_page(p);
886 	}
887 	return err;
888 }
889 
890 /* Retrieve NUMA policy */
891 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
892 			     unsigned long addr, unsigned long flags)
893 {
894 	int err;
895 	struct mm_struct *mm = current->mm;
896 	struct vm_area_struct *vma = NULL;
897 	struct mempolicy *pol = current->mempolicy;
898 
899 	if (flags &
900 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
901 		return -EINVAL;
902 
903 	if (flags & MPOL_F_MEMS_ALLOWED) {
904 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
905 			return -EINVAL;
906 		*policy = 0;	/* just so it's initialized */
907 		task_lock(current);
908 		*nmask  = cpuset_current_mems_allowed;
909 		task_unlock(current);
910 		return 0;
911 	}
912 
913 	if (flags & MPOL_F_ADDR) {
914 		/*
915 		 * Do NOT fall back to task policy if the
916 		 * vma/shared policy at addr is NULL.  We
917 		 * want to return MPOL_DEFAULT in this case.
918 		 */
919 		down_read(&mm->mmap_sem);
920 		vma = find_vma_intersection(mm, addr, addr+1);
921 		if (!vma) {
922 			up_read(&mm->mmap_sem);
923 			return -EFAULT;
924 		}
925 		if (vma->vm_ops && vma->vm_ops->get_policy)
926 			pol = vma->vm_ops->get_policy(vma, addr);
927 		else
928 			pol = vma->vm_policy;
929 	} else if (addr)
930 		return -EINVAL;
931 
932 	if (!pol)
933 		pol = &default_policy;	/* indicates default behavior */
934 
935 	if (flags & MPOL_F_NODE) {
936 		if (flags & MPOL_F_ADDR) {
937 			err = lookup_node(mm, addr);
938 			if (err < 0)
939 				goto out;
940 			*policy = err;
941 		} else if (pol == current->mempolicy &&
942 				pol->mode == MPOL_INTERLEAVE) {
943 			*policy = current->il_next;
944 		} else {
945 			err = -EINVAL;
946 			goto out;
947 		}
948 	} else {
949 		*policy = pol == &default_policy ? MPOL_DEFAULT :
950 						pol->mode;
951 		/*
952 		 * Internal mempolicy flags must be masked off before exposing
953 		 * the policy to userspace.
954 		 */
955 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
956 	}
957 
958 	if (vma) {
959 		up_read(&current->mm->mmap_sem);
960 		vma = NULL;
961 	}
962 
963 	err = 0;
964 	if (nmask) {
965 		if (mpol_store_user_nodemask(pol)) {
966 			*nmask = pol->w.user_nodemask;
967 		} else {
968 			task_lock(current);
969 			get_policy_nodemask(pol, nmask);
970 			task_unlock(current);
971 		}
972 	}
973 
974  out:
975 	mpol_cond_put(pol);
976 	if (vma)
977 		up_read(&current->mm->mmap_sem);
978 	return err;
979 }
980 
981 #ifdef CONFIG_MIGRATION
982 /*
983  * page migration
984  */
985 static void migrate_page_add(struct page *page, struct list_head *pagelist,
986 				unsigned long flags)
987 {
988 	/*
989 	 * Avoid migrating a page that is shared with others.
990 	 */
991 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
992 		if (!isolate_lru_page(page)) {
993 			list_add_tail(&page->lru, pagelist);
994 			inc_zone_page_state(page, NR_ISOLATED_ANON +
995 					    page_is_file_cache(page));
996 		}
997 	}
998 }
999 
1000 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1001 {
1002 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1003 }
1004 
1005 /*
1006  * Migrate pages from one node to a target node.
1007  * Returns error or the number of pages not migrated.
1008  */
1009 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1010 			   int flags)
1011 {
1012 	nodemask_t nmask;
1013 	LIST_HEAD(pagelist);
1014 	int err = 0;
1015 
1016 	nodes_clear(nmask);
1017 	node_set(source, nmask);
1018 
1019 	/*
1020 	 * This does not "check" the range but isolates all pages that
1021 	 * need migration.  Between passing in the full user address
1022 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1023 	 */
1024 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1025 	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1026 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1027 
1028 	if (!list_empty(&pagelist)) {
1029 		err = migrate_pages(&pagelist, new_node_page, dest,
1030 							false, MIGRATE_SYNC,
1031 							MR_SYSCALL);
1032 		if (err)
1033 			putback_lru_pages(&pagelist);
1034 	}
1035 
1036 	return err;
1037 }
1038 
1039 /*
1040  * Move pages between the two nodesets so as to preserve the physical
1041  * layout as much as possible.
1042  *
1043  * Returns the number of pages that could not be moved.
1044  */
1045 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1046 		     const nodemask_t *to, int flags)
1047 {
1048 	int busy = 0;
1049 	int err;
1050 	nodemask_t tmp;
1051 
1052 	err = migrate_prep();
1053 	if (err)
1054 		return err;
1055 
1056 	down_read(&mm->mmap_sem);
1057 
1058 	err = migrate_vmas(mm, from, to, flags);
1059 	if (err)
1060 		goto out;
1061 
1062 	/*
1063 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1064 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1065 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1066 	 * The pair of nodemasks 'to' and 'from' define the map.
1067 	 *
1068  * If no pair of bits is found that way, fall back to picking some
1069 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1070 	 * 'source' and 'dest' bits are the same, this represents a node
1071 	 * that will be migrating to itself, so no pages need move.
1072 	 *
1073 	 * If no bits are left in 'tmp', or if all remaining bits left
1074 	 * in 'tmp' correspond to the same bit in 'to', return false
1075 	 * (nothing left to migrate).
1076 	 *
1077 	 * This lets us pick a pair of nodes to migrate between, such that
1078 	 * if possible the dest node is not already occupied by some other
1079 	 * source node, minimizing the risk of overloading the memory on a
1080 	 * node that would happen if we migrated incoming memory to a node
1081  * before migrating outgoing memory sourced from that same node.
1082 	 *
1083 	 * A single scan of tmp is sufficient.  As we go, we remember the
1084 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1085 	 * that not only moved, but what's better, moved to an empty slot
1086  * (d is not set in tmp), then we break out immediately with that pair.
1087  * Otherwise, when we finish scanning tmp, we at least have the
1088 	 * most recent <s, d> pair that moved.  If we get all the way through
1089 	 * the scan of tmp without finding any node that moved, much less
1090 	 * moved to an empty node, then there is nothing left worth migrating.
1091 	 */
1092 
1093 	tmp = *from;
1094 	while (!nodes_empty(tmp)) {
1095 		int s,d;
1096 		int source = -1;
1097 		int dest = 0;
1098 
1099 		for_each_node_mask(s, tmp) {
1100 
1101 			/*
1102 			 * do_migrate_pages() tries to maintain the relative
1103 			 * node relationship of the pages established between
1104 			 * threads and memory areas.
1105 			 *
1106 			 * However, if the number of source nodes is not equal to
1107 			 * the number of destination nodes, we cannot preserve
1108 			 * this node-relative relationship.  In that case, skip
1109 			 * copying memory from a node that is in the destination
1110 			 * mask.
1111 			 *
1112 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1113 			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
1114 			 */
1115 
1116 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1117 						(node_isset(s, *to)))
1118 				continue;
1119 
1120 			d = node_remap(s, *from, *to);
1121 			if (s == d)
1122 				continue;
1123 
1124 			source = s;	/* Node moved. Memorize */
1125 			dest = d;
1126 
1127 			/* dest not in remaining from nodes? */
1128 			if (!node_isset(dest, tmp))
1129 				break;
1130 		}
1131 		if (source == -1)
1132 			break;
1133 
1134 		node_clear(source, tmp);
1135 		err = migrate_to_node(mm, source, dest, flags);
1136 		if (err > 0)
1137 			busy += err;
1138 		if (err < 0)
1139 			break;
1140 	}
1141 out:
1142 	up_read(&mm->mmap_sem);
1143 	if (err < 0)
1144 		return err;
1145 	return busy;
1146 
1147 }
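/*
 * Worked example (illustrative) of the pair-picking loop above, migrating
 * from = {0,1,2} to = {2,3,4}: pass 1 notes 0->2 but keeps scanning because
 * node 2 is still a pending source, and settles on 1->3; pass 2 picks 2->4;
 * pass 3 finally picks 0->2, which is now safe because node 2's outgoing
 * pages have already been moved away.
 */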
1148 
1149 /*
1150  * Allocate a new page for page migration based on vma policy.
1151  * Start assuming that page is mapped by vma pointed to by @private.
1152  * Search forward from there, if not.  N.B., this assumes that the
1153  * list of pages handed to migrate_pages()--which is how we get here--
1154  * is in virtual address order.
1155  */
1156 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1157 {
1158 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1159 	unsigned long uninitialized_var(address);
1160 
1161 	while (vma) {
1162 		address = page_address_in_vma(page, vma);
1163 		if (address != -EFAULT)
1164 			break;
1165 		vma = vma->vm_next;
1166 	}
1167 
1168 	/*
1169 	 * if !vma, alloc_page_vma() will use task or system default policy
1170 	 */
1171 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1172 }
1173 #else
1174 
1175 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1176 				unsigned long flags)
1177 {
1178 }
1179 
1180 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1181 		     const nodemask_t *to, int flags)
1182 {
1183 	return -ENOSYS;
1184 }
1185 
1186 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1187 {
1188 	return NULL;
1189 }
1190 #endif
1191 
1192 static long do_mbind(unsigned long start, unsigned long len,
1193 		     unsigned short mode, unsigned short mode_flags,
1194 		     nodemask_t *nmask, unsigned long flags)
1195 {
1196 	struct vm_area_struct *vma;
1197 	struct mm_struct *mm = current->mm;
1198 	struct mempolicy *new;
1199 	unsigned long end;
1200 	int err;
1201 	LIST_HEAD(pagelist);
1202 
1203 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1204 		return -EINVAL;
1205 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1206 		return -EPERM;
1207 
1208 	if (start & ~PAGE_MASK)
1209 		return -EINVAL;
1210 
1211 	if (mode == MPOL_DEFAULT)
1212 		flags &= ~MPOL_MF_STRICT;
1213 
1214 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1215 	end = start + len;
1216 
1217 	if (end < start)
1218 		return -EINVAL;
1219 	if (end == start)
1220 		return 0;
1221 
1222 	new = mpol_new(mode, mode_flags, nmask);
1223 	if (IS_ERR(new))
1224 		return PTR_ERR(new);
1225 
1226 	if (flags & MPOL_MF_LAZY)
1227 		new->flags |= MPOL_F_MOF;
1228 
1229 	/*
1230 	 * If we are using the default policy then operation
1231 	 * on discontinuous address spaces is okay after all
1232 	 */
1233 	if (!new)
1234 		flags |= MPOL_MF_DISCONTIG_OK;
1235 
1236 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 		 start, start + len, mode, mode_flags,
1238 		 nmask ? nodes_addr(*nmask)[0] : -1);
1239 
1240 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241 
1242 		err = migrate_prep();
1243 		if (err)
1244 			goto mpol_out;
1245 	}
1246 	{
1247 		NODEMASK_SCRATCH(scratch);
1248 		if (scratch) {
1249 			down_write(&mm->mmap_sem);
1250 			task_lock(current);
1251 			err = mpol_set_nodemask(new, nmask, scratch);
1252 			task_unlock(current);
1253 			if (err)
1254 				up_write(&mm->mmap_sem);
1255 		} else
1256 			err = -ENOMEM;
1257 		NODEMASK_SCRATCH_FREE(scratch);
1258 	}
1259 	if (err)
1260 		goto mpol_out;
1261 
1262 	vma = check_range(mm, start, end, nmask,
1263 			  flags | MPOL_MF_INVERT, &pagelist);
1264 
1265 	err = PTR_ERR(vma);	/* maybe ... */
1266 	if (!IS_ERR(vma))
1267 		err = mbind_range(mm, start, end, new);
1268 
1269 	if (!err) {
1270 		int nr_failed = 0;
1271 
1272 		if (!list_empty(&pagelist)) {
1273 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 						(unsigned long)vma,
1276 						false, MIGRATE_SYNC,
1277 						MR_MEMPOLICY_MBIND);
1278 			if (nr_failed)
1279 				putback_lru_pages(&pagelist);
1280 		}
1281 
1282 		if (nr_failed && (flags & MPOL_MF_STRICT))
1283 			err = -EIO;
1284 	} else
1285 		putback_lru_pages(&pagelist);
1286 
1287 	up_write(&mm->mmap_sem);
1288  mpol_out:
1289 	mpol_put(new);
1290 	return err;
1291 }
1292 
1293 /*
1294  * User space interface with variable-sized bitmaps for nodelists.
1295  */
1296 
1297 /* Copy a node mask from user space. */
1298 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1299 		     unsigned long maxnode)
1300 {
1301 	unsigned long k;
1302 	unsigned long nlongs;
1303 	unsigned long endmask;
1304 
1305 	--maxnode;
1306 	nodes_clear(*nodes);
1307 	if (maxnode == 0 || !nmask)
1308 		return 0;
1309 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1310 		return -EINVAL;
1311 
1312 	nlongs = BITS_TO_LONGS(maxnode);
1313 	if ((maxnode % BITS_PER_LONG) == 0)
1314 		endmask = ~0UL;
1315 	else
1316 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1317 
1318 	/* When the user specifies more nodes than supported, just check
1319 	   that the unsupported part is all zero. */
1320 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1321 		if (nlongs > PAGE_SIZE/sizeof(long))
1322 			return -EINVAL;
1323 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1324 			unsigned long t;
1325 			if (get_user(t, nmask + k))
1326 				return -EFAULT;
1327 			if (k == nlongs - 1) {
1328 				if (t & endmask)
1329 					return -EINVAL;
1330 			} else if (t)
1331 				return -EINVAL;
1332 		}
1333 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1334 		endmask = ~0UL;
1335 	}
1336 
1337 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1338 		return -EFAULT;
1339 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1340 	return 0;
1341 }
1342 
1343 /* Copy a kernel node mask to user space */
1344 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1345 			      nodemask_t *nodes)
1346 {
1347 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1348 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1349 
1350 	if (copy > nbytes) {
1351 		if (copy > PAGE_SIZE)
1352 			return -EINVAL;
1353 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1354 			return -EFAULT;
1355 		copy = nbytes;
1356 	}
1357 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1358 }
1359 
1360 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1361 		unsigned long, mode, unsigned long __user *, nmask,
1362 		unsigned long, maxnode, unsigned, flags)
1363 {
1364 	nodemask_t nodes;
1365 	int err;
1366 	unsigned short mode_flags;
1367 
1368 	mode_flags = mode & MPOL_MODE_FLAGS;
1369 	mode &= ~MPOL_MODE_FLAGS;
1370 	if (mode >= MPOL_MAX)
1371 		return -EINVAL;
1372 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1373 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1374 		return -EINVAL;
1375 	err = get_nodes(&nodes, nmask, maxnode);
1376 	if (err)
1377 		return err;
1378 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1379 }
1380 
1381 /* Set the process memory policy */
1382 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1383 		unsigned long, maxnode)
1384 {
1385 	int err;
1386 	nodemask_t nodes;
1387 	unsigned short flags;
1388 
1389 	flags = mode & MPOL_MODE_FLAGS;
1390 	mode &= ~MPOL_MODE_FLAGS;
1391 	if ((unsigned int)mode >= MPOL_MAX)
1392 		return -EINVAL;
1393 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1394 		return -EINVAL;
1395 	err = get_nodes(&nodes, nmask, maxnode);
1396 	if (err)
1397 		return err;
1398 	return do_set_mempolicy(mode, flags, &nodes);
1399 }
1400 
1401 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1402 		const unsigned long __user *, old_nodes,
1403 		const unsigned long __user *, new_nodes)
1404 {
1405 	const struct cred *cred = current_cred(), *tcred;
1406 	struct mm_struct *mm = NULL;
1407 	struct task_struct *task;
1408 	nodemask_t task_nodes;
1409 	int err;
1410 	nodemask_t *old;
1411 	nodemask_t *new;
1412 	NODEMASK_SCRATCH(scratch);
1413 
1414 	if (!scratch)
1415 		return -ENOMEM;
1416 
1417 	old = &scratch->mask1;
1418 	new = &scratch->mask2;
1419 
1420 	err = get_nodes(old, old_nodes, maxnode);
1421 	if (err)
1422 		goto out;
1423 
1424 	err = get_nodes(new, new_nodes, maxnode);
1425 	if (err)
1426 		goto out;
1427 
1428 	/* Find the mm_struct */
1429 	rcu_read_lock();
1430 	task = pid ? find_task_by_vpid(pid) : current;
1431 	if (!task) {
1432 		rcu_read_unlock();
1433 		err = -ESRCH;
1434 		goto out;
1435 	}
1436 	get_task_struct(task);
1437 
1438 	err = -EINVAL;
1439 
1440 	/*
1441 	 * Check if this process has the right to modify the specified
1442 	 * process. The right exists if the process has administrative
1443 	 * capabilities, superuser privileges or the same
1444 	 * userid as the target process.
1445 	 */
1446 	tcred = __task_cred(task);
1447 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1448 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1449 	    !capable(CAP_SYS_NICE)) {
1450 		rcu_read_unlock();
1451 		err = -EPERM;
1452 		goto out_put;
1453 	}
1454 	rcu_read_unlock();
1455 
1456 	task_nodes = cpuset_mems_allowed(task);
1457 	/* Is the user allowed to access the target nodes? */
1458 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1459 		err = -EPERM;
1460 		goto out_put;
1461 	}
1462 
1463 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1464 		err = -EINVAL;
1465 		goto out_put;
1466 	}
1467 
1468 	err = security_task_movememory(task);
1469 	if (err)
1470 		goto out_put;
1471 
1472 	mm = get_task_mm(task);
1473 	put_task_struct(task);
1474 
1475 	if (!mm) {
1476 		err = -EINVAL;
1477 		goto out;
1478 	}
1479 
1480 	err = do_migrate_pages(mm, old, new,
1481 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1482 
1483 	mmput(mm);
1484 out:
1485 	NODEMASK_SCRATCH_FREE(scratch);
1486 
1487 	return err;
1488 
1489 out_put:
1490 	put_task_struct(task);
1491 	goto out;
1492 
1493 }
1494 
1495 
1496 /* Retrieve NUMA policy */
1497 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1498 		unsigned long __user *, nmask, unsigned long, maxnode,
1499 		unsigned long, addr, unsigned long, flags)
1500 {
1501 	int err;
1502 	int uninitialized_var(pval);
1503 	nodemask_t nodes;
1504 
1505 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1506 		return -EINVAL;
1507 
1508 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1509 
1510 	if (err)
1511 		return err;
1512 
1513 	if (policy && put_user(pval, policy))
1514 		return -EFAULT;
1515 
1516 	if (nmask)
1517 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1518 
1519 	return err;
1520 }
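/*
 * Illustrative sketch (not part of this file): querying the calling thread's
 * policy from user space through the libnuma <numaif.h> wrapper.  Note the
 * check above: when a nodemask is requested, maxnode must cover at least
 * MAX_NUMNODES bits, hence the generously sized buffer.
 *
 *	#include <stdio.h>
 *	#include <numaif.h>		// link with -lnuma
 *
 *	int mode;
 *	unsigned long nodes[16] = { 0 };	// 1024 bits, assumed >= MAX_NUMNODES
 *
 *	if (get_mempolicy(&mode, nodes, sizeof(nodes) * 8, NULL, 0) == 0)
 *		printf("mode %d\n", mode);	// e.g. MPOL_DEFAULT, MPOL_INTERLEAVE, ...
 */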
1521 
1522 #ifdef CONFIG_COMPAT
1523 
1524 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1525 				     compat_ulong_t __user *nmask,
1526 				     compat_ulong_t maxnode,
1527 				     compat_ulong_t addr, compat_ulong_t flags)
1528 {
1529 	long err;
1530 	unsigned long __user *nm = NULL;
1531 	unsigned long nr_bits, alloc_size;
1532 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1533 
1534 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1535 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1536 
1537 	if (nmask)
1538 		nm = compat_alloc_user_space(alloc_size);
1539 
1540 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1541 
1542 	if (!err && nmask) {
1543 		unsigned long copy_size;
1544 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1545 		err = copy_from_user(bm, nm, copy_size);
1546 		/* ensure entire bitmap is zeroed */
1547 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1548 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1549 	}
1550 
1551 	return err;
1552 }
1553 
1554 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1555 				     compat_ulong_t maxnode)
1556 {
1557 	long err = 0;
1558 	unsigned long __user *nm = NULL;
1559 	unsigned long nr_bits, alloc_size;
1560 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1561 
1562 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1563 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1564 
1565 	if (nmask) {
1566 		err = compat_get_bitmap(bm, nmask, nr_bits);
1567 		nm = compat_alloc_user_space(alloc_size);
1568 		err |= copy_to_user(nm, bm, alloc_size);
1569 	}
1570 
1571 	if (err)
1572 		return -EFAULT;
1573 
1574 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1575 }
1576 
1577 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1578 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1579 			     compat_ulong_t maxnode, compat_ulong_t flags)
1580 {
1581 	long err = 0;
1582 	unsigned long __user *nm = NULL;
1583 	unsigned long nr_bits, alloc_size;
1584 	nodemask_t bm;
1585 
1586 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1587 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1588 
1589 	if (nmask) {
1590 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1591 		nm = compat_alloc_user_space(alloc_size);
1592 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1593 	}
1594 
1595 	if (err)
1596 		return -EFAULT;
1597 
1598 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1599 }
1600 
1601 #endif
1602 
1603 /*
1604  * get_vma_policy(@task, @vma, @addr)
1605  * @task - task for fallback if vma policy == default
1606  * @vma   - virtual memory area whose policy is sought
1607  * @addr  - address in @vma for shared policy lookup
1608  *
1609  * Returns effective policy for a VMA at specified address.
1610  * Falls back to @task or system default policy, as necessary.
1611  * Current or other task's task mempolicy and non-shared vma policies must be
1612  * protected by task_lock(task) by the caller.
1613  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1614  * count--added by the get_policy() vm_op, as appropriate--to protect against
1615  * freeing by another task.  It is the caller's responsibility to free the
1616  * extra reference for shared policies.
1617  */
1618 struct mempolicy *get_vma_policy(struct task_struct *task,
1619 		struct vm_area_struct *vma, unsigned long addr)
1620 {
1621 	struct mempolicy *pol = get_task_policy(task);
1622 
1623 	if (vma) {
1624 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1625 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1626 									addr);
1627 			if (vpol)
1628 				pol = vpol;
1629 		} else if (vma->vm_policy) {
1630 			pol = vma->vm_policy;
1631 
1632 			/*
1633 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1634 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1635 			 * count on these policies which will be dropped by
1636 			 * mpol_cond_put() later
1637 			 */
1638 			if (mpol_needs_cond_ref(pol))
1639 				mpol_get(pol);
1640 		}
1641 	}
1642 	if (!pol)
1643 		pol = &default_policy;
1644 	return pol;
1645 }
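/*
 * Usage sketch: callers of get_vma_policy() that can receive a shared
 * (MPOL_F_SHARED) policy must drop the extra reference when done, e.g.
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... use pol ...
 *	mpol_cond_put(pol);
 *
 * as do_get_mempolicy() above does.
 */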
1646 
1647 /*
1648  * Return a nodemask representing a mempolicy for filtering nodes for
1649  * page allocation
1650  */
1651 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652 {
1653 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1654 	if (unlikely(policy->mode == MPOL_BIND) &&
1655 			gfp_zone(gfp) >= policy_zone &&
1656 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 		return &policy->v.nodes;
1658 
1659 	return NULL;
1660 }
1661 
1662 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1663 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1664 	int nd)
1665 {
1666 	switch (policy->mode) {
1667 	case MPOL_PREFERRED:
1668 		if (!(policy->flags & MPOL_F_LOCAL))
1669 			nd = policy->v.preferred_node;
1670 		break;
1671 	case MPOL_BIND:
1672 		/*
1673 		 * Normally, MPOL_BIND allocations are node-local within the
1674 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1675 		 * current node isn't part of the mask, we use the zonelist for
1676 		 * the first node in the mask instead.
1677 		 */
1678 		if (unlikely(gfp & __GFP_THISNODE) &&
1679 				unlikely(!node_isset(nd, policy->v.nodes)))
1680 			nd = first_node(policy->v.nodes);
1681 		break;
1682 	default:
1683 		BUG();
1684 	}
1685 	return node_zonelist(nd, gfp);
1686 }
1687 
1688 /* Do dynamic interleaving for a process */
1689 static unsigned interleave_nodes(struct mempolicy *policy)
1690 {
1691 	unsigned nid, next;
1692 	struct task_struct *me = current;
1693 
1694 	nid = me->il_next;
1695 	next = next_node(nid, policy->v.nodes);
1696 	if (next >= MAX_NUMNODES)
1697 		next = first_node(policy->v.nodes);
1698 	if (next < MAX_NUMNODES)
1699 		me->il_next = next;
1700 	return nid;
1701 }
1702 
1703 /*
1704  * Depending on the memory policy, provide a node from which to allocate the
1705  * next slab entry.
1706  * @policy must be protected from freeing by the caller.  If @policy is
1707  * the current task's mempolicy, this protection is implicit, as only the
1708  * task can change its policy.  The system default policy requires no
1709  * such protection.
1710  */
1711 unsigned slab_node(void)
1712 {
1713 	struct mempolicy *policy;
1714 
1715 	if (in_interrupt())
1716 		return numa_node_id();
1717 
1718 	policy = current->mempolicy;
1719 	if (!policy || policy->flags & MPOL_F_LOCAL)
1720 		return numa_node_id();
1721 
1722 	switch (policy->mode) {
1723 	case MPOL_PREFERRED:
1724 		/*
1725 		 * handled MPOL_F_LOCAL above
1726 		 */
1727 		return policy->v.preferred_node;
1728 
1729 	case MPOL_INTERLEAVE:
1730 		return interleave_nodes(policy);
1731 
1732 	case MPOL_BIND: {
1733 		/*
1734 		 * Follow bind policy behavior and start allocation at the
1735 		 * first node.
1736 		 */
1737 		struct zonelist *zonelist;
1738 		struct zone *zone;
1739 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1740 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1741 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1742 							&policy->v.nodes,
1743 							&zone);
1744 		return zone ? zone->node : numa_node_id();
1745 	}
1746 
1747 	default:
1748 		BUG();
1749 	}
1750 }
1751 
1752 /* Do static interleaving for a VMA with known offset. */
1753 static unsigned offset_il_node(struct mempolicy *pol,
1754 		struct vm_area_struct *vma, unsigned long off)
1755 {
1756 	unsigned nnodes = nodes_weight(pol->v.nodes);
1757 	unsigned target;
1758 	int c;
1759 	int nid = -1;
1760 
1761 	if (!nnodes)
1762 		return numa_node_id();
1763 	target = (unsigned int)off % nnodes;
1764 	c = 0;
1765 	do {
1766 		nid = next_node(nid, pol->v.nodes);
1767 		c++;
1768 	} while (c <= target);
1769 	return nid;
1770 }
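/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop returns the second set
 * node, i.e. nid = 3.  The same offset therefore always maps to the same
 * node, which is what makes this interleaving "static".
 */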
1771 
1772 /* Determine a node number for interleave */
1773 static inline unsigned interleave_nid(struct mempolicy *pol,
1774 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1775 {
1776 	if (vma) {
1777 		unsigned long off;
1778 
1779 		/*
1780 		 * for small pages, there is no difference between
1781 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1782 		 * for huge pages, since vm_pgoff is in units of small
1783 		 * pages, we need to shift off the always 0 bits to get
1784 		 * a useful offset.
1785 		 */
1786 		BUG_ON(shift < PAGE_SHIFT);
1787 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1788 		off += (addr - vma->vm_start) >> shift;
1789 		return offset_il_node(pol, vma, off);
1790 	} else
1791 		return interleave_nodes(pol);
1792 }
1793 
1794 /*
1795  * Return the bit number of a random bit set in the nodemask.
1796  * (returns -1 if nodemask is empty)
1797  */
1798 int node_random(const nodemask_t *maskp)
1799 {
1800 	int w, bit = -1;
1801 
1802 	w = nodes_weight(*maskp);
1803 	if (w)
1804 		bit = bitmap_ord_to_pos(maskp->bits,
1805 			get_random_int() % w, MAX_NUMNODES);
1806 	return bit;
1807 }
1808 
1809 #ifdef CONFIG_HUGETLBFS
1810 /*
1811  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1812  * @vma = virtual memory area whose policy is sought
1813  * @addr = address in @vma for shared policy lookup and interleave policy
1814  * @gfp_flags = for requested zone
1815  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1816  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1817  *
1818  * Returns a zonelist suitable for a huge page allocation and a pointer
1819  * to the struct mempolicy for conditional unref after allocation.
1820  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1821  * @nodemask for filtering the zonelist.
1822  *
1823  * Must be protected by get_mems_allowed()
1824  */
1825 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1826 				gfp_t gfp_flags, struct mempolicy **mpol,
1827 				nodemask_t **nodemask)
1828 {
1829 	struct zonelist *zl;
1830 
1831 	*mpol = get_vma_policy(current, vma, addr);
1832 	*nodemask = NULL;	/* assume !MPOL_BIND */
1833 
1834 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1835 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1836 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1837 	} else {
1838 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1839 		if ((*mpol)->mode == MPOL_BIND)
1840 			*nodemask = &(*mpol)->v.nodes;
1841 	}
1842 	return zl;
1843 }
1844 
1845 /*
1846  * init_nodemask_of_mempolicy
1847  *
1848  * If the current task's mempolicy is "default" [NULL], return 'false'
1849  * to indicate default policy.  Otherwise, extract the policy nodemask
1850  * for 'bind' or 'interleave' policy into the argument nodemask, or
1851  * initialize the argument nodemask to contain the single node for
1852  * 'preferred' or 'local' policy and return 'true' to indicate presence
1853  * of non-default mempolicy.
1854  *
1855  * We don't bother with reference counting the mempolicy [mpol_get/put]
1856  * because the current task is examining its own mempolicy and a task's
1857  * mempolicy is only ever changed by the task itself.
1858  *
1859  * N.B., it is the caller's responsibility to free a returned nodemask.
1860  */
1861 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1862 {
1863 	struct mempolicy *mempolicy;
1864 	int nid;
1865 
1866 	if (!(mask && current->mempolicy))
1867 		return false;
1868 
1869 	task_lock(current);
1870 	mempolicy = current->mempolicy;
1871 	switch (mempolicy->mode) {
1872 	case MPOL_PREFERRED:
1873 		if (mempolicy->flags & MPOL_F_LOCAL)
1874 			nid = numa_node_id();
1875 		else
1876 			nid = mempolicy->v.preferred_node;
1877 		init_nodemask_of_node(mask, nid);
1878 		break;
1879 
1880 	case MPOL_BIND:
1881 		/* Fall through */
1882 	case MPOL_INTERLEAVE:
1883 		*mask = mempolicy->v.nodes;
1884 		break;
1885 
1886 	default:
1887 		BUG();
1888 	}
1889 	task_unlock(current);
1890 
1891 	return true;
1892 }
1893 #endif
1894 
1895 /*
1896  * mempolicy_nodemask_intersects
1897  *
1898  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1899  * policy.  Otherwise, check for intersection between mask and the policy
1900  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1901  * policy, always return true since it may allocate elsewhere on fallback.
1902  *
1903  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1904  */
1905 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1906 					const nodemask_t *mask)
1907 {
1908 	struct mempolicy *mempolicy;
1909 	bool ret = true;
1910 
1911 	if (!mask)
1912 		return ret;
1913 	task_lock(tsk);
1914 	mempolicy = tsk->mempolicy;
1915 	if (!mempolicy)
1916 		goto out;
1917 
1918 	switch (mempolicy->mode) {
1919 	case MPOL_PREFERRED:
1920 		/*
1921 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only a preference for which
1922 		 * nodes to allocate from; the task may fall back to others when OOM.
1923 		 * Thus, it's possible for tsk to have allocated memory from
1924 		 * nodes in mask.
1925 		 */
1926 		break;
1927 	case MPOL_BIND:
1928 	case MPOL_INTERLEAVE:
1929 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1930 		break;
1931 	default:
1932 		BUG();
1933 	}
1934 out:
1935 	task_unlock(tsk);
1936 	return ret;
1937 }
1938 
1939 /* Allocate a page in interleaved policy.
1940    Own path because it needs to do special accounting. */
1941 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1942 					unsigned nid)
1943 {
1944 	struct zonelist *zl;
1945 	struct page *page;
1946 
1947 	zl = node_zonelist(nid, gfp);
1948 	page = __alloc_pages(gfp, order, zl);
1949 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1950 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1951 	return page;
1952 }
1953 
1954 /**
1955  * 	alloc_pages_vma	- Allocate a page for a VMA.
1956  *
1957  * 	@gfp:
1958  *      %GFP_USER    user allocation.
1959  *      %GFP_KERNEL  kernel allocations,
1960  *      %GFP_HIGHMEM highmem/user allocations,
1961  *      %GFP_FS      allocation should not call back into a file system.
1962  *      %GFP_ATOMIC  don't sleep.
1963  *
1964  *	@order: Order of the GFP allocation.
1965  * 	@vma:  Pointer to VMA or NULL if not available.
1966  *	@addr: Virtual address of the allocation. Must be inside the VMA.
 *	@node: Which node to prefer for allocation (modulo policy).
1967  *
1968  * 	This function allocates a page from the kernel page pool and applies
1969  *	a NUMA policy associated with the VMA or the current process.
1970  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1971  *	mm_struct of the VMA to prevent it from going away. Should be used for
1972  *	all allocations for pages that will be mapped into
1973  * 	user space. Returns NULL when no page can be allocated.
1974  *
1975  *	Should be called with the mmap_sem of the vma's mm held.
1976  */
1977 struct page *
1978 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1979 		unsigned long addr, int node)
1980 {
1981 	struct mempolicy *pol;
1982 	struct page *page;
1983 	unsigned int cpuset_mems_cookie;
1984 
1985 retry_cpuset:
1986 	pol = get_vma_policy(current, vma, addr);
1987 	cpuset_mems_cookie = get_mems_allowed();
1988 
1989 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1990 		unsigned nid;
1991 
1992 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1993 		mpol_cond_put(pol);
1994 		page = alloc_page_interleave(gfp, order, nid);
1995 		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1996 			goto retry_cpuset;
1997 
1998 		return page;
1999 	}
2000 	page = __alloc_pages_nodemask(gfp, order,
2001 				      policy_zonelist(gfp, pol, node),
2002 				      policy_nodemask(gfp, pol));
2003 	if (unlikely(mpol_needs_cond_ref(pol)))
2004 		__mpol_put(pol);
2005 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2006 		goto retry_cpuset;
2007 	return page;
2008 }
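
/*
 * Minimal usage sketch: roughly what the alloc_page_vma() convenience
 * wrapper in <linux/gfp.h> does for a single page preferring the local
 * node:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
 *			       numa_node_id());
 */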
2009 
2010 /**
2011  * 	alloc_pages_current - Allocate pages.
2012  *
2013  *	@gfp:
2014  *		%GFP_USER   user allocation,
2015  *      	%GFP_KERNEL kernel allocation,
2016  *      	%GFP_HIGHMEM highmem allocation,
2017  *      	%GFP_FS     don't call back into a file system.
2018  *      	%GFP_ATOMIC don't sleep.
2019  *	@order: Order of the allocation; the size is (1 << order) pages. 0 is a single page.
2020  *
2021  *	Allocate a page from the kernel page pool.  When not in
2022  *	interrupt context, apply the current process' NUMA policy.
2023  *	Returns NULL when no page can be allocated.
2024  *
2025  *	Don't call cpuset_update_task_memory_state() unless
2026  *	1) it's ok to take cpuset_sem (can WAIT), and
2027  *	2) allocating for current task (not interrupt).
2028  */
2029 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2030 {
2031 	struct mempolicy *pol = get_task_policy(current);
2032 	struct page *page;
2033 	unsigned int cpuset_mems_cookie;
2034 
2035 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2036 		pol = &default_policy;
2037 
2038 retry_cpuset:
2039 	cpuset_mems_cookie = get_mems_allowed();
2040 
2041 	/*
2042 	 * No reference counting needed for current->mempolicy
2043 	 * nor system default_policy
2044 	 */
2045 	if (pol->mode == MPOL_INTERLEAVE)
2046 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2047 	else
2048 		page = __alloc_pages_nodemask(gfp, order,
2049 				policy_zonelist(gfp, pol, numa_node_id()),
2050 				policy_nodemask(gfp, pol));
2051 
2052 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2053 		goto retry_cpuset;
2054 
2055 	return page;
2056 }
2057 EXPORT_SYMBOL(alloc_pages_current);
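
/*
 * With CONFIG_NUMA the generic alloc_pages() helper resolves to
 * alloc_pages_current(), so an ordinary allocation such as
 *
 *	page = alloc_pages(GFP_KERNEL, 0);
 *
 * ends up here and honours the calling task's mempolicy.
 */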
2058 
2059 /*
2060  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2061  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2062  * with the mems_allowed returned by cpuset_mems_allowed().  This
2063  * keeps mempolicies cpuset relative after its cpuset moves.  See
2064  * further kernel/cpuset.c update_nodemask().
2065  *
2066  * current's mempolicy may be rebound by another task (the task that changes
2067  * the cpuset's mems), so we needn't do the rebind work for the current task.
2068  */
2069 
2070 /* Slow path of a mempolicy duplicate */
2071 struct mempolicy *__mpol_dup(struct mempolicy *old)
2072 {
2073 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2074 
2075 	if (!new)
2076 		return ERR_PTR(-ENOMEM);
2077 
2078 	/* task's mempolicy is protected by alloc_lock */
2079 	if (old == current->mempolicy) {
2080 		task_lock(current);
2081 		*new = *old;
2082 		task_unlock(current);
2083 	} else
2084 		*new = *old;
2085 
2086 	rcu_read_lock();
2087 	if (current_cpuset_is_being_rebound()) {
2088 		nodemask_t mems = cpuset_mems_allowed(current);
2089 		if (new->flags & MPOL_F_REBINDING)
2090 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2091 		else
2092 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2093 	}
2094 	rcu_read_unlock();
2095 	atomic_set(&new->refcnt, 1);
2096 	return new;
2097 }
2098 
2099 /* Slow path of a mempolicy comparison */
2100 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2101 {
2102 	if (!a || !b)
2103 		return false;
2104 	if (a->mode != b->mode)
2105 		return false;
2106 	if (a->flags != b->flags)
2107 		return false;
2108 	if (mpol_store_user_nodemask(a))
2109 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2110 			return false;
2111 
2112 	switch (a->mode) {
2113 	case MPOL_BIND:
2114 		/* Fall through */
2115 	case MPOL_INTERLEAVE:
2116 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2117 	case MPOL_PREFERRED:
2118 		return a->v.preferred_node == b->v.preferred_node;
2119 	default:
2120 		BUG();
2121 		return false;
2122 	}
2123 }
2124 
2125 /*
2126  * Shared memory backing store policy support.
2127  *
2128  * Remember policies even when nobody has shared memory mapped.
2129  * The policies are kept in Red-Black tree linked from the inode.
2130  * They are protected by the sp->mutex, which must be held for
2131  * any accesses to the tree.
2132  */
2133 
2134 /* lookup first element intersecting start-end */
2135 /* Caller holds sp->mutex */
2136 static struct sp_node *
2137 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2138 {
2139 	struct rb_node *n = sp->root.rb_node;
2140 
2141 	while (n) {
2142 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2143 
2144 		if (start >= p->end)
2145 			n = n->rb_right;
2146 		else if (end <= p->start)
2147 			n = n->rb_left;
2148 		else
2149 			break;
2150 	}
2151 	if (!n)
2152 		return NULL;
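	/*
	 * n intersects [start, end); walk back to the lowest-starting
	 * node that still intersects the range, so the caller sees the
	 * overlapping ranges in order.
	 */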
2153 	for (;;) {
2154 		struct sp_node *w = NULL;
2155 		struct rb_node *prev = rb_prev(n);
2156 		if (!prev)
2157 			break;
2158 		w = rb_entry(prev, struct sp_node, nd);
2159 		if (w->end <= start)
2160 			break;
2161 		n = prev;
2162 	}
2163 	return rb_entry(n, struct sp_node, nd);
2164 }
2165 
2166 /* Insert a new shared policy into the list. */
2167 /* Caller holds sp->mutex */
2168 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2169 {
2170 	struct rb_node **p = &sp->root.rb_node;
2171 	struct rb_node *parent = NULL;
2172 	struct sp_node *nd;
2173 
2174 	while (*p) {
2175 		parent = *p;
2176 		nd = rb_entry(parent, struct sp_node, nd);
2177 		if (new->start < nd->start)
2178 			p = &(*p)->rb_left;
2179 		else if (new->end > nd->end)
2180 			p = &(*p)->rb_right;
2181 		else
2182 			BUG();
2183 	}
2184 	rb_link_node(&new->nd, parent, p);
2185 	rb_insert_color(&new->nd, &sp->root);
2186 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2187 		 new->policy ? new->policy->mode : 0);
2188 }
2189 
2190 /* Find shared policy intersecting idx */
2191 struct mempolicy *
2192 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2193 {
2194 	struct mempolicy *pol = NULL;
2195 	struct sp_node *sn;
2196 
2197 	if (!sp->root.rb_node)
2198 		return NULL;
2199 	mutex_lock(&sp->mutex);
2200 	sn = sp_lookup(sp, idx, idx+1);
2201 	if (sn) {
2202 		mpol_get(sn->policy);
2203 		pol = sn->policy;
2204 	}
2205 	mutex_unlock(&sp->mutex);
2206 	return pol;
2207 }
2208 
2209 static void sp_free(struct sp_node *n)
2210 {
2211 	mpol_put(n->policy);
2212 	kmem_cache_free(sn_cache, n);
2213 }
2214 
2215 /**
2216  * mpol_misplaced - check whether current page node is valid in policy
2217  *
2218  * @page   - page to be checked
2219  * @vma    - vm area where page mapped
2220  * @addr   - virtual address where page mapped
2221  *
2222  * Lookup current policy node id for vma,addr and "compare to" page's
2223  * node id.
2224  *
2225  * Returns:
2226  *	-1	- not misplaced, page is in the right node
2227  *	node	- node id where the page should be
2228  *
2229  * Policy determination "mimics" alloc_page_vma().
2230  * Called from fault path where we know the vma and faulting address.
2231  */
2232 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233 {
2234 	struct mempolicy *pol;
2235 	struct zone *zone;
2236 	int curnid = page_to_nid(page);
2237 	unsigned long pgoff;
2238 	int polnid = -1;
2239 	int ret = -1;
2240 
2241 	BUG_ON(!vma);
2242 
2243 	pol = get_vma_policy(current, vma, addr);
2244 	if (!(pol->flags & MPOL_F_MOF))
2245 		goto out;
2246 
2247 	switch (pol->mode) {
2248 	case MPOL_INTERLEAVE:
2249 		BUG_ON(addr >= vma->vm_end);
2250 		BUG_ON(addr < vma->vm_start);
2251 
2252 		pgoff = vma->vm_pgoff;
2253 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 		polnid = offset_il_node(pol, vma, pgoff);
2255 		break;
2256 
2257 	case MPOL_PREFERRED:
2258 		if (pol->flags & MPOL_F_LOCAL)
2259 			polnid = numa_node_id();
2260 		else
2261 			polnid = pol->v.preferred_node;
2262 		break;
2263 
2264 	case MPOL_BIND:
2265 		/*
2266 		 * allows binding to multiple nodes.
2267 		 * use current page if in policy nodemask,
2268 		 * else select nearest allowed node, if any.
2269 		 * If no allowed nodes, use current [!misplaced].
2270 		 */
2271 		if (node_isset(curnid, pol->v.nodes))
2272 			goto out;
2273 		(void)first_zones_zonelist(
2274 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 				gfp_zone(GFP_HIGHUSER),
2276 				&pol->v.nodes, &zone);
2277 		polnid = zone->node;
2278 		break;
2279 
2280 	default:
2281 		BUG();
2282 	}
2283 
2284 	/* Migrate the page towards the node whose CPU is referencing it */
2285 	if (pol->flags & MPOL_F_MORON) {
2286 		int last_nid;
2287 
2288 		polnid = numa_node_id();
2289 
2290 		/*
2291 		 * Multi-stage node selection is used in conjunction
2292 		 * with a periodic migration fault to build a temporal
2293 		 * task<->page relation. By using a two-stage filter we
2294 		 * remove short/unlikely relations.
2295 		 *
2296 		 * Using P(p) ~ n_p / n_t as per frequentist
2297 		 * probability, we can equate a task's usage of a
2298 		 * particular page (n_p) per total usage of this
2299 		 * page (n_t) (in a given time-span) to a probability.
2300 		 *
2301 		 * Our periodic faults will sample this probability and
2302 		 * getting the same result twice in a row, given these
2303 		 * samples are fully independent, is then given by
2304 		 * P(p)^2, provided our sample period is sufficiently
2305 		 * short compared to the usage pattern.
2306 		 *
2307 		 * This quadratic squishes small probabilities, making
2308 		 * it less likely we act on an unlikely task<->page
2309 		 * relation.
2310 		 */
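		/*
		 * Illustrative numbers: if this task accounts for ~30% of
		 * the accesses to this page, requiring the same node on two
		 * consecutive samples cuts the chance of acting on that
		 * relation to roughly 0.3^2 = 9%.
		 */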
2311 		last_nid = page_xchg_last_nid(page, polnid);
2312 		if (last_nid != polnid)
2313 			goto out;
2314 	}
2315 
2316 	if (curnid != polnid)
2317 		ret = polnid;
2318 out:
2319 	mpol_cond_put(pol);
2320 
2321 	return ret;
2322 }
2323 
2324 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2325 {
2326 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2327 	rb_erase(&n->nd, &sp->root);
2328 	sp_free(n);
2329 }
2330 
2331 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2332 				struct mempolicy *pol)
2333 {
2334 	struct sp_node *n;
2335 	struct mempolicy *newpol;
2336 
2337 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2338 	if (!n)
2339 		return NULL;
2340 
2341 	newpol = mpol_dup(pol);
2342 	if (IS_ERR(newpol)) {
2343 		kmem_cache_free(sn_cache, n);
2344 		return NULL;
2345 	}
2346 	newpol->flags |= MPOL_F_SHARED;
2347 
2348 	n->start = start;
2349 	n->end = end;
2350 	n->policy = newpol;
2351 
2352 	return n;
2353 }
2354 
2355 /* Replace a policy range. */
2356 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2357 				 unsigned long end, struct sp_node *new)
2358 {
2359 	struct sp_node *n;
2360 	int ret = 0;
2361 
2362 	mutex_lock(&sp->mutex);
2363 	n = sp_lookup(sp, start, end);
2364 	/* Take care of old policies in the same range. */
2365 	while (n && n->start < end) {
2366 		struct rb_node *next = rb_next(&n->nd);
2367 		if (n->start >= start) {
2368 			if (n->end <= end)
2369 				sp_delete(sp, n);
2370 			else
2371 				n->start = end;
2372 		} else {
2373 			/* Old policy spanning whole new range. */
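			/*
			 * e.g. (illustrative ranges) inserting [2,4) into an
			 * existing node covering [0,8): the old node is
			 * trimmed to [0,2) and new2 is created for [4,8).
			 */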
2374 			if (n->end > end) {
2375 				struct sp_node *new2;
2376 				new2 = sp_alloc(end, n->end, n->policy);
2377 				if (!new2) {
2378 					ret = -ENOMEM;
2379 					goto out;
2380 				}
2381 				n->end = start;
2382 				sp_insert(sp, new2);
2383 				break;
2384 			} else
2385 				n->end = start;
2386 		}
2387 		if (!next)
2388 			break;
2389 		n = rb_entry(next, struct sp_node, nd);
2390 	}
2391 	if (new)
2392 		sp_insert(sp, new);
2393 out:
2394 	mutex_unlock(&sp->mutex);
2395 	return ret;
2396 }
2397 
2398 /**
2399  * mpol_shared_policy_init - initialize shared policy for inode
2400  * @sp: pointer to inode shared policy
2401  * @mpol:  struct mempolicy to install
2402  *
2403  * Install non-NULL @mpol in inode's shared policy rb-tree.
2404  * On entry, the current task has a reference on a non-NULL @mpol.
2405  * This must be released on exit.
2406  * This is called at get_inode() time, so we can use GFP_KERNEL.
2407  */
2408 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2409 {
2410 	int ret;
2411 
2412 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2413 	mutex_init(&sp->mutex);
2414 
2415 	if (mpol) {
2416 		struct vm_area_struct pvma;
2417 		struct mempolicy *new;
2418 		NODEMASK_SCRATCH(scratch);
2419 
2420 		if (!scratch)
2421 			goto put_mpol;
2422 		/* contextualize the tmpfs mount point mempolicy */
2423 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2424 		if (IS_ERR(new))
2425 			goto free_scratch; /* no valid nodemask intersection */
2426 
2427 		task_lock(current);
2428 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2429 		task_unlock(current);
2430 		if (ret)
2431 			goto put_new;
2432 
2433 		/* Create pseudo-vma that contains just the policy */
2434 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2435 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2436 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2437 
2438 put_new:
2439 		mpol_put(new);			/* drop initial ref */
2440 free_scratch:
2441 		NODEMASK_SCRATCH_FREE(scratch);
2442 put_mpol:
2443 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2444 	}
2445 }
2446 
2447 int mpol_set_shared_policy(struct shared_policy *info,
2448 			struct vm_area_struct *vma, struct mempolicy *npol)
2449 {
2450 	int err;
2451 	struct sp_node *new = NULL;
2452 	unsigned long sz = vma_pages(vma);
2453 
2454 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2455 		 vma->vm_pgoff,
2456 		 sz, npol ? npol->mode : -1,
2457 		 npol ? npol->flags : -1,
2458 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2459 
2460 	if (npol) {
2461 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2462 		if (!new)
2463 			return -ENOMEM;
2464 	}
2465 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2466 	if (err && new)
2467 		sp_free(new);
2468 	return err;
2469 }
2470 
2471 /* Free a backing policy store on inode delete. */
2472 void mpol_free_shared_policy(struct shared_policy *p)
2473 {
2474 	struct sp_node *n;
2475 	struct rb_node *next;
2476 
2477 	if (!p->root.rb_node)
2478 		return;
2479 	mutex_lock(&p->mutex);
2480 	next = rb_first(&p->root);
2481 	while (next) {
2482 		n = rb_entry(next, struct sp_node, nd);
2483 		next = rb_next(&n->nd);
2484 		sp_delete(p, n);
2485 	}
2486 	mutex_unlock(&p->mutex);
2487 }
2488 
2489 #ifdef CONFIG_NUMA_BALANCING
2490 static bool __initdata numabalancing_override;
2491 
2492 static void __init check_numabalancing_enable(void)
2493 {
2494 	bool numabalancing_default = false;
2495 
2496 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2497 		numabalancing_default = true;
2498 
2499 	if (nr_node_ids > 1 && !numabalancing_override) {
2500 		printk(KERN_INFO "Enabling automatic NUMA balancing. "
2501 			"Configure with numa_balancing= or the kernel.numa_balancing sysctl\n");
2502 		set_numabalancing_state(numabalancing_default);
2503 	}
2504 }
2505 
2506 static int __init setup_numabalancing(char *str)
2507 {
2508 	int ret = 0;
2509 	if (!str)
2510 		goto out;
2511 	numabalancing_override = true;
2512 
2513 	if (!strcmp(str, "enable")) {
2514 		set_numabalancing_state(true);
2515 		ret = 1;
2516 	} else if (!strcmp(str, "disable")) {
2517 		set_numabalancing_state(false);
2518 		ret = 1;
2519 	}
2520 out:
2521 	if (!ret)
2522 		printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2523 
2524 	return ret;
2525 }
2526 __setup("numa_balancing=", setup_numabalancing);
2527 #else
2528 static inline void __init check_numabalancing_enable(void)
2529 {
2530 }
2531 #endif /* CONFIG_NUMA_BALANCING */
2532 
2533 /* assumes fs == KERNEL_DS */
2534 void __init numa_policy_init(void)
2535 {
2536 	nodemask_t interleave_nodes;
2537 	unsigned long largest = 0;
2538 	int nid, prefer = 0;
2539 
2540 	policy_cache = kmem_cache_create("numa_policy",
2541 					 sizeof(struct mempolicy),
2542 					 0, SLAB_PANIC, NULL);
2543 
2544 	sn_cache = kmem_cache_create("shared_policy_node",
2545 				     sizeof(struct sp_node),
2546 				     0, SLAB_PANIC, NULL);
2547 
2548 	for_each_node(nid) {
2549 		preferred_node_policy[nid] = (struct mempolicy) {
2550 			.refcnt = ATOMIC_INIT(1),
2551 			.mode = MPOL_PREFERRED,
2552 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2553 			.v = { .preferred_node = nid, },
2554 		};
2555 	}
2556 
2557 	/*
2558 	 * Set interleaving policy for system init. Interleaving is only
2559 	 * enabled across suitably sized nodes (default is >= 16MB), or
2560 	 * fall back to the largest node if they're all smaller.
2561 	 */
2562 	nodes_clear(interleave_nodes);
2563 	for_each_node_state(nid, N_MEMORY) {
2564 		unsigned long total_pages = node_present_pages(nid);
2565 
2566 		/* Preserve the largest node */
2567 		if (largest < total_pages) {
2568 			largest = total_pages;
2569 			prefer = nid;
2570 		}
2571 
2572 		/* Interleave this node? */
2573 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2574 			node_set(nid, interleave_nodes);
2575 	}
2576 
2577 	/* All too small, use the largest */
2578 	if (unlikely(nodes_empty(interleave_nodes)))
2579 		node_set(prefer, interleave_nodes);
2580 
2581 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2582 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2583 
2584 	check_numabalancing_enable();
2585 }
2586 
2587 /* Reset policy of current process to default */
2588 void numa_default_policy(void)
2589 {
2590 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2591 }
2592 
2593 /*
2594  * Parse and format mempolicy from/to strings
2595  */
2596 
2597 /*
2598  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2599  * Used only for mpol_parse_str() and mpol_to_str()
2600  */
2601 static const char * const policy_modes[] =
2602 {
2603 	[MPOL_DEFAULT]    = "default",
2604 	[MPOL_PREFERRED]  = "prefer",
2605 	[MPOL_BIND]       = "bind",
2606 	[MPOL_INTERLEAVE] = "interleave",
2607 	[MPOL_LOCAL]      = "local",
2608 };
2609 
2610 
2611 #ifdef CONFIG_TMPFS
2612 /**
2613  * mpol_parse_str - parse string to mempolicy
2614  * @str:  string containing mempolicy to parse
2615  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2616  * @no_context:  flag whether to "contextualize" the mempolicy
2617  *
2618  * Format of input:
2619  *	<mode>[=<flags>][:<nodelist>]
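 *	e.g. (illustrative) "bind:0-3", "interleave=relative:0-7", "local"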
2620  *
2621  * if @no_context is true, save the input nodemask in w.user_nodemask in
2622  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2623  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2624  * mount option.  Note that if 'static' or 'relative' mode flags were
2625  * specified, the input nodemask will already have been saved.  Saving
2626  * it again is redundant, but safe.
2627  *
2628  * On success, returns 0, else 1
2629  */
2630 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2631 {
2632 	struct mempolicy *new = NULL;
2633 	unsigned short mode;
2634 	unsigned short uninitialized_var(mode_flags);
2635 	nodemask_t nodes;
2636 	char *nodelist = strchr(str, ':');
2637 	char *flags = strchr(str, '=');
2638 	int err = 1;
2639 
2640 	if (nodelist) {
2641 		/* NUL-terminate mode or flags string */
2642 		*nodelist++ = '\0';
2643 		if (nodelist_parse(nodelist, nodes))
2644 			goto out;
2645 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2646 			goto out;
2647 	} else
2648 		nodes_clear(nodes);
2649 
2650 	if (flags)
2651 		*flags++ = '\0';	/* terminate mode string */
2652 
2653 	for (mode = 0; mode < MPOL_MAX; mode++) {
2654 		if (!strcmp(str, policy_modes[mode]))
2655 			break;
2657 	}
2658 	if (mode >= MPOL_MAX)
2659 		goto out;
2660 
2661 	switch (mode) {
2662 	case MPOL_PREFERRED:
2663 		/*
2664 		 * Insist on a nodelist of one node only
2665 		 */
2666 		if (nodelist) {
2667 			char *rest = nodelist;
2668 			while (isdigit(*rest))
2669 				rest++;
2670 			if (*rest)
2671 				goto out;
2672 		}
2673 		break;
2674 	case MPOL_INTERLEAVE:
2675 		/*
2676 		 * Default to online nodes with memory if no nodelist
2677 		 */
2678 		if (!nodelist)
2679 			nodes = node_states[N_MEMORY];
2680 		break;
2681 	case MPOL_LOCAL:
2682 		/*
2683 		 * Don't allow a nodelist;  mpol_new() checks flags
2684 		 */
2685 		if (nodelist)
2686 			goto out;
2687 		mode = MPOL_PREFERRED;
2688 		break;
2689 	case MPOL_DEFAULT:
2690 		/*
2691 		 * Insist on an empty nodelist
2692 		 */
2693 		if (!nodelist)
2694 			err = 0;
2695 		goto out;
2696 	case MPOL_BIND:
2697 		/*
2698 		 * Insist on a nodelist
2699 		 */
2700 		if (!nodelist)
2701 			goto out;
2702 	}
2703 
2704 	mode_flags = 0;
2705 	if (flags) {
2706 		/*
2707 		 * Currently, we only support two mutually exclusive
2708 		 * mode flags.
2709 		 */
2710 		if (!strcmp(flags, "static"))
2711 			mode_flags |= MPOL_F_STATIC_NODES;
2712 		else if (!strcmp(flags, "relative"))
2713 			mode_flags |= MPOL_F_RELATIVE_NODES;
2714 		else
2715 			goto out;
2716 	}
2717 
2718 	new = mpol_new(mode, mode_flags, &nodes);
2719 	if (IS_ERR(new))
2720 		goto out;
2721 
2722 	if (no_context) {
2723 		/* save for contextualization */
2724 		new->w.user_nodemask = nodes;
2725 	} else {
2726 		int ret;
2727 		NODEMASK_SCRATCH(scratch);
2728 		if (scratch) {
2729 			task_lock(current);
2730 			ret = mpol_set_nodemask(new, &nodes, scratch);
2731 			task_unlock(current);
2732 		} else
2733 			ret = -ENOMEM;
2734 		NODEMASK_SCRATCH_FREE(scratch);
2735 		if (ret) {
2736 			mpol_put(new);
2737 			goto out;
2738 		}
2739 	}
2740 	err = 0;
2741 
2742 out:
2743 	/* Restore string for error message */
2744 	if (nodelist)
2745 		*--nodelist = ':';
2746 	if (flags)
2747 		*--flags = '=';
2748 	if (!err)
2749 		*mpol = new;
2750 	return err;
2751 }
2752 #endif /* CONFIG_TMPFS */
2753 
2754 /**
2755  * mpol_to_str - format a mempolicy structure for printing
2756  * @buffer:  to contain formatted mempolicy string
2757  * @maxlen:  length of @buffer
2758  * @pol:  pointer to mempolicy to be formatted
2759  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2760  *
2761  * Convert a mempolicy into a string.
2762  * Returns the number of characters in buffer (if positive)
2763  * or an error (negative)
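 *
 * Illustrative outputs: "interleave:0-3", "prefer=static:2", "local".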
2764  */
2765 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2766 {
2767 	char *p = buffer;
2768 	int l;
2769 	nodemask_t nodes;
2770 	unsigned short mode;
2771 	unsigned short flags = pol ? pol->flags : 0;
2772 
2773 	/*
2774 	 * Sanity check:  room for longest mode, flag and some nodes
2775 	 */
2776 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2777 
2778 	if (!pol || pol == &default_policy)
2779 		mode = MPOL_DEFAULT;
2780 	else
2781 		mode = pol->mode;
2782 
2783 	switch (mode) {
2784 	case MPOL_DEFAULT:
2785 		nodes_clear(nodes);
2786 		break;
2787 
2788 	case MPOL_PREFERRED:
2789 		nodes_clear(nodes);
2790 		if (flags & MPOL_F_LOCAL)
2791 			mode = MPOL_LOCAL;	/* pseudo-policy */
2792 		else
2793 			node_set(pol->v.preferred_node, nodes);
2794 		break;
2795 
2796 	case MPOL_BIND:
2797 		/* Fall through */
2798 	case MPOL_INTERLEAVE:
2799 		if (no_context)
2800 			nodes = pol->w.user_nodemask;
2801 		else
2802 			nodes = pol->v.nodes;
2803 		break;
2804 
2805 	default:
2806 		return -EINVAL;
2807 	}
2808 
2809 	l = strlen(policy_modes[mode]);
2810 	if (buffer + maxlen < p + l + 1)
2811 		return -ENOSPC;
2812 
2813 	strcpy(p, policy_modes[mode]);
2814 	p += l;
2815 
2816 	if (flags & MPOL_MODE_FLAGS) {
2817 		if (buffer + maxlen < p + 2)
2818 			return -ENOSPC;
2819 		*p++ = '=';
2820 
2821 		/*
2822 		 * Currently, the only defined flags are mutually exclusive
2823 		 */
2824 		if (flags & MPOL_F_STATIC_NODES)
2825 			p += snprintf(p, buffer + maxlen - p, "static");
2826 		else if (flags & MPOL_F_RELATIVE_NODES)
2827 			p += snprintf(p, buffer + maxlen - p, "relative");
2828 	}
2829 
2830 	if (!nodes_empty(nodes)) {
2831 		if (buffer + maxlen < p + 2)
2832 			return -ENOSPC;
2833 		*p++ = ':';
2834 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2835 	}
2836 	return p - buffer;
2837 }
2838