xref: /openbmc/linux/mm/mempolicy.c (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 #include <linux/random.h>
101 
102 #include "internal.h"
103 
/*
 * Internal flags, built on top of MPOL_MF_INTERNAL.  They are consumed by
 * the page queueing code in this file (queue_pages_test_walk() and the
 * queue_pages_*() callbacks).
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/* Slab cache that mpol_new() allocates policies from and __mpol_put() frees to. */
static struct kmem_cache *policy_cache;
/* NOTE(review): users of sn_cache are outside this part of the file - presumably shared-policy nodes; confirm. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * Run-time system-wide default policy => local allocation.
 *
 * Used as the final fallback by get_task_policy() and as the stand-in for
 * "no policy" in do_get_mempolicy().
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

/*
 * One policy slot per node, indexed by node id.  get_task_policy() only
 * hands out an entry whose ->mode is non-zero, i.e. one that has been
 * initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 
126 struct mempolicy *get_task_policy(struct task_struct *p)
127 {
128 	struct mempolicy *pol = p->mempolicy;
129 	int node;
130 
131 	if (pol)
132 		return pol;
133 
134 	node = numa_node_id();
135 	if (node != NUMA_NO_NODE) {
136 		pol = &preferred_node_policy[node];
137 		/* preferred_node_policy is not initialised early in boot */
138 		if (pol->mode)
139 			return pol;
140 	}
141 
142 	return &default_policy;
143 }
144 
/*
 * Per-mode hooks.  mpol_ops[] (defined further down in this file) holds one
 * entry per policy mode and is indexed by mempolicy->mode.
 *
 * @create: fill in the mode-specific part of a freshly allocated policy
 *          from @nodes.  Called by mpol_set_nodemask(), which passes either
 *          the effective nodemask or NULL.  Returns 0 or a negative errno.
 * @rebind: adapt an existing policy to a new set of allowed nodes.  Called
 *          by mpol_rebind_policy().
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task rebinds task->mempolicy in two steps. The first
	 * step sets all the newly allowed nodes, and the second step clears
	 * all the disallowed nodes. This way we avoid finding no node to
	 * allocate a page from.
	 * If we have a lock to protect task->mempolicy on the read side, we
	 * rebind directly.
	 *
	 * step:
	 * 	MPOL_REBIND_ONCE - do rebind work at once
	 * 	MPOL_REBIND_STEP1 - set all the newly nodes
	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
164 
165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
166 {
167 	return pol->flags & MPOL_MODE_FLAGS;
168 }
169 
170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
171 				   const nodemask_t *rel)
172 {
173 	nodemask_t tmp;
174 	nodes_fold(tmp, *orig, nodes_weight(*rel));
175 	nodes_onto(*ret, tmp, *rel);
176 }
177 
178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
179 {
180 	if (nodes_empty(*nodes))
181 		return -EINVAL;
182 	pol->v.nodes = *nodes;
183 	return 0;
184 }
185 
186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (!nodes)
189 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
190 	else if (nodes_empty(*nodes))
191 		return -EINVAL;			/*  no allowed nodes */
192 	else
193 		pol->v.preferred_node = first_node(*nodes);
194 	return 0;
195 }
196 
197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
198 {
199 	if (nodes_empty(*nodes))
200 		return -EINVAL;
201 	pol->v.nodes = *nodes;
202 	return 0;
203 }
204 
205 /*
206  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
207  * any, for the new policy.  mpol_new() has already validated the nodes
208  * parameter with respect to the policy mode and flags.  But, we need to
209  * handle an empty nodemask with MPOL_PREFERRED here.
210  *
211  * Must be called holding task's alloc_lock to protect task's mems_allowed
212  * and mempolicy.  May also be called holding the mmap_semaphore for write.
213  */
214 static int mpol_set_nodemask(struct mempolicy *pol,
215 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
216 {
217 	int ret;
218 
219 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
220 	if (pol == NULL)
221 		return 0;
222 	/* Check N_MEMORY */
223 	nodes_and(nsc->mask1,
224 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
225 
226 	VM_BUG_ON(!nodes);
227 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
228 		nodes = NULL;	/* explicit local allocation */
229 	else {
230 		if (pol->flags & MPOL_F_RELATIVE_NODES)
231 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
232 		else
233 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
234 
235 		if (mpol_store_user_nodemask(pol))
236 			pol->w.user_nodemask = *nodes;
237 		else
238 			pol->w.cpuset_mems_allowed =
239 						cpuset_current_mems_allowed;
240 	}
241 
242 	if (nodes)
243 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
244 	else
245 		ret = mpol_ops[pol->mode].create(pol, NULL);
246 	return ret;
247 }
248 
249 /*
250  * This function just creates a new policy, does some check and simple
251  * initialization. You must invoke mpol_set_nodemask() to set nodes.
252  */
253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 				  nodemask_t *nodes)
255 {
256 	struct mempolicy *policy;
257 
258 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
259 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
260 
261 	if (mode == MPOL_DEFAULT) {
262 		if (nodes && !nodes_empty(*nodes))
263 			return ERR_PTR(-EINVAL);
264 		return NULL;
265 	}
266 	VM_BUG_ON(!nodes);
267 
268 	/*
269 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
270 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
271 	 * All other modes require a valid pointer to a non-empty nodemask.
272 	 */
273 	if (mode == MPOL_PREFERRED) {
274 		if (nodes_empty(*nodes)) {
275 			if (((flags & MPOL_F_STATIC_NODES) ||
276 			     (flags & MPOL_F_RELATIVE_NODES)))
277 				return ERR_PTR(-EINVAL);
278 		}
279 	} else if (mode == MPOL_LOCAL) {
280 		if (!nodes_empty(*nodes))
281 			return ERR_PTR(-EINVAL);
282 		mode = MPOL_PREFERRED;
283 	} else if (nodes_empty(*nodes))
284 		return ERR_PTR(-EINVAL);
285 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
286 	if (!policy)
287 		return ERR_PTR(-ENOMEM);
288 	atomic_set(&policy->refcnt, 1);
289 	policy->mode = mode;
290 	policy->flags = flags;
291 
292 	return policy;
293 }
294 
295 /* Slow path of a mpol destructor. */
296 void __mpol_put(struct mempolicy *p)
297 {
298 	if (!atomic_dec_and_test(&p->refcnt))
299 		return;
300 	kmem_cache_free(policy_cache, p);
301 }
302 
/*
 * ->rebind hook installed for MPOL_DEFAULT in mpol_ops[].  There is nothing
 * to rebind, so all arguments are ignored.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}
307 
308 /*
309  * step:
310  * 	MPOL_REBIND_ONCE  - do rebind work at once
311  * 	MPOL_REBIND_STEP1 - set all the newly nodes
312  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
313  */
314 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
315 				 enum mpol_rebind_step step)
316 {
317 	nodemask_t tmp;
318 
319 	if (pol->flags & MPOL_F_STATIC_NODES)
320 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
321 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 	else {
324 		/*
325 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
326 		 * result
327 		 */
328 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
329 			nodes_remap(tmp, pol->v.nodes,
330 					pol->w.cpuset_mems_allowed, *nodes);
331 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
332 		} else if (step == MPOL_REBIND_STEP2) {
333 			tmp = pol->w.cpuset_mems_allowed;
334 			pol->w.cpuset_mems_allowed = *nodes;
335 		} else
336 			BUG();
337 	}
338 
339 	if (nodes_empty(tmp))
340 		tmp = *nodes;
341 
342 	if (step == MPOL_REBIND_STEP1)
343 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
344 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
345 		pol->v.nodes = tmp;
346 	else
347 		BUG();
348 
349 	if (!node_isset(current->il_next, tmp)) {
350 		current->il_next = next_node(current->il_next, tmp);
351 		if (current->il_next >= MAX_NUMNODES)
352 			current->il_next = first_node(tmp);
353 		if (current->il_next >= MAX_NUMNODES)
354 			current->il_next = numa_node_id();
355 	}
356 }
357 
358 static void mpol_rebind_preferred(struct mempolicy *pol,
359 				  const nodemask_t *nodes,
360 				  enum mpol_rebind_step step)
361 {
362 	nodemask_t tmp;
363 
364 	if (pol->flags & MPOL_F_STATIC_NODES) {
365 		int node = first_node(pol->w.user_nodemask);
366 
367 		if (node_isset(node, *nodes)) {
368 			pol->v.preferred_node = node;
369 			pol->flags &= ~MPOL_F_LOCAL;
370 		} else
371 			pol->flags |= MPOL_F_LOCAL;
372 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
373 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
374 		pol->v.preferred_node = first_node(tmp);
375 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
376 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
377 						   pol->w.cpuset_mems_allowed,
378 						   *nodes);
379 		pol->w.cpuset_mems_allowed = *nodes;
380 	}
381 }
382 
383 /*
384  * mpol_rebind_policy - Migrate a policy to a different set of nodes
385  *
386  * If read-side task has no lock to protect task->mempolicy, write-side
387  * task will rebind the task->mempolicy by two step. The first step is
388  * setting all the newly nodes, and the second step is cleaning all the
389  * disallowed nodes. In this way, we can avoid finding no node to alloc
390  * page.
391  * If we have a lock to protect task->mempolicy in read-side, we do
392  * rebind directly.
393  *
394  * step:
395  * 	MPOL_REBIND_ONCE  - do rebind work at once
396  * 	MPOL_REBIND_STEP1 - set all the newly nodes
397  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
398  */
399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
400 				enum mpol_rebind_step step)
401 {
402 	if (!pol)
403 		return;
404 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
405 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
409 		return;
410 
411 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
412 		BUG();
413 
414 	if (step == MPOL_REBIND_STEP1)
415 		pol->flags |= MPOL_F_REBINDING;
416 	else if (step == MPOL_REBIND_STEP2)
417 		pol->flags &= ~MPOL_F_REBINDING;
418 	else if (step >= MPOL_REBIND_NSTEP)
419 		BUG();
420 
421 	mpol_ops[pol->mode].rebind(pol, newmask, step);
422 }
423 
424 /*
425  * Wrapper for mpol_rebind_policy() that just requires task
426  * pointer, and updates task mempolicy.
427  *
428  * Called with task's alloc_lock held.
429  */
430 
431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
432 			enum mpol_rebind_step step)
433 {
434 	mpol_rebind_policy(tsk->mempolicy, new, step);
435 }
436 
437 /*
438  * Rebind each vma in mm to new nodemask.
439  *
440  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
441  */
442 
443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
444 {
445 	struct vm_area_struct *vma;
446 
447 	down_write(&mm->mmap_sem);
448 	for (vma = mm->mmap; vma; vma = vma->vm_next)
449 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
450 	up_write(&mm->mmap_sem);
451 }
452 
/*
 * Mode dispatch table, indexed by mempolicy->mode.
 *
 * MPOL_DEFAULT has no ->create hook: mpol_new() returns NULL for that mode
 * and mpol_set_nodemask() returns early for a NULL policy, so ->create is
 * never looked up for it there.  MPOL_INTERLEAVE and MPOL_BIND both keep a
 * nodemask and therefore share the same ->rebind implementation.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
470 
471 static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 				unsigned long flags);
473 
/*
 * Per-walk state for queue_pages_range(), handed to the page table walker
 * callbacks through mm_walk->private.
 */
struct queue_pages {
	struct list_head *pagelist;	/* isolated pages are collected here */
	unsigned long flags;		/* MPOL_MF_* flags, incl. internal ones */
	nodemask_t *nmask;		/* nodes tested against each page's node */
	struct vm_area_struct *prev;	/* last vma seen by queue_pages_test_walk() */
};
480 
481 /*
482  * Scan through pages checking if pages follow certain conditions,
483  * and move them to the pagelist if they do.
484  */
485 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
486 			unsigned long end, struct mm_walk *walk)
487 {
488 	struct vm_area_struct *vma = walk->vma;
489 	struct page *page;
490 	struct queue_pages *qp = walk->private;
491 	unsigned long flags = qp->flags;
492 	int nid;
493 	pte_t *pte;
494 	spinlock_t *ptl;
495 
496 	split_huge_page_pmd(vma, addr, pmd);
497 	if (pmd_trans_unstable(pmd))
498 		return 0;
499 
500 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501 	for (; addr != end; pte++, addr += PAGE_SIZE) {
502 		if (!pte_present(*pte))
503 			continue;
504 		page = vm_normal_page(vma, addr, *pte);
505 		if (!page)
506 			continue;
507 		/*
508 		 * vm_normal_page() filters out zero pages, but there might
509 		 * still be PageReserved pages to skip, perhaps in a VDSO.
510 		 */
511 		if (PageReserved(page))
512 			continue;
513 		nid = page_to_nid(page);
514 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
515 			continue;
516 
517 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
518 			migrate_page_add(page, qp->pagelist, flags);
519 	}
520 	pte_unmap_unlock(pte - 1, ptl);
521 	cond_resched();
522 	return 0;
523 }
524 
525 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
526 			       unsigned long addr, unsigned long end,
527 			       struct mm_walk *walk)
528 {
529 #ifdef CONFIG_HUGETLB_PAGE
530 	struct queue_pages *qp = walk->private;
531 	unsigned long flags = qp->flags;
532 	int nid;
533 	struct page *page;
534 	spinlock_t *ptl;
535 	pte_t entry;
536 
537 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
538 	entry = huge_ptep_get(pte);
539 	if (!pte_present(entry))
540 		goto unlock;
541 	page = pte_page(entry);
542 	nid = page_to_nid(page);
543 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
544 		goto unlock;
545 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
546 	if (flags & (MPOL_MF_MOVE_ALL) ||
547 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
548 		isolate_huge_page(page, qp->pagelist);
549 unlock:
550 	spin_unlock(ptl);
551 #else
552 	BUG();
553 #endif
554 	return 0;
555 }
556 
#ifdef CONFIG_NUMA_BALANCING
/*
 * Make the range [@addr, @end) of @vma inaccessible (PAGE_NONE) so that
 * the next access takes a NUMA hinting fault; depending on those faults
 * pages may later be migrated for better NUMA placement.
 *
 * This assumes NUMA hinting faults are implemented with PROT_NONE.  An
 * architecture that chooses differently needs further changes to the core.
 *
 * Returns the number of updated entries, which is also accounted as
 * NUMA_PTE_UPDATES.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

	if (!nr_updated)
		return 0;

	count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
	return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING there is nothing to mark: report 0 updates. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
585 
586 static int queue_pages_test_walk(unsigned long start, unsigned long end,
587 				struct mm_walk *walk)
588 {
589 	struct vm_area_struct *vma = walk->vma;
590 	struct queue_pages *qp = walk->private;
591 	unsigned long endvma = vma->vm_end;
592 	unsigned long flags = qp->flags;
593 
594 	if (vma->vm_flags & VM_PFNMAP)
595 		return 1;
596 
597 	if (endvma > end)
598 		endvma = end;
599 	if (vma->vm_start > start)
600 		start = vma->vm_start;
601 
602 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
603 		if (!vma->vm_next && vma->vm_end < end)
604 			return -EFAULT;
605 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
606 			return -EFAULT;
607 	}
608 
609 	qp->prev = vma;
610 
611 	if (flags & MPOL_MF_LAZY) {
612 		/* Similar to task_numa_work, skip inaccessible VMAs */
613 		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
614 			change_prot_numa(vma, start, endvma);
615 		return 1;
616 	}
617 
618 	if ((flags & MPOL_MF_STRICT) ||
619 	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
620 	     vma_migratable(vma)))
621 		/* queue pages from current vma */
622 		return 0;
623 	return 1;
624 }
625 
626 /*
627  * Walk through page tables and collect pages to be migrated.
628  *
629  * If pages found in a given range are on a set of nodes (determined by
630  * @nodes and @flags,) it's isolated and queued to the pagelist which is
631  * passed via @private.)
632  */
633 static int
634 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
635 		nodemask_t *nodes, unsigned long flags,
636 		struct list_head *pagelist)
637 {
638 	struct queue_pages qp = {
639 		.pagelist = pagelist,
640 		.flags = flags,
641 		.nmask = nodes,
642 		.prev = NULL,
643 	};
644 	struct mm_walk queue_pages_walk = {
645 		.hugetlb_entry = queue_pages_hugetlb,
646 		.pmd_entry = queue_pages_pte_range,
647 		.test_walk = queue_pages_test_walk,
648 		.mm = mm,
649 		.private = &qp,
650 	};
651 
652 	return walk_page_range(start, end, &queue_pages_walk);
653 }
654 
655 /*
656  * Apply policy to a single VMA
657  * This must be called with the mmap_sem held for writing.
658  */
659 static int vma_replace_policy(struct vm_area_struct *vma,
660 						struct mempolicy *pol)
661 {
662 	int err;
663 	struct mempolicy *old;
664 	struct mempolicy *new;
665 
666 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 		 vma->vm_ops, vma->vm_file,
669 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 
671 	new = mpol_dup(pol);
672 	if (IS_ERR(new))
673 		return PTR_ERR(new);
674 
675 	if (vma->vm_ops && vma->vm_ops->set_policy) {
676 		err = vma->vm_ops->set_policy(vma, new);
677 		if (err)
678 			goto err_out;
679 	}
680 
681 	old = vma->vm_policy;
682 	vma->vm_policy = new; /* protected by mmap_sem */
683 	mpol_put(old);
684 
685 	return 0;
686  err_out:
687 	mpol_put(new);
688 	return err;
689 }
690 
/*
 * Step 2: apply policy to a range and do splits.
 *
 * Walks the vmas of @mm that overlap [@start, @end) and makes @new_pol the
 * policy of exactly that range: each affected vma is either merged with a
 * neighbour via vma_merge() or split at the range boundaries, and then has
 * its policy replaced.  Vmas whose policy already equals @new_pol are left
 * alone.
 *
 * Returns 0 on success, -EFAULT when @start does not fall inside a vma, or
 * the error returned by split_vma() / vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	/*
	 * When the range starts in the middle of the first vma, that vma
	 * itself is passed as "prev" to vma_merge().
	 */
	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		/* remember the successor now: merge/split may change the list */
		next = vma->vm_next;
		/* part of the requested range that lies inside this vma */
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		/* file offset (in pages) that corresponds to vmstart */
		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			/* merged: continue from the resulting vma */
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		/* no merge possible: cut the vma down to [vmstart, vmend) */
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
751 
752 /* Set the process memory policy */
753 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
754 			     nodemask_t *nodes)
755 {
756 	struct mempolicy *new, *old;
757 	NODEMASK_SCRATCH(scratch);
758 	int ret;
759 
760 	if (!scratch)
761 		return -ENOMEM;
762 
763 	new = mpol_new(mode, flags, nodes);
764 	if (IS_ERR(new)) {
765 		ret = PTR_ERR(new);
766 		goto out;
767 	}
768 
769 	task_lock(current);
770 	ret = mpol_set_nodemask(new, nodes, scratch);
771 	if (ret) {
772 		task_unlock(current);
773 		mpol_put(new);
774 		goto out;
775 	}
776 	old = current->mempolicy;
777 	current->mempolicy = new;
778 	if (new && new->mode == MPOL_INTERLEAVE &&
779 	    nodes_weight(new->v.nodes))
780 		current->il_next = first_node(new->v.nodes);
781 	task_unlock(current);
782 	mpol_put(old);
783 	ret = 0;
784 out:
785 	NODEMASK_SCRATCH_FREE(scratch);
786 	return ret;
787 }
788 
789 /*
790  * Return nodemask for policy for get_mempolicy() query
791  *
792  * Called with task's alloc_lock held
793  */
794 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
795 {
796 	nodes_clear(*nodes);
797 	if (p == &default_policy)
798 		return;
799 
800 	switch (p->mode) {
801 	case MPOL_BIND:
802 		/* Fall through */
803 	case MPOL_INTERLEAVE:
804 		*nodes = p->v.nodes;
805 		break;
806 	case MPOL_PREFERRED:
807 		if (!(p->flags & MPOL_F_LOCAL))
808 			node_set(p->v.preferred_node, *nodes);
809 		/* else return empty node mask for local allocation */
810 		break;
811 	default:
812 		BUG();
813 	}
814 }
815 
816 static int lookup_node(struct mm_struct *mm, unsigned long addr)
817 {
818 	struct page *p;
819 	int err;
820 
821 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
822 	if (err >= 0) {
823 		err = page_to_nid(p);
824 		put_page(p);
825 	}
826 	return err;
827 }
828 
/*
 * Retrieve NUMA policy
 *
 * @policy: out; receives the policy mode ORed with its MPOL_MODE_FLAGS
 *          bits, or a node id when MPOL_F_NODE is set
 * @nmask:  out; receives the nodemask.  May be NULL on the regular path;
 *          it is written unconditionally for MPOL_F_MEMS_ALLOWED
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an invalid flag/argument combination,
 * -EFAULT when no vma covers @addr, or the error from lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	/* reject unknown flag bits */
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	/* MPOL_F_MEMS_ALLOWED stands alone: only report the allowed nodes */
	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/* node of the page currently backing addr */
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* next node the task's interleave policy will use */
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	/*
	 * Done with the vma: drop mmap_sem now and clear vma so that the
	 * exit path below does not release it a second time.
	 */
	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	/*
	 * NOTE(review): mpol_cond_put() is defined outside this file; it
	 * presumably drops a reference only for policies that carry one
	 * (e.g. those returned by ->get_policy()) - confirm in mempolicy.h.
	 */
	mpol_cond_put(pol);
	/* error exits taken while mmap_sem was still held end up here */
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
919 
920 #ifdef CONFIG_MIGRATION
921 /*
922  * page migration
923  */
924 static void migrate_page_add(struct page *page, struct list_head *pagelist,
925 				unsigned long flags)
926 {
927 	/*
928 	 * Avoid migrating a page that is shared with others.
929 	 */
930 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
931 		if (!isolate_lru_page(page)) {
932 			list_add_tail(&page->lru, pagelist);
933 			inc_zone_page_state(page, NR_ISOLATED_ANON +
934 					    page_is_file_cache(page));
935 		}
936 	}
937 }
938 
939 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
940 {
941 	if (PageHuge(page))
942 		return alloc_huge_page_node(page_hstate(compound_head(page)),
943 					node);
944 	else
945 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
946 						    __GFP_THISNODE, 0);
947 }
948 
949 /*
950  * Migrate pages from one node to a target node.
951  * Returns error or the number of pages not migrated.
952  */
953 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
954 			   int flags)
955 {
956 	nodemask_t nmask;
957 	LIST_HEAD(pagelist);
958 	int err = 0;
959 
960 	nodes_clear(nmask);
961 	node_set(source, nmask);
962 
963 	/*
964 	 * This does not "check" the range but isolates all pages that
965 	 * need migration.  Between passing in the full user address
966 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
967 	 */
968 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
969 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
970 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
971 
972 	if (!list_empty(&pagelist)) {
973 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
974 					MIGRATE_SYNC, MR_SYSCALL);
975 		if (err)
976 			putback_movable_pages(&pagelist);
977 	}
978 
979 	return err;
980 }
981 
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * @mm:    address space whose pages are migrated; mmap_sem is taken for
 *         reading for the duration of the migration
 * @from:  set of source nodes
 * @to:    set of destination nodes
 * @flags: MPOL_MF_* flags passed on to migrate_to_node()
 *
 * Returns a negative error from migrate_prep() or migrate_to_node(), or
 * otherwise the accumulated number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		/* the scan found no node that would actually move: done */
		if (source == NUMA_NO_NODE)
			break;

		/* this source is dealt with, whatever the outcome below */
		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		/* positive: pages left behind; negative: hard error, stop */
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}
1086 
1087 /*
1088  * Allocate a new page for page migration based on vma policy.
1089  * Start by assuming the page is mapped by the same vma as contains @start.
1090  * Search forward from there, if not.  N.B., this assumes that the
1091  * list of pages handed to migrate_pages()--which is how we get here--
1092  * is in virtual address order.
1093  */
1094 static struct page *new_page(struct page *page, unsigned long start, int **x)
1095 {
1096 	struct vm_area_struct *vma;
1097 	unsigned long uninitialized_var(address);
1098 
1099 	vma = find_vma(current->mm, start);
1100 	while (vma) {
1101 		address = page_address_in_vma(page, vma);
1102 		if (address != -EFAULT)
1103 			break;
1104 		vma = vma->vm_next;
1105 	}
1106 
1107 	if (PageHuge(page)) {
1108 		BUG_ON(!vma);
1109 		return alloc_huge_page_noerr(vma, address, 1);
1110 	}
1111 	/*
1112 	 * if !vma, alloc_page_vma() will use task or system default policy
1113 	 */
1114 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1115 }
1116 #else
1117 
/*
 * Stub for the #else configuration: nothing is ever queued for
 * migration, so all arguments are ignored.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags)
{
	/* Intentionally empty. */
}
1122 
1123 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1124 		     const nodemask_t *to, int flags)
1125 {
1126 	return -ENOSYS;
1127 }
1128 
1129 static struct page *new_page(struct page *page, unsigned long start, int **x)
1130 {
1131 	return NULL;
1132 }
1133 #endif
1134 
1135 static long do_mbind(unsigned long start, unsigned long len,
1136 		     unsigned short mode, unsigned short mode_flags,
1137 		     nodemask_t *nmask, unsigned long flags)
1138 {
1139 	struct mm_struct *mm = current->mm;
1140 	struct mempolicy *new;
1141 	unsigned long end;
1142 	int err;
1143 	LIST_HEAD(pagelist);
1144 
1145 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1146 		return -EINVAL;
1147 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1148 		return -EPERM;
1149 
1150 	if (start & ~PAGE_MASK)
1151 		return -EINVAL;
1152 
1153 	if (mode == MPOL_DEFAULT)
1154 		flags &= ~MPOL_MF_STRICT;
1155 
1156 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1157 	end = start + len;
1158 
1159 	if (end < start)
1160 		return -EINVAL;
1161 	if (end == start)
1162 		return 0;
1163 
1164 	new = mpol_new(mode, mode_flags, nmask);
1165 	if (IS_ERR(new))
1166 		return PTR_ERR(new);
1167 
1168 	if (flags & MPOL_MF_LAZY)
1169 		new->flags |= MPOL_F_MOF;
1170 
1171 	/*
1172 	 * If we are using the default policy then operation
1173 	 * on discontinuous address spaces is okay after all
1174 	 */
1175 	if (!new)
1176 		flags |= MPOL_MF_DISCONTIG_OK;
1177 
1178 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1179 		 start, start + len, mode, mode_flags,
1180 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1181 
1182 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1183 
1184 		err = migrate_prep();
1185 		if (err)
1186 			goto mpol_out;
1187 	}
1188 	{
1189 		NODEMASK_SCRATCH(scratch);
1190 		if (scratch) {
1191 			down_write(&mm->mmap_sem);
1192 			task_lock(current);
1193 			err = mpol_set_nodemask(new, nmask, scratch);
1194 			task_unlock(current);
1195 			if (err)
1196 				up_write(&mm->mmap_sem);
1197 		} else
1198 			err = -ENOMEM;
1199 		NODEMASK_SCRATCH_FREE(scratch);
1200 	}
1201 	if (err)
1202 		goto mpol_out;
1203 
1204 	err = queue_pages_range(mm, start, end, nmask,
1205 			  flags | MPOL_MF_INVERT, &pagelist);
1206 	if (!err)
1207 		err = mbind_range(mm, start, end, new);
1208 
1209 	if (!err) {
1210 		int nr_failed = 0;
1211 
1212 		if (!list_empty(&pagelist)) {
1213 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1214 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1215 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1216 			if (nr_failed)
1217 				putback_movable_pages(&pagelist);
1218 		}
1219 
1220 		if (nr_failed && (flags & MPOL_MF_STRICT))
1221 			err = -EIO;
1222 	} else
1223 		putback_movable_pages(&pagelist);
1224 
1225 	up_write(&mm->mmap_sem);
1226  mpol_out:
1227 	mpol_put(new);
1228 	return err;
1229 }
1230 
1231 /*
1232  * User space interface with variable sized bitmaps for nodelists.
1233  */
1234 
1235 /* Copy a node mask from user space. */
1236 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1237 		     unsigned long maxnode)
1238 {
1239 	unsigned long k;
1240 	unsigned long nlongs;
1241 	unsigned long endmask;
1242 
1243 	--maxnode;
1244 	nodes_clear(*nodes);
1245 	if (maxnode == 0 || !nmask)
1246 		return 0;
1247 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1248 		return -EINVAL;
1249 
1250 	nlongs = BITS_TO_LONGS(maxnode);
1251 	if ((maxnode % BITS_PER_LONG) == 0)
1252 		endmask = ~0UL;
1253 	else
1254 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1255 
1256 	/* When the user specified more nodes than supported just check
1257 	   if the non supported part is all zero. */
1258 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1259 		if (nlongs > PAGE_SIZE/sizeof(long))
1260 			return -EINVAL;
1261 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1262 			unsigned long t;
1263 			if (get_user(t, nmask + k))
1264 				return -EFAULT;
1265 			if (k == nlongs - 1) {
1266 				if (t & endmask)
1267 					return -EINVAL;
1268 			} else if (t)
1269 				return -EINVAL;
1270 		}
1271 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1272 		endmask = ~0UL;
1273 	}
1274 
1275 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1276 		return -EFAULT;
1277 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1278 	return 0;
1279 }
1280 
1281 /* Copy a kernel node mask to user space */
1282 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1283 			      nodemask_t *nodes)
1284 {
1285 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1286 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1287 
1288 	if (copy > nbytes) {
1289 		if (copy > PAGE_SIZE)
1290 			return -EINVAL;
1291 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1292 			return -EFAULT;
1293 		copy = nbytes;
1294 	}
1295 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1296 }
1297 
1298 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1299 		unsigned long, mode, const unsigned long __user *, nmask,
1300 		unsigned long, maxnode, unsigned, flags)
1301 {
1302 	nodemask_t nodes;
1303 	int err;
1304 	unsigned short mode_flags;
1305 
1306 	mode_flags = mode & MPOL_MODE_FLAGS;
1307 	mode &= ~MPOL_MODE_FLAGS;
1308 	if (mode >= MPOL_MAX)
1309 		return -EINVAL;
1310 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1311 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1312 		return -EINVAL;
1313 	err = get_nodes(&nodes, nmask, maxnode);
1314 	if (err)
1315 		return err;
1316 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1317 }
1318 
1319 /* Set the process memory policy */
1320 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1321 		unsigned long, maxnode)
1322 {
1323 	int err;
1324 	nodemask_t nodes;
1325 	unsigned short flags;
1326 
1327 	flags = mode & MPOL_MODE_FLAGS;
1328 	mode &= ~MPOL_MODE_FLAGS;
1329 	if ((unsigned int)mode >= MPOL_MAX)
1330 		return -EINVAL;
1331 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1332 		return -EINVAL;
1333 	err = get_nodes(&nodes, nmask, maxnode);
1334 	if (err)
1335 		return err;
1336 	return do_set_mempolicy(mode, flags, &nodes);
1337 }
1338 
1339 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1340 		const unsigned long __user *, old_nodes,
1341 		const unsigned long __user *, new_nodes)
1342 {
1343 	const struct cred *cred = current_cred(), *tcred;
1344 	struct mm_struct *mm = NULL;
1345 	struct task_struct *task;
1346 	nodemask_t task_nodes;
1347 	int err;
1348 	nodemask_t *old;
1349 	nodemask_t *new;
1350 	NODEMASK_SCRATCH(scratch);
1351 
1352 	if (!scratch)
1353 		return -ENOMEM;
1354 
1355 	old = &scratch->mask1;
1356 	new = &scratch->mask2;
1357 
1358 	err = get_nodes(old, old_nodes, maxnode);
1359 	if (err)
1360 		goto out;
1361 
1362 	err = get_nodes(new, new_nodes, maxnode);
1363 	if (err)
1364 		goto out;
1365 
1366 	/* Find the mm_struct */
1367 	rcu_read_lock();
1368 	task = pid ? find_task_by_vpid(pid) : current;
1369 	if (!task) {
1370 		rcu_read_unlock();
1371 		err = -ESRCH;
1372 		goto out;
1373 	}
1374 	get_task_struct(task);
1375 
1376 	err = -EINVAL;
1377 
1378 	/*
1379 	 * Check if this process has the right to modify the specified
1380 	 * process. The right exists if the process has administrative
1381 	 * capabilities, superuser privileges or the same
1382 	 * userid as the target process.
1383 	 */
1384 	tcred = __task_cred(task);
1385 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1386 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1387 	    !capable(CAP_SYS_NICE)) {
1388 		rcu_read_unlock();
1389 		err = -EPERM;
1390 		goto out_put;
1391 	}
1392 	rcu_read_unlock();
1393 
1394 	task_nodes = cpuset_mems_allowed(task);
1395 	/* Is the user allowed to access the target nodes? */
1396 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1397 		err = -EPERM;
1398 		goto out_put;
1399 	}
1400 
1401 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1402 		err = -EINVAL;
1403 		goto out_put;
1404 	}
1405 
1406 	err = security_task_movememory(task);
1407 	if (err)
1408 		goto out_put;
1409 
1410 	mm = get_task_mm(task);
1411 	put_task_struct(task);
1412 
1413 	if (!mm) {
1414 		err = -EINVAL;
1415 		goto out;
1416 	}
1417 
1418 	err = do_migrate_pages(mm, old, new,
1419 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1420 
1421 	mmput(mm);
1422 out:
1423 	NODEMASK_SCRATCH_FREE(scratch);
1424 
1425 	return err;
1426 
1427 out_put:
1428 	put_task_struct(task);
1429 	goto out;
1430 
1431 }
1432 
1433 
1434 /* Retrieve NUMA policy */
1435 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1436 		unsigned long __user *, nmask, unsigned long, maxnode,
1437 		unsigned long, addr, unsigned long, flags)
1438 {
1439 	int err;
1440 	int uninitialized_var(pval);
1441 	nodemask_t nodes;
1442 
1443 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1444 		return -EINVAL;
1445 
1446 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1447 
1448 	if (err)
1449 		return err;
1450 
1451 	if (policy && put_user(pval, policy))
1452 		return -EFAULT;
1453 
1454 	if (nmask)
1455 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1456 
1457 	return err;
1458 }
1459 
1460 #ifdef CONFIG_COMPAT
1461 
1462 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1463 		       compat_ulong_t __user *, nmask,
1464 		       compat_ulong_t, maxnode,
1465 		       compat_ulong_t, addr, compat_ulong_t, flags)
1466 {
1467 	long err;
1468 	unsigned long __user *nm = NULL;
1469 	unsigned long nr_bits, alloc_size;
1470 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1471 
1472 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1473 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1474 
1475 	if (nmask)
1476 		nm = compat_alloc_user_space(alloc_size);
1477 
1478 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1479 
1480 	if (!err && nmask) {
1481 		unsigned long copy_size;
1482 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1483 		err = copy_from_user(bm, nm, copy_size);
1484 		/* ensure entire bitmap is zeroed */
1485 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1486 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1487 	}
1488 
1489 	return err;
1490 }
1491 
1492 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1493 		       compat_ulong_t, maxnode)
1494 {
1495 	long err = 0;
1496 	unsigned long __user *nm = NULL;
1497 	unsigned long nr_bits, alloc_size;
1498 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1499 
1500 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1501 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1502 
1503 	if (nmask) {
1504 		err = compat_get_bitmap(bm, nmask, nr_bits);
1505 		nm = compat_alloc_user_space(alloc_size);
1506 		err |= copy_to_user(nm, bm, alloc_size);
1507 	}
1508 
1509 	if (err)
1510 		return -EFAULT;
1511 
1512 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1513 }
1514 
1515 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1516 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1517 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1518 {
1519 	long err = 0;
1520 	unsigned long __user *nm = NULL;
1521 	unsigned long nr_bits, alloc_size;
1522 	nodemask_t bm;
1523 
1524 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1525 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1526 
1527 	if (nmask) {
1528 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1529 		nm = compat_alloc_user_space(alloc_size);
1530 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1531 	}
1532 
1533 	if (err)
1534 		return -EFAULT;
1535 
1536 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1537 }
1538 
1539 #endif
1540 
1541 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1542 						unsigned long addr)
1543 {
1544 	struct mempolicy *pol = NULL;
1545 
1546 	if (vma) {
1547 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1548 			pol = vma->vm_ops->get_policy(vma, addr);
1549 		} else if (vma->vm_policy) {
1550 			pol = vma->vm_policy;
1551 
1552 			/*
1553 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1554 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1555 			 * count on these policies which will be dropped by
1556 			 * mpol_cond_put() later
1557 			 */
1558 			if (mpol_needs_cond_ref(pol))
1559 				mpol_get(pol);
1560 		}
1561 	}
1562 
1563 	return pol;
1564 }
1565 
1566 /*
1567  * get_vma_policy(@vma, @addr)
1568  * @vma: virtual memory area whose policy is sought
1569  * @addr: address in @vma for shared policy lookup
1570  *
1571  * Returns effective policy for a VMA at specified address.
1572  * Falls back to current->mempolicy or system default policy, as necessary.
1573  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1574  * count--added by the get_policy() vm_op, as appropriate--to protect against
1575  * freeing by another task.  It is the caller's responsibility to free the
1576  * extra reference for shared policies.
1577  */
1578 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1579 						unsigned long addr)
1580 {
1581 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1582 
1583 	if (!pol)
1584 		pol = get_task_policy(current);
1585 
1586 	return pol;
1587 }
1588 
1589 bool vma_policy_mof(struct vm_area_struct *vma)
1590 {
1591 	struct mempolicy *pol;
1592 
1593 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1594 		bool ret = false;
1595 
1596 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1597 		if (pol && (pol->flags & MPOL_F_MOF))
1598 			ret = true;
1599 		mpol_cond_put(pol);
1600 
1601 		return ret;
1602 	}
1603 
1604 	pol = vma->vm_policy;
1605 	if (!pol)
1606 		pol = get_task_policy(current);
1607 
1608 	return pol->flags & MPOL_F_MOF;
1609 }
1610 
1611 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1612 {
1613 	enum zone_type dynamic_policy_zone = policy_zone;
1614 
1615 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1616 
1617 	/*
1618 	 * if policy->v.nodes has movable memory only,
1619 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1620 	 *
1621 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1622 	 * so if the following test faile, it implies
1623 	 * policy->v.nodes has movable memory only.
1624 	 */
1625 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1626 		dynamic_policy_zone = ZONE_MOVABLE;
1627 
1628 	return zone >= dynamic_policy_zone;
1629 }
1630 
1631 /*
1632  * Return a nodemask representing a mempolicy for filtering nodes for
1633  * page allocation
1634  */
1635 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1636 {
1637 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1638 	if (unlikely(policy->mode == MPOL_BIND) &&
1639 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1640 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1641 		return &policy->v.nodes;
1642 
1643 	return NULL;
1644 }
1645 
1646 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1647 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1648 	int nd)
1649 {
1650 	switch (policy->mode) {
1651 	case MPOL_PREFERRED:
1652 		if (!(policy->flags & MPOL_F_LOCAL))
1653 			nd = policy->v.preferred_node;
1654 		break;
1655 	case MPOL_BIND:
1656 		/*
1657 		 * Normally, MPOL_BIND allocations are node-local within the
1658 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1659 		 * current node isn't part of the mask, we use the zonelist for
1660 		 * the first node in the mask instead.
1661 		 */
1662 		if (unlikely(gfp & __GFP_THISNODE) &&
1663 				unlikely(!node_isset(nd, policy->v.nodes)))
1664 			nd = first_node(policy->v.nodes);
1665 		break;
1666 	default:
1667 		BUG();
1668 	}
1669 	return node_zonelist(nd, gfp);
1670 }
1671 
1672 /* Do dynamic interleaving for a process */
1673 static unsigned interleave_nodes(struct mempolicy *policy)
1674 {
1675 	unsigned nid, next;
1676 	struct task_struct *me = current;
1677 
1678 	nid = me->il_next;
1679 	next = next_node(nid, policy->v.nodes);
1680 	if (next >= MAX_NUMNODES)
1681 		next = first_node(policy->v.nodes);
1682 	if (next < MAX_NUMNODES)
1683 		me->il_next = next;
1684 	return nid;
1685 }
1686 
1687 /*
1688  * Depending on the memory policy provide a node from which to allocate the
1689  * next slab entry.
1690  */
1691 unsigned int mempolicy_slab_node(void)
1692 {
1693 	struct mempolicy *policy;
1694 	int node = numa_mem_id();
1695 
1696 	if (in_interrupt())
1697 		return node;
1698 
1699 	policy = current->mempolicy;
1700 	if (!policy || policy->flags & MPOL_F_LOCAL)
1701 		return node;
1702 
1703 	switch (policy->mode) {
1704 	case MPOL_PREFERRED:
1705 		/*
1706 		 * handled MPOL_F_LOCAL above
1707 		 */
1708 		return policy->v.preferred_node;
1709 
1710 	case MPOL_INTERLEAVE:
1711 		return interleave_nodes(policy);
1712 
1713 	case MPOL_BIND: {
1714 		/*
1715 		 * Follow bind policy behavior and start allocation at the
1716 		 * first node.
1717 		 */
1718 		struct zonelist *zonelist;
1719 		struct zone *zone;
1720 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1721 		zonelist = &NODE_DATA(node)->node_zonelists[0];
1722 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1723 							&policy->v.nodes,
1724 							&zone);
1725 		return zone ? zone->node : node;
1726 	}
1727 
1728 	default:
1729 		BUG();
1730 	}
1731 }
1732 
1733 /* Do static interleaving for a VMA with known offset. */
1734 static unsigned offset_il_node(struct mempolicy *pol,
1735 		struct vm_area_struct *vma, unsigned long off)
1736 {
1737 	unsigned nnodes = nodes_weight(pol->v.nodes);
1738 	unsigned target;
1739 	int c;
1740 	int nid = NUMA_NO_NODE;
1741 
1742 	if (!nnodes)
1743 		return numa_node_id();
1744 	target = (unsigned int)off % nnodes;
1745 	c = 0;
1746 	do {
1747 		nid = next_node(nid, pol->v.nodes);
1748 		c++;
1749 	} while (c <= target);
1750 	return nid;
1751 }
1752 
1753 /* Determine a node number for interleave */
1754 static inline unsigned interleave_nid(struct mempolicy *pol,
1755 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1756 {
1757 	if (vma) {
1758 		unsigned long off;
1759 
1760 		/*
1761 		 * for small pages, there is no difference between
1762 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1763 		 * for huge pages, since vm_pgoff is in units of small
1764 		 * pages, we need to shift off the always 0 bits to get
1765 		 * a useful offset.
1766 		 */
1767 		BUG_ON(shift < PAGE_SHIFT);
1768 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1769 		off += (addr - vma->vm_start) >> shift;
1770 		return offset_il_node(pol, vma, off);
1771 	} else
1772 		return interleave_nodes(pol);
1773 }
1774 
1775 /*
1776  * Return the bit number of a random bit set in the nodemask.
1777  * (returns NUMA_NO_NODE if nodemask is empty)
1778  */
1779 int node_random(const nodemask_t *maskp)
1780 {
1781 	int w, bit = NUMA_NO_NODE;
1782 
1783 	w = nodes_weight(*maskp);
1784 	if (w)
1785 		bit = bitmap_ord_to_pos(maskp->bits,
1786 			get_random_int() % w, MAX_NUMNODES);
1787 	return bit;
1788 }
1789 
1790 #ifdef CONFIG_HUGETLBFS
1791 /*
1792  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1793  * @vma: virtual memory area whose policy is sought
1794  * @addr: address in @vma for shared policy lookup and interleave policy
1795  * @gfp_flags: for requested zone
1796  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1797  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1798  *
1799  * Returns a zonelist suitable for a huge page allocation and a pointer
1800  * to the struct mempolicy for conditional unref after allocation.
1801  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1802  * @nodemask for filtering the zonelist.
1803  *
1804  * Must be protected by read_mems_allowed_begin()
1805  */
1806 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1807 				gfp_t gfp_flags, struct mempolicy **mpol,
1808 				nodemask_t **nodemask)
1809 {
1810 	struct zonelist *zl;
1811 
1812 	*mpol = get_vma_policy(vma, addr);
1813 	*nodemask = NULL;	/* assume !MPOL_BIND */
1814 
1815 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1816 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1817 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1818 	} else {
1819 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1820 		if ((*mpol)->mode == MPOL_BIND)
1821 			*nodemask = &(*mpol)->v.nodes;
1822 	}
1823 	return zl;
1824 }
1825 
1826 /*
1827  * init_nodemask_of_mempolicy
1828  *
1829  * If the current task's mempolicy is "default" [NULL], return 'false'
1830  * to indicate default policy.  Otherwise, extract the policy nodemask
1831  * for 'bind' or 'interleave' policy into the argument nodemask, or
1832  * initialize the argument nodemask to contain the single node for
1833  * 'preferred' or 'local' policy and return 'true' to indicate presence
1834  * of non-default mempolicy.
1835  *
1836  * We don't bother with reference counting the mempolicy [mpol_get/put]
1837  * because the current task is examining it's own mempolicy and a task's
1838  * mempolicy is only ever changed by the task itself.
1839  *
1840  * N.B., it is the caller's responsibility to free a returned nodemask.
1841  */
1842 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1843 {
1844 	struct mempolicy *mempolicy;
1845 	int nid;
1846 
1847 	if (!(mask && current->mempolicy))
1848 		return false;
1849 
1850 	task_lock(current);
1851 	mempolicy = current->mempolicy;
1852 	switch (mempolicy->mode) {
1853 	case MPOL_PREFERRED:
1854 		if (mempolicy->flags & MPOL_F_LOCAL)
1855 			nid = numa_node_id();
1856 		else
1857 			nid = mempolicy->v.preferred_node;
1858 		init_nodemask_of_node(mask, nid);
1859 		break;
1860 
1861 	case MPOL_BIND:
1862 		/* Fall through */
1863 	case MPOL_INTERLEAVE:
1864 		*mask =  mempolicy->v.nodes;
1865 		break;
1866 
1867 	default:
1868 		BUG();
1869 	}
1870 	task_unlock(current);
1871 
1872 	return true;
1873 }
1874 #endif
1875 
1876 /*
1877  * mempolicy_nodemask_intersects
1878  *
1879  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1880  * policy.  Otherwise, check for intersection between mask and the policy
1881  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1882  * policy, always return true since it may allocate elsewhere on fallback.
1883  *
1884  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1885  */
1886 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1887 					const nodemask_t *mask)
1888 {
1889 	struct mempolicy *mempolicy;
1890 	bool ret = true;
1891 
1892 	if (!mask)
1893 		return ret;
1894 	task_lock(tsk);
1895 	mempolicy = tsk->mempolicy;
1896 	if (!mempolicy)
1897 		goto out;
1898 
1899 	switch (mempolicy->mode) {
1900 	case MPOL_PREFERRED:
1901 		/*
1902 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1903 		 * allocate from, they may fallback to other nodes when oom.
1904 		 * Thus, it's possible for tsk to have allocated memory from
1905 		 * nodes in mask.
1906 		 */
1907 		break;
1908 	case MPOL_BIND:
1909 	case MPOL_INTERLEAVE:
1910 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1911 		break;
1912 	default:
1913 		BUG();
1914 	}
1915 out:
1916 	task_unlock(tsk);
1917 	return ret;
1918 }
1919 
1920 /* Allocate a page in interleaved policy.
1921    Own path because it needs to do special accounting. */
1922 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1923 					unsigned nid)
1924 {
1925 	struct zonelist *zl;
1926 	struct page *page;
1927 
1928 	zl = node_zonelist(nid, gfp);
1929 	page = __alloc_pages(gfp, order, zl);
1930 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1931 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1932 	return page;
1933 }
1934 
1935 /**
1936  * 	alloc_pages_vma	- Allocate a page for a VMA.
1937  *
1938  * 	@gfp:
1939  *      %GFP_USER    user allocation.
1940  *      %GFP_KERNEL  kernel allocations,
1941  *      %GFP_HIGHMEM highmem/user allocations,
1942  *      %GFP_FS      allocation should not call back into a file system.
1943  *      %GFP_ATOMIC  don't sleep.
1944  *
1945  *	@order:Order of the GFP allocation.
1946  * 	@vma:  Pointer to VMA or NULL if not available.
1947  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1948  *	@node: Which node to prefer for allocation (modulo policy).
1949  *	@hugepage: for hugepages try only the preferred node if possible
1950  *
1951  * 	This function allocates a page from the kernel page pool and applies
1952  *	a NUMA policy associated with the VMA or the current process.
1953  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1954  *	mm_struct of the VMA to prevent it from going away. Should be used for
1955  *	all allocations for pages that will be mapped into user space. Returns
1956  *	NULL when no page can be allocated.
1957  */
1958 struct page *
1959 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1960 		unsigned long addr, int node, bool hugepage)
1961 {
1962 	struct mempolicy *pol;
1963 	struct page *page;
1964 	unsigned int cpuset_mems_cookie;
1965 	struct zonelist *zl;
1966 	nodemask_t *nmask;
1967 
1968 retry_cpuset:
1969 	pol = get_vma_policy(vma, addr);
1970 	cpuset_mems_cookie = read_mems_allowed_begin();
1971 
1972 	if (pol->mode == MPOL_INTERLEAVE) {
1973 		unsigned nid;
1974 
1975 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1976 		mpol_cond_put(pol);
1977 		page = alloc_page_interleave(gfp, order, nid);
1978 		goto out;
1979 	}
1980 
1981 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1982 		int hpage_node = node;
1983 
1984 		/*
1985 		 * For hugepage allocation and non-interleave policy which
1986 		 * allows the current node (or other explicitly preferred
1987 		 * node) we only try to allocate from the current/preferred
1988 		 * node and don't fall back to other nodes, as the cost of
1989 		 * remote accesses would likely offset THP benefits.
1990 		 *
1991 		 * If the policy is interleave, or does not allow the current
1992 		 * node in its nodemask, we allocate the standard way.
1993 		 */
1994 		if (pol->mode == MPOL_PREFERRED &&
1995 						!(pol->flags & MPOL_F_LOCAL))
1996 			hpage_node = pol->v.preferred_node;
1997 
1998 		nmask = policy_nodemask(gfp, pol);
1999 		if (!nmask || node_isset(hpage_node, *nmask)) {
2000 			mpol_cond_put(pol);
2001 			page = __alloc_pages_node(hpage_node,
2002 						gfp | __GFP_THISNODE, order);
2003 			goto out;
2004 		}
2005 	}
2006 
2007 	nmask = policy_nodemask(gfp, pol);
2008 	zl = policy_zonelist(gfp, pol, node);
2009 	mpol_cond_put(pol);
2010 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2011 out:
2012 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2013 		goto retry_cpuset;
2014 	return page;
2015 }
2016 
2017 /**
2018  * 	alloc_pages_current - Allocate pages.
2019  *
2020  *	@gfp:
2021  *		%GFP_USER   user allocation,
2022  *      	%GFP_KERNEL kernel allocation,
2023  *      	%GFP_HIGHMEM highmem allocation,
2024  *      	%GFP_FS     don't call back into a file system.
2025  *      	%GFP_ATOMIC don't sleep.
2026  *	@order: Power of two of allocation size in pages. 0 is a single page.
2027  *
2028  *	Allocate a page from the kernel page pool.  When not in
2029  *	interrupt context and apply the current process NUMA policy.
2030  *	Returns NULL when no page can be allocated.
2031  *
2032  *	Don't call cpuset_update_task_memory_state() unless
2033  *	1) it's ok to take cpuset_sem (can WAIT), and
2034  *	2) allocating for current task (not interrupt).
2035  */
2036 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2037 {
2038 	struct mempolicy *pol = &default_policy;
2039 	struct page *page;
2040 	unsigned int cpuset_mems_cookie;
2041 
2042 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2043 		pol = get_task_policy(current);
2044 
2045 retry_cpuset:
2046 	cpuset_mems_cookie = read_mems_allowed_begin();
2047 
2048 	/*
2049 	 * No reference counting needed for current->mempolicy
2050 	 * nor system default_policy
2051 	 */
2052 	if (pol->mode == MPOL_INTERLEAVE)
2053 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2054 	else
2055 		page = __alloc_pages_nodemask(gfp, order,
2056 				policy_zonelist(gfp, pol, numa_node_id()),
2057 				policy_nodemask(gfp, pol));
2058 
2059 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2060 		goto retry_cpuset;
2061 
2062 	return page;
2063 }
2064 EXPORT_SYMBOL(alloc_pages_current);
2065 
2066 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2067 {
2068 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2069 
2070 	if (IS_ERR(pol))
2071 		return PTR_ERR(pol);
2072 	dst->vm_policy = pol;
2073 	return 0;
2074 }
2075 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2086 
2087 /* Slow path of a mempolicy duplicate */
2088 struct mempolicy *__mpol_dup(struct mempolicy *old)
2089 {
2090 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2091 
2092 	if (!new)
2093 		return ERR_PTR(-ENOMEM);
2094 
2095 	/* task's mempolicy is protected by alloc_lock */
2096 	if (old == current->mempolicy) {
2097 		task_lock(current);
2098 		*new = *old;
2099 		task_unlock(current);
2100 	} else
2101 		*new = *old;
2102 
2103 	if (current_cpuset_is_being_rebound()) {
2104 		nodemask_t mems = cpuset_mems_allowed(current);
2105 		if (new->flags & MPOL_F_REBINDING)
2106 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2107 		else
2108 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2109 	}
2110 	atomic_set(&new->refcnt, 1);
2111 	return new;
2112 }
2113 
2114 /* Slow path of a mempolicy comparison */
2115 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2116 {
2117 	if (!a || !b)
2118 		return false;
2119 	if (a->mode != b->mode)
2120 		return false;
2121 	if (a->flags != b->flags)
2122 		return false;
2123 	if (mpol_store_user_nodemask(a))
2124 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2125 			return false;
2126 
2127 	switch (a->mode) {
2128 	case MPOL_BIND:
2129 		/* Fall through */
2130 	case MPOL_INTERLEAVE:
2131 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2132 	case MPOL_PREFERRED:
2133 		return a->v.preferred_node == b->v.preferred_node;
2134 	default:
2135 		BUG();
2136 		return false;
2137 	}
2138 }
2139 
2140 /*
2141  * Shared memory backing store policy support.
2142  *
2143  * Remember policies even when nobody has shared memory mapped.
2144  * The policies are kept in Red-Black tree linked from the inode.
2145  * They are protected by the sp->lock spinlock, which should be held
2146  * for any accesses to the tree.
2147  */
2148 
2149 /* lookup first element intersecting start-end */
2150 /* Caller holds sp->lock */
2151 static struct sp_node *
2152 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2153 {
2154 	struct rb_node *n = sp->root.rb_node;
2155 
2156 	while (n) {
2157 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2158 
2159 		if (start >= p->end)
2160 			n = n->rb_right;
2161 		else if (end <= p->start)
2162 			n = n->rb_left;
2163 		else
2164 			break;
2165 	}
2166 	if (!n)
2167 		return NULL;
2168 	for (;;) {
2169 		struct sp_node *w = NULL;
2170 		struct rb_node *prev = rb_prev(n);
2171 		if (!prev)
2172 			break;
2173 		w = rb_entry(prev, struct sp_node, nd);
2174 		if (w->end <= start)
2175 			break;
2176 		n = prev;
2177 	}
2178 	return rb_entry(n, struct sp_node, nd);
2179 }
2180 
2181 /* Insert a new shared policy into the list. */
2182 /* Caller holds sp->lock */
2183 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2184 {
2185 	struct rb_node **p = &sp->root.rb_node;
2186 	struct rb_node *parent = NULL;
2187 	struct sp_node *nd;
2188 
2189 	while (*p) {
2190 		parent = *p;
2191 		nd = rb_entry(parent, struct sp_node, nd);
2192 		if (new->start < nd->start)
2193 			p = &(*p)->rb_left;
2194 		else if (new->end > nd->end)
2195 			p = &(*p)->rb_right;
2196 		else
2197 			BUG();
2198 	}
2199 	rb_link_node(&new->nd, parent, p);
2200 	rb_insert_color(&new->nd, &sp->root);
2201 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2202 		 new->policy ? new->policy->mode : 0);
2203 }
2204 
2205 /* Find shared policy intersecting idx */
2206 struct mempolicy *
2207 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2208 {
2209 	struct mempolicy *pol = NULL;
2210 	struct sp_node *sn;
2211 
2212 	if (!sp->root.rb_node)
2213 		return NULL;
2214 	spin_lock(&sp->lock);
2215 	sn = sp_lookup(sp, idx, idx+1);
2216 	if (sn) {
2217 		mpol_get(sn->policy);
2218 		pol = sn->policy;
2219 	}
2220 	spin_unlock(&sp->lock);
2221 	return pol;
2222 }
2223 
2224 static void sp_free(struct sp_node *n)
2225 {
2226 	mpol_put(n->policy);
2227 	kmem_cache_free(sn_cache, n);
2228 }
2229 
2230 /**
2231  * mpol_misplaced - check whether current page node is valid in policy
2232  *
2233  * @page: page to be checked
2234  * @vma: vm area where page mapped
2235  * @addr: virtual address where page mapped
2236  *
2237  * Lookup current policy node id for vma,addr and "compare to" page's
2238  * node id.
2239  *
2240  * Returns:
2241  *	-1	- not misplaced, page is in the right node
2242  *	node	- node id where the page should be
2243  *
2244  * Policy determination "mimics" alloc_page_vma().
2245  * Called from fault path where we know the vma and faulting address.
2246  */
2247 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2248 {
2249 	struct mempolicy *pol;
2250 	struct zone *zone;
2251 	int curnid = page_to_nid(page);
2252 	unsigned long pgoff;
2253 	int thiscpu = raw_smp_processor_id();
2254 	int thisnid = cpu_to_node(thiscpu);
2255 	int polnid = -1;
2256 	int ret = -1;
2257 
2258 	BUG_ON(!vma);
2259 
2260 	pol = get_vma_policy(vma, addr);
2261 	if (!(pol->flags & MPOL_F_MOF))
2262 		goto out;
2263 
2264 	switch (pol->mode) {
2265 	case MPOL_INTERLEAVE:
2266 		BUG_ON(addr >= vma->vm_end);
2267 		BUG_ON(addr < vma->vm_start);
2268 
2269 		pgoff = vma->vm_pgoff;
2270 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2271 		polnid = offset_il_node(pol, vma, pgoff);
2272 		break;
2273 
2274 	case MPOL_PREFERRED:
2275 		if (pol->flags & MPOL_F_LOCAL)
2276 			polnid = numa_node_id();
2277 		else
2278 			polnid = pol->v.preferred_node;
2279 		break;
2280 
2281 	case MPOL_BIND:
2282 		/*
2283 		 * allows binding to multiple nodes.
2284 		 * use current page if in policy nodemask,
2285 		 * else select nearest allowed node, if any.
2286 		 * If no allowed nodes, use current [!misplaced].
2287 		 */
2288 		if (node_isset(curnid, pol->v.nodes))
2289 			goto out;
2290 		(void)first_zones_zonelist(
2291 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2292 				gfp_zone(GFP_HIGHUSER),
2293 				&pol->v.nodes, &zone);
2294 		polnid = zone->node;
2295 		break;
2296 
2297 	default:
2298 		BUG();
2299 	}
2300 
2301 	/* Migrate the page towards the node whose CPU is referencing it */
2302 	if (pol->flags & MPOL_F_MORON) {
2303 		polnid = thisnid;
2304 
2305 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2306 			goto out;
2307 	}
2308 
2309 	if (curnid != polnid)
2310 		ret = polnid;
2311 out:
2312 	mpol_cond_put(pol);
2313 
2314 	return ret;
2315 }
2316 
2317 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2318 {
2319 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2320 	rb_erase(&n->nd, &sp->root);
2321 	sp_free(n);
2322 }
2323 
2324 static void sp_node_init(struct sp_node *node, unsigned long start,
2325 			unsigned long end, struct mempolicy *pol)
2326 {
2327 	node->start = start;
2328 	node->end = end;
2329 	node->policy = pol;
2330 }
2331 
2332 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2333 				struct mempolicy *pol)
2334 {
2335 	struct sp_node *n;
2336 	struct mempolicy *newpol;
2337 
2338 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2339 	if (!n)
2340 		return NULL;
2341 
2342 	newpol = mpol_dup(pol);
2343 	if (IS_ERR(newpol)) {
2344 		kmem_cache_free(sn_cache, n);
2345 		return NULL;
2346 	}
2347 	newpol->flags |= MPOL_F_SHARED;
2348 	sp_node_init(n, start, end, newpol);
2349 
2350 	return n;
2351 }
2352 
2353 /* Replace a policy range. */
2354 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2355 				 unsigned long end, struct sp_node *new)
2356 {
2357 	struct sp_node *n;
2358 	struct sp_node *n_new = NULL;
2359 	struct mempolicy *mpol_new = NULL;
2360 	int ret = 0;
2361 
2362 restart:
2363 	spin_lock(&sp->lock);
2364 	n = sp_lookup(sp, start, end);
2365 	/* Take care of old policies in the same range. */
2366 	while (n && n->start < end) {
2367 		struct rb_node *next = rb_next(&n->nd);
2368 		if (n->start >= start) {
2369 			if (n->end <= end)
2370 				sp_delete(sp, n);
2371 			else
2372 				n->start = end;
2373 		} else {
2374 			/* Old policy spanning whole new range. */
2375 			if (n->end > end) {
2376 				if (!n_new)
2377 					goto alloc_new;
2378 
2379 				*mpol_new = *n->policy;
2380 				atomic_set(&mpol_new->refcnt, 1);
2381 				sp_node_init(n_new, end, n->end, mpol_new);
2382 				n->end = start;
2383 				sp_insert(sp, n_new);
2384 				n_new = NULL;
2385 				mpol_new = NULL;
2386 				break;
2387 			} else
2388 				n->end = start;
2389 		}
2390 		if (!next)
2391 			break;
2392 		n = rb_entry(next, struct sp_node, nd);
2393 	}
2394 	if (new)
2395 		sp_insert(sp, new);
2396 	spin_unlock(&sp->lock);
2397 	ret = 0;
2398 
2399 err_out:
2400 	if (mpol_new)
2401 		mpol_put(mpol_new);
2402 	if (n_new)
2403 		kmem_cache_free(sn_cache, n_new);
2404 
2405 	return ret;
2406 
2407 alloc_new:
2408 	spin_unlock(&sp->lock);
2409 	ret = -ENOMEM;
2410 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2411 	if (!n_new)
2412 		goto err_out;
2413 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2414 	if (!mpol_new)
2415 		goto err_out;
2416 	goto restart;
2417 }
2418 
2419 /**
2420  * mpol_shared_policy_init - initialize shared policy for inode
2421  * @sp: pointer to inode shared policy
2422  * @mpol:  struct mempolicy to install
2423  *
2424  * Install non-NULL @mpol in inode's shared policy rb-tree.
2425  * On entry, the current task has a reference on a non-NULL @mpol.
2426  * This must be released on exit.
2427  * This is called at get_inode() calls and we can use GFP_KERNEL.
2428  */
2429 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2430 {
2431 	int ret;
2432 
2433 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2434 	spin_lock_init(&sp->lock);
2435 
2436 	if (mpol) {
2437 		struct vm_area_struct pvma;
2438 		struct mempolicy *new;
2439 		NODEMASK_SCRATCH(scratch);
2440 
2441 		if (!scratch)
2442 			goto put_mpol;
2443 		/* contextualize the tmpfs mount point mempolicy */
2444 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2445 		if (IS_ERR(new))
2446 			goto free_scratch; /* no valid nodemask intersection */
2447 
2448 		task_lock(current);
2449 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2450 		task_unlock(current);
2451 		if (ret)
2452 			goto put_new;
2453 
2454 		/* Create pseudo-vma that contains just the policy */
2455 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2456 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2457 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2458 
2459 put_new:
2460 		mpol_put(new);			/* drop initial ref */
2461 free_scratch:
2462 		NODEMASK_SCRATCH_FREE(scratch);
2463 put_mpol:
2464 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2465 	}
2466 }
2467 
2468 int mpol_set_shared_policy(struct shared_policy *info,
2469 			struct vm_area_struct *vma, struct mempolicy *npol)
2470 {
2471 	int err;
2472 	struct sp_node *new = NULL;
2473 	unsigned long sz = vma_pages(vma);
2474 
2475 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2476 		 vma->vm_pgoff,
2477 		 sz, npol ? npol->mode : -1,
2478 		 npol ? npol->flags : -1,
2479 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2480 
2481 	if (npol) {
2482 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2483 		if (!new)
2484 			return -ENOMEM;
2485 	}
2486 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2487 	if (err && new)
2488 		sp_free(new);
2489 	return err;
2490 }
2491 
2492 /* Free a backing policy store on inode delete. */
2493 void mpol_free_shared_policy(struct shared_policy *p)
2494 {
2495 	struct sp_node *n;
2496 	struct rb_node *next;
2497 
2498 	if (!p->root.rb_node)
2499 		return;
2500 	spin_lock(&p->lock);
2501 	next = rb_first(&p->root);
2502 	while (next) {
2503 		n = rb_entry(next, struct sp_node, nd);
2504 		next = rb_next(&n->nd);
2505 		sp_delete(p, n);
2506 	}
2507 	spin_unlock(&p->lock);
2508 }
2509 
2510 #ifdef CONFIG_NUMA_BALANCING
2511 static int __initdata numabalancing_override;
2512 
2513 static void __init check_numabalancing_enable(void)
2514 {
2515 	bool numabalancing_default = false;
2516 
2517 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2518 		numabalancing_default = true;
2519 
2520 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2521 	if (numabalancing_override)
2522 		set_numabalancing_state(numabalancing_override == 1);
2523 
2524 	if (num_online_nodes() > 1 && !numabalancing_override) {
2525 		pr_info("%s automatic NUMA balancing. "
2526 			"Configure with numa_balancing= or the "
2527 			"kernel.numa_balancing sysctl",
2528 			numabalancing_default ? "Enabling" : "Disabling");
2529 		set_numabalancing_state(numabalancing_default);
2530 	}
2531 }
2532 
2533 static int __init setup_numabalancing(char *str)
2534 {
2535 	int ret = 0;
2536 	if (!str)
2537 		goto out;
2538 
2539 	if (!strcmp(str, "enable")) {
2540 		numabalancing_override = 1;
2541 		ret = 1;
2542 	} else if (!strcmp(str, "disable")) {
2543 		numabalancing_override = -1;
2544 		ret = 1;
2545 	}
2546 out:
2547 	if (!ret)
2548 		pr_warn("Unable to parse numa_balancing=\n");
2549 
2550 	return ret;
2551 }
2552 __setup("numa_balancing=", setup_numabalancing);
2553 #else
2554 static inline void __init check_numabalancing_enable(void)
2555 {
2556 }
2557 #endif /* CONFIG_NUMA_BALANCING */
2558 
2559 /* assumes fs == KERNEL_DS */
2560 void __init numa_policy_init(void)
2561 {
2562 	nodemask_t interleave_nodes;
2563 	unsigned long largest = 0;
2564 	int nid, prefer = 0;
2565 
2566 	policy_cache = kmem_cache_create("numa_policy",
2567 					 sizeof(struct mempolicy),
2568 					 0, SLAB_PANIC, NULL);
2569 
2570 	sn_cache = kmem_cache_create("shared_policy_node",
2571 				     sizeof(struct sp_node),
2572 				     0, SLAB_PANIC, NULL);
2573 
2574 	for_each_node(nid) {
2575 		preferred_node_policy[nid] = (struct mempolicy) {
2576 			.refcnt = ATOMIC_INIT(1),
2577 			.mode = MPOL_PREFERRED,
2578 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2579 			.v = { .preferred_node = nid, },
2580 		};
2581 	}
2582 
2583 	/*
2584 	 * Set interleaving policy for system init. Interleaving is only
2585 	 * enabled across suitably sized nodes (default is >= 16MB), or
2586 	 * fall back to the largest node if they're all smaller.
2587 	 */
2588 	nodes_clear(interleave_nodes);
2589 	for_each_node_state(nid, N_MEMORY) {
2590 		unsigned long total_pages = node_present_pages(nid);
2591 
2592 		/* Preserve the largest node */
2593 		if (largest < total_pages) {
2594 			largest = total_pages;
2595 			prefer = nid;
2596 		}
2597 
2598 		/* Interleave this node? */
2599 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2600 			node_set(nid, interleave_nodes);
2601 	}
2602 
2603 	/* All too small, use the largest */
2604 	if (unlikely(nodes_empty(interleave_nodes)))
2605 		node_set(prefer, interleave_nodes);
2606 
2607 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2608 		pr_err("%s: interleaving failed\n", __func__);
2609 
2610 	check_numabalancing_enable();
2611 }
2612 
2613 /* Reset policy of current process to default */
2614 void numa_default_policy(void)
2615 {
2616 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2617 }
2618 
2619 /*
2620  * Parse and format mempolicy from/to strings
2621  */
2622 
2623 /*
2624  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2625  */
2626 static const char * const policy_modes[] =
2627 {
2628 	[MPOL_DEFAULT]    = "default",
2629 	[MPOL_PREFERRED]  = "prefer",
2630 	[MPOL_BIND]       = "bind",
2631 	[MPOL_INTERLEAVE] = "interleave",
2632 	[MPOL_LOCAL]      = "local",
2633 };
2634 
2635 
2636 #ifdef CONFIG_TMPFS
2637 /**
2638  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2639  * @str:  string containing mempolicy to parse
2640  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2641  *
2642  * Format of input:
2643  *	<mode>[=<flags>][:<nodelist>]
2644  *
2645  * On success, returns 0, else 1
2646  */
2647 int mpol_parse_str(char *str, struct mempolicy **mpol)
2648 {
2649 	struct mempolicy *new = NULL;
2650 	unsigned short mode;
2651 	unsigned short mode_flags;
2652 	nodemask_t nodes;
2653 	char *nodelist = strchr(str, ':');
2654 	char *flags = strchr(str, '=');
2655 	int err = 1;
2656 
2657 	if (nodelist) {
2658 		/* NUL-terminate mode or flags string */
2659 		*nodelist++ = '\0';
2660 		if (nodelist_parse(nodelist, nodes))
2661 			goto out;
2662 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2663 			goto out;
2664 	} else
2665 		nodes_clear(nodes);
2666 
2667 	if (flags)
2668 		*flags++ = '\0';	/* terminate mode string */
2669 
2670 	for (mode = 0; mode < MPOL_MAX; mode++) {
2671 		if (!strcmp(str, policy_modes[mode])) {
2672 			break;
2673 		}
2674 	}
2675 	if (mode >= MPOL_MAX)
2676 		goto out;
2677 
2678 	switch (mode) {
2679 	case MPOL_PREFERRED:
2680 		/*
2681 		 * Insist on a nodelist of one node only
2682 		 */
2683 		if (nodelist) {
2684 			char *rest = nodelist;
2685 			while (isdigit(*rest))
2686 				rest++;
2687 			if (*rest)
2688 				goto out;
2689 		}
2690 		break;
2691 	case MPOL_INTERLEAVE:
2692 		/*
2693 		 * Default to online nodes with memory if no nodelist
2694 		 */
2695 		if (!nodelist)
2696 			nodes = node_states[N_MEMORY];
2697 		break;
2698 	case MPOL_LOCAL:
2699 		/*
2700 		 * Don't allow a nodelist;  mpol_new() checks flags
2701 		 */
2702 		if (nodelist)
2703 			goto out;
2704 		mode = MPOL_PREFERRED;
2705 		break;
2706 	case MPOL_DEFAULT:
2707 		/*
2708 		 * Insist on a empty nodelist
2709 		 */
2710 		if (!nodelist)
2711 			err = 0;
2712 		goto out;
2713 	case MPOL_BIND:
2714 		/*
2715 		 * Insist on a nodelist
2716 		 */
2717 		if (!nodelist)
2718 			goto out;
2719 	}
2720 
2721 	mode_flags = 0;
2722 	if (flags) {
2723 		/*
2724 		 * Currently, we only support two mutually exclusive
2725 		 * mode flags.
2726 		 */
2727 		if (!strcmp(flags, "static"))
2728 			mode_flags |= MPOL_F_STATIC_NODES;
2729 		else if (!strcmp(flags, "relative"))
2730 			mode_flags |= MPOL_F_RELATIVE_NODES;
2731 		else
2732 			goto out;
2733 	}
2734 
2735 	new = mpol_new(mode, mode_flags, &nodes);
2736 	if (IS_ERR(new))
2737 		goto out;
2738 
2739 	/*
2740 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2741 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2742 	 */
2743 	if (mode != MPOL_PREFERRED)
2744 		new->v.nodes = nodes;
2745 	else if (nodelist)
2746 		new->v.preferred_node = first_node(nodes);
2747 	else
2748 		new->flags |= MPOL_F_LOCAL;
2749 
2750 	/*
2751 	 * Save nodes for contextualization: this will be used to "clone"
2752 	 * the mempolicy in a specific context [cpuset] at a later time.
2753 	 */
2754 	new->w.user_nodemask = nodes;
2755 
2756 	err = 0;
2757 
2758 out:
2759 	/* Restore string for error message */
2760 	if (nodelist)
2761 		*--nodelist = ':';
2762 	if (flags)
2763 		*--flags = '=';
2764 	if (!err)
2765 		*mpol = new;
2766 	return err;
2767 }
2768 #endif /* CONFIG_TMPFS */
2769 
2770 /**
2771  * mpol_to_str - format a mempolicy structure for printing
2772  * @buffer:  to contain formatted mempolicy string
2773  * @maxlen:  length of @buffer
2774  * @pol:  pointer to mempolicy to be formatted
2775  *
2776  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2777  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2778  * longest flag, "relative", and to display at least a few node ids.
2779  */
2780 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2781 {
2782 	char *p = buffer;
2783 	nodemask_t nodes = NODE_MASK_NONE;
2784 	unsigned short mode = MPOL_DEFAULT;
2785 	unsigned short flags = 0;
2786 
2787 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2788 		mode = pol->mode;
2789 		flags = pol->flags;
2790 	}
2791 
2792 	switch (mode) {
2793 	case MPOL_DEFAULT:
2794 		break;
2795 	case MPOL_PREFERRED:
2796 		if (flags & MPOL_F_LOCAL)
2797 			mode = MPOL_LOCAL;
2798 		else
2799 			node_set(pol->v.preferred_node, nodes);
2800 		break;
2801 	case MPOL_BIND:
2802 	case MPOL_INTERLEAVE:
2803 		nodes = pol->v.nodes;
2804 		break;
2805 	default:
2806 		WARN_ON_ONCE(1);
2807 		snprintf(p, maxlen, "unknown");
2808 		return;
2809 	}
2810 
2811 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2812 
2813 	if (flags & MPOL_MODE_FLAGS) {
2814 		p += snprintf(p, buffer + maxlen - p, "=");
2815 
2816 		/*
2817 		 * Currently, the only defined flags are mutually exclusive
2818 		 */
2819 		if (flags & MPOL_F_STATIC_NODES)
2820 			p += snprintf(p, buffer + maxlen - p, "static");
2821 		else if (flags & MPOL_F_RELATIVE_NODES)
2822 			p += snprintf(p, buffer + maxlen - p, "relative");
2823 	}
2824 
2825 	if (!nodes_empty(nodes))
2826 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2827 			       nodemask_pr_args(&nodes));
2828 }
2829