xref: /openbmc/linux/mm/mempolicy.c (revision bcb84fb4)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the given memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
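
/*
 * Illustrative sketch, not kernel code: roughly how the four policies above
 * are selected from user space.  This assumes the set_mempolicy() and
 * mbind() wrappers from numactl's <numaif.h>; buf/len are placeholders for
 * an existing mapping.  maxnode is passed as the number of bits in the mask
 * plus one, to account for the --maxnode adjustment in get_nodes() below.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0   = 1UL << 0;
 *
 *	// interleave: spread this process' future allocations over nodes 0,1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01) + 1);
 *
 *	// bind: restrict one existing mapping to node 0 only
 *	mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0) + 1, 0);
 *
 *	// preferred: try node 0 first, fall back elsewhere if needed
 *	set_mempolicy(MPOL_PREFERRED, &node0, 8 * sizeof(node0) + 1);
 *
 *	// default: back to local allocation / process policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */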
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/swap.h>
89 #include <linux/seq_file.h>
90 #include <linux/proc_fs.h>
91 #include <linux/migrate.h>
92 #include <linux/ksm.h>
93 #include <linux/rmap.h>
94 #include <linux/security.h>
95 #include <linux/syscalls.h>
96 #include <linux/ctype.h>
97 #include <linux/mm_inline.h>
98 #include <linux/mmu_notifier.h>
99 #include <linux/printk.h>
100 
101 #include <asm/tlbflush.h>
102 #include <linux/uaccess.h>
103 
104 #include "internal.h"
105 
106 /* Internal flags */
107 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
108 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
109 
110 static struct kmem_cache *policy_cache;
111 static struct kmem_cache *sn_cache;
112 
113 /* Highest zone. A specific allocation for a zone below that is not
114    policied. */
115 enum zone_type policy_zone = 0;
116 
117 /*
118  * run-time system-wide default policy => local allocation
119  */
120 static struct mempolicy default_policy = {
121 	.refcnt = ATOMIC_INIT(1), /* never free it */
122 	.mode = MPOL_PREFERRED,
123 	.flags = MPOL_F_LOCAL,
124 };
125 
126 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
127 
128 struct mempolicy *get_task_policy(struct task_struct *p)
129 {
130 	struct mempolicy *pol = p->mempolicy;
131 	int node;
132 
133 	if (pol)
134 		return pol;
135 
136 	node = numa_node_id();
137 	if (node != NUMA_NO_NODE) {
138 		pol = &preferred_node_policy[node];
139 		/* preferred_node_policy is not initialised early in boot */
140 		if (pol->mode)
141 			return pol;
142 	}
143 
144 	return &default_policy;
145 }
146 
147 static const struct mempolicy_operations {
148 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
149 	/*
150 	 * If the read-side task has no lock protecting task->mempolicy, the
151 	 * write-side task rebinds task->mempolicy in two steps. The first step
152 	 * sets all the newly allowed nodes, and the second step clears all the
153 	 * disallowed nodes. This way a reader never sees an empty nodemask and
154 	 * can always find a node to allocate from (see the example below).
155 	 * If we do have a lock protecting task->mempolicy on the read side, we
156 	 * rebind directly.
157 	 *
158 	 * step:
159 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
160 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
161 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
162 	 */
163 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
164 			enum mpol_rebind_step step);
165 } mpol_ops[MPOL_MAX];
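
/*
 * Example of the two-step rebind described above (illustrative): when a
 * cpuset's mems change from {0,1} to {2,3}, MPOL_REBIND_STEP1 first expands
 * an MPOL_INTERLEAVE policy's nodemask to {0,1,2,3}, so that a lockless
 * reader always sees at least one usable node, and MPOL_REBIND_STEP2 then
 * shrinks it to the final {2,3}.
 */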
166 
167 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
168 {
169 	return pol->flags & MPOL_MODE_FLAGS;
170 }
171 
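/*
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES the user's
 * nodemask is interpreted relative to the currently allowed nodes.  For
 * *orig = {0,2,6} and *rel = {4,5,6,7} (weight 4), nodes_fold() wraps the
 * bits modulo 4, giving tmp = {0,2}, and nodes_onto() then maps bit i of
 * tmp onto the i-th set bit of *rel, so *ret = {4,6}.
 */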
172 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
173 				   const nodemask_t *rel)
174 {
175 	nodemask_t tmp;
176 	nodes_fold(tmp, *orig, nodes_weight(*rel));
177 	nodes_onto(*ret, tmp, *rel);
178 }
179 
180 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
181 {
182 	if (nodes_empty(*nodes))
183 		return -EINVAL;
184 	pol->v.nodes = *nodes;
185 	return 0;
186 }
187 
188 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
189 {
190 	if (!nodes)
191 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
192 	else if (nodes_empty(*nodes))
193 		return -EINVAL;			/*  no allowed nodes */
194 	else
195 		pol->v.preferred_node = first_node(*nodes);
196 	return 0;
197 }
198 
199 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
200 {
201 	if (nodes_empty(*nodes))
202 		return -EINVAL;
203 	pol->v.nodes = *nodes;
204 	return 0;
205 }
206 
207 /*
208  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
209  * any, for the new policy.  mpol_new() has already validated the nodes
210  * parameter with respect to the policy mode and flags.  But, we need to
211  * handle an empty nodemask with MPOL_PREFERRED here.
212  *
213  * Must be called holding task's alloc_lock to protect task's mems_allowed
214  * and mempolicy.  May also be called holding the mmap_semaphore for write.
215  */
216 static int mpol_set_nodemask(struct mempolicy *pol,
217 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
218 {
219 	int ret;
220 
221 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
222 	if (pol == NULL)
223 		return 0;
224 	/* Check N_MEMORY */
225 	nodes_and(nsc->mask1,
226 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
227 
228 	VM_BUG_ON(!nodes);
229 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
230 		nodes = NULL;	/* explicit local allocation */
231 	else {
232 		if (pol->flags & MPOL_F_RELATIVE_NODES)
233 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
234 		else
235 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
236 
237 		if (mpol_store_user_nodemask(pol))
238 			pol->w.user_nodemask = *nodes;
239 		else
240 			pol->w.cpuset_mems_allowed =
241 						cpuset_current_mems_allowed;
242 	}
243 
244 	if (nodes)
245 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
246 	else
247 		ret = mpol_ops[pol->mode].create(pol, NULL);
248 	return ret;
249 }
250 
251 /*
252  * This function just creates a new policy, does some checks and simple
253  * initialization. You must invoke mpol_set_nodemask() to set nodes.
254  */
255 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
256 				  nodemask_t *nodes)
257 {
258 	struct mempolicy *policy;
259 
260 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
261 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
262 
263 	if (mode == MPOL_DEFAULT) {
264 		if (nodes && !nodes_empty(*nodes))
265 			return ERR_PTR(-EINVAL);
266 		return NULL;
267 	}
268 	VM_BUG_ON(!nodes);
269 
270 	/*
271 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
272 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
273 	 * All other modes require a valid pointer to a non-empty nodemask.
274 	 */
275 	if (mode == MPOL_PREFERRED) {
276 		if (nodes_empty(*nodes)) {
277 			if (((flags & MPOL_F_STATIC_NODES) ||
278 			     (flags & MPOL_F_RELATIVE_NODES)))
279 				return ERR_PTR(-EINVAL);
280 		}
281 	} else if (mode == MPOL_LOCAL) {
282 		if (!nodes_empty(*nodes) ||
283 		    (flags & MPOL_F_STATIC_NODES) ||
284 		    (flags & MPOL_F_RELATIVE_NODES))
285 			return ERR_PTR(-EINVAL);
286 		mode = MPOL_PREFERRED;
287 	} else if (nodes_empty(*nodes))
288 		return ERR_PTR(-EINVAL);
289 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
290 	if (!policy)
291 		return ERR_PTR(-ENOMEM);
292 	atomic_set(&policy->refcnt, 1);
293 	policy->mode = mode;
294 	policy->flags = flags;
295 
296 	return policy;
297 }
298 
299 /* Slow path of a mpol destructor. */
300 void __mpol_put(struct mempolicy *p)
301 {
302 	if (!atomic_dec_and_test(&p->refcnt))
303 		return;
304 	kmem_cache_free(policy_cache, p);
305 }
306 
307 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
308 				enum mpol_rebind_step step)
309 {
310 }
311 
312 /*
313  * step:
314  * 	MPOL_REBIND_ONCE  - do the rebind work at once
315  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
316  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
317  */
318 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
319 				 enum mpol_rebind_step step)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES)
324 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
325 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
326 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
327 	else {
328 		/*
329 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
330 		 * result
331 		 */
332 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
333 			nodes_remap(tmp, pol->v.nodes,
334 					pol->w.cpuset_mems_allowed, *nodes);
335 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
336 		} else if (step == MPOL_REBIND_STEP2) {
337 			tmp = pol->w.cpuset_mems_allowed;
338 			pol->w.cpuset_mems_allowed = *nodes;
339 		} else
340 			BUG();
341 	}
342 
343 	if (nodes_empty(tmp))
344 		tmp = *nodes;
345 
346 	if (step == MPOL_REBIND_STEP1)
347 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
348 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
349 		pol->v.nodes = tmp;
350 	else
351 		BUG();
352 
353 	if (!node_isset(current->il_next, tmp)) {
354 		current->il_next = next_node_in(current->il_next, tmp);
355 		if (current->il_next >= MAX_NUMNODES)
356 			current->il_next = numa_node_id();
357 	}
358 }
359 
360 static void mpol_rebind_preferred(struct mempolicy *pol,
361 				  const nodemask_t *nodes,
362 				  enum mpol_rebind_step step)
363 {
364 	nodemask_t tmp;
365 
366 	if (pol->flags & MPOL_F_STATIC_NODES) {
367 		int node = first_node(pol->w.user_nodemask);
368 
369 		if (node_isset(node, *nodes)) {
370 			pol->v.preferred_node = node;
371 			pol->flags &= ~MPOL_F_LOCAL;
372 		} else
373 			pol->flags |= MPOL_F_LOCAL;
374 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
375 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
376 		pol->v.preferred_node = first_node(tmp);
377 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
378 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
379 						   pol->w.cpuset_mems_allowed,
380 						   *nodes);
381 		pol->w.cpuset_mems_allowed = *nodes;
382 	}
383 }
384 
385 /*
386  * mpol_rebind_policy - Migrate a policy to a different set of nodes
387  *
388  * If the read-side task has no lock protecting task->mempolicy, the
389  * write-side task rebinds task->mempolicy in two steps. The first step
390  * sets all the newly allowed nodes, and the second step clears all the
391  * disallowed nodes. This way a reader never sees an empty nodemask and
392  * can always find a node to allocate from.
393  * If we do have a lock protecting task->mempolicy on the read side, we
394  * rebind directly.
395  *
396  * step:
397  * 	MPOL_REBIND_ONCE  - do the rebind work at once
398  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
399  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
400  */
401 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
402 				enum mpol_rebind_step step)
403 {
404 	if (!pol)
405 		return;
406 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
407 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
408 		return;
409 
410 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
414 		BUG();
415 
416 	if (step == MPOL_REBIND_STEP1)
417 		pol->flags |= MPOL_F_REBINDING;
418 	else if (step == MPOL_REBIND_STEP2)
419 		pol->flags &= ~MPOL_F_REBINDING;
420 	else if (step >= MPOL_REBIND_NSTEP)
421 		BUG();
422 
423 	mpol_ops[pol->mode].rebind(pol, newmask, step);
424 }
425 
426 /*
427  * Wrapper for mpol_rebind_policy() that just requires task
428  * pointer, and updates task mempolicy.
429  *
430  * Called with task's alloc_lock held.
431  */
432 
433 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
434 			enum mpol_rebind_step step)
435 {
436 	mpol_rebind_policy(tsk->mempolicy, new, step);
437 }
438 
439 /*
440  * Rebind each vma in mm to new nodemask.
441  *
442  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
443  */
444 
445 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
446 {
447 	struct vm_area_struct *vma;
448 
449 	down_write(&mm->mmap_sem);
450 	for (vma = mm->mmap; vma; vma = vma->vm_next)
451 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
452 	up_write(&mm->mmap_sem);
453 }
454 
455 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
456 	[MPOL_DEFAULT] = {
457 		.rebind = mpol_rebind_default,
458 	},
459 	[MPOL_INTERLEAVE] = {
460 		.create = mpol_new_interleave,
461 		.rebind = mpol_rebind_nodemask,
462 	},
463 	[MPOL_PREFERRED] = {
464 		.create = mpol_new_preferred,
465 		.rebind = mpol_rebind_preferred,
466 	},
467 	[MPOL_BIND] = {
468 		.create = mpol_new_bind,
469 		.rebind = mpol_rebind_nodemask,
470 	},
471 };
472 
473 static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 				unsigned long flags);
475 
476 struct queue_pages {
477 	struct list_head *pagelist;
478 	unsigned long flags;
479 	nodemask_t *nmask;
480 	struct vm_area_struct *prev;
481 };
482 
483 /*
484  * Scan through pages, checking whether they satisfy certain conditions,
485  * and move them to the pagelist if they do.
486  */
487 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
488 			unsigned long end, struct mm_walk *walk)
489 {
490 	struct vm_area_struct *vma = walk->vma;
491 	struct page *page;
492 	struct queue_pages *qp = walk->private;
493 	unsigned long flags = qp->flags;
494 	int nid, ret;
495 	pte_t *pte;
496 	spinlock_t *ptl;
497 
498 	if (pmd_trans_huge(*pmd)) {
499 		ptl = pmd_lock(walk->mm, pmd);
500 		if (pmd_trans_huge(*pmd)) {
501 			page = pmd_page(*pmd);
502 			if (is_huge_zero_page(page)) {
503 				spin_unlock(ptl);
504 				__split_huge_pmd(vma, pmd, addr, false, NULL);
505 			} else {
506 				get_page(page);
507 				spin_unlock(ptl);
508 				lock_page(page);
509 				ret = split_huge_page(page);
510 				unlock_page(page);
511 				put_page(page);
512 				if (ret)
513 					return 0;
514 			}
515 		} else {
516 			spin_unlock(ptl);
517 		}
518 	}
519 
520 	if (pmd_trans_unstable(pmd))
521 		return 0;
522 retry:
523 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
524 	for (; addr != end; pte++, addr += PAGE_SIZE) {
525 		if (!pte_present(*pte))
526 			continue;
527 		page = vm_normal_page(vma, addr, *pte);
528 		if (!page)
529 			continue;
530 		/*
531 		 * vm_normal_page() filters out zero pages, but there might
532 		 * still be PageReserved pages to skip, perhaps in a VDSO.
533 		 */
534 		if (PageReserved(page))
535 			continue;
536 		nid = page_to_nid(page);
537 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
538 			continue;
539 		if (PageTransCompound(page)) {
540 			get_page(page);
541 			pte_unmap_unlock(pte, ptl);
542 			lock_page(page);
543 			ret = split_huge_page(page);
544 			unlock_page(page);
545 			put_page(page);
546 			/* Failed to split -- skip. */
547 			if (ret) {
548 				pte = pte_offset_map_lock(walk->mm, pmd,
549 						addr, &ptl);
550 				continue;
551 			}
552 			goto retry;
553 		}
554 
555 		migrate_page_add(page, qp->pagelist, flags);
556 	}
557 	pte_unmap_unlock(pte - 1, ptl);
558 	cond_resched();
559 	return 0;
560 }
561 
562 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
563 			       unsigned long addr, unsigned long end,
564 			       struct mm_walk *walk)
565 {
566 #ifdef CONFIG_HUGETLB_PAGE
567 	struct queue_pages *qp = walk->private;
568 	unsigned long flags = qp->flags;
569 	int nid;
570 	struct page *page;
571 	spinlock_t *ptl;
572 	pte_t entry;
573 
574 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
575 	entry = huge_ptep_get(pte);
576 	if (!pte_present(entry))
577 		goto unlock;
578 	page = pte_page(entry);
579 	nid = page_to_nid(page);
580 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
581 		goto unlock;
582 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
583 	if (flags & (MPOL_MF_MOVE_ALL) ||
584 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
585 		isolate_huge_page(page, qp->pagelist);
586 unlock:
587 	spin_unlock(ptl);
588 #else
589 	BUG();
590 #endif
591 	return 0;
592 }
593 
594 #ifdef CONFIG_NUMA_BALANCING
595 /*
596  * This is used to mark a range of virtual addresses as inaccessible.
597  * The protections are later cleared by a NUMA hinting fault. Depending on these
598  * faults, pages may be migrated for better NUMA placement.
599  *
600  * This assumes that NUMA faults are handled using PROT_NONE. If
601  * an architecture makes a different choice, it will need further
602  * changes to the core.
603  */
604 unsigned long change_prot_numa(struct vm_area_struct *vma,
605 			unsigned long addr, unsigned long end)
606 {
607 	int nr_updated;
608 
609 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
610 	if (nr_updated)
611 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
612 
613 	return nr_updated;
614 }
615 #else
616 static unsigned long change_prot_numa(struct vm_area_struct *vma,
617 			unsigned long addr, unsigned long end)
618 {
619 	return 0;
620 }
621 #endif /* CONFIG_NUMA_BALANCING */
622 
623 static int queue_pages_test_walk(unsigned long start, unsigned long end,
624 				struct mm_walk *walk)
625 {
626 	struct vm_area_struct *vma = walk->vma;
627 	struct queue_pages *qp = walk->private;
628 	unsigned long endvma = vma->vm_end;
629 	unsigned long flags = qp->flags;
630 
631 	if (!vma_migratable(vma))
632 		return 1;
633 
634 	if (endvma > end)
635 		endvma = end;
636 	if (vma->vm_start > start)
637 		start = vma->vm_start;
638 
639 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
640 		if (!vma->vm_next && vma->vm_end < end)
641 			return -EFAULT;
642 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
643 			return -EFAULT;
644 	}
645 
646 	qp->prev = vma;
647 
648 	if (flags & MPOL_MF_LAZY) {
649 		/* Similar to task_numa_work, skip inaccessible VMAs */
650 		if (!is_vm_hugetlb_page(vma) &&
651 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
652 			!(vma->vm_flags & VM_MIXEDMAP))
653 			change_prot_numa(vma, start, endvma);
654 		return 1;
655 	}
656 
657 	/* queue pages from current vma */
658 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
659 		return 0;
660 	return 1;
661 }
662 
663 /*
664  * Walk through page tables and collect pages to be migrated.
665  *
666  * If pages found in a given range are on a set of nodes (determined by
667  * @nodes and @flags), they are isolated and queued to the pagelist
668  * passed via @private.
669  */
670 static int
671 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
672 		nodemask_t *nodes, unsigned long flags,
673 		struct list_head *pagelist)
674 {
675 	struct queue_pages qp = {
676 		.pagelist = pagelist,
677 		.flags = flags,
678 		.nmask = nodes,
679 		.prev = NULL,
680 	};
681 	struct mm_walk queue_pages_walk = {
682 		.hugetlb_entry = queue_pages_hugetlb,
683 		.pmd_entry = queue_pages_pte_range,
684 		.test_walk = queue_pages_test_walk,
685 		.mm = mm,
686 		.private = &qp,
687 	};
688 
689 	return walk_page_range(start, end, &queue_pages_walk);
690 }
691 
692 /*
693  * Apply policy to a single VMA
694  * This must be called with the mmap_sem held for writing.
695  */
696 static int vma_replace_policy(struct vm_area_struct *vma,
697 						struct mempolicy *pol)
698 {
699 	int err;
700 	struct mempolicy *old;
701 	struct mempolicy *new;
702 
703 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
704 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
705 		 vma->vm_ops, vma->vm_file,
706 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
707 
708 	new = mpol_dup(pol);
709 	if (IS_ERR(new))
710 		return PTR_ERR(new);
711 
712 	if (vma->vm_ops && vma->vm_ops->set_policy) {
713 		err = vma->vm_ops->set_policy(vma, new);
714 		if (err)
715 			goto err_out;
716 	}
717 
718 	old = vma->vm_policy;
719 	vma->vm_policy = new; /* protected by mmap_sem */
720 	mpol_put(old);
721 
722 	return 0;
723  err_out:
724 	mpol_put(new);
725 	return err;
726 }
727 
728 /* Step 2: apply policy to a range and do splits. */
729 static int mbind_range(struct mm_struct *mm, unsigned long start,
730 		       unsigned long end, struct mempolicy *new_pol)
731 {
732 	struct vm_area_struct *next;
733 	struct vm_area_struct *prev;
734 	struct vm_area_struct *vma;
735 	int err = 0;
736 	pgoff_t pgoff;
737 	unsigned long vmstart;
738 	unsigned long vmend;
739 
740 	vma = find_vma(mm, start);
741 	if (!vma || vma->vm_start > start)
742 		return -EFAULT;
743 
744 	prev = vma->vm_prev;
745 	if (start > vma->vm_start)
746 		prev = vma;
747 
748 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
749 		next = vma->vm_next;
750 		vmstart = max(start, vma->vm_start);
751 		vmend   = min(end, vma->vm_end);
752 
753 		if (mpol_equal(vma_policy(vma), new_pol))
754 			continue;
755 
756 		pgoff = vma->vm_pgoff +
757 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
758 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
759 				 vma->anon_vma, vma->vm_file, pgoff,
760 				 new_pol, vma->vm_userfaultfd_ctx);
761 		if (prev) {
762 			vma = prev;
763 			next = vma->vm_next;
764 			if (mpol_equal(vma_policy(vma), new_pol))
765 				continue;
766 			/* vma_merge() joined vma && vma->next, case 8 */
767 			goto replace;
768 		}
769 		if (vma->vm_start != vmstart) {
770 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
771 			if (err)
772 				goto out;
773 		}
774 		if (vma->vm_end != vmend) {
775 			err = split_vma(vma->vm_mm, vma, vmend, 0);
776 			if (err)
777 				goto out;
778 		}
779  replace:
780 		err = vma_replace_policy(vma, new_pol);
781 		if (err)
782 			goto out;
783 	}
784 
785  out:
786 	return err;
787 }
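
/*
 * Example (illustrative): if [start, end) covers only the middle of a
 * single VMA and the new policy differs, the two split_vma() calls above
 * cut it into three VMAs and only the middle one receives the new policy;
 * neighbouring VMAs that end up with equal policies are merged back by
 * vma_merge().
 */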
788 
789 /* Set the process memory policy */
790 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
791 			     nodemask_t *nodes)
792 {
793 	struct mempolicy *new, *old;
794 	NODEMASK_SCRATCH(scratch);
795 	int ret;
796 
797 	if (!scratch)
798 		return -ENOMEM;
799 
800 	new = mpol_new(mode, flags, nodes);
801 	if (IS_ERR(new)) {
802 		ret = PTR_ERR(new);
803 		goto out;
804 	}
805 
806 	task_lock(current);
807 	ret = mpol_set_nodemask(new, nodes, scratch);
808 	if (ret) {
809 		task_unlock(current);
810 		mpol_put(new);
811 		goto out;
812 	}
813 	old = current->mempolicy;
814 	current->mempolicy = new;
815 	if (new && new->mode == MPOL_INTERLEAVE &&
816 	    nodes_weight(new->v.nodes))
817 		current->il_next = first_node(new->v.nodes);
818 	task_unlock(current);
819 	mpol_put(old);
820 	ret = 0;
821 out:
822 	NODEMASK_SCRATCH_FREE(scratch);
823 	return ret;
824 }
825 
826 /*
827  * Return nodemask for policy for get_mempolicy() query
828  *
829  * Called with task's alloc_lock held
830  */
831 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
832 {
833 	nodes_clear(*nodes);
834 	if (p == &default_policy)
835 		return;
836 
837 	switch (p->mode) {
838 	case MPOL_BIND:
839 		/* Fall through */
840 	case MPOL_INTERLEAVE:
841 		*nodes = p->v.nodes;
842 		break;
843 	case MPOL_PREFERRED:
844 		if (!(p->flags & MPOL_F_LOCAL))
845 			node_set(p->v.preferred_node, *nodes);
846 		/* else return empty node mask for local allocation */
847 		break;
848 	default:
849 		BUG();
850 	}
851 }
852 
853 static int lookup_node(unsigned long addr)
854 {
855 	struct page *p;
856 	int err;
857 
858 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
859 	if (err >= 0) {
860 		err = page_to_nid(p);
861 		put_page(p);
862 	}
863 	return err;
864 }
865 
866 /* Retrieve NUMA policy */
867 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
868 			     unsigned long addr, unsigned long flags)
869 {
870 	int err;
871 	struct mm_struct *mm = current->mm;
872 	struct vm_area_struct *vma = NULL;
873 	struct mempolicy *pol = current->mempolicy;
874 
875 	if (flags &
876 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
877 		return -EINVAL;
878 
879 	if (flags & MPOL_F_MEMS_ALLOWED) {
880 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
881 			return -EINVAL;
882 		*policy = 0;	/* just so it's initialized */
883 		task_lock(current);
884 		*nmask  = cpuset_current_mems_allowed;
885 		task_unlock(current);
886 		return 0;
887 	}
888 
889 	if (flags & MPOL_F_ADDR) {
890 		/*
891 		 * Do NOT fall back to task policy if the
892 		 * vma/shared policy at addr is NULL.  We
893 		 * want to return MPOL_DEFAULT in this case.
894 		 */
895 		down_read(&mm->mmap_sem);
896 		vma = find_vma_intersection(mm, addr, addr+1);
897 		if (!vma) {
898 			up_read(&mm->mmap_sem);
899 			return -EFAULT;
900 		}
901 		if (vma->vm_ops && vma->vm_ops->get_policy)
902 			pol = vma->vm_ops->get_policy(vma, addr);
903 		else
904 			pol = vma->vm_policy;
905 	} else if (addr)
906 		return -EINVAL;
907 
908 	if (!pol)
909 		pol = &default_policy;	/* indicates default behavior */
910 
911 	if (flags & MPOL_F_NODE) {
912 		if (flags & MPOL_F_ADDR) {
913 			err = lookup_node(addr);
914 			if (err < 0)
915 				goto out;
916 			*policy = err;
917 		} else if (pol == current->mempolicy &&
918 				pol->mode == MPOL_INTERLEAVE) {
919 			*policy = current->il_next;
920 		} else {
921 			err = -EINVAL;
922 			goto out;
923 		}
924 	} else {
925 		*policy = pol == &default_policy ? MPOL_DEFAULT :
926 						pol->mode;
927 		/*
928 		 * Internal mempolicy flags must be masked off before exposing
929 		 * the policy to userspace.
930 		 */
931 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
932 	}
933 
934 	if (vma) {
935 		up_read(&current->mm->mmap_sem);
936 		vma = NULL;
937 	}
938 
939 	err = 0;
940 	if (nmask) {
941 		if (mpol_store_user_nodemask(pol)) {
942 			*nmask = pol->w.user_nodemask;
943 		} else {
944 			task_lock(current);
945 			get_policy_nodemask(pol, nmask);
946 			task_unlock(current);
947 		}
948 	}
949 
950  out:
951 	mpol_cond_put(pol);
952 	if (vma)
953 		up_read(&current->mm->mmap_sem);
954 	return err;
955 }
956 
957 #ifdef CONFIG_MIGRATION
958 /*
959  * page migration
960  */
961 static void migrate_page_add(struct page *page, struct list_head *pagelist,
962 				unsigned long flags)
963 {
964 	/*
965 	 * Avoid migrating a page that is shared with others.
966 	 */
967 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
968 		if (!isolate_lru_page(page)) {
969 			list_add_tail(&page->lru, pagelist);
970 			inc_node_page_state(page, NR_ISOLATED_ANON +
971 					    page_is_file_cache(page));
972 		}
973 	}
974 }
975 
976 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
977 {
978 	if (PageHuge(page))
979 		return alloc_huge_page_node(page_hstate(compound_head(page)),
980 					node);
981 	else
982 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
983 						    __GFP_THISNODE, 0);
984 }
985 
986 /*
987  * Migrate pages from one node to a target node.
988  * Returns error or the number of pages not migrated.
989  */
990 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
991 			   int flags)
992 {
993 	nodemask_t nmask;
994 	LIST_HEAD(pagelist);
995 	int err = 0;
996 
997 	nodes_clear(nmask);
998 	node_set(source, nmask);
999 
1000 	/*
1001 	 * This does not "check" the range but isolates all pages that
1002 	 * need migration.  Between passing in the full user address
1003 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1004 	 */
1005 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1006 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1007 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1008 
1009 	if (!list_empty(&pagelist)) {
1010 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1011 					MIGRATE_SYNC, MR_SYSCALL);
1012 		if (err)
1013 			putback_movable_pages(&pagelist);
1014 	}
1015 
1016 	return err;
1017 }
1018 
1019 /*
1020  * Move pages between the two nodesets so as to preserve the physical
1021  * layout as much as possible.
1022  *
1023  * Returns the number of pages that could not be moved.
1024  */
1025 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1026 		     const nodemask_t *to, int flags)
1027 {
1028 	int busy = 0;
1029 	int err;
1030 	nodemask_t tmp;
1031 
1032 	err = migrate_prep();
1033 	if (err)
1034 		return err;
1035 
1036 	down_read(&mm->mmap_sem);
1037 
1038 	/*
1039 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1040 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1041 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1042 	 * The pair of nodemasks 'to' and 'from' define the map.
1043 	 *
1044 	 * If no pair of bits is found that way, fall back to picking some
1045 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1046 	 * 'source' and 'dest' bits are the same, this represents a node
1047 	 * that will be migrating to itself, so no pages need move.
1048 	 *
1049 	 * If no bits are left in 'tmp', or if all remaining bits left
1050 	 * in 'tmp' correspond to the same bit in 'to', return false
1051 	 * (nothing left to migrate).
1052 	 *
1053 	 * This lets us pick a pair of nodes to migrate between, such that
1054 	 * if possible the dest node is not already occupied by some other
1055 	 * source node, minimizing the risk of overloading the memory on a
1056 	 * node that would happen if we migrated incoming memory to a node
1057 	 * before migrating the outgoing memory sourced from that same node.
1058 	 *
1059 	 * A single scan of tmp is sufficient.  As we go, we remember the
1060 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1061 	 * that not only moved, but what's better, moved to an empty slot
1062 	 * (d is not set in tmp), then we break out then, with that pair.
1063 	 * Otherwise when we finish scanning tmp, we at least have the
1064 	 * most recent <s, d> pair that moved.  If we get all the way through
1065 	 * the scan of tmp without finding any node that moved, much less
1066 	 * moved to an empty node, then there is nothing left worth migrating.
1067 	 */
1068 
1069 	tmp = *from;
1070 	while (!nodes_empty(tmp)) {
1071 		int s,d;
1072 		int source = NUMA_NO_NODE;
1073 		int dest = 0;
1074 
1075 		for_each_node_mask(s, tmp) {
1076 
1077 			/*
1078 			 * do_migrate_pages() tries to maintain the relative
1079 			 * node relationship of the pages established between
1080 			 * threads and memory areas.
1081 			 *
1082 			 * However, if the number of source nodes is not equal to
1083 			 * the number of destination nodes, we cannot preserve
1084 			 * this relative node relationship.  In that case, skip
1085 			 * copying memory from a node that is in the destination
1086 			 * mask.
1087 			 *
1088 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1089 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1090 			 */
1091 
1092 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1093 						(node_isset(s, *to)))
1094 				continue;
1095 
1096 			d = node_remap(s, *from, *to);
1097 			if (s == d)
1098 				continue;
1099 
1100 			source = s;	/* Node moved. Memorize */
1101 			dest = d;
1102 
1103 			/* dest not in remaining from nodes? */
1104 			if (!node_isset(dest, tmp))
1105 				break;
1106 		}
1107 		if (source == NUMA_NO_NODE)
1108 			break;
1109 
1110 		node_clear(source, tmp);
1111 		err = migrate_to_node(mm, source, dest, flags);
1112 		if (err > 0)
1113 			busy += err;
1114 		if (err < 0)
1115 			break;
1116 	}
1117 	up_read(&mm->mmap_sem);
1118 	if (err < 0)
1119 		return err;
1120 	return busy;
1121 
1122 }
1123 
1124 /*
1125  * Allocate a new page for page migration based on vma policy.
1126  * Start by assuming the page is mapped by the same vma that contains @start.
1127  * Search forward from there, if not.  N.B., this assumes that the
1128  * list of pages handed to migrate_pages()--which is how we get here--
1129  * is in virtual address order.
1130  */
1131 static struct page *new_page(struct page *page, unsigned long start, int **x)
1132 {
1133 	struct vm_area_struct *vma;
1134 	unsigned long uninitialized_var(address);
1135 
1136 	vma = find_vma(current->mm, start);
1137 	while (vma) {
1138 		address = page_address_in_vma(page, vma);
1139 		if (address != -EFAULT)
1140 			break;
1141 		vma = vma->vm_next;
1142 	}
1143 
1144 	if (PageHuge(page)) {
1145 		BUG_ON(!vma);
1146 		return alloc_huge_page_noerr(vma, address, 1);
1147 	}
1148 	/*
1149 	 * if !vma, alloc_page_vma() will use task or system default policy
1150 	 */
1151 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1152 }
1153 #else
1154 
1155 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1156 				unsigned long flags)
1157 {
1158 }
1159 
1160 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1161 		     const nodemask_t *to, int flags)
1162 {
1163 	return -ENOSYS;
1164 }
1165 
1166 static struct page *new_page(struct page *page, unsigned long start, int **x)
1167 {
1168 	return NULL;
1169 }
1170 #endif
1171 
1172 static long do_mbind(unsigned long start, unsigned long len,
1173 		     unsigned short mode, unsigned short mode_flags,
1174 		     nodemask_t *nmask, unsigned long flags)
1175 {
1176 	struct mm_struct *mm = current->mm;
1177 	struct mempolicy *new;
1178 	unsigned long end;
1179 	int err;
1180 	LIST_HEAD(pagelist);
1181 
1182 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1183 		return -EINVAL;
1184 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1185 		return -EPERM;
1186 
1187 	if (start & ~PAGE_MASK)
1188 		return -EINVAL;
1189 
1190 	if (mode == MPOL_DEFAULT)
1191 		flags &= ~MPOL_MF_STRICT;
1192 
1193 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1194 	end = start + len;
1195 
1196 	if (end < start)
1197 		return -EINVAL;
1198 	if (end == start)
1199 		return 0;
1200 
1201 	new = mpol_new(mode, mode_flags, nmask);
1202 	if (IS_ERR(new))
1203 		return PTR_ERR(new);
1204 
1205 	if (flags & MPOL_MF_LAZY)
1206 		new->flags |= MPOL_F_MOF;
1207 
1208 	/*
1209 	 * If we are using the default policy then operation
1210 	 * on discontinuous address spaces is okay after all
1211 	 */
1212 	if (!new)
1213 		flags |= MPOL_MF_DISCONTIG_OK;
1214 
1215 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1216 		 start, start + len, mode, mode_flags,
1217 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1218 
1219 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1220 
1221 		err = migrate_prep();
1222 		if (err)
1223 			goto mpol_out;
1224 	}
1225 	{
1226 		NODEMASK_SCRATCH(scratch);
1227 		if (scratch) {
1228 			down_write(&mm->mmap_sem);
1229 			task_lock(current);
1230 			err = mpol_set_nodemask(new, nmask, scratch);
1231 			task_unlock(current);
1232 			if (err)
1233 				up_write(&mm->mmap_sem);
1234 		} else
1235 			err = -ENOMEM;
1236 		NODEMASK_SCRATCH_FREE(scratch);
1237 	}
1238 	if (err)
1239 		goto mpol_out;
1240 
1241 	err = queue_pages_range(mm, start, end, nmask,
1242 			  flags | MPOL_MF_INVERT, &pagelist);
1243 	if (!err)
1244 		err = mbind_range(mm, start, end, new);
1245 
1246 	if (!err) {
1247 		int nr_failed = 0;
1248 
1249 		if (!list_empty(&pagelist)) {
1250 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1251 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1252 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1253 			if (nr_failed)
1254 				putback_movable_pages(&pagelist);
1255 		}
1256 
1257 		if (nr_failed && (flags & MPOL_MF_STRICT))
1258 			err = -EIO;
1259 	} else
1260 		putback_movable_pages(&pagelist);
1261 
1262 	up_write(&mm->mmap_sem);
1263  mpol_out:
1264 	mpol_put(new);
1265 	return err;
1266 }
1267 
1268 /*
1269  * User space interface with variable-sized bitmaps for nodelists.
1270  */
1271 
1272 /* Copy a node mask from user space. */
1273 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1274 		     unsigned long maxnode)
1275 {
1276 	unsigned long k;
1277 	unsigned long nlongs;
1278 	unsigned long endmask;
1279 
1280 	--maxnode;
1281 	nodes_clear(*nodes);
1282 	if (maxnode == 0 || !nmask)
1283 		return 0;
1284 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1285 		return -EINVAL;
1286 
1287 	nlongs = BITS_TO_LONGS(maxnode);
1288 	if ((maxnode % BITS_PER_LONG) == 0)
1289 		endmask = ~0UL;
1290 	else
1291 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1292 
1293 	/* When the user specifies more nodes than supported, just check
1294 	   that the unsupported part is all zero. */
1295 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1296 		if (nlongs > PAGE_SIZE/sizeof(long))
1297 			return -EINVAL;
1298 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1299 			unsigned long t;
1300 			if (get_user(t, nmask + k))
1301 				return -EFAULT;
1302 			if (k == nlongs - 1) {
1303 				if (t & endmask)
1304 					return -EINVAL;
1305 			} else if (t)
1306 				return -EINVAL;
1307 		}
1308 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1309 		endmask = ~0UL;
1310 	}
1311 
1312 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1313 		return -EFAULT;
1314 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1315 	return 0;
1316 }
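
/*
 * Example (illustrative): a caller passing maxnode == 17 describes a
 * 16-bit mask after the --maxnode above, so nlongs == 1 and
 * endmask == (1UL << 16) - 1; any bit at position 16 or above in the
 * word copied from user space is cleared by the final '&= endmask'.
 */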
1317 
1318 /* Copy a kernel node mask to user space */
1319 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1320 			      nodemask_t *nodes)
1321 {
1322 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1323 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1324 
1325 	if (copy > nbytes) {
1326 		if (copy > PAGE_SIZE)
1327 			return -EINVAL;
1328 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1329 			return -EFAULT;
1330 		copy = nbytes;
1331 	}
1332 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1333 }
1334 
1335 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1336 		unsigned long, mode, const unsigned long __user *, nmask,
1337 		unsigned long, maxnode, unsigned, flags)
1338 {
1339 	nodemask_t nodes;
1340 	int err;
1341 	unsigned short mode_flags;
1342 
1343 	mode_flags = mode & MPOL_MODE_FLAGS;
1344 	mode &= ~MPOL_MODE_FLAGS;
1345 	if (mode >= MPOL_MAX)
1346 		return -EINVAL;
1347 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1348 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1349 		return -EINVAL;
1350 	err = get_nodes(&nodes, nmask, maxnode);
1351 	if (err)
1352 		return err;
1353 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1354 }
1355 
1356 /* Set the process memory policy */
1357 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1358 		unsigned long, maxnode)
1359 {
1360 	int err;
1361 	nodemask_t nodes;
1362 	unsigned short flags;
1363 
1364 	flags = mode & MPOL_MODE_FLAGS;
1365 	mode &= ~MPOL_MODE_FLAGS;
1366 	if ((unsigned int)mode >= MPOL_MAX)
1367 		return -EINVAL;
1368 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1369 		return -EINVAL;
1370 	err = get_nodes(&nodes, nmask, maxnode);
1371 	if (err)
1372 		return err;
1373 	return do_set_mempolicy(mode, flags, &nodes);
1374 }
1375 
1376 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1377 		const unsigned long __user *, old_nodes,
1378 		const unsigned long __user *, new_nodes)
1379 {
1380 	const struct cred *cred = current_cred(), *tcred;
1381 	struct mm_struct *mm = NULL;
1382 	struct task_struct *task;
1383 	nodemask_t task_nodes;
1384 	int err;
1385 	nodemask_t *old;
1386 	nodemask_t *new;
1387 	NODEMASK_SCRATCH(scratch);
1388 
1389 	if (!scratch)
1390 		return -ENOMEM;
1391 
1392 	old = &scratch->mask1;
1393 	new = &scratch->mask2;
1394 
1395 	err = get_nodes(old, old_nodes, maxnode);
1396 	if (err)
1397 		goto out;
1398 
1399 	err = get_nodes(new, new_nodes, maxnode);
1400 	if (err)
1401 		goto out;
1402 
1403 	/* Find the mm_struct */
1404 	rcu_read_lock();
1405 	task = pid ? find_task_by_vpid(pid) : current;
1406 	if (!task) {
1407 		rcu_read_unlock();
1408 		err = -ESRCH;
1409 		goto out;
1410 	}
1411 	get_task_struct(task);
1412 
1413 	err = -EINVAL;
1414 
1415 	/*
1416 	 * Check if this process has the right to modify the specified
1417 	 * process. The right exists if the process has administrative
1418 	 * capabilities, superuser privileges or the same
1419 	 * userid as the target process.
1420 	 */
1421 	tcred = __task_cred(task);
1422 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1423 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1424 	    !capable(CAP_SYS_NICE)) {
1425 		rcu_read_unlock();
1426 		err = -EPERM;
1427 		goto out_put;
1428 	}
1429 	rcu_read_unlock();
1430 
1431 	task_nodes = cpuset_mems_allowed(task);
1432 	/* Is the user allowed to access the target nodes? */
1433 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1434 		err = -EPERM;
1435 		goto out_put;
1436 	}
1437 
1438 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1439 		err = -EINVAL;
1440 		goto out_put;
1441 	}
1442 
1443 	err = security_task_movememory(task);
1444 	if (err)
1445 		goto out_put;
1446 
1447 	mm = get_task_mm(task);
1448 	put_task_struct(task);
1449 
1450 	if (!mm) {
1451 		err = -EINVAL;
1452 		goto out;
1453 	}
1454 
1455 	err = do_migrate_pages(mm, old, new,
1456 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1457 
1458 	mmput(mm);
1459 out:
1460 	NODEMASK_SCRATCH_FREE(scratch);
1461 
1462 	return err;
1463 
1464 out_put:
1465 	put_task_struct(task);
1466 	goto out;
1467 
1468 }
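
/*
 * Illustrative user-space sketch (assumes the migrate_pages() wrapper from
 * numactl's <numaif.h>): move whatever the target process has on node 0
 * over to node 1.  A positive return value is the number of pages that
 * could not be moved.
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long busy = migrate_pages(pid, 8 * sizeof(from) + 1, &from, &to);
 */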
1469 
1470 
1471 /* Retrieve NUMA policy */
1472 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1473 		unsigned long __user *, nmask, unsigned long, maxnode,
1474 		unsigned long, addr, unsigned long, flags)
1475 {
1476 	int err;
1477 	int uninitialized_var(pval);
1478 	nodemask_t nodes;
1479 
1480 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1481 		return -EINVAL;
1482 
1483 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1484 
1485 	if (err)
1486 		return err;
1487 
1488 	if (policy && put_user(pval, policy))
1489 		return -EFAULT;
1490 
1491 	if (nmask)
1492 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1493 
1494 	return err;
1495 }
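
/*
 * Illustrative user-space sketch (assumes the get_mempolicy() wrapper from
 * numactl's <numaif.h>; 'addr' is a placeholder): combining MPOL_F_ADDR
 * with MPOL_F_NODE makes the call report which node currently backs a
 * given address instead of returning a policy mode.
 *
 *	int node;
 *	long rc = get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *	// rc == 0: 'node' now holds the id of the node backing 'addr'
 */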
1496 
1497 #ifdef CONFIG_COMPAT
1498 
1499 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1500 		       compat_ulong_t __user *, nmask,
1501 		       compat_ulong_t, maxnode,
1502 		       compat_ulong_t, addr, compat_ulong_t, flags)
1503 {
1504 	long err;
1505 	unsigned long __user *nm = NULL;
1506 	unsigned long nr_bits, alloc_size;
1507 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1508 
1509 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1510 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1511 
1512 	if (nmask)
1513 		nm = compat_alloc_user_space(alloc_size);
1514 
1515 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1516 
1517 	if (!err && nmask) {
1518 		unsigned long copy_size;
1519 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1520 		err = copy_from_user(bm, nm, copy_size);
1521 		/* ensure entire bitmap is zeroed */
1522 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1523 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1524 	}
1525 
1526 	return err;
1527 }
1528 
1529 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1530 		       compat_ulong_t, maxnode)
1531 {
1532 	long err = 0;
1533 	unsigned long __user *nm = NULL;
1534 	unsigned long nr_bits, alloc_size;
1535 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1536 
1537 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1538 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1539 
1540 	if (nmask) {
1541 		err = compat_get_bitmap(bm, nmask, nr_bits);
1542 		nm = compat_alloc_user_space(alloc_size);
1543 		err |= copy_to_user(nm, bm, alloc_size);
1544 	}
1545 
1546 	if (err)
1547 		return -EFAULT;
1548 
1549 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1550 }
1551 
1552 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1553 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1554 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1555 {
1556 	long err = 0;
1557 	unsigned long __user *nm = NULL;
1558 	unsigned long nr_bits, alloc_size;
1559 	nodemask_t bm;
1560 
1561 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1562 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1563 
1564 	if (nmask) {
1565 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1566 		nm = compat_alloc_user_space(alloc_size);
1567 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1568 	}
1569 
1570 	if (err)
1571 		return -EFAULT;
1572 
1573 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1574 }
1575 
1576 #endif
1577 
1578 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1579 						unsigned long addr)
1580 {
1581 	struct mempolicy *pol = NULL;
1582 
1583 	if (vma) {
1584 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1585 			pol = vma->vm_ops->get_policy(vma, addr);
1586 		} else if (vma->vm_policy) {
1587 			pol = vma->vm_policy;
1588 
1589 			/*
1590 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1591 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1592 			 * count on these policies, which will be dropped by
1593 			 * mpol_cond_put() later.
1594 			 */
1595 			if (mpol_needs_cond_ref(pol))
1596 				mpol_get(pol);
1597 		}
1598 	}
1599 
1600 	return pol;
1601 }
1602 
1603 /*
1604  * get_vma_policy(@vma, @addr)
1605  * @vma: virtual memory area whose policy is sought
1606  * @addr: address in @vma for shared policy lookup
1607  *
1608  * Returns effective policy for a VMA at specified address.
1609  * Falls back to current->mempolicy or system default policy, as necessary.
1610  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1611  * count--added by the get_policy() vm_op, as appropriate--to protect against
1612  * freeing by another task.  It is the caller's responsibility to free the
1613  * extra reference for shared policies.
1614  */
1615 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1616 						unsigned long addr)
1617 {
1618 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1619 
1620 	if (!pol)
1621 		pol = get_task_policy(current);
1622 
1623 	return pol;
1624 }
1625 
1626 bool vma_policy_mof(struct vm_area_struct *vma)
1627 {
1628 	struct mempolicy *pol;
1629 
1630 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1631 		bool ret = false;
1632 
1633 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1634 		if (pol && (pol->flags & MPOL_F_MOF))
1635 			ret = true;
1636 		mpol_cond_put(pol);
1637 
1638 		return ret;
1639 	}
1640 
1641 	pol = vma->vm_policy;
1642 	if (!pol)
1643 		pol = get_task_policy(current);
1644 
1645 	return pol->flags & MPOL_F_MOF;
1646 }
1647 
1648 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1649 {
1650 	enum zone_type dynamic_policy_zone = policy_zone;
1651 
1652 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1653 
1654 	/*
1655 	 * If policy->v.nodes has movable memory only,
1656 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1657 	 *
1658 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1659 	 * so if the following test fails, it implies
1660 	 * policy->v.nodes has movable memory only.
1661 	 */
1662 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1663 		dynamic_policy_zone = ZONE_MOVABLE;
1664 
1665 	return zone >= dynamic_policy_zone;
1666 }
1667 
1668 /*
1669  * Return a nodemask representing a mempolicy for filtering nodes for
1670  * page allocation
1671  */
1672 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1673 {
1674 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1675 	if (unlikely(policy->mode == MPOL_BIND) &&
1676 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1677 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1678 		return &policy->v.nodes;
1679 
1680 	return NULL;
1681 }
1682 
1683 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1684 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1685 	int nd)
1686 {
1687 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1688 		nd = policy->v.preferred_node;
1689 	else {
1690 		/*
1691 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1692 		 * because we might easily break the expectation to stay on the
1693 		 * requested node and not break the policy.
1694 		 */
1695 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1696 	}
1697 
1698 	return node_zonelist(nd, gfp);
1699 }
1700 
1701 /* Do dynamic interleaving for a process */
1702 static unsigned interleave_nodes(struct mempolicy *policy)
1703 {
1704 	unsigned nid, next;
1705 	struct task_struct *me = current;
1706 
1707 	nid = me->il_next;
1708 	next = next_node_in(nid, policy->v.nodes);
1709 	if (next < MAX_NUMNODES)
1710 		me->il_next = next;
1711 	return nid;
1712 }
1713 
1714 /*
1715  * Depending on the memory policy provide a node from which to allocate the
1716  * next slab entry.
1717  */
1718 unsigned int mempolicy_slab_node(void)
1719 {
1720 	struct mempolicy *policy;
1721 	int node = numa_mem_id();
1722 
1723 	if (in_interrupt())
1724 		return node;
1725 
1726 	policy = current->mempolicy;
1727 	if (!policy || policy->flags & MPOL_F_LOCAL)
1728 		return node;
1729 
1730 	switch (policy->mode) {
1731 	case MPOL_PREFERRED:
1732 		/*
1733 		 * handled MPOL_F_LOCAL above
1734 		 */
1735 		return policy->v.preferred_node;
1736 
1737 	case MPOL_INTERLEAVE:
1738 		return interleave_nodes(policy);
1739 
1740 	case MPOL_BIND: {
1741 		struct zoneref *z;
1742 
1743 		/*
1744 		 * Follow bind policy behavior and start allocation at the
1745 		 * first node.
1746 		 */
1747 		struct zonelist *zonelist;
1748 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1749 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1750 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1751 							&policy->v.nodes);
1752 		return z->zone ? z->zone->node : node;
1753 	}
1754 
1755 	default:
1756 		BUG();
1757 	}
1758 }
1759 
1760 /*
1761  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1762  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1763  * number of present nodes.
1764  */
1765 static unsigned offset_il_node(struct mempolicy *pol,
1766 			       struct vm_area_struct *vma, unsigned long n)
1767 {
1768 	unsigned nnodes = nodes_weight(pol->v.nodes);
1769 	unsigned target;
1770 	int i;
1771 	int nid;
1772 
1773 	if (!nnodes)
1774 		return numa_node_id();
1775 	target = (unsigned int)n % nnodes;
1776 	nid = first_node(pol->v.nodes);
1777 	for (i = 0; i < target; i++)
1778 		nid = next_node(nid, pol->v.nodes);
1779 	return nid;
1780 }
1781 
1782 /* Determine a node number for interleave */
1783 static inline unsigned interleave_nid(struct mempolicy *pol,
1784 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1785 {
1786 	if (vma) {
1787 		unsigned long off;
1788 
1789 		/*
1790 		 * for small pages, there is no difference between
1791 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1792 		 * for huge pages, since vm_pgoff is in units of small
1793 		 * pages, we need to shift off the always 0 bits to get
1794 		 * a useful offset.
1795 		 */
1796 		BUG_ON(shift < PAGE_SHIFT);
1797 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1798 		off += (addr - vma->vm_start) >> shift;
1799 		return offset_il_node(pol, vma, off);
1800 	} else
1801 		return interleave_nodes(pol);
1802 }
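
/*
 * Worked example (illustrative): for a hugetlb VMA with 2MB pages
 * (shift == 21) and vm_pgoff == 0, an address 6MB past vm_start yields
 * off == 3; interleaved over three nodes, offset_il_node() picks
 * 3 % 3 == 0, i.e. the first node in pol->v.nodes.
 */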
1803 
1804 #ifdef CONFIG_HUGETLBFS
1805 /*
1806  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1807  * @vma: virtual memory area whose policy is sought
1808  * @addr: address in @vma for shared policy lookup and interleave policy
1809  * @gfp_flags: for requested zone
1810  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1811  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1812  *
1813  * Returns a zonelist suitable for a huge page allocation and a pointer
1814  * to the struct mempolicy for conditional unref after allocation.
1815  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1816  * @nodemask for filtering the zonelist.
1817  *
1818  * Must be protected by read_mems_allowed_begin()
1819  */
1820 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1821 				gfp_t gfp_flags, struct mempolicy **mpol,
1822 				nodemask_t **nodemask)
1823 {
1824 	struct zonelist *zl;
1825 
1826 	*mpol = get_vma_policy(vma, addr);
1827 	*nodemask = NULL;	/* assume !MPOL_BIND */
1828 
1829 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1830 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1831 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1832 	} else {
1833 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1834 		if ((*mpol)->mode == MPOL_BIND)
1835 			*nodemask = &(*mpol)->v.nodes;
1836 	}
1837 	return zl;
1838 }
1839 
1840 /*
1841  * init_nodemask_of_mempolicy
1842  *
1843  * If the current task's mempolicy is "default" [NULL], return 'false'
1844  * to indicate default policy.  Otherwise, extract the policy nodemask
1845  * for 'bind' or 'interleave' policy into the argument nodemask, or
1846  * initialize the argument nodemask to contain the single node for
1847  * 'preferred' or 'local' policy and return 'true' to indicate presence
1848  * of non-default mempolicy.
1849  *
1850  * We don't bother with reference counting the mempolicy [mpol_get/put]
1851  * because the current task is examining its own mempolicy and a task's
1852  * mempolicy is only ever changed by the task itself.
1853  *
1854  * N.B., it is the caller's responsibility to free a returned nodemask.
1855  */
1856 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1857 {
1858 	struct mempolicy *mempolicy;
1859 	int nid;
1860 
1861 	if (!(mask && current->mempolicy))
1862 		return false;
1863 
1864 	task_lock(current);
1865 	mempolicy = current->mempolicy;
1866 	switch (mempolicy->mode) {
1867 	case MPOL_PREFERRED:
1868 		if (mempolicy->flags & MPOL_F_LOCAL)
1869 			nid = numa_node_id();
1870 		else
1871 			nid = mempolicy->v.preferred_node;
1872 		init_nodemask_of_node(mask, nid);
1873 		break;
1874 
1875 	case MPOL_BIND:
1876 		/* Fall through */
1877 	case MPOL_INTERLEAVE:
1878 		*mask = mempolicy->v.nodes;
1879 		break;
1880 
1881 	default:
1882 		BUG();
1883 	}
1884 	task_unlock(current);
1885 
1886 	return true;
1887 }
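
/*
 * Usage sketch (illustrative; the hugetlb-pool caller is an assumption): a
 * sysfs/sysctl handler that resizes a huge page pool and wants to respect
 * the current task's mempolicy could do:
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed)) {
 *		... adjust the pool only on nodes in *nodes_allowed ...
 *	} else {
 *		... fall back to node_states[N_MEMORY] ...
 *	}
 *	NODEMASK_FREE(nodes_allowed);	// caller frees, see note above
 */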
1888 #endif
1889 
1890 /*
1891  * mempolicy_nodemask_intersects
1892  *
1893  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1894  * policy.  Otherwise, check for intersection between mask and the policy
1895  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1896  * policy, always return true since it may allocate elsewhere on fallback.
1897  *
1898  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1899  */
1900 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1901 					const nodemask_t *mask)
1902 {
1903 	struct mempolicy *mempolicy;
1904 	bool ret = true;
1905 
1906 	if (!mask)
1907 		return ret;
1908 	task_lock(tsk);
1909 	mempolicy = tsk->mempolicy;
1910 	if (!mempolicy)
1911 		goto out;
1912 
1913 	switch (mempolicy->mode) {
1914 	case MPOL_PREFERRED:
1915 		/*
1916 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1917 		 * to allocate from; the task may fall back to other nodes when
1918 		 * OOM occurs. Thus, it's possible for tsk to have allocated
1919 		 * memory from nodes in mask.
1920 		 */
1921 		break;
1922 	case MPOL_BIND:
1923 	case MPOL_INTERLEAVE:
1924 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1925 		break;
1926 	default:
1927 		BUG();
1928 	}
1929 out:
1930 	task_unlock(tsk);
1931 	return ret;
1932 }
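
/*
 * Usage sketch (illustrative; the OOM-killer caller is an assumption): when
 * an allocation constrained by a mempolicy runs out of memory, candidate
 * victims whose policy cannot place memory on the affected nodes can be
 * skipped:
 *
 *	if (!mempolicy_nodemask_intersects(tsk, oom_nodemask))
 *		continue;	// killing tsk would not free memory on these nodes
 */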
1933 
1934 /* Allocate a page in interleaved policy.
1935    Own path because it needs to do special accounting. */
1936 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1937 					unsigned nid)
1938 {
1939 	struct zonelist *zl;
1940 	struct page *page;
1941 
1942 	zl = node_zonelist(nid, gfp);
1943 	page = __alloc_pages(gfp, order, zl);
1944 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1945 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1946 	return page;
1947 }
1948 
1949 /**
1950  * 	alloc_pages_vma	- Allocate a page for a VMA.
1951  *
1952  * 	@gfp:
1953  *      %GFP_USER    user allocation.
1954  *      %GFP_KERNEL  kernel allocations,
1955  *      %GFP_HIGHMEM highmem/user allocations,
1956  *      %GFP_FS      allocation should not call back into a file system.
1957  *      %GFP_ATOMIC  don't sleep.
1958  *
1959  *	@order: Order of the GFP allocation.
1960  * 	@vma:  Pointer to VMA or NULL if not available.
1961  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1962  *	@node: Which node to prefer for allocation (modulo policy).
1963  *	@hugepage: for hugepages try only the preferred node if possible
1964  *
1965  * 	This function allocates a page from the kernel page pool and applies
1966  *	a NUMA policy associated with the VMA or the current process.
1967  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1968  *	mm_struct of the VMA to prevent it from going away. Should be used for
1969  *	all allocations for pages that will be mapped into user space. Returns
1970  *	NULL when no page can be allocated.
1971  */
1972 struct page *
1973 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1974 		unsigned long addr, int node, bool hugepage)
1975 {
1976 	struct mempolicy *pol;
1977 	struct page *page;
1978 	unsigned int cpuset_mems_cookie;
1979 	struct zonelist *zl;
1980 	nodemask_t *nmask;
1981 
1982 retry_cpuset:
1983 	pol = get_vma_policy(vma, addr);
1984 	cpuset_mems_cookie = read_mems_allowed_begin();
1985 
1986 	if (pol->mode == MPOL_INTERLEAVE) {
1987 		unsigned nid;
1988 
1989 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1990 		mpol_cond_put(pol);
1991 		page = alloc_page_interleave(gfp, order, nid);
1992 		goto out;
1993 	}
1994 
1995 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1996 		int hpage_node = node;
1997 
1998 		/*
1999 		 * For hugepage allocation and non-interleave policy which
2000 		 * allows the current node (or other explicitly preferred
2001 		 * node) we only try to allocate from the current/preferred
2002 		 * node and don't fall back to other nodes, as the cost of
2003 		 * remote accesses would likely offset THP benefits.
2004 		 *
2005 		 * If the policy is interleave, or does not allow the current
2006 		 * node in its nodemask, we allocate the standard way.
2007 		 */
2008 		if (pol->mode == MPOL_PREFERRED &&
2009 						!(pol->flags & MPOL_F_LOCAL))
2010 			hpage_node = pol->v.preferred_node;
2011 
2012 		nmask = policy_nodemask(gfp, pol);
2013 		if (!nmask || node_isset(hpage_node, *nmask)) {
2014 			mpol_cond_put(pol);
2015 			page = __alloc_pages_node(hpage_node,
2016 						gfp | __GFP_THISNODE, order);
2017 			goto out;
2018 		}
2019 	}
2020 
2021 	nmask = policy_nodemask(gfp, pol);
2022 	zl = policy_zonelist(gfp, pol, node);
2023 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2024 	mpol_cond_put(pol);
2025 out:
2026 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2027 		goto retry_cpuset;
2028 	return page;
2029 }
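
/*
 * Usage sketch (illustrative): an anonymous-fault handler holding
 * down_read(&mm->mmap_sem) would allocate the new user page roughly as:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0,
 *			       vma, address, numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */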
2030 
2031 /**
2032  * 	alloc_pages_current - Allocate pages.
2033  *
2034  *	@gfp:
2035  *		%GFP_USER   user allocation,
2036  *		%GFP_KERNEL kernel allocation,
2037  *		%GFP_HIGHMEM highmem allocation,
2038  *		%GFP_FS     don't call back into a file system.
2039  *		%GFP_ATOMIC don't sleep.
2040  *	@order: Power of two of allocation size in pages. 0 is a single page.
2041  *
2042  *	Allocate a page from the kernel page pool.  When not in
2043  *	interrupt context, apply the current process' NUMA policy.
2044  *	Returns NULL when no page can be allocated.
2045  *
2046  *	Don't call cpuset_update_task_memory_state() unless
2047  *	1) it's ok to take cpuset_sem (can WAIT), and
2048  *	2) allocating for current task (not interrupt).
2049  */
2050 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2051 {
2052 	struct mempolicy *pol = &default_policy;
2053 	struct page *page;
2054 	unsigned int cpuset_mems_cookie;
2055 
2056 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2057 		pol = get_task_policy(current);
2058 
2059 retry_cpuset:
2060 	cpuset_mems_cookie = read_mems_allowed_begin();
2061 
2062 	/*
2063 	 * No reference counting needed for current->mempolicy
2064 	 * nor system default_policy
2065 	 */
2066 	if (pol->mode == MPOL_INTERLEAVE)
2067 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2068 	else
2069 		page = __alloc_pages_nodemask(gfp, order,
2070 				policy_zonelist(gfp, pol, numa_node_id()),
2071 				policy_nodemask(gfp, pol));
2072 
2073 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2074 		goto retry_cpuset;
2075 
2076 	return page;
2077 }
2078 EXPORT_SYMBOL(alloc_pages_current);
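
/*
 * Note with example (illustrative): under CONFIG_NUMA the generic
 * alloc_pages(gfp, order) helper expands to alloc_pages_current(), so a
 * plain kernel allocation such as
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	// 4 contiguous pages
 *	...
 *	__free_pages(p, 2);
 *
 * already honours the calling task's mempolicy, unless __GFP_THISNODE is set
 * or the caller is in interrupt context (checked above).
 */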
2079 
2080 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2081 {
2082 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2083 
2084 	if (IS_ERR(pol))
2085 		return PTR_ERR(pol);
2086 	dst->vm_policy = pol;
2087 	return 0;
2088 }
2089 
2090 /*
2091  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2092  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2093  * with the mems_allowed returned by cpuset_mems_allowed().  This
2094  * keeps mempolicies cpuset relative after its cpuset moves.  See
2095  * further kernel/cpuset.c update_nodemask().
2096  *
2097  * current's mempolicy may be rebound by another task (the task that changes
2098  * the cpuset's mems), so we needn't do rebind work for the current task.
2099  */
2100 
2101 /* Slow path of a mempolicy duplicate */
2102 struct mempolicy *__mpol_dup(struct mempolicy *old)
2103 {
2104 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2105 
2106 	if (!new)
2107 		return ERR_PTR(-ENOMEM);
2108 
2109 	/* task's mempolicy is protected by alloc_lock */
2110 	if (old == current->mempolicy) {
2111 		task_lock(current);
2112 		*new = *old;
2113 		task_unlock(current);
2114 	} else
2115 		*new = *old;
2116 
2117 	if (current_cpuset_is_being_rebound()) {
2118 		nodemask_t mems = cpuset_mems_allowed(current);
2119 		if (new->flags & MPOL_F_REBINDING)
2120 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2121 		else
2122 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2123 	}
2124 	atomic_set(&new->refcnt, 1);
2125 	return new;
2126 }
2127 
2128 /* Slow path of a mempolicy comparison */
2129 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2130 {
2131 	if (!a || !b)
2132 		return false;
2133 	if (a->mode != b->mode)
2134 		return false;
2135 	if (a->flags != b->flags)
2136 		return false;
2137 	if (mpol_store_user_nodemask(a))
2138 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2139 			return false;
2140 
2141 	switch (a->mode) {
2142 	case MPOL_BIND:
2143 		/* Fall through */
2144 	case MPOL_INTERLEAVE:
2145 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2146 	case MPOL_PREFERRED:
2147 		return a->v.preferred_node == b->v.preferred_node;
2148 	default:
2149 		BUG();
2150 		return false;
2151 	}
2152 }
2153 
2154 /*
2155  * Shared memory backing store policy support.
2156  *
2157  * Remember policies even when nobody has shared memory mapped.
2158  * The policies are kept in Red-Black tree linked from the inode.
2159  * They are protected by the sp->lock rwlock, which should be held
2160  * for any accesses to the tree.
2161  */
2162 
2163 /*
2164  * lookup first element intersecting start-end.  Caller holds sp->lock for
2165  * reading or for writing
2166  */
2167 static struct sp_node *
2168 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2169 {
2170 	struct rb_node *n = sp->root.rb_node;
2171 
2172 	while (n) {
2173 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2174 
2175 		if (start >= p->end)
2176 			n = n->rb_right;
2177 		else if (end <= p->start)
2178 			n = n->rb_left;
2179 		else
2180 			break;
2181 	}
2182 	if (!n)
2183 		return NULL;
2184 	for (;;) {
2185 		struct sp_node *w = NULL;
2186 		struct rb_node *prev = rb_prev(n);
2187 		if (!prev)
2188 			break;
2189 		w = rb_entry(prev, struct sp_node, nd);
2190 		if (w->end <= start)
2191 			break;
2192 		n = prev;
2193 	}
2194 	return rb_entry(n, struct sp_node, nd);
2195 }
2196 
2197 /*
2198  * Insert a new shared policy into the list.  Caller holds sp->lock for
2199  * writing.
2200  */
2201 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2202 {
2203 	struct rb_node **p = &sp->root.rb_node;
2204 	struct rb_node *parent = NULL;
2205 	struct sp_node *nd;
2206 
2207 	while (*p) {
2208 		parent = *p;
2209 		nd = rb_entry(parent, struct sp_node, nd);
2210 		if (new->start < nd->start)
2211 			p = &(*p)->rb_left;
2212 		else if (new->end > nd->end)
2213 			p = &(*p)->rb_right;
2214 		else
2215 			BUG();
2216 	}
2217 	rb_link_node(&new->nd, parent, p);
2218 	rb_insert_color(&new->nd, &sp->root);
2219 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2220 		 new->policy ? new->policy->mode : 0);
2221 }
2222 
2223 /* Find shared policy intersecting idx */
2224 struct mempolicy *
2225 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2226 {
2227 	struct mempolicy *pol = NULL;
2228 	struct sp_node *sn;
2229 
2230 	if (!sp->root.rb_node)
2231 		return NULL;
2232 	read_lock(&sp->lock);
2233 	sn = sp_lookup(sp, idx, idx+1);
2234 	if (sn) {
2235 		mpol_get(sn->policy);
2236 		pol = sn->policy;
2237 	}
2238 	read_unlock(&sp->lock);
2239 	return pol;
2240 }
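
/*
 * Usage sketch (illustrative; the tmpfs caller is an assumption): a shared
 * memory filesystem can look up the per-index policy when allocating a page
 * for the object and must drop the reference it was handed:
 *
 *	pol = mpol_shared_policy_lookup(&info->policy, index);
 *	if (pol) {
 *		... allocate according to pol ...
 *		mpol_cond_put(pol);	// shared policies carry MPOL_F_SHARED
 *	}
 */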
2241 
2242 static void sp_free(struct sp_node *n)
2243 {
2244 	mpol_put(n->policy);
2245 	kmem_cache_free(sn_cache, n);
2246 }
2247 
2248 /**
2249  * mpol_misplaced - check whether current page node is valid in policy
2250  *
2251  * @page: page to be checked
2252  * @vma: vm area where page mapped
2253  * @addr: virtual address where page mapped
2254  *
2255  * Lookup current policy node id for vma,addr and "compare to" page's
2256  * node id.
2257  *
2258  * Returns:
2259  *	-1	- not misplaced, page is in the right node
2260  *	node	- node id where the page should be
2261  *
2262  * Policy determination "mimics" alloc_page_vma().
2263  * Called from fault path where we know the vma and faulting address.
2264  */
2265 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2266 {
2267 	struct mempolicy *pol;
2268 	struct zoneref *z;
2269 	int curnid = page_to_nid(page);
2270 	unsigned long pgoff;
2271 	int thiscpu = raw_smp_processor_id();
2272 	int thisnid = cpu_to_node(thiscpu);
2273 	int polnid = -1;
2274 	int ret = -1;
2275 
2276 	BUG_ON(!vma);
2277 
2278 	pol = get_vma_policy(vma, addr);
2279 	if (!(pol->flags & MPOL_F_MOF))
2280 		goto out;
2281 
2282 	switch (pol->mode) {
2283 	case MPOL_INTERLEAVE:
2284 		BUG_ON(addr >= vma->vm_end);
2285 		BUG_ON(addr < vma->vm_start);
2286 
2287 		pgoff = vma->vm_pgoff;
2288 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2289 		polnid = offset_il_node(pol, vma, pgoff);
2290 		break;
2291 
2292 	case MPOL_PREFERRED:
2293 		if (pol->flags & MPOL_F_LOCAL)
2294 			polnid = numa_node_id();
2295 		else
2296 			polnid = pol->v.preferred_node;
2297 		break;
2298 
2299 	case MPOL_BIND:
2300 
2301 		/*
2302 		 * allows binding to multiple nodes.
2303 		 * use current page if in policy nodemask,
2304 		 * else select nearest allowed node, if any.
2305 		 * If no allowed nodes, use current [!misplaced].
2306 		 */
2307 		if (node_isset(curnid, pol->v.nodes))
2308 			goto out;
2309 		z = first_zones_zonelist(
2310 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2311 				gfp_zone(GFP_HIGHUSER),
2312 				&pol->v.nodes);
2313 		polnid = z->zone->node;
2314 		break;
2315 
2316 	default:
2317 		BUG();
2318 	}
2319 
2320 	/* Migrate the page towards the node whose CPU is referencing it */
2321 	if (pol->flags & MPOL_F_MORON) {
2322 		polnid = thisnid;
2323 
2324 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2325 			goto out;
2326 	}
2327 
2328 	if (curnid != polnid)
2329 		ret = polnid;
2330 out:
2331 	mpol_cond_put(pol);
2332 
2333 	return ret;
2334 }
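
/*
 * Usage sketch (illustrative; the caller shown is an assumption): the NUMA
 * hinting fault path can use the return value directly as a migration
 * target:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		... try to migrate the page towards target_nid ...
 */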
2335 
2336 /*
2337  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2338  * dropped after task->mempolicy is set to NULL so that any allocation done as
2339  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2340  * policy.
2341  */
2342 void mpol_put_task_policy(struct task_struct *task)
2343 {
2344 	struct mempolicy *pol;
2345 
2346 	task_lock(task);
2347 	pol = task->mempolicy;
2348 	task->mempolicy = NULL;
2349 	task_unlock(task);
2350 	mpol_put(pol);
2351 }
2352 
2353 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2354 {
2355 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2356 	rb_erase(&n->nd, &sp->root);
2357 	sp_free(n);
2358 }
2359 
2360 static void sp_node_init(struct sp_node *node, unsigned long start,
2361 			unsigned long end, struct mempolicy *pol)
2362 {
2363 	node->start = start;
2364 	node->end = end;
2365 	node->policy = pol;
2366 }
2367 
2368 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2369 				struct mempolicy *pol)
2370 {
2371 	struct sp_node *n;
2372 	struct mempolicy *newpol;
2373 
2374 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2375 	if (!n)
2376 		return NULL;
2377 
2378 	newpol = mpol_dup(pol);
2379 	if (IS_ERR(newpol)) {
2380 		kmem_cache_free(sn_cache, n);
2381 		return NULL;
2382 	}
2383 	newpol->flags |= MPOL_F_SHARED;
2384 	sp_node_init(n, start, end, newpol);
2385 
2386 	return n;
2387 }
2388 
2389 /* Replace a policy range. */
2390 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2391 				 unsigned long end, struct sp_node *new)
2392 {
2393 	struct sp_node *n;
2394 	struct sp_node *n_new = NULL;
2395 	struct mempolicy *mpol_new = NULL;
2396 	int ret = 0;
2397 
2398 restart:
2399 	write_lock(&sp->lock);
2400 	n = sp_lookup(sp, start, end);
2401 	/* Take care of old policies in the same range. */
2402 	while (n && n->start < end) {
2403 		struct rb_node *next = rb_next(&n->nd);
2404 		if (n->start >= start) {
2405 			if (n->end <= end)
2406 				sp_delete(sp, n);
2407 			else
2408 				n->start = end;
2409 		} else {
2410 			/* Old policy spanning whole new range. */
2411 			if (n->end > end) {
2412 				if (!n_new)
2413 					goto alloc_new;
2414 
2415 				*mpol_new = *n->policy;
2416 				atomic_set(&mpol_new->refcnt, 1);
2417 				sp_node_init(n_new, end, n->end, mpol_new);
2418 				n->end = start;
2419 				sp_insert(sp, n_new);
2420 				n_new = NULL;
2421 				mpol_new = NULL;
2422 				break;
2423 			} else
2424 				n->end = start;
2425 		}
2426 		if (!next)
2427 			break;
2428 		n = rb_entry(next, struct sp_node, nd);
2429 	}
2430 	if (new)
2431 		sp_insert(sp, new);
2432 	write_unlock(&sp->lock);
2433 	ret = 0;
2434 
2435 err_out:
2436 	if (mpol_new)
2437 		mpol_put(mpol_new);
2438 	if (n_new)
2439 		kmem_cache_free(sn_cache, n_new);
2440 
2441 	return ret;
2442 
2443 alloc_new:
2444 	write_unlock(&sp->lock);
2445 	ret = -ENOMEM;
2446 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2447 	if (!n_new)
2448 		goto err_out;
2449 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2450 	if (!mpol_new)
2451 		goto err_out;
2452 	goto restart;
2453 }
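
/*
 * Worked example (illustrative): if the tree holds a single node covering
 * pages [0, 10) with policy A, then shared_policy_replace(sp, 3, 6, B)
 * leaves three non-overlapping ranges:
 *
 *	[0, 3)  -> A	old node truncated (n->end = start)
 *	[3, 6)  -> B	the new node
 *	[6, 10) -> A	n_new, initialized from a duplicate of A
 *
 * The duplicate policy is preallocated outside sp->lock (the alloc_new /
 * restart dance above) because kmem_cache_alloc(GFP_KERNEL) may sleep.
 */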
2454 
2455 /**
2456  * mpol_shared_policy_init - initialize shared policy for inode
2457  * @sp: pointer to inode shared policy
2458  * @mpol:  struct mempolicy to install
2459  *
2460  * Install non-NULL @mpol in inode's shared policy rb-tree.
2461  * On entry, the current task has a reference on a non-NULL @mpol.
2462  * This must be released on exit.
2463  * This is called during get_inode(), so GFP_KERNEL allocations are fine.
2464  */
2465 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2466 {
2467 	int ret;
2468 
2469 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2470 	rwlock_init(&sp->lock);
2471 
2472 	if (mpol) {
2473 		struct vm_area_struct pvma;
2474 		struct mempolicy *new;
2475 		NODEMASK_SCRATCH(scratch);
2476 
2477 		if (!scratch)
2478 			goto put_mpol;
2479 		/* contextualize the tmpfs mount point mempolicy */
2480 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2481 		if (IS_ERR(new))
2482 			goto free_scratch; /* no valid nodemask intersection */
2483 
2484 		task_lock(current);
2485 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2486 		task_unlock(current);
2487 		if (ret)
2488 			goto put_new;
2489 
2490 		/* Create pseudo-vma that contains just the policy */
2491 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2492 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2493 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2494 
2495 put_new:
2496 		mpol_put(new);			/* drop initial ref */
2497 free_scratch:
2498 		NODEMASK_SCRATCH_FREE(scratch);
2499 put_mpol:
2500 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2501 	}
2502 }
2503 
2504 int mpol_set_shared_policy(struct shared_policy *info,
2505 			struct vm_area_struct *vma, struct mempolicy *npol)
2506 {
2507 	int err;
2508 	struct sp_node *new = NULL;
2509 	unsigned long sz = vma_pages(vma);
2510 
2511 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2512 		 vma->vm_pgoff,
2513 		 sz, npol ? npol->mode : -1,
2514 		 npol ? npol->flags : -1,
2515 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2516 
2517 	if (npol) {
2518 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2519 		if (!new)
2520 			return -ENOMEM;
2521 	}
2522 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2523 	if (err && new)
2524 		sp_free(new);
2525 	return err;
2526 }
2527 
2528 /* Free a backing policy store on inode delete. */
2529 void mpol_free_shared_policy(struct shared_policy *p)
2530 {
2531 	struct sp_node *n;
2532 	struct rb_node *next;
2533 
2534 	if (!p->root.rb_node)
2535 		return;
2536 	write_lock(&p->lock);
2537 	next = rb_first(&p->root);
2538 	while (next) {
2539 		n = rb_entry(next, struct sp_node, nd);
2540 		next = rb_next(&n->nd);
2541 		sp_delete(p, n);
2542 	}
2543 	write_unlock(&p->lock);
2544 }
2545 
2546 #ifdef CONFIG_NUMA_BALANCING
2547 static int __initdata numabalancing_override;
2548 
2549 static void __init check_numabalancing_enable(void)
2550 {
2551 	bool numabalancing_default = false;
2552 
2553 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2554 		numabalancing_default = true;
2555 
2556 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2557 	if (numabalancing_override)
2558 		set_numabalancing_state(numabalancing_override == 1);
2559 
2560 	if (num_online_nodes() > 1 && !numabalancing_override) {
2561 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2562 			numabalancing_default ? "Enabling" : "Disabling");
2563 		set_numabalancing_state(numabalancing_default);
2564 	}
2565 }
2566 
2567 static int __init setup_numabalancing(char *str)
2568 {
2569 	int ret = 0;
2570 	if (!str)
2571 		goto out;
2572 
2573 	if (!strcmp(str, "enable")) {
2574 		numabalancing_override = 1;
2575 		ret = 1;
2576 	} else if (!strcmp(str, "disable")) {
2577 		numabalancing_override = -1;
2578 		ret = 1;
2579 	}
2580 out:
2581 	if (!ret)
2582 		pr_warn("Unable to parse numa_balancing=\n");
2583 
2584 	return ret;
2585 }
2586 __setup("numa_balancing=", setup_numabalancing);
2587 #else
2588 static inline void __init check_numabalancing_enable(void)
2589 {
2590 }
2591 #endif /* CONFIG_NUMA_BALANCING */
2592 
2593 /* assumes fs == KERNEL_DS */
2594 void __init numa_policy_init(void)
2595 {
2596 	nodemask_t interleave_nodes;
2597 	unsigned long largest = 0;
2598 	int nid, prefer = 0;
2599 
2600 	policy_cache = kmem_cache_create("numa_policy",
2601 					 sizeof(struct mempolicy),
2602 					 0, SLAB_PANIC, NULL);
2603 
2604 	sn_cache = kmem_cache_create("shared_policy_node",
2605 				     sizeof(struct sp_node),
2606 				     0, SLAB_PANIC, NULL);
2607 
2608 	for_each_node(nid) {
2609 		preferred_node_policy[nid] = (struct mempolicy) {
2610 			.refcnt = ATOMIC_INIT(1),
2611 			.mode = MPOL_PREFERRED,
2612 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2613 			.v = { .preferred_node = nid, },
2614 		};
2615 	}
2616 
2617 	/*
2618 	 * Set interleaving policy for system init. Interleaving is only
2619 	 * enabled across suitably sized nodes (default is >= 16MB), falling
2620 	 * back to the largest node if they're all smaller.
2621 	 */
2622 	nodes_clear(interleave_nodes);
2623 	for_each_node_state(nid, N_MEMORY) {
2624 		unsigned long total_pages = node_present_pages(nid);
2625 
2626 		/* Preserve the largest node */
2627 		if (largest < total_pages) {
2628 			largest = total_pages;
2629 			prefer = nid;
2630 		}
2631 
2632 		/* Interleave this node? */
2633 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2634 			node_set(nid, interleave_nodes);
2635 	}
2636 
2637 	/* All too small, use the largest */
2638 	if (unlikely(nodes_empty(interleave_nodes)))
2639 		node_set(prefer, interleave_nodes);
2640 
2641 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2642 		pr_err("%s: interleaving failed\n", __func__);
2643 
2644 	check_numabalancing_enable();
2645 }
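
/*
 * Worked example (illustrative, assuming 4KB pages): the interleave
 * threshold above,
 *
 *	(total_pages << PAGE_SHIFT) >= (16 << 20),
 *
 * means total_pages >= (16 << 20) >> 12 == 4096 present pages, i.e. a node
 * needs at least 16MB of memory to join the boot-time interleave set;
 * otherwise only the single largest node is used.
 */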
2646 
2647 /* Reset policy of current process to default */
2648 void numa_default_policy(void)
2649 {
2650 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2651 }
2652 
2653 /*
2654  * Parse and format mempolicy from/to strings
2655  */
2656 
2657 /*
2658  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2659  */
2660 static const char * const policy_modes[] =
2661 {
2662 	[MPOL_DEFAULT]    = "default",
2663 	[MPOL_PREFERRED]  = "prefer",
2664 	[MPOL_BIND]       = "bind",
2665 	[MPOL_INTERLEAVE] = "interleave",
2666 	[MPOL_LOCAL]      = "local",
2667 };
2668 
2669 
2670 #ifdef CONFIG_TMPFS
2671 /**
2672  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2673  * @str:  string containing mempolicy to parse
2674  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2675  *
2676  * Format of input:
2677  *	<mode>[=<flags>][:<nodelist>]
2678  *
2679  * On success, returns 0, else 1
2680  */
2681 int mpol_parse_str(char *str, struct mempolicy **mpol)
2682 {
2683 	struct mempolicy *new = NULL;
2684 	unsigned short mode;
2685 	unsigned short mode_flags;
2686 	nodemask_t nodes;
2687 	char *nodelist = strchr(str, ':');
2688 	char *flags = strchr(str, '=');
2689 	int err = 1;
2690 
2691 	if (nodelist) {
2692 		/* NUL-terminate mode or flags string */
2693 		*nodelist++ = '\0';
2694 		if (nodelist_parse(nodelist, nodes))
2695 			goto out;
2696 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2697 			goto out;
2698 	} else
2699 		nodes_clear(nodes);
2700 
2701 	if (flags)
2702 		*flags++ = '\0';	/* terminate mode string */
2703 
2704 	for (mode = 0; mode < MPOL_MAX; mode++) {
2705 		if (!strcmp(str, policy_modes[mode])) {
2706 			break;
2707 		}
2708 	}
2709 	if (mode >= MPOL_MAX)
2710 		goto out;
2711 
2712 	switch (mode) {
2713 	case MPOL_PREFERRED:
2714 		/*
2715 		 * Insist on a nodelist of one node only
2716 		 */
2717 		if (nodelist) {
2718 			char *rest = nodelist;
2719 			while (isdigit(*rest))
2720 				rest++;
2721 			if (*rest)
2722 				goto out;
2723 		}
2724 		break;
2725 	case MPOL_INTERLEAVE:
2726 		/*
2727 		 * Default to online nodes with memory if no nodelist
2728 		 */
2729 		if (!nodelist)
2730 			nodes = node_states[N_MEMORY];
2731 		break;
2732 	case MPOL_LOCAL:
2733 		/*
2734 		 * Don't allow a nodelist;  mpol_new() checks flags
2735 		 */
2736 		if (nodelist)
2737 			goto out;
2738 		mode = MPOL_PREFERRED;
2739 		break;
2740 	case MPOL_DEFAULT:
2741 		/*
2742 		 * Insist on an empty nodelist
2743 		 */
2744 		if (!nodelist)
2745 			err = 0;
2746 		goto out;
2747 	case MPOL_BIND:
2748 		/*
2749 		 * Insist on a nodelist
2750 		 */
2751 		if (!nodelist)
2752 			goto out;
2753 	}
2754 
2755 	mode_flags = 0;
2756 	if (flags) {
2757 		/*
2758 		 * Currently, we only support two mutually exclusive
2759 		 * mode flags.
2760 		 */
2761 		if (!strcmp(flags, "static"))
2762 			mode_flags |= MPOL_F_STATIC_NODES;
2763 		else if (!strcmp(flags, "relative"))
2764 			mode_flags |= MPOL_F_RELATIVE_NODES;
2765 		else
2766 			goto out;
2767 	}
2768 
2769 	new = mpol_new(mode, mode_flags, &nodes);
2770 	if (IS_ERR(new))
2771 		goto out;
2772 
2773 	/*
2774 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2775 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2776 	 */
2777 	if (mode != MPOL_PREFERRED)
2778 		new->v.nodes = nodes;
2779 	else if (nodelist)
2780 		new->v.preferred_node = first_node(nodes);
2781 	else
2782 		new->flags |= MPOL_F_LOCAL;
2783 
2784 	/*
2785 	 * Save nodes for contextualization: this will be used to "clone"
2786 	 * the mempolicy in a specific context [cpuset] at a later time.
2787 	 */
2788 	new->w.user_nodemask = nodes;
2789 
2790 	err = 0;
2791 
2792 out:
2793 	/* Restore string for error message */
2794 	if (nodelist)
2795 		*--nodelist = ':';
2796 	if (flags)
2797 		*--flags = '=';
2798 	if (!err)
2799 		*mpol = new;
2800 	return err;
2801 }
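
/*
 * Examples of strings accepted above (illustrative; this is the format used
 * for the tmpfs "mpol=" mount option):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE across nodes 0-3
 *	"prefer:1"		MPOL_PREFERRED, node 1
 *	"bind=static:0,2"	MPOL_BIND with MPOL_F_STATIC_NODES
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *	"default"		MPOL_DEFAULT, no nodelist allowed
 */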
2802 #endif /* CONFIG_TMPFS */
2803 
2804 /**
2805  * mpol_to_str - format a mempolicy structure for printing
2806  * @buffer:  to contain formatted mempolicy string
2807  * @maxlen:  length of @buffer
2808  * @pol:  pointer to mempolicy to be formatted
2809  *
2810  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2811  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2812  * longest flag, "relative", and to display at least a few node ids.
2813  */
2814 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2815 {
2816 	char *p = buffer;
2817 	nodemask_t nodes = NODE_MASK_NONE;
2818 	unsigned short mode = MPOL_DEFAULT;
2819 	unsigned short flags = 0;
2820 
2821 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2822 		mode = pol->mode;
2823 		flags = pol->flags;
2824 	}
2825 
2826 	switch (mode) {
2827 	case MPOL_DEFAULT:
2828 		break;
2829 	case MPOL_PREFERRED:
2830 		if (flags & MPOL_F_LOCAL)
2831 			mode = MPOL_LOCAL;
2832 		else
2833 			node_set(pol->v.preferred_node, nodes);
2834 		break;
2835 	case MPOL_BIND:
2836 	case MPOL_INTERLEAVE:
2837 		nodes = pol->v.nodes;
2838 		break;
2839 	default:
2840 		WARN_ON_ONCE(1);
2841 		snprintf(p, maxlen, "unknown");
2842 		return;
2843 	}
2844 
2845 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2846 
2847 	if (flags & MPOL_MODE_FLAGS) {
2848 		p += snprintf(p, buffer + maxlen - p, "=");
2849 
2850 		/*
2851 		 * Currently, the only defined flags are mutually exclusive
2852 		 */
2853 		if (flags & MPOL_F_STATIC_NODES)
2854 			p += snprintf(p, buffer + maxlen - p, "static");
2855 		else if (flags & MPOL_F_RELATIVE_NODES)
2856 			p += snprintf(p, buffer + maxlen - p, "relative");
2857 	}
2858 
2859 	if (!nodes_empty(nodes))
2860 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2861 			       nodemask_pr_args(&nodes));
2862 }
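
/*
 * Example outputs (illustrative, assuming @maxlen is large enough):
 *
 *	MPOL_INTERLEAVE over nodes 0-3		-> "interleave:0-3"
 *	MPOL_PREFERRED, node 1			-> "prefer:1"
 *	MPOL_PREFERRED | MPOL_F_LOCAL		-> "local"
 *	MPOL_BIND | MPOL_F_STATIC_NODES, 0,2	-> "bind=static:0,2"
 *	NULL or default policy			-> "default"
 */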
2863