xref: /openbmc/linux/mm/mempolicy.c (revision 4d2804b7)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *                in a NUMA-aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
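
/*
 * Example (editor's sketch, not part of the original source): from user
 * space these policies are selected with set_mempolicy(2) or mbind(2),
 * shown here via the <numaif.h> wrappers shipped with libnuma. This
 * interleaves the task's future allocations over nodes 0 and 1 (assuming
 * both exist):
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)))
 *		perror("set_mempolicy");
 */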
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/swap.h>
89 #include <linux/seq_file.h>
90 #include <linux/proc_fs.h>
91 #include <linux/migrate.h>
92 #include <linux/ksm.h>
93 #include <linux/rmap.h>
94 #include <linux/security.h>
95 #include <linux/syscalls.h>
96 #include <linux/ctype.h>
97 #include <linux/mm_inline.h>
98 #include <linux/mmu_notifier.h>
99 #include <linux/printk.h>
100 
101 #include <asm/tlbflush.h>
102 #include <linux/uaccess.h>
103 
104 #include "internal.h"
105 
106 /* Internal flags */
107 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
108 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
109 
110 static struct kmem_cache *policy_cache;
111 static struct kmem_cache *sn_cache;
112 
113 /* Highest zone. A specific allocation for a zone below that is not
114    policied. */
115 enum zone_type policy_zone = 0;
116 
117 /*
118  * run-time system-wide default policy => local allocation
119  */
120 static struct mempolicy default_policy = {
121 	.refcnt = ATOMIC_INIT(1), /* never free it */
122 	.mode = MPOL_PREFERRED,
123 	.flags = MPOL_F_LOCAL,
124 };
125 
126 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
127 
128 struct mempolicy *get_task_policy(struct task_struct *p)
129 {
130 	struct mempolicy *pol = p->mempolicy;
131 	int node;
132 
133 	if (pol)
134 		return pol;
135 
136 	node = numa_node_id();
137 	if (node != NUMA_NO_NODE) {
138 		pol = &preferred_node_policy[node];
139 		/* preferred_node_policy is not initialised early in boot */
140 		if (pol->mode)
141 			return pol;
142 	}
143 
144 	return &default_policy;
145 }
146 
147 static const struct mempolicy_operations {
148 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
149 	/*
150 	 * If the read-side task has no lock to protect task->mempolicy, the
151 	 * write-side task rebinds task->mempolicy in two steps. The first step
152 	 * sets all the new nodes and the second step clears all the disallowed
153 	 * nodes. This avoids a window in which there is no node left to
154 	 * allocate a page from.
155 	 * If a lock protects task->mempolicy on the read side, we rebind
156 	 * directly.
157 	 *
158 	 * step:
159 	 * 	MPOL_REBIND_ONCE - do rebind work at once
160 	 * 	MPOL_REBIND_STEP1 - set all the new nodes
161 	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
162 	 */
163 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
164 			enum mpol_rebind_step step);
165 } mpol_ops[MPOL_MAX];
166 
167 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
168 {
169 	return pol->flags & MPOL_MODE_FLAGS;
170 }
171 
172 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
173 				   const nodemask_t *rel)
174 {
175 	nodemask_t tmp;
176 	nodes_fold(tmp, *orig, nodes_weight(*rel));
177 	nodes_onto(*ret, tmp, *rel);
178 }
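
/*
 * Worked example (editor's note): with MPOL_F_RELATIVE_NODES the user's
 * node numbers act as indices into the currently allowed set. If *rel
 * (e.g. the cpuset's mems_allowed) is {4,5,6,7} and *orig is {0,2},
 * nodes_fold() keeps {0,2} (folded modulo 4) and nodes_onto() maps them
 * onto the 1st and 3rd allowed nodes, so *ret becomes {4,6}.
 */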
179 
180 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
181 {
182 	if (nodes_empty(*nodes))
183 		return -EINVAL;
184 	pol->v.nodes = *nodes;
185 	return 0;
186 }
187 
188 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
189 {
190 	if (!nodes)
191 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
192 	else if (nodes_empty(*nodes))
193 		return -EINVAL;			/*  no allowed nodes */
194 	else
195 		pol->v.preferred_node = first_node(*nodes);
196 	return 0;
197 }
198 
199 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
200 {
201 	if (nodes_empty(*nodes))
202 		return -EINVAL;
203 	pol->v.nodes = *nodes;
204 	return 0;
205 }
206 
207 /*
208  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
209  * any, for the new policy.  mpol_new() has already validated the nodes
210  * parameter with respect to the policy mode and flags.  But, we need to
211  * handle an empty nodemask with MPOL_PREFERRED here.
212  *
213  * Must be called holding task's alloc_lock to protect task's mems_allowed
214  * and mempolicy.  May also be called holding the mmap_semaphore for write.
215  */
216 static int mpol_set_nodemask(struct mempolicy *pol,
217 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
218 {
219 	int ret;
220 
221 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
222 	if (pol == NULL)
223 		return 0;
224 	/* Check N_MEMORY */
225 	nodes_and(nsc->mask1,
226 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
227 
228 	VM_BUG_ON(!nodes);
229 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
230 		nodes = NULL;	/* explicit local allocation */
231 	else {
232 		if (pol->flags & MPOL_F_RELATIVE_NODES)
233 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
234 		else
235 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
236 
237 		if (mpol_store_user_nodemask(pol))
238 			pol->w.user_nodemask = *nodes;
239 		else
240 			pol->w.cpuset_mems_allowed =
241 						cpuset_current_mems_allowed;
242 	}
243 
244 	if (nodes)
245 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
246 	else
247 		ret = mpol_ops[pol->mode].create(pol, NULL);
248 	return ret;
249 }
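
/*
 * Typical call sequence (editor's sketch, mirroring do_set_mempolicy()
 * further below):
 *
 *	new = mpol_new(mode, flags, nodes);
 *	task_lock(current);
 *	err = mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 */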
250 
251 /*
252  * This function just creates a new policy, does some checks and simple
253  * initialization. You must invoke mpol_set_nodemask() to set nodes.
254  */
255 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
256 				  nodemask_t *nodes)
257 {
258 	struct mempolicy *policy;
259 
260 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
261 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
262 
263 	if (mode == MPOL_DEFAULT) {
264 		if (nodes && !nodes_empty(*nodes))
265 			return ERR_PTR(-EINVAL);
266 		return NULL;
267 	}
268 	VM_BUG_ON(!nodes);
269 
270 	/*
271 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
272 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
273 	 * All other modes require a valid pointer to a non-empty nodemask.
274 	 */
275 	if (mode == MPOL_PREFERRED) {
276 		if (nodes_empty(*nodes)) {
277 			if (((flags & MPOL_F_STATIC_NODES) ||
278 			     (flags & MPOL_F_RELATIVE_NODES)))
279 				return ERR_PTR(-EINVAL);
280 		}
281 	} else if (mode == MPOL_LOCAL) {
282 		if (!nodes_empty(*nodes) ||
283 		    (flags & MPOL_F_STATIC_NODES) ||
284 		    (flags & MPOL_F_RELATIVE_NODES))
285 			return ERR_PTR(-EINVAL);
286 		mode = MPOL_PREFERRED;
287 	} else if (nodes_empty(*nodes))
288 		return ERR_PTR(-EINVAL);
289 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
290 	if (!policy)
291 		return ERR_PTR(-ENOMEM);
292 	atomic_set(&policy->refcnt, 1);
293 	policy->mode = mode;
294 	policy->flags = flags;
295 
296 	return policy;
297 }
298 
299 /* Slow path of a mpol destructor. */
300 void __mpol_put(struct mempolicy *p)
301 {
302 	if (!atomic_dec_and_test(&p->refcnt))
303 		return;
304 	kmem_cache_free(policy_cache, p);
305 }
306 
307 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
308 				enum mpol_rebind_step step)
309 {
310 }
311 
312 /*
313  * step:
314  * 	MPOL_REBIND_ONCE  - do rebind work at once
315  * 	MPOL_REBIND_STEP1 - set all the new nodes
316  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
317  */
318 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
319 				 enum mpol_rebind_step step)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES)
324 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
325 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
326 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
327 	else {
328 		/*
329 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
330 		 * result
331 		 */
332 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
333 			nodes_remap(tmp, pol->v.nodes,
334 					pol->w.cpuset_mems_allowed, *nodes);
335 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
336 		} else if (step == MPOL_REBIND_STEP2) {
337 			tmp = pol->w.cpuset_mems_allowed;
338 			pol->w.cpuset_mems_allowed = *nodes;
339 		} else
340 			BUG();
341 	}
342 
343 	if (nodes_empty(tmp))
344 		tmp = *nodes;
345 
346 	if (step == MPOL_REBIND_STEP1)
347 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
348 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
349 		pol->v.nodes = tmp;
350 	else
351 		BUG();
352 
353 	if (!node_isset(current->il_next, tmp)) {
354 		current->il_next = next_node_in(current->il_next, tmp);
355 		if (current->il_next >= MAX_NUMNODES)
356 			current->il_next = numa_node_id();
357 	}
358 }
359 
360 static void mpol_rebind_preferred(struct mempolicy *pol,
361 				  const nodemask_t *nodes,
362 				  enum mpol_rebind_step step)
363 {
364 	nodemask_t tmp;
365 
366 	if (pol->flags & MPOL_F_STATIC_NODES) {
367 		int node = first_node(pol->w.user_nodemask);
368 
369 		if (node_isset(node, *nodes)) {
370 			pol->v.preferred_node = node;
371 			pol->flags &= ~MPOL_F_LOCAL;
372 		} else
373 			pol->flags |= MPOL_F_LOCAL;
374 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
375 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
376 		pol->v.preferred_node = first_node(tmp);
377 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
378 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
379 						   pol->w.cpuset_mems_allowed,
380 						   *nodes);
381 		pol->w.cpuset_mems_allowed = *nodes;
382 	}
383 }
384 
385 /*
386  * mpol_rebind_policy - Migrate a policy to a different set of nodes
387  *
388  * If the read-side task has no lock to protect task->mempolicy, the
389  * write-side task rebinds task->mempolicy in two steps. The first step
390  * sets all the new nodes and the second step clears all the disallowed
391  * nodes. This avoids a window in which there is no node left to
392  * allocate a page from.
393  * If a lock protects task->mempolicy on the read side, we rebind
394  * directly.
395  *
396  * step:
397  * 	MPOL_REBIND_ONCE  - do rebind work at once
398  * 	MPOL_REBIND_STEP1 - set all the new nodes
399  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
400  */
401 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
402 				enum mpol_rebind_step step)
403 {
404 	if (!pol)
405 		return;
406 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
407 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
408 		return;
409 
410 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
414 		BUG();
415 
416 	if (step == MPOL_REBIND_STEP1)
417 		pol->flags |= MPOL_F_REBINDING;
418 	else if (step == MPOL_REBIND_STEP2)
419 		pol->flags &= ~MPOL_F_REBINDING;
420 	else if (step >= MPOL_REBIND_NSTEP)
421 		BUG();
422 
423 	mpol_ops[pol->mode].rebind(pol, newmask, step);
424 }
425 
426 /*
427  * Wrapper for mpol_rebind_policy() that just requires task
428  * pointer, and updates task mempolicy.
429  *
430  * Called with task's alloc_lock held.
431  */
432 
433 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
434 			enum mpol_rebind_step step)
435 {
436 	mpol_rebind_policy(tsk->mempolicy, new, step);
437 }
438 
439 /*
440  * Rebind each vma in mm to new nodemask.
441  *
442  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
443  */
444 
445 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
446 {
447 	struct vm_area_struct *vma;
448 
449 	down_write(&mm->mmap_sem);
450 	for (vma = mm->mmap; vma; vma = vma->vm_next)
451 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
452 	up_write(&mm->mmap_sem);
453 }
454 
455 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
456 	[MPOL_DEFAULT] = {
457 		.rebind = mpol_rebind_default,
458 	},
459 	[MPOL_INTERLEAVE] = {
460 		.create = mpol_new_interleave,
461 		.rebind = mpol_rebind_nodemask,
462 	},
463 	[MPOL_PREFERRED] = {
464 		.create = mpol_new_preferred,
465 		.rebind = mpol_rebind_preferred,
466 	},
467 	[MPOL_BIND] = {
468 		.create = mpol_new_bind,
469 		.rebind = mpol_rebind_nodemask,
470 	},
471 };
472 
473 static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 				unsigned long flags);
475 
476 struct queue_pages {
477 	struct list_head *pagelist;
478 	unsigned long flags;
479 	nodemask_t *nmask;
480 	struct vm_area_struct *prev;
481 };
482 
483 /*
484  * Scan through the pages, checking whether they satisfy the given
485  * conditions, and move them to the pagelist if they do.
486  */
487 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
488 			unsigned long end, struct mm_walk *walk)
489 {
490 	struct vm_area_struct *vma = walk->vma;
491 	struct page *page;
492 	struct queue_pages *qp = walk->private;
493 	unsigned long flags = qp->flags;
494 	int nid, ret;
495 	pte_t *pte;
496 	spinlock_t *ptl;
497 
498 	if (pmd_trans_huge(*pmd)) {
499 		ptl = pmd_lock(walk->mm, pmd);
500 		if (pmd_trans_huge(*pmd)) {
501 			page = pmd_page(*pmd);
502 			if (is_huge_zero_page(page)) {
503 				spin_unlock(ptl);
504 				__split_huge_pmd(vma, pmd, addr, false, NULL);
505 			} else {
506 				get_page(page);
507 				spin_unlock(ptl);
508 				lock_page(page);
509 				ret = split_huge_page(page);
510 				unlock_page(page);
511 				put_page(page);
512 				if (ret)
513 					return 0;
514 			}
515 		} else {
516 			spin_unlock(ptl);
517 		}
518 	}
519 
520 	if (pmd_trans_unstable(pmd))
521 		return 0;
522 retry:
523 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
524 	for (; addr != end; pte++, addr += PAGE_SIZE) {
525 		if (!pte_present(*pte))
526 			continue;
527 		page = vm_normal_page(vma, addr, *pte);
528 		if (!page)
529 			continue;
530 		/*
531 		 * vm_normal_page() filters out zero pages, but there might
532 		 * still be PageReserved pages to skip, perhaps in a VDSO.
533 		 */
534 		if (PageReserved(page))
535 			continue;
536 		nid = page_to_nid(page);
537 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
538 			continue;
539 		if (PageTransCompound(page)) {
540 			get_page(page);
541 			pte_unmap_unlock(pte, ptl);
542 			lock_page(page);
543 			ret = split_huge_page(page);
544 			unlock_page(page);
545 			put_page(page);
546 			/* Failed to split -- skip. */
547 			if (ret) {
548 				pte = pte_offset_map_lock(walk->mm, pmd,
549 						addr, &ptl);
550 				continue;
551 			}
552 			goto retry;
553 		}
554 
555 		migrate_page_add(page, qp->pagelist, flags);
556 	}
557 	pte_unmap_unlock(pte - 1, ptl);
558 	cond_resched();
559 	return 0;
560 }
561 
562 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
563 			       unsigned long addr, unsigned long end,
564 			       struct mm_walk *walk)
565 {
566 #ifdef CONFIG_HUGETLB_PAGE
567 	struct queue_pages *qp = walk->private;
568 	unsigned long flags = qp->flags;
569 	int nid;
570 	struct page *page;
571 	spinlock_t *ptl;
572 	pte_t entry;
573 
574 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
575 	entry = huge_ptep_get(pte);
576 	if (!pte_present(entry))
577 		goto unlock;
578 	page = pte_page(entry);
579 	nid = page_to_nid(page);
580 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
581 		goto unlock;
582 	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
583 	if (flags & (MPOL_MF_MOVE_ALL) ||
584 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
585 		isolate_huge_page(page, qp->pagelist);
586 unlock:
587 	spin_unlock(ptl);
588 #else
589 	BUG();
590 #endif
591 	return 0;
592 }
593 
594 #ifdef CONFIG_NUMA_BALANCING
595 /*
596  * This is used to mark a range of virtual addresses to be inaccessible.
597  * These are later cleared by a NUMA hinting fault. Depending on these
598  * faults, pages may be migrated for better NUMA placement.
599  *
600  * This is assuming that NUMA faults are handled using PROT_NONE. If
601  * an architecture makes a different choice, it will need further
602  * changes to the core.
603  */
604 unsigned long change_prot_numa(struct vm_area_struct *vma,
605 			unsigned long addr, unsigned long end)
606 {
607 	int nr_updated;
608 
609 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
610 	if (nr_updated)
611 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
612 
613 	return nr_updated;
614 }
615 #else
616 static unsigned long change_prot_numa(struct vm_area_struct *vma,
617 			unsigned long addr, unsigned long end)
618 {
619 	return 0;
620 }
621 #endif /* CONFIG_NUMA_BALANCING */
622 
623 static int queue_pages_test_walk(unsigned long start, unsigned long end,
624 				struct mm_walk *walk)
625 {
626 	struct vm_area_struct *vma = walk->vma;
627 	struct queue_pages *qp = walk->private;
628 	unsigned long endvma = vma->vm_end;
629 	unsigned long flags = qp->flags;
630 
631 	if (!vma_migratable(vma))
632 		return 1;
633 
634 	if (endvma > end)
635 		endvma = end;
636 	if (vma->vm_start > start)
637 		start = vma->vm_start;
638 
639 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
640 		if (!vma->vm_next && vma->vm_end < end)
641 			return -EFAULT;
642 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
643 			return -EFAULT;
644 	}
645 
646 	qp->prev = vma;
647 
648 	if (flags & MPOL_MF_LAZY) {
649 		/* Similar to task_numa_work, skip inaccessible VMAs */
650 		if (!is_vm_hugetlb_page(vma) &&
651 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
652 			!(vma->vm_flags & VM_MIXEDMAP))
653 			change_prot_numa(vma, start, endvma);
654 		return 1;
655 	}
656 
657 	/* queue pages from current vma */
658 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
659 		return 0;
660 	return 1;
661 }
662 
663 /*
664  * Walk through page tables and collect pages to be migrated.
665  *
666  * If pages found in a given range are on a set of nodes (determined by
667  * @nodes and @flags), they are isolated and queued to the pagelist
668  * passed in via @pagelist.
669  */
670 static int
671 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
672 		nodemask_t *nodes, unsigned long flags,
673 		struct list_head *pagelist)
674 {
675 	struct queue_pages qp = {
676 		.pagelist = pagelist,
677 		.flags = flags,
678 		.nmask = nodes,
679 		.prev = NULL,
680 	};
681 	struct mm_walk queue_pages_walk = {
682 		.hugetlb_entry = queue_pages_hugetlb,
683 		.pmd_entry = queue_pages_pte_range,
684 		.test_walk = queue_pages_test_walk,
685 		.mm = mm,
686 		.private = &qp,
687 	};
688 
689 	return walk_page_range(start, end, &queue_pages_walk);
690 }
691 
692 /*
693  * Apply policy to a single VMA
694  * This must be called with the mmap_sem held for writing.
695  */
696 static int vma_replace_policy(struct vm_area_struct *vma,
697 						struct mempolicy *pol)
698 {
699 	int err;
700 	struct mempolicy *old;
701 	struct mempolicy *new;
702 
703 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
704 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
705 		 vma->vm_ops, vma->vm_file,
706 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
707 
708 	new = mpol_dup(pol);
709 	if (IS_ERR(new))
710 		return PTR_ERR(new);
711 
712 	if (vma->vm_ops && vma->vm_ops->set_policy) {
713 		err = vma->vm_ops->set_policy(vma, new);
714 		if (err)
715 			goto err_out;
716 	}
717 
718 	old = vma->vm_policy;
719 	vma->vm_policy = new; /* protected by mmap_sem */
720 	mpol_put(old);
721 
722 	return 0;
723  err_out:
724 	mpol_put(new);
725 	return err;
726 }
727 
728 /* Step 2: apply policy to a range and do splits. */
729 static int mbind_range(struct mm_struct *mm, unsigned long start,
730 		       unsigned long end, struct mempolicy *new_pol)
731 {
732 	struct vm_area_struct *next;
733 	struct vm_area_struct *prev;
734 	struct vm_area_struct *vma;
735 	int err = 0;
736 	pgoff_t pgoff;
737 	unsigned long vmstart;
738 	unsigned long vmend;
739 
740 	vma = find_vma(mm, start);
741 	if (!vma || vma->vm_start > start)
742 		return -EFAULT;
743 
744 	prev = vma->vm_prev;
745 	if (start > vma->vm_start)
746 		prev = vma;
747 
748 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
749 		next = vma->vm_next;
750 		vmstart = max(start, vma->vm_start);
751 		vmend   = min(end, vma->vm_end);
752 
753 		if (mpol_equal(vma_policy(vma), new_pol))
754 			continue;
755 
756 		pgoff = vma->vm_pgoff +
757 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
758 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
759 				 vma->anon_vma, vma->vm_file, pgoff,
760 				 new_pol, vma->vm_userfaultfd_ctx);
761 		if (prev) {
762 			vma = prev;
763 			next = vma->vm_next;
764 			if (mpol_equal(vma_policy(vma), new_pol))
765 				continue;
766 			/* vma_merge() joined vma && vma->next, case 8 */
767 			goto replace;
768 		}
769 		if (vma->vm_start != vmstart) {
770 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
771 			if (err)
772 				goto out;
773 		}
774 		if (vma->vm_end != vmend) {
775 			err = split_vma(vma->vm_mm, vma, vmend, 0);
776 			if (err)
777 				goto out;
778 		}
779  replace:
780 		err = vma_replace_policy(vma, new_pol);
781 		if (err)
782 			goto out;
783 	}
784 
785  out:
786 	return err;
787 }
788 
789 /* Set the process memory policy */
790 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
791 			     nodemask_t *nodes)
792 {
793 	struct mempolicy *new, *old;
794 	NODEMASK_SCRATCH(scratch);
795 	int ret;
796 
797 	if (!scratch)
798 		return -ENOMEM;
799 
800 	new = mpol_new(mode, flags, nodes);
801 	if (IS_ERR(new)) {
802 		ret = PTR_ERR(new);
803 		goto out;
804 	}
805 
806 	task_lock(current);
807 	ret = mpol_set_nodemask(new, nodes, scratch);
808 	if (ret) {
809 		task_unlock(current);
810 		mpol_put(new);
811 		goto out;
812 	}
813 	old = current->mempolicy;
814 	current->mempolicy = new;
815 	if (new && new->mode == MPOL_INTERLEAVE &&
816 	    nodes_weight(new->v.nodes))
817 		current->il_next = first_node(new->v.nodes);
818 	task_unlock(current);
819 	mpol_put(old);
820 	ret = 0;
821 out:
822 	NODEMASK_SCRATCH_FREE(scratch);
823 	return ret;
824 }
825 
826 /*
827  * Return nodemask for policy for get_mempolicy() query
828  *
829  * Called with task's alloc_lock held
830  */
831 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
832 {
833 	nodes_clear(*nodes);
834 	if (p == &default_policy)
835 		return;
836 
837 	switch (p->mode) {
838 	case MPOL_BIND:
839 		/* Fall through */
840 	case MPOL_INTERLEAVE:
841 		*nodes = p->v.nodes;
842 		break;
843 	case MPOL_PREFERRED:
844 		if (!(p->flags & MPOL_F_LOCAL))
845 			node_set(p->v.preferred_node, *nodes);
846 		/* else return empty node mask for local allocation */
847 		break;
848 	default:
849 		BUG();
850 	}
851 }
852 
853 static int lookup_node(unsigned long addr)
854 {
855 	struct page *p;
856 	int err;
857 
858 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
859 	if (err >= 0) {
860 		err = page_to_nid(p);
861 		put_page(p);
862 	}
863 	return err;
864 }
865 
866 /* Retrieve NUMA policy */
867 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
868 			     unsigned long addr, unsigned long flags)
869 {
870 	int err;
871 	struct mm_struct *mm = current->mm;
872 	struct vm_area_struct *vma = NULL;
873 	struct mempolicy *pol = current->mempolicy;
874 
875 	if (flags &
876 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
877 		return -EINVAL;
878 
879 	if (flags & MPOL_F_MEMS_ALLOWED) {
880 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
881 			return -EINVAL;
882 		*policy = 0;	/* just so it's initialized */
883 		task_lock(current);
884 		*nmask  = cpuset_current_mems_allowed;
885 		task_unlock(current);
886 		return 0;
887 	}
888 
889 	if (flags & MPOL_F_ADDR) {
890 		/*
891 		 * Do NOT fall back to task policy if the
892 		 * vma/shared policy at addr is NULL.  We
893 		 * want to return MPOL_DEFAULT in this case.
894 		 */
895 		down_read(&mm->mmap_sem);
896 		vma = find_vma_intersection(mm, addr, addr+1);
897 		if (!vma) {
898 			up_read(&mm->mmap_sem);
899 			return -EFAULT;
900 		}
901 		if (vma->vm_ops && vma->vm_ops->get_policy)
902 			pol = vma->vm_ops->get_policy(vma, addr);
903 		else
904 			pol = vma->vm_policy;
905 	} else if (addr)
906 		return -EINVAL;
907 
908 	if (!pol)
909 		pol = &default_policy;	/* indicates default behavior */
910 
911 	if (flags & MPOL_F_NODE) {
912 		if (flags & MPOL_F_ADDR) {
913 			err = lookup_node(addr);
914 			if (err < 0)
915 				goto out;
916 			*policy = err;
917 		} else if (pol == current->mempolicy &&
918 				pol->mode == MPOL_INTERLEAVE) {
919 			*policy = current->il_next;
920 		} else {
921 			err = -EINVAL;
922 			goto out;
923 		}
924 	} else {
925 		*policy = pol == &default_policy ? MPOL_DEFAULT :
926 						pol->mode;
927 		/*
928 		 * Internal mempolicy flags must be masked off before exposing
929 		 * the policy to userspace.
930 		 */
931 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
932 	}
933 
934 	if (vma) {
935 		up_read(&current->mm->mmap_sem);
936 		vma = NULL;
937 	}
938 
939 	err = 0;
940 	if (nmask) {
941 		if (mpol_store_user_nodemask(pol)) {
942 			*nmask = pol->w.user_nodemask;
943 		} else {
944 			task_lock(current);
945 			get_policy_nodemask(pol, nmask);
946 			task_unlock(current);
947 		}
948 	}
949 
950  out:
951 	mpol_cond_put(pol);
952 	if (vma)
953 		up_read(&current->mm->mmap_sem);
954 	return err;
955 }
956 
957 #ifdef CONFIG_MIGRATION
958 /*
959  * page migration
960  */
961 static void migrate_page_add(struct page *page, struct list_head *pagelist,
962 				unsigned long flags)
963 {
964 	/*
965 	 * Avoid migrating a page that is shared with others.
966 	 */
967 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
968 		if (!isolate_lru_page(page)) {
969 			list_add_tail(&page->lru, pagelist);
970 			inc_node_page_state(page, NR_ISOLATED_ANON +
971 					    page_is_file_cache(page));
972 		}
973 	}
974 }
975 
976 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
977 {
978 	if (PageHuge(page))
979 		return alloc_huge_page_node(page_hstate(compound_head(page)),
980 					node);
981 	else
982 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
983 						    __GFP_THISNODE, 0);
984 }
985 
986 /*
987  * Migrate pages from one node to a target node.
988  * Returns error or the number of pages not migrated.
989  */
990 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
991 			   int flags)
992 {
993 	nodemask_t nmask;
994 	LIST_HEAD(pagelist);
995 	int err = 0;
996 
997 	nodes_clear(nmask);
998 	node_set(source, nmask);
999 
1000 	/*
1001 	 * This does not "check" the range but isolates all pages that
1002 	 * need migration.  Between passing in the full user address
1003 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1004 	 */
1005 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1006 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1007 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1008 
1009 	if (!list_empty(&pagelist)) {
1010 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1011 					MIGRATE_SYNC, MR_SYSCALL);
1012 		if (err)
1013 			putback_movable_pages(&pagelist);
1014 	}
1015 
1016 	return err;
1017 }
1018 
1019 /*
1020  * Move pages between the two nodesets so as to preserve the physical
1021  * layout as much as possible.
1022  *
1023  * Returns the number of pages that could not be moved.
1024  */
1025 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1026 		     const nodemask_t *to, int flags)
1027 {
1028 	int busy = 0;
1029 	int err;
1030 	nodemask_t tmp;
1031 
1032 	err = migrate_prep();
1033 	if (err)
1034 		return err;
1035 
1036 	down_read(&mm->mmap_sem);
1037 
1038 	/*
1039 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1040 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1041 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1042 	 * The pair of nodemasks 'to' and 'from' define the map.
1043 	 *
1044 	 * If no pair of bits is found that way, fallback to picking some
1045 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1046 	 * 'source' and 'dest' bits are the same, this represents a node
1047 	 * that will be migrating to itself, so no pages need move.
1048 	 *
1049 	 * If no bits are left in 'tmp', or if all remaining bits left
1050 	 * in 'tmp' correspond to the same bit in 'to', return false
1051 	 * (nothing left to migrate).
1052 	 *
1053 	 * This lets us pick a pair of nodes to migrate between, such that
1054 	 * if possible the dest node is not already occupied by some other
1055 	 * source node, minimizing the risk of overloading the memory on a
1056 	 * node that would happen if we migrated incoming memory to a node
1057 	 * before migrating outgoing memory sourced from that same node.
1058 	 *
1059 	 * A single scan of tmp is sufficient.  As we go, we remember the
1060 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1061 	 * that not only moved, but what's better, moved to an empty slot
1062 	 * (d is not set in tmp), then we break out then, with that pair.
1063 	 * Otherwise when we finish scanning from_tmp, we at least have the
1064 	 * most recent <s, d> pair that moved.  If we get all the way through
1065 	 * the scan of tmp without finding any node that moved, much less
1066 	 * moved to an empty node, then there is nothing left worth migrating.
1067 	 */
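
	/*
	 * Worked example (editor's illustration): with *from = {0,1} and
	 * *to = {2,3}, the first pass picks s = 0 and d = node_remap(0,
	 * *from, *to) = 2; since 2 is not in tmp we migrate 0 -> 2 right
	 * away, clear node 0 from tmp, and the next pass migrates 1 -> 3.
	 */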
1068 
1069 	tmp = *from;
1070 	while (!nodes_empty(tmp)) {
1071 		int s,d;
1072 		int source = NUMA_NO_NODE;
1073 		int dest = 0;
1074 
1075 		for_each_node_mask(s, tmp) {
1076 
1077 			/*
1078 			 * do_migrate_pages() tries to maintain the relative
1079 			 * node relationship of the pages established between
1080 			 * threads and memory areas.
1081 			 *
1082 			 * However if the number of source nodes is not equal to
1083 			 * the number of destination nodes we can not preserve
1084 			 * this node relative relationship.  In that case, skip
1085 			 * copying memory from a node that is in the destination
1086 			 * mask.
1087 			 *
1088 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1089 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1090 			 */
1091 
1092 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1093 						(node_isset(s, *to)))
1094 				continue;
1095 
1096 			d = node_remap(s, *from, *to);
1097 			if (s == d)
1098 				continue;
1099 
1100 			source = s;	/* Node moved. Memorize */
1101 			dest = d;
1102 
1103 			/* dest not in remaining from nodes? */
1104 			if (!node_isset(dest, tmp))
1105 				break;
1106 		}
1107 		if (source == NUMA_NO_NODE)
1108 			break;
1109 
1110 		node_clear(source, tmp);
1111 		err = migrate_to_node(mm, source, dest, flags);
1112 		if (err > 0)
1113 			busy += err;
1114 		if (err < 0)
1115 			break;
1116 	}
1117 	up_read(&mm->mmap_sem);
1118 	if (err < 0)
1119 		return err;
1120 	return busy;
1121 
1122 }
1123 
1124 /*
1125  * Allocate a new page for page migration based on vma policy.
1126  * Start by assuming the page is mapped by the same vma as contains @start.
1127  * Search forward from there, if not.  N.B., this assumes that the
1128  * list of pages handed to migrate_pages()--which is how we get here--
1129  * is in virtual address order.
1130  */
1131 static struct page *new_page(struct page *page, unsigned long start, int **x)
1132 {
1133 	struct vm_area_struct *vma;
1134 	unsigned long uninitialized_var(address);
1135 
1136 	vma = find_vma(current->mm, start);
1137 	while (vma) {
1138 		address = page_address_in_vma(page, vma);
1139 		if (address != -EFAULT)
1140 			break;
1141 		vma = vma->vm_next;
1142 	}
1143 
1144 	if (PageHuge(page)) {
1145 		BUG_ON(!vma);
1146 		return alloc_huge_page_noerr(vma, address, 1);
1147 	}
1148 	/*
1149 	 * if !vma, alloc_page_vma() will use task or system default policy
1150 	 */
1151 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1152 }
1153 #else
1154 
1155 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1156 				unsigned long flags)
1157 {
1158 }
1159 
1160 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1161 		     const nodemask_t *to, int flags)
1162 {
1163 	return -ENOSYS;
1164 }
1165 
1166 static struct page *new_page(struct page *page, unsigned long start, int **x)
1167 {
1168 	return NULL;
1169 }
1170 #endif
1171 
1172 static long do_mbind(unsigned long start, unsigned long len,
1173 		     unsigned short mode, unsigned short mode_flags,
1174 		     nodemask_t *nmask, unsigned long flags)
1175 {
1176 	struct mm_struct *mm = current->mm;
1177 	struct mempolicy *new;
1178 	unsigned long end;
1179 	int err;
1180 	LIST_HEAD(pagelist);
1181 
1182 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1183 		return -EINVAL;
1184 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1185 		return -EPERM;
1186 
1187 	if (start & ~PAGE_MASK)
1188 		return -EINVAL;
1189 
1190 	if (mode == MPOL_DEFAULT)
1191 		flags &= ~MPOL_MF_STRICT;
1192 
1193 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1194 	end = start + len;
1195 
1196 	if (end < start)
1197 		return -EINVAL;
1198 	if (end == start)
1199 		return 0;
1200 
1201 	new = mpol_new(mode, mode_flags, nmask);
1202 	if (IS_ERR(new))
1203 		return PTR_ERR(new);
1204 
1205 	if (flags & MPOL_MF_LAZY)
1206 		new->flags |= MPOL_F_MOF;
1207 
1208 	/*
1209 	 * If we are using the default policy then operation
1210 	 * on discontinuous address spaces is okay after all
1211 	 */
1212 	if (!new)
1213 		flags |= MPOL_MF_DISCONTIG_OK;
1214 
1215 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1216 		 start, start + len, mode, mode_flags,
1217 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1218 
1219 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1220 
1221 		err = migrate_prep();
1222 		if (err)
1223 			goto mpol_out;
1224 	}
1225 	{
1226 		NODEMASK_SCRATCH(scratch);
1227 		if (scratch) {
1228 			down_write(&mm->mmap_sem);
1229 			task_lock(current);
1230 			err = mpol_set_nodemask(new, nmask, scratch);
1231 			task_unlock(current);
1232 			if (err)
1233 				up_write(&mm->mmap_sem);
1234 		} else
1235 			err = -ENOMEM;
1236 		NODEMASK_SCRATCH_FREE(scratch);
1237 	}
1238 	if (err)
1239 		goto mpol_out;
1240 
1241 	err = queue_pages_range(mm, start, end, nmask,
1242 			  flags | MPOL_MF_INVERT, &pagelist);
1243 	if (!err)
1244 		err = mbind_range(mm, start, end, new);
1245 
1246 	if (!err) {
1247 		int nr_failed = 0;
1248 
1249 		if (!list_empty(&pagelist)) {
1250 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1251 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1252 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1253 			if (nr_failed)
1254 				putback_movable_pages(&pagelist);
1255 		}
1256 
1257 		if (nr_failed && (flags & MPOL_MF_STRICT))
1258 			err = -EIO;
1259 	} else
1260 		putback_movable_pages(&pagelist);
1261 
1262 	up_write(&mm->mmap_sem);
1263  mpol_out:
1264 	mpol_put(new);
1265 	return err;
1266 }
1267 
1268 /*
1269  * User space interface with variable sized bitmaps for nodelists.
1270  */
1271 
1272 /* Copy a node mask from user space. */
1273 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1274 		     unsigned long maxnode)
1275 {
1276 	unsigned long k;
1277 	unsigned long nlongs;
1278 	unsigned long endmask;
1279 
1280 	--maxnode;
1281 	nodes_clear(*nodes);
1282 	if (maxnode == 0 || !nmask)
1283 		return 0;
1284 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1285 		return -EINVAL;
1286 
1287 	nlongs = BITS_TO_LONGS(maxnode);
1288 	if ((maxnode % BITS_PER_LONG) == 0)
1289 		endmask = ~0UL;
1290 	else
1291 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1292 
1293 	/* When the user specified more nodes than supported, just check
1294 	   that the unsupported part is all zero. */
1295 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1296 		if (nlongs > PAGE_SIZE/sizeof(long))
1297 			return -EINVAL;
1298 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1299 			unsigned long t;
1300 			if (get_user(t, nmask + k))
1301 				return -EFAULT;
1302 			if (k == nlongs - 1) {
1303 				if (t & endmask)
1304 					return -EINVAL;
1305 			} else if (t)
1306 				return -EINVAL;
1307 		}
1308 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1309 		endmask = ~0UL;
1310 	}
1311 
1312 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1313 		return -EFAULT;
1314 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1315 	return 0;
1316 }
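
/*
 * Example (editor's note): the user-space nodemask is a plain bitmap of
 * unsigned longs with bit N standing for node N, so nodes 0 and 2 are
 * named by the value 0x5 (addr/len are placeholders):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 2);
 *	mbind(addr, len, MPOL_BIND, &mask, 8 * sizeof(mask), 0);
 *
 * Note the --maxnode above: only maxnode - 1 bits are examined.
 */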
1317 
1318 /* Copy a kernel node mask to user space */
1319 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1320 			      nodemask_t *nodes)
1321 {
1322 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1323 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1324 
1325 	if (copy > nbytes) {
1326 		if (copy > PAGE_SIZE)
1327 			return -EINVAL;
1328 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1329 			return -EFAULT;
1330 		copy = nbytes;
1331 	}
1332 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1333 }
1334 
1335 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1336 		unsigned long, mode, const unsigned long __user *, nmask,
1337 		unsigned long, maxnode, unsigned, flags)
1338 {
1339 	nodemask_t nodes;
1340 	int err;
1341 	unsigned short mode_flags;
1342 
1343 	mode_flags = mode & MPOL_MODE_FLAGS;
1344 	mode &= ~MPOL_MODE_FLAGS;
1345 	if (mode >= MPOL_MAX)
1346 		return -EINVAL;
1347 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1348 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1349 		return -EINVAL;
1350 	err = get_nodes(&nodes, nmask, maxnode);
1351 	if (err)
1352 		return err;
1353 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1354 }
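
/*
 * Usage sketch (editor's addition, not part of this file): bind an
 * already mmap()ed buffer (buf/len are placeholder names) to node 0 and
 * migrate any pages already faulted in, using the <numaif.h> wrapper:
 *
 *	unsigned long mask = 1UL << 0;
 *	if (mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask),
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT))
 *		perror("mbind");
 */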
1355 
1356 /* Set the process memory policy */
1357 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1358 		unsigned long, maxnode)
1359 {
1360 	int err;
1361 	nodemask_t nodes;
1362 	unsigned short flags;
1363 
1364 	flags = mode & MPOL_MODE_FLAGS;
1365 	mode &= ~MPOL_MODE_FLAGS;
1366 	if ((unsigned int)mode >= MPOL_MAX)
1367 		return -EINVAL;
1368 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1369 		return -EINVAL;
1370 	err = get_nodes(&nodes, nmask, maxnode);
1371 	if (err)
1372 		return err;
1373 	return do_set_mempolicy(mode, flags, &nodes);
1374 }
1375 
1376 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1377 		const unsigned long __user *, old_nodes,
1378 		const unsigned long __user *, new_nodes)
1379 {
1380 	const struct cred *cred = current_cred(), *tcred;
1381 	struct mm_struct *mm = NULL;
1382 	struct task_struct *task;
1383 	nodemask_t task_nodes;
1384 	int err;
1385 	nodemask_t *old;
1386 	nodemask_t *new;
1387 	NODEMASK_SCRATCH(scratch);
1388 
1389 	if (!scratch)
1390 		return -ENOMEM;
1391 
1392 	old = &scratch->mask1;
1393 	new = &scratch->mask2;
1394 
1395 	err = get_nodes(old, old_nodes, maxnode);
1396 	if (err)
1397 		goto out;
1398 
1399 	err = get_nodes(new, new_nodes, maxnode);
1400 	if (err)
1401 		goto out;
1402 
1403 	/* Find the mm_struct */
1404 	rcu_read_lock();
1405 	task = pid ? find_task_by_vpid(pid) : current;
1406 	if (!task) {
1407 		rcu_read_unlock();
1408 		err = -ESRCH;
1409 		goto out;
1410 	}
1411 	get_task_struct(task);
1412 
1413 	err = -EINVAL;
1414 
1415 	/*
1416 	 * Check if this process has the right to modify the specified
1417 	 * process. The right exists if the process has administrative
1418 	 * capabilities, superuser privileges or the same
1419 	 * userid as the target process.
1420 	 */
1421 	tcred = __task_cred(task);
1422 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1423 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1424 	    !capable(CAP_SYS_NICE)) {
1425 		rcu_read_unlock();
1426 		err = -EPERM;
1427 		goto out_put;
1428 	}
1429 	rcu_read_unlock();
1430 
1431 	task_nodes = cpuset_mems_allowed(task);
1432 	/* Is the user allowed to access the target nodes? */
1433 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1434 		err = -EPERM;
1435 		goto out_put;
1436 	}
1437 
1438 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1439 		err = -EINVAL;
1440 		goto out_put;
1441 	}
1442 
1443 	err = security_task_movememory(task);
1444 	if (err)
1445 		goto out_put;
1446 
1447 	mm = get_task_mm(task);
1448 	put_task_struct(task);
1449 
1450 	if (!mm) {
1451 		err = -EINVAL;
1452 		goto out;
1453 	}
1454 
1455 	err = do_migrate_pages(mm, old, new,
1456 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1457 
1458 	mmput(mm);
1459 out:
1460 	NODEMASK_SCRATCH_FREE(scratch);
1461 
1462 	return err;
1463 
1464 out_put:
1465 	put_task_struct(task);
1466 	goto out;
1467 
1468 }
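
/*
 * Usage sketch (editor's addition): move all of process @pid's pages from
 * node 0 to node 1 via the <numaif.h> wrapper; the caller needs
 * CAP_SYS_NICE or matching credentials, as checked above. A positive
 * return value is the number of pages that could not be moved:
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long left = migrate_pages(pid, 8 * sizeof(from), &from, &to);
 *	if (left < 0)
 *		perror("migrate_pages");
 */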
1469 
1470 
1471 /* Retrieve NUMA policy */
1472 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1473 		unsigned long __user *, nmask, unsigned long, maxnode,
1474 		unsigned long, addr, unsigned long, flags)
1475 {
1476 	int err;
1477 	int uninitialized_var(pval);
1478 	nodemask_t nodes;
1479 
1480 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1481 		return -EINVAL;
1482 
1483 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1484 
1485 	if (err)
1486 		return err;
1487 
1488 	if (policy && put_user(pval, policy))
1489 		return -EFAULT;
1490 
1491 	if (nmask)
1492 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1493 
1494 	return err;
1495 }
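
/*
 * Usage sketch (editor's addition): ask which node backs the page at
 * a given address (addr is a placeholder), using MPOL_F_NODE |
 * MPOL_F_ADDR as handled in do_get_mempolicy() above:
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
 *		perror("get_mempolicy");
 */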
1496 
1497 #ifdef CONFIG_COMPAT
1498 
1499 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1500 		       compat_ulong_t __user *, nmask,
1501 		       compat_ulong_t, maxnode,
1502 		       compat_ulong_t, addr, compat_ulong_t, flags)
1503 {
1504 	long err;
1505 	unsigned long __user *nm = NULL;
1506 	unsigned long nr_bits, alloc_size;
1507 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1508 
1509 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1510 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1511 
1512 	if (nmask)
1513 		nm = compat_alloc_user_space(alloc_size);
1514 
1515 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1516 
1517 	if (!err && nmask) {
1518 		unsigned long copy_size;
1519 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1520 		err = copy_from_user(bm, nm, copy_size);
1521 		/* ensure entire bitmap is zeroed */
1522 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1523 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1524 	}
1525 
1526 	return err;
1527 }
1528 
1529 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1530 		       compat_ulong_t, maxnode)
1531 {
1532 	unsigned long __user *nm = NULL;
1533 	unsigned long nr_bits, alloc_size;
1534 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1535 
1536 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1537 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1538 
1539 	if (nmask) {
1540 		if (compat_get_bitmap(bm, nmask, nr_bits))
1541 			return -EFAULT;
1542 		nm = compat_alloc_user_space(alloc_size);
1543 		if (copy_to_user(nm, bm, alloc_size))
1544 			return -EFAULT;
1545 	}
1546 
1547 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1548 }
1549 
1550 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1551 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1552 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1553 {
1554 	unsigned long __user *nm = NULL;
1555 	unsigned long nr_bits, alloc_size;
1556 	nodemask_t bm;
1557 
1558 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1559 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1560 
1561 	if (nmask) {
1562 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1563 			return -EFAULT;
1564 		nm = compat_alloc_user_space(alloc_size);
1565 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1566 			return -EFAULT;
1567 	}
1568 
1569 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1570 }
1571 
1572 #endif
1573 
1574 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1575 						unsigned long addr)
1576 {
1577 	struct mempolicy *pol = NULL;
1578 
1579 	if (vma) {
1580 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1581 			pol = vma->vm_ops->get_policy(vma, addr);
1582 		} else if (vma->vm_policy) {
1583 			pol = vma->vm_policy;
1584 
1585 			/*
1586 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1587 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1588 			 * count on these policies which will be dropped by
1589 			 * mpol_cond_put() later
1590 			 */
1591 			if (mpol_needs_cond_ref(pol))
1592 				mpol_get(pol);
1593 		}
1594 	}
1595 
1596 	return pol;
1597 }
1598 
1599 /*
1600  * get_vma_policy(@vma, @addr)
1601  * @vma: virtual memory area whose policy is sought
1602  * @addr: address in @vma for shared policy lookup
1603  *
1604  * Returns effective policy for a VMA at specified address.
1605  * Falls back to current->mempolicy or system default policy, as necessary.
1606  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1607  * count--added by the get_policy() vm_op, as appropriate--to protect against
1608  * freeing by another task.  It is the caller's responsibility to free the
1609  * extra reference for shared policies.
1610  */
1611 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1612 						unsigned long addr)
1613 {
1614 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1615 
1616 	if (!pol)
1617 		pol = get_task_policy(current);
1618 
1619 	return pol;
1620 }
1621 
1622 bool vma_policy_mof(struct vm_area_struct *vma)
1623 {
1624 	struct mempolicy *pol;
1625 
1626 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1627 		bool ret = false;
1628 
1629 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1630 		if (pol && (pol->flags & MPOL_F_MOF))
1631 			ret = true;
1632 		mpol_cond_put(pol);
1633 
1634 		return ret;
1635 	}
1636 
1637 	pol = vma->vm_policy;
1638 	if (!pol)
1639 		pol = get_task_policy(current);
1640 
1641 	return pol->flags & MPOL_F_MOF;
1642 }
1643 
1644 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1645 {
1646 	enum zone_type dynamic_policy_zone = policy_zone;
1647 
1648 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1649 
1650 	/*
1651 	 * if policy->v.nodes has movable memory only,
1652 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1653 	 *
1654 	 * policy->v.nodes is intersected with node_states[N_MEMORY], so if
1655 	 * the following test fails, it implies that
1656 	 * policy->v.nodes has movable memory only.
1657 	 */
1658 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1659 		dynamic_policy_zone = ZONE_MOVABLE;
1660 
1661 	return zone >= dynamic_policy_zone;
1662 }
1663 
1664 /*
1665  * Return a nodemask representing a mempolicy for filtering nodes for
1666  * page allocation
1667  */
1668 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1669 {
1670 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1671 	if (unlikely(policy->mode == MPOL_BIND) &&
1672 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1673 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1674 		return &policy->v.nodes;
1675 
1676 	return NULL;
1677 }
1678 
1679 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1680 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1681 	int nd)
1682 {
1683 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1684 		nd = policy->v.preferred_node;
1685 	else {
1686 		/*
1687 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1688 		 * because we might easily break the expectation to stay on the
1689 		 * requested node and not break the policy.
1690 		 */
1691 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1692 	}
1693 
1694 	return node_zonelist(nd, gfp);
1695 }
1696 
1697 /* Do dynamic interleaving for a process */
1698 static unsigned interleave_nodes(struct mempolicy *policy)
1699 {
1700 	unsigned nid, next;
1701 	struct task_struct *me = current;
1702 
1703 	nid = me->il_next;
1704 	next = next_node_in(nid, policy->v.nodes);
1705 	if (next < MAX_NUMNODES)
1706 		me->il_next = next;
1707 	return nid;
1708 }
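
/*
 * Example (editor's note): with policy->v.nodes = {0,1,3} and
 * current->il_next == 1, this returns 1 and advances il_next to 3; the
 * following call returns 3 and wraps il_next back to 0.
 */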
1709 
1710 /*
1711  * Depending on the memory policy provide a node from which to allocate the
1712  * next slab entry.
1713  */
1714 unsigned int mempolicy_slab_node(void)
1715 {
1716 	struct mempolicy *policy;
1717 	int node = numa_mem_id();
1718 
1719 	if (in_interrupt())
1720 		return node;
1721 
1722 	policy = current->mempolicy;
1723 	if (!policy || policy->flags & MPOL_F_LOCAL)
1724 		return node;
1725 
1726 	switch (policy->mode) {
1727 	case MPOL_PREFERRED:
1728 		/*
1729 		 * handled MPOL_F_LOCAL above
1730 		 */
1731 		return policy->v.preferred_node;
1732 
1733 	case MPOL_INTERLEAVE:
1734 		return interleave_nodes(policy);
1735 
1736 	case MPOL_BIND: {
1737 		struct zoneref *z;
1738 
1739 		/*
1740 		 * Follow bind policy behavior and start allocation at the
1741 		 * first node.
1742 		 */
1743 		struct zonelist *zonelist;
1744 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1745 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1746 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1747 							&policy->v.nodes);
1748 		return z->zone ? z->zone->node : node;
1749 	}
1750 
1751 	default:
1752 		BUG();
1753 	}
1754 }
1755 
1756 /*
1757  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1758  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1759  * number of present nodes.
1760  */
1761 static unsigned offset_il_node(struct mempolicy *pol,
1762 			       struct vm_area_struct *vma, unsigned long n)
1763 {
1764 	unsigned nnodes = nodes_weight(pol->v.nodes);
1765 	unsigned target;
1766 	int i;
1767 	int nid;
1768 
1769 	if (!nnodes)
1770 		return numa_node_id();
1771 	target = (unsigned int)n % nnodes;
1772 	nid = first_node(pol->v.nodes);
1773 	for (i = 0; i < target; i++)
1774 		nid = next_node(nid, pol->v.nodes);
1775 	return nid;
1776 }
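
/*
 * Worked example (editor's note): with pol->v.nodes = {0,2,4} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop steps once from the
 * first node (0) to node 2, which is returned.
 */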
1777 
1778 /* Determine a node number for interleave */
1779 static inline unsigned interleave_nid(struct mempolicy *pol,
1780 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1781 {
1782 	if (vma) {
1783 		unsigned long off;
1784 
1785 		/*
1786 		 * for small pages, there is no difference between
1787 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1788 		 * for huge pages, since vm_pgoff is in units of small
1789 		 * pages, we need to shift off the always 0 bits to get
1790 		 * a useful offset.
1791 		 */
1792 		BUG_ON(shift < PAGE_SHIFT);
1793 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1794 		off += (addr - vma->vm_start) >> shift;
1795 		return offset_il_node(pol, vma, off);
1796 	} else
1797 		return interleave_nodes(pol);
1798 }
1799 
1800 #ifdef CONFIG_HUGETLBFS
1801 /*
1802  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1803  * @vma: virtual memory area whose policy is sought
1804  * @addr: address in @vma for shared policy lookup and interleave policy
1805  * @gfp_flags: for requested zone
1806  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1807  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1808  *
1809  * Returns a zonelist suitable for a huge page allocation and a pointer
1810  * to the struct mempolicy for conditional unref after allocation.
1811  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1812  * @nodemask for filtering the zonelist.
1813  *
1814  * Must be protected by read_mems_allowed_begin()
1815  */
1816 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1817 				gfp_t gfp_flags, struct mempolicy **mpol,
1818 				nodemask_t **nodemask)
1819 {
1820 	struct zonelist *zl;
1821 
1822 	*mpol = get_vma_policy(vma, addr);
1823 	*nodemask = NULL;	/* assume !MPOL_BIND */
1824 
1825 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1826 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1827 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1828 	} else {
1829 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1830 		if ((*mpol)->mode == MPOL_BIND)
1831 			*nodemask = &(*mpol)->v.nodes;
1832 	}
1833 	return zl;
1834 }
1835 
1836 /*
1837  * init_nodemask_of_mempolicy
1838  *
1839  * If the current task's mempolicy is "default" [NULL], return 'false'
1840  * to indicate default policy.  Otherwise, extract the policy nodemask
1841  * for 'bind' or 'interleave' policy into the argument nodemask, or
1842  * initialize the argument nodemask to contain the single node for
1843  * 'preferred' or 'local' policy and return 'true' to indicate presence
1844  * of non-default mempolicy.
1845  *
1846  * We don't bother with reference counting the mempolicy [mpol_get/put]
1847  * because the current task is examining its own mempolicy and a task's
1848  * mempolicy is only ever changed by the task itself.
1849  *
1850  * N.B., it is the caller's responsibility to free a returned nodemask.
1851  */
1852 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1853 {
1854 	struct mempolicy *mempolicy;
1855 	int nid;
1856 
1857 	if (!(mask && current->mempolicy))
1858 		return false;
1859 
1860 	task_lock(current);
1861 	mempolicy = current->mempolicy;
1862 	switch (mempolicy->mode) {
1863 	case MPOL_PREFERRED:
1864 		if (mempolicy->flags & MPOL_F_LOCAL)
1865 			nid = numa_node_id();
1866 		else
1867 			nid = mempolicy->v.preferred_node;
1868 		init_nodemask_of_node(mask, nid);
1869 		break;
1870 
1871 	case MPOL_BIND:
1872 		/* Fall through */
1873 	case MPOL_INTERLEAVE:
1874 		*mask = mempolicy->v.nodes;
1875 		break;
1876 
1877 	default:
1878 		BUG();
1879 	}
1880 	task_unlock(current);
1881 
1882 	return true;
1883 }
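/*
 * Usage sketch (illustrative only, loosely modelled on the hugetlb
 * nr_hugepages handlers):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);		// default policy
 *		nodes_allowed = &node_states[N_MEMORY];	// all memory nodes
 *	}
 *	// ... spread the pool over *nodes_allowed ...
 *	if (nodes_allowed != &node_states[N_MEMORY])
 *		NODEMASK_FREE(nodes_allowed);		// caller must free
 */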
1884 #endif
1885 
1886 /*
1887  * mempolicy_nodemask_intersects
1888  *
1889  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1890  * policy.  Otherwise, check for intersection between mask and the policy
1891  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1892  * policy, always return true since it may allocate elsewhere on fallback.
1893  *
1894  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1895  */
1896 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1897 					const nodemask_t *mask)
1898 {
1899 	struct mempolicy *mempolicy;
1900 	bool ret = true;
1901 
1902 	if (!mask)
1903 		return ret;
1904 	task_lock(tsk);
1905 	mempolicy = tsk->mempolicy;
1906 	if (!mempolicy)
1907 		goto out;
1908 
1909 	switch (mempolicy->mode) {
1910 	case MPOL_PREFERRED:
1911 		/*
1912 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1913 		 * to allocate from; the task may fall back to other nodes on OOM.
1914 		 * Thus, it's possible for tsk to have allocated memory from
1915 		 * nodes in mask.
1916 		 */
1917 		break;
1918 	case MPOL_BIND:
1919 	case MPOL_INTERLEAVE:
1920 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1921 		break;
1922 	default:
1923 		BUG();
1924 	}
1925 out:
1926 	task_unlock(tsk);
1927 	return ret;
1928 }
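/*
 * Usage sketch (illustrative only, loosely modelled on OOM victim
 * selection): callers skip tasks that cannot have memory on the
 * constrained nodes, e.g.
 *
 *	if (mask && !mempolicy_nodemask_intersects(tsk, mask))
 *		continue;	// killing tsk is unlikely to help here
 */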
1929 
1930 /* Allocate a page in interleaved policy.
1931    Own path because it needs to do special accounting. */
1932 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1933 					unsigned nid)
1934 {
1935 	struct zonelist *zl;
1936 	struct page *page;
1937 
1938 	zl = node_zonelist(nid, gfp);
1939 	page = __alloc_pages(gfp, order, zl);
1940 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1941 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1942 	return page;
1943 }
1944 
1945 /**
1946  * 	alloc_pages_vma	- Allocate a page for a VMA.
1947  *
1948  * 	@gfp:
1949  *      %GFP_USER    user allocation.
1950  *      %GFP_KERNEL  kernel allocations,
1951  *      %GFP_HIGHMEM highmem/user allocations,
1952  *      %GFP_FS      allocation should not call back into a file system.
1953  *      %GFP_ATOMIC  don't sleep.
1954  *
1955  *	@order: Order of the GFP allocation.
1956  * 	@vma:  Pointer to VMA or NULL if not available.
1957  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1958  *	@node: Which node to prefer for allocation (modulo policy).
1959  *	@hugepage: for hugepages try only the preferred node if possible
1960  *
1961  * 	This function allocates a page from the kernel page pool and applies
1962  *	a NUMA policy associated with the VMA or the current process.
1963  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1964  *	mm_struct of the VMA to prevent it from going away. Should be used for
1965  *	all allocations for pages that will be mapped into user space. Returns
1966  *	NULL when no page can be allocated.
1967  */
1968 struct page *
1969 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1970 		unsigned long addr, int node, bool hugepage)
1971 {
1972 	struct mempolicy *pol;
1973 	struct page *page;
1974 	unsigned int cpuset_mems_cookie;
1975 	struct zonelist *zl;
1976 	nodemask_t *nmask;
1977 
1978 retry_cpuset:
1979 	pol = get_vma_policy(vma, addr);
1980 	cpuset_mems_cookie = read_mems_allowed_begin();
1981 
1982 	if (pol->mode == MPOL_INTERLEAVE) {
1983 		unsigned nid;
1984 
1985 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1986 		mpol_cond_put(pol);
1987 		page = alloc_page_interleave(gfp, order, nid);
1988 		goto out;
1989 	}
1990 
1991 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1992 		int hpage_node = node;
1993 
1994 		/*
1995 		 * For hugepage allocation and non-interleave policy which
1996 		 * allows the current node (or other explicitly preferred
1997 		 * node) we only try to allocate from the current/preferred
1998 		 * node and don't fall back to other nodes, as the cost of
1999 		 * remote accesses would likely offset THP benefits.
2000 		 *
2001 		 * If the policy is interleave, or does not allow the current
2002 		 * node in its nodemask, we allocate the standard way.
2003 		 */
2004 		if (pol->mode == MPOL_PREFERRED &&
2005 						!(pol->flags & MPOL_F_LOCAL))
2006 			hpage_node = pol->v.preferred_node;
2007 
2008 		nmask = policy_nodemask(gfp, pol);
2009 		if (!nmask || node_isset(hpage_node, *nmask)) {
2010 			mpol_cond_put(pol);
2011 			page = __alloc_pages_node(hpage_node,
2012 						gfp | __GFP_THISNODE, order);
2013 			goto out;
2014 		}
2015 	}
2016 
2017 	nmask = policy_nodemask(gfp, pol);
2018 	zl = policy_zonelist(gfp, pol, node);
2019 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2020 	mpol_cond_put(pol);
2021 out:
2022 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2023 		goto retry_cpuset;
2024 	return page;
2025 }
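/*
 * Usage sketch (illustrative only): a typical caller holds mmap_sem for
 * read and requests a single user page at the faulting address, e.g.
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id(), false);
 *
 * which is roughly what the alloc_page_vma() helper in gfp.h expands to.
 */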
2026 
2027 /**
2028  * 	alloc_pages_current - Allocate pages.
2029  *
2030  *	@gfp:
2031  *		%GFP_USER   user allocation,
2032  *      	%GFP_KERNEL kernel allocation,
2033  *      	%GFP_HIGHMEM highmem allocation,
2034  *      	%GFP_FS     don't call back into a file system.
2035  *      	%GFP_ATOMIC don't sleep.
2036  *	@order: Power of two of allocation size in pages. 0 is a single page.
2037  *
2038  *	Allocate a page from the kernel page pool.  When not in
2039  *	interrupt context, apply the current process' NUMA policy.
2040  *	Returns NULL when no page can be allocated.
2041  *
2042  *	Don't call cpuset_update_task_memory_state() unless
2043  *	1) it's ok to take cpuset_sem (can WAIT), and
2044  *	2) allocating for current task (not interrupt).
2045  */
2046 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2047 {
2048 	struct mempolicy *pol = &default_policy;
2049 	struct page *page;
2050 	unsigned int cpuset_mems_cookie;
2051 
2052 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2053 		pol = get_task_policy(current);
2054 
2055 retry_cpuset:
2056 	cpuset_mems_cookie = read_mems_allowed_begin();
2057 
2058 	/*
2059 	 * No reference counting needed for current->mempolicy
2060 	 * nor system default_policy
2061 	 */
2062 	if (pol->mode == MPOL_INTERLEAVE)
2063 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2064 	else
2065 		page = __alloc_pages_nodemask(gfp, order,
2066 				policy_zonelist(gfp, pol, numa_node_id()),
2067 				policy_nodemask(gfp, pol));
2068 
2069 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2070 		goto retry_cpuset;
2071 
2072 	return page;
2073 }
2074 EXPORT_SYMBOL(alloc_pages_current);
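/*
 * Illustrative note: on CONFIG_NUMA kernels the generic alloc_pages()
 * helper in gfp.h resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	// 4 contiguous pages
 *
 * honours the current task's mempolicy unless called from interrupt
 * context or with __GFP_THISNODE.
 */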
2075 
2076 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2077 {
2078 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2079 
2080 	if (IS_ERR(pol))
2081 		return PTR_ERR(pol);
2082 	dst->vm_policy = pol;
2083 	return 0;
2084 }
2085 
2086 /*
2087  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2088  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2089  * with the mems_allowed returned by cpuset_mems_allowed().  This
2090  * keeps mempolicies cpuset relative after the owning task's cpuset moves.
2091  * See further kernel/cpuset.c update_nodemask().
2092  *
2093  * current's mempolicy may be rebound by another task (the task that changes
2094  * the cpuset's mems), so we needn't do the rebind work for the current task.
2095  */
2096 
2097 /* Slow path of a mempolicy duplicate */
2098 struct mempolicy *__mpol_dup(struct mempolicy *old)
2099 {
2100 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2101 
2102 	if (!new)
2103 		return ERR_PTR(-ENOMEM);
2104 
2105 	/* task's mempolicy is protected by alloc_lock */
2106 	if (old == current->mempolicy) {
2107 		task_lock(current);
2108 		*new = *old;
2109 		task_unlock(current);
2110 	} else
2111 		*new = *old;
2112 
2113 	if (current_cpuset_is_being_rebound()) {
2114 		nodemask_t mems = cpuset_mems_allowed(current);
2115 		if (new->flags & MPOL_F_REBINDING)
2116 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2117 		else
2118 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2119 	}
2120 	atomic_set(&new->refcnt, 1);
2121 	return new;
2122 }
2123 
2124 /* Slow path of a mempolicy comparison */
2125 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2126 {
2127 	if (!a || !b)
2128 		return false;
2129 	if (a->mode != b->mode)
2130 		return false;
2131 	if (a->flags != b->flags)
2132 		return false;
2133 	if (mpol_store_user_nodemask(a))
2134 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2135 			return false;
2136 
2137 	switch (a->mode) {
2138 	case MPOL_BIND:
2139 		/* Fall through */
2140 	case MPOL_INTERLEAVE:
2141 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2142 	case MPOL_PREFERRED:
2143 		return a->v.preferred_node == b->v.preferred_node;
2144 	default:
2145 		BUG();
2146 		return false;
2147 	}
2148 }
2149 
2150 /*
2151  * Shared memory backing store policy support.
2152  *
2153  * Remember policies even when nobody has shared memory mapped.
2154  * The policies are kept in a Red-Black tree linked from the inode.
2155  * They are protected by the sp->lock rwlock, which should be held
2156  * for any accesses to the tree.
2157  */
2158 
2159 /*
2160  * Lookup the first element intersecting start-end.  Caller holds sp->lock for
2161  * reading or for writing
2162  */
2163 static struct sp_node *
2164 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2165 {
2166 	struct rb_node *n = sp->root.rb_node;
2167 
2168 	while (n) {
2169 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2170 
2171 		if (start >= p->end)
2172 			n = n->rb_right;
2173 		else if (end <= p->start)
2174 			n = n->rb_left;
2175 		else
2176 			break;
2177 	}
2178 	if (!n)
2179 		return NULL;
2180 	for (;;) {
2181 		struct sp_node *w = NULL;
2182 		struct rb_node *prev = rb_prev(n);
2183 		if (!prev)
2184 			break;
2185 		w = rb_entry(prev, struct sp_node, nd);
2186 		if (w->end <= start)
2187 			break;
2188 		n = prev;
2189 	}
2190 	return rb_entry(n, struct sp_node, nd);
2191 }
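/*
 * Worked example (illustrative, not part of the original source): with
 * ranges [2,4) and [4,8) in the tree, a lookup for [3,6) first descends
 * to [4,8), which overlaps the search window.  The backward walk then
 * visits [2,4); its end (4) is greater than start (3), so it still
 * intersects and becomes the answer: the first intersecting range.
 */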
2192 
2193 /*
2194  * Insert a new shared policy into the list.  Caller holds sp->lock for
2195  * writing.
2196  */
2197 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2198 {
2199 	struct rb_node **p = &sp->root.rb_node;
2200 	struct rb_node *parent = NULL;
2201 	struct sp_node *nd;
2202 
2203 	while (*p) {
2204 		parent = *p;
2205 		nd = rb_entry(parent, struct sp_node, nd);
2206 		if (new->start < nd->start)
2207 			p = &(*p)->rb_left;
2208 		else if (new->end > nd->end)
2209 			p = &(*p)->rb_right;
2210 		else
2211 			BUG();
2212 	}
2213 	rb_link_node(&new->nd, parent, p);
2214 	rb_insert_color(&new->nd, &sp->root);
2215 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2216 		 new->policy ? new->policy->mode : 0);
2217 }
2218 
2219 /* Find shared policy intersecting idx */
2220 struct mempolicy *
2221 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2222 {
2223 	struct mempolicy *pol = NULL;
2224 	struct sp_node *sn;
2225 
2226 	if (!sp->root.rb_node)
2227 		return NULL;
2228 	read_lock(&sp->lock);
2229 	sn = sp_lookup(sp, idx, idx+1);
2230 	if (sn) {
2231 		mpol_get(sn->policy);
2232 		pol = sn->policy;
2233 	}
2234 	read_unlock(&sp->lock);
2235 	return pol;
2236 }
2237 
2238 static void sp_free(struct sp_node *n)
2239 {
2240 	mpol_put(n->policy);
2241 	kmem_cache_free(sn_cache, n);
2242 }
2243 
2244 /**
2245  * mpol_misplaced - check whether current page node is valid in policy
2246  *
2247  * @page: page to be checked
2248  * @vma: vm area where page mapped
2249  * @addr: virtual address where page mapped
2250  *
2251  * Lookup the current policy node id for vma,addr and compare it to the
2252  * page's node id.
2253  *
2254  * Returns:
2255  *	-1	- not misplaced, page is in the right node
2256  *	node	- node id where the page should be
2257  *
2258  * Policy determination "mimics" alloc_page_vma().
2259  * Called from fault path where we know the vma and faulting address.
2260  */
2261 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2262 {
2263 	struct mempolicy *pol;
2264 	struct zoneref *z;
2265 	int curnid = page_to_nid(page);
2266 	unsigned long pgoff;
2267 	int thiscpu = raw_smp_processor_id();
2268 	int thisnid = cpu_to_node(thiscpu);
2269 	int polnid = -1;
2270 	int ret = -1;
2271 
2272 	BUG_ON(!vma);
2273 
2274 	pol = get_vma_policy(vma, addr);
2275 	if (!(pol->flags & MPOL_F_MOF))
2276 		goto out;
2277 
2278 	switch (pol->mode) {
2279 	case MPOL_INTERLEAVE:
2280 		BUG_ON(addr >= vma->vm_end);
2281 		BUG_ON(addr < vma->vm_start);
2282 
2283 		pgoff = vma->vm_pgoff;
2284 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2285 		polnid = offset_il_node(pol, vma, pgoff);
2286 		break;
2287 
2288 	case MPOL_PREFERRED:
2289 		if (pol->flags & MPOL_F_LOCAL)
2290 			polnid = numa_node_id();
2291 		else
2292 			polnid = pol->v.preferred_node;
2293 		break;
2294 
2295 	case MPOL_BIND:
2296 
2297 		/*
2298 		 * MPOL_BIND allows binding to multiple nodes.
2299 		 * Use the current page's node if it is in the policy nodemask,
2300 		 * else select the nearest allowed node, if any.
2301 		 * If there are no allowed nodes, use the current node [!misplaced].
2302 		 */
2303 		if (node_isset(curnid, pol->v.nodes))
2304 			goto out;
2305 		z = first_zones_zonelist(
2306 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2307 				gfp_zone(GFP_HIGHUSER),
2308 				&pol->v.nodes);
2309 		polnid = z->zone->node;
2310 		break;
2311 
2312 	default:
2313 		BUG();
2314 	}
2315 
2316 	/* Migrate the page towards the node whose CPU is referencing it */
2317 	if (pol->flags & MPOL_F_MORON) {
2318 		polnid = thisnid;
2319 
2320 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2321 			goto out;
2322 	}
2323 
2324 	if (curnid != polnid)
2325 		ret = polnid;
2326 out:
2327 	mpol_cond_put(pol);
2328 
2329 	return ret;
2330 }
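/*
 * Usage sketch (illustrative only, loosely modelled on the NUMA hinting
 * fault path; the TNF_MIGRATED bookkeeping is an assumption about the
 * caller, not something this function requires):
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		goto keep;		// page is already on an allowed node
 *	if (migrate_misplaced_page(page, vma, target_nid))
 *		flags |= TNF_MIGRATED;
 */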
2331 
2332 /*
2333  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2334  * dropped after task->mempolicy is set to NULL so that any allocation done as
2335  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2336  * policy.
2337  */
2338 void mpol_put_task_policy(struct task_struct *task)
2339 {
2340 	struct mempolicy *pol;
2341 
2342 	task_lock(task);
2343 	pol = task->mempolicy;
2344 	task->mempolicy = NULL;
2345 	task_unlock(task);
2346 	mpol_put(pol);
2347 }
2348 
2349 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2350 {
2351 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2352 	rb_erase(&n->nd, &sp->root);
2353 	sp_free(n);
2354 }
2355 
2356 static void sp_node_init(struct sp_node *node, unsigned long start,
2357 			unsigned long end, struct mempolicy *pol)
2358 {
2359 	node->start = start;
2360 	node->end = end;
2361 	node->policy = pol;
2362 }
2363 
2364 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2365 				struct mempolicy *pol)
2366 {
2367 	struct sp_node *n;
2368 	struct mempolicy *newpol;
2369 
2370 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2371 	if (!n)
2372 		return NULL;
2373 
2374 	newpol = mpol_dup(pol);
2375 	if (IS_ERR(newpol)) {
2376 		kmem_cache_free(sn_cache, n);
2377 		return NULL;
2378 	}
2379 	newpol->flags |= MPOL_F_SHARED;
2380 	sp_node_init(n, start, end, newpol);
2381 
2382 	return n;
2383 }
2384 
2385 /* Replace a policy range. */
2386 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2387 				 unsigned long end, struct sp_node *new)
2388 {
2389 	struct sp_node *n;
2390 	struct sp_node *n_new = NULL;
2391 	struct mempolicy *mpol_new = NULL;
2392 	int ret = 0;
2393 
2394 restart:
2395 	write_lock(&sp->lock);
2396 	n = sp_lookup(sp, start, end);
2397 	/* Take care of old policies in the same range. */
2398 	while (n && n->start < end) {
2399 		struct rb_node *next = rb_next(&n->nd);
2400 		if (n->start >= start) {
2401 			if (n->end <= end)
2402 				sp_delete(sp, n);
2403 			else
2404 				n->start = end;
2405 		} else {
2406 			/* Old policy spanning whole new range. */
2407 			if (n->end > end) {
2408 				if (!n_new)
2409 					goto alloc_new;
2410 
2411 				*mpol_new = *n->policy;
2412 				atomic_set(&mpol_new->refcnt, 1);
2413 				sp_node_init(n_new, end, n->end, mpol_new);
2414 				n->end = start;
2415 				sp_insert(sp, n_new);
2416 				n_new = NULL;
2417 				mpol_new = NULL;
2418 				break;
2419 			} else
2420 				n->end = start;
2421 		}
2422 		if (!next)
2423 			break;
2424 		n = rb_entry(next, struct sp_node, nd);
2425 	}
2426 	if (new)
2427 		sp_insert(sp, new);
2428 	write_unlock(&sp->lock);
2429 	ret = 0;
2430 
2431 err_out:
2432 	if (mpol_new)
2433 		mpol_put(mpol_new);
2434 	if (n_new)
2435 		kmem_cache_free(sn_cache, n_new);
2436 
2437 	return ret;
2438 
2439 alloc_new:
2440 	write_unlock(&sp->lock);
2441 	ret = -ENOMEM;
2442 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2443 	if (!n_new)
2444 		goto err_out;
2445 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2446 	if (!mpol_new)
2447 		goto err_out;
2448 	goto restart;
2449 }
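/*
 * Worked example (illustrative, not part of the original source): if the
 * tree holds one node [0,10) with policy A and the caller installs policy
 * B over [3,6), the old node is trimmed to [0,3), a copy of A is inserted
 * as [6,10) using the n_new/mpol_new pair preallocated under alloc_new,
 * and finally the new node [3,6) carrying B is inserted.
 */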
2450 
2451 /**
2452  * mpol_shared_policy_init - initialize shared policy for inode
2453  * @sp: pointer to inode shared policy
2454  * @mpol:  struct mempolicy to install
2455  *
2456  * Install non-NULL @mpol in inode's shared policy rb-tree.
2457  * On entry, the current task has a reference on a non-NULL @mpol.
2458  * This must be released on exit.
2459  * This is called during get_inode() calls, so we can use GFP_KERNEL.
2460  */
2461 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2462 {
2463 	int ret;
2464 
2465 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2466 	rwlock_init(&sp->lock);
2467 
2468 	if (mpol) {
2469 		struct vm_area_struct pvma;
2470 		struct mempolicy *new;
2471 		NODEMASK_SCRATCH(scratch);
2472 
2473 		if (!scratch)
2474 			goto put_mpol;
2475 		/* contextualize the tmpfs mount point mempolicy */
2476 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2477 		if (IS_ERR(new))
2478 			goto free_scratch; /* no valid nodemask intersection */
2479 
2480 		task_lock(current);
2481 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2482 		task_unlock(current);
2483 		if (ret)
2484 			goto put_new;
2485 
2486 		/* Create pseudo-vma that contains just the policy */
2487 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2488 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2489 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2490 
2491 put_new:
2492 		mpol_put(new);			/* drop initial ref */
2493 free_scratch:
2494 		NODEMASK_SCRATCH_FREE(scratch);
2495 put_mpol:
2496 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2497 	}
2498 }
2499 
2500 int mpol_set_shared_policy(struct shared_policy *info,
2501 			struct vm_area_struct *vma, struct mempolicy *npol)
2502 {
2503 	int err;
2504 	struct sp_node *new = NULL;
2505 	unsigned long sz = vma_pages(vma);
2506 
2507 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2508 		 vma->vm_pgoff,
2509 		 sz, npol ? npol->mode : -1,
2510 		 npol ? npol->flags : -1,
2511 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2512 
2513 	if (npol) {
2514 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2515 		if (!new)
2516 			return -ENOMEM;
2517 	}
2518 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2519 	if (err && new)
2520 		sp_free(new);
2521 	return err;
2522 }
2523 
2524 /* Free a backing policy store on inode delete. */
2525 void mpol_free_shared_policy(struct shared_policy *p)
2526 {
2527 	struct sp_node *n;
2528 	struct rb_node *next;
2529 
2530 	if (!p->root.rb_node)
2531 		return;
2532 	write_lock(&p->lock);
2533 	next = rb_first(&p->root);
2534 	while (next) {
2535 		n = rb_entry(next, struct sp_node, nd);
2536 		next = rb_next(&n->nd);
2537 		sp_delete(p, n);
2538 	}
2539 	write_unlock(&p->lock);
2540 }
2541 
2542 #ifdef CONFIG_NUMA_BALANCING
2543 static int __initdata numabalancing_override;
2544 
2545 static void __init check_numabalancing_enable(void)
2546 {
2547 	bool numabalancing_default = false;
2548 
2549 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2550 		numabalancing_default = true;
2551 
2552 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2553 	if (numabalancing_override)
2554 		set_numabalancing_state(numabalancing_override == 1);
2555 
2556 	if (num_online_nodes() > 1 && !numabalancing_override) {
2557 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2558 			numabalancing_default ? "Enabling" : "Disabling");
2559 		set_numabalancing_state(numabalancing_default);
2560 	}
2561 }
2562 
2563 static int __init setup_numabalancing(char *str)
2564 {
2565 	int ret = 0;
2566 	if (!str)
2567 		goto out;
2568 
2569 	if (!strcmp(str, "enable")) {
2570 		numabalancing_override = 1;
2571 		ret = 1;
2572 	} else if (!strcmp(str, "disable")) {
2573 		numabalancing_override = -1;
2574 		ret = 1;
2575 	}
2576 out:
2577 	if (!ret)
2578 		pr_warn("Unable to parse numa_balancing=\n");
2579 
2580 	return ret;
2581 }
2582 __setup("numa_balancing=", setup_numabalancing);
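/*
 * Example (illustrative): booting with "numa_balancing=disable" on the
 * kernel command line sets numabalancing_override to -1, so
 * check_numabalancing_enable() keeps automatic NUMA balancing off even on
 * multi-node machines; "numa_balancing=enable" forces it on.
 */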
2583 #else
2584 static inline void __init check_numabalancing_enable(void)
2585 {
2586 }
2587 #endif /* CONFIG_NUMA_BALANCING */
2588 
2589 /* assumes fs == KERNEL_DS */
2590 void __init numa_policy_init(void)
2591 {
2592 	nodemask_t interleave_nodes;
2593 	unsigned long largest = 0;
2594 	int nid, prefer = 0;
2595 
2596 	policy_cache = kmem_cache_create("numa_policy",
2597 					 sizeof(struct mempolicy),
2598 					 0, SLAB_PANIC, NULL);
2599 
2600 	sn_cache = kmem_cache_create("shared_policy_node",
2601 				     sizeof(struct sp_node),
2602 				     0, SLAB_PANIC, NULL);
2603 
2604 	for_each_node(nid) {
2605 		preferred_node_policy[nid] = (struct mempolicy) {
2606 			.refcnt = ATOMIC_INIT(1),
2607 			.mode = MPOL_PREFERRED,
2608 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2609 			.v = { .preferred_node = nid, },
2610 		};
2611 	}
2612 
2613 	/*
2614 	 * Set interleaving policy for system init. Interleaving is only
2615 	 * enabled across suitably sized nodes (default is >= 16MB), or
2616 	 * fall back to the largest node if they're all smaller.
2617 	 */
2618 	nodes_clear(interleave_nodes);
2619 	for_each_node_state(nid, N_MEMORY) {
2620 		unsigned long total_pages = node_present_pages(nid);
2621 
2622 		/* Preserve the largest node */
2623 		if (largest < total_pages) {
2624 			largest = total_pages;
2625 			prefer = nid;
2626 		}
2627 
2628 		/* Interleave this node? */
2629 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2630 			node_set(nid, interleave_nodes);
2631 	}
2632 
2633 	/* All too small, use the largest */
2634 	if (unlikely(nodes_empty(interleave_nodes)))
2635 		node_set(prefer, interleave_nodes);
2636 
2637 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2638 		pr_err("%s: interleaving failed\n", __func__);
2639 
2640 	check_numabalancing_enable();
2641 }
2642 
2643 /* Reset policy of current process to default */
2644 void numa_default_policy(void)
2645 {
2646 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2647 }
2648 
2649 /*
2650  * Parse and format mempolicy from/to strings
2651  */
2652 
2653 /*
2654  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2655  */
2656 static const char * const policy_modes[] =
2657 {
2658 	[MPOL_DEFAULT]    = "default",
2659 	[MPOL_PREFERRED]  = "prefer",
2660 	[MPOL_BIND]       = "bind",
2661 	[MPOL_INTERLEAVE] = "interleave",
2662 	[MPOL_LOCAL]      = "local",
2663 };
2664 
2665 
2666 #ifdef CONFIG_TMPFS
2667 /**
2668  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2669  * @str:  string containing mempolicy to parse
2670  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2671  *
2672  * Format of input:
2673  *	<mode>[=<flags>][:<nodelist>]
2674  *
2675  * On success, returns 0, else 1
2676  */
2677 int mpol_parse_str(char *str, struct mempolicy **mpol)
2678 {
2679 	struct mempolicy *new = NULL;
2680 	unsigned short mode;
2681 	unsigned short mode_flags;
2682 	nodemask_t nodes;
2683 	char *nodelist = strchr(str, ':');
2684 	char *flags = strchr(str, '=');
2685 	int err = 1;
2686 
2687 	if (nodelist) {
2688 		/* NUL-terminate mode or flags string */
2689 		*nodelist++ = '\0';
2690 		if (nodelist_parse(nodelist, nodes))
2691 			goto out;
2692 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2693 			goto out;
2694 	} else
2695 		nodes_clear(nodes);
2696 
2697 	if (flags)
2698 		*flags++ = '\0';	/* terminate mode string */
2699 
2700 	for (mode = 0; mode < MPOL_MAX; mode++) {
2701 		if (!strcmp(str, policy_modes[mode])) {
2702 			break;
2703 		}
2704 	}
2705 	if (mode >= MPOL_MAX)
2706 		goto out;
2707 
2708 	switch (mode) {
2709 	case MPOL_PREFERRED:
2710 		/*
2711 		 * Insist on a nodelist of one node only
2712 		 */
2713 		if (nodelist) {
2714 			char *rest = nodelist;
2715 			while (isdigit(*rest))
2716 				rest++;
2717 			if (*rest)
2718 				goto out;
2719 		}
2720 		break;
2721 	case MPOL_INTERLEAVE:
2722 		/*
2723 		 * Default to online nodes with memory if no nodelist
2724 		 */
2725 		if (!nodelist)
2726 			nodes = node_states[N_MEMORY];
2727 		break;
2728 	case MPOL_LOCAL:
2729 		/*
2730 		 * Don't allow a nodelist;  mpol_new() checks flags
2731 		 */
2732 		if (nodelist)
2733 			goto out;
2734 		mode = MPOL_PREFERRED;
2735 		break;
2736 	case MPOL_DEFAULT:
2737 		/*
2738 		 * Insist on an empty nodelist
2739 		 */
2740 		if (!nodelist)
2741 			err = 0;
2742 		goto out;
2743 	case MPOL_BIND:
2744 		/*
2745 		 * Insist on a nodelist
2746 		 */
2747 		if (!nodelist)
2748 			goto out;
2749 	}
2750 
2751 	mode_flags = 0;
2752 	if (flags) {
2753 		/*
2754 		 * Currently, we only support two mutually exclusive
2755 		 * mode flags.
2756 		 */
2757 		if (!strcmp(flags, "static"))
2758 			mode_flags |= MPOL_F_STATIC_NODES;
2759 		else if (!strcmp(flags, "relative"))
2760 			mode_flags |= MPOL_F_RELATIVE_NODES;
2761 		else
2762 			goto out;
2763 	}
2764 
2765 	new = mpol_new(mode, mode_flags, &nodes);
2766 	if (IS_ERR(new))
2767 		goto out;
2768 
2769 	/*
2770 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2771 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2772 	 */
2773 	if (mode != MPOL_PREFERRED)
2774 		new->v.nodes = nodes;
2775 	else if (nodelist)
2776 		new->v.preferred_node = first_node(nodes);
2777 	else
2778 		new->flags |= MPOL_F_LOCAL;
2779 
2780 	/*
2781 	 * Save nodes for contextualization: this will be used to "clone"
2782 	 * the mempolicy in a specific context [cpuset] at a later time.
2783 	 */
2784 	new->w.user_nodemask = nodes;
2785 
2786 	err = 0;
2787 
2788 out:
2789 	/* Restore string for error message */
2790 	if (nodelist)
2791 		*--nodelist = ':';
2792 	if (flags)
2793 		*--flags = '=';
2794 	if (!err)
2795 		*mpol = new;
2796 	return err;
2797 }
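/*
 * Examples (illustrative) of strings accepted above, as seen in the tmpfs
 * "mpol=" mount option:
 *
 *	"interleave"			interleave over all nodes with memory
 *	"interleave=static:0-3"		interleave over nodes 0-3, static mask
 *	"prefer:1"			prefer node 1
 *	"bind=relative:0,2"		bind to nodes 0 and 2, cpuset-relative
 *	"local"				allocate on the faulting CPU's node
 *	"default"			fall back to the default policy
 */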
2798 #endif /* CONFIG_TMPFS */
2799 
2800 /**
2801  * mpol_to_str - format a mempolicy structure for printing
2802  * @buffer:  to contain formatted mempolicy string
2803  * @maxlen:  length of @buffer
2804  * @pol:  pointer to mempolicy to be formatted
2805  *
2806  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2807  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2808  * longest flag, "relative", and to display at least a few node ids.
2809  */
2810 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2811 {
2812 	char *p = buffer;
2813 	nodemask_t nodes = NODE_MASK_NONE;
2814 	unsigned short mode = MPOL_DEFAULT;
2815 	unsigned short flags = 0;
2816 
2817 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2818 		mode = pol->mode;
2819 		flags = pol->flags;
2820 	}
2821 
2822 	switch (mode) {
2823 	case MPOL_DEFAULT:
2824 		break;
2825 	case MPOL_PREFERRED:
2826 		if (flags & MPOL_F_LOCAL)
2827 			mode = MPOL_LOCAL;
2828 		else
2829 			node_set(pol->v.preferred_node, nodes);
2830 		break;
2831 	case MPOL_BIND:
2832 	case MPOL_INTERLEAVE:
2833 		nodes = pol->v.nodes;
2834 		break;
2835 	default:
2836 		WARN_ON_ONCE(1);
2837 		snprintf(p, maxlen, "unknown");
2838 		return;
2839 	}
2840 
2841 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2842 
2843 	if (flags & MPOL_MODE_FLAGS) {
2844 		p += snprintf(p, buffer + maxlen - p, "=");
2845 
2846 		/*
2847 		 * Currently, the only defined flags are mutually exclusive
2848 		 */
2849 		if (flags & MPOL_F_STATIC_NODES)
2850 			p += snprintf(p, buffer + maxlen - p, "static");
2851 		else if (flags & MPOL_F_RELATIVE_NODES)
2852 			p += snprintf(p, buffer + maxlen - p, "relative");
2853 	}
2854 
2855 	if (!nodes_empty(nodes))
2856 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2857 			       nodemask_pr_args(&nodes));
2858 }
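/*
 * Examples (illustrative) of strings produced by mpol_to_str():
 *
 *	"default"			MPOL_DEFAULT
 *	"prefer:2"			MPOL_PREFERRED, node 2
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL
 *	"bind:0-1"			MPOL_BIND over nodes 0 and 1
 *	"interleave=static:0,2-3"	MPOL_INTERLEAVE, static nodemask
 */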
2859