xref: /openbmc/linux/mm/mempolicy.c (revision f3a8b664)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
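/*
 * Editor's note: an illustrative, untested userspace sketch of how the
 * policies described above are typically requested through the syscalls
 * implemented in this file (prototypes as exposed by libnuma's <numaif.h>);
 * it is not part of the kernel code and the nodemask width handling is
 * deliberately simplified:
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave this task's future allocations across nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8 + 1);
 *
 *	// restrict one mapping to node 0 only, failing hard on violation
 *	unsigned long node0 = 1UL << 0;
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(buf, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8 + 1,
 *	      MPOL_MF_STRICT);
 */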
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 
101 #include "internal.h"
102 
103 /* Internal flags */
104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
106 
107 static struct kmem_cache *policy_cache;
108 static struct kmem_cache *sn_cache;
109 
110 /* Highest zone. A specific allocation for a zone below that is not
111    policied. */
112 enum zone_type policy_zone = 0;
113 
114 /*
115  * run-time system-wide default policy => local allocation
116  */
117 static struct mempolicy default_policy = {
118 	.refcnt = ATOMIC_INIT(1), /* never free it */
119 	.mode = MPOL_PREFERRED,
120 	.flags = MPOL_F_LOCAL,
121 };
122 
123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
124 
125 struct mempolicy *get_task_policy(struct task_struct *p)
126 {
127 	struct mempolicy *pol = p->mempolicy;
128 	int node;
129 
130 	if (pol)
131 		return pol;
132 
133 	node = numa_node_id();
134 	if (node != NUMA_NO_NODE) {
135 		pol = &preferred_node_policy[node];
136 		/* preferred_node_policy is not initialised early in boot */
137 		if (pol->mode)
138 			return pol;
139 	}
140 
141 	return &default_policy;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If the read-side task has no lock to protect task->mempolicy, the
148 	 * write-side task will rebind task->mempolicy in two steps. The first
149 	 * step sets all the newly allowed nodes, and the second step clears
150 	 * all the disallowed nodes. This way we avoid ending up with no node
151 	 * to allocate pages from.
152 	 * If we have a lock to protect task->mempolicy on the read side, we
153 	 * rebind directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
163 
164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
165 {
166 	return pol->flags & MPOL_MODE_FLAGS;
167 }
168 
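/*
 * Editor's note (illustrative example, not authoritative): with
 * MPOL_F_RELATIVE_NODES the user's nodemask is interpreted relative to the
 * set of allowed nodes.  E.g. if *orig is {0,2} and the cpuset allows
 * *rel = {4,5,6} (weight 3), nodes_fold() wraps *orig into 3 bits, giving
 * {0,2}, and nodes_onto() maps those positions onto the allowed set,
 * yielding *ret = {4,6}.
 */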
169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
170 				   const nodemask_t *rel)
171 {
172 	nodemask_t tmp;
173 	nodes_fold(tmp, *orig, nodes_weight(*rel));
174 	nodes_onto(*ret, tmp, *rel);
175 }
176 
177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (nodes_empty(*nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
186 {
187 	if (!nodes)
188 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
189 	else if (nodes_empty(*nodes))
190 		return -EINVAL;			/*  no allowed nodes */
191 	else
192 		pol->v.preferred_node = first_node(*nodes);
193 	return 0;
194 }
195 
196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
197 {
198 	if (nodes_empty(*nodes))
199 		return -EINVAL;
200 	pol->v.nodes = *nodes;
201 	return 0;
202 }
203 
204 /*
205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
206  * any, for the new policy.  mpol_new() has already validated the nodes
207  * parameter with respect to the policy mode and flags.  But, we need to
208  * handle an empty nodemask with MPOL_PREFERRED here.
209  *
210  * Must be called holding task's alloc_lock to protect task's mems_allowed
211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
212  */
213 static int mpol_set_nodemask(struct mempolicy *pol,
214 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
215 {
216 	int ret;
217 
218 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
219 	if (pol == NULL)
220 		return 0;
221 	/* Check N_MEMORY */
222 	nodes_and(nsc->mask1,
223 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
224 
225 	VM_BUG_ON(!nodes);
226 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
227 		nodes = NULL;	/* explicit local allocation */
228 	else {
229 		if (pol->flags & MPOL_F_RELATIVE_NODES)
230 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
231 		else
232 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
233 
234 		if (mpol_store_user_nodemask(pol))
235 			pol->w.user_nodemask = *nodes;
236 		else
237 			pol->w.cpuset_mems_allowed =
238 						cpuset_current_mems_allowed;
239 	}
240 
241 	if (nodes)
242 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
243 	else
244 		ret = mpol_ops[pol->mode].create(pol, NULL);
245 	return ret;
246 }
247 
248 /*
249  * This function just creates a new policy, does some checks and simple
250  * initialization. You must invoke mpol_set_nodemask() to set the nodes.
251  */
252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
253 				  nodemask_t *nodes)
254 {
255 	struct mempolicy *policy;
256 
257 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
258 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
259 
260 	if (mode == MPOL_DEFAULT) {
261 		if (nodes && !nodes_empty(*nodes))
262 			return ERR_PTR(-EINVAL);
263 		return NULL;
264 	}
265 	VM_BUG_ON(!nodes);
266 
267 	/*
268 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
269 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
270 	 * All other modes require a valid pointer to a non-empty nodemask.
271 	 */
272 	if (mode == MPOL_PREFERRED) {
273 		if (nodes_empty(*nodes)) {
274 			if (((flags & MPOL_F_STATIC_NODES) ||
275 			     (flags & MPOL_F_RELATIVE_NODES)))
276 				return ERR_PTR(-EINVAL);
277 		}
278 	} else if (mode == MPOL_LOCAL) {
279 		if (!nodes_empty(*nodes))
280 			return ERR_PTR(-EINVAL);
281 		mode = MPOL_PREFERRED;
282 	} else if (nodes_empty(*nodes))
283 		return ERR_PTR(-EINVAL);
284 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
285 	if (!policy)
286 		return ERR_PTR(-ENOMEM);
287 	atomic_set(&policy->refcnt, 1);
288 	policy->mode = mode;
289 	policy->flags = flags;
290 
291 	return policy;
292 }
293 
294 /* Slow path of a mpol destructor. */
295 void __mpol_put(struct mempolicy *p)
296 {
297 	if (!atomic_dec_and_test(&p->refcnt))
298 		return;
299 	kmem_cache_free(policy_cache, p);
300 }
301 
302 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
303 				enum mpol_rebind_step step)
304 {
305 }
306 
307 /*
308  * step:
309  * 	MPOL_REBIND_ONCE  - do the rebind work at once
310  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
311  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
312  */
313 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
314 				 enum mpol_rebind_step step)
315 {
316 	nodemask_t tmp;
317 
318 	if (pol->flags & MPOL_F_STATIC_NODES)
319 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
320 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
321 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
322 	else {
323 		/*
324 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
325 		 * result
326 		 */
327 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
328 			nodes_remap(tmp, pol->v.nodes,
329 					pol->w.cpuset_mems_allowed, *nodes);
330 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
331 		} else if (step == MPOL_REBIND_STEP2) {
332 			tmp = pol->w.cpuset_mems_allowed;
333 			pol->w.cpuset_mems_allowed = *nodes;
334 		} else
335 			BUG();
336 	}
337 
338 	if (nodes_empty(tmp))
339 		tmp = *nodes;
340 
341 	if (step == MPOL_REBIND_STEP1)
342 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
343 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
344 		pol->v.nodes = tmp;
345 	else
346 		BUG();
347 
348 	if (!node_isset(current->il_next, tmp)) {
349 		current->il_next = next_node_in(current->il_next, tmp);
350 		if (current->il_next >= MAX_NUMNODES)
351 			current->il_next = numa_node_id();
352 	}
353 }
354 
355 static void mpol_rebind_preferred(struct mempolicy *pol,
356 				  const nodemask_t *nodes,
357 				  enum mpol_rebind_step step)
358 {
359 	nodemask_t tmp;
360 
361 	if (pol->flags & MPOL_F_STATIC_NODES) {
362 		int node = first_node(pol->w.user_nodemask);
363 
364 		if (node_isset(node, *nodes)) {
365 			pol->v.preferred_node = node;
366 			pol->flags &= ~MPOL_F_LOCAL;
367 		} else
368 			pol->flags |= MPOL_F_LOCAL;
369 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
370 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
371 		pol->v.preferred_node = first_node(tmp);
372 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
373 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
374 						   pol->w.cpuset_mems_allowed,
375 						   *nodes);
376 		pol->w.cpuset_mems_allowed = *nodes;
377 	}
378 }
379 
380 /*
381  * mpol_rebind_policy - Migrate a policy to a different set of nodes
382  *
383  * If the read-side task has no lock to protect task->mempolicy, the
384  * write-side task will rebind task->mempolicy in two steps. The first
385  * step sets all the newly allowed nodes, and the second step clears
386  * all the disallowed nodes. This way we avoid ending up with no node
387  * to allocate pages from.
388  * If we have a lock to protect task->mempolicy on the read side, we
389  * rebind directly.
390  *
391  * step:
392  * 	MPOL_REBIND_ONCE  - do the rebind work at once
393  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
394  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
395  */
396 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
397 				enum mpol_rebind_step step)
398 {
399 	if (!pol)
400 		return;
401 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
402 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
403 		return;
404 
405 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
409 		BUG();
410 
411 	if (step == MPOL_REBIND_STEP1)
412 		pol->flags |= MPOL_F_REBINDING;
413 	else if (step == MPOL_REBIND_STEP2)
414 		pol->flags &= ~MPOL_F_REBINDING;
415 	else if (step >= MPOL_REBIND_NSTEP)
416 		BUG();
417 
418 	mpol_ops[pol->mode].rebind(pol, newmask, step);
419 }
420 
421 /*
422  * Wrapper for mpol_rebind_policy() that just requires task
423  * pointer, and updates task mempolicy.
424  *
425  * Called with task's alloc_lock held.
426  */
427 
428 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
429 			enum mpol_rebind_step step)
430 {
431 	mpol_rebind_policy(tsk->mempolicy, new, step);
432 }
433 
434 /*
435  * Rebind each vma in mm to new nodemask.
436  *
437  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
438  */
439 
440 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
441 {
442 	struct vm_area_struct *vma;
443 
444 	down_write(&mm->mmap_sem);
445 	for (vma = mm->mmap; vma; vma = vma->vm_next)
446 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
447 	up_write(&mm->mmap_sem);
448 }
449 
450 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
451 	[MPOL_DEFAULT] = {
452 		.rebind = mpol_rebind_default,
453 	},
454 	[MPOL_INTERLEAVE] = {
455 		.create = mpol_new_interleave,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 	[MPOL_PREFERRED] = {
459 		.create = mpol_new_preferred,
460 		.rebind = mpol_rebind_preferred,
461 	},
462 	[MPOL_BIND] = {
463 		.create = mpol_new_bind,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 };
467 
468 static void migrate_page_add(struct page *page, struct list_head *pagelist,
469 				unsigned long flags);
470 
471 struct queue_pages {
472 	struct list_head *pagelist;
473 	unsigned long flags;
474 	nodemask_t *nmask;
475 	struct vm_area_struct *prev;
476 };
477 
478 /*
479  * Scan through the pages, checking whether they satisfy certain conditions,
480  * and move them to the pagelist if they do.
481  */
482 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
483 			unsigned long end, struct mm_walk *walk)
484 {
485 	struct vm_area_struct *vma = walk->vma;
486 	struct page *page;
487 	struct queue_pages *qp = walk->private;
488 	unsigned long flags = qp->flags;
489 	int nid, ret;
490 	pte_t *pte;
491 	spinlock_t *ptl;
492 
493 	if (pmd_trans_huge(*pmd)) {
494 		ptl = pmd_lock(walk->mm, pmd);
495 		if (pmd_trans_huge(*pmd)) {
496 			page = pmd_page(*pmd);
497 			if (is_huge_zero_page(page)) {
498 				spin_unlock(ptl);
499 				split_huge_pmd(vma, pmd, addr);
500 			} else {
501 				get_page(page);
502 				spin_unlock(ptl);
503 				lock_page(page);
504 				ret = split_huge_page(page);
505 				unlock_page(page);
506 				put_page(page);
507 				if (ret)
508 					return 0;
509 			}
510 		} else {
511 			spin_unlock(ptl);
512 		}
513 	}
514 
515 	if (pmd_trans_unstable(pmd))
516 		return 0;
517 retry:
518 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
519 	for (; addr != end; pte++, addr += PAGE_SIZE) {
520 		if (!pte_present(*pte))
521 			continue;
522 		page = vm_normal_page(vma, addr, *pte);
523 		if (!page)
524 			continue;
525 		/*
526 		 * vm_normal_page() filters out zero pages, but there might
527 		 * still be PageReserved pages to skip, perhaps in a VDSO.
528 		 */
529 		if (PageReserved(page))
530 			continue;
531 		nid = page_to_nid(page);
532 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
533 			continue;
534 		if (PageTransCompound(page)) {
535 			get_page(page);
536 			pte_unmap_unlock(pte, ptl);
537 			lock_page(page);
538 			ret = split_huge_page(page);
539 			unlock_page(page);
540 			put_page(page);
541 			/* Failed to split -- skip. */
542 			if (ret) {
543 				pte = pte_offset_map_lock(walk->mm, pmd,
544 						addr, &ptl);
545 				continue;
546 			}
547 			goto retry;
548 		}
549 
550 		migrate_page_add(page, qp->pagelist, flags);
551 	}
552 	pte_unmap_unlock(pte - 1, ptl);
553 	cond_resched();
554 	return 0;
555 }
556 
557 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
558 			       unsigned long addr, unsigned long end,
559 			       struct mm_walk *walk)
560 {
561 #ifdef CONFIG_HUGETLB_PAGE
562 	struct queue_pages *qp = walk->private;
563 	unsigned long flags = qp->flags;
564 	int nid;
565 	struct page *page;
566 	spinlock_t *ptl;
567 	pte_t entry;
568 
569 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
570 	entry = huge_ptep_get(pte);
571 	if (!pte_present(entry))
572 		goto unlock;
573 	page = pte_page(entry);
574 	nid = page_to_nid(page);
575 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
576 		goto unlock;
577 	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
578 	if (flags & (MPOL_MF_MOVE_ALL) ||
579 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
580 		isolate_huge_page(page, qp->pagelist);
581 unlock:
582 	spin_unlock(ptl);
583 #else
584 	BUG();
585 #endif
586 	return 0;
587 }
588 
589 #ifdef CONFIG_NUMA_BALANCING
590 /*
591  * This is used to mark a range of virtual addresses as inaccessible.
592  * These are later cleared by a NUMA hinting fault. Depending on these
593  * faults, pages may be migrated for better NUMA placement.
594  *
595  * This is assuming that NUMA faults are handled using PROT_NONE. If
596  * an architecture makes a different choice, it will need further
597  * changes to the core.
598  */
599 unsigned long change_prot_numa(struct vm_area_struct *vma,
600 			unsigned long addr, unsigned long end)
601 {
602 	int nr_updated;
603 
604 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
605 	if (nr_updated)
606 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
607 
608 	return nr_updated;
609 }
610 #else
611 static unsigned long change_prot_numa(struct vm_area_struct *vma,
612 			unsigned long addr, unsigned long end)
613 {
614 	return 0;
615 }
616 #endif /* CONFIG_NUMA_BALANCING */
617 
618 static int queue_pages_test_walk(unsigned long start, unsigned long end,
619 				struct mm_walk *walk)
620 {
621 	struct vm_area_struct *vma = walk->vma;
622 	struct queue_pages *qp = walk->private;
623 	unsigned long endvma = vma->vm_end;
624 	unsigned long flags = qp->flags;
625 
626 	if (!vma_migratable(vma))
627 		return 1;
628 
629 	if (endvma > end)
630 		endvma = end;
631 	if (vma->vm_start > start)
632 		start = vma->vm_start;
633 
634 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
635 		if (!vma->vm_next && vma->vm_end < end)
636 			return -EFAULT;
637 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
638 			return -EFAULT;
639 	}
640 
641 	qp->prev = vma;
642 
643 	if (flags & MPOL_MF_LAZY) {
644 		/* Similar to task_numa_work, skip inaccessible VMAs */
645 		if (!is_vm_hugetlb_page(vma) &&
646 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
647 			!(vma->vm_flags & VM_MIXEDMAP))
648 			change_prot_numa(vma, start, endvma);
649 		return 1;
650 	}
651 
652 	/* queue pages from current vma */
653 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
654 		return 0;
655 	return 1;
656 }
657 
658 /*
659  * Walk through page tables and collect pages to be migrated.
660  *
661  * If pages found in a given range are on a set of nodes (determined by
662  * @nodes and @flags), they are isolated and queued onto the pagelist,
663  * which is passed in via @pagelist.
664  */
665 static int
666 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
667 		nodemask_t *nodes, unsigned long flags,
668 		struct list_head *pagelist)
669 {
670 	struct queue_pages qp = {
671 		.pagelist = pagelist,
672 		.flags = flags,
673 		.nmask = nodes,
674 		.prev = NULL,
675 	};
676 	struct mm_walk queue_pages_walk = {
677 		.hugetlb_entry = queue_pages_hugetlb,
678 		.pmd_entry = queue_pages_pte_range,
679 		.test_walk = queue_pages_test_walk,
680 		.mm = mm,
681 		.private = &qp,
682 	};
683 
684 	return walk_page_range(start, end, &queue_pages_walk);
685 }
686 
687 /*
688  * Apply policy to a single VMA
689  * This must be called with the mmap_sem held for writing.
690  */
691 static int vma_replace_policy(struct vm_area_struct *vma,
692 						struct mempolicy *pol)
693 {
694 	int err;
695 	struct mempolicy *old;
696 	struct mempolicy *new;
697 
698 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
699 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
700 		 vma->vm_ops, vma->vm_file,
701 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
702 
703 	new = mpol_dup(pol);
704 	if (IS_ERR(new))
705 		return PTR_ERR(new);
706 
707 	if (vma->vm_ops && vma->vm_ops->set_policy) {
708 		err = vma->vm_ops->set_policy(vma, new);
709 		if (err)
710 			goto err_out;
711 	}
712 
713 	old = vma->vm_policy;
714 	vma->vm_policy = new; /* protected by mmap_sem */
715 	mpol_put(old);
716 
717 	return 0;
718  err_out:
719 	mpol_put(new);
720 	return err;
721 }
722 
723 /* Step 2: apply policy to a range and do splits. */
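/*
 * Editor's note, an illustrative walk-through of the loop below (not an
 * authoritative statement, addresses A-D are hypothetical): if one VMA
 * spans [A, D) and the new policy is applied to the middle chunk [B, C),
 * vma_merge() is tried first; if that fails, split_vma() is called at B
 * and again at C, leaving [A, B), [B, C) and [C, D), and only the middle
 * VMA receives the new policy via vma_replace_policy().
 */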
724 static int mbind_range(struct mm_struct *mm, unsigned long start,
725 		       unsigned long end, struct mempolicy *new_pol)
726 {
727 	struct vm_area_struct *next;
728 	struct vm_area_struct *prev;
729 	struct vm_area_struct *vma;
730 	int err = 0;
731 	pgoff_t pgoff;
732 	unsigned long vmstart;
733 	unsigned long vmend;
734 
735 	vma = find_vma(mm, start);
736 	if (!vma || vma->vm_start > start)
737 		return -EFAULT;
738 
739 	prev = vma->vm_prev;
740 	if (start > vma->vm_start)
741 		prev = vma;
742 
743 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
744 		next = vma->vm_next;
745 		vmstart = max(start, vma->vm_start);
746 		vmend   = min(end, vma->vm_end);
747 
748 		if (mpol_equal(vma_policy(vma), new_pol))
749 			continue;
750 
751 		pgoff = vma->vm_pgoff +
752 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
753 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
754 				 vma->anon_vma, vma->vm_file, pgoff,
755 				 new_pol, vma->vm_userfaultfd_ctx);
756 		if (prev) {
757 			vma = prev;
758 			next = vma->vm_next;
759 			if (mpol_equal(vma_policy(vma), new_pol))
760 				continue;
761 			/* vma_merge() joined vma && vma->next, case 8 */
762 			goto replace;
763 		}
764 		if (vma->vm_start != vmstart) {
765 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
766 			if (err)
767 				goto out;
768 		}
769 		if (vma->vm_end != vmend) {
770 			err = split_vma(vma->vm_mm, vma, vmend, 0);
771 			if (err)
772 				goto out;
773 		}
774  replace:
775 		err = vma_replace_policy(vma, new_pol);
776 		if (err)
777 			goto out;
778 	}
779 
780  out:
781 	return err;
782 }
783 
784 /* Set the process memory policy */
785 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
786 			     nodemask_t *nodes)
787 {
788 	struct mempolicy *new, *old;
789 	NODEMASK_SCRATCH(scratch);
790 	int ret;
791 
792 	if (!scratch)
793 		return -ENOMEM;
794 
795 	new = mpol_new(mode, flags, nodes);
796 	if (IS_ERR(new)) {
797 		ret = PTR_ERR(new);
798 		goto out;
799 	}
800 
801 	task_lock(current);
802 	ret = mpol_set_nodemask(new, nodes, scratch);
803 	if (ret) {
804 		task_unlock(current);
805 		mpol_put(new);
806 		goto out;
807 	}
808 	old = current->mempolicy;
809 	current->mempolicy = new;
810 	if (new && new->mode == MPOL_INTERLEAVE &&
811 	    nodes_weight(new->v.nodes))
812 		current->il_next = first_node(new->v.nodes);
813 	task_unlock(current);
814 	mpol_put(old);
815 	ret = 0;
816 out:
817 	NODEMASK_SCRATCH_FREE(scratch);
818 	return ret;
819 }
820 
821 /*
822  * Return the nodemask of a policy for a get_mempolicy() query
823  *
824  * Called with task's alloc_lock held
825  */
826 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
827 {
828 	nodes_clear(*nodes);
829 	if (p == &default_policy)
830 		return;
831 
832 	switch (p->mode) {
833 	case MPOL_BIND:
834 		/* Fall through */
835 	case MPOL_INTERLEAVE:
836 		*nodes = p->v.nodes;
837 		break;
838 	case MPOL_PREFERRED:
839 		if (!(p->flags & MPOL_F_LOCAL))
840 			node_set(p->v.preferred_node, *nodes);
841 		/* else return empty node mask for local allocation */
842 		break;
843 	default:
844 		BUG();
845 	}
846 }
847 
848 static int lookup_node(unsigned long addr)
849 {
850 	struct page *p;
851 	int err;
852 
853 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
854 	if (err >= 0) {
855 		err = page_to_nid(p);
856 		put_page(p);
857 	}
858 	return err;
859 }
860 
861 /* Retrieve NUMA policy */
862 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
863 			     unsigned long addr, unsigned long flags)
864 {
865 	int err;
866 	struct mm_struct *mm = current->mm;
867 	struct vm_area_struct *vma = NULL;
868 	struct mempolicy *pol = current->mempolicy;
869 
870 	if (flags &
871 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
872 		return -EINVAL;
873 
874 	if (flags & MPOL_F_MEMS_ALLOWED) {
875 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
876 			return -EINVAL;
877 		*policy = 0;	/* just so it's initialized */
878 		task_lock(current);
879 		*nmask  = cpuset_current_mems_allowed;
880 		task_unlock(current);
881 		return 0;
882 	}
883 
884 	if (flags & MPOL_F_ADDR) {
885 		/*
886 		 * Do NOT fall back to task policy if the
887 		 * vma/shared policy at addr is NULL.  We
888 		 * want to return MPOL_DEFAULT in this case.
889 		 */
890 		down_read(&mm->mmap_sem);
891 		vma = find_vma_intersection(mm, addr, addr+1);
892 		if (!vma) {
893 			up_read(&mm->mmap_sem);
894 			return -EFAULT;
895 		}
896 		if (vma->vm_ops && vma->vm_ops->get_policy)
897 			pol = vma->vm_ops->get_policy(vma, addr);
898 		else
899 			pol = vma->vm_policy;
900 	} else if (addr)
901 		return -EINVAL;
902 
903 	if (!pol)
904 		pol = &default_policy;	/* indicates default behavior */
905 
906 	if (flags & MPOL_F_NODE) {
907 		if (flags & MPOL_F_ADDR) {
908 			err = lookup_node(addr);
909 			if (err < 0)
910 				goto out;
911 			*policy = err;
912 		} else if (pol == current->mempolicy &&
913 				pol->mode == MPOL_INTERLEAVE) {
914 			*policy = current->il_next;
915 		} else {
916 			err = -EINVAL;
917 			goto out;
918 		}
919 	} else {
920 		*policy = pol == &default_policy ? MPOL_DEFAULT :
921 						pol->mode;
922 		/*
923 		 * Internal mempolicy flags must be masked off before exposing
924 		 * the policy to userspace.
925 		 */
926 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
927 	}
928 
929 	if (vma) {
930 		up_read(&current->mm->mmap_sem);
931 		vma = NULL;
932 	}
933 
934 	err = 0;
935 	if (nmask) {
936 		if (mpol_store_user_nodemask(pol)) {
937 			*nmask = pol->w.user_nodemask;
938 		} else {
939 			task_lock(current);
940 			get_policy_nodemask(pol, nmask);
941 			task_unlock(current);
942 		}
943 	}
944 
945  out:
946 	mpol_cond_put(pol);
947 	if (vma)
948 		up_read(&current->mm->mmap_sem);
949 	return err;
950 }
951 
952 #ifdef CONFIG_MIGRATION
953 /*
954  * page migration
955  */
956 static void migrate_page_add(struct page *page, struct list_head *pagelist,
957 				unsigned long flags)
958 {
959 	/*
960 	 * Avoid migrating a page that is shared with others.
961 	 */
962 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
963 		if (!isolate_lru_page(page)) {
964 			list_add_tail(&page->lru, pagelist);
965 			inc_node_page_state(page, NR_ISOLATED_ANON +
966 					    page_is_file_cache(page));
967 		}
968 	}
969 }
970 
971 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
972 {
973 	if (PageHuge(page))
974 		return alloc_huge_page_node(page_hstate(compound_head(page)),
975 					node);
976 	else
977 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
978 						    __GFP_THISNODE, 0);
979 }
980 
981 /*
982  * Migrate pages from one node to a target node.
983  * Returns error or the number of pages not migrated.
984  */
985 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
986 			   int flags)
987 {
988 	nodemask_t nmask;
989 	LIST_HEAD(pagelist);
990 	int err = 0;
991 
992 	nodes_clear(nmask);
993 	node_set(source, nmask);
994 
995 	/*
996 	 * This does not "check" the range but isolates all pages that
997 	 * need migration.  Between passing in the full user address
998  * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
999 	 */
1000 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1001 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1002 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1003 
1004 	if (!list_empty(&pagelist)) {
1005 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1006 					MIGRATE_SYNC, MR_SYSCALL);
1007 		if (err)
1008 			putback_movable_pages(&pagelist);
1009 	}
1010 
1011 	return err;
1012 }
1013 
1014 /*
1015  * Move pages between the two nodesets so as to preserve the physical
1016  * layout as much as possible.
1017  *
1018  * Returns the number of pages that could not be moved.
1019  */
1020 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1021 		     const nodemask_t *to, int flags)
1022 {
1023 	int busy = 0;
1024 	int err;
1025 	nodemask_t tmp;
1026 
1027 	err = migrate_prep();
1028 	if (err)
1029 		return err;
1030 
1031 	down_read(&mm->mmap_sem);
1032 
1033 	/*
1034 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1035 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1036 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1037 	 * The pair of nodemasks 'to' and 'from' define the map.
1038 	 *
1039 	 * If no pair of bits is found that way, fallback to picking some
1040 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1041 	 * 'source' and 'dest' bits are the same, this represents a node
1042 	 * that will be migrating to itself, so no pages need move.
1043 	 *
1044 	 * If no bits are left in 'tmp', or if all remaining bits left
1045 	 * in 'tmp' correspond to the same bit in 'to', return false
1046 	 * (nothing left to migrate).
1047 	 *
1048 	 * This lets us pick a pair of nodes to migrate between, such that
1049 	 * if possible the dest node is not already occupied by some other
1050 	 * source node, minimizing the risk of overloading the memory on a
1051 	 * node that would happen if we migrated incoming memory to a node
1052 	 * before migrating outgoing memory sourced from that same node.
1053 	 *
1054 	 * A single scan of tmp is sufficient.  As we go, we remember the
1055 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1056 	 * that not only moved, but what's better, moved to an empty slot
1057 	 * (d is not set in tmp), then we break out with that pair.
1058 	 * Otherwise, when we finish scanning tmp, we at least have the
1059 	 * most recent <s, d> pair that moved.  If we get all the way through
1060 	 * the scan of tmp without finding any node that moved, much less
1061 	 * moved to an empty node, then there is nothing left worth migrating.
1062 	 */
1063 
1064 	tmp = *from;
1065 	while (!nodes_empty(tmp)) {
1066 		int s, d;
1067 		int source = NUMA_NO_NODE;
1068 		int dest = 0;
1069 
1070 		for_each_node_mask(s, tmp) {
1071 
1072 			/*
1073 			 * do_migrate_pages() tries to maintain the relative
1074 			 * node relationship of the pages established between
1075 			 * threads and memory areas.
1076 			 *
1077 			 * However, if the number of source nodes is not equal to
1078 			 * the number of destination nodes, we cannot preserve
1079 			 * this relative node relationship.  In that case, skip
1080 			 * copying memory from a node that is in the destination
1081 			 * mask.
1082 			 *
1083 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1084 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1085 			 */
1086 
1087 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1088 						(node_isset(s, *to)))
1089 				continue;
1090 
1091 			d = node_remap(s, *from, *to);
1092 			if (s == d)
1093 				continue;
1094 
1095 			source = s;	/* Node moved. Memorize */
1096 			dest = d;
1097 
1098 			/* dest not in remaining from nodes? */
1099 			if (!node_isset(dest, tmp))
1100 				break;
1101 		}
1102 		if (source == NUMA_NO_NODE)
1103 			break;
1104 
1105 		node_clear(source, tmp);
1106 		err = migrate_to_node(mm, source, dest, flags);
1107 		if (err > 0)
1108 			busy += err;
1109 		if (err < 0)
1110 			break;
1111 	}
1112 	up_read(&mm->mmap_sem);
1113 	if (err < 0)
1114 		return err;
1115 	return busy;
1116 
1117 }
1118 
1119 /*
1120  * Allocate a new page for page migration based on vma policy.
1121  * Start by assuming the page is mapped by the same vma that contains @start.
1122  * Search forward from there, if not.  N.B., this assumes that the
1123  * list of pages handed to migrate_pages()--which is how we get here--
1124  * is in virtual address order.
1125  */
1126 static struct page *new_page(struct page *page, unsigned long start, int **x)
1127 {
1128 	struct vm_area_struct *vma;
1129 	unsigned long uninitialized_var(address);
1130 
1131 	vma = find_vma(current->mm, start);
1132 	while (vma) {
1133 		address = page_address_in_vma(page, vma);
1134 		if (address != -EFAULT)
1135 			break;
1136 		vma = vma->vm_next;
1137 	}
1138 
1139 	if (PageHuge(page)) {
1140 		BUG_ON(!vma);
1141 		return alloc_huge_page_noerr(vma, address, 1);
1142 	}
1143 	/*
1144 	 * if !vma, alloc_page_vma() will use task or system default policy
1145 	 */
1146 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1147 }
1148 #else
1149 
1150 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1151 				unsigned long flags)
1152 {
1153 }
1154 
1155 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1156 		     const nodemask_t *to, int flags)
1157 {
1158 	return -ENOSYS;
1159 }
1160 
1161 static struct page *new_page(struct page *page, unsigned long start, int **x)
1162 {
1163 	return NULL;
1164 }
1165 #endif
1166 
1167 static long do_mbind(unsigned long start, unsigned long len,
1168 		     unsigned short mode, unsigned short mode_flags,
1169 		     nodemask_t *nmask, unsigned long flags)
1170 {
1171 	struct mm_struct *mm = current->mm;
1172 	struct mempolicy *new;
1173 	unsigned long end;
1174 	int err;
1175 	LIST_HEAD(pagelist);
1176 
1177 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1178 		return -EINVAL;
1179 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1180 		return -EPERM;
1181 
1182 	if (start & ~PAGE_MASK)
1183 		return -EINVAL;
1184 
1185 	if (mode == MPOL_DEFAULT)
1186 		flags &= ~MPOL_MF_STRICT;
1187 
1188 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1189 	end = start + len;
1190 
1191 	if (end < start)
1192 		return -EINVAL;
1193 	if (end == start)
1194 		return 0;
1195 
1196 	new = mpol_new(mode, mode_flags, nmask);
1197 	if (IS_ERR(new))
1198 		return PTR_ERR(new);
1199 
1200 	if (flags & MPOL_MF_LAZY)
1201 		new->flags |= MPOL_F_MOF;
1202 
1203 	/*
1204 	 * If we are using the default policy then operation
1205 	 * on discontinuous address spaces is okay after all
1206 	 */
1207 	if (!new)
1208 		flags |= MPOL_MF_DISCONTIG_OK;
1209 
1210 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1211 		 start, start + len, mode, mode_flags,
1212 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1213 
1214 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1215 
1216 		err = migrate_prep();
1217 		if (err)
1218 			goto mpol_out;
1219 	}
1220 	{
1221 		NODEMASK_SCRATCH(scratch);
1222 		if (scratch) {
1223 			down_write(&mm->mmap_sem);
1224 			task_lock(current);
1225 			err = mpol_set_nodemask(new, nmask, scratch);
1226 			task_unlock(current);
1227 			if (err)
1228 				up_write(&mm->mmap_sem);
1229 		} else
1230 			err = -ENOMEM;
1231 		NODEMASK_SCRATCH_FREE(scratch);
1232 	}
1233 	if (err)
1234 		goto mpol_out;
1235 
1236 	err = queue_pages_range(mm, start, end, nmask,
1237 			  flags | MPOL_MF_INVERT, &pagelist);
1238 	if (!err)
1239 		err = mbind_range(mm, start, end, new);
1240 
1241 	if (!err) {
1242 		int nr_failed = 0;
1243 
1244 		if (!list_empty(&pagelist)) {
1245 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1246 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1247 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1248 			if (nr_failed)
1249 				putback_movable_pages(&pagelist);
1250 		}
1251 
1252 		if (nr_failed && (flags & MPOL_MF_STRICT))
1253 			err = -EIO;
1254 	} else
1255 		putback_movable_pages(&pagelist);
1256 
1257 	up_write(&mm->mmap_sem);
1258  mpol_out:
1259 	mpol_put(new);
1260 	return err;
1261 }
1262 
1263 /*
1264  * User space interface with variable sized bitmaps for nodelists.
1265  */
1266 
1267 /* Copy a node mask from user space. */
1268 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1269 		     unsigned long maxnode)
1270 {
1271 	unsigned long k;
1272 	unsigned long nlongs;
1273 	unsigned long endmask;
1274 
1275 	--maxnode;
1276 	nodes_clear(*nodes);
1277 	if (maxnode == 0 || !nmask)
1278 		return 0;
1279 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1280 		return -EINVAL;
1281 
1282 	nlongs = BITS_TO_LONGS(maxnode);
1283 	if ((maxnode % BITS_PER_LONG) == 0)
1284 		endmask = ~0UL;
1285 	else
1286 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1287 
1288 	/* When the user specified more nodes than supported, just check
1289 	   that the unsupported part is all zero. */
1290 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1291 		if (nlongs > PAGE_SIZE/sizeof(long))
1292 			return -EINVAL;
1293 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1294 			unsigned long t;
1295 			if (get_user(t, nmask + k))
1296 				return -EFAULT;
1297 			if (k == nlongs - 1) {
1298 				if (t & endmask)
1299 					return -EINVAL;
1300 			} else if (t)
1301 				return -EINVAL;
1302 		}
1303 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1304 		endmask = ~0UL;
1305 	}
1306 
1307 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1308 		return -EFAULT;
1309 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1310 	return 0;
1311 }
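/*
 * Editor's note, a worked example of the decoding above (illustrative only,
 * a 64-bit kernel is assumed): a caller passing maxnode == 33 describes bits
 * 0..31, so after --maxnode we have nlongs == 1 and
 * endmask == 0x00000000ffffffff; one long is copied from user space and any
 * bits at or above bit 32 are masked off.  Bits the caller set beyond
 * MAX_NUMNODES must all be zero, otherwise -EINVAL is returned.
 */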
1312 
1313 /* Copy a kernel node mask to user space */
1314 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1315 			      nodemask_t *nodes)
1316 {
1317 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1318 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1319 
1320 	if (copy > nbytes) {
1321 		if (copy > PAGE_SIZE)
1322 			return -EINVAL;
1323 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1324 			return -EFAULT;
1325 		copy = nbytes;
1326 	}
1327 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1328 }
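/*
 * Editor's note, a worked example of the sizing above (illustrative only,
 * assuming a 64-bit kernel built with MAX_NUMNODES == 64): a caller passing
 * maxnode == 1024 asks for ALIGN(1023, 64) / 8 == 128 bytes, while the
 * kernel mask is only nbytes == 8 bytes, so bytes 8..127 of the user buffer
 * are cleared and only the first 8 bytes are copied from the nodemask.
 */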
1329 
1330 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1331 		unsigned long, mode, const unsigned long __user *, nmask,
1332 		unsigned long, maxnode, unsigned, flags)
1333 {
1334 	nodemask_t nodes;
1335 	int err;
1336 	unsigned short mode_flags;
1337 
1338 	mode_flags = mode & MPOL_MODE_FLAGS;
1339 	mode &= ~MPOL_MODE_FLAGS;
1340 	if (mode >= MPOL_MAX)
1341 		return -EINVAL;
1342 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1343 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1344 		return -EINVAL;
1345 	err = get_nodes(&nodes, nmask, maxnode);
1346 	if (err)
1347 		return err;
1348 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1349 }
1350 
1351 /* Set the process memory policy */
1352 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1353 		unsigned long, maxnode)
1354 {
1355 	int err;
1356 	nodemask_t nodes;
1357 	unsigned short flags;
1358 
1359 	flags = mode & MPOL_MODE_FLAGS;
1360 	mode &= ~MPOL_MODE_FLAGS;
1361 	if ((unsigned int)mode >= MPOL_MAX)
1362 		return -EINVAL;
1363 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1364 		return -EINVAL;
1365 	err = get_nodes(&nodes, nmask, maxnode);
1366 	if (err)
1367 		return err;
1368 	return do_set_mempolicy(mode, flags, &nodes);
1369 }
1370 
1371 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1372 		const unsigned long __user *, old_nodes,
1373 		const unsigned long __user *, new_nodes)
1374 {
1375 	const struct cred *cred = current_cred(), *tcred;
1376 	struct mm_struct *mm = NULL;
1377 	struct task_struct *task;
1378 	nodemask_t task_nodes;
1379 	int err;
1380 	nodemask_t *old;
1381 	nodemask_t *new;
1382 	NODEMASK_SCRATCH(scratch);
1383 
1384 	if (!scratch)
1385 		return -ENOMEM;
1386 
1387 	old = &scratch->mask1;
1388 	new = &scratch->mask2;
1389 
1390 	err = get_nodes(old, old_nodes, maxnode);
1391 	if (err)
1392 		goto out;
1393 
1394 	err = get_nodes(new, new_nodes, maxnode);
1395 	if (err)
1396 		goto out;
1397 
1398 	/* Find the mm_struct */
1399 	rcu_read_lock();
1400 	task = pid ? find_task_by_vpid(pid) : current;
1401 	if (!task) {
1402 		rcu_read_unlock();
1403 		err = -ESRCH;
1404 		goto out;
1405 	}
1406 	get_task_struct(task);
1407 
1408 	err = -EINVAL;
1409 
1410 	/*
1411 	 * Check if this process has the right to modify the specified
1412 	 * process. The right exists if the process has administrative
1413 	 * capabilities, superuser privileges or the same
1414 	 * userid as the target process.
1415 	 */
1416 	tcred = __task_cred(task);
1417 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1418 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1419 	    !capable(CAP_SYS_NICE)) {
1420 		rcu_read_unlock();
1421 		err = -EPERM;
1422 		goto out_put;
1423 	}
1424 	rcu_read_unlock();
1425 
1426 	task_nodes = cpuset_mems_allowed(task);
1427 	/* Is the user allowed to access the target nodes? */
1428 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1429 		err = -EPERM;
1430 		goto out_put;
1431 	}
1432 
1433 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1434 		err = -EINVAL;
1435 		goto out_put;
1436 	}
1437 
1438 	err = security_task_movememory(task);
1439 	if (err)
1440 		goto out_put;
1441 
1442 	mm = get_task_mm(task);
1443 	put_task_struct(task);
1444 
1445 	if (!mm) {
1446 		err = -EINVAL;
1447 		goto out;
1448 	}
1449 
1450 	err = do_migrate_pages(mm, old, new,
1451 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1452 
1453 	mmput(mm);
1454 out:
1455 	NODEMASK_SCRATCH_FREE(scratch);
1456 
1457 	return err;
1458 
1459 out_put:
1460 	put_task_struct(task);
1461 	goto out;
1462 
1463 }
1464 
1465 
1466 /* Retrieve NUMA policy */
1467 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1468 		unsigned long __user *, nmask, unsigned long, maxnode,
1469 		unsigned long, addr, unsigned long, flags)
1470 {
1471 	int err;
1472 	int uninitialized_var(pval);
1473 	nodemask_t nodes;
1474 
1475 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1476 		return -EINVAL;
1477 
1478 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1479 
1480 	if (err)
1481 		return err;
1482 
1483 	if (policy && put_user(pval, policy))
1484 		return -EFAULT;
1485 
1486 	if (nmask)
1487 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1488 
1489 	return err;
1490 }
1491 
1492 #ifdef CONFIG_COMPAT
1493 
1494 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1495 		       compat_ulong_t __user *, nmask,
1496 		       compat_ulong_t, maxnode,
1497 		       compat_ulong_t, addr, compat_ulong_t, flags)
1498 {
1499 	long err;
1500 	unsigned long __user *nm = NULL;
1501 	unsigned long nr_bits, alloc_size;
1502 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1503 
1504 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1505 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1506 
1507 	if (nmask)
1508 		nm = compat_alloc_user_space(alloc_size);
1509 
1510 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1511 
1512 	if (!err && nmask) {
1513 		unsigned long copy_size;
1514 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1515 		err = copy_from_user(bm, nm, copy_size);
1516 		/* ensure entire bitmap is zeroed */
1517 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1518 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1519 	}
1520 
1521 	return err;
1522 }
1523 
1524 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1525 		       compat_ulong_t, maxnode)
1526 {
1527 	long err = 0;
1528 	unsigned long __user *nm = NULL;
1529 	unsigned long nr_bits, alloc_size;
1530 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1531 
1532 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1533 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1534 
1535 	if (nmask) {
1536 		err = compat_get_bitmap(bm, nmask, nr_bits);
1537 		nm = compat_alloc_user_space(alloc_size);
1538 		err |= copy_to_user(nm, bm, alloc_size);
1539 	}
1540 
1541 	if (err)
1542 		return -EFAULT;
1543 
1544 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1545 }
1546 
1547 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1548 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1549 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1550 {
1551 	long err = 0;
1552 	unsigned long __user *nm = NULL;
1553 	unsigned long nr_bits, alloc_size;
1554 	nodemask_t bm;
1555 
1556 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1557 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1558 
1559 	if (nmask) {
1560 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1561 		nm = compat_alloc_user_space(alloc_size);
1562 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1563 	}
1564 
1565 	if (err)
1566 		return -EFAULT;
1567 
1568 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1569 }
1570 
1571 #endif
1572 
1573 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1574 						unsigned long addr)
1575 {
1576 	struct mempolicy *pol = NULL;
1577 
1578 	if (vma) {
1579 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1580 			pol = vma->vm_ops->get_policy(vma, addr);
1581 		} else if (vma->vm_policy) {
1582 			pol = vma->vm_policy;
1583 
1584 			/*
1585 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1586 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1587 			 * count on these policies which will be dropped by
1588 			 * mpol_cond_put() later
1589 			 */
1590 			if (mpol_needs_cond_ref(pol))
1591 				mpol_get(pol);
1592 		}
1593 	}
1594 
1595 	return pol;
1596 }
1597 
1598 /*
1599  * get_vma_policy(@vma, @addr)
1600  * @vma: virtual memory area whose policy is sought
1601  * @addr: address in @vma for shared policy lookup
1602  *
1603  * Returns effective policy for a VMA at specified address.
1604  * Falls back to current->mempolicy or system default policy, as necessary.
1605  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1606  * count--added by the get_policy() vm_op, as appropriate--to protect against
1607  * freeing by another task.  It is the caller's responsibility to free the
1608  * extra reference for shared policies.
1609  */
1610 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1611 						unsigned long addr)
1612 {
1613 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1614 
1615 	if (!pol)
1616 		pol = get_task_policy(current);
1617 
1618 	return pol;
1619 }
1620 
1621 bool vma_policy_mof(struct vm_area_struct *vma)
1622 {
1623 	struct mempolicy *pol;
1624 
1625 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1626 		bool ret = false;
1627 
1628 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1629 		if (pol && (pol->flags & MPOL_F_MOF))
1630 			ret = true;
1631 		mpol_cond_put(pol);
1632 
1633 		return ret;
1634 	}
1635 
1636 	pol = vma->vm_policy;
1637 	if (!pol)
1638 		pol = get_task_policy(current);
1639 
1640 	return pol->flags & MPOL_F_MOF;
1641 }
1642 
1643 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1644 {
1645 	enum zone_type dynamic_policy_zone = policy_zone;
1646 
1647 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1648 
1649 	/*
1650 	 * If policy->v.nodes has movable memory only,
1651 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1652 	 *
1653 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1654 	 * so if the following test fails, it implies that
1655 	 * policy->v.nodes has movable memory only.
1656 	 */
1657 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1658 		dynamic_policy_zone = ZONE_MOVABLE;
1659 
1660 	return zone >= dynamic_policy_zone;
1661 }
1662 
1663 /*
1664  * Return a nodemask representing a mempolicy for filtering nodes for
1665  * page allocation
1666  */
1667 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1668 {
1669 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1670 	if (unlikely(policy->mode == MPOL_BIND) &&
1671 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1672 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1673 		return &policy->v.nodes;
1674 
1675 	return NULL;
1676 }
1677 
1678 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1679 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1680 	int nd)
1681 {
1682 	switch (policy->mode) {
1683 	case MPOL_PREFERRED:
1684 		if (!(policy->flags & MPOL_F_LOCAL))
1685 			nd = policy->v.preferred_node;
1686 		break;
1687 	case MPOL_BIND:
1688 		/*
1689 		 * Normally, MPOL_BIND allocations are node-local within the
1690 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1691 		 * current node isn't part of the mask, we use the zonelist for
1692 		 * the first node in the mask instead.
1693 		 */
1694 		if (unlikely(gfp & __GFP_THISNODE) &&
1695 				unlikely(!node_isset(nd, policy->v.nodes)))
1696 			nd = first_node(policy->v.nodes);
1697 		break;
1698 	default:
1699 		BUG();
1700 	}
1701 	return node_zonelist(nd, gfp);
1702 }
1703 
1704 /* Do dynamic interleaving for a process */
1705 static unsigned interleave_nodes(struct mempolicy *policy)
1706 {
1707 	unsigned nid, next;
1708 	struct task_struct *me = current;
1709 
1710 	nid = me->il_next;
1711 	next = next_node_in(nid, policy->v.nodes);
1712 	if (next < MAX_NUMNODES)
1713 		me->il_next = next;
1714 	return nid;
1715 }
1716 
1717 /*
1718  * Depending on the memory policy, provide a node from which to allocate the
1719  * next slab entry.
1720  */
1721 unsigned int mempolicy_slab_node(void)
1722 {
1723 	struct mempolicy *policy;
1724 	int node = numa_mem_id();
1725 
1726 	if (in_interrupt())
1727 		return node;
1728 
1729 	policy = current->mempolicy;
1730 	if (!policy || policy->flags & MPOL_F_LOCAL)
1731 		return node;
1732 
1733 	switch (policy->mode) {
1734 	case MPOL_PREFERRED:
1735 		/*
1736 		 * handled MPOL_F_LOCAL above
1737 		 */
1738 		return policy->v.preferred_node;
1739 
1740 	case MPOL_INTERLEAVE:
1741 		return interleave_nodes(policy);
1742 
1743 	case MPOL_BIND: {
1744 		struct zoneref *z;
1745 
1746 		/*
1747 		 * Follow bind policy behavior and start allocation at the
1748 		 * first node.
1749 		 */
1750 		struct zonelist *zonelist;
1751 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1752 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1753 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1754 							&policy->v.nodes);
1755 		return z->zone ? z->zone->node : node;
1756 	}
1757 
1758 	default:
1759 		BUG();
1760 	}
1761 }
1762 
1763 /*
1764  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1765  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1766  * number of present nodes.
1767  */
1768 static unsigned offset_il_node(struct mempolicy *pol,
1769 			       struct vm_area_struct *vma, unsigned long n)
1770 {
1771 	unsigned nnodes = nodes_weight(pol->v.nodes);
1772 	unsigned target;
1773 	int i;
1774 	int nid;
1775 
1776 	if (!nnodes)
1777 		return numa_node_id();
1778 	target = (unsigned int)n % nnodes;
1779 	nid = first_node(pol->v.nodes);
1780 	for (i = 0; i < target; i++)
1781 		nid = next_node(nid, pol->v.nodes);
1782 	return nid;
1783 }
1784 
1785 /* Determine a node number for interleave */
1786 static inline unsigned interleave_nid(struct mempolicy *pol,
1787 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1788 {
1789 	if (vma) {
1790 		unsigned long off;
1791 
1792 		/*
1793 		 * for small pages, there is no difference between
1794 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1795 		 * for huge pages, since vm_pgoff is in units of small
1796 		 * pages, we need to shift off the always 0 bits to get
1797 		 * a useful offset.
1798 		 */
1799 		BUG_ON(shift < PAGE_SHIFT);
1800 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1801 		off += (addr - vma->vm_start) >> shift;
1802 		return offset_il_node(pol, vma, off);
1803 	} else
1804 		return interleave_nodes(pol);
1805 }
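/*
 * Editor's note, a worked example of the interleave offset math above
 * (illustrative numbers only, PAGE_SHIFT == 12 assumed): for a hugetlb VMA
 * backed by 2MB pages (shift == 21), a fault at vm_start + 4MB with
 * vm_pgoff == 0 gives off = (0 >> 9) + (4MB >> 21) = 2; with an interleave
 * mask of {0,1,3} (nnodes == 3), offset_il_node() starts at the first node
 * and steps forward (2 % 3) == 2 times, landing on node 3.
 */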
1806 
1807 #ifdef CONFIG_HUGETLBFS
1808 /*
1809  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1810  * @vma: virtual memory area whose policy is sought
1811  * @addr: address in @vma for shared policy lookup and interleave policy
1812  * @gfp_flags: for requested zone
1813  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1814  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1815  *
1816  * Returns a zonelist suitable for a huge page allocation and a pointer
1817  * to the struct mempolicy for conditional unref after allocation.
1818  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1819  * @nodemask for filtering the zonelist.
1820  *
1821  * Must be protected by read_mems_allowed_begin()
1822  */
1823 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1824 				gfp_t gfp_flags, struct mempolicy **mpol,
1825 				nodemask_t **nodemask)
1826 {
1827 	struct zonelist *zl;
1828 
1829 	*mpol = get_vma_policy(vma, addr);
1830 	*nodemask = NULL;	/* assume !MPOL_BIND */
1831 
1832 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1833 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1834 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1835 	} else {
1836 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1837 		if ((*mpol)->mode == MPOL_BIND)
1838 			*nodemask = &(*mpol)->v.nodes;
1839 	}
1840 	return zl;
1841 }
1842 
1843 /*
1844  * init_nodemask_of_mempolicy
1845  *
1846  * If the current task's mempolicy is "default" [NULL], return 'false'
1847  * to indicate default policy.  Otherwise, extract the policy nodemask
1848  * for 'bind' or 'interleave' policy into the argument nodemask, or
1849  * initialize the argument nodemask to contain the single node for
1850  * 'preferred' or 'local' policy and return 'true' to indicate presence
1851  * of non-default mempolicy.
1852  *
1853  * We don't bother with reference counting the mempolicy [mpol_get/put]
1854  * because the current task is examining its own mempolicy and a task's
1855  * mempolicy is only ever changed by the task itself.
1856  *
1857  * N.B., it is the caller's responsibility to free a returned nodemask.
1858  */
1859 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1860 {
1861 	struct mempolicy *mempolicy;
1862 	int nid;
1863 
1864 	if (!(mask && current->mempolicy))
1865 		return false;
1866 
1867 	task_lock(current);
1868 	mempolicy = current->mempolicy;
1869 	switch (mempolicy->mode) {
1870 	case MPOL_PREFERRED:
1871 		if (mempolicy->flags & MPOL_F_LOCAL)
1872 			nid = numa_node_id();
1873 		else
1874 			nid = mempolicy->v.preferred_node;
1875 		init_nodemask_of_node(mask, nid);
1876 		break;
1877 
1878 	case MPOL_BIND:
1879 		/* Fall through */
1880 	case MPOL_INTERLEAVE:
1881 		*mask = mempolicy->v.nodes;
1882 		break;
1883 
1884 	default:
1885 		BUG();
1886 	}
1887 	task_unlock(current);
1888 
1889 	return true;
1890 }
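
/*
 * Usage sketch (an assumption modeled on the hugetlb sysfs/sysctl code that
 * resizes the huge page pool over the allowed nodes; the real caller also
 * checks an "obey mempolicy" flag, omitted here):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 */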
1891 #endif
1892 
1893 /*
1894  * mempolicy_nodemask_intersects
1895  *
1896  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1897  * policy.  Otherwise, check for intersection between mask and the policy
1898  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1899  * policy, always return true since it may allocate elsewhere on fallback.
1900  *
1901  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1902  */
1903 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1904 					const nodemask_t *mask)
1905 {
1906 	struct mempolicy *mempolicy;
1907 	bool ret = true;
1908 
1909 	if (!mask)
1910 		return ret;
1911 	task_lock(tsk);
1912 	mempolicy = tsk->mempolicy;
1913 	if (!mempolicy)
1914 		goto out;
1915 
1916 	switch (mempolicy->mode) {
1917 	case MPOL_PREFERRED:
1918 		/*
1919 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1920 		 * allocate from; the task may fall back to other nodes under OOM.
1921 		 * Thus, it's possible for tsk to have allocated memory from
1922 		 * nodes in mask.
1923 		 */
1924 		break;
1925 	case MPOL_BIND:
1926 	case MPOL_INTERLEAVE:
1927 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1928 		break;
1929 	default:
1930 		BUG();
1931 	}
1932 out:
1933 	task_unlock(tsk);
1934 	return ret;
1935 }
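
/*
 * Usage sketch (an assumption modeled on the OOM killer's eligibility check;
 * "oom_nodemask" is a placeholder for the constrained allocation nodemask):
 *
 *	if (!mempolicy_nodemask_intersects(task, oom_nodemask))
 *		continue;	(killing task cannot free memory on these nodes)
 */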
1936 
1937 /* Allocate a page in interleaved policy.
1938    Own path because it needs to do special accounting. */
1939 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1940 					unsigned nid)
1941 {
1942 	struct zonelist *zl;
1943 	struct page *page;
1944 
1945 	zl = node_zonelist(nid, gfp);
1946 	page = __alloc_pages(gfp, order, zl);
1947 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1948 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1949 	return page;
1950 }
1951 
1952 /**
1953  * 	alloc_pages_vma	- Allocate a page for a VMA.
1954  *
1955  * 	@gfp:
1956  *      %GFP_USER    user allocation.
1957  *      %GFP_KERNEL  kernel allocations,
1958  *      %GFP_HIGHMEM highmem/user allocations,
1959  *      %GFP_FS      allocation should not call back into a file system.
1960  *      %GFP_ATOMIC  don't sleep.
1961  *
1962 	 *	@order: Order of the GFP allocation.
1963  * 	@vma:  Pointer to VMA or NULL if not available.
1964  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1965  *	@node: Which node to prefer for allocation (modulo policy).
1966  *	@hugepage: for hugepages try only the preferred node if possible
1967  *
1968  * 	This function allocates a page from the kernel page pool and applies
1969  *	a NUMA policy associated with the VMA or the current process.
1970  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1971  *	mm_struct of the VMA to prevent it from going away. Should be used for
1972  *	all allocations for pages that will be mapped into user space. Returns
1973  *	NULL when no page can be allocated.
1974  */
1975 struct page *
1976 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1977 		unsigned long addr, int node, bool hugepage)
1978 {
1979 	struct mempolicy *pol;
1980 	struct page *page;
1981 	unsigned int cpuset_mems_cookie;
1982 	struct zonelist *zl;
1983 	nodemask_t *nmask;
1984 
1985 retry_cpuset:
1986 	pol = get_vma_policy(vma, addr);
1987 	cpuset_mems_cookie = read_mems_allowed_begin();
1988 
1989 	if (pol->mode == MPOL_INTERLEAVE) {
1990 		unsigned nid;
1991 
1992 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1993 		mpol_cond_put(pol);
1994 		page = alloc_page_interleave(gfp, order, nid);
1995 		goto out;
1996 	}
1997 
1998 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1999 		int hpage_node = node;
2000 
2001 		/*
2002 		 * For hugepage allocation and non-interleave policy which
2003 		 * allows the current node (or other explicitly preferred
2004 		 * node) we only try to allocate from the current/preferred
2005 		 * node and don't fall back to other nodes, as the cost of
2006 		 * remote accesses would likely offset THP benefits.
2007 		 *
2008 		 * If the policy is interleave, or does not allow the current
2009 		 * node in its nodemask, we allocate the standard way.
2010 		 */
2011 		if (pol->mode == MPOL_PREFERRED &&
2012 						!(pol->flags & MPOL_F_LOCAL))
2013 			hpage_node = pol->v.preferred_node;
2014 
2015 		nmask = policy_nodemask(gfp, pol);
2016 		if (!nmask || node_isset(hpage_node, *nmask)) {
2017 			mpol_cond_put(pol);
2018 			page = __alloc_pages_node(hpage_node,
2019 						gfp | __GFP_THISNODE, order);
2020 			goto out;
2021 		}
2022 	}
2023 
2024 	nmask = policy_nodemask(gfp, pol);
2025 	zl = policy_zonelist(gfp, pol, node);
2026 	mpol_cond_put(pol);
2027 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2028 out:
2029 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2030 		goto retry_cpuset;
2031 	return page;
2032 }
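
/*
 * Most callers reach this through the alloc_page_vma() and
 * alloc_page_vma_node() wrappers in <linux/gfp.h>; for an order-0, non-THP
 * user page this is typically:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * which expands to alloc_pages_vma(gfp, 0, vma, address, numa_node_id(),
 * false).
 */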
2033 
2034 /**
2035  * 	alloc_pages_current - Allocate pages.
2036  *
2037  *	@gfp:
2038  *		%GFP_USER   user allocation,
2039  *      	%GFP_KERNEL kernel allocation,
2040  *      	%GFP_HIGHMEM highmem allocation,
2041  *      	%GFP_FS     don't call back into a file system.
2042  *      	%GFP_ATOMIC don't sleep.
2043 	 *	@order: Order of the allocation (log2 of the size in pages); 0 is a single page.
2044  *
2045  *	Allocate a page from the kernel page pool.  When not in
2046 	 *	interrupt context, apply the current process' NUMA policy.
2047  *	Returns NULL when no page can be allocated.
2048  *
2049  *	Don't call cpuset_update_task_memory_state() unless
2050  *	1) it's ok to take cpuset_sem (can WAIT), and
2051  *	2) allocating for current task (not interrupt).
2052  */
2053 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2054 {
2055 	struct mempolicy *pol = &default_policy;
2056 	struct page *page;
2057 	unsigned int cpuset_mems_cookie;
2058 
2059 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2060 		pol = get_task_policy(current);
2061 
2062 retry_cpuset:
2063 	cpuset_mems_cookie = read_mems_allowed_begin();
2064 
2065 	/*
2066 	 * No reference counting needed for current->mempolicy
2067 	 * nor system default_policy
2068 	 */
2069 	if (pol->mode == MPOL_INTERLEAVE)
2070 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2071 	else
2072 		page = __alloc_pages_nodemask(gfp, order,
2073 				policy_zonelist(gfp, pol, numa_node_id()),
2074 				policy_nodemask(gfp, pol));
2075 
2076 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2077 		goto retry_cpuset;
2078 
2079 	return page;
2080 }
2081 EXPORT_SYMBOL(alloc_pages_current);
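
/*
 * On CONFIG_NUMA kernels the generic alloc_pages()/alloc_page() helpers in
 * <linux/gfp.h> resolve to alloc_pages_current(), so e.g.
 *
 *	page = alloc_pages(GFP_KERNEL, 0);
 *
 * honours the task policy installed via set_mempolicy(2) unless the caller
 * passed __GFP_THISNODE or is running in interrupt context.
 */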
2082 
2083 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2084 {
2085 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2086 
2087 	if (IS_ERR(pol))
2088 		return PTR_ERR(pol);
2089 	dst->vm_policy = pol;
2090 	return 0;
2091 }
2092 
2093 /*
2094  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2095  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2096  * with the mems_allowed returned by cpuset_mems_allowed().  This
2097  * keeps mempolicies cpuset-relative after their cpuset moves.  See
2098  * further kernel/cpuset.c update_nodemask().
2099  *
2100  * current's mempolicy may be rebound by another task (the task that changes
2101  * the cpuset's mems), so we need not do the rebind work for current.
2102  */
2103 
2104 /* Slow path of a mempolicy duplicate */
2105 struct mempolicy *__mpol_dup(struct mempolicy *old)
2106 {
2107 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2108 
2109 	if (!new)
2110 		return ERR_PTR(-ENOMEM);
2111 
2112 	/* task's mempolicy is protected by alloc_lock */
2113 	if (old == current->mempolicy) {
2114 		task_lock(current);
2115 		*new = *old;
2116 		task_unlock(current);
2117 	} else
2118 		*new = *old;
2119 
2120 	if (current_cpuset_is_being_rebound()) {
2121 		nodemask_t mems = cpuset_mems_allowed(current);
2122 		if (new->flags & MPOL_F_REBINDING)
2123 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2124 		else
2125 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2126 	}
2127 	atomic_set(&new->refcnt, 1);
2128 	return new;
2129 }
2130 
2131 /* Slow path of a mempolicy comparison */
2132 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2133 {
2134 	if (!a || !b)
2135 		return false;
2136 	if (a->mode != b->mode)
2137 		return false;
2138 	if (a->flags != b->flags)
2139 		return false;
2140 	if (mpol_store_user_nodemask(a))
2141 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2142 			return false;
2143 
2144 	switch (a->mode) {
2145 	case MPOL_BIND:
2146 		/* Fall through */
2147 	case MPOL_INTERLEAVE:
2148 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2149 	case MPOL_PREFERRED:
2150 		return a->v.preferred_node == b->v.preferred_node;
2151 	default:
2152 		BUG();
2153 		return false;
2154 	}
2155 }
2156 
2157 /*
2158  * Shared memory backing store policy support.
2159  *
2160  * Remember policies even when nobody has shared memory mapped.
2161  * The policies are kept in Red-Black tree linked from the inode.
2162  * They are protected by the sp->lock rwlock, which should be held
2163  * for any accesses to the tree.
2164  */
2165 
2166 /*
2167  * lookup first element intersecting start-end.  Caller holds sp->lock for
2168  * reading or for writing
2169  */
2170 static struct sp_node *
2171 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2172 {
2173 	struct rb_node *n = sp->root.rb_node;
2174 
2175 	while (n) {
2176 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2177 
2178 		if (start >= p->end)
2179 			n = n->rb_right;
2180 		else if (end <= p->start)
2181 			n = n->rb_left;
2182 		else
2183 			break;
2184 	}
2185 	if (!n)
2186 		return NULL;
2187 	for (;;) {
2188 		struct sp_node *w = NULL;
2189 		struct rb_node *prev = rb_prev(n);
2190 		if (!prev)
2191 			break;
2192 		w = rb_entry(prev, struct sp_node, nd);
2193 		if (w->end <= start)
2194 			break;
2195 		n = prev;
2196 	}
2197 	return rb_entry(n, struct sp_node, nd);
2198 }
2199 
2200 /*
2201  * Insert a new shared policy into the list.  Caller holds sp->lock for
2202  * writing.
2203  */
2204 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2205 {
2206 	struct rb_node **p = &sp->root.rb_node;
2207 	struct rb_node *parent = NULL;
2208 	struct sp_node *nd;
2209 
2210 	while (*p) {
2211 		parent = *p;
2212 		nd = rb_entry(parent, struct sp_node, nd);
2213 		if (new->start < nd->start)
2214 			p = &(*p)->rb_left;
2215 		else if (new->end > nd->end)
2216 			p = &(*p)->rb_right;
2217 		else
2218 			BUG();
2219 	}
2220 	rb_link_node(&new->nd, parent, p);
2221 	rb_insert_color(&new->nd, &sp->root);
2222 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2223 		 new->policy ? new->policy->mode : 0);
2224 }
2225 
2226 /* Find shared policy intersecting idx */
2227 struct mempolicy *
2228 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2229 {
2230 	struct mempolicy *pol = NULL;
2231 	struct sp_node *sn;
2232 
2233 	if (!sp->root.rb_node)
2234 		return NULL;
2235 	read_lock(&sp->lock);
2236 	sn = sp_lookup(sp, idx, idx+1);
2237 	if (sn) {
2238 		mpol_get(sn->policy);
2239 		pol = sn->policy;
2240 	}
2241 	read_unlock(&sp->lock);
2242 	return pol;
2243 }
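
/*
 * Example: mbind()ing file pages 4-7 of a shared tmpfs mapping ends up,
 * via the vma's set_policy op, inserting an sp_node with start = 4 and
 * end = 8; ranges are half-open and in units of file page indices.  A later
 * fault at index 5 calls mpol_shared_policy_lookup(sp, 5), which finds that
 * node through sp_lookup(sp, 5, 6) and returns its policy with a reference
 * held.
 */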
2244 
2245 static void sp_free(struct sp_node *n)
2246 {
2247 	mpol_put(n->policy);
2248 	kmem_cache_free(sn_cache, n);
2249 }
2250 
2251 /**
2252  * mpol_misplaced - check whether current page node is valid in policy
2253  *
2254  * @page: page to be checked
2255  * @vma: vm area where page mapped
2256  * @addr: virtual address where page mapped
2257  *
2258  * Lookup current policy node id for vma,addr and "compare to" page's
2259  * node id.
2260  *
2261  * Returns:
2262  *	-1	- not misplaced, page is in the right node
2263  *	node	- node id where the page should be
2264  *
2265  * Policy determination "mimics" alloc_page_vma().
2266  * Called from fault path where we know the vma and faulting address.
2267  */
2268 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2269 {
2270 	struct mempolicy *pol;
2271 	struct zoneref *z;
2272 	int curnid = page_to_nid(page);
2273 	unsigned long pgoff;
2274 	int thiscpu = raw_smp_processor_id();
2275 	int thisnid = cpu_to_node(thiscpu);
2276 	int polnid = -1;
2277 	int ret = -1;
2278 
2279 	BUG_ON(!vma);
2280 
2281 	pol = get_vma_policy(vma, addr);
2282 	if (!(pol->flags & MPOL_F_MOF))
2283 		goto out;
2284 
2285 	switch (pol->mode) {
2286 	case MPOL_INTERLEAVE:
2287 		BUG_ON(addr >= vma->vm_end);
2288 		BUG_ON(addr < vma->vm_start);
2289 
2290 		pgoff = vma->vm_pgoff;
2291 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2292 		polnid = offset_il_node(pol, vma, pgoff);
2293 		break;
2294 
2295 	case MPOL_PREFERRED:
2296 		if (pol->flags & MPOL_F_LOCAL)
2297 			polnid = numa_node_id();
2298 		else
2299 			polnid = pol->v.preferred_node;
2300 		break;
2301 
2302 	case MPOL_BIND:
2303 
2304 		/*
2305 		 * MPOL_BIND allows binding to multiple nodes.  Use the current
2306 		 * page's node if it is in the policy nodemask, else select the
2307 		 * nearest allowed node, if any.  If no node is allowed, keep the
2308 		 * current node [!misplaced].
2309 		 */
2310 		if (node_isset(curnid, pol->v.nodes))
2311 			goto out;
2312 		z = first_zones_zonelist(
2313 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2314 				gfp_zone(GFP_HIGHUSER),
2315 				&pol->v.nodes);
2316 		polnid = z->zone->node;
2317 		break;
2318 
2319 	default:
2320 		BUG();
2321 	}
2322 
2323 	/* Migrate the page towards the node whose CPU is referencing it */
2324 	if (pol->flags & MPOL_F_MORON) {
2325 		polnid = thisnid;
2326 
2327 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2328 			goto out;
2329 	}
2330 
2331 	if (curnid != polnid)
2332 		ret = polnid;
2333 out:
2334 	mpol_cond_put(pol);
2335 
2336 	return ret;
2337 }
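
/*
 * Usage sketch (an assumption modeled on the NUMA hinting fault path in
 * mm/memory.c, heavily simplified):
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		return;		(page is already on an acceptable node)
 *	migrate_misplaced_page(page, vma, target_nid);
 */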
2338 
2339 /*
2340  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2341  * dropped after task->mempolicy is set to NULL so that any allocation done as
2342  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2343  * policy.
2344  */
2345 void mpol_put_task_policy(struct task_struct *task)
2346 {
2347 	struct mempolicy *pol;
2348 
2349 	task_lock(task);
2350 	pol = task->mempolicy;
2351 	task->mempolicy = NULL;
2352 	task_unlock(task);
2353 	mpol_put(pol);
2354 }
2355 
2356 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2357 {
2358 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2359 	rb_erase(&n->nd, &sp->root);
2360 	sp_free(n);
2361 }
2362 
2363 static void sp_node_init(struct sp_node *node, unsigned long start,
2364 			unsigned long end, struct mempolicy *pol)
2365 {
2366 	node->start = start;
2367 	node->end = end;
2368 	node->policy = pol;
2369 }
2370 
2371 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2372 				struct mempolicy *pol)
2373 {
2374 	struct sp_node *n;
2375 	struct mempolicy *newpol;
2376 
2377 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2378 	if (!n)
2379 		return NULL;
2380 
2381 	newpol = mpol_dup(pol);
2382 	if (IS_ERR(newpol)) {
2383 		kmem_cache_free(sn_cache, n);
2384 		return NULL;
2385 	}
2386 	newpol->flags |= MPOL_F_SHARED;
2387 	sp_node_init(n, start, end, newpol);
2388 
2389 	return n;
2390 }
2391 
2392 /* Replace a policy range. */
2393 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2394 				 unsigned long end, struct sp_node *new)
2395 {
2396 	struct sp_node *n;
2397 	struct sp_node *n_new = NULL;
2398 	struct mempolicy *mpol_new = NULL;
2399 	int ret = 0;
2400 
2401 restart:
2402 	write_lock(&sp->lock);
2403 	n = sp_lookup(sp, start, end);
2404 	/* Take care of old policies in the same range. */
2405 	while (n && n->start < end) {
2406 		struct rb_node *next = rb_next(&n->nd);
2407 		if (n->start >= start) {
2408 			if (n->end <= end)
2409 				sp_delete(sp, n);
2410 			else
2411 				n->start = end;
2412 		} else {
2413 			/* Old policy spanning whole new range. */
2414 			if (n->end > end) {
2415 				if (!n_new)
2416 					goto alloc_new;
2417 
2418 				*mpol_new = *n->policy;
2419 				atomic_set(&mpol_new->refcnt, 1);
2420 				sp_node_init(n_new, end, n->end, mpol_new);
2421 				n->end = start;
2422 				sp_insert(sp, n_new);
2423 				n_new = NULL;
2424 				mpol_new = NULL;
2425 				break;
2426 			} else
2427 				n->end = start;
2428 		}
2429 		if (!next)
2430 			break;
2431 		n = rb_entry(next, struct sp_node, nd);
2432 	}
2433 	if (new)
2434 		sp_insert(sp, new);
2435 	write_unlock(&sp->lock);
2436 	ret = 0;
2437 
2438 err_out:
2439 	if (mpol_new)
2440 		mpol_put(mpol_new);
2441 	if (n_new)
2442 		kmem_cache_free(sn_cache, n_new);
2443 
2444 	return ret;
2445 
2446 alloc_new:
2447 	write_unlock(&sp->lock);
2448 	ret = -ENOMEM;
2449 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2450 	if (!n_new)
2451 		goto err_out;
2452 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2453 	if (!mpol_new)
2454 		goto err_out;
2455 	goto restart;
2456 }
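
/*
 * Example: if an existing node covers [0, 16) and the new range is [4, 8),
 * the old node is trimmed to [0, 4), a copy of its policy is installed for
 * [8, 16) through the alloc_new path, and @new is inserted for [4, 8).
 * The unlock/allocate/restart dance is needed because the GFP_KERNEL
 * allocations may sleep, which is not allowed while sp->lock is held for
 * writing.
 */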
2457 
2458 /**
2459  * mpol_shared_policy_init - initialize shared policy for inode
2460  * @sp: pointer to inode shared policy
2461  * @mpol:  struct mempolicy to install
2462  *
2463  * Install non-NULL @mpol in inode's shared policy rb-tree.
2464  * On entry, the current task has a reference on a non-NULL @mpol.
2465  * This must be released on exit.
2466  * This is called at get_inode() time, so we can use GFP_KERNEL.
2467  */
2468 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2469 {
2470 	int ret;
2471 
2472 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2473 	rwlock_init(&sp->lock);
2474 
2475 	if (mpol) {
2476 		struct vm_area_struct pvma;
2477 		struct mempolicy *new;
2478 		NODEMASK_SCRATCH(scratch);
2479 
2480 		if (!scratch)
2481 			goto put_mpol;
2482 		/* contextualize the tmpfs mount point mempolicy */
2483 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2484 		if (IS_ERR(new))
2485 			goto free_scratch; /* no valid nodemask intersection */
2486 
2487 		task_lock(current);
2488 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2489 		task_unlock(current);
2490 		if (ret)
2491 			goto put_new;
2492 
2493 		/* Create pseudo-vma that contains just the policy */
2494 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2495 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2496 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2497 
2498 put_new:
2499 		mpol_put(new);			/* drop initial ref */
2500 free_scratch:
2501 		NODEMASK_SCRATCH_FREE(scratch);
2502 put_mpol:
2503 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2504 	}
2505 }
2506 
2507 int mpol_set_shared_policy(struct shared_policy *info,
2508 			struct vm_area_struct *vma, struct mempolicy *npol)
2509 {
2510 	int err;
2511 	struct sp_node *new = NULL;
2512 	unsigned long sz = vma_pages(vma);
2513 
2514 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2515 		 vma->vm_pgoff,
2516 		 sz, npol ? npol->mode : -1,
2517 		 npol ? npol->flags : -1,
2518 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2519 
2520 	if (npol) {
2521 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2522 		if (!new)
2523 			return -ENOMEM;
2524 	}
2525 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2526 	if (err && new)
2527 		sp_free(new);
2528 	return err;
2529 }
2530 
2531 /* Free a backing policy store on inode delete. */
2532 void mpol_free_shared_policy(struct shared_policy *p)
2533 {
2534 	struct sp_node *n;
2535 	struct rb_node *next;
2536 
2537 	if (!p->root.rb_node)
2538 		return;
2539 	write_lock(&p->lock);
2540 	next = rb_first(&p->root);
2541 	while (next) {
2542 		n = rb_entry(next, struct sp_node, nd);
2543 		next = rb_next(&n->nd);
2544 		sp_delete(p, n);
2545 	}
2546 	write_unlock(&p->lock);
2547 }
2548 
2549 #ifdef CONFIG_NUMA_BALANCING
2550 static int __initdata numabalancing_override;
2551 
2552 static void __init check_numabalancing_enable(void)
2553 {
2554 	bool numabalancing_default = false;
2555 
2556 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2557 		numabalancing_default = true;
2558 
2559 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2560 	if (numabalancing_override)
2561 		set_numabalancing_state(numabalancing_override == 1);
2562 
2563 	if (num_online_nodes() > 1 && !numabalancing_override) {
2564 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2565 			numabalancing_default ? "Enabling" : "Disabling");
2566 		set_numabalancing_state(numabalancing_default);
2567 	}
2568 }
2569 
2570 static int __init setup_numabalancing(char *str)
2571 {
2572 	int ret = 0;
2573 	if (!str)
2574 		goto out;
2575 
2576 	if (!strcmp(str, "enable")) {
2577 		numabalancing_override = 1;
2578 		ret = 1;
2579 	} else if (!strcmp(str, "disable")) {
2580 		numabalancing_override = -1;
2581 		ret = 1;
2582 	}
2583 out:
2584 	if (!ret)
2585 		pr_warn("Unable to parse numa_balancing=\n");
2586 
2587 	return ret;
2588 }
2589 __setup("numa_balancing=", setup_numabalancing);
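
/*
 * Example: booting with "numa_balancing=disable" keeps automatic NUMA
 * balancing off even when CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set;
 * it can still be toggled later through the kernel.numa_balancing sysctl.
 */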
2590 #else
2591 static inline void __init check_numabalancing_enable(void)
2592 {
2593 }
2594 #endif /* CONFIG_NUMA_BALANCING */
2595 
2596 /* assumes fs == KERNEL_DS */
2597 void __init numa_policy_init(void)
2598 {
2599 	nodemask_t interleave_nodes;
2600 	unsigned long largest = 0;
2601 	int nid, prefer = 0;
2602 
2603 	policy_cache = kmem_cache_create("numa_policy",
2604 					 sizeof(struct mempolicy),
2605 					 0, SLAB_PANIC, NULL);
2606 
2607 	sn_cache = kmem_cache_create("shared_policy_node",
2608 				     sizeof(struct sp_node),
2609 				     0, SLAB_PANIC, NULL);
2610 
2611 	for_each_node(nid) {
2612 		preferred_node_policy[nid] = (struct mempolicy) {
2613 			.refcnt = ATOMIC_INIT(1),
2614 			.mode = MPOL_PREFERRED,
2615 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2616 			.v = { .preferred_node = nid, },
2617 		};
2618 	}
2619 
2620 	/*
2621 	 * Set interleaving policy for system init. Interleaving is only
2622 	 * enabled across suitably sized nodes (default is >= 16MB), or
2623 	 * fall back to the largest node if they're all smaller.
2624 	 */
2625 	nodes_clear(interleave_nodes);
2626 	for_each_node_state(nid, N_MEMORY) {
2627 		unsigned long total_pages = node_present_pages(nid);
2628 
2629 		/* Preserve the largest node */
2630 		if (largest < total_pages) {
2631 			largest = total_pages;
2632 			prefer = nid;
2633 		}
2634 
2635 		/* Interleave this node? */
2636 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2637 			node_set(nid, interleave_nodes);
2638 	}
2639 
2640 	/* All too small, use the largest */
2641 	if (unlikely(nodes_empty(interleave_nodes)))
2642 		node_set(prefer, interleave_nodes);
2643 
2644 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2645 		pr_err("%s: interleaving failed\n", __func__);
2646 
2647 	check_numabalancing_enable();
2648 }
2649 
2650 /* Reset policy of current process to default */
2651 void numa_default_policy(void)
2652 {
2653 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2654 }
2655 
2656 /*
2657  * Parse and format mempolicy from/to strings
2658  */
2659 
2660 /*
2661  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2662  */
2663 static const char * const policy_modes[] =
2664 {
2665 	[MPOL_DEFAULT]    = "default",
2666 	[MPOL_PREFERRED]  = "prefer",
2667 	[MPOL_BIND]       = "bind",
2668 	[MPOL_INTERLEAVE] = "interleave",
2669 	[MPOL_LOCAL]      = "local",
2670 };
2671 
2672 
2673 #ifdef CONFIG_TMPFS
2674 /**
2675  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2676  * @str:  string containing mempolicy to parse
2677  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2678  *
2679  * Format of input:
2680  *	<mode>[=<flags>][:<nodelist>]
2681  *
2682  * On success, returns 0, else 1
2683  */
2684 int mpol_parse_str(char *str, struct mempolicy **mpol)
2685 {
2686 	struct mempolicy *new = NULL;
2687 	unsigned short mode;
2688 	unsigned short mode_flags;
2689 	nodemask_t nodes;
2690 	char *nodelist = strchr(str, ':');
2691 	char *flags = strchr(str, '=');
2692 	int err = 1;
2693 
2694 	if (nodelist) {
2695 		/* NUL-terminate mode or flags string */
2696 		*nodelist++ = '\0';
2697 		if (nodelist_parse(nodelist, nodes))
2698 			goto out;
2699 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2700 			goto out;
2701 	} else
2702 		nodes_clear(nodes);
2703 
2704 	if (flags)
2705 		*flags++ = '\0';	/* terminate mode string */
2706 
2707 	for (mode = 0; mode < MPOL_MAX; mode++) {
2708 		if (!strcmp(str, policy_modes[mode])) {
2709 			break;
2710 		}
2711 	}
2712 	if (mode >= MPOL_MAX)
2713 		goto out;
2714 
2715 	switch (mode) {
2716 	case MPOL_PREFERRED:
2717 		/*
2718 		 * Insist on a nodelist of one node only
2719 		 */
2720 		if (nodelist) {
2721 			char *rest = nodelist;
2722 			while (isdigit(*rest))
2723 				rest++;
2724 			if (*rest)
2725 				goto out;
2726 		}
2727 		break;
2728 	case MPOL_INTERLEAVE:
2729 		/*
2730 		 * Default to online nodes with memory if no nodelist
2731 		 */
2732 		if (!nodelist)
2733 			nodes = node_states[N_MEMORY];
2734 		break;
2735 	case MPOL_LOCAL:
2736 		/*
2737 		 * Don't allow a nodelist;  mpol_new() checks flags
2738 		 */
2739 		if (nodelist)
2740 			goto out;
2741 		mode = MPOL_PREFERRED;
2742 		break;
2743 	case MPOL_DEFAULT:
2744 		/*
2745 		 * Insist on an empty nodelist
2746 		 */
2747 		if (!nodelist)
2748 			err = 0;
2749 		goto out;
2750 	case MPOL_BIND:
2751 		/*
2752 		 * Insist on a nodelist
2753 		 */
2754 		if (!nodelist)
2755 			goto out;
2756 	}
2757 
2758 	mode_flags = 0;
2759 	if (flags) {
2760 		/*
2761 		 * Currently, we only support two mutually exclusive
2762 		 * mode flags.
2763 		 */
2764 		if (!strcmp(flags, "static"))
2765 			mode_flags |= MPOL_F_STATIC_NODES;
2766 		else if (!strcmp(flags, "relative"))
2767 			mode_flags |= MPOL_F_RELATIVE_NODES;
2768 		else
2769 			goto out;
2770 	}
2771 
2772 	new = mpol_new(mode, mode_flags, &nodes);
2773 	if (IS_ERR(new))
2774 		goto out;
2775 
2776 	/*
2777 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2778 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2779 	 */
2780 	if (mode != MPOL_PREFERRED)
2781 		new->v.nodes = nodes;
2782 	else if (nodelist)
2783 		new->v.preferred_node = first_node(nodes);
2784 	else
2785 		new->flags |= MPOL_F_LOCAL;
2786 
2787 	/*
2788 	 * Save nodes for contextualization: this will be used to "clone"
2789 	 * the mempolicy in a specific context [cpuset] at a later time.
2790 	 */
2791 	new->w.user_nodemask = nodes;
2792 
2793 	err = 0;
2794 
2795 out:
2796 	/* Restore string for error message */
2797 	if (nodelist)
2798 		*--nodelist = ':';
2799 	if (flags)
2800 		*--flags = '=';
2801 	if (!err)
2802 		*mpol = new;
2803 	return err;
2804 }
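
/*
 * Example (a sketch of how the tmpfs "mpol=" mount option value is parsed;
 * the surrounding mount-option plumbing lives in mm/shmem.c and is omitted):
 *
 *	char str[] = "interleave=relative:0-3";
 *	struct mempolicy *mpol;
 *
 *	if (!mpol_parse_str(str, &mpol)) {
 *		mpol now interleaves over nodes 0-3, and the user's
 *		nodemask is remembered in mpol->w.user_nodemask for later
 *		cpuset contextualization
 *	}
 */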
2805 #endif /* CONFIG_TMPFS */
2806 
2807 /**
2808  * mpol_to_str - format a mempolicy structure for printing
2809  * @buffer:  to contain formatted mempolicy string
2810  * @maxlen:  length of @buffer
2811  * @pol:  pointer to mempolicy to be formatted
2812  *
2813  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2814  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2815  * longest flag, "relative", and to display at least a few node ids.
2816  */
2817 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2818 {
2819 	char *p = buffer;
2820 	nodemask_t nodes = NODE_MASK_NONE;
2821 	unsigned short mode = MPOL_DEFAULT;
2822 	unsigned short flags = 0;
2823 
2824 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2825 		mode = pol->mode;
2826 		flags = pol->flags;
2827 	}
2828 
2829 	switch (mode) {
2830 	case MPOL_DEFAULT:
2831 		break;
2832 	case MPOL_PREFERRED:
2833 		if (flags & MPOL_F_LOCAL)
2834 			mode = MPOL_LOCAL;
2835 		else
2836 			node_set(pol->v.preferred_node, nodes);
2837 		break;
2838 	case MPOL_BIND:
2839 	case MPOL_INTERLEAVE:
2840 		nodes = pol->v.nodes;
2841 		break;
2842 	default:
2843 		WARN_ON_ONCE(1);
2844 		snprintf(p, maxlen, "unknown");
2845 		return;
2846 	}
2847 
2848 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2849 
2850 	if (flags & MPOL_MODE_FLAGS) {
2851 		p += snprintf(p, buffer + maxlen - p, "=");
2852 
2853 		/*
2854 		 * Currently, the only defined flags are mutually exclusive
2855 		 */
2856 		if (flags & MPOL_F_STATIC_NODES)
2857 			p += snprintf(p, buffer + maxlen - p, "static");
2858 		else if (flags & MPOL_F_RELATIVE_NODES)
2859 			p += snprintf(p, buffer + maxlen - p, "relative");
2860 	}
2861 
2862 	if (!nodes_empty(nodes))
2863 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2864 			       nodemask_pr_args(&nodes));
2865 }
2866
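
/*
 * Example: an MPOL_INTERLEAVE policy with MPOL_F_STATIC_NODES over nodes
 * 0-3 is formatted as "interleave=static:0-3"; a NULL @pol, the system
 * default policy and MPOL_F_MORON policies all format as "default".
 */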