xref: /openbmc/linux/mm/mempolicy.c (revision 56d06fa2)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to the given memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
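/*
 * Illustrative userspace sketch of the policies described above (hypothetical
 * example, assuming libnuma's <numaif.h> for the MPOL_* constants and the
 * set_mempolicy(2) wrapper); a rough sketch, not a reference:
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// Interleave this task's future allocations over nodes 0 and 1.
 *		unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *		// get_nodes() below consumes maxnode - 1 bits of the mask.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)) < 0)
 *			perror("set_mempolicy");
 *		return 0;
 *	}
 */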
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 #include <linux/random.h>
101 
102 #include "internal.h"
103 
104 /* Internal flags */
105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
107 
108 static struct kmem_cache *policy_cache;
109 static struct kmem_cache *sn_cache;
110 
111 /* Highest zone. A specific allocation for a zone below that is not
112    policied. */
113 enum zone_type policy_zone = 0;
114 
115 /*
116  * run-time system-wide default policy => local allocation
117  */
118 static struct mempolicy default_policy = {
119 	.refcnt = ATOMIC_INIT(1), /* never free it */
120 	.mode = MPOL_PREFERRED,
121 	.flags = MPOL_F_LOCAL,
122 };
123 
124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 
126 struct mempolicy *get_task_policy(struct task_struct *p)
127 {
128 	struct mempolicy *pol = p->mempolicy;
129 	int node;
130 
131 	if (pol)
132 		return pol;
133 
134 	node = numa_node_id();
135 	if (node != NUMA_NO_NODE) {
136 		pol = &preferred_node_policy[node];
137 		/* preferred_node_policy is not initialised early in boot */
138 		if (pol->mode)
139 			return pol;
140 	}
141 
142 	return &default_policy;
143 }
144 
145 static const struct mempolicy_operations {
146 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
147 	/*
148 	 * If a read-side task has no lock protecting task->mempolicy, the
149 	 * write-side task rebinds task->mempolicy in two steps: first set all
150 	 * the newly allowed nodes, then clear all the now-disallowed nodes.
151 	 * This way a reader never observes an empty nodemask and can always
152 	 * find a node to allocate from.
153 	 * If the read side does hold a lock protecting task->mempolicy, we
154 	 * rebind directly in one step.
155 	 *
156 	 * step:
157 	 * 	MPOL_REBIND_ONCE - do rebind work at once
158 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
159 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
160 	 */
161 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
162 			enum mpol_rebind_step step);
163 } mpol_ops[MPOL_MAX];
164 
165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
166 {
167 	return pol->flags & MPOL_MODE_FLAGS;
168 }
169 
170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
171 				   const nodemask_t *rel)
172 {
173 	nodemask_t tmp;
174 	nodes_fold(tmp, *orig, nodes_weight(*rel));
175 	nodes_onto(*ret, tmp, *rel);
176 }
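/*
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES, a user nodemask
 * of {0,2} applied relative to an allowed set of {4,6,8} is first folded to
 * the weight of the allowed set (3 nodes) and then mapped onto it: relative
 * bit 0 becomes the first allowed node (4) and relative bit 2 becomes the
 * third (8), giving {4,8}.
 */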
177 
178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
179 {
180 	if (nodes_empty(*nodes))
181 		return -EINVAL;
182 	pol->v.nodes = *nodes;
183 	return 0;
184 }
185 
186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (!nodes)
189 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
190 	else if (nodes_empty(*nodes))
191 		return -EINVAL;			/*  no allowed nodes */
192 	else
193 		pol->v.preferred_node = first_node(*nodes);
194 	return 0;
195 }
196 
197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
198 {
199 	if (nodes_empty(*nodes))
200 		return -EINVAL;
201 	pol->v.nodes = *nodes;
202 	return 0;
203 }
204 
205 /*
206  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
207  * any, for the new policy.  mpol_new() has already validated the nodes
208  * parameter with respect to the policy mode and flags.  But, we need to
209  * handle an empty nodemask with MPOL_PREFERRED here.
210  *
211  * Must be called holding task's alloc_lock to protect task's mems_allowed
212  * and mempolicy.  May also be called holding mmap_sem for write.
213  */
214 static int mpol_set_nodemask(struct mempolicy *pol,
215 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
216 {
217 	int ret;
218 
219 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
220 	if (pol == NULL)
221 		return 0;
222 	/* Check N_MEMORY */
223 	nodes_and(nsc->mask1,
224 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
225 
226 	VM_BUG_ON(!nodes);
227 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
228 		nodes = NULL;	/* explicit local allocation */
229 	else {
230 		if (pol->flags & MPOL_F_RELATIVE_NODES)
231 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
232 		else
233 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
234 
235 		if (mpol_store_user_nodemask(pol))
236 			pol->w.user_nodemask = *nodes;
237 		else
238 			pol->w.cpuset_mems_allowed =
239 						cpuset_current_mems_allowed;
240 	}
241 
242 	if (nodes)
243 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
244 	else
245 		ret = mpol_ops[pol->mode].create(pol, NULL);
246 	return ret;
247 }
248 
249 /*
250  * This function just creates a new policy, does some checks and simple
251  * initialization. You must invoke mpol_set_nodemask() to set nodes.
252  */
253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 				  nodemask_t *nodes)
255 {
256 	struct mempolicy *policy;
257 
258 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
259 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
260 
261 	if (mode == MPOL_DEFAULT) {
262 		if (nodes && !nodes_empty(*nodes))
263 			return ERR_PTR(-EINVAL);
264 		return NULL;
265 	}
266 	VM_BUG_ON(!nodes);
267 
268 	/*
269 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
270 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
271 	 * All other modes require a valid pointer to a non-empty nodemask.
272 	 */
273 	if (mode == MPOL_PREFERRED) {
274 		if (nodes_empty(*nodes)) {
275 			if (((flags & MPOL_F_STATIC_NODES) ||
276 			     (flags & MPOL_F_RELATIVE_NODES)))
277 				return ERR_PTR(-EINVAL);
278 		}
279 	} else if (mode == MPOL_LOCAL) {
280 		if (!nodes_empty(*nodes))
281 			return ERR_PTR(-EINVAL);
282 		mode = MPOL_PREFERRED;
283 	} else if (nodes_empty(*nodes))
284 		return ERR_PTR(-EINVAL);
285 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
286 	if (!policy)
287 		return ERR_PTR(-ENOMEM);
288 	atomic_set(&policy->refcnt, 1);
289 	policy->mode = mode;
290 	policy->flags = flags;
291 
292 	return policy;
293 }
294 
295 /* Slow path of a mpol destructor. */
296 void __mpol_put(struct mempolicy *p)
297 {
298 	if (!atomic_dec_and_test(&p->refcnt))
299 		return;
300 	kmem_cache_free(policy_cache, p);
301 }
302 
303 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
304 				enum mpol_rebind_step step)
305 {
306 }
307 
308 /*
309  * step:
310  * 	MPOL_REBIND_ONCE  - do rebind work at once
311  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
312  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
313  */
314 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
315 				 enum mpol_rebind_step step)
316 {
317 	nodemask_t tmp;
318 
319 	if (pol->flags & MPOL_F_STATIC_NODES)
320 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
321 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 	else {
324 		/*
325 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
326 		 * result
327 		 */
328 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
329 			nodes_remap(tmp, pol->v.nodes,
330 					pol->w.cpuset_mems_allowed, *nodes);
331 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
332 		} else if (step == MPOL_REBIND_STEP2) {
333 			tmp = pol->w.cpuset_mems_allowed;
334 			pol->w.cpuset_mems_allowed = *nodes;
335 		} else
336 			BUG();
337 	}
338 
339 	if (nodes_empty(tmp))
340 		tmp = *nodes;
341 
342 	if (step == MPOL_REBIND_STEP1)
343 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
344 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
345 		pol->v.nodes = tmp;
346 	else
347 		BUG();
348 
349 	if (!node_isset(current->il_next, tmp)) {
350 		current->il_next = next_node(current->il_next, tmp);
351 		if (current->il_next >= MAX_NUMNODES)
352 			current->il_next = first_node(tmp);
353 		if (current->il_next >= MAX_NUMNODES)
354 			current->il_next = numa_node_id();
355 	}
356 }
357 
358 static void mpol_rebind_preferred(struct mempolicy *pol,
359 				  const nodemask_t *nodes,
360 				  enum mpol_rebind_step step)
361 {
362 	nodemask_t tmp;
363 
364 	if (pol->flags & MPOL_F_STATIC_NODES) {
365 		int node = first_node(pol->w.user_nodemask);
366 
367 		if (node_isset(node, *nodes)) {
368 			pol->v.preferred_node = node;
369 			pol->flags &= ~MPOL_F_LOCAL;
370 		} else
371 			pol->flags |= MPOL_F_LOCAL;
372 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
373 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
374 		pol->v.preferred_node = first_node(tmp);
375 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
376 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
377 						   pol->w.cpuset_mems_allowed,
378 						   *nodes);
379 		pol->w.cpuset_mems_allowed = *nodes;
380 	}
381 }
382 
383 /*
384  * mpol_rebind_policy - Migrate a policy to a different set of nodes
385  *
386  * If a read-side task has no lock protecting task->mempolicy, the
387  * write-side task rebinds task->mempolicy in two steps: first set all
388  * the newly allowed nodes, then clear all the now-disallowed nodes.
389  * This way a reader never observes an empty nodemask and can always
390  * find a node to allocate from.
391  * If the read side does hold a lock protecting task->mempolicy, we
392  * rebind directly in one step.
393  *
394  * step:
395  * 	MPOL_REBIND_ONCE  - do rebind work at once
396  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
397  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
398  */
399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
400 				enum mpol_rebind_step step)
401 {
402 	if (!pol)
403 		return;
404 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
405 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
409 		return;
410 
411 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
412 		BUG();
413 
414 	if (step == MPOL_REBIND_STEP1)
415 		pol->flags |= MPOL_F_REBINDING;
416 	else if (step == MPOL_REBIND_STEP2)
417 		pol->flags &= ~MPOL_F_REBINDING;
418 	else if (step >= MPOL_REBIND_NSTEP)
419 		BUG();
420 
421 	mpol_ops[pol->mode].rebind(pol, newmask, step);
422 }
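/*
 * Worked example (illustrative): an MPOL_INTERLEAVE policy over {0,1} whose
 * cpuset is rebound to {2,3} without a read-side lock goes through
 *
 *	MPOL_REBIND_STEP1: v.nodes = {0,1} | {2,3} = {0,1,2,3}  (add new nodes)
 *	MPOL_REBIND_STEP2: v.nodes = {2,3}                      (drop old nodes)
 *
 * so a concurrent reader always sees at least one allowed node.  With a
 * read-side lock held, MPOL_REBIND_ONCE goes straight from {0,1} to {2,3}.
 */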
423 
424 /*
425  * Wrapper for mpol_rebind_policy() that just requires task
426  * pointer, and updates task mempolicy.
427  *
428  * Called with task's alloc_lock held.
429  */
430 
431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
432 			enum mpol_rebind_step step)
433 {
434 	mpol_rebind_policy(tsk->mempolicy, new, step);
435 }
436 
437 /*
438  * Rebind each vma in mm to new nodemask.
439  *
440  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
441  */
442 
443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
444 {
445 	struct vm_area_struct *vma;
446 
447 	down_write(&mm->mmap_sem);
448 	for (vma = mm->mmap; vma; vma = vma->vm_next)
449 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
450 	up_write(&mm->mmap_sem);
451 }
452 
453 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
454 	[MPOL_DEFAULT] = {
455 		.rebind = mpol_rebind_default,
456 	},
457 	[MPOL_INTERLEAVE] = {
458 		.create = mpol_new_interleave,
459 		.rebind = mpol_rebind_nodemask,
460 	},
461 	[MPOL_PREFERRED] = {
462 		.create = mpol_new_preferred,
463 		.rebind = mpol_rebind_preferred,
464 	},
465 	[MPOL_BIND] = {
466 		.create = mpol_new_bind,
467 		.rebind = mpol_rebind_nodemask,
468 	},
469 };
470 
471 static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 				unsigned long flags);
473 
474 struct queue_pages {
475 	struct list_head *pagelist;
476 	unsigned long flags;
477 	nodemask_t *nmask;
478 	struct vm_area_struct *prev;
479 };
480 
481 /*
482  * Scan through the pages, checking whether they satisfy the given
483  * conditions, and move them to the pagelist if they do.
484  */
485 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
486 			unsigned long end, struct mm_walk *walk)
487 {
488 	struct vm_area_struct *vma = walk->vma;
489 	struct page *page;
490 	struct queue_pages *qp = walk->private;
491 	unsigned long flags = qp->flags;
492 	int nid, ret;
493 	pte_t *pte;
494 	spinlock_t *ptl;
495 
496 	if (pmd_trans_huge(*pmd)) {
497 		ptl = pmd_lock(walk->mm, pmd);
498 		if (pmd_trans_huge(*pmd)) {
499 			page = pmd_page(*pmd);
500 			if (is_huge_zero_page(page)) {
501 				spin_unlock(ptl);
502 				split_huge_pmd(vma, pmd, addr);
503 			} else {
504 				get_page(page);
505 				spin_unlock(ptl);
506 				lock_page(page);
507 				ret = split_huge_page(page);
508 				unlock_page(page);
509 				put_page(page);
510 				if (ret)
511 					return 0;
512 			}
513 		} else {
514 			spin_unlock(ptl);
515 		}
516 	}
517 
518 retry:
519 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
520 	for (; addr != end; pte++, addr += PAGE_SIZE) {
521 		if (!pte_present(*pte))
522 			continue;
523 		page = vm_normal_page(vma, addr, *pte);
524 		if (!page)
525 			continue;
526 		/*
527 		 * vm_normal_page() filters out zero pages, but there might
528 		 * still be PageReserved pages to skip, perhaps in a VDSO.
529 		 */
530 		if (PageReserved(page))
531 			continue;
532 		nid = page_to_nid(page);
533 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
534 			continue;
535 		if (PageTransCompound(page) && PageAnon(page)) {
536 			get_page(page);
537 			pte_unmap_unlock(pte, ptl);
538 			lock_page(page);
539 			ret = split_huge_page(page);
540 			unlock_page(page);
541 			put_page(page);
542 			/* Failed to split -- skip. */
543 			if (ret) {
544 				pte = pte_offset_map_lock(walk->mm, pmd,
545 						addr, &ptl);
546 				continue;
547 			}
548 			goto retry;
549 		}
550 
551 		migrate_page_add(page, qp->pagelist, flags);
552 	}
553 	pte_unmap_unlock(pte - 1, ptl);
554 	cond_resched();
555 	return 0;
556 }
557 
558 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
559 			       unsigned long addr, unsigned long end,
560 			       struct mm_walk *walk)
561 {
562 #ifdef CONFIG_HUGETLB_PAGE
563 	struct queue_pages *qp = walk->private;
564 	unsigned long flags = qp->flags;
565 	int nid;
566 	struct page *page;
567 	spinlock_t *ptl;
568 	pte_t entry;
569 
570 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
571 	entry = huge_ptep_get(pte);
572 	if (!pte_present(entry))
573 		goto unlock;
574 	page = pte_page(entry);
575 	nid = page_to_nid(page);
576 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
577 		goto unlock;
578 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
579 	if (flags & (MPOL_MF_MOVE_ALL) ||
580 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
581 		isolate_huge_page(page, qp->pagelist);
582 unlock:
583 	spin_unlock(ptl);
584 #else
585 	BUG();
586 #endif
587 	return 0;
588 }
589 
590 #ifdef CONFIG_NUMA_BALANCING
591 /*
592  * This is used to mark a range of virtual addresses to be inaccessible.
593  * These are later cleared by a NUMA hinting fault. Depending on these
594  * faults, pages may be migrated for better NUMA placement.
595  *
596  * This is assuming that NUMA faults are handled using PROT_NONE. If
597  * an architecture makes a different choice, it will need further
598  * changes to the core.
599  */
600 unsigned long change_prot_numa(struct vm_area_struct *vma,
601 			unsigned long addr, unsigned long end)
602 {
603 	int nr_updated;
604 
605 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
606 	if (nr_updated)
607 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608 
609 	return nr_updated;
610 }
611 #else
612 static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 			unsigned long addr, unsigned long end)
614 {
615 	return 0;
616 }
617 #endif /* CONFIG_NUMA_BALANCING */
618 
619 static int queue_pages_test_walk(unsigned long start, unsigned long end,
620 				struct mm_walk *walk)
621 {
622 	struct vm_area_struct *vma = walk->vma;
623 	struct queue_pages *qp = walk->private;
624 	unsigned long endvma = vma->vm_end;
625 	unsigned long flags = qp->flags;
626 
627 	if (!vma_migratable(vma))
628 		return 1;
629 
630 	if (endvma > end)
631 		endvma = end;
632 	if (vma->vm_start > start)
633 		start = vma->vm_start;
634 
635 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
636 		if (!vma->vm_next && vma->vm_end < end)
637 			return -EFAULT;
638 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
639 			return -EFAULT;
640 	}
641 
642 	qp->prev = vma;
643 
644 	if (flags & MPOL_MF_LAZY) {
645 		/* Similar to task_numa_work, skip inaccessible VMAs */
646 		if (!is_vm_hugetlb_page(vma) &&
647 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
648 			!(vma->vm_flags & VM_MIXEDMAP))
649 			change_prot_numa(vma, start, endvma);
650 		return 1;
651 	}
652 
653 	/* queue pages from current vma */
654 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
655 		return 0;
656 	return 1;
657 }
658 
659 /*
660  * Walk through page tables and collect pages to be migrated.
661  *
662  * If pages found in a given range are on a set of nodes (determined by
663  * @nodes and @flags), they are isolated and queued to the pagelist
664  * passed via @private.
665  */
666 static int
667 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
668 		nodemask_t *nodes, unsigned long flags,
669 		struct list_head *pagelist)
670 {
671 	struct queue_pages qp = {
672 		.pagelist = pagelist,
673 		.flags = flags,
674 		.nmask = nodes,
675 		.prev = NULL,
676 	};
677 	struct mm_walk queue_pages_walk = {
678 		.hugetlb_entry = queue_pages_hugetlb,
679 		.pmd_entry = queue_pages_pte_range,
680 		.test_walk = queue_pages_test_walk,
681 		.mm = mm,
682 		.private = &qp,
683 	};
684 
685 	return walk_page_range(start, end, &queue_pages_walk);
686 }
687 
688 /*
689  * Apply policy to a single VMA
690  * This must be called with the mmap_sem held for writing.
691  */
692 static int vma_replace_policy(struct vm_area_struct *vma,
693 						struct mempolicy *pol)
694 {
695 	int err;
696 	struct mempolicy *old;
697 	struct mempolicy *new;
698 
699 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
700 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
701 		 vma->vm_ops, vma->vm_file,
702 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
703 
704 	new = mpol_dup(pol);
705 	if (IS_ERR(new))
706 		return PTR_ERR(new);
707 
708 	if (vma->vm_ops && vma->vm_ops->set_policy) {
709 		err = vma->vm_ops->set_policy(vma, new);
710 		if (err)
711 			goto err_out;
712 	}
713 
714 	old = vma->vm_policy;
715 	vma->vm_policy = new; /* protected by mmap_sem */
716 	mpol_put(old);
717 
718 	return 0;
719  err_out:
720 	mpol_put(new);
721 	return err;
722 }
723 
724 /* Step 2: apply policy to a range and do splits. */
725 static int mbind_range(struct mm_struct *mm, unsigned long start,
726 		       unsigned long end, struct mempolicy *new_pol)
727 {
728 	struct vm_area_struct *next;
729 	struct vm_area_struct *prev;
730 	struct vm_area_struct *vma;
731 	int err = 0;
732 	pgoff_t pgoff;
733 	unsigned long vmstart;
734 	unsigned long vmend;
735 
736 	vma = find_vma(mm, start);
737 	if (!vma || vma->vm_start > start)
738 		return -EFAULT;
739 
740 	prev = vma->vm_prev;
741 	if (start > vma->vm_start)
742 		prev = vma;
743 
744 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
745 		next = vma->vm_next;
746 		vmstart = max(start, vma->vm_start);
747 		vmend   = min(end, vma->vm_end);
748 
749 		if (mpol_equal(vma_policy(vma), new_pol))
750 			continue;
751 
752 		pgoff = vma->vm_pgoff +
753 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
754 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
755 				 vma->anon_vma, vma->vm_file, pgoff,
756 				 new_pol, vma->vm_userfaultfd_ctx);
757 		if (prev) {
758 			vma = prev;
759 			next = vma->vm_next;
760 			if (mpol_equal(vma_policy(vma), new_pol))
761 				continue;
762 			/* vma_merge() joined vma && vma->next, case 8 */
763 			goto replace;
764 		}
765 		if (vma->vm_start != vmstart) {
766 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
767 			if (err)
768 				goto out;
769 		}
770 		if (vma->vm_end != vmend) {
771 			err = split_vma(vma->vm_mm, vma, vmend, 0);
772 			if (err)
773 				goto out;
774 		}
775  replace:
776 		err = vma_replace_policy(vma, new_pol);
777 		if (err)
778 			goto out;
779 	}
780 
781  out:
782 	return err;
783 }
784 
785 /* Set the process memory policy */
786 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
787 			     nodemask_t *nodes)
788 {
789 	struct mempolicy *new, *old;
790 	NODEMASK_SCRATCH(scratch);
791 	int ret;
792 
793 	if (!scratch)
794 		return -ENOMEM;
795 
796 	new = mpol_new(mode, flags, nodes);
797 	if (IS_ERR(new)) {
798 		ret = PTR_ERR(new);
799 		goto out;
800 	}
801 
802 	task_lock(current);
803 	ret = mpol_set_nodemask(new, nodes, scratch);
804 	if (ret) {
805 		task_unlock(current);
806 		mpol_put(new);
807 		goto out;
808 	}
809 	old = current->mempolicy;
810 	current->mempolicy = new;
811 	if (new && new->mode == MPOL_INTERLEAVE &&
812 	    nodes_weight(new->v.nodes))
813 		current->il_next = first_node(new->v.nodes);
814 	task_unlock(current);
815 	mpol_put(old);
816 	ret = 0;
817 out:
818 	NODEMASK_SCRATCH_FREE(scratch);
819 	return ret;
820 }
821 
822 /*
823  * Return nodemask for policy for get_mempolicy() query
824  *
825  * Called with task's alloc_lock held
826  */
827 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
828 {
829 	nodes_clear(*nodes);
830 	if (p == &default_policy)
831 		return;
832 
833 	switch (p->mode) {
834 	case MPOL_BIND:
835 		/* Fall through */
836 	case MPOL_INTERLEAVE:
837 		*nodes = p->v.nodes;
838 		break;
839 	case MPOL_PREFERRED:
840 		if (!(p->flags & MPOL_F_LOCAL))
841 			node_set(p->v.preferred_node, *nodes);
842 		/* else return empty node mask for local allocation */
843 		break;
844 	default:
845 		BUG();
846 	}
847 }
848 
849 static int lookup_node(unsigned long addr)
850 {
851 	struct page *p;
852 	int err;
853 
854 	err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
855 	if (err >= 0) {
856 		err = page_to_nid(p);
857 		put_page(p);
858 	}
859 	return err;
860 }
861 
862 /* Retrieve NUMA policy */
863 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
864 			     unsigned long addr, unsigned long flags)
865 {
866 	int err;
867 	struct mm_struct *mm = current->mm;
868 	struct vm_area_struct *vma = NULL;
869 	struct mempolicy *pol = current->mempolicy;
870 
871 	if (flags &
872 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
873 		return -EINVAL;
874 
875 	if (flags & MPOL_F_MEMS_ALLOWED) {
876 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
877 			return -EINVAL;
878 		*policy = 0;	/* just so it's initialized */
879 		task_lock(current);
880 		*nmask  = cpuset_current_mems_allowed;
881 		task_unlock(current);
882 		return 0;
883 	}
884 
885 	if (flags & MPOL_F_ADDR) {
886 		/*
887 		 * Do NOT fall back to task policy if the
888 		 * vma/shared policy at addr is NULL.  We
889 		 * want to return MPOL_DEFAULT in this case.
890 		 */
891 		down_read(&mm->mmap_sem);
892 		vma = find_vma_intersection(mm, addr, addr+1);
893 		if (!vma) {
894 			up_read(&mm->mmap_sem);
895 			return -EFAULT;
896 		}
897 		if (vma->vm_ops && vma->vm_ops->get_policy)
898 			pol = vma->vm_ops->get_policy(vma, addr);
899 		else
900 			pol = vma->vm_policy;
901 	} else if (addr)
902 		return -EINVAL;
903 
904 	if (!pol)
905 		pol = &default_policy;	/* indicates default behavior */
906 
907 	if (flags & MPOL_F_NODE) {
908 		if (flags & MPOL_F_ADDR) {
909 			err = lookup_node(addr);
910 			if (err < 0)
911 				goto out;
912 			*policy = err;
913 		} else if (pol == current->mempolicy &&
914 				pol->mode == MPOL_INTERLEAVE) {
915 			*policy = current->il_next;
916 		} else {
917 			err = -EINVAL;
918 			goto out;
919 		}
920 	} else {
921 		*policy = pol == &default_policy ? MPOL_DEFAULT :
922 						pol->mode;
923 		/*
924 		 * Internal mempolicy flags must be masked off before exposing
925 		 * the policy to userspace.
926 		 */
927 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
928 	}
929 
930 	if (vma) {
931 		up_read(&current->mm->mmap_sem);
932 		vma = NULL;
933 	}
934 
935 	err = 0;
936 	if (nmask) {
937 		if (mpol_store_user_nodemask(pol)) {
938 			*nmask = pol->w.user_nodemask;
939 		} else {
940 			task_lock(current);
941 			get_policy_nodemask(pol, nmask);
942 			task_unlock(current);
943 		}
944 	}
945 
946  out:
947 	mpol_cond_put(pol);
948 	if (vma)
949 		up_read(&current->mm->mmap_sem);
950 	return err;
951 }
952 
953 #ifdef CONFIG_MIGRATION
954 /*
955  * page migration
956  */
957 static void migrate_page_add(struct page *page, struct list_head *pagelist,
958 				unsigned long flags)
959 {
960 	/*
961 	 * Avoid migrating a page that is shared with others.
962 	 */
963 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
964 		if (!isolate_lru_page(page)) {
965 			list_add_tail(&page->lru, pagelist);
966 			inc_zone_page_state(page, NR_ISOLATED_ANON +
967 					    page_is_file_cache(page));
968 		}
969 	}
970 }
971 
972 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
973 {
974 	if (PageHuge(page))
975 		return alloc_huge_page_node(page_hstate(compound_head(page)),
976 					node);
977 	else
978 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
979 						    __GFP_THISNODE, 0);
980 }
981 
982 /*
983  * Migrate pages from one node to a target node.
984  * Returns error or the number of pages not migrated.
985  */
986 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
987 			   int flags)
988 {
989 	nodemask_t nmask;
990 	LIST_HEAD(pagelist);
991 	int err = 0;
992 
993 	nodes_clear(nmask);
994 	node_set(source, nmask);
995 
996 	/*
997 	 * This does not "check" the range but isolates all pages that
998 	 * need migration.  Between passing in the full user address
999 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1000 	 */
1001 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1002 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1003 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1004 
1005 	if (!list_empty(&pagelist)) {
1006 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1007 					MIGRATE_SYNC, MR_SYSCALL);
1008 		if (err)
1009 			putback_movable_pages(&pagelist);
1010 	}
1011 
1012 	return err;
1013 }
1014 
1015 /*
1016  * Move pages between the two nodesets so as to preserve the physical
1017  * layout as much as possible.
1018  *
1019  * Returns the number of pages that could not be moved.
1020  */
1021 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1022 		     const nodemask_t *to, int flags)
1023 {
1024 	int busy = 0;
1025 	int err;
1026 	nodemask_t tmp;
1027 
1028 	err = migrate_prep();
1029 	if (err)
1030 		return err;
1031 
1032 	down_read(&mm->mmap_sem);
1033 
1034 	/*
1035 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1036 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1037 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1038 	 * The pair of nodemasks 'to' and 'from' define the map.
1039 	 *
1040 	 * If no pair of bits is found that way, fall back to picking some
1041 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1042 	 * 'source' and 'dest' bits are the same, this represents a node
1043 	 * that will be migrating to itself, so no pages need to move.
1044 	 *
1045 	 * If no bits are left in 'tmp', or if all remaining bits left
1046 	 * in 'tmp' correspond to the same bit in 'to', return false
1047 	 * (nothing left to migrate).
1048 	 *
1049 	 * This lets us pick a pair of nodes to migrate between, such that
1050 	 * if possible the dest node is not already occupied by some other
1051 	 * source node, minimizing the risk of overloading the memory on a
1052 	 * node that would happen if we migrated incoming memory to a node
1053 	 * before migrating the outgoing memory sourced from that same node.
1054 	 *
1055 	 * A single scan of tmp is sufficient.  As we go, we remember the
1056 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1057 	 * that not only moved, but what's better, moved to an empty slot
1058 	 * (d is not set in tmp), then we break out then, with that pair.
1059 	 * Otherwise, when we finish scanning tmp, we at least have the
1060 	 * most recent <s, d> pair that moved.  If we get all the way through
1061 	 * the scan of tmp without finding any node that moved, much less
1062 	 * moved to an empty node, then there is nothing left worth migrating.
1063 	 */
1064 
1065 	tmp = *from;
1066 	while (!nodes_empty(tmp)) {
1067 		int s,d;
1068 		int source = NUMA_NO_NODE;
1069 		int dest = 0;
1070 
1071 		for_each_node_mask(s, tmp) {
1072 
1073 			/*
1074 			 * do_migrate_pages() tries to maintain the relative
1075 			 * node relationship of the pages established between
1076 			 * threads and memory areas.
1077                          *
1078 			 * However if the number of source nodes is not equal to
1079 			 * the number of destination nodes we can not preserve
1080 			 * this node relative relationship.  In that case, skip
1081 			 * copying memory from a node that is in the destination
1082 			 * mask.
1083 			 *
1084 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1085 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1086 			 */
1087 
1088 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1089 						(node_isset(s, *to)))
1090 				continue;
1091 
1092 			d = node_remap(s, *from, *to);
1093 			if (s == d)
1094 				continue;
1095 
1096 			source = s;	/* Node moved. Memorize */
1097 			dest = d;
1098 
1099 			/* dest not in remaining from nodes? */
1100 			if (!node_isset(dest, tmp))
1101 				break;
1102 		}
1103 		if (source == NUMA_NO_NODE)
1104 			break;
1105 
1106 		node_clear(source, tmp);
1107 		err = migrate_to_node(mm, source, dest, flags);
1108 		if (err > 0)
1109 			busy += err;
1110 		if (err < 0)
1111 			break;
1112 	}
1113 	up_read(&mm->mmap_sem);
1114 	if (err < 0)
1115 		return err;
1116 	return busy;
1117 
1118 }
1119 
1120 /*
1121  * Allocate a new page for page migration based on vma policy.
1122  * Start by assuming the page is mapped by the same vma as contains @start.
1123  * Search forward from there, if not.  N.B., this assumes that the
1124  * list of pages handed to migrate_pages()--which is how we get here--
1125  * is in virtual address order.
1126  */
1127 static struct page *new_page(struct page *page, unsigned long start, int **x)
1128 {
1129 	struct vm_area_struct *vma;
1130 	unsigned long uninitialized_var(address);
1131 
1132 	vma = find_vma(current->mm, start);
1133 	while (vma) {
1134 		address = page_address_in_vma(page, vma);
1135 		if (address != -EFAULT)
1136 			break;
1137 		vma = vma->vm_next;
1138 	}
1139 
1140 	if (PageHuge(page)) {
1141 		BUG_ON(!vma);
1142 		return alloc_huge_page_noerr(vma, address, 1);
1143 	}
1144 	/*
1145 	 * if !vma, alloc_page_vma() will use task or system default policy
1146 	 */
1147 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1148 }
1149 #else
1150 
1151 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1152 				unsigned long flags)
1153 {
1154 }
1155 
1156 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1157 		     const nodemask_t *to, int flags)
1158 {
1159 	return -ENOSYS;
1160 }
1161 
1162 static struct page *new_page(struct page *page, unsigned long start, int **x)
1163 {
1164 	return NULL;
1165 }
1166 #endif
1167 
1168 static long do_mbind(unsigned long start, unsigned long len,
1169 		     unsigned short mode, unsigned short mode_flags,
1170 		     nodemask_t *nmask, unsigned long flags)
1171 {
1172 	struct mm_struct *mm = current->mm;
1173 	struct mempolicy *new;
1174 	unsigned long end;
1175 	int err;
1176 	LIST_HEAD(pagelist);
1177 
1178 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1179 		return -EINVAL;
1180 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1181 		return -EPERM;
1182 
1183 	if (start & ~PAGE_MASK)
1184 		return -EINVAL;
1185 
1186 	if (mode == MPOL_DEFAULT)
1187 		flags &= ~MPOL_MF_STRICT;
1188 
1189 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1190 	end = start + len;
1191 
1192 	if (end < start)
1193 		return -EINVAL;
1194 	if (end == start)
1195 		return 0;
1196 
1197 	new = mpol_new(mode, mode_flags, nmask);
1198 	if (IS_ERR(new))
1199 		return PTR_ERR(new);
1200 
1201 	if (flags & MPOL_MF_LAZY)
1202 		new->flags |= MPOL_F_MOF;
1203 
1204 	/*
1205 	 * If we are using the default policy then operation
1206 	 * on discontinuous address spaces is okay after all
1207 	 */
1208 	if (!new)
1209 		flags |= MPOL_MF_DISCONTIG_OK;
1210 
1211 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1212 		 start, start + len, mode, mode_flags,
1213 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1214 
1215 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1216 
1217 		err = migrate_prep();
1218 		if (err)
1219 			goto mpol_out;
1220 	}
1221 	{
1222 		NODEMASK_SCRATCH(scratch);
1223 		if (scratch) {
1224 			down_write(&mm->mmap_sem);
1225 			task_lock(current);
1226 			err = mpol_set_nodemask(new, nmask, scratch);
1227 			task_unlock(current);
1228 			if (err)
1229 				up_write(&mm->mmap_sem);
1230 		} else
1231 			err = -ENOMEM;
1232 		NODEMASK_SCRATCH_FREE(scratch);
1233 	}
1234 	if (err)
1235 		goto mpol_out;
1236 
1237 	err = queue_pages_range(mm, start, end, nmask,
1238 			  flags | MPOL_MF_INVERT, &pagelist);
1239 	if (!err)
1240 		err = mbind_range(mm, start, end, new);
1241 
1242 	if (!err) {
1243 		int nr_failed = 0;
1244 
1245 		if (!list_empty(&pagelist)) {
1246 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1247 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1248 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1249 			if (nr_failed)
1250 				putback_movable_pages(&pagelist);
1251 		}
1252 
1253 		if (nr_failed && (flags & MPOL_MF_STRICT))
1254 			err = -EIO;
1255 	} else
1256 		putback_movable_pages(&pagelist);
1257 
1258 	up_write(&mm->mmap_sem);
1259  mpol_out:
1260 	mpol_put(new);
1261 	return err;
1262 }
1263 
1264 /*
1265  * User space interface with variable sized bitmaps for nodelists.
1266  */
1267 
1268 /* Copy a node mask from user space. */
1269 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1270 		     unsigned long maxnode)
1271 {
1272 	unsigned long k;
1273 	unsigned long nlongs;
1274 	unsigned long endmask;
1275 
1276 	--maxnode;
1277 	nodes_clear(*nodes);
1278 	if (maxnode == 0 || !nmask)
1279 		return 0;
1280 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1281 		return -EINVAL;
1282 
1283 	nlongs = BITS_TO_LONGS(maxnode);
1284 	if ((maxnode % BITS_PER_LONG) == 0)
1285 		endmask = ~0UL;
1286 	else
1287 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1288 
1289 	/* When the user specifies more nodes than supported, just check
1290 	   that the unsupported part is all zero. */
1291 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1292 		if (nlongs > PAGE_SIZE/sizeof(long))
1293 			return -EINVAL;
1294 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1295 			unsigned long t;
1296 			if (get_user(t, nmask + k))
1297 				return -EFAULT;
1298 			if (k == nlongs - 1) {
1299 				if (t & endmask)
1300 					return -EINVAL;
1301 			} else if (t)
1302 				return -EINVAL;
1303 		}
1304 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1305 		endmask = ~0UL;
1306 	}
1307 
1308 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1309 		return -EFAULT;
1310 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1311 	return 0;
1312 }
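/*
 * Worked example (illustrative): a caller passing maxnode = 64 with a single
 * unsigned long ends up with maxnode = 63 after the decrement, nlongs = 1 and
 * endmask = (1UL << 63) - 1, so bits 0..62 of the user word are honoured and
 * bit 63 is cleared.  Set bits in longs beyond what MAX_NUMNODES needs cause
 * the call to fail with -EINVAL.
 */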
1313 
1314 /* Copy a kernel node mask to user space */
1315 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1316 			      nodemask_t *nodes)
1317 {
1318 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1319 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1320 
1321 	if (copy > nbytes) {
1322 		if (copy > PAGE_SIZE)
1323 			return -EINVAL;
1324 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1325 			return -EFAULT;
1326 		copy = nbytes;
1327 	}
1328 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1329 }
1330 
1331 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1332 		unsigned long, mode, const unsigned long __user *, nmask,
1333 		unsigned long, maxnode, unsigned, flags)
1334 {
1335 	nodemask_t nodes;
1336 	int err;
1337 	unsigned short mode_flags;
1338 
1339 	mode_flags = mode & MPOL_MODE_FLAGS;
1340 	mode &= ~MPOL_MODE_FLAGS;
1341 	if (mode >= MPOL_MAX)
1342 		return -EINVAL;
1343 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1344 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1345 		return -EINVAL;
1346 	err = get_nodes(&nodes, nmask, maxnode);
1347 	if (err)
1348 		return err;
1349 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1350 }
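/*
 * Illustrative userspace sketch of this syscall (hypothetical example,
 * assuming libnuma's <numaif.h>): bind an anonymous mapping to node 0,
 * migrating any pages already placed elsewhere and failing if that is not
 * possible.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 1 << 20;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		unsigned long mask = 1UL << 0;	// node 0 only
 *
 *		if (mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask),
 *			  MPOL_MF_MOVE | MPOL_MF_STRICT) < 0)
 *			perror("mbind");
 *		return 0;
 *	}
 */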
1351 
1352 /* Set the process memory policy */
1353 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1354 		unsigned long, maxnode)
1355 {
1356 	int err;
1357 	nodemask_t nodes;
1358 	unsigned short flags;
1359 
1360 	flags = mode & MPOL_MODE_FLAGS;
1361 	mode &= ~MPOL_MODE_FLAGS;
1362 	if ((unsigned int)mode >= MPOL_MAX)
1363 		return -EINVAL;
1364 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1365 		return -EINVAL;
1366 	err = get_nodes(&nodes, nmask, maxnode);
1367 	if (err)
1368 		return err;
1369 	return do_set_mempolicy(mode, flags, &nodes);
1370 }
1371 
1372 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1373 		const unsigned long __user *, old_nodes,
1374 		const unsigned long __user *, new_nodes)
1375 {
1376 	const struct cred *cred = current_cred(), *tcred;
1377 	struct mm_struct *mm = NULL;
1378 	struct task_struct *task;
1379 	nodemask_t task_nodes;
1380 	int err;
1381 	nodemask_t *old;
1382 	nodemask_t *new;
1383 	NODEMASK_SCRATCH(scratch);
1384 
1385 	if (!scratch)
1386 		return -ENOMEM;
1387 
1388 	old = &scratch->mask1;
1389 	new = &scratch->mask2;
1390 
1391 	err = get_nodes(old, old_nodes, maxnode);
1392 	if (err)
1393 		goto out;
1394 
1395 	err = get_nodes(new, new_nodes, maxnode);
1396 	if (err)
1397 		goto out;
1398 
1399 	/* Find the mm_struct */
1400 	rcu_read_lock();
1401 	task = pid ? find_task_by_vpid(pid) : current;
1402 	if (!task) {
1403 		rcu_read_unlock();
1404 		err = -ESRCH;
1405 		goto out;
1406 	}
1407 	get_task_struct(task);
1408 
1409 	err = -EINVAL;
1410 
1411 	/*
1412 	 * Check if this process has the right to modify the specified
1413 	 * process. The right exists if the process has administrative
1414 	 * capabilities, superuser privileges or the same
1415 	 * userid as the target process.
1416 	 */
1417 	tcred = __task_cred(task);
1418 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1419 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1420 	    !capable(CAP_SYS_NICE)) {
1421 		rcu_read_unlock();
1422 		err = -EPERM;
1423 		goto out_put;
1424 	}
1425 	rcu_read_unlock();
1426 
1427 	task_nodes = cpuset_mems_allowed(task);
1428 	/* Is the user allowed to access the target nodes? */
1429 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1430 		err = -EPERM;
1431 		goto out_put;
1432 	}
1433 
1434 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1435 		err = -EINVAL;
1436 		goto out_put;
1437 	}
1438 
1439 	err = security_task_movememory(task);
1440 	if (err)
1441 		goto out_put;
1442 
1443 	mm = get_task_mm(task);
1444 	put_task_struct(task);
1445 
1446 	if (!mm) {
1447 		err = -EINVAL;
1448 		goto out;
1449 	}
1450 
1451 	err = do_migrate_pages(mm, old, new,
1452 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1453 
1454 	mmput(mm);
1455 out:
1456 	NODEMASK_SCRATCH_FREE(scratch);
1457 
1458 	return err;
1459 
1460 out_put:
1461 	put_task_struct(task);
1462 	goto out;
1463 
1464 }
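/*
 * Illustrative userspace sketch of this syscall (hypothetical example,
 * assuming libnuma's <numaif.h>; 'target' is a placeholder pid): move another
 * process' pages from node 0 to node 1, subject to the permission checks
 * above.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long left = migrate_pages(target, 8 * sizeof(from), &from, &to);
 *	// left is negative on error, otherwise the number of pages not moved
 */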
1465 
1466 
1467 /* Retrieve NUMA policy */
1468 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1469 		unsigned long __user *, nmask, unsigned long, maxnode,
1470 		unsigned long, addr, unsigned long, flags)
1471 {
1472 	int err;
1473 	int uninitialized_var(pval);
1474 	nodemask_t nodes;
1475 
1476 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1477 		return -EINVAL;
1478 
1479 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1480 
1481 	if (err)
1482 		return err;
1483 
1484 	if (policy && put_user(pval, policy))
1485 		return -EFAULT;
1486 
1487 	if (nmask)
1488 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1489 
1490 	return err;
1491 }
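/*
 * Illustrative userspace sketch of this syscall (hypothetical example,
 * assuming libnuma's <numaif.h>; 'addr' is a placeholder pointer): ask which
 * node currently backs a given address.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */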
1492 
1493 #ifdef CONFIG_COMPAT
1494 
1495 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1496 		       compat_ulong_t __user *, nmask,
1497 		       compat_ulong_t, maxnode,
1498 		       compat_ulong_t, addr, compat_ulong_t, flags)
1499 {
1500 	long err;
1501 	unsigned long __user *nm = NULL;
1502 	unsigned long nr_bits, alloc_size;
1503 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1504 
1505 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1506 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1507 
1508 	if (nmask)
1509 		nm = compat_alloc_user_space(alloc_size);
1510 
1511 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1512 
1513 	if (!err && nmask) {
1514 		unsigned long copy_size;
1515 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1516 		err = copy_from_user(bm, nm, copy_size);
1517 		/* ensure entire bitmap is zeroed */
1518 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1519 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1520 	}
1521 
1522 	return err;
1523 }
1524 
1525 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1526 		       compat_ulong_t, maxnode)
1527 {
1528 	long err = 0;
1529 	unsigned long __user *nm = NULL;
1530 	unsigned long nr_bits, alloc_size;
1531 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1532 
1533 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1534 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1535 
1536 	if (nmask) {
1537 		err = compat_get_bitmap(bm, nmask, nr_bits);
1538 		nm = compat_alloc_user_space(alloc_size);
1539 		err |= copy_to_user(nm, bm, alloc_size);
1540 	}
1541 
1542 	if (err)
1543 		return -EFAULT;
1544 
1545 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1546 }
1547 
1548 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1549 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1550 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1551 {
1552 	long err = 0;
1553 	unsigned long __user *nm = NULL;
1554 	unsigned long nr_bits, alloc_size;
1555 	nodemask_t bm;
1556 
1557 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1558 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1559 
1560 	if (nmask) {
1561 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1562 		nm = compat_alloc_user_space(alloc_size);
1563 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1564 	}
1565 
1566 	if (err)
1567 		return -EFAULT;
1568 
1569 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1570 }
1571 
1572 #endif
1573 
1574 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1575 						unsigned long addr)
1576 {
1577 	struct mempolicy *pol = NULL;
1578 
1579 	if (vma) {
1580 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1581 			pol = vma->vm_ops->get_policy(vma, addr);
1582 		} else if (vma->vm_policy) {
1583 			pol = vma->vm_policy;
1584 
1585 			/*
1586 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1587 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1588 			 * count on these policies which will be dropped by
1589 			 * mpol_cond_put() later
1590 			 */
1591 			if (mpol_needs_cond_ref(pol))
1592 				mpol_get(pol);
1593 		}
1594 	}
1595 
1596 	return pol;
1597 }
1598 
1599 /*
1600  * get_vma_policy(@vma, @addr)
1601  * @vma: virtual memory area whose policy is sought
1602  * @addr: address in @vma for shared policy lookup
1603  *
1604  * Returns effective policy for a VMA at specified address.
1605  * Falls back to current->mempolicy or system default policy, as necessary.
1606  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1607  * count--added by the get_policy() vm_op, as appropriate--to protect against
1608  * freeing by another task.  It is the caller's responsibility to free the
1609  * extra reference for shared policies.
1610  */
1611 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1612 						unsigned long addr)
1613 {
1614 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1615 
1616 	if (!pol)
1617 		pol = get_task_policy(current);
1618 
1619 	return pol;
1620 }
1621 
1622 bool vma_policy_mof(struct vm_area_struct *vma)
1623 {
1624 	struct mempolicy *pol;
1625 
1626 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1627 		bool ret = false;
1628 
1629 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1630 		if (pol && (pol->flags & MPOL_F_MOF))
1631 			ret = true;
1632 		mpol_cond_put(pol);
1633 
1634 		return ret;
1635 	}
1636 
1637 	pol = vma->vm_policy;
1638 	if (!pol)
1639 		pol = get_task_policy(current);
1640 
1641 	return pol->flags & MPOL_F_MOF;
1642 }
1643 
1644 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1645 {
1646 	enum zone_type dynamic_policy_zone = policy_zone;
1647 
1648 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1649 
1650 	/*
1651 	 * If policy->v.nodes has movable memory only, we apply the policy
1652 	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1653 	 *
1654 	 * policy->v.nodes is intersected with node_states[N_MEMORY], so if
1655 	 * the following test fails, it implies that policy->v.nodes has
1656 	 * movable memory only.
1657 	 */
1658 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1659 		dynamic_policy_zone = ZONE_MOVABLE;
1660 
1661 	return zone >= dynamic_policy_zone;
1662 }
1663 
1664 /*
1665  * Return a nodemask representing a mempolicy for filtering nodes for
1666  * page allocation
1667  */
1668 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1669 {
1670 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1671 	if (unlikely(policy->mode == MPOL_BIND) &&
1672 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1673 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1674 		return &policy->v.nodes;
1675 
1676 	return NULL;
1677 }
1678 
1679 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1680 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1681 	int nd)
1682 {
1683 	switch (policy->mode) {
1684 	case MPOL_PREFERRED:
1685 		if (!(policy->flags & MPOL_F_LOCAL))
1686 			nd = policy->v.preferred_node;
1687 		break;
1688 	case MPOL_BIND:
1689 		/*
1690 		 * Normally, MPOL_BIND allocations are node-local within the
1691 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1692 		 * current node isn't part of the mask, we use the zonelist for
1693 		 * the first node in the mask instead.
1694 		 */
1695 		if (unlikely(gfp & __GFP_THISNODE) &&
1696 				unlikely(!node_isset(nd, policy->v.nodes)))
1697 			nd = first_node(policy->v.nodes);
1698 		break;
1699 	default:
1700 		BUG();
1701 	}
1702 	return node_zonelist(nd, gfp);
1703 }
1704 
1705 /* Do dynamic interleaving for a process */
1706 static unsigned interleave_nodes(struct mempolicy *policy)
1707 {
1708 	unsigned nid, next;
1709 	struct task_struct *me = current;
1710 
1711 	nid = me->il_next;
1712 	next = next_node(nid, policy->v.nodes);
1713 	if (next >= MAX_NUMNODES)
1714 		next = first_node(policy->v.nodes);
1715 	if (next < MAX_NUMNODES)
1716 		me->il_next = next;
1717 	return nid;
1718 }
1719 
1720 /*
1721  * Depending on the memory policy provide a node from which to allocate the
1722  * next slab entry.
1723  */
1724 unsigned int mempolicy_slab_node(void)
1725 {
1726 	struct mempolicy *policy;
1727 	int node = numa_mem_id();
1728 
1729 	if (in_interrupt())
1730 		return node;
1731 
1732 	policy = current->mempolicy;
1733 	if (!policy || policy->flags & MPOL_F_LOCAL)
1734 		return node;
1735 
1736 	switch (policy->mode) {
1737 	case MPOL_PREFERRED:
1738 		/*
1739 		 * handled MPOL_F_LOCAL above
1740 		 */
1741 		return policy->v.preferred_node;
1742 
1743 	case MPOL_INTERLEAVE:
1744 		return interleave_nodes(policy);
1745 
1746 	case MPOL_BIND: {
1747 		/*
1748 		 * Follow bind policy behavior and start allocation at the
1749 		 * first node.
1750 		 */
1751 		struct zonelist *zonelist;
1752 		struct zone *zone;
1753 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1754 		zonelist = &NODE_DATA(node)->node_zonelists[0];
1755 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1756 							&policy->v.nodes,
1757 							&zone);
1758 		return zone ? zone->node : node;
1759 	}
1760 
1761 	default:
1762 		BUG();
1763 	}
1764 }
1765 
1766 /* Do static interleaving for a VMA with known offset. */
1767 static unsigned offset_il_node(struct mempolicy *pol,
1768 		struct vm_area_struct *vma, unsigned long off)
1769 {
1770 	unsigned nnodes = nodes_weight(pol->v.nodes);
1771 	unsigned target;
1772 	int c;
1773 	int nid = NUMA_NO_NODE;
1774 
1775 	if (!nnodes)
1776 		return numa_node_id();
1777 	target = (unsigned int)off % nnodes;
1778 	c = 0;
1779 	do {
1780 		nid = next_node(nid, pol->v.nodes);
1781 		c++;
1782 	} while (c <= target);
1783 	return nid;
1784 }
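/*
 * Worked example (illustrative): for an interleave mask of {0,2,5} and a
 * page offset of 7, nnodes = 3 and target = 7 % 3 = 1, so the loop above
 * stops after the second node in the mask and the page is placed on node 2.
 */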
1785 
1786 /* Determine a node number for interleave */
1787 static inline unsigned interleave_nid(struct mempolicy *pol,
1788 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1789 {
1790 	if (vma) {
1791 		unsigned long off;
1792 
1793 		/*
1794 		 * for small pages, there is no difference between
1795 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1796 		 * for huge pages, since vm_pgoff is in units of small
1797 		 * pages, we need to shift off the always 0 bits to get
1798 		 * a useful offset.
1799 		 */
1800 		BUG_ON(shift < PAGE_SHIFT);
1801 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1802 		off += (addr - vma->vm_start) >> shift;
1803 		return offset_il_node(pol, vma, off);
1804 	} else
1805 		return interleave_nodes(pol);
1806 }
1807 
1808 /*
1809  * Return the bit number of a random bit set in the nodemask.
1810  * (returns NUMA_NO_NODE if nodemask is empty)
1811  */
1812 int node_random(const nodemask_t *maskp)
1813 {
1814 	int w, bit = NUMA_NO_NODE;
1815 
1816 	w = nodes_weight(*maskp);
1817 	if (w)
1818 		bit = bitmap_ord_to_pos(maskp->bits,
1819 			get_random_int() % w, MAX_NUMNODES);
1820 	return bit;
1821 }
1822 
1823 #ifdef CONFIG_HUGETLBFS
1824 /*
1825  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1826  * @vma: virtual memory area whose policy is sought
1827  * @addr: address in @vma for shared policy lookup and interleave policy
1828  * @gfp_flags: for requested zone
1829  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1830  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1831  *
1832  * Returns a zonelist suitable for a huge page allocation and a pointer
1833  * to the struct mempolicy for conditional unref after allocation.
1834  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1835  * @nodemask for filtering the zonelist.
1836  *
1837  * Must be protected by read_mems_allowed_begin()
1838  */
1839 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1840 				gfp_t gfp_flags, struct mempolicy **mpol,
1841 				nodemask_t **nodemask)
1842 {
1843 	struct zonelist *zl;
1844 
1845 	*mpol = get_vma_policy(vma, addr);
1846 	*nodemask = NULL;	/* assume !MPOL_BIND */
1847 
1848 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1849 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1850 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1851 	} else {
1852 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1853 		if ((*mpol)->mode == MPOL_BIND)
1854 			*nodemask = &(*mpol)->v.nodes;
1855 	}
1856 	return zl;
1857 }
1858 
1859 /*
1860  * init_nodemask_of_mempolicy
1861  *
1862  * If the current task's mempolicy is "default" [NULL], return 'false'
1863  * to indicate default policy.  Otherwise, extract the policy nodemask
1864  * for 'bind' or 'interleave' policy into the argument nodemask, or
1865  * initialize the argument nodemask to contain the single node for
1866  * 'preferred' or 'local' policy and return 'true' to indicate presence
1867  * of non-default mempolicy.
1868  *
1869  * We don't bother with reference counting the mempolicy [mpol_get/put]
1870  * because the current task is examining its own mempolicy and a task's
1871  * mempolicy is only ever changed by the task itself.
1872  *
1873  * N.B., it is the caller's responsibility to free a returned nodemask.
1874  */
1875 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1876 {
1877 	struct mempolicy *mempolicy;
1878 	int nid;
1879 
1880 	if (!(mask && current->mempolicy))
1881 		return false;
1882 
1883 	task_lock(current);
1884 	mempolicy = current->mempolicy;
1885 	switch (mempolicy->mode) {
1886 	case MPOL_PREFERRED:
1887 		if (mempolicy->flags & MPOL_F_LOCAL)
1888 			nid = numa_node_id();
1889 		else
1890 			nid = mempolicy->v.preferred_node;
1891 		init_nodemask_of_node(mask, nid);
1892 		break;
1893 
1894 	case MPOL_BIND:
1895 		/* Fall through */
1896 	case MPOL_INTERLEAVE:
1897 		*mask = mempolicy->v.nodes;
1898 		break;
1899 
1900 	default:
1901 		BUG();
1902 	}
1903 	task_unlock(current);
1904 
1905 	return true;
1906 }
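/*
 * Usage sketch (illustrative, loosely based on the hugetlb sysfs code in
 * mm/hugetlb.c):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *	if (!(nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed))) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 *
 * i.e. a 'false' return means "no restriction" and the caller falls back
 * to all memory nodes.
 */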
1907 #endif
1908 
1909 /*
1910  * mempolicy_nodemask_intersects
1911  *
1912  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1913  * policy.  Otherwise, check for intersection between mask and the policy
1914  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1915  * policy, always return true since it may allocate elsewhere on fallback.
1916  *
1917  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1918  */
1919 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1920 					const nodemask_t *mask)
1921 {
1922 	struct mempolicy *mempolicy;
1923 	bool ret = true;
1924 
1925 	if (!mask)
1926 		return ret;
1927 	task_lock(tsk);
1928 	mempolicy = tsk->mempolicy;
1929 	if (!mempolicy)
1930 		goto out;
1931 
1932 	switch (mempolicy->mode) {
1933 	case MPOL_PREFERRED:
1934 		/*
1935 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1936 		 * allocate from; the task may fall back to other nodes under OOM.
1937 		 * Thus, it's possible for tsk to have allocated memory from
1938 		 * nodes in mask.
1939 		 */
1940 		break;
1941 	case MPOL_BIND:
1942 	case MPOL_INTERLEAVE:
1943 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1944 		break;
1945 	default:
1946 		BUG();
1947 	}
1948 out:
1949 	task_unlock(tsk);
1950 	return ret;
1951 }
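/*
 * Illustrative example: for a task with an MPOL_BIND policy over nodes
 * {1,3}, a @mask of {0,2} yields 'false', which lets a caller such as the
 * OOM killer skip tasks whose policy cannot have placed memory on the
 * nodes under pressure.  A preferred/local policy always yields 'true'.
 */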
1952 
1953 /* Allocate a page in interleaved policy.
1954    Own path because it needs to do special accounting. */
1955 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1956 					unsigned nid)
1957 {
1958 	struct zonelist *zl;
1959 	struct page *page;
1960 
1961 	zl = node_zonelist(nid, gfp);
1962 	page = __alloc_pages(gfp, order, zl);
1963 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1964 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1965 	return page;
1966 }
1967 
1968 /**
1969  * 	alloc_pages_vma	- Allocate a page for a VMA.
1970  *
1971  * 	@gfp:
1972  *      %GFP_USER    user allocation.
1973  *      %GFP_KERNEL  kernel allocations,
1974  *      %GFP_HIGHMEM highmem/user allocations,
1975  *      %GFP_FS      allocation should not call back into a file system.
1976  *      %GFP_ATOMIC  don't sleep.
1977  *
1978  *	@order: Order of the GFP allocation.
1979  * 	@vma:  Pointer to VMA or NULL if not available.
1980  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1981  *	@node: Which node to prefer for allocation (modulo policy).
1982  *	@hugepage: for hugepages, try only the preferred node if possible
1983  *
1984  * 	This function allocates a page from the kernel page pool and applies
1985  *	a NUMA policy associated with the VMA or the current process.
1986  *	When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1987  *	mm_struct of the VMA to prevent it from going away. Should be used for
1988  *	all allocations for pages that will be mapped into user space. Returns
1989  *	NULL when no page can be allocated.
1990  */
1991 struct page *
1992 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1993 		unsigned long addr, int node, bool hugepage)
1994 {
1995 	struct mempolicy *pol;
1996 	struct page *page;
1997 	unsigned int cpuset_mems_cookie;
1998 	struct zonelist *zl;
1999 	nodemask_t *nmask;
2000 
2001 retry_cpuset:
2002 	pol = get_vma_policy(vma, addr);
2003 	cpuset_mems_cookie = read_mems_allowed_begin();
2004 
2005 	if (pol->mode == MPOL_INTERLEAVE) {
2006 		unsigned nid;
2007 
2008 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2009 		mpol_cond_put(pol);
2010 		page = alloc_page_interleave(gfp, order, nid);
2011 		goto out;
2012 	}
2013 
2014 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2015 		int hpage_node = node;
2016 
2017 		/*
2018 		 * For hugepage allocation and non-interleave policy which
2019 		 * allows the current node (or other explicitly preferred
2020 		 * node) we only try to allocate from the current/preferred
2021 		 * node and don't fall back to other nodes, as the cost of
2022 		 * remote accesses would likely offset THP benefits.
2023 		 *
2024 		 * If the policy is interleave, or does not allow the current
2025 		 * node in its nodemask, we allocate the standard way.
2026 		 */
2027 		if (pol->mode == MPOL_PREFERRED &&
2028 						!(pol->flags & MPOL_F_LOCAL))
2029 			hpage_node = pol->v.preferred_node;
2030 
2031 		nmask = policy_nodemask(gfp, pol);
2032 		if (!nmask || node_isset(hpage_node, *nmask)) {
2033 			mpol_cond_put(pol);
2034 			page = __alloc_pages_node(hpage_node,
2035 						gfp | __GFP_THISNODE, order);
2036 			goto out;
2037 		}
2038 	}
2039 
2040 	nmask = policy_nodemask(gfp, pol);
2041 	zl = policy_zonelist(gfp, pol, node);
2042 	mpol_cond_put(pol);
2043 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2044 out:
2045 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2046 		goto retry_cpuset;
2047 	return page;
2048 }
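/*
 * Usage note (illustrative): most callers go through the wrappers in
 * <linux/gfp.h>, e.g.
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *
 * which expands to alloc_pages_vma(gfp, 0, vma, addr, numa_node_id(),
 * false).  THP allocations pass hugepage == true (and an appropriate
 * order) so that only the current/preferred node is attempted, per the
 * comment above.
 */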
2049 
2050 /**
2051  * 	alloc_pages_current - Allocate pages.
2052  *
2053  *	@gfp:
2054  *		%GFP_USER   user allocation,
2055  *      	%GFP_KERNEL kernel allocation,
2056  *      	%GFP_HIGHMEM highmem allocation,
2057  *      	%GFP_FS     don't call back into a file system.
2058  *      	%GFP_ATOMIC don't sleep.
2059  *	@order: Order of the allocation; the size is 2^@order pages, so 0 is a single page.
2060  *
2061  *	Allocate a page from the kernel page pool.  When not in
2062  *	interrupt context, apply the current process' NUMA policy.
2063  *	Returns NULL when no page can be allocated.
2064  *
2065  *	Don't call cpuset_update_task_memory_state() unless
2066  *	1) it's ok to take cpuset_sem (can WAIT), and
2067  *	2) allocating for current task (not interrupt).
2068  */
2069 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2070 {
2071 	struct mempolicy *pol = &default_policy;
2072 	struct page *page;
2073 	unsigned int cpuset_mems_cookie;
2074 
2075 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2076 		pol = get_task_policy(current);
2077 
2078 retry_cpuset:
2079 	cpuset_mems_cookie = read_mems_allowed_begin();
2080 
2081 	/*
2082 	 * No reference counting needed for current->mempolicy
2083 	 * nor system default_policy
2084 	 * or for the system default_policy
2085 	if (pol->mode == MPOL_INTERLEAVE)
2086 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2087 	else
2088 		page = __alloc_pages_nodemask(gfp, order,
2089 				policy_zonelist(gfp, pol, numa_node_id()),
2090 				policy_nodemask(gfp, pol));
2091 
2092 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2093 		goto retry_cpuset;
2094 
2095 	return page;
2096 }
2097 EXPORT_SYMBOL(alloc_pages_current);
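/*
 * Note (illustrative): on CONFIG_NUMA kernels the generic alloc_pages()
 * helper in <linux/gfp.h> resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * honors the calling task's mempolicy unless __GFP_THISNODE is set or the
 * allocation happens in interrupt context.
 */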
2098 
2099 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2100 {
2101 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2102 
2103 	if (IS_ERR(pol))
2104 		return PTR_ERR(pol);
2105 	dst->vm_policy = pol;
2106 	return 0;
2107 }
2108 
2109 /*
2110  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2111  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2112  * with the mems_allowed returned by cpuset_mems_allowed().  This
2113  * keeps mempolicies cpuset relative after its cpuset moves.  See
2114  * further kernel/cpuset.c update_nodemask().
2115  *
2116  * current's mempolicy may be rebound by another task (the task that changes
2117  * the cpuset's mems), so we needn't do rebind work for the current task.
2118  */
2119 
2120 /* Slow path of a mempolicy duplicate */
2121 struct mempolicy *__mpol_dup(struct mempolicy *old)
2122 {
2123 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2124 
2125 	if (!new)
2126 		return ERR_PTR(-ENOMEM);
2127 
2128 	/* task's mempolicy is protected by alloc_lock */
2129 	if (old == current->mempolicy) {
2130 		task_lock(current);
2131 		*new = *old;
2132 		task_unlock(current);
2133 	} else
2134 		*new = *old;
2135 
2136 	if (current_cpuset_is_being_rebound()) {
2137 		nodemask_t mems = cpuset_mems_allowed(current);
2138 		if (new->flags & MPOL_F_REBINDING)
2139 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2140 		else
2141 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2142 	}
2143 	atomic_set(&new->refcnt, 1);
2144 	return new;
2145 }
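/*
 * Note (illustrative): mpol_dup() itself is a static inline in
 * <linux/mempolicy.h> that passes NULL through unchanged and calls
 * __mpol_dup() otherwise, which is why vma_dup_policy() above can hand it
 * a possibly-NULL vma policy directly.
 */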
2146 
2147 /* Slow path of a mempolicy comparison */
2148 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2149 {
2150 	if (!a || !b)
2151 		return false;
2152 	if (a->mode != b->mode)
2153 		return false;
2154 	if (a->flags != b->flags)
2155 		return false;
2156 	if (mpol_store_user_nodemask(a))
2157 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2158 			return false;
2159 
2160 	switch (a->mode) {
2161 	case MPOL_BIND:
2162 		/* Fall through */
2163 	case MPOL_INTERLEAVE:
2164 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2165 	case MPOL_PREFERRED:
2166 		return a->v.preferred_node == b->v.preferred_node;
2167 	default:
2168 		BUG();
2169 		return false;
2170 	}
2171 }
2172 
2173 /*
2174  * Shared memory backing store policy support.
2175  *
2176  * Remember policies even when nobody has shared memory mapped.
2177  * The policies are kept in a red-black tree linked from the inode.
2178  * They are protected by the sp->lock rwlock, which should be held
2179  * for any accesses to the tree.
2180  */
2181 
2182 /*
2183  * lookup first element intersecting start-end.  Caller holds sp->lock for
2184  * reading or for writing
2185  */
2186 static struct sp_node *
2187 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2188 {
2189 	struct rb_node *n = sp->root.rb_node;
2190 
2191 	while (n) {
2192 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2193 
2194 		if (start >= p->end)
2195 			n = n->rb_right;
2196 		else if (end <= p->start)
2197 			n = n->rb_left;
2198 		else
2199 			break;
2200 	}
2201 	if (!n)
2202 		return NULL;
2203 	for (;;) {
2204 		struct sp_node *w = NULL;
2205 		struct rb_node *prev = rb_prev(n);
2206 		if (!prev)
2207 			break;
2208 		w = rb_entry(prev, struct sp_node, nd);
2209 		if (w->end <= start)
2210 			break;
2211 		n = prev;
2212 	}
2213 	return rb_entry(n, struct sp_node, nd);
2214 }
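/*
 * Illustrative example: with stored ranges [2,5) and [8,12), a lookup for
 * [4,9) returns the [2,5) node.  The rb_prev() walk after the descent is
 * what guarantees the *first* intersecting range is returned even if the
 * binary search initially lands on a later overlapping node.
 */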
2215 
2216 /*
2217  * Insert a new shared policy into the list.  Caller holds sp->lock for
2218  * writing.
2219  */
2220 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2221 {
2222 	struct rb_node **p = &sp->root.rb_node;
2223 	struct rb_node *parent = NULL;
2224 	struct sp_node *nd;
2225 
2226 	while (*p) {
2227 		parent = *p;
2228 		nd = rb_entry(parent, struct sp_node, nd);
2229 		if (new->start < nd->start)
2230 			p = &(*p)->rb_left;
2231 		else if (new->end > nd->end)
2232 			p = &(*p)->rb_right;
2233 		else
2234 			BUG();
2235 	}
2236 	rb_link_node(&new->nd, parent, p);
2237 	rb_insert_color(&new->nd, &sp->root);
2238 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2239 		 new->policy ? new->policy->mode : 0);
2240 }
2241 
2242 /* Find shared policy intersecting idx */
2243 struct mempolicy *
2244 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2245 {
2246 	struct mempolicy *pol = NULL;
2247 	struct sp_node *sn;
2248 
2249 	if (!sp->root.rb_node)
2250 		return NULL;
2251 	read_lock(&sp->lock);
2252 	sn = sp_lookup(sp, idx, idx+1);
2253 	if (sn) {
2254 		mpol_get(sn->policy);
2255 		pol = sn->policy;
2256 	}
2257 	read_unlock(&sp->lock);
2258 	return pol;
2259 }
2260 
2261 static void sp_free(struct sp_node *n)
2262 {
2263 	mpol_put(n->policy);
2264 	kmem_cache_free(sn_cache, n);
2265 }
2266 
2267 /**
2268  * mpol_misplaced - check whether current page node is valid in policy
2269  *
2270  * @page: page to be checked
2271  * @vma: vm area where the page is mapped
2272  * @addr: virtual address where the page is mapped
2273  *
2274  * Look up the current policy node id for vma,addr and compare it to the
2275  * page's node id.
2276  *
2277  * Returns:
2278  *	-1	- not misplaced, page is in the right node
2279  *	node	- node id where the page should be
2280  *
2281  * Policy determination "mimics" alloc_page_vma().
2282  * Called from fault path where we know the vma and faulting address.
2283  */
2284 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2285 {
2286 	struct mempolicy *pol;
2287 	struct zone *zone;
2288 	int curnid = page_to_nid(page);
2289 	unsigned long pgoff;
2290 	int thiscpu = raw_smp_processor_id();
2291 	int thisnid = cpu_to_node(thiscpu);
2292 	int polnid = -1;
2293 	int ret = -1;
2294 
2295 	BUG_ON(!vma);
2296 
2297 	pol = get_vma_policy(vma, addr);
2298 	if (!(pol->flags & MPOL_F_MOF))
2299 		goto out;
2300 
2301 	switch (pol->mode) {
2302 	case MPOL_INTERLEAVE:
2303 		BUG_ON(addr >= vma->vm_end);
2304 		BUG_ON(addr < vma->vm_start);
2305 
2306 		pgoff = vma->vm_pgoff;
2307 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2308 		polnid = offset_il_node(pol, vma, pgoff);
2309 		break;
2310 
2311 	case MPOL_PREFERRED:
2312 		if (pol->flags & MPOL_F_LOCAL)
2313 			polnid = numa_node_id();
2314 		else
2315 			polnid = pol->v.preferred_node;
2316 		break;
2317 
2318 	case MPOL_BIND:
2319 		/*
2320 		 * MPOL_BIND allows binding to multiple nodes.  Use the current
2321 		 * page's node if it is in the policy nodemask, else select the
2322 		 * nearest allowed node, if any.  If there are no allowed nodes,
2323 		 * use the current node [not misplaced].
2324 		 */
2325 		if (node_isset(curnid, pol->v.nodes))
2326 			goto out;
2327 		(void)first_zones_zonelist(
2328 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2329 				gfp_zone(GFP_HIGHUSER),
2330 				&pol->v.nodes, &zone);
2331 		polnid = zone->node;
2332 		break;
2333 
2334 	default:
2335 		BUG();
2336 	}
2337 
2338 	/* Migrate the page towards the node whose CPU is referencing it */
2339 	if (pol->flags & MPOL_F_MORON) {
2340 		polnid = thisnid;
2341 
2342 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2343 			goto out;
2344 	}
2345 
2346 	if (curnid != polnid)
2347 		ret = polnid;
2348 out:
2349 	mpol_cond_put(pol);
2350 
2351 	return ret;
2352 }
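/*
 * Usage sketch (illustrative, loosely based on the NUMA hinting fault
 * path in mm/memory.c):
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		migrated = migrate_misplaced_page(page, vma, target_nid);
 *
 * i.e. any return value other than -1 names the node this page should be
 * migrated to.
 */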
2353 
2354 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2355 {
2356 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2357 	rb_erase(&n->nd, &sp->root);
2358 	sp_free(n);
2359 }
2360 
2361 static void sp_node_init(struct sp_node *node, unsigned long start,
2362 			unsigned long end, struct mempolicy *pol)
2363 {
2364 	node->start = start;
2365 	node->end = end;
2366 	node->policy = pol;
2367 }
2368 
2369 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2370 				struct mempolicy *pol)
2371 {
2372 	struct sp_node *n;
2373 	struct mempolicy *newpol;
2374 
2375 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2376 	if (!n)
2377 		return NULL;
2378 
2379 	newpol = mpol_dup(pol);
2380 	if (IS_ERR(newpol)) {
2381 		kmem_cache_free(sn_cache, n);
2382 		return NULL;
2383 	}
2384 	newpol->flags |= MPOL_F_SHARED;
2385 	sp_node_init(n, start, end, newpol);
2386 
2387 	return n;
2388 }
2389 
2390 /* Replace a policy range. */
2391 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2392 				 unsigned long end, struct sp_node *new)
2393 {
2394 	struct sp_node *n;
2395 	struct sp_node *n_new = NULL;
2396 	struct mempolicy *mpol_new = NULL;
2397 	int ret = 0;
2398 
2399 restart:
2400 	write_lock(&sp->lock);
2401 	n = sp_lookup(sp, start, end);
2402 	/* Take care of old policies in the same range. */
2403 	while (n && n->start < end) {
2404 		struct rb_node *next = rb_next(&n->nd);
2405 		if (n->start >= start) {
2406 			if (n->end <= end)
2407 				sp_delete(sp, n);
2408 			else
2409 				n->start = end;
2410 		} else {
2411 			/* Old policy spanning whole new range. */
2412 			if (n->end > end) {
2413 				if (!n_new)
2414 					goto alloc_new;
2415 
2416 				*mpol_new = *n->policy;
2417 				atomic_set(&mpol_new->refcnt, 1);
2418 				sp_node_init(n_new, end, n->end, mpol_new);
2419 				n->end = start;
2420 				sp_insert(sp, n_new);
2421 				n_new = NULL;
2422 				mpol_new = NULL;
2423 				break;
2424 			} else
2425 				n->end = start;
2426 		}
2427 		if (!next)
2428 			break;
2429 		n = rb_entry(next, struct sp_node, nd);
2430 	}
2431 	if (new)
2432 		sp_insert(sp, new);
2433 	write_unlock(&sp->lock);
2434 	ret = 0;
2435 
2436 err_out:
2437 	if (mpol_new)
2438 		mpol_put(mpol_new);
2439 	if (n_new)
2440 		kmem_cache_free(sn_cache, n_new);
2441 
2442 	return ret;
2443 
2444 alloc_new:
2445 	write_unlock(&sp->lock);
2446 	ret = -ENOMEM;
2447 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2448 	if (!n_new)
2449 		goto err_out;
2450 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2451 	if (!mpol_new)
2452 		goto err_out;
2453 	goto restart;
2454 }
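/*
 * Illustrative example: if the tree holds a single range [0,10) and a new
 * policy is installed for [3,6), the old node is trimmed to [0,3), a
 * duplicate of its policy (the n_new/mpol_new pair preallocated above with
 * the lock dropped) is inserted for [6,10), and the new node then covers
 * [3,6).
 */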
2455 
2456 /**
2457  * mpol_shared_policy_init - initialize shared policy for inode
2458  * @sp: pointer to inode shared policy
2459  * @mpol:  struct mempolicy to install
2460  *
2461  * Install non-NULL @mpol in inode's shared policy rb-tree.
2462  * On entry, the current task has a reference on a non-NULL @mpol.
2463  * This must be released on exit.
2464  * This is called at get_inode() time, so we can use GFP_KERNEL.
2465  */
2466 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2467 {
2468 	int ret;
2469 
2470 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2471 	rwlock_init(&sp->lock);
2472 
2473 	if (mpol) {
2474 		struct vm_area_struct pvma;
2475 		struct mempolicy *new;
2476 		NODEMASK_SCRATCH(scratch);
2477 
2478 		if (!scratch)
2479 			goto put_mpol;
2480 		/* contextualize the tmpfs mount point mempolicy */
2481 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2482 		if (IS_ERR(new))
2483 			goto free_scratch; /* no valid nodemask intersection */
2484 
2485 		task_lock(current);
2486 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2487 		task_unlock(current);
2488 		if (ret)
2489 			goto put_new;
2490 
2491 		/* Create pseudo-vma that contains just the policy */
2492 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2493 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2494 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2495 
2496 put_new:
2497 		mpol_put(new);			/* drop initial ref */
2498 free_scratch:
2499 		NODEMASK_SCRATCH_FREE(scratch);
2500 put_mpol:
2501 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2502 	}
2503 }
2504 
2505 int mpol_set_shared_policy(struct shared_policy *info,
2506 			struct vm_area_struct *vma, struct mempolicy *npol)
2507 {
2508 	int err;
2509 	struct sp_node *new = NULL;
2510 	unsigned long sz = vma_pages(vma);
2511 
2512 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2513 		 vma->vm_pgoff,
2514 		 sz, npol ? npol->mode : -1,
2515 		 npol ? npol->flags : -1,
2516 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2517 
2518 	if (npol) {
2519 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2520 		if (!new)
2521 			return -ENOMEM;
2522 	}
2523 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2524 	if (err && new)
2525 		sp_free(new);
2526 	return err;
2527 }
2528 
2529 /* Free a backing policy store on inode delete. */
2530 void mpol_free_shared_policy(struct shared_policy *p)
2531 {
2532 	struct sp_node *n;
2533 	struct rb_node *next;
2534 
2535 	if (!p->root.rb_node)
2536 		return;
2537 	write_lock(&p->lock);
2538 	next = rb_first(&p->root);
2539 	while (next) {
2540 		n = rb_entry(next, struct sp_node, nd);
2541 		next = rb_next(&n->nd);
2542 		sp_delete(p, n);
2543 	}
2544 	write_unlock(&p->lock);
2545 }
2546 
2547 #ifdef CONFIG_NUMA_BALANCING
2548 static int __initdata numabalancing_override;
2549 
2550 static void __init check_numabalancing_enable(void)
2551 {
2552 	bool numabalancing_default = false;
2553 
2554 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2555 		numabalancing_default = true;
2556 
2557 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2558 	if (numabalancing_override)
2559 		set_numabalancing_state(numabalancing_override == 1);
2560 
2561 	if (num_online_nodes() > 1 && !numabalancing_override) {
2562 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2563 			numabalancing_default ? "Enabling" : "Disabling");
2564 		set_numabalancing_state(numabalancing_default);
2565 	}
2566 }
2567 
2568 static int __init setup_numabalancing(char *str)
2569 {
2570 	int ret = 0;
2571 	if (!str)
2572 		goto out;
2573 
2574 	if (!strcmp(str, "enable")) {
2575 		numabalancing_override = 1;
2576 		ret = 1;
2577 	} else if (!strcmp(str, "disable")) {
2578 		numabalancing_override = -1;
2579 		ret = 1;
2580 	}
2581 out:
2582 	if (!ret)
2583 		pr_warn("Unable to parse numa_balancing=\n");
2584 
2585 	return ret;
2586 }
2587 __setup("numa_balancing=", setup_numabalancing);
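/*
 * Example (illustrative): booting with "numa_balancing=disable" sets
 * numabalancing_override to -1, so check_numabalancing_enable() turns
 * automatic NUMA balancing off even on multi-node systems;
 * "numa_balancing=enable" forces it on.  The same switch is exposed at
 * runtime via the kernel.numa_balancing sysctl.
 */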
2588 #else
2589 static inline void __init check_numabalancing_enable(void)
2590 {
2591 }
2592 #endif /* CONFIG_NUMA_BALANCING */
2593 
2594 /* assumes fs == KERNEL_DS */
2595 void __init numa_policy_init(void)
2596 {
2597 	nodemask_t interleave_nodes;
2598 	unsigned long largest = 0;
2599 	int nid, prefer = 0;
2600 
2601 	policy_cache = kmem_cache_create("numa_policy",
2602 					 sizeof(struct mempolicy),
2603 					 0, SLAB_PANIC, NULL);
2604 
2605 	sn_cache = kmem_cache_create("shared_policy_node",
2606 				     sizeof(struct sp_node),
2607 				     0, SLAB_PANIC, NULL);
2608 
2609 	for_each_node(nid) {
2610 		preferred_node_policy[nid] = (struct mempolicy) {
2611 			.refcnt = ATOMIC_INIT(1),
2612 			.mode = MPOL_PREFERRED,
2613 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2614 			.v = { .preferred_node = nid, },
2615 		};
2616 	}
2617 
2618 	/*
2619 	 * Set interleaving policy for system init. Interleaving is only
2620 	 * enabled across suitably sized nodes (>= 16MB by default); otherwise
2621 	 * we fall back to the largest node if they are all smaller.
2622 	 */
2623 	nodes_clear(interleave_nodes);
2624 	for_each_node_state(nid, N_MEMORY) {
2625 		unsigned long total_pages = node_present_pages(nid);
2626 
2627 		/* Preserve the largest node */
2628 		if (largest < total_pages) {
2629 			largest = total_pages;
2630 			prefer = nid;
2631 		}
2632 
2633 		/* Interleave this node? */
2634 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2635 			node_set(nid, interleave_nodes);
2636 	}
2637 
2638 	/* All too small, use the largest */
2639 	if (unlikely(nodes_empty(interleave_nodes)))
2640 		node_set(prefer, interleave_nodes);
2641 
2642 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2643 		pr_err("%s: interleaving failed\n", __func__);
2644 
2645 	check_numabalancing_enable();
2646 }
2647 
2648 /* Reset policy of current process to default */
2649 void numa_default_policy(void)
2650 {
2651 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2652 }
2653 
2654 /*
2655  * Parse and format mempolicy from/to strings
2656  */
2657 
2658 /*
2659  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2660  */
2661 static const char * const policy_modes[] =
2662 {
2663 	[MPOL_DEFAULT]    = "default",
2664 	[MPOL_PREFERRED]  = "prefer",
2665 	[MPOL_BIND]       = "bind",
2666 	[MPOL_INTERLEAVE] = "interleave",
2667 	[MPOL_LOCAL]      = "local",
2668 };
2669 
2670 
2671 #ifdef CONFIG_TMPFS
2672 /**
2673  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2674  * @str:  string containing mempolicy to parse
2675  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2676  *
2677  * Format of input:
2678  *	<mode>[=<flags>][:<nodelist>]
2679  *
2680  * On success, returns 0, else 1
2681  */
2682 int mpol_parse_str(char *str, struct mempolicy **mpol)
2683 {
2684 	struct mempolicy *new = NULL;
2685 	unsigned short mode;
2686 	unsigned short mode_flags;
2687 	nodemask_t nodes;
2688 	char *nodelist = strchr(str, ':');
2689 	char *flags = strchr(str, '=');
2690 	int err = 1;
2691 
2692 	if (nodelist) {
2693 		/* NUL-terminate mode or flags string */
2694 		*nodelist++ = '\0';
2695 		if (nodelist_parse(nodelist, nodes))
2696 			goto out;
2697 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2698 			goto out;
2699 	} else
2700 		nodes_clear(nodes);
2701 
2702 	if (flags)
2703 		*flags++ = '\0';	/* terminate mode string */
2704 
2705 	for (mode = 0; mode < MPOL_MAX; mode++) {
2706 		if (!strcmp(str, policy_modes[mode])) {
2707 			break;
2708 		}
2709 	}
2710 	if (mode >= MPOL_MAX)
2711 		goto out;
2712 
2713 	switch (mode) {
2714 	case MPOL_PREFERRED:
2715 		/*
2716 		 * Insist on a nodelist of one node only
2717 		 */
2718 		if (nodelist) {
2719 			char *rest = nodelist;
2720 			while (isdigit(*rest))
2721 				rest++;
2722 			if (*rest)
2723 				goto out;
2724 		}
2725 		break;
2726 	case MPOL_INTERLEAVE:
2727 		/*
2728 		 * Default to online nodes with memory if no nodelist
2729 		 */
2730 		if (!nodelist)
2731 			nodes = node_states[N_MEMORY];
2732 		break;
2733 	case MPOL_LOCAL:
2734 		/*
2735 		 * Don't allow a nodelist;  mpol_new() checks flags
2736 		 */
2737 		if (nodelist)
2738 			goto out;
2739 		mode = MPOL_PREFERRED;
2740 		break;
2741 	case MPOL_DEFAULT:
2742 		/*
2743 		 * Insist on an empty nodelist
2744 		 */
2745 		if (!nodelist)
2746 			err = 0;
2747 		goto out;
2748 	case MPOL_BIND:
2749 		/*
2750 		 * Insist on a nodelist
2751 		 */
2752 		if (!nodelist)
2753 			goto out;
2754 	}
2755 
2756 	mode_flags = 0;
2757 	if (flags) {
2758 		/*
2759 		 * Currently, we only support two mutually exclusive
2760 		 * mode flags.
2761 		 */
2762 		if (!strcmp(flags, "static"))
2763 			mode_flags |= MPOL_F_STATIC_NODES;
2764 		else if (!strcmp(flags, "relative"))
2765 			mode_flags |= MPOL_F_RELATIVE_NODES;
2766 		else
2767 			goto out;
2768 	}
2769 
2770 	new = mpol_new(mode, mode_flags, &nodes);
2771 	if (IS_ERR(new))
2772 		goto out;
2773 
2774 	/*
2775 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2776 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2777 	 */
2778 	if (mode != MPOL_PREFERRED)
2779 		new->v.nodes = nodes;
2780 	else if (nodelist)
2781 		new->v.preferred_node = first_node(nodes);
2782 	else
2783 		new->flags |= MPOL_F_LOCAL;
2784 
2785 	/*
2786 	 * Save nodes for contextualization: this will be used to "clone"
2787 	 * the mempolicy in a specific context [cpuset] at a later time.
2788 	 */
2789 	new->w.user_nodemask = nodes;
2790 
2791 	err = 0;
2792 
2793 out:
2794 	/* Restore string for error message */
2795 	if (nodelist)
2796 		*--nodelist = ':';
2797 	if (flags)
2798 		*--flags = '=';
2799 	if (!err)
2800 		*mpol = new;
2801 	return err;
2802 }
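/*
 * Parsing examples (illustrative), as used for the tmpfs "mpol=" mount
 * option:
 *
 *	"interleave:0-3"    -> MPOL_INTERLEAVE over nodes 0-3
 *	"prefer=static:2"   -> MPOL_PREFERRED with MPOL_F_STATIC_NODES, node 2
 *	"bind=relative:1,3" -> MPOL_BIND with MPOL_F_RELATIVE_NODES, nodes {1,3}
 *	"local"             -> MPOL_PREFERRED with the MPOL_F_LOCAL flag
 *	"default"           -> no explicit policy (no nodelist allowed)
 */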
2803 #endif /* CONFIG_TMPFS */
2804 
2805 /**
2806  * mpol_to_str - format a mempolicy structure for printing
2807  * @buffer:  to contain formatted mempolicy string
2808  * @maxlen:  length of @buffer
2809  * @pol:  pointer to mempolicy to be formatted
2810  *
2811  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2812  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2813  * longest flag, "relative", and to display at least a few node ids.
2814  */
2815 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2816 {
2817 	char *p = buffer;
2818 	nodemask_t nodes = NODE_MASK_NONE;
2819 	unsigned short mode = MPOL_DEFAULT;
2820 	unsigned short flags = 0;
2821 
2822 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2823 		mode = pol->mode;
2824 		flags = pol->flags;
2825 	}
2826 
2827 	switch (mode) {
2828 	case MPOL_DEFAULT:
2829 		break;
2830 	case MPOL_PREFERRED:
2831 		if (flags & MPOL_F_LOCAL)
2832 			mode = MPOL_LOCAL;
2833 		else
2834 			node_set(pol->v.preferred_node, nodes);
2835 		break;
2836 	case MPOL_BIND:
2837 	case MPOL_INTERLEAVE:
2838 		nodes = pol->v.nodes;
2839 		break;
2840 	default:
2841 		WARN_ON_ONCE(1);
2842 		snprintf(p, maxlen, "unknown");
2843 		return;
2844 	}
2845 
2846 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2847 
2848 	if (flags & MPOL_MODE_FLAGS) {
2849 		p += snprintf(p, buffer + maxlen - p, "=");
2850 
2851 		/*
2852 		 * Currently, the only defined flags are mutually exclusive
2853 		 */
2854 		if (flags & MPOL_F_STATIC_NODES)
2855 			p += snprintf(p, buffer + maxlen - p, "static");
2856 		else if (flags & MPOL_F_RELATIVE_NODES)
2857 			p += snprintf(p, buffer + maxlen - p, "relative");
2858 	}
2859 
2860 	if (!nodes_empty(nodes))
2861 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2862 			       nodemask_pr_args(&nodes));
2863 }
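/*
 * Formatting examples (illustrative), mirroring mpol_parse_str() above: an
 * MPOL_INTERLEAVE policy over nodes 0-3 with MPOL_F_STATIC_NODES is
 * rendered as "interleave=static:0-3", a local preferred policy as
 * "local", and the default policy simply as "default".
 */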
2864