xref: /openbmc/linux/mm/mempolicy.c (revision 2596e07a)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the node of the local CPU. This is normally identical to
31  *                default, but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *                in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
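
/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are typically selected via the set_mempolicy(2) and
 * mbind(2) system calls.  Assumes the <numaif.h> wrappers from libnuma;
 * error handling is omitted.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = 0x3;	// nodes 0 and 1
 *
 *	// Process policy: interleave new allocations across nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// VMA policy: bind one mapping to node 0 only, no fallback.
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 0x1;
 *	mbind(buf, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 *
 *	// Back to the default (local) policy for the process.
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */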
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 #include <linux/random.h>
101 
102 #include "internal.h"
103 
104 /* Internal flags */
105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous VMAs */
106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
107 
108 static struct kmem_cache *policy_cache;
109 static struct kmem_cache *sn_cache;
110 
111 /* Highest zone. A specific allocation for a zone below that is not
112    policied. */
113 enum zone_type policy_zone = 0;
114 
115 /*
116  * run-time system-wide default policy => local allocation
117  */
118 static struct mempolicy default_policy = {
119 	.refcnt = ATOMIC_INIT(1), /* never free it */
120 	.mode = MPOL_PREFERRED,
121 	.flags = MPOL_F_LOCAL,
122 };
123 
124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 
126 struct mempolicy *get_task_policy(struct task_struct *p)
127 {
128 	struct mempolicy *pol = p->mempolicy;
129 	int node;
130 
131 	if (pol)
132 		return pol;
133 
134 	node = numa_node_id();
135 	if (node != NUMA_NO_NODE) {
136 		pol = &preferred_node_policy[node];
137 		/* preferred_node_policy is not initialised early in boot */
138 		if (pol->mode)
139 			return pol;
140 	}
141 
142 	return &default_policy;
143 }
144 
145 static const struct mempolicy_operations {
146 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
147 	/*
148 	 * If the read-side task has no lock to protect task->mempolicy, the
149 	 * write-side task will rebind task->mempolicy in two steps. The first
150 	 * step sets all the newly allowed nodes, and the second step clears
151 	 * all the disallowed nodes. This avoids a window in which no node is
152 	 * left to allocate a page from.
153 	 * If the read side holds a lock that protects task->mempolicy, the
154 	 * rebind is done directly in one step.
155 	 *
156 	 * step:
157 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
158 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
159 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
160 	 */
161 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
162 			enum mpol_rebind_step step);
163 } mpol_ops[MPOL_MAX];
164 
165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
166 {
167 	return pol->flags & MPOL_MODE_FLAGS;
168 }
169 
170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
171 				   const nodemask_t *rel)
172 {
173 	nodemask_t tmp;
174 	nodes_fold(tmp, *orig, nodes_weight(*rel));
175 	nodes_onto(*ret, tmp, *rel);
176 }
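
/*
 * Worked example (illustrative): with a user-supplied relative mask of
 * {0, 2} and a cpuset mems_allowed of {4, 5, 6}, nodes_fold() wraps the
 * relative mask onto the 3 allowed positions (still {0, 2} here), and
 * nodes_onto() then maps position 0 -> node 4 and position 2 -> node 6,
 * giving a result of {4, 6}.
 */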
177 
178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
179 {
180 	if (nodes_empty(*nodes))
181 		return -EINVAL;
182 	pol->v.nodes = *nodes;
183 	return 0;
184 }
185 
186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (!nodes)
189 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
190 	else if (nodes_empty(*nodes))
191 		return -EINVAL;			/*  no allowed nodes */
192 	else
193 		pol->v.preferred_node = first_node(*nodes);
194 	return 0;
195 }
196 
197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
198 {
199 	if (nodes_empty(*nodes))
200 		return -EINVAL;
201 	pol->v.nodes = *nodes;
202 	return 0;
203 }
204 
205 /*
206  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
207  * any, for the new policy.  mpol_new() has already validated the nodes
208  * parameter with respect to the policy mode and flags.  But, we need to
209  * handle an empty nodemask with MPOL_PREFERRED here.
210  *
211  * Must be called holding task's alloc_lock to protect task's mems_allowed
212  * and mempolicy.  May also be called holding the mmap_semaphore for write.
213  */
214 static int mpol_set_nodemask(struct mempolicy *pol,
215 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
216 {
217 	int ret;
218 
219 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
220 	if (pol == NULL)
221 		return 0;
222 	/* Check N_MEMORY */
223 	nodes_and(nsc->mask1,
224 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
225 
226 	VM_BUG_ON(!nodes);
227 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
228 		nodes = NULL;	/* explicit local allocation */
229 	else {
230 		if (pol->flags & MPOL_F_RELATIVE_NODES)
231 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
232 		else
233 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
234 
235 		if (mpol_store_user_nodemask(pol))
236 			pol->w.user_nodemask = *nodes;
237 		else
238 			pol->w.cpuset_mems_allowed =
239 						cpuset_current_mems_allowed;
240 	}
241 
242 	if (nodes)
243 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
244 	else
245 		ret = mpol_ops[pol->mode].create(pol, NULL);
246 	return ret;
247 }
248 
249 /*
250  * This function just creates a new policy, does some checks and simple
251  * initialization. The caller must invoke mpol_set_nodemask() to set the nodes.
252  */
253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 				  nodemask_t *nodes)
255 {
256 	struct mempolicy *policy;
257 
258 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
259 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
260 
261 	if (mode == MPOL_DEFAULT) {
262 		if (nodes && !nodes_empty(*nodes))
263 			return ERR_PTR(-EINVAL);
264 		return NULL;
265 	}
266 	VM_BUG_ON(!nodes);
267 
268 	/*
269 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
270 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
271 	 * All other modes require a valid pointer to a non-empty nodemask.
272 	 */
273 	if (mode == MPOL_PREFERRED) {
274 		if (nodes_empty(*nodes)) {
275 			if (((flags & MPOL_F_STATIC_NODES) ||
276 			     (flags & MPOL_F_RELATIVE_NODES)))
277 				return ERR_PTR(-EINVAL);
278 		}
279 	} else if (mode == MPOL_LOCAL) {
280 		if (!nodes_empty(*nodes))
281 			return ERR_PTR(-EINVAL);
282 		mode = MPOL_PREFERRED;
283 	} else if (nodes_empty(*nodes))
284 		return ERR_PTR(-EINVAL);
285 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
286 	if (!policy)
287 		return ERR_PTR(-ENOMEM);
288 	atomic_set(&policy->refcnt, 1);
289 	policy->mode = mode;
290 	policy->flags = flags;
291 
292 	return policy;
293 }
294 
295 /* Slow path of a mpol destructor. */
296 void __mpol_put(struct mempolicy *p)
297 {
298 	if (!atomic_dec_and_test(&p->refcnt))
299 		return;
300 	kmem_cache_free(policy_cache, p);
301 }
302 
303 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
304 				enum mpol_rebind_step step)
305 {
306 }
307 
308 /*
309  * step:
310  * 	MPOL_REBIND_ONCE  - do the rebind work at once
311  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
312  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
313  */
314 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
315 				 enum mpol_rebind_step step)
316 {
317 	nodemask_t tmp;
318 
319 	if (pol->flags & MPOL_F_STATIC_NODES)
320 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
321 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 	else {
324 		/*
325 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
326 		 * result
327 		 */
328 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
329 			nodes_remap(tmp, pol->v.nodes,
330 					pol->w.cpuset_mems_allowed, *nodes);
331 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
332 		} else if (step == MPOL_REBIND_STEP2) {
333 			tmp = pol->w.cpuset_mems_allowed;
334 			pol->w.cpuset_mems_allowed = *nodes;
335 		} else
336 			BUG();
337 	}
338 
339 	if (nodes_empty(tmp))
340 		tmp = *nodes;
341 
342 	if (step == MPOL_REBIND_STEP1)
343 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
344 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
345 		pol->v.nodes = tmp;
346 	else
347 		BUG();
348 
349 	if (!node_isset(current->il_next, tmp)) {
350 		current->il_next = next_node(current->il_next, tmp);
351 		if (current->il_next >= MAX_NUMNODES)
352 			current->il_next = first_node(tmp);
353 		if (current->il_next >= MAX_NUMNODES)
354 			current->il_next = numa_node_id();
355 	}
356 }
357 
358 static void mpol_rebind_preferred(struct mempolicy *pol,
359 				  const nodemask_t *nodes,
360 				  enum mpol_rebind_step step)
361 {
362 	nodemask_t tmp;
363 
364 	if (pol->flags & MPOL_F_STATIC_NODES) {
365 		int node = first_node(pol->w.user_nodemask);
366 
367 		if (node_isset(node, *nodes)) {
368 			pol->v.preferred_node = node;
369 			pol->flags &= ~MPOL_F_LOCAL;
370 		} else
371 			pol->flags |= MPOL_F_LOCAL;
372 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
373 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
374 		pol->v.preferred_node = first_node(tmp);
375 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
376 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
377 						   pol->w.cpuset_mems_allowed,
378 						   *nodes);
379 		pol->w.cpuset_mems_allowed = *nodes;
380 	}
381 }
382 
383 /*
384  * mpol_rebind_policy - Migrate a policy to a different set of nodes
385  *
386  * If the read-side task has no lock to protect task->mempolicy, the
387  * write-side task will rebind task->mempolicy in two steps. The first
388  * step sets all the newly allowed nodes, and the second step clears all
389  * the disallowed nodes. This avoids a window in which no node is left
390  * to allocate a page from.
391  * If the read side holds a lock that protects task->mempolicy, the
392  * rebind is done directly in one step.
393  *
394  * step:
395  * 	MPOL_REBIND_ONCE  - do the rebind work at once
396  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
397  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
398  */
399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
400 				enum mpol_rebind_step step)
401 {
402 	if (!pol)
403 		return;
404 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
405 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
409 		return;
410 
411 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
412 		BUG();
413 
414 	if (step == MPOL_REBIND_STEP1)
415 		pol->flags |= MPOL_F_REBINDING;
416 	else if (step == MPOL_REBIND_STEP2)
417 		pol->flags &= ~MPOL_F_REBINDING;
418 	else if (step >= MPOL_REBIND_NSTEP)
419 		BUG();
420 
421 	mpol_ops[pol->mode].rebind(pol, newmask, step);
422 }
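
/*
 * Worked example of the two-step rebind (illustrative), for the default
 * case without MPOL_F_STATIC_NODES/MPOL_F_RELATIVE_NODES: an interleave
 * policy over {0, 1} whose cpuset changes to {2, 3}.  MPOL_REBIND_STEP1
 * remaps {0, 1} -> {2, 3} and ORs the result in, leaving v.nodes =
 * {0, 1, 2, 3}; MPOL_REBIND_STEP2 then drops the disallowed nodes,
 * leaving v.nodes = {2, 3}.  At no point in between is the nodemask
 * empty, so lock-free readers always find a node to allocate from.
 */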
423 
424 /*
425  * Wrapper for mpol_rebind_policy() that just requires a task
426  * pointer, and updates the task's mempolicy.
427  *
428  * Called with task's alloc_lock held.
429  */
430 
431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
432 			enum mpol_rebind_step step)
433 {
434 	mpol_rebind_policy(tsk->mempolicy, new, step);
435 }
436 
437 /*
438  * Rebind each vma in mm to new nodemask.
439  *
440  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
441  */
442 
443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
444 {
445 	struct vm_area_struct *vma;
446 
447 	down_write(&mm->mmap_sem);
448 	for (vma = mm->mmap; vma; vma = vma->vm_next)
449 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
450 	up_write(&mm->mmap_sem);
451 }
452 
453 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
454 	[MPOL_DEFAULT] = {
455 		.rebind = mpol_rebind_default,
456 	},
457 	[MPOL_INTERLEAVE] = {
458 		.create = mpol_new_interleave,
459 		.rebind = mpol_rebind_nodemask,
460 	},
461 	[MPOL_PREFERRED] = {
462 		.create = mpol_new_preferred,
463 		.rebind = mpol_rebind_preferred,
464 	},
465 	[MPOL_BIND] = {
466 		.create = mpol_new_bind,
467 		.rebind = mpol_rebind_nodemask,
468 	},
469 };
470 
471 static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 				unsigned long flags);
473 
474 struct queue_pages {
475 	struct list_head *pagelist;
476 	unsigned long flags;
477 	nodemask_t *nmask;
478 	struct vm_area_struct *prev;
479 };
480 
481 /*
482  * Scan through the pages, checking whether they meet certain conditions,
483  * and add them to the pagelist if they do.
484  */
485 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
486 			unsigned long end, struct mm_walk *walk)
487 {
488 	struct vm_area_struct *vma = walk->vma;
489 	struct page *page;
490 	struct queue_pages *qp = walk->private;
491 	unsigned long flags = qp->flags;
492 	int nid, ret;
493 	pte_t *pte;
494 	spinlock_t *ptl;
495 
496 	if (pmd_trans_huge(*pmd)) {
497 		ptl = pmd_lock(walk->mm, pmd);
498 		if (pmd_trans_huge(*pmd)) {
499 			page = pmd_page(*pmd);
500 			if (is_huge_zero_page(page)) {
501 				spin_unlock(ptl);
502 				split_huge_pmd(vma, pmd, addr);
503 			} else {
504 				get_page(page);
505 				spin_unlock(ptl);
506 				lock_page(page);
507 				ret = split_huge_page(page);
508 				unlock_page(page);
509 				put_page(page);
510 				if (ret)
511 					return 0;
512 			}
513 		} else {
514 			spin_unlock(ptl);
515 		}
516 	}
517 
518 retry:
519 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
520 	for (; addr != end; pte++, addr += PAGE_SIZE) {
521 		if (!pte_present(*pte))
522 			continue;
523 		page = vm_normal_page(vma, addr, *pte);
524 		if (!page)
525 			continue;
526 		/*
527 		 * vm_normal_page() filters out zero pages, but there might
528 		 * still be PageReserved pages to skip, perhaps in a VDSO.
529 		 */
530 		if (PageReserved(page))
531 			continue;
532 		nid = page_to_nid(page);
533 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
534 			continue;
535 		if (PageTail(page) && PageAnon(page)) {
536 			get_page(page);
537 			pte_unmap_unlock(pte, ptl);
538 			lock_page(page);
539 			ret = split_huge_page(page);
540 			unlock_page(page);
541 			put_page(page);
542 			/* Failed to split -- skip. */
543 			if (ret) {
544 				pte = pte_offset_map_lock(walk->mm, pmd,
545 						addr, &ptl);
546 				continue;
547 			}
548 			goto retry;
549 		}
550 
551 		migrate_page_add(page, qp->pagelist, flags);
552 	}
553 	pte_unmap_unlock(pte - 1, ptl);
554 	cond_resched();
555 	return 0;
556 }
557 
558 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
559 			       unsigned long addr, unsigned long end,
560 			       struct mm_walk *walk)
561 {
562 #ifdef CONFIG_HUGETLB_PAGE
563 	struct queue_pages *qp = walk->private;
564 	unsigned long flags = qp->flags;
565 	int nid;
566 	struct page *page;
567 	spinlock_t *ptl;
568 	pte_t entry;
569 
570 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
571 	entry = huge_ptep_get(pte);
572 	if (!pte_present(entry))
573 		goto unlock;
574 	page = pte_page(entry);
575 	nid = page_to_nid(page);
576 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
577 		goto unlock;
578 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
579 	if (flags & (MPOL_MF_MOVE_ALL) ||
580 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
581 		isolate_huge_page(page, qp->pagelist);
582 unlock:
583 	spin_unlock(ptl);
584 #else
585 	BUG();
586 #endif
587 	return 0;
588 }
589 
590 #ifdef CONFIG_NUMA_BALANCING
591 /*
592  * This is used to mark a range of virtual addresses as inaccessible.
593  * The markings are later cleared by a NUMA hinting fault. Depending on
594  * these faults, pages may be migrated for better NUMA placement.
595  *
596  * This is assuming that NUMA faults are handled using PROT_NONE. If
597  * an architecture makes a different choice, it will need further
598  * changes to the core.
599  */
600 unsigned long change_prot_numa(struct vm_area_struct *vma,
601 			unsigned long addr, unsigned long end)
602 {
603 	int nr_updated;
604 
605 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
606 	if (nr_updated)
607 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608 
609 	return nr_updated;
610 }
611 #else
612 static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 			unsigned long addr, unsigned long end)
614 {
615 	return 0;
616 }
617 #endif /* CONFIG_NUMA_BALANCING */
618 
619 static int queue_pages_test_walk(unsigned long start, unsigned long end,
620 				struct mm_walk *walk)
621 {
622 	struct vm_area_struct *vma = walk->vma;
623 	struct queue_pages *qp = walk->private;
624 	unsigned long endvma = vma->vm_end;
625 	unsigned long flags = qp->flags;
626 
627 	if (!vma_migratable(vma))
628 		return 1;
629 
630 	if (endvma > end)
631 		endvma = end;
632 	if (vma->vm_start > start)
633 		start = vma->vm_start;
634 
635 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
636 		if (!vma->vm_next && vma->vm_end < end)
637 			return -EFAULT;
638 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
639 			return -EFAULT;
640 	}
641 
642 	qp->prev = vma;
643 
644 	if (flags & MPOL_MF_LAZY) {
645 		/* Similar to task_numa_work, skip inaccessible VMAs */
646 		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
647 			change_prot_numa(vma, start, endvma);
648 		return 1;
649 	}
650 
651 	/* queue pages from current vma */
652 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
653 		return 0;
654 	return 1;
655 }
656 
657 /*
658  * Walk through page tables and collect pages to be migrated.
659  *
660  * If pages found in the given range are on the set of nodes determined by
661  * @nodes and @flags, they are isolated and queued on the pagelist passed
662  * in via @pagelist.
663  */
664 static int
665 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
666 		nodemask_t *nodes, unsigned long flags,
667 		struct list_head *pagelist)
668 {
669 	struct queue_pages qp = {
670 		.pagelist = pagelist,
671 		.flags = flags,
672 		.nmask = nodes,
673 		.prev = NULL,
674 	};
675 	struct mm_walk queue_pages_walk = {
676 		.hugetlb_entry = queue_pages_hugetlb,
677 		.pmd_entry = queue_pages_pte_range,
678 		.test_walk = queue_pages_test_walk,
679 		.mm = mm,
680 		.private = &qp,
681 	};
682 
683 	return walk_page_range(start, end, &queue_pages_walk);
684 }
685 
686 /*
687  * Apply policy to a single VMA
688  * This must be called with the mmap_sem held for writing.
689  */
690 static int vma_replace_policy(struct vm_area_struct *vma,
691 						struct mempolicy *pol)
692 {
693 	int err;
694 	struct mempolicy *old;
695 	struct mempolicy *new;
696 
697 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
698 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
699 		 vma->vm_ops, vma->vm_file,
700 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
701 
702 	new = mpol_dup(pol);
703 	if (IS_ERR(new))
704 		return PTR_ERR(new);
705 
706 	if (vma->vm_ops && vma->vm_ops->set_policy) {
707 		err = vma->vm_ops->set_policy(vma, new);
708 		if (err)
709 			goto err_out;
710 	}
711 
712 	old = vma->vm_policy;
713 	vma->vm_policy = new; /* protected by mmap_sem */
714 	mpol_put(old);
715 
716 	return 0;
717  err_out:
718 	mpol_put(new);
719 	return err;
720 }
721 
722 /* Step 2: apply policy to a range and do splits. */
723 static int mbind_range(struct mm_struct *mm, unsigned long start,
724 		       unsigned long end, struct mempolicy *new_pol)
725 {
726 	struct vm_area_struct *next;
727 	struct vm_area_struct *prev;
728 	struct vm_area_struct *vma;
729 	int err = 0;
730 	pgoff_t pgoff;
731 	unsigned long vmstart;
732 	unsigned long vmend;
733 
734 	vma = find_vma(mm, start);
735 	if (!vma || vma->vm_start > start)
736 		return -EFAULT;
737 
738 	prev = vma->vm_prev;
739 	if (start > vma->vm_start)
740 		prev = vma;
741 
742 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
743 		next = vma->vm_next;
744 		vmstart = max(start, vma->vm_start);
745 		vmend   = min(end, vma->vm_end);
746 
747 		if (mpol_equal(vma_policy(vma), new_pol))
748 			continue;
749 
750 		pgoff = vma->vm_pgoff +
751 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
752 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
753 				 vma->anon_vma, vma->vm_file, pgoff,
754 				 new_pol, vma->vm_userfaultfd_ctx);
755 		if (prev) {
756 			vma = prev;
757 			next = vma->vm_next;
758 			if (mpol_equal(vma_policy(vma), new_pol))
759 				continue;
760 			/* vma_merge() joined vma && vma->next, case 8 */
761 			goto replace;
762 		}
763 		if (vma->vm_start != vmstart) {
764 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
765 			if (err)
766 				goto out;
767 		}
768 		if (vma->vm_end != vmend) {
769 			err = split_vma(vma->vm_mm, vma, vmend, 0);
770 			if (err)
771 				goto out;
772 		}
773  replace:
774 		err = vma_replace_policy(vma, new_pol);
775 		if (err)
776 			goto out;
777 	}
778 
779  out:
780 	return err;
781 }
782 
783 /* Set the process memory policy */
784 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
785 			     nodemask_t *nodes)
786 {
787 	struct mempolicy *new, *old;
788 	NODEMASK_SCRATCH(scratch);
789 	int ret;
790 
791 	if (!scratch)
792 		return -ENOMEM;
793 
794 	new = mpol_new(mode, flags, nodes);
795 	if (IS_ERR(new)) {
796 		ret = PTR_ERR(new);
797 		goto out;
798 	}
799 
800 	task_lock(current);
801 	ret = mpol_set_nodemask(new, nodes, scratch);
802 	if (ret) {
803 		task_unlock(current);
804 		mpol_put(new);
805 		goto out;
806 	}
807 	old = current->mempolicy;
808 	current->mempolicy = new;
809 	if (new && new->mode == MPOL_INTERLEAVE &&
810 	    nodes_weight(new->v.nodes))
811 		current->il_next = first_node(new->v.nodes);
812 	task_unlock(current);
813 	mpol_put(old);
814 	ret = 0;
815 out:
816 	NODEMASK_SCRATCH_FREE(scratch);
817 	return ret;
818 }
819 
820 /*
821  * Return the nodemask for a policy, for a get_mempolicy() query
822  *
823  * Called with task's alloc_lock held
824  */
825 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
826 {
827 	nodes_clear(*nodes);
828 	if (p == &default_policy)
829 		return;
830 
831 	switch (p->mode) {
832 	case MPOL_BIND:
833 		/* Fall through */
834 	case MPOL_INTERLEAVE:
835 		*nodes = p->v.nodes;
836 		break;
837 	case MPOL_PREFERRED:
838 		if (!(p->flags & MPOL_F_LOCAL))
839 			node_set(p->v.preferred_node, *nodes);
840 		/* else return empty node mask for local allocation */
841 		break;
842 	default:
843 		BUG();
844 	}
845 }
846 
847 static int lookup_node(struct mm_struct *mm, unsigned long addr)
848 {
849 	struct page *p;
850 	int err;
851 
852 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
853 	if (err >= 0) {
854 		err = page_to_nid(p);
855 		put_page(p);
856 	}
857 	return err;
858 }
859 
860 /* Retrieve NUMA policy */
861 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
862 			     unsigned long addr, unsigned long flags)
863 {
864 	int err;
865 	struct mm_struct *mm = current->mm;
866 	struct vm_area_struct *vma = NULL;
867 	struct mempolicy *pol = current->mempolicy;
868 
869 	if (flags &
870 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
871 		return -EINVAL;
872 
873 	if (flags & MPOL_F_MEMS_ALLOWED) {
874 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
875 			return -EINVAL;
876 		*policy = 0;	/* just so it's initialized */
877 		task_lock(current);
878 		*nmask  = cpuset_current_mems_allowed;
879 		task_unlock(current);
880 		return 0;
881 	}
882 
883 	if (flags & MPOL_F_ADDR) {
884 		/*
885 		 * Do NOT fall back to task policy if the
886 		 * vma/shared policy at addr is NULL.  We
887 		 * want to return MPOL_DEFAULT in this case.
888 		 */
889 		down_read(&mm->mmap_sem);
890 		vma = find_vma_intersection(mm, addr, addr+1);
891 		if (!vma) {
892 			up_read(&mm->mmap_sem);
893 			return -EFAULT;
894 		}
895 		if (vma->vm_ops && vma->vm_ops->get_policy)
896 			pol = vma->vm_ops->get_policy(vma, addr);
897 		else
898 			pol = vma->vm_policy;
899 	} else if (addr)
900 		return -EINVAL;
901 
902 	if (!pol)
903 		pol = &default_policy;	/* indicates default behavior */
904 
905 	if (flags & MPOL_F_NODE) {
906 		if (flags & MPOL_F_ADDR) {
907 			err = lookup_node(mm, addr);
908 			if (err < 0)
909 				goto out;
910 			*policy = err;
911 		} else if (pol == current->mempolicy &&
912 				pol->mode == MPOL_INTERLEAVE) {
913 			*policy = current->il_next;
914 		} else {
915 			err = -EINVAL;
916 			goto out;
917 		}
918 	} else {
919 		*policy = pol == &default_policy ? MPOL_DEFAULT :
920 						pol->mode;
921 		/*
922 		 * Internal mempolicy flags must be masked off before exposing
923 		 * the policy to userspace.
924 		 */
925 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
926 	}
927 
928 	if (vma) {
929 		up_read(&current->mm->mmap_sem);
930 		vma = NULL;
931 	}
932 
933 	err = 0;
934 	if (nmask) {
935 		if (mpol_store_user_nodemask(pol)) {
936 			*nmask = pol->w.user_nodemask;
937 		} else {
938 			task_lock(current);
939 			get_policy_nodemask(pol, nmask);
940 			task_unlock(current);
941 		}
942 	}
943 
944  out:
945 	mpol_cond_put(pol);
946 	if (vma)
947 		up_read(&current->mm->mmap_sem);
948 	return err;
949 }
950 
951 #ifdef CONFIG_MIGRATION
952 /*
953  * page migration
954  */
955 static void migrate_page_add(struct page *page, struct list_head *pagelist,
956 				unsigned long flags)
957 {
958 	/*
959 	 * Avoid migrating a page that is shared with others.
960 	 */
961 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
962 		if (!isolate_lru_page(page)) {
963 			list_add_tail(&page->lru, pagelist);
964 			inc_zone_page_state(page, NR_ISOLATED_ANON +
965 					    page_is_file_cache(page));
966 		}
967 	}
968 }
969 
970 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
971 {
972 	if (PageHuge(page))
973 		return alloc_huge_page_node(page_hstate(compound_head(page)),
974 					node);
975 	else
976 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
977 						    __GFP_THISNODE, 0);
978 }
979 
980 /*
981  * Migrate pages from one node to a target node.
982  * Returns error or the number of pages not migrated.
983  */
984 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
985 			   int flags)
986 {
987 	nodemask_t nmask;
988 	LIST_HEAD(pagelist);
989 	int err = 0;
990 
991 	nodes_clear(nmask);
992 	node_set(source, nmask);
993 
994 	/*
995 	 * This does not "check" the range but isolates all pages that
996 	 * need migration.  Between passing in the full user address
997 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
998 	 */
999 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1000 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1001 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1002 
1003 	if (!list_empty(&pagelist)) {
1004 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1005 					MIGRATE_SYNC, MR_SYSCALL);
1006 		if (err)
1007 			putback_movable_pages(&pagelist);
1008 	}
1009 
1010 	return err;
1011 }
1012 
1013 /*
1014  * Move pages between the two nodesets so as to preserve the physical
1015  * layout as much as possible.
1016  *
1017  * Returns the number of pages that could not be moved.
1018  */
1019 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1020 		     const nodemask_t *to, int flags)
1021 {
1022 	int busy = 0;
1023 	int err;
1024 	nodemask_t tmp;
1025 
1026 	err = migrate_prep();
1027 	if (err)
1028 		return err;
1029 
1030 	down_read(&mm->mmap_sem);
1031 
1032 	/*
1033 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1034 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1035 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1036 	 * The pair of nodemasks 'to' and 'from' define the map.
1037 	 *
1038 	 * If no pair of bits is found that way, fall back to picking some
1039 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1040 	 * 'source' and 'dest' bits are the same, this represents a node
1041 	 * that will be migrating to itself, so no pages need move.
1042 	 *
1043 	 * If no bits are left in 'tmp', or if all remaining bits left
1044 	 * in 'tmp' correspond to the same bit in 'to', return false
1045 	 * (nothing left to migrate).
1046 	 *
1047 	 * This lets us pick a pair of nodes to migrate between, such that
1048 	 * if possible the dest node is not already occupied by some other
1049 	 * source node, minimizing the risk of overloading the memory on a
1050 	 * node, which would happen if we migrated incoming memory to a node
1051 	 * before migrating the outgoing memory from that same node.
1052 	 *
1053 	 * A single scan of tmp is sufficient.  As we go, we remember the
1054 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1055 	 * that not only moved, but what's better, moved to an empty slot
1056 	 * (d is not set in tmp), then we break out immediately with that pair.
1057 	 * Otherwise, when we finish scanning tmp, we at least have the
1058 	 * most recent <s, d> pair that moved.  If we get all the way through
1059 	 * the scan of tmp without finding any node that moved, much less
1060 	 * moved to an empty node, then there is nothing left worth migrating.
1061 	 */
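	/*
	 * Worked example (illustrative): from = {0, 1, 2}, to = {1, 2, 3}.
	 * First pass over tmp = {0, 1, 2}: 0 -> 1 and 1 -> 2 both point at
	 * nodes still in tmp, but 2 -> 3 points at an empty slot, so <2, 3>
	 * is migrated first and 2 is cleared from tmp.  The following passes
	 * pick <1, 2> and then <0, 1>, so each destination is drained before
	 * new pages are migrated into it.
	 */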
1062 
1063 	tmp = *from;
1064 	while (!nodes_empty(tmp)) {
1065 		int s,d;
1066 		int source = NUMA_NO_NODE;
1067 		int dest = 0;
1068 
1069 		for_each_node_mask(s, tmp) {
1070 
1071 			/*
1072 			 * do_migrate_pages() tries to maintain the relative
1073 			 * node relationship of the pages established between
1074 			 * threads and memory areas.
1075 			 *
1076 			 * However, if the number of source nodes is not equal to
1077 			 * the number of destination nodes, we cannot preserve
1078 			 * this relative node relationship.  In that case, skip
1079 			 * copying memory from a node that is in the destination
1080 			 * mask.
1081 			 *
1082 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1083 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1084 			 */
1085 
1086 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1087 						(node_isset(s, *to)))
1088 				continue;
1089 
1090 			d = node_remap(s, *from, *to);
1091 			if (s == d)
1092 				continue;
1093 
1094 			source = s;	/* Node moved. Memorize */
1095 			dest = d;
1096 
1097 			/* dest not in remaining from nodes? */
1098 			if (!node_isset(dest, tmp))
1099 				break;
1100 		}
1101 		if (source == NUMA_NO_NODE)
1102 			break;
1103 
1104 		node_clear(source, tmp);
1105 		err = migrate_to_node(mm, source, dest, flags);
1106 		if (err > 0)
1107 			busy += err;
1108 		if (err < 0)
1109 			break;
1110 	}
1111 	up_read(&mm->mmap_sem);
1112 	if (err < 0)
1113 		return err;
1114 	return busy;
1115 
1116 }
1117 
1118 /*
1119  * Allocate a new page for page migration based on vma policy.
1120  * Start by assuming the page is mapped by the same vma that contains @start.
1121  * Search forward from there, if not.  N.B., this assumes that the
1122  * list of pages handed to migrate_pages()--which is how we get here--
1123  * is in virtual address order.
1124  */
1125 static struct page *new_page(struct page *page, unsigned long start, int **x)
1126 {
1127 	struct vm_area_struct *vma;
1128 	unsigned long uninitialized_var(address);
1129 
1130 	vma = find_vma(current->mm, start);
1131 	while (vma) {
1132 		address = page_address_in_vma(page, vma);
1133 		if (address != -EFAULT)
1134 			break;
1135 		vma = vma->vm_next;
1136 	}
1137 
1138 	if (PageHuge(page)) {
1139 		BUG_ON(!vma);
1140 		return alloc_huge_page_noerr(vma, address, 1);
1141 	}
1142 	/*
1143 	 * if !vma, alloc_page_vma() will use task or system default policy
1144 	 */
1145 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1146 }
1147 #else
1148 
1149 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1150 				unsigned long flags)
1151 {
1152 }
1153 
1154 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1155 		     const nodemask_t *to, int flags)
1156 {
1157 	return -ENOSYS;
1158 }
1159 
1160 static struct page *new_page(struct page *page, unsigned long start, int **x)
1161 {
1162 	return NULL;
1163 }
1164 #endif
1165 
1166 static long do_mbind(unsigned long start, unsigned long len,
1167 		     unsigned short mode, unsigned short mode_flags,
1168 		     nodemask_t *nmask, unsigned long flags)
1169 {
1170 	struct mm_struct *mm = current->mm;
1171 	struct mempolicy *new;
1172 	unsigned long end;
1173 	int err;
1174 	LIST_HEAD(pagelist);
1175 
1176 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1177 		return -EINVAL;
1178 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1179 		return -EPERM;
1180 
1181 	if (start & ~PAGE_MASK)
1182 		return -EINVAL;
1183 
1184 	if (mode == MPOL_DEFAULT)
1185 		flags &= ~MPOL_MF_STRICT;
1186 
1187 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1188 	end = start + len;
1189 
1190 	if (end < start)
1191 		return -EINVAL;
1192 	if (end == start)
1193 		return 0;
1194 
1195 	new = mpol_new(mode, mode_flags, nmask);
1196 	if (IS_ERR(new))
1197 		return PTR_ERR(new);
1198 
1199 	if (flags & MPOL_MF_LAZY)
1200 		new->flags |= MPOL_F_MOF;
1201 
1202 	/*
1203 	 * If we are using the default policy then operation
1204 	 * on discontinuous address spaces is okay after all
1205 	 */
1206 	if (!new)
1207 		flags |= MPOL_MF_DISCONTIG_OK;
1208 
1209 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1210 		 start, start + len, mode, mode_flags,
1211 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1212 
1213 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1214 
1215 		err = migrate_prep();
1216 		if (err)
1217 			goto mpol_out;
1218 	}
1219 	{
1220 		NODEMASK_SCRATCH(scratch);
1221 		if (scratch) {
1222 			down_write(&mm->mmap_sem);
1223 			task_lock(current);
1224 			err = mpol_set_nodemask(new, nmask, scratch);
1225 			task_unlock(current);
1226 			if (err)
1227 				up_write(&mm->mmap_sem);
1228 		} else
1229 			err = -ENOMEM;
1230 		NODEMASK_SCRATCH_FREE(scratch);
1231 	}
1232 	if (err)
1233 		goto mpol_out;
1234 
1235 	err = queue_pages_range(mm, start, end, nmask,
1236 			  flags | MPOL_MF_INVERT, &pagelist);
1237 	if (!err)
1238 		err = mbind_range(mm, start, end, new);
1239 
1240 	if (!err) {
1241 		int nr_failed = 0;
1242 
1243 		if (!list_empty(&pagelist)) {
1244 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1245 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1246 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1247 			if (nr_failed)
1248 				putback_movable_pages(&pagelist);
1249 		}
1250 
1251 		if (nr_failed && (flags & MPOL_MF_STRICT))
1252 			err = -EIO;
1253 	} else
1254 		putback_movable_pages(&pagelist);
1255 
1256 	up_write(&mm->mmap_sem);
1257  mpol_out:
1258 	mpol_put(new);
1259 	return err;
1260 }
1261 
1262 /*
1263  * User space interface with variable sized bitmaps for nodelists.
1264  */
1265 
1266 /* Copy a node mask from user space. */
1267 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1268 		     unsigned long maxnode)
1269 {
1270 	unsigned long k;
1271 	unsigned long nlongs;
1272 	unsigned long endmask;
1273 
1274 	--maxnode;
1275 	nodes_clear(*nodes);
1276 	if (maxnode == 0 || !nmask)
1277 		return 0;
1278 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1279 		return -EINVAL;
1280 
1281 	nlongs = BITS_TO_LONGS(maxnode);
1282 	if ((maxnode % BITS_PER_LONG) == 0)
1283 		endmask = ~0UL;
1284 	else
1285 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1286 
1287 	/* When the user specifies more nodes than supported, just check
1288 	   that the unsupported part is all zero. */
1289 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1290 		if (nlongs > PAGE_SIZE/sizeof(long))
1291 			return -EINVAL;
1292 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1293 			unsigned long t;
1294 			if (get_user(t, nmask + k))
1295 				return -EFAULT;
1296 			if (k == nlongs - 1) {
1297 				if (t & endmask)
1298 					return -EINVAL;
1299 			} else if (t)
1300 				return -EINVAL;
1301 		}
1302 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1303 		endmask = ~0UL;
1304 	}
1305 
1306 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1307 		return -EFAULT;
1308 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1309 	return 0;
1310 }
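
/*
 * Worked example (illustrative): on a 64-bit kernel, a caller passing
 * maxnode = 128 describes 127 usable bits after the decrement above, so
 * nlongs = 2 and endmask = (1UL << 63) - 1; the top bit of the second
 * user long is masked off.  Longs beyond BITS_TO_LONGS(MAX_NUMNODES),
 * if any, must be all zero or the call fails with -EINVAL.
 */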
1311 
1312 /* Copy a kernel node mask to user space */
1313 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1314 			      nodemask_t *nodes)
1315 {
1316 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1317 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1318 
1319 	if (copy > nbytes) {
1320 		if (copy > PAGE_SIZE)
1321 			return -EINVAL;
1322 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1323 			return -EFAULT;
1324 		copy = nbytes;
1325 	}
1326 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1327 }
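
/*
 * Worked example (illustrative): on a 64-bit kernel with
 * MAX_NUMNODES = 1024, nbytes = 128.  A caller asking for maxnode = 4096
 * implies copy = 512 bytes, so bytes 128..511 of the user buffer are
 * cleared and only the first 128 bytes are copied from the kernel
 * nodemask.
 */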
1328 
1329 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1330 		unsigned long, mode, const unsigned long __user *, nmask,
1331 		unsigned long, maxnode, unsigned, flags)
1332 {
1333 	nodemask_t nodes;
1334 	int err;
1335 	unsigned short mode_flags;
1336 
1337 	mode_flags = mode & MPOL_MODE_FLAGS;
1338 	mode &= ~MPOL_MODE_FLAGS;
1339 	if (mode >= MPOL_MAX)
1340 		return -EINVAL;
1341 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1342 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1343 		return -EINVAL;
1344 	err = get_nodes(&nodes, nmask, maxnode);
1345 	if (err)
1346 		return err;
1347 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1348 }
1349 
1350 /* Set the process memory policy */
1351 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1352 		unsigned long, maxnode)
1353 {
1354 	int err;
1355 	nodemask_t nodes;
1356 	unsigned short flags;
1357 
1358 	flags = mode & MPOL_MODE_FLAGS;
1359 	mode &= ~MPOL_MODE_FLAGS;
1360 	if ((unsigned int)mode >= MPOL_MAX)
1361 		return -EINVAL;
1362 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1363 		return -EINVAL;
1364 	err = get_nodes(&nodes, nmask, maxnode);
1365 	if (err)
1366 		return err;
1367 	return do_set_mempolicy(mode, flags, &nodes);
1368 }
1369 
1370 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1371 		const unsigned long __user *, old_nodes,
1372 		const unsigned long __user *, new_nodes)
1373 {
1374 	const struct cred *cred = current_cred(), *tcred;
1375 	struct mm_struct *mm = NULL;
1376 	struct task_struct *task;
1377 	nodemask_t task_nodes;
1378 	int err;
1379 	nodemask_t *old;
1380 	nodemask_t *new;
1381 	NODEMASK_SCRATCH(scratch);
1382 
1383 	if (!scratch)
1384 		return -ENOMEM;
1385 
1386 	old = &scratch->mask1;
1387 	new = &scratch->mask2;
1388 
1389 	err = get_nodes(old, old_nodes, maxnode);
1390 	if (err)
1391 		goto out;
1392 
1393 	err = get_nodes(new, new_nodes, maxnode);
1394 	if (err)
1395 		goto out;
1396 
1397 	/* Find the mm_struct */
1398 	rcu_read_lock();
1399 	task = pid ? find_task_by_vpid(pid) : current;
1400 	if (!task) {
1401 		rcu_read_unlock();
1402 		err = -ESRCH;
1403 		goto out;
1404 	}
1405 	get_task_struct(task);
1406 
1407 	err = -EINVAL;
1408 
1409 	/*
1410 	 * Check if this process has the right to modify the specified
1411 	 * process. The right exists if the process has administrative
1412 	 * capabilities, superuser privileges or the same
1413 	 * userid as the target process.
1414 	 */
1415 	tcred = __task_cred(task);
1416 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1417 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1418 	    !capable(CAP_SYS_NICE)) {
1419 		rcu_read_unlock();
1420 		err = -EPERM;
1421 		goto out_put;
1422 	}
1423 	rcu_read_unlock();
1424 
1425 	task_nodes = cpuset_mems_allowed(task);
1426 	/* Is the user allowed to access the target nodes? */
1427 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1428 		err = -EPERM;
1429 		goto out_put;
1430 	}
1431 
1432 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1433 		err = -EINVAL;
1434 		goto out_put;
1435 	}
1436 
1437 	err = security_task_movememory(task);
1438 	if (err)
1439 		goto out_put;
1440 
1441 	mm = get_task_mm(task);
1442 	put_task_struct(task);
1443 
1444 	if (!mm) {
1445 		err = -EINVAL;
1446 		goto out;
1447 	}
1448 
1449 	err = do_migrate_pages(mm, old, new,
1450 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1451 
1452 	mmput(mm);
1453 out:
1454 	NODEMASK_SCRATCH_FREE(scratch);
1455 
1456 	return err;
1457 
1458 out_put:
1459 	put_task_struct(task);
1460 	goto out;
1461 
1462 }
1463 
1464 
1465 /* Retrieve NUMA policy */
1466 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1467 		unsigned long __user *, nmask, unsigned long, maxnode,
1468 		unsigned long, addr, unsigned long, flags)
1469 {
1470 	int err;
1471 	int uninitialized_var(pval);
1472 	nodemask_t nodes;
1473 
1474 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1475 		return -EINVAL;
1476 
1477 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1478 
1479 	if (err)
1480 		return err;
1481 
1482 	if (policy && put_user(pval, policy))
1483 		return -EFAULT;
1484 
1485 	if (nmask)
1486 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1487 
1488 	return err;
1489 }
1490 
1491 #ifdef CONFIG_COMPAT
1492 
1493 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1494 		       compat_ulong_t __user *, nmask,
1495 		       compat_ulong_t, maxnode,
1496 		       compat_ulong_t, addr, compat_ulong_t, flags)
1497 {
1498 	long err;
1499 	unsigned long __user *nm = NULL;
1500 	unsigned long nr_bits, alloc_size;
1501 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1502 
1503 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1504 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1505 
1506 	if (nmask)
1507 		nm = compat_alloc_user_space(alloc_size);
1508 
1509 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1510 
1511 	if (!err && nmask) {
1512 		unsigned long copy_size;
1513 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1514 		err = copy_from_user(bm, nm, copy_size);
1515 		/* ensure entire bitmap is zeroed */
1516 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1517 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1518 	}
1519 
1520 	return err;
1521 }
1522 
1523 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1524 		       compat_ulong_t, maxnode)
1525 {
1526 	long err = 0;
1527 	unsigned long __user *nm = NULL;
1528 	unsigned long nr_bits, alloc_size;
1529 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1530 
1531 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1532 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1533 
1534 	if (nmask) {
1535 		err = compat_get_bitmap(bm, nmask, nr_bits);
1536 		nm = compat_alloc_user_space(alloc_size);
1537 		err |= copy_to_user(nm, bm, alloc_size);
1538 	}
1539 
1540 	if (err)
1541 		return -EFAULT;
1542 
1543 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1544 }
1545 
1546 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1547 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1548 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1549 {
1550 	long err = 0;
1551 	unsigned long __user *nm = NULL;
1552 	unsigned long nr_bits, alloc_size;
1553 	nodemask_t bm;
1554 
1555 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1556 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1557 
1558 	if (nmask) {
1559 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1560 		nm = compat_alloc_user_space(alloc_size);
1561 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1562 	}
1563 
1564 	if (err)
1565 		return -EFAULT;
1566 
1567 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1568 }
1569 
1570 #endif
1571 
1572 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1573 						unsigned long addr)
1574 {
1575 	struct mempolicy *pol = NULL;
1576 
1577 	if (vma) {
1578 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1579 			pol = vma->vm_ops->get_policy(vma, addr);
1580 		} else if (vma->vm_policy) {
1581 			pol = vma->vm_policy;
1582 
1583 			/*
1584 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1585 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1586 			 * count on these policies, which will be dropped by
1587 			 * mpol_cond_put() later.
1588 			 */
1589 			if (mpol_needs_cond_ref(pol))
1590 				mpol_get(pol);
1591 		}
1592 	}
1593 
1594 	return pol;
1595 }
1596 
1597 /*
1598  * get_vma_policy(@vma, @addr)
1599  * @vma: virtual memory area whose policy is sought
1600  * @addr: address in @vma for shared policy lookup
1601  *
1602  * Returns effective policy for a VMA at specified address.
1603  * Falls back to current->mempolicy or system default policy, as necessary.
1604  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1605  * count--added by the get_policy() vm_op, as appropriate--to protect against
1606  * freeing by another task.  It is the caller's responsibility to free the
1607  * extra reference for shared policies.
1608  */
1609 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1610 						unsigned long addr)
1611 {
1612 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1613 
1614 	if (!pol)
1615 		pol = get_task_policy(current);
1616 
1617 	return pol;
1618 }
1619 
1620 bool vma_policy_mof(struct vm_area_struct *vma)
1621 {
1622 	struct mempolicy *pol;
1623 
1624 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1625 		bool ret = false;
1626 
1627 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1628 		if (pol && (pol->flags & MPOL_F_MOF))
1629 			ret = true;
1630 		mpol_cond_put(pol);
1631 
1632 		return ret;
1633 	}
1634 
1635 	pol = vma->vm_policy;
1636 	if (!pol)
1637 		pol = get_task_policy(current);
1638 
1639 	return pol->flags & MPOL_F_MOF;
1640 }
1641 
1642 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1643 {
1644 	enum zone_type dynamic_policy_zone = policy_zone;
1645 
1646 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1647 
1648 	/*
1649 	 * If policy->v.nodes has movable memory only,
1650 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1651 	 *
1652 	 * policy->v.nodes was intersected with node_states[N_MEMORY],
1653 	 * so if the following test fails, it implies that
1654 	 * policy->v.nodes has movable memory only.
1655 	 */
1656 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1657 		dynamic_policy_zone = ZONE_MOVABLE;
1658 
1659 	return zone >= dynamic_policy_zone;
1660 }
1661 
1662 /*
1663  * Return a nodemask representing a mempolicy for filtering nodes for
1664  * page allocation
1665  */
1666 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1667 {
1668 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1669 	if (unlikely(policy->mode == MPOL_BIND) &&
1670 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1671 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1672 		return &policy->v.nodes;
1673 
1674 	return NULL;
1675 }
1676 
1677 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1678 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1679 	int nd)
1680 {
1681 	switch (policy->mode) {
1682 	case MPOL_PREFERRED:
1683 		if (!(policy->flags & MPOL_F_LOCAL))
1684 			nd = policy->v.preferred_node;
1685 		break;
1686 	case MPOL_BIND:
1687 		/*
1688 		 * Normally, MPOL_BIND allocations are node-local within the
1689 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1690 		 * current node isn't part of the mask, we use the zonelist for
1691 		 * the first node in the mask instead.
1692 		 */
1693 		if (unlikely(gfp & __GFP_THISNODE) &&
1694 				unlikely(!node_isset(nd, policy->v.nodes)))
1695 			nd = first_node(policy->v.nodes);
1696 		break;
1697 	default:
1698 		BUG();
1699 	}
1700 	return node_zonelist(nd, gfp);
1701 }
1702 
1703 /* Do dynamic interleaving for a process */
1704 static unsigned interleave_nodes(struct mempolicy *policy)
1705 {
1706 	unsigned nid, next;
1707 	struct task_struct *me = current;
1708 
1709 	nid = me->il_next;
1710 	next = next_node(nid, policy->v.nodes);
1711 	if (next >= MAX_NUMNODES)
1712 		next = first_node(policy->v.nodes);
1713 	if (next < MAX_NUMNODES)
1714 		me->il_next = next;
1715 	return nid;
1716 }
1717 
1718 /*
1719  * Depending on the memory policy provide a node from which to allocate the
1720  * next slab entry.
1721  */
1722 unsigned int mempolicy_slab_node(void)
1723 {
1724 	struct mempolicy *policy;
1725 	int node = numa_mem_id();
1726 
1727 	if (in_interrupt())
1728 		return node;
1729 
1730 	policy = current->mempolicy;
1731 	if (!policy || policy->flags & MPOL_F_LOCAL)
1732 		return node;
1733 
1734 	switch (policy->mode) {
1735 	case MPOL_PREFERRED:
1736 		/*
1737 		 * handled MPOL_F_LOCAL above
1738 		 */
1739 		return policy->v.preferred_node;
1740 
1741 	case MPOL_INTERLEAVE:
1742 		return interleave_nodes(policy);
1743 
1744 	case MPOL_BIND: {
1745 		/*
1746 		 * Follow bind policy behavior and start allocation at the
1747 		 * first node.
1748 		 */
1749 		struct zonelist *zonelist;
1750 		struct zone *zone;
1751 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1752 		zonelist = &NODE_DATA(node)->node_zonelists[0];
1753 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1754 							&policy->v.nodes,
1755 							&zone);
1756 		return zone ? zone->node : node;
1757 	}
1758 
1759 	default:
1760 		BUG();
1761 	}
1762 }
1763 
1764 /* Do static interleaving for a VMA with known offset. */
1765 static unsigned offset_il_node(struct mempolicy *pol,
1766 		struct vm_area_struct *vma, unsigned long off)
1767 {
1768 	unsigned nnodes = nodes_weight(pol->v.nodes);
1769 	unsigned target;
1770 	int c;
1771 	int nid = NUMA_NO_NODE;
1772 
1773 	if (!nnodes)
1774 		return numa_node_id();
1775 	target = (unsigned int)off % nnodes;
1776 	c = 0;
1777 	do {
1778 		nid = next_node(nid, pol->v.nodes);
1779 		c++;
1780 	} while (c <= target);
1781 	return nid;
1782 }
1783 
1784 /* Determine a node number for interleave */
1785 static inline unsigned interleave_nid(struct mempolicy *pol,
1786 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1787 {
1788 	if (vma) {
1789 		unsigned long off;
1790 
1791 		/*
1792 		 * for small pages, there is no difference between
1793 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1794 		 * for huge pages, since vm_pgoff is in units of small
1795 		 * pages, we need to shift off the always 0 bits to get
1796 		 * a useful offset.
1797 		 */
1798 		BUG_ON(shift < PAGE_SHIFT);
1799 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1800 		off += (addr - vma->vm_start) >> shift;
1801 		return offset_il_node(pol, vma, off);
1802 	} else
1803 		return interleave_nodes(pol);
1804 }
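
/*
 * Worked example (illustrative): a hugetlbfs VMA backed by 2MB pages
 * (shift = 21) with vm_pgoff = 0 and an interleave mask of {0, 1}.  The
 * offset of the faulting address within the VMA is reduced to huge-page
 * units, so the first huge page maps to off 0 -> node 0, the second to
 * off 1 -> node 1, and so on, independent of where the VMA sits in the
 * virtual address space.
 */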
1805 
1806 /*
1807  * Return the bit number of a random bit set in the nodemask.
1808  * (returns NUMA_NO_NODE if nodemask is empty)
1809  */
1810 int node_random(const nodemask_t *maskp)
1811 {
1812 	int w, bit = NUMA_NO_NODE;
1813 
1814 	w = nodes_weight(*maskp);
1815 	if (w)
1816 		bit = bitmap_ord_to_pos(maskp->bits,
1817 			get_random_int() % w, MAX_NUMNODES);
1818 	return bit;
1819 }
1820 
1821 #ifdef CONFIG_HUGETLBFS
1822 /*
1823  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1824  * @vma: virtual memory area whose policy is sought
1825  * @addr: address in @vma for shared policy lookup and interleave policy
1826  * @gfp_flags: for requested zone
1827  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1828  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1829  *
1830  * Returns a zonelist suitable for a huge page allocation and a pointer
1831  * to the struct mempolicy for conditional unref after allocation.
1832  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1833  * @nodemask for filtering the zonelist.
1834  *
1835  * Must be protected by read_mems_allowed_begin()
1836  */
1837 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1838 				gfp_t gfp_flags, struct mempolicy **mpol,
1839 				nodemask_t **nodemask)
1840 {
1841 	struct zonelist *zl;
1842 
1843 	*mpol = get_vma_policy(vma, addr);
1844 	*nodemask = NULL;	/* assume !MPOL_BIND */
1845 
1846 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1847 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1848 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1849 	} else {
1850 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1851 		if ((*mpol)->mode == MPOL_BIND)
1852 			*nodemask = &(*mpol)->v.nodes;
1853 	}
1854 	return zl;
1855 }
1856 
1857 /*
1858  * init_nodemask_of_mempolicy
1859  *
1860  * If the current task's mempolicy is "default" [NULL], return 'false'
1861  * to indicate default policy.  Otherwise, extract the policy nodemask
1862  * for 'bind' or 'interleave' policy into the argument nodemask, or
1863  * initialize the argument nodemask to contain the single node for
1864  * 'preferred' or 'local' policy and return 'true' to indicate presence
1865  * of non-default mempolicy.
1866  *
1867  * We don't bother with reference counting the mempolicy [mpol_get/put]
1868  * because the current task is examining its own mempolicy and a task's
1869  * mempolicy is only ever changed by the task itself.
1870  *
1871  * N.B., it is the caller's responsibility to free a returned nodemask.
1872  */
1873 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1874 {
1875 	struct mempolicy *mempolicy;
1876 	int nid;
1877 
1878 	if (!(mask && current->mempolicy))
1879 		return false;
1880 
1881 	task_lock(current);
1882 	mempolicy = current->mempolicy;
1883 	switch (mempolicy->mode) {
1884 	case MPOL_PREFERRED:
1885 		if (mempolicy->flags & MPOL_F_LOCAL)
1886 			nid = numa_node_id();
1887 		else
1888 			nid = mempolicy->v.preferred_node;
1889 		init_nodemask_of_node(mask, nid);
1890 		break;
1891 
1892 	case MPOL_BIND:
1893 		/* Fall through */
1894 	case MPOL_INTERLEAVE:
1895 		*mask = mempolicy->v.nodes;
1896 		break;
1897 
1898 	default:
1899 		BUG();
1900 	}
1901 	task_unlock(current);
1902 
1903 	return true;
1904 }
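
/*
 * Hedged usage sketch (modelled loosely on the hugetlb sysfs code; the
 * helper name is hypothetical): size a per-node operation by the number
 * of nodes the current task's policy allows, treating "no policy" as
 * "all nodes with memory".
 */
static unsigned int example_policy_node_count(void)
{
	unsigned int nr;
	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL | __GFP_NORETRY);

	if (mask && init_nodemask_of_mempolicy(mask))
		nr = nodes_weight(*mask);
	else
		nr = num_node_state(N_MEMORY);

	NODEMASK_FREE(mask);
	return nr;
}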
1905 #endif
1906 
1907 /*
1908  * mempolicy_nodemask_intersects
1909  *
1910  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1911  * policy.  Otherwise, check for intersection between mask and the policy
1912  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1913  * policy, always return true since it may allocate elsewhere on fallback.
1914  *
1915  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1916  */
1917 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1918 					const nodemask_t *mask)
1919 {
1920 	struct mempolicy *mempolicy;
1921 	bool ret = true;
1922 
1923 	if (!mask)
1924 		return ret;
1925 	task_lock(tsk);
1926 	mempolicy = tsk->mempolicy;
1927 	if (!mempolicy)
1928 		goto out;
1929 
1930 	switch (mempolicy->mode) {
1931 	case MPOL_PREFERRED:
1932 		/*
1933 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1934 		 * to allocate from; the task may fall back to other nodes when OOM.
1935 		 * Thus, it's possible for tsk to have allocated memory from
1936 		 * nodes in mask.
1937 		 */
1938 		break;
1939 	case MPOL_BIND:
1940 	case MPOL_INTERLEAVE:
1941 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1942 		break;
1943 	default:
1944 		BUG();
1945 	}
1946 out:
1947 	task_unlock(tsk);
1948 	return ret;
1949 }
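
/*
 * Hedged usage sketch (a simplified version of the kind of filtering the
 * OOM killer does; the helper name is hypothetical): when an allocation
 * is constrained to @constrained_nodes, tasks whose policy cannot have
 * placed memory there are not useful victims.
 */
static bool example_oom_may_target(struct task_struct *tsk,
				   const nodemask_t *constrained_nodes)
{
	return mempolicy_nodemask_intersects(tsk, constrained_nodes);
}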
1950 
1951 /* Allocate a page in interleaved policy.
1952    Own path because it needs to do special accounting. */
1953 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1954 					unsigned nid)
1955 {
1956 	struct zonelist *zl;
1957 	struct page *page;
1958 
1959 	zl = node_zonelist(nid, gfp);
1960 	page = __alloc_pages(gfp, order, zl);
1961 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1962 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1963 	return page;
1964 }
1965 
1966 /**
1967  * 	alloc_pages_vma	- Allocate a page for a VMA.
1968  *
1969  * 	@gfp:
1970  *      %GFP_USER    user allocation,
1971  *      %GFP_KERNEL  kernel allocation,
1972  *      %GFP_HIGHMEM highmem/user allocation,
1973  *      %GFP_FS      allocation should not call back into a file system,
1974  *      %GFP_ATOMIC  don't sleep.
1975  *
1976  *	@order: Order of the GFP allocation.
1977  * 	@vma:  Pointer to VMA or NULL if not available.
1978  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1979  *	@node: Which node to prefer for allocation (modulo policy).
1980  *	@hugepage: for hugepages try only the preferred node if possible
1981  *
1982  * 	This function allocates a page from the kernel page pool and applies
1983  *	a NUMA policy associated with the VMA or the current process.
1984  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1985  *	mm_struct of the VMA to prevent it from going away. Should be used for
1986  *	all allocations for pages that will be mapped into user space. Returns
1987  *	NULL when no page can be allocated.
1988  */
1989 struct page *
1990 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1991 		unsigned long addr, int node, bool hugepage)
1992 {
1993 	struct mempolicy *pol;
1994 	struct page *page;
1995 	unsigned int cpuset_mems_cookie;
1996 	struct zonelist *zl;
1997 	nodemask_t *nmask;
1998 
1999 retry_cpuset:
2000 	pol = get_vma_policy(vma, addr);
2001 	cpuset_mems_cookie = read_mems_allowed_begin();
2002 
2003 	if (pol->mode == MPOL_INTERLEAVE) {
2004 		unsigned nid;
2005 
2006 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2007 		mpol_cond_put(pol);
2008 		page = alloc_page_interleave(gfp, order, nid);
2009 		goto out;
2010 	}
2011 
2012 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2013 		int hpage_node = node;
2014 
2015 		/*
2016 		 * For hugepage allocation and non-interleave policy which
2017 		 * allows the current node (or other explicitly preferred
2018 		 * node) we only try to allocate from the current/preferred
2019 		 * node and don't fall back to other nodes, as the cost of
2020 		 * remote accesses would likely offset THP benefits.
2021 		 *
2022 		 * If the policy is interleave, or does not allow the current
2023 		 * node in its nodemask, we allocate the standard way.
2024 		 */
2025 		if (pol->mode == MPOL_PREFERRED &&
2026 						!(pol->flags & MPOL_F_LOCAL))
2027 			hpage_node = pol->v.preferred_node;
2028 
2029 		nmask = policy_nodemask(gfp, pol);
2030 		if (!nmask || node_isset(hpage_node, *nmask)) {
2031 			mpol_cond_put(pol);
2032 			page = __alloc_pages_node(hpage_node,
2033 						gfp | __GFP_THISNODE, order);
2034 			goto out;
2035 		}
2036 	}
2037 
2038 	nmask = policy_nodemask(gfp, pol);
2039 	zl = policy_zonelist(gfp, pol, node);
2040 	mpol_cond_put(pol);
2041 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2042 out:
2043 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2044 		goto retry_cpuset;
2045 	return page;
2046 }
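
/*
 * Hedged usage sketch (helper name hypothetical; this is essentially what
 * the alloc_page_vma() wrapper expands to): allocate one movable user page
 * for a fault at @addr.  Per the comment above, the caller must hold
 * down_read() on the mm's mmap_sem while @vma is used.
 */
static struct page *example_alloc_user_page(struct vm_area_struct *vma,
					    unsigned long addr)
{
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id(), false);
}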
2047 
2048 /**
2049  * 	alloc_pages_current - Allocate pages.
2050  *
2051  *	@gfp:
2052  *		%GFP_USER   user allocation,
2053  *      	%GFP_KERNEL kernel allocation,
2054  *      	%GFP_HIGHMEM highmem allocation,
2055  *      	%GFP_FS     don't call back into a file system.
2056  *      	%GFP_ATOMIC don't sleep.
2057  *	@order: Order of the allocation; 2^@order contiguous pages. 0 is a single page.
2058  *
2059  *	Allocate a page from the kernel page pool.  When not in
2060  *	interrupt context, apply the current process NUMA policy.
2061  *	Returns NULL when no page can be allocated.
2062  *
2063  *	Don't call cpuset_update_task_memory_state() unless
2064  *	1) it's ok to take cpuset_sem (can WAIT), and
2065  *	2) allocating for current task (not interrupt).
2066  */
2067 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2068 {
2069 	struct mempolicy *pol = &default_policy;
2070 	struct page *page;
2071 	unsigned int cpuset_mems_cookie;
2072 
2073 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2074 		pol = get_task_policy(current);
2075 
2076 retry_cpuset:
2077 	cpuset_mems_cookie = read_mems_allowed_begin();
2078 
2079 	/*
2080 	 * No reference counting needed for current->mempolicy
2081 	 * nor system default_policy
2082 	 */
2083 	if (pol->mode == MPOL_INTERLEAVE)
2084 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2085 	else
2086 		page = __alloc_pages_nodemask(gfp, order,
2087 				policy_zonelist(gfp, pol, numa_node_id()),
2088 				policy_nodemask(gfp, pol));
2089 
2090 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2091 		goto retry_cpuset;
2092 
2093 	return page;
2094 }
2095 EXPORT_SYMBOL(alloc_pages_current);
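
/*
 * Hedged usage sketch (helper name hypothetical): on CONFIG_NUMA kernels
 * the generic alloc_pages() wrapper resolves to alloc_pages_current(), so
 * an ordinary multi-page allocation like this one already follows the
 * calling task's NUMA policy.
 */
static void *example_alloc_buffer(void)
{
	struct page *page = alloc_pages_current(GFP_KERNEL, 2);	/* 4 pages */

	return page ? page_address(page) : NULL;
}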
2096 
2097 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2098 {
2099 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2100 
2101 	if (IS_ERR(pol))
2102 		return PTR_ERR(pol);
2103 	dst->vm_policy = pol;
2104 	return 0;
2105 }
2106 
2107 /*
2108  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2109  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2110  * with the mems_allowed returned by cpuset_mems_allowed().  This
2111  * keeps mempolicies cpuset relative after its cpuset moves.  See
2112  * further kernel/cpuset.c update_nodemask().
2113  *
2114  * current's mempolicy may be rebound by another task (the task that changes
2115  * the cpuset's mems), so we needn't do rebind work for the current task.
2116  */
2117 
2118 /* Slow path of a mempolicy duplicate */
2119 struct mempolicy *__mpol_dup(struct mempolicy *old)
2120 {
2121 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2122 
2123 	if (!new)
2124 		return ERR_PTR(-ENOMEM);
2125 
2126 	/* task's mempolicy is protected by alloc_lock */
2127 	if (old == current->mempolicy) {
2128 		task_lock(current);
2129 		*new = *old;
2130 		task_unlock(current);
2131 	} else
2132 		*new = *old;
2133 
2134 	if (current_cpuset_is_being_rebound()) {
2135 		nodemask_t mems = cpuset_mems_allowed(current);
2136 		if (new->flags & MPOL_F_REBINDING)
2137 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2138 		else
2139 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2140 	}
2141 	atomic_set(&new->refcnt, 1);
2142 	return new;
2143 }
2144 
2145 /* Slow path of a mempolicy comparison */
2146 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2147 {
2148 	if (!a || !b)
2149 		return false;
2150 	if (a->mode != b->mode)
2151 		return false;
2152 	if (a->flags != b->flags)
2153 		return false;
2154 	if (mpol_store_user_nodemask(a))
2155 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2156 			return false;
2157 
2158 	switch (a->mode) {
2159 	case MPOL_BIND:
2160 		/* Fall through */
2161 	case MPOL_INTERLEAVE:
2162 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2163 	case MPOL_PREFERRED:
2164 		return a->v.preferred_node == b->v.preferred_node;
2165 	default:
2166 		BUG();
2167 		return false;
2168 	}
2169 }
2170 
2171 /*
2172  * Shared memory backing store policy support.
2173  *
2174  * Remember policies even when nobody has shared memory mapped.
2175  * The policies are kept in Red-Black tree linked from the inode.
2176  * They are protected by the sp->lock rwlock, which should be held
2177  * for any accesses to the tree.
2178  */
2179 
2180 /*
2181  * Lookup the first element intersecting start-end.  Caller holds sp->lock
2182  * for reading or for writing.
2183  */
2184 static struct sp_node *
2185 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2186 {
2187 	struct rb_node *n = sp->root.rb_node;
2188 
2189 	while (n) {
2190 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2191 
2192 		if (start >= p->end)
2193 			n = n->rb_right;
2194 		else if (end <= p->start)
2195 			n = n->rb_left;
2196 		else
2197 			break;
2198 	}
2199 	if (!n)
2200 		return NULL;
2201 	for (;;) {
2202 		struct sp_node *w = NULL;
2203 		struct rb_node *prev = rb_prev(n);
2204 		if (!prev)
2205 			break;
2206 		w = rb_entry(prev, struct sp_node, nd);
2207 		if (w->end <= start)
2208 			break;
2209 		n = prev;
2210 	}
2211 	return rb_entry(n, struct sp_node, nd);
2212 }
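
/*
 * Worked example (illustrative): with ranges [2,4), [4,8) and [10,12) in
 * the tree, sp_lookup(sp, 3, 11) may first land on [4,8); the rb_prev()
 * walk above then backs up to [2,4), the lowest-starting node that still
 * intersects [3,11), and that node is returned.
 */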
2213 
2214 /*
2215  * Insert a new shared policy into the list.  Caller holds sp->lock for
2216  * writing.
2217  */
2218 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2219 {
2220 	struct rb_node **p = &sp->root.rb_node;
2221 	struct rb_node *parent = NULL;
2222 	struct sp_node *nd;
2223 
2224 	while (*p) {
2225 		parent = *p;
2226 		nd = rb_entry(parent, struct sp_node, nd);
2227 		if (new->start < nd->start)
2228 			p = &(*p)->rb_left;
2229 		else if (new->end > nd->end)
2230 			p = &(*p)->rb_right;
2231 		else
2232 			BUG();
2233 	}
2234 	rb_link_node(&new->nd, parent, p);
2235 	rb_insert_color(&new->nd, &sp->root);
2236 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2237 		 new->policy ? new->policy->mode : 0);
2238 }
2239 
2240 /* Find shared policy intersecting idx */
2241 struct mempolicy *
2242 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2243 {
2244 	struct mempolicy *pol = NULL;
2245 	struct sp_node *sn;
2246 
2247 	if (!sp->root.rb_node)
2248 		return NULL;
2249 	read_lock(&sp->lock);
2250 	sn = sp_lookup(sp, idx, idx+1);
2251 	if (sn) {
2252 		mpol_get(sn->policy);
2253 		pol = sn->policy;
2254 	}
2255 	read_unlock(&sp->lock);
2256 	return pol;
2257 }
2258 
2259 static void sp_free(struct sp_node *n)
2260 {
2261 	mpol_put(n->policy);
2262 	kmem_cache_free(sn_cache, n);
2263 }
2264 
2265 /**
2266  * mpol_misplaced - check whether current page node is valid in policy
2267  *
2268  * @page: page to be checked
2269  * @vma: vm area where page mapped
2270  * @addr: virtual address where page mapped
2271  *
2272  * Lookup current policy node id for vma,addr and "compare to" page's
2273  * node id.
2274  *
2275  * Returns:
2276  *	-1	- not misplaced, page is in the right node
2277  *	node	- node id where the page should be
2278  *
2279  * Policy determination "mimics" alloc_page_vma().
2280  * Called from fault path where we know the vma and faulting address.
2281  */
2282 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2283 {
2284 	struct mempolicy *pol;
2285 	struct zone *zone;
2286 	int curnid = page_to_nid(page);
2287 	unsigned long pgoff;
2288 	int thiscpu = raw_smp_processor_id();
2289 	int thisnid = cpu_to_node(thiscpu);
2290 	int polnid = -1;
2291 	int ret = -1;
2292 
2293 	BUG_ON(!vma);
2294 
2295 	pol = get_vma_policy(vma, addr);
2296 	if (!(pol->flags & MPOL_F_MOF))
2297 		goto out;
2298 
2299 	switch (pol->mode) {
2300 	case MPOL_INTERLEAVE:
2301 		BUG_ON(addr >= vma->vm_end);
2302 		BUG_ON(addr < vma->vm_start);
2303 
2304 		pgoff = vma->vm_pgoff;
2305 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2306 		polnid = offset_il_node(pol, vma, pgoff);
2307 		break;
2308 
2309 	case MPOL_PREFERRED:
2310 		if (pol->flags & MPOL_F_LOCAL)
2311 			polnid = numa_node_id();
2312 		else
2313 			polnid = pol->v.preferred_node;
2314 		break;
2315 
2316 	case MPOL_BIND:
2317 		/*
2318 		 * allows binding to multiple nodes.
2319 		 * use current page if in policy nodemask,
2320 		 * else select nearest allowed node, if any.
2321 		 * If no allowed nodes, use current [!misplaced].
2322 		 */
2323 		if (node_isset(curnid, pol->v.nodes))
2324 			goto out;
2325 		(void)first_zones_zonelist(
2326 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2327 				gfp_zone(GFP_HIGHUSER),
2328 				&pol->v.nodes, &zone);
2329 		polnid = zone->node;
2330 		break;
2331 
2332 	default:
2333 		BUG();
2334 	}
2335 
2336 	/* Migrate the page towards the node whose CPU is referencing it */
2337 	if (pol->flags & MPOL_F_MORON) {
2338 		polnid = thisnid;
2339 
2340 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2341 			goto out;
2342 	}
2343 
2344 	if (curnid != polnid)
2345 		ret = polnid;
2346 out:
2347 	mpol_cond_put(pol);
2348 
2349 	return ret;
2350 }
2351 
2352 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2353 {
2354 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2355 	rb_erase(&n->nd, &sp->root);
2356 	sp_free(n);
2357 }
2358 
2359 static void sp_node_init(struct sp_node *node, unsigned long start,
2360 			unsigned long end, struct mempolicy *pol)
2361 {
2362 	node->start = start;
2363 	node->end = end;
2364 	node->policy = pol;
2365 }
2366 
2367 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2368 				struct mempolicy *pol)
2369 {
2370 	struct sp_node *n;
2371 	struct mempolicy *newpol;
2372 
2373 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2374 	if (!n)
2375 		return NULL;
2376 
2377 	newpol = mpol_dup(pol);
2378 	if (IS_ERR(newpol)) {
2379 		kmem_cache_free(sn_cache, n);
2380 		return NULL;
2381 	}
2382 	newpol->flags |= MPOL_F_SHARED;
2383 	sp_node_init(n, start, end, newpol);
2384 
2385 	return n;
2386 }
2387 
2388 /* Replace a policy range. */
2389 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2390 				 unsigned long end, struct sp_node *new)
2391 {
2392 	struct sp_node *n;
2393 	struct sp_node *n_new = NULL;
2394 	struct mempolicy *mpol_new = NULL;
2395 	int ret = 0;
2396 
2397 restart:
2398 	write_lock(&sp->lock);
2399 	n = sp_lookup(sp, start, end);
2400 	/* Take care of old policies in the same range. */
2401 	while (n && n->start < end) {
2402 		struct rb_node *next = rb_next(&n->nd);
2403 		if (n->start >= start) {
2404 			if (n->end <= end)
2405 				sp_delete(sp, n);
2406 			else
2407 				n->start = end;
2408 		} else {
2409 			/* Old policy spanning whole new range. */
2410 			if (n->end > end) {
2411 				if (!n_new)
2412 					goto alloc_new;
2413 
2414 				*mpol_new = *n->policy;
2415 				atomic_set(&mpol_new->refcnt, 1);
2416 				sp_node_init(n_new, end, n->end, mpol_new);
2417 				n->end = start;
2418 				sp_insert(sp, n_new);
2419 				n_new = NULL;
2420 				mpol_new = NULL;
2421 				break;
2422 			} else
2423 				n->end = start;
2424 		}
2425 		if (!next)
2426 			break;
2427 		n = rb_entry(next, struct sp_node, nd);
2428 	}
2429 	if (new)
2430 		sp_insert(sp, new);
2431 	write_unlock(&sp->lock);
2432 	ret = 0;
2433 
2434 err_out:
2435 	if (mpol_new)
2436 		mpol_put(mpol_new);
2437 	if (n_new)
2438 		kmem_cache_free(sn_cache, n_new);
2439 
2440 	return ret;
2441 
2442 alloc_new:
2443 	write_unlock(&sp->lock);
2444 	ret = -ENOMEM;
2445 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2446 	if (!n_new)
2447 		goto err_out;
2448 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2449 	if (!mpol_new)
2450 		goto err_out;
2451 	goto restart;
2452 }
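
/*
 * Worked example (illustrative): if the tree holds a single policy over
 * pages [0,10) and shared_policy_replace() is called for [3,6), the old
 * node is trimmed to [0,3), a copy of its policy is inserted for [6,10)
 * using the preallocated n_new/mpol_new pair, and the new node then
 * covers [3,6).
 */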
2453 
2454 /**
2455  * mpol_shared_policy_init - initialize shared policy for inode
2456  * @sp: pointer to inode shared policy
2457  * @mpol:  struct mempolicy to install
2458  *
2459  * Install non-NULL @mpol in inode's shared policy rb-tree.
2460  * On entry, the current task has a reference on a non-NULL @mpol.
2461  * This must be released on exit.
2462  * This is called at get_inode() time, so we can use GFP_KERNEL.
2463  */
2464 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2465 {
2466 	int ret;
2467 
2468 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2469 	rwlock_init(&sp->lock);
2470 
2471 	if (mpol) {
2472 		struct vm_area_struct pvma;
2473 		struct mempolicy *new;
2474 		NODEMASK_SCRATCH(scratch);
2475 
2476 		if (!scratch)
2477 			goto put_mpol;
2478 		/* contextualize the tmpfs mount point mempolicy */
2479 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2480 		if (IS_ERR(new))
2481 			goto free_scratch; /* no valid nodemask intersection */
2482 
2483 		task_lock(current);
2484 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2485 		task_unlock(current);
2486 		if (ret)
2487 			goto put_new;
2488 
2489 		/* Create pseudo-vma that contains just the policy */
2490 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2491 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2492 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2493 
2494 put_new:
2495 		mpol_put(new);			/* drop initial ref */
2496 free_scratch:
2497 		NODEMASK_SCRATCH_FREE(scratch);
2498 put_mpol:
2499 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2500 	}
2501 }
2502 
2503 int mpol_set_shared_policy(struct shared_policy *info,
2504 			struct vm_area_struct *vma, struct mempolicy *npol)
2505 {
2506 	int err;
2507 	struct sp_node *new = NULL;
2508 	unsigned long sz = vma_pages(vma);
2509 
2510 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2511 		 vma->vm_pgoff,
2512 		 sz, npol ? npol->mode : -1,
2513 		 npol ? npol->flags : -1,
2514 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2515 
2516 	if (npol) {
2517 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2518 		if (!new)
2519 			return -ENOMEM;
2520 	}
2521 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2522 	if (err && new)
2523 		sp_free(new);
2524 	return err;
2525 }
2526 
2527 /* Free a backing policy store on inode delete. */
2528 void mpol_free_shared_policy(struct shared_policy *p)
2529 {
2530 	struct sp_node *n;
2531 	struct rb_node *next;
2532 
2533 	if (!p->root.rb_node)
2534 		return;
2535 	write_lock(&p->lock);
2536 	next = rb_first(&p->root);
2537 	while (next) {
2538 		n = rb_entry(next, struct sp_node, nd);
2539 		next = rb_next(&n->nd);
2540 		sp_delete(p, n);
2541 	}
2542 	write_unlock(&p->lock);
2543 }
2544 
2545 #ifdef CONFIG_NUMA_BALANCING
2546 static int __initdata numabalancing_override;
2547 
2548 static void __init check_numabalancing_enable(void)
2549 {
2550 	bool numabalancing_default = false;
2551 
2552 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2553 		numabalancing_default = true;
2554 
2555 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2556 	if (numabalancing_override)
2557 		set_numabalancing_state(numabalancing_override == 1);
2558 
2559 	if (num_online_nodes() > 1 && !numabalancing_override) {
2560 		pr_info("%s automatic NUMA balancing. "
2561 			"Configure with numa_balancing= or the "
2562 			"kernel.numa_balancing sysctl\n",
2563 			numabalancing_default ? "Enabling" : "Disabling");
2564 		set_numabalancing_state(numabalancing_default);
2565 	}
2566 }
2567 
2568 static int __init setup_numabalancing(char *str)
2569 {
2570 	int ret = 0;
2571 	if (!str)
2572 		goto out;
2573 
2574 	if (!strcmp(str, "enable")) {
2575 		numabalancing_override = 1;
2576 		ret = 1;
2577 	} else if (!strcmp(str, "disable")) {
2578 		numabalancing_override = -1;
2579 		ret = 1;
2580 	}
2581 out:
2582 	if (!ret)
2583 		pr_warn("Unable to parse numa_balancing=\n");
2584 
2585 	return ret;
2586 }
2587 __setup("numa_balancing=", setup_numabalancing);
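
/*
 * Example (follows directly from the parser above): booting with
 * "numa_balancing=disable" on the kernel command line sets
 * numabalancing_override to -1, and check_numabalancing_enable() then
 * turns automatic NUMA balancing off regardless of the Kconfig default.
 */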
2588 #else
2589 static inline void __init check_numabalancing_enable(void)
2590 {
2591 }
2592 #endif /* CONFIG_NUMA_BALANCING */
2593 
2594 /* assumes fs == KERNEL_DS */
2595 void __init numa_policy_init(void)
2596 {
2597 	nodemask_t interleave_nodes;
2598 	unsigned long largest = 0;
2599 	int nid, prefer = 0;
2600 
2601 	policy_cache = kmem_cache_create("numa_policy",
2602 					 sizeof(struct mempolicy),
2603 					 0, SLAB_PANIC, NULL);
2604 
2605 	sn_cache = kmem_cache_create("shared_policy_node",
2606 				     sizeof(struct sp_node),
2607 				     0, SLAB_PANIC, NULL);
2608 
2609 	for_each_node(nid) {
2610 		preferred_node_policy[nid] = (struct mempolicy) {
2611 			.refcnt = ATOMIC_INIT(1),
2612 			.mode = MPOL_PREFERRED,
2613 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2614 			.v = { .preferred_node = nid, },
2615 		};
2616 	}
2617 
2618 	/*
2619 	 * Set interleaving policy for system init. Interleaving is only
2620 	 * enabled across suitably sized nodes (default is >= 16MB), or
2621 	 * fall back to the largest node if they're all smaller.
2622 	 */
2623 	nodes_clear(interleave_nodes);
2624 	for_each_node_state(nid, N_MEMORY) {
2625 		unsigned long total_pages = node_present_pages(nid);
2626 
2627 		/* Preserve the largest node */
2628 		if (largest < total_pages) {
2629 			largest = total_pages;
2630 			prefer = nid;
2631 		}
2632 
2633 		/* Interleave this node? */
2634 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2635 			node_set(nid, interleave_nodes);
2636 	}
2637 
2638 	/* All too small, use the largest */
2639 	if (unlikely(nodes_empty(interleave_nodes)))
2640 		node_set(prefer, interleave_nodes);
2641 
2642 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2643 		pr_err("%s: interleaving failed\n", __func__);
2644 
2645 	check_numabalancing_enable();
2646 }
2647 
2648 /* Reset policy of current process to default */
2649 void numa_default_policy(void)
2650 {
2651 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2652 }
2653 
2654 /*
2655  * Parse and format mempolicy from/to strings
2656  */
2657 
2658 /*
2659  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2660  */
2661 static const char * const policy_modes[] =
2662 {
2663 	[MPOL_DEFAULT]    = "default",
2664 	[MPOL_PREFERRED]  = "prefer",
2665 	[MPOL_BIND]       = "bind",
2666 	[MPOL_INTERLEAVE] = "interleave",
2667 	[MPOL_LOCAL]      = "local",
2668 };
2669 
2670 
2671 #ifdef CONFIG_TMPFS
2672 /**
2673  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2674  * @str:  string containing mempolicy to parse
2675  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2676  *
2677  * Format of input:
2678  *	<mode>[=<flags>][:<nodelist>]
2679  *
2680  * On success, returns 0, else 1
2681  */
2682 int mpol_parse_str(char *str, struct mempolicy **mpol)
2683 {
2684 	struct mempolicy *new = NULL;
2685 	unsigned short mode;
2686 	unsigned short mode_flags;
2687 	nodemask_t nodes;
2688 	char *nodelist = strchr(str, ':');
2689 	char *flags = strchr(str, '=');
2690 	int err = 1;
2691 
2692 	if (nodelist) {
2693 		/* NUL-terminate mode or flags string */
2694 		*nodelist++ = '\0';
2695 		if (nodelist_parse(nodelist, nodes))
2696 			goto out;
2697 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2698 			goto out;
2699 	} else
2700 		nodes_clear(nodes);
2701 
2702 	if (flags)
2703 		*flags++ = '\0';	/* terminate mode string */
2704 
2705 	for (mode = 0; mode < MPOL_MAX; mode++) {
2706 		if (!strcmp(str, policy_modes[mode])) {
2707 			break;
2708 		}
2709 	}
2710 	if (mode >= MPOL_MAX)
2711 		goto out;
2712 
2713 	switch (mode) {
2714 	case MPOL_PREFERRED:
2715 		/*
2716 		 * Insist on a nodelist of one node only
2717 		 */
2718 		if (nodelist) {
2719 			char *rest = nodelist;
2720 			while (isdigit(*rest))
2721 				rest++;
2722 			if (*rest)
2723 				goto out;
2724 		}
2725 		break;
2726 	case MPOL_INTERLEAVE:
2727 		/*
2728 		 * Default to online nodes with memory if no nodelist
2729 		 */
2730 		if (!nodelist)
2731 			nodes = node_states[N_MEMORY];
2732 		break;
2733 	case MPOL_LOCAL:
2734 		/*
2735 		 * Don't allow a nodelist;  mpol_new() checks flags
2736 		 */
2737 		if (nodelist)
2738 			goto out;
2739 		mode = MPOL_PREFERRED;
2740 		break;
2741 	case MPOL_DEFAULT:
2742 		/*
2743 		 * Insist on an empty nodelist
2744 		 */
2745 		if (!nodelist)
2746 			err = 0;
2747 		goto out;
2748 	case MPOL_BIND:
2749 		/*
2750 		 * Insist on a nodelist
2751 		 */
2752 		if (!nodelist)
2753 			goto out;
2754 	}
2755 
2756 	mode_flags = 0;
2757 	if (flags) {
2758 		/*
2759 		 * Currently, we only support two mutually exclusive
2760 		 * mode flags.
2761 		 */
2762 		if (!strcmp(flags, "static"))
2763 			mode_flags |= MPOL_F_STATIC_NODES;
2764 		else if (!strcmp(flags, "relative"))
2765 			mode_flags |= MPOL_F_RELATIVE_NODES;
2766 		else
2767 			goto out;
2768 	}
2769 
2770 	new = mpol_new(mode, mode_flags, &nodes);
2771 	if (IS_ERR(new))
2772 		goto out;
2773 
2774 	/*
2775 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2776 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2777 	 */
2778 	if (mode != MPOL_PREFERRED)
2779 		new->v.nodes = nodes;
2780 	else if (nodelist)
2781 		new->v.preferred_node = first_node(nodes);
2782 	else
2783 		new->flags |= MPOL_F_LOCAL;
2784 
2785 	/*
2786 	 * Save nodes for contextualization: this will be used to "clone"
2787 	 * the mempolicy in a specific context [cpuset] at a later time.
2788 	 */
2789 	new->w.user_nodemask = nodes;
2790 
2791 	err = 0;
2792 
2793 out:
2794 	/* Restore string for error message */
2795 	if (nodelist)
2796 		*--nodelist = ':';
2797 	if (flags)
2798 		*--flags = '=';
2799 	if (!err)
2800 		*mpol = new;
2801 	return err;
2802 }
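
/*
 * Hedged usage sketch (illustrative; tmpfs calls this from its mount
 * option parsing).  Strings follow the format documented above, e.g.
 * "bind:0-3", "interleave=relative:0,2", "prefer=static:1" or "local";
 * the example below assumes nodes 0-3 exist and have memory.  Note that
 * mpol_parse_str() writes NULs into @str, so the buffer must be writable.
 */
static struct mempolicy *example_parse_mpol_option(void)
{
	char buf[] = "interleave=relative:0-3";	/* hypothetical option */
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(buf, &mpol))
		return NULL;		/* parse error */
	return mpol;			/* caller now owns a reference */
}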
2803 #endif /* CONFIG_TMPFS */
2804 
2805 /**
2806  * mpol_to_str - format a mempolicy structure for printing
2807  * @buffer:  to contain formatted mempolicy string
2808  * @maxlen:  length of @buffer
2809  * @pol:  pointer to mempolicy to be formatted
2810  *
2811  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2812  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2813  * longest flag, "relative", and to display at least a few node ids.
2814  */
2815 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2816 {
2817 	char *p = buffer;
2818 	nodemask_t nodes = NODE_MASK_NONE;
2819 	unsigned short mode = MPOL_DEFAULT;
2820 	unsigned short flags = 0;
2821 
2822 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2823 		mode = pol->mode;
2824 		flags = pol->flags;
2825 	}
2826 
2827 	switch (mode) {
2828 	case MPOL_DEFAULT:
2829 		break;
2830 	case MPOL_PREFERRED:
2831 		if (flags & MPOL_F_LOCAL)
2832 			mode = MPOL_LOCAL;
2833 		else
2834 			node_set(pol->v.preferred_node, nodes);
2835 		break;
2836 	case MPOL_BIND:
2837 	case MPOL_INTERLEAVE:
2838 		nodes = pol->v.nodes;
2839 		break;
2840 	default:
2841 		WARN_ON_ONCE(1);
2842 		snprintf(p, maxlen, "unknown");
2843 		return;
2844 	}
2845 
2846 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2847 
2848 	if (flags & MPOL_MODE_FLAGS) {
2849 		p += snprintf(p, buffer + maxlen - p, "=");
2850 
2851 		/*
2852 		 * Currently, the only defined flags are mutually exclusive
2853 		 */
2854 		if (flags & MPOL_F_STATIC_NODES)
2855 			p += snprintf(p, buffer + maxlen - p, "static");
2856 		else if (flags & MPOL_F_RELATIVE_NODES)
2857 			p += snprintf(p, buffer + maxlen - p, "relative");
2858 	}
2859 
2860 	if (!nodes_empty(nodes))
2861 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2862 			       nodemask_pr_args(&nodes));
2863 }
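
/*
 * Hedged usage sketch (helper name hypothetical): format a task's policy
 * into a small stack buffer, much as the /proc numa_maps code does.  A
 * NULL mempolicy is rendered as "default".
 */
static void example_log_task_policy(struct task_struct *tsk)
{
	char buf[64];

	task_lock(tsk);
	mpol_to_str(buf, sizeof(buf), tsk->mempolicy);
	task_unlock(tsk);
	pr_info("pid %d mempolicy: %s\n", task_pid_nr(tsk), buf);
}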
2864