xref: /openbmc/linux/mm/mempolicy.c (revision 174cd4b1)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind
26  *                truly restricted the allocation to the given nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux has always done
36  *                in a NUMA-aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
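/*
 * Editorial illustration, not part of the kernel source: a minimal userspace
 * sketch of the policies described above, assuming the mbind(2) and
 * set_mempolicy(2) wrappers from libnuma's <numaif.h>.  Error handling is
 * omitted and the node numbers are arbitrary.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0   = 1UL << 0;
 *
 *	// Process policy: interleave future allocations over nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// VMA policy: bind one mapping to node 0, overriding the above.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */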
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/nodemask.h>
79 #include <linux/cpuset.h>
80 #include <linux/slab.h>
81 #include <linux/string.h>
82 #include <linux/export.h>
83 #include <linux/nsproxy.h>
84 #include <linux/interrupt.h>
85 #include <linux/init.h>
86 #include <linux/compat.h>
87 #include <linux/swap.h>
88 #include <linux/seq_file.h>
89 #include <linux/proc_fs.h>
90 #include <linux/migrate.h>
91 #include <linux/ksm.h>
92 #include <linux/rmap.h>
93 #include <linux/security.h>
94 #include <linux/syscalls.h>
95 #include <linux/ctype.h>
96 #include <linux/mm_inline.h>
97 #include <linux/mmu_notifier.h>
98 #include <linux/printk.h>
99 
100 #include <asm/tlbflush.h>
101 #include <linux/uaccess.h>
102 
103 #include "internal.h"
104 
105 /* Internal flags */
106 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
107 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
108 
109 static struct kmem_cache *policy_cache;
110 static struct kmem_cache *sn_cache;
111 
112 /* Highest zone. A specific allocation for a zone below that is not
113    policied. */
114 enum zone_type policy_zone = 0;
115 
116 /*
117  * run-time system-wide default policy => local allocation
118  */
119 static struct mempolicy default_policy = {
120 	.refcnt = ATOMIC_INIT(1), /* never free it */
121 	.mode = MPOL_PREFERRED,
122 	.flags = MPOL_F_LOCAL,
123 };
124 
125 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
126 
127 struct mempolicy *get_task_policy(struct task_struct *p)
128 {
129 	struct mempolicy *pol = p->mempolicy;
130 	int node;
131 
132 	if (pol)
133 		return pol;
134 
135 	node = numa_node_id();
136 	if (node != NUMA_NO_NODE) {
137 		pol = &preferred_node_policy[node];
138 		/* preferred_node_policy is not initialised early in boot */
139 		if (pol->mode)
140 			return pol;
141 	}
142 
143 	return &default_policy;
144 }
145 
146 static const struct mempolicy_operations {
147 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
148 	/*
149 	 * If the read-side task has no lock to protect task->mempolicy, the
150 	 * write side rebinds task->mempolicy in two steps: first set all the
151 	 * newly allowed nodes, then clear all the disallowed nodes. A reader
152 	 * thus never observes an empty nodemask and can always find a node
153 	 * to allocate a page from. If the read side does hold a lock, we
154 	 * rebind directly in one step (a worked sketch follows this
155 	 * declaration).
156 	 *
157 	 * step:
158 	 * 	MPOL_REBIND_ONCE - do the rebind work at once
159 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
160 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
161 	 */
162 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
163 			enum mpol_rebind_step step);
164 } mpol_ops[MPOL_MAX];
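/*
 * Editorial illustration, not part of the kernel source: the two-step rebind
 * described above is simply "grow first, shrink second".  A standalone
 * sketch with a plain bitmask (hypothetical helper, not a kernel API):
 *
 *	static void rebind_two_step(unsigned long *mask, unsigned long newmask)
 *	{
 *		*mask |= newmask;	// STEP1: add the newly allowed nodes
 *		// a lockless reader now sees old|new, never the empty set
 *		*mask &= newmask;	// STEP2: drop the disallowed nodes
 *	}
 *
 * E.g. rebinding {0,1} -> {2,3}: after STEP1 the mask is {0,1,2,3}, after
 * STEP2 it is {2,3}.  Intersecting first would briefly expose an empty mask.
 */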
165 
166 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
167 {
168 	return pol->flags & MPOL_MODE_FLAGS;
169 }
170 
171 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
172 				   const nodemask_t *rel)
173 {
174 	nodemask_t tmp;
175 	nodes_fold(tmp, *orig, nodes_weight(*rel));
176 	nodes_onto(*ret, tmp, *rel);
177 }
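/*
 * Editorial illustration, not part of the kernel source: a worked example of
 * the relative-nodes remap above (my reading of nodes_fold()/nodes_onto(),
 * so treat the exact mapping as an assumption).  With a user nodemask of
 * {0,2} and an allowed set of {4,5,6} (weight 3):
 *
 *	nodes_fold: {0,2} folded modulo 3          -> {0,2}
 *	nodes_onto: bit 0 -> 1st node of {4,5,6}   -> 4
 *	            bit 2 -> 3rd node of {4,5,6}   -> 6
 *
 * so the effective nodemask becomes {4,6}: the user's bits are interpreted
 * as positions within the currently allowed set rather than as node numbers.
 */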
178 
179 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (nodes_empty(*nodes))
182 		return -EINVAL;
183 	pol->v.nodes = *nodes;
184 	return 0;
185 }
186 
187 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
188 {
189 	if (!nodes)
190 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
191 	else if (nodes_empty(*nodes))
192 		return -EINVAL;			/*  no allowed nodes */
193 	else
194 		pol->v.preferred_node = first_node(*nodes);
195 	return 0;
196 }
197 
198 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
199 {
200 	if (nodes_empty(*nodes))
201 		return -EINVAL;
202 	pol->v.nodes = *nodes;
203 	return 0;
204 }
205 
206 /*
207  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
208  * any, for the new policy.  mpol_new() has already validated the nodes
209  * parameter with respect to the policy mode and flags.  But, we need to
210  * handle an empty nodemask with MPOL_PREFERRED here.
211  *
212  * Must be called holding task's alloc_lock to protect task's mems_allowed
213  * and mempolicy.  May also be called holding the mmap_semaphore for write.
214  */
215 static int mpol_set_nodemask(struct mempolicy *pol,
216 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
217 {
218 	int ret;
219 
220 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
221 	if (pol == NULL)
222 		return 0;
223 	/* Check N_MEMORY */
224 	nodes_and(nsc->mask1,
225 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
226 
227 	VM_BUG_ON(!nodes);
228 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
229 		nodes = NULL;	/* explicit local allocation */
230 	else {
231 		if (pol->flags & MPOL_F_RELATIVE_NODES)
232 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
233 		else
234 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
235 
236 		if (mpol_store_user_nodemask(pol))
237 			pol->w.user_nodemask = *nodes;
238 		else
239 			pol->w.cpuset_mems_allowed =
240 						cpuset_current_mems_allowed;
241 	}
242 
243 	if (nodes)
244 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
245 	else
246 		ret = mpol_ops[pol->mode].create(pol, NULL);
247 	return ret;
248 }
249 
250 /*
251  * This function just creates a new policy, does some sanity checking and
252  * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
253  */
254 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
255 				  nodemask_t *nodes)
256 {
257 	struct mempolicy *policy;
258 
259 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
260 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
261 
262 	if (mode == MPOL_DEFAULT) {
263 		if (nodes && !nodes_empty(*nodes))
264 			return ERR_PTR(-EINVAL);
265 		return NULL;
266 	}
267 	VM_BUG_ON(!nodes);
268 
269 	/*
270 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
271 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
272 	 * All other modes require a valid pointer to a non-empty nodemask.
273 	 */
274 	if (mode == MPOL_PREFERRED) {
275 		if (nodes_empty(*nodes)) {
276 			if (((flags & MPOL_F_STATIC_NODES) ||
277 			     (flags & MPOL_F_RELATIVE_NODES)))
278 				return ERR_PTR(-EINVAL);
279 		}
280 	} else if (mode == MPOL_LOCAL) {
281 		if (!nodes_empty(*nodes) ||
282 		    (flags & MPOL_F_STATIC_NODES) ||
283 		    (flags & MPOL_F_RELATIVE_NODES))
284 			return ERR_PTR(-EINVAL);
285 		mode = MPOL_PREFERRED;
286 	} else if (nodes_empty(*nodes))
287 		return ERR_PTR(-EINVAL);
288 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
289 	if (!policy)
290 		return ERR_PTR(-ENOMEM);
291 	atomic_set(&policy->refcnt, 1);
292 	policy->mode = mode;
293 	policy->flags = flags;
294 
295 	return policy;
296 }
297 
298 /* Slow path of the mempolicy destructor. */
299 void __mpol_put(struct mempolicy *p)
300 {
301 	if (!atomic_dec_and_test(&p->refcnt))
302 		return;
303 	kmem_cache_free(policy_cache, p);
304 }
305 
306 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
307 				enum mpol_rebind_step step)
308 {
309 }
310 
311 /*
312  * step:
313  * 	MPOL_REBIND_ONCE  - do the rebind work at once
314  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
315  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
316  */
317 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
318 				 enum mpol_rebind_step step)
319 {
320 	nodemask_t tmp;
321 
322 	if (pol->flags & MPOL_F_STATIC_NODES)
323 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
324 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
325 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
326 	else {
327 		/*
328 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
329 		 * result
330 		 */
331 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
332 			nodes_remap(tmp, pol->v.nodes,
333 					pol->w.cpuset_mems_allowed, *nodes);
334 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
335 		} else if (step == MPOL_REBIND_STEP2) {
336 			tmp = pol->w.cpuset_mems_allowed;
337 			pol->w.cpuset_mems_allowed = *nodes;
338 		} else
339 			BUG();
340 	}
341 
342 	if (nodes_empty(tmp))
343 		tmp = *nodes;
344 
345 	if (step == MPOL_REBIND_STEP1)
346 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
347 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
348 		pol->v.nodes = tmp;
349 	else
350 		BUG();
351 
352 	if (!node_isset(current->il_next, tmp)) {
353 		current->il_next = next_node_in(current->il_next, tmp);
354 		if (current->il_next >= MAX_NUMNODES)
355 			current->il_next = numa_node_id();
356 	}
357 }
358 
359 static void mpol_rebind_preferred(struct mempolicy *pol,
360 				  const nodemask_t *nodes,
361 				  enum mpol_rebind_step step)
362 {
363 	nodemask_t tmp;
364 
365 	if (pol->flags & MPOL_F_STATIC_NODES) {
366 		int node = first_node(pol->w.user_nodemask);
367 
368 		if (node_isset(node, *nodes)) {
369 			pol->v.preferred_node = node;
370 			pol->flags &= ~MPOL_F_LOCAL;
371 		} else
372 			pol->flags |= MPOL_F_LOCAL;
373 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
374 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
375 		pol->v.preferred_node = first_node(tmp);
376 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
377 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
378 						   pol->w.cpuset_mems_allowed,
379 						   *nodes);
380 		pol->w.cpuset_mems_allowed = *nodes;
381 	}
382 }
383 
384 /*
385  * mpol_rebind_policy - Migrate a policy to a different set of nodes
386  *
387  * If the read-side task has no lock to protect task->mempolicy, the
388  * write side rebinds task->mempolicy in two steps: first set all the
389  * newly allowed nodes, then clear all the disallowed nodes. A reader
390  * thus never observes an empty nodemask and can always find a node to
391  * allocate a page from.
392  * If the read side does hold a lock protecting task->mempolicy, we
393  * rebind directly in one step.
394  *
395  * step:
396  * 	MPOL_REBIND_ONCE  - do the rebind work at once
397  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
398  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
399  */
400 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
401 				enum mpol_rebind_step step)
402 {
403 	if (!pol)
404 		return;
405 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
406 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
407 		return;
408 
409 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
410 		return;
411 
412 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
413 		BUG();
414 
415 	if (step == MPOL_REBIND_STEP1)
416 		pol->flags |= MPOL_F_REBINDING;
417 	else if (step == MPOL_REBIND_STEP2)
418 		pol->flags &= ~MPOL_F_REBINDING;
419 	else if (step >= MPOL_REBIND_NSTEP)
420 		BUG();
421 
422 	mpol_ops[pol->mode].rebind(pol, newmask, step);
423 }
424 
425 /*
426  * Wrapper for mpol_rebind_policy() that just requires task
427  * pointer, and updates task mempolicy.
428  *
429  * Called with task's alloc_lock held.
430  */
431 
432 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
433 			enum mpol_rebind_step step)
434 {
435 	mpol_rebind_policy(tsk->mempolicy, new, step);
436 }
437 
438 /*
439  * Rebind each vma in mm to new nodemask.
440  *
441  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
442  */
443 
444 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
445 {
446 	struct vm_area_struct *vma;
447 
448 	down_write(&mm->mmap_sem);
449 	for (vma = mm->mmap; vma; vma = vma->vm_next)
450 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
451 	up_write(&mm->mmap_sem);
452 }
453 
454 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
455 	[MPOL_DEFAULT] = {
456 		.rebind = mpol_rebind_default,
457 	},
458 	[MPOL_INTERLEAVE] = {
459 		.create = mpol_new_interleave,
460 		.rebind = mpol_rebind_nodemask,
461 	},
462 	[MPOL_PREFERRED] = {
463 		.create = mpol_new_preferred,
464 		.rebind = mpol_rebind_preferred,
465 	},
466 	[MPOL_BIND] = {
467 		.create = mpol_new_bind,
468 		.rebind = mpol_rebind_nodemask,
469 	},
470 };
471 
472 static void migrate_page_add(struct page *page, struct list_head *pagelist,
473 				unsigned long flags);
474 
475 struct queue_pages {
476 	struct list_head *pagelist;
477 	unsigned long flags;
478 	nodemask_t *nmask;
479 	struct vm_area_struct *prev;
480 };
481 
482 /*
483  * Scan through the pages, checking whether they satisfy the required
484  * conditions, and move them to the pagelist if they do.
485  */
486 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
487 			unsigned long end, struct mm_walk *walk)
488 {
489 	struct vm_area_struct *vma = walk->vma;
490 	struct page *page;
491 	struct queue_pages *qp = walk->private;
492 	unsigned long flags = qp->flags;
493 	int nid, ret;
494 	pte_t *pte;
495 	spinlock_t *ptl;
496 
497 	if (pmd_trans_huge(*pmd)) {
498 		ptl = pmd_lock(walk->mm, pmd);
499 		if (pmd_trans_huge(*pmd)) {
500 			page = pmd_page(*pmd);
501 			if (is_huge_zero_page(page)) {
502 				spin_unlock(ptl);
503 				__split_huge_pmd(vma, pmd, addr, false, NULL);
504 			} else {
505 				get_page(page);
506 				spin_unlock(ptl);
507 				lock_page(page);
508 				ret = split_huge_page(page);
509 				unlock_page(page);
510 				put_page(page);
511 				if (ret)
512 					return 0;
513 			}
514 		} else {
515 			spin_unlock(ptl);
516 		}
517 	}
518 
519 	if (pmd_trans_unstable(pmd))
520 		return 0;
521 retry:
522 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
523 	for (; addr != end; pte++, addr += PAGE_SIZE) {
524 		if (!pte_present(*pte))
525 			continue;
526 		page = vm_normal_page(vma, addr, *pte);
527 		if (!page)
528 			continue;
529 		/*
530 		 * vm_normal_page() filters out zero pages, but there might
531 		 * still be PageReserved pages to skip, perhaps in a VDSO.
532 		 */
533 		if (PageReserved(page))
534 			continue;
535 		nid = page_to_nid(page);
536 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
537 			continue;
538 		if (PageTransCompound(page)) {
539 			get_page(page);
540 			pte_unmap_unlock(pte, ptl);
541 			lock_page(page);
542 			ret = split_huge_page(page);
543 			unlock_page(page);
544 			put_page(page);
545 			/* Failed to split -- skip. */
546 			if (ret) {
547 				pte = pte_offset_map_lock(walk->mm, pmd,
548 						addr, &ptl);
549 				continue;
550 			}
551 			goto retry;
552 		}
553 
554 		migrate_page_add(page, qp->pagelist, flags);
555 	}
556 	pte_unmap_unlock(pte - 1, ptl);
557 	cond_resched();
558 	return 0;
559 }
560 
561 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
562 			       unsigned long addr, unsigned long end,
563 			       struct mm_walk *walk)
564 {
565 #ifdef CONFIG_HUGETLB_PAGE
566 	struct queue_pages *qp = walk->private;
567 	unsigned long flags = qp->flags;
568 	int nid;
569 	struct page *page;
570 	spinlock_t *ptl;
571 	pte_t entry;
572 
573 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
574 	entry = huge_ptep_get(pte);
575 	if (!pte_present(entry))
576 		goto unlock;
577 	page = pte_page(entry);
578 	nid = page_to_nid(page);
579 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
580 		goto unlock;
581 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
582 	if (flags & (MPOL_MF_MOVE_ALL) ||
583 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
584 		isolate_huge_page(page, qp->pagelist);
585 unlock:
586 	spin_unlock(ptl);
587 #else
588 	BUG();
589 #endif
590 	return 0;
591 }
592 
593 #ifdef CONFIG_NUMA_BALANCING
594 /*
595  * This is used to mark a range of virtual addresses to be inaccessible.
596  * These are later cleared by a NUMA hinting fault. Depending on these
597  * faults, pages may be migrated for better NUMA placement.
598  *
599  * This assumes that NUMA faults are handled using PROT_NONE. If
600  * an architecture makes a different choice, it will need further
601  * changes to the core.
602  */
603 unsigned long change_prot_numa(struct vm_area_struct *vma,
604 			unsigned long addr, unsigned long end)
605 {
606 	int nr_updated;
607 
608 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
609 	if (nr_updated)
610 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
611 
612 	return nr_updated;
613 }
614 #else
615 static unsigned long change_prot_numa(struct vm_area_struct *vma,
616 			unsigned long addr, unsigned long end)
617 {
618 	return 0;
619 }
620 #endif /* CONFIG_NUMA_BALANCING */
621 
622 static int queue_pages_test_walk(unsigned long start, unsigned long end,
623 				struct mm_walk *walk)
624 {
625 	struct vm_area_struct *vma = walk->vma;
626 	struct queue_pages *qp = walk->private;
627 	unsigned long endvma = vma->vm_end;
628 	unsigned long flags = qp->flags;
629 
630 	if (!vma_migratable(vma))
631 		return 1;
632 
633 	if (endvma > end)
634 		endvma = end;
635 	if (vma->vm_start > start)
636 		start = vma->vm_start;
637 
638 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
639 		if (!vma->vm_next && vma->vm_end < end)
640 			return -EFAULT;
641 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
642 			return -EFAULT;
643 	}
644 
645 	qp->prev = vma;
646 
647 	if (flags & MPOL_MF_LAZY) {
648 		/* Similar to task_numa_work, skip inaccessible VMAs */
649 		if (!is_vm_hugetlb_page(vma) &&
650 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
651 			!(vma->vm_flags & VM_MIXEDMAP))
652 			change_prot_numa(vma, start, endvma);
653 		return 1;
654 	}
655 
656 	/* queue pages from current vma */
657 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
658 		return 0;
659 	return 1;
660 }
661 
662 /*
663  * Walk through page tables and collect pages to be migrated.
664  *
665  * If pages found in a given range are on a set of nodes (determined by
666  * @nodes and @flags), they are isolated and queued to the pagelist which is
667  * passed via @private.
668  */
669 static int
670 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
671 		nodemask_t *nodes, unsigned long flags,
672 		struct list_head *pagelist)
673 {
674 	struct queue_pages qp = {
675 		.pagelist = pagelist,
676 		.flags = flags,
677 		.nmask = nodes,
678 		.prev = NULL,
679 	};
680 	struct mm_walk queue_pages_walk = {
681 		.hugetlb_entry = queue_pages_hugetlb,
682 		.pmd_entry = queue_pages_pte_range,
683 		.test_walk = queue_pages_test_walk,
684 		.mm = mm,
685 		.private = &qp,
686 	};
687 
688 	return walk_page_range(start, end, &queue_pages_walk);
689 }
690 
691 /*
692  * Apply policy to a single VMA
693  * This must be called with the mmap_sem held for writing.
694  */
695 static int vma_replace_policy(struct vm_area_struct *vma,
696 						struct mempolicy *pol)
697 {
698 	int err;
699 	struct mempolicy *old;
700 	struct mempolicy *new;
701 
702 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
703 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
704 		 vma->vm_ops, vma->vm_file,
705 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
706 
707 	new = mpol_dup(pol);
708 	if (IS_ERR(new))
709 		return PTR_ERR(new);
710 
711 	if (vma->vm_ops && vma->vm_ops->set_policy) {
712 		err = vma->vm_ops->set_policy(vma, new);
713 		if (err)
714 			goto err_out;
715 	}
716 
717 	old = vma->vm_policy;
718 	vma->vm_policy = new; /* protected by mmap_sem */
719 	mpol_put(old);
720 
721 	return 0;
722  err_out:
723 	mpol_put(new);
724 	return err;
725 }
726 
727 /* Step 2: apply policy to a range and do splits. */
728 static int mbind_range(struct mm_struct *mm, unsigned long start,
729 		       unsigned long end, struct mempolicy *new_pol)
730 {
731 	struct vm_area_struct *next;
732 	struct vm_area_struct *prev;
733 	struct vm_area_struct *vma;
734 	int err = 0;
735 	pgoff_t pgoff;
736 	unsigned long vmstart;
737 	unsigned long vmend;
738 
739 	vma = find_vma(mm, start);
740 	if (!vma || vma->vm_start > start)
741 		return -EFAULT;
742 
743 	prev = vma->vm_prev;
744 	if (start > vma->vm_start)
745 		prev = vma;
746 
747 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
748 		next = vma->vm_next;
749 		vmstart = max(start, vma->vm_start);
750 		vmend   = min(end, vma->vm_end);
751 
752 		if (mpol_equal(vma_policy(vma), new_pol))
753 			continue;
754 
755 		pgoff = vma->vm_pgoff +
756 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
757 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
758 				 vma->anon_vma, vma->vm_file, pgoff,
759 				 new_pol, vma->vm_userfaultfd_ctx);
760 		if (prev) {
761 			vma = prev;
762 			next = vma->vm_next;
763 			if (mpol_equal(vma_policy(vma), new_pol))
764 				continue;
765 			/* vma_merge() joined vma && vma->next, case 8 */
766 			goto replace;
767 		}
768 		if (vma->vm_start != vmstart) {
769 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
770 			if (err)
771 				goto out;
772 		}
773 		if (vma->vm_end != vmend) {
774 			err = split_vma(vma->vm_mm, vma, vmend, 0);
775 			if (err)
776 				goto out;
777 		}
778  replace:
779 		err = vma_replace_policy(vma, new_pol);
780 		if (err)
781 			goto out;
782 	}
783 
784  out:
785 	return err;
786 }
787 
788 /* Set the process memory policy */
789 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
790 			     nodemask_t *nodes)
791 {
792 	struct mempolicy *new, *old;
793 	NODEMASK_SCRATCH(scratch);
794 	int ret;
795 
796 	if (!scratch)
797 		return -ENOMEM;
798 
799 	new = mpol_new(mode, flags, nodes);
800 	if (IS_ERR(new)) {
801 		ret = PTR_ERR(new);
802 		goto out;
803 	}
804 
805 	task_lock(current);
806 	ret = mpol_set_nodemask(new, nodes, scratch);
807 	if (ret) {
808 		task_unlock(current);
809 		mpol_put(new);
810 		goto out;
811 	}
812 	old = current->mempolicy;
813 	current->mempolicy = new;
814 	if (new && new->mode == MPOL_INTERLEAVE &&
815 	    nodes_weight(new->v.nodes))
816 		current->il_next = first_node(new->v.nodes);
817 	task_unlock(current);
818 	mpol_put(old);
819 	ret = 0;
820 out:
821 	NODEMASK_SCRATCH_FREE(scratch);
822 	return ret;
823 }
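/*
 * Editorial illustration, not part of the kernel source: the mode flags that
 * mpol_set_nodemask() records (MPOL_F_STATIC_NODES, MPOL_F_RELATIVE_NODES)
 * control how the policy is rebound when the task's cpuset changes.  A
 * hedged userspace sketch, assuming <numaif.h>:
 *
 *	unsigned long mask = (1UL << 1) | (1UL << 3);
 *
 *	// Default: nodes 1,3 are remapped if mems_allowed changes later.
 *	set_mempolicy(MPOL_BIND, &mask, 8 * sizeof(mask));
 *
 *	// Static: stay on physical nodes 1,3; never remapped.
 *	set_mempolicy(MPOL_BIND | MPOL_F_STATIC_NODES,
 *		      &mask, 8 * sizeof(mask));
 *
 *	// Relative: mean the 2nd and 4th of whatever nodes are allowed.
 *	set_mempolicy(MPOL_BIND | MPOL_F_RELATIVE_NODES,
 *		      &mask, 8 * sizeof(mask));
 */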
824 
825 /*
826  * Return the nodemask of a policy, for the get_mempolicy() query
827  *
828  * Called with task's alloc_lock held
829  */
830 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
831 {
832 	nodes_clear(*nodes);
833 	if (p == &default_policy)
834 		return;
835 
836 	switch (p->mode) {
837 	case MPOL_BIND:
838 		/* Fall through */
839 	case MPOL_INTERLEAVE:
840 		*nodes = p->v.nodes;
841 		break;
842 	case MPOL_PREFERRED:
843 		if (!(p->flags & MPOL_F_LOCAL))
844 			node_set(p->v.preferred_node, *nodes);
845 		/* else return empty node mask for local allocation */
846 		break;
847 	default:
848 		BUG();
849 	}
850 }
851 
852 static int lookup_node(unsigned long addr)
853 {
854 	struct page *p;
855 	int err;
856 
857 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
858 	if (err >= 0) {
859 		err = page_to_nid(p);
860 		put_page(p);
861 	}
862 	return err;
863 }
864 
865 /* Retrieve NUMA policy */
866 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
867 			     unsigned long addr, unsigned long flags)
868 {
869 	int err;
870 	struct mm_struct *mm = current->mm;
871 	struct vm_area_struct *vma = NULL;
872 	struct mempolicy *pol = current->mempolicy;
873 
874 	if (flags &
875 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
876 		return -EINVAL;
877 
878 	if (flags & MPOL_F_MEMS_ALLOWED) {
879 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
880 			return -EINVAL;
881 		*policy = 0;	/* just so it's initialized */
882 		task_lock(current);
883 		*nmask  = cpuset_current_mems_allowed;
884 		task_unlock(current);
885 		return 0;
886 	}
887 
888 	if (flags & MPOL_F_ADDR) {
889 		/*
890 		 * Do NOT fall back to task policy if the
891 		 * vma/shared policy at addr is NULL.  We
892 		 * want to return MPOL_DEFAULT in this case.
893 		 */
894 		down_read(&mm->mmap_sem);
895 		vma = find_vma_intersection(mm, addr, addr+1);
896 		if (!vma) {
897 			up_read(&mm->mmap_sem);
898 			return -EFAULT;
899 		}
900 		if (vma->vm_ops && vma->vm_ops->get_policy)
901 			pol = vma->vm_ops->get_policy(vma, addr);
902 		else
903 			pol = vma->vm_policy;
904 	} else if (addr)
905 		return -EINVAL;
906 
907 	if (!pol)
908 		pol = &default_policy;	/* indicates default behavior */
909 
910 	if (flags & MPOL_F_NODE) {
911 		if (flags & MPOL_F_ADDR) {
912 			err = lookup_node(addr);
913 			if (err < 0)
914 				goto out;
915 			*policy = err;
916 		} else if (pol == current->mempolicy &&
917 				pol->mode == MPOL_INTERLEAVE) {
918 			*policy = current->il_next;
919 		} else {
920 			err = -EINVAL;
921 			goto out;
922 		}
923 	} else {
924 		*policy = pol == &default_policy ? MPOL_DEFAULT :
925 						pol->mode;
926 		/*
927 		 * Internal mempolicy flags must be masked off before exposing
928 		 * the policy to userspace.
929 		 */
930 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
931 	}
932 
933 	if (vma) {
934 		up_read(&current->mm->mmap_sem);
935 		vma = NULL;
936 	}
937 
938 	err = 0;
939 	if (nmask) {
940 		if (mpol_store_user_nodemask(pol)) {
941 			*nmask = pol->w.user_nodemask;
942 		} else {
943 			task_lock(current);
944 			get_policy_nodemask(pol, nmask);
945 			task_unlock(current);
946 		}
947 	}
948 
949  out:
950 	mpol_cond_put(pol);
951 	if (vma)
952 		up_read(&current->mm->mmap_sem);
953 	return err;
954 }
955 
956 #ifdef CONFIG_MIGRATION
957 /*
958  * page migration
959  */
960 static void migrate_page_add(struct page *page, struct list_head *pagelist,
961 				unsigned long flags)
962 {
963 	/*
964 	 * Avoid migrating a page that is shared with others.
965 	 */
966 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
967 		if (!isolate_lru_page(page)) {
968 			list_add_tail(&page->lru, pagelist);
969 			inc_node_page_state(page, NR_ISOLATED_ANON +
970 					    page_is_file_cache(page));
971 		}
972 	}
973 }
974 
975 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
976 {
977 	if (PageHuge(page))
978 		return alloc_huge_page_node(page_hstate(compound_head(page)),
979 					node);
980 	else
981 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
982 						    __GFP_THISNODE, 0);
983 }
984 
985 /*
986  * Migrate pages from one node to a target node.
987  * Returns error or the number of pages not migrated.
988  */
989 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
990 			   int flags)
991 {
992 	nodemask_t nmask;
993 	LIST_HEAD(pagelist);
994 	int err = 0;
995 
996 	nodes_clear(nmask);
997 	node_set(source, nmask);
998 
999 	/*
1000 	 * This does not "check" the range but isolates all pages that
1001 	 * need migration.  Between passing in the full user address
1002 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1003 	 */
1004 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1005 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1006 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1007 
1008 	if (!list_empty(&pagelist)) {
1009 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1010 					MIGRATE_SYNC, MR_SYSCALL);
1011 		if (err)
1012 			putback_movable_pages(&pagelist);
1013 	}
1014 
1015 	return err;
1016 }
1017 
1018 /*
1019  * Move pages between the two nodesets so as to preserve the physical
1020  * layout as much as possible.
1021  *
1022  * Returns the number of pages that could not be moved.
1023  */
1024 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1025 		     const nodemask_t *to, int flags)
1026 {
1027 	int busy = 0;
1028 	int err;
1029 	nodemask_t tmp;
1030 
1031 	err = migrate_prep();
1032 	if (err)
1033 		return err;
1034 
1035 	down_read(&mm->mmap_sem);
1036 
1037 	/*
1038 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1039 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1040 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1041 	 * The pair of nodemasks 'to' and 'from' define the map.
1042 	 *
1043 	 * If no pair of bits is found that way, fall back to picking some
1044 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1045 	 * 'source' and 'dest' bits are the same, this represents a node
1046 	 * that will be migrating to itself, so no pages need move.
1047 	 *
1048 	 * If no bits are left in 'tmp', or if all remaining bits left
1049 	 * in 'tmp' correspond to the same bit in 'to', return false
1050 	 * (nothing left to migrate).
1051 	 *
1052 	 * This lets us pick a pair of nodes to migrate between, such that
1053 	 * if possible the dest node is not already occupied by some other
1054 	 * source node, minimizing the risk of overloading the memory on a
1055 	 * node that would happen if we migrated incoming memory to a node
1056 	 * before migrating outgoing memory sourced from that same node.
1057 	 *
1058 	 * A single scan of tmp is sufficient.  As we go, we remember the
1059 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1060 	 * that not only moved, but what's better, moved to an empty slot
1061 	 * (d is not set in tmp), then we break out then, with that pair.
1062 	 * Otherwise, when we finish scanning tmp, we at least have the
1063 	 * most recent <s, d> pair that moved.  If we get all the way through
1064 	 * the scan of tmp without finding any node that moved, much less moved
1065 	 * to an empty node, nothing is left worth migrating (see trace below).
1066 	 */
1067 
1068 	tmp = *from;
1069 	while (!nodes_empty(tmp)) {
1070 		int s,d;
1071 		int source = NUMA_NO_NODE;
1072 		int dest = 0;
1073 
1074 		for_each_node_mask(s, tmp) {
1075 
1076 			/*
1077 			 * do_migrate_pages() tries to maintain the relative
1078 			 * node relationship of the pages established between
1079 			 * threads and memory areas.
1080 			 *
1081 			 * However if the number of source nodes is not equal to
1082 			 * the number of destination nodes we can not preserve
1083 			 * this node relative relationship.  In that case, skip
1084 			 * copying memory from a node that is in the destination
1085 			 * mask.
1086 			 *
1087 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1088 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1089 			 */
1090 
1091 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1092 						(node_isset(s, *to)))
1093 				continue;
1094 
1095 			d = node_remap(s, *from, *to);
1096 			if (s == d)
1097 				continue;
1098 
1099 			source = s;	/* Node moved. Memorize */
1100 			dest = d;
1101 
1102 			/* dest not in remaining from nodes? */
1103 			if (!node_isset(dest, tmp))
1104 				break;
1105 		}
1106 		if (source == NUMA_NO_NODE)
1107 			break;
1108 
1109 		node_clear(source, tmp);
1110 		err = migrate_to_node(mm, source, dest, flags);
1111 		if (err > 0)
1112 			busy += err;
1113 		if (err < 0)
1114 			break;
1115 	}
1116 	up_read(&mm->mmap_sem);
1117 	if (err < 0)
1118 		return err;
1119 	return busy;
1120 
1121 }
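/*
 * Editorial illustration, not part of the kernel source: a worked trace of
 * the pair-picking loop above for from = {0,1,2}, to = {3,4,5}.  On the
 * first pass tmp = {0,1,2}; node_remap() gives 0->3, and since 3 is not in
 * tmp the inner scan stops at that pair.  Successive passes migrate <0,3>,
 * <1,4>, <2,5>, clearing each source from tmp, so the relative layout is
 * preserved.  For from = {0..7}, to = {3,4,5} the weights differ, so nodes
 * 3, 4 and 5 are skipped as sources and only 0, 1, 2, 6 and 7 are moved,
 * matching the example in the comment above.
 */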
1122 
1123 /*
1124  * Allocate a new page for page migration based on vma policy.
1125  * Start by assuming the page is mapped by the same vma that contains @start.
1126  * If not, search forward from there.  N.B., this assumes that the
1127  * list of pages handed to migrate_pages()--which is how we get here--
1128  * is in virtual address order.
1129  */
1130 static struct page *new_page(struct page *page, unsigned long start, int **x)
1131 {
1132 	struct vm_area_struct *vma;
1133 	unsigned long uninitialized_var(address);
1134 
1135 	vma = find_vma(current->mm, start);
1136 	while (vma) {
1137 		address = page_address_in_vma(page, vma);
1138 		if (address != -EFAULT)
1139 			break;
1140 		vma = vma->vm_next;
1141 	}
1142 
1143 	if (PageHuge(page)) {
1144 		BUG_ON(!vma);
1145 		return alloc_huge_page_noerr(vma, address, 1);
1146 	}
1147 	/*
1148 	 * if !vma, alloc_page_vma() will use task or system default policy
1149 	 */
1150 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1151 }
1152 #else
1153 
1154 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1155 				unsigned long flags)
1156 {
1157 }
1158 
1159 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1160 		     const nodemask_t *to, int flags)
1161 {
1162 	return -ENOSYS;
1163 }
1164 
1165 static struct page *new_page(struct page *page, unsigned long start, int **x)
1166 {
1167 	return NULL;
1168 }
1169 #endif
1170 
1171 static long do_mbind(unsigned long start, unsigned long len,
1172 		     unsigned short mode, unsigned short mode_flags,
1173 		     nodemask_t *nmask, unsigned long flags)
1174 {
1175 	struct mm_struct *mm = current->mm;
1176 	struct mempolicy *new;
1177 	unsigned long end;
1178 	int err;
1179 	LIST_HEAD(pagelist);
1180 
1181 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1182 		return -EINVAL;
1183 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1184 		return -EPERM;
1185 
1186 	if (start & ~PAGE_MASK)
1187 		return -EINVAL;
1188 
1189 	if (mode == MPOL_DEFAULT)
1190 		flags &= ~MPOL_MF_STRICT;
1191 
1192 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1193 	end = start + len;
1194 
1195 	if (end < start)
1196 		return -EINVAL;
1197 	if (end == start)
1198 		return 0;
1199 
1200 	new = mpol_new(mode, mode_flags, nmask);
1201 	if (IS_ERR(new))
1202 		return PTR_ERR(new);
1203 
1204 	if (flags & MPOL_MF_LAZY)
1205 		new->flags |= MPOL_F_MOF;
1206 
1207 	/*
1208 	 * If we are using the default policy then operating
1209 	 * on discontinuous address spaces is okay after all.
1210 	 */
1211 	if (!new)
1212 		flags |= MPOL_MF_DISCONTIG_OK;
1213 
1214 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1215 		 start, start + len, mode, mode_flags,
1216 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1217 
1218 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1219 
1220 		err = migrate_prep();
1221 		if (err)
1222 			goto mpol_out;
1223 	}
1224 	{
1225 		NODEMASK_SCRATCH(scratch);
1226 		if (scratch) {
1227 			down_write(&mm->mmap_sem);
1228 			task_lock(current);
1229 			err = mpol_set_nodemask(new, nmask, scratch);
1230 			task_unlock(current);
1231 			if (err)
1232 				up_write(&mm->mmap_sem);
1233 		} else
1234 			err = -ENOMEM;
1235 		NODEMASK_SCRATCH_FREE(scratch);
1236 	}
1237 	if (err)
1238 		goto mpol_out;
1239 
1240 	err = queue_pages_range(mm, start, end, nmask,
1241 			  flags | MPOL_MF_INVERT, &pagelist);
1242 	if (!err)
1243 		err = mbind_range(mm, start, end, new);
1244 
1245 	if (!err) {
1246 		int nr_failed = 0;
1247 
1248 		if (!list_empty(&pagelist)) {
1249 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1250 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1251 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1252 			if (nr_failed)
1253 				putback_movable_pages(&pagelist);
1254 		}
1255 
1256 		if (nr_failed && (flags & MPOL_MF_STRICT))
1257 			err = -EIO;
1258 	} else
1259 		putback_movable_pages(&pagelist);
1260 
1261 	up_write(&mm->mmap_sem);
1262  mpol_out:
1263 	mpol_put(new);
1264 	return err;
1265 }
1266 
1267 /*
1268  * User space interface with variable sized bitmaps for nodelists.
1269  */
1270 
1271 /* Copy a node mask from user space. */
1272 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1273 		     unsigned long maxnode)
1274 {
1275 	unsigned long k;
1276 	unsigned long nlongs;
1277 	unsigned long endmask;
1278 
1279 	--maxnode;
1280 	nodes_clear(*nodes);
1281 	if (maxnode == 0 || !nmask)
1282 		return 0;
1283 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1284 		return -EINVAL;
1285 
1286 	nlongs = BITS_TO_LONGS(maxnode);
1287 	if ((maxnode % BITS_PER_LONG) == 0)
1288 		endmask = ~0UL;
1289 	else
1290 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1291 
1292 	/* When the user specified more nodes than supported, just check
1293 	   that the unsupported part is all zeroes. */
1294 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1295 		if (nlongs > PAGE_SIZE/sizeof(long))
1296 			return -EINVAL;
1297 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1298 			unsigned long t;
1299 			if (get_user(t, nmask + k))
1300 				return -EFAULT;
1301 			if (k == nlongs - 1) {
1302 				if (t & endmask)
1303 					return -EINVAL;
1304 			} else if (t)
1305 				return -EINVAL;
1306 		}
1307 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1308 		endmask = ~0UL;
1309 	}
1310 
1311 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1312 		return -EFAULT;
1313 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1314 	return 0;
1315 }
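/*
 * Editorial illustration, not part of the kernel source: as get_nodes()
 * shows, the userspace nodemask is an array of unsigned longs with one bit
 * per node, and maxnode counts bits (the kernel considers maxnode - 1 of
 * them).  A hedged sketch of passing nodes 0 and 2, assuming <numaif.h>:
 *
 *	unsigned long nodes[1] = { (1UL << 0) | (1UL << 2) };
 *	set_mempolicy(MPOL_INTERLEAVE, nodes, 8 * sizeof(nodes));
 *
 * Bits set in longs beyond the kernel's MAX_NUMNODES range make the call
 * fail with -EINVAL, as checked above.
 */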
1316 
1317 /* Copy a kernel node mask to user space */
1318 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1319 			      nodemask_t *nodes)
1320 {
1321 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1322 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1323 
1324 	if (copy > nbytes) {
1325 		if (copy > PAGE_SIZE)
1326 			return -EINVAL;
1327 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1328 			return -EFAULT;
1329 		copy = nbytes;
1330 	}
1331 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1332 }
1333 
1334 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1335 		unsigned long, mode, const unsigned long __user *, nmask,
1336 		unsigned long, maxnode, unsigned, flags)
1337 {
1338 	nodemask_t nodes;
1339 	int err;
1340 	unsigned short mode_flags;
1341 
1342 	mode_flags = mode & MPOL_MODE_FLAGS;
1343 	mode &= ~MPOL_MODE_FLAGS;
1344 	if (mode >= MPOL_MAX)
1345 		return -EINVAL;
1346 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1347 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1348 		return -EINVAL;
1349 	err = get_nodes(&nodes, nmask, maxnode);
1350 	if (err)
1351 		return err;
1352 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1353 }
1354 
1355 /* Set the process memory policy */
1356 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1357 		unsigned long, maxnode)
1358 {
1359 	int err;
1360 	nodemask_t nodes;
1361 	unsigned short flags;
1362 
1363 	flags = mode & MPOL_MODE_FLAGS;
1364 	mode &= ~MPOL_MODE_FLAGS;
1365 	if ((unsigned int)mode >= MPOL_MAX)
1366 		return -EINVAL;
1367 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1368 		return -EINVAL;
1369 	err = get_nodes(&nodes, nmask, maxnode);
1370 	if (err)
1371 		return err;
1372 	return do_set_mempolicy(mode, flags, &nodes);
1373 }
1374 
1375 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1376 		const unsigned long __user *, old_nodes,
1377 		const unsigned long __user *, new_nodes)
1378 {
1379 	const struct cred *cred = current_cred(), *tcred;
1380 	struct mm_struct *mm = NULL;
1381 	struct task_struct *task;
1382 	nodemask_t task_nodes;
1383 	int err;
1384 	nodemask_t *old;
1385 	nodemask_t *new;
1386 	NODEMASK_SCRATCH(scratch);
1387 
1388 	if (!scratch)
1389 		return -ENOMEM;
1390 
1391 	old = &scratch->mask1;
1392 	new = &scratch->mask2;
1393 
1394 	err = get_nodes(old, old_nodes, maxnode);
1395 	if (err)
1396 		goto out;
1397 
1398 	err = get_nodes(new, new_nodes, maxnode);
1399 	if (err)
1400 		goto out;
1401 
1402 	/* Find the mm_struct */
1403 	rcu_read_lock();
1404 	task = pid ? find_task_by_vpid(pid) : current;
1405 	if (!task) {
1406 		rcu_read_unlock();
1407 		err = -ESRCH;
1408 		goto out;
1409 	}
1410 	get_task_struct(task);
1411 
1412 	err = -EINVAL;
1413 
1414 	/*
1415 	 * Check if this process has the right to modify the specified
1416 	 * process. The right exists if the process has administrative
1417 	 * capabilities, superuser privileges or the same
1418 	 * userid as the target process.
1419 	 */
1420 	tcred = __task_cred(task);
1421 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1422 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1423 	    !capable(CAP_SYS_NICE)) {
1424 		rcu_read_unlock();
1425 		err = -EPERM;
1426 		goto out_put;
1427 	}
1428 	rcu_read_unlock();
1429 
1430 	task_nodes = cpuset_mems_allowed(task);
1431 	/* Is the user allowed to access the target nodes? */
1432 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1433 		err = -EPERM;
1434 		goto out_put;
1435 	}
1436 
1437 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1438 		err = -EINVAL;
1439 		goto out_put;
1440 	}
1441 
1442 	err = security_task_movememory(task);
1443 	if (err)
1444 		goto out_put;
1445 
1446 	mm = get_task_mm(task);
1447 	put_task_struct(task);
1448 
1449 	if (!mm) {
1450 		err = -EINVAL;
1451 		goto out;
1452 	}
1453 
1454 	err = do_migrate_pages(mm, old, new,
1455 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1456 
1457 	mmput(mm);
1458 out:
1459 	NODEMASK_SCRATCH_FREE(scratch);
1460 
1461 	return err;
1462 
1463 out_put:
1464 	put_task_struct(task);
1465 	goto out;
1466 
1467 }
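/*
 * Editorial illustration, not part of the kernel source: a minimal userspace
 * sketch of this syscall, assuming libnuma's <numaif.h> declaration of
 * migrate_pages() and a placeholder target pid.  It asks the kernel to move
 * that process's pages from node 0 to node 1, subject to the permission and
 * cpuset checks above:
 *
 *	unsigned long from = 1UL << 0;
 *	unsigned long to   = 1UL << 1;
 *	long ret = migrate_pages(target_pid, 8 * sizeof(unsigned long),
 *				 &from, &to);
 *	// ret < 0 on error, otherwise the number of pages not moved
 */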
1468 
1469 
1470 /* Retrieve NUMA policy */
1471 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1472 		unsigned long __user *, nmask, unsigned long, maxnode,
1473 		unsigned long, addr, unsigned long, flags)
1474 {
1475 	int err;
1476 	int uninitialized_var(pval);
1477 	nodemask_t nodes;
1478 
1479 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1480 		return -EINVAL;
1481 
1482 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1483 
1484 	if (err)
1485 		return err;
1486 
1487 	if (policy && put_user(pval, policy))
1488 		return -EFAULT;
1489 
1490 	if (nmask)
1491 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1492 
1493 	return err;
1494 }
1495 
1496 #ifdef CONFIG_COMPAT
1497 
1498 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1499 		       compat_ulong_t __user *, nmask,
1500 		       compat_ulong_t, maxnode,
1501 		       compat_ulong_t, addr, compat_ulong_t, flags)
1502 {
1503 	long err;
1504 	unsigned long __user *nm = NULL;
1505 	unsigned long nr_bits, alloc_size;
1506 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1507 
1508 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1509 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1510 
1511 	if (nmask)
1512 		nm = compat_alloc_user_space(alloc_size);
1513 
1514 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1515 
1516 	if (!err && nmask) {
1517 		unsigned long copy_size;
1518 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1519 		err = copy_from_user(bm, nm, copy_size);
1520 		/* ensure entire bitmap is zeroed */
1521 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1522 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1523 	}
1524 
1525 	return err;
1526 }
1527 
1528 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1529 		       compat_ulong_t, maxnode)
1530 {
1531 	long err = 0;
1532 	unsigned long __user *nm = NULL;
1533 	unsigned long nr_bits, alloc_size;
1534 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1535 
1536 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1537 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1538 
1539 	if (nmask) {
1540 		err = compat_get_bitmap(bm, nmask, nr_bits);
1541 		nm = compat_alloc_user_space(alloc_size);
1542 		err |= copy_to_user(nm, bm, alloc_size);
1543 	}
1544 
1545 	if (err)
1546 		return -EFAULT;
1547 
1548 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1549 }
1550 
1551 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1552 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1553 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1554 {
1555 	long err = 0;
1556 	unsigned long __user *nm = NULL;
1557 	unsigned long nr_bits, alloc_size;
1558 	nodemask_t bm;
1559 
1560 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1561 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1562 
1563 	if (nmask) {
1564 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1565 		nm = compat_alloc_user_space(alloc_size);
1566 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1567 	}
1568 
1569 	if (err)
1570 		return -EFAULT;
1571 
1572 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1573 }
1574 
1575 #endif
1576 
1577 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1578 						unsigned long addr)
1579 {
1580 	struct mempolicy *pol = NULL;
1581 
1582 	if (vma) {
1583 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1584 			pol = vma->vm_ops->get_policy(vma, addr);
1585 		} else if (vma->vm_policy) {
1586 			pol = vma->vm_policy;
1587 
1588 			/*
1589 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1590 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1591 			 * count on these policies, which will be dropped by
1592 			 * mpol_cond_put() later.
1593 			 */
1594 			if (mpol_needs_cond_ref(pol))
1595 				mpol_get(pol);
1596 		}
1597 	}
1598 
1599 	return pol;
1600 }
1601 
1602 /*
1603  * get_vma_policy(@vma, @addr)
1604  * @vma: virtual memory area whose policy is sought
1605  * @addr: address in @vma for shared policy lookup
1606  *
1607  * Returns effective policy for a VMA at specified address.
1608  * Falls back to current->mempolicy or system default policy, as necessary.
1609  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1610  * count--added by the get_policy() vm_op, as appropriate--to protect against
1611  * freeing by another task.  It is the caller's responsibility to free the
1612  * extra reference for shared policies.
1613  */
1614 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1615 						unsigned long addr)
1616 {
1617 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1618 
1619 	if (!pol)
1620 		pol = get_task_policy(current);
1621 
1622 	return pol;
1623 }
1624 
1625 bool vma_policy_mof(struct vm_area_struct *vma)
1626 {
1627 	struct mempolicy *pol;
1628 
1629 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1630 		bool ret = false;
1631 
1632 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1633 		if (pol && (pol->flags & MPOL_F_MOF))
1634 			ret = true;
1635 		mpol_cond_put(pol);
1636 
1637 		return ret;
1638 	}
1639 
1640 	pol = vma->vm_policy;
1641 	if (!pol)
1642 		pol = get_task_policy(current);
1643 
1644 	return pol->flags & MPOL_F_MOF;
1645 }
1646 
1647 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1648 {
1649 	enum zone_type dynamic_policy_zone = policy_zone;
1650 
1651 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1652 
1653 	/*
1654 	 * If policy->v.nodes has movable memory only, we apply the policy
1655 	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1656 	 *
1657 	 * policy->v.nodes is intersected with node_states[N_MEMORY], so if
1658 	 * the following test fails, it implies that policy->v.nodes contains
1659 	 * movable memory only.
1660 	 */
1661 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1662 		dynamic_policy_zone = ZONE_MOVABLE;
1663 
1664 	return zone >= dynamic_policy_zone;
1665 }
1666 
1667 /*
1668  * Return a nodemask representing a mempolicy for filtering nodes for
1669  * page allocation
1670  */
1671 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1672 {
1673 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1674 	if (unlikely(policy->mode == MPOL_BIND) &&
1675 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1676 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1677 		return &policy->v.nodes;
1678 
1679 	return NULL;
1680 }
1681 
1682 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1683 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1684 	int nd)
1685 {
1686 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1687 		nd = policy->v.preferred_node;
1688 	else {
1689 		/*
1690 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1691 		 * because we might easily break the expectation to stay on the
1692 		 * requested node and not break the policy.
1693 		 */
1694 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1695 	}
1696 
1697 	return node_zonelist(nd, gfp);
1698 }
1699 
1700 /* Do dynamic interleaving for a process */
1701 static unsigned interleave_nodes(struct mempolicy *policy)
1702 {
1703 	unsigned nid, next;
1704 	struct task_struct *me = current;
1705 
1706 	nid = me->il_next;
1707 	next = next_node_in(nid, policy->v.nodes);
1708 	if (next < MAX_NUMNODES)
1709 		me->il_next = next;
1710 	return nid;
1711 }
1712 
1713 /*
1714  * Depending on the memory policy provide a node from which to allocate the
1715  * next slab entry.
1716  */
1717 unsigned int mempolicy_slab_node(void)
1718 {
1719 	struct mempolicy *policy;
1720 	int node = numa_mem_id();
1721 
1722 	if (in_interrupt())
1723 		return node;
1724 
1725 	policy = current->mempolicy;
1726 	if (!policy || policy->flags & MPOL_F_LOCAL)
1727 		return node;
1728 
1729 	switch (policy->mode) {
1730 	case MPOL_PREFERRED:
1731 		/*
1732 		 * handled MPOL_F_LOCAL above
1733 		 */
1734 		return policy->v.preferred_node;
1735 
1736 	case MPOL_INTERLEAVE:
1737 		return interleave_nodes(policy);
1738 
1739 	case MPOL_BIND: {
1740 		struct zoneref *z;
1741 
1742 		/*
1743 		 * Follow bind policy behavior and start allocation at the
1744 		 * first node.
1745 		 */
1746 		struct zonelist *zonelist;
1747 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1748 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1749 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1750 							&policy->v.nodes);
1751 		return z->zone ? z->zone->node : node;
1752 	}
1753 
1754 	default:
1755 		BUG();
1756 	}
1757 }
1758 
1759 /*
1760  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1761  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1762  * number of present nodes.
1763  */
1764 static unsigned offset_il_node(struct mempolicy *pol,
1765 			       struct vm_area_struct *vma, unsigned long n)
1766 {
1767 	unsigned nnodes = nodes_weight(pol->v.nodes);
1768 	unsigned target;
1769 	int i;
1770 	int nid;
1771 
1772 	if (!nnodes)
1773 		return numa_node_id();
1774 	target = (unsigned int)n % nnodes;
1775 	nid = first_node(pol->v.nodes);
1776 	for (i = 0; i < target; i++)
1777 		nid = next_node(nid, pol->v.nodes);
1778 	return nid;
1779 }
1780 
1781 /* Determine a node number for interleave */
1782 static inline unsigned interleave_nid(struct mempolicy *pol,
1783 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1784 {
1785 	if (vma) {
1786 		unsigned long off;
1787 
1788 		/*
1789 		 * for small pages, there is no difference between
1790 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1791 		 * for huge pages, since vm_pgoff is in units of small
1792 		 * pages, we need to shift off the always 0 bits to get
1793 		 * a useful offset.
1794 		 */
1795 		BUG_ON(shift < PAGE_SHIFT);
1796 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1797 		off += (addr - vma->vm_start) >> shift;
1798 		return offset_il_node(pol, vma, off);
1799 	} else
1800 		return interleave_nodes(pol);
1801 }
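/*
 * Editorial illustration, not part of the kernel source: a worked example of
 * the static interleave above.  For an interleave nodemask {0,2,5} (three
 * nodes) and a page offset n derived from vm_pgoff plus the page index
 * within the VMA:
 *
 *	n % 3 == 0  ->  node 0
 *	n % 3 == 1  ->  node 2
 *	n % 3 == 2  ->  node 5
 *
 * offset_il_node() simply walks to the (n % nnodes)-th set node, so a given
 * offset in the backing object always maps to the same node, independent of
 * where the object happens to be mapped.
 */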
1802 
1803 #ifdef CONFIG_HUGETLBFS
1804 /*
1805  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1806  * @vma: virtual memory area whose policy is sought
1807  * @addr: address in @vma for shared policy lookup and interleave policy
1808  * @gfp_flags: for requested zone
1809  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1810  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1811  *
1812  * Returns a zonelist suitable for a huge page allocation and a pointer
1813  * to the struct mempolicy for conditional unref after allocation.
1814  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1815  * @nodemask for filtering the zonelist.
1816  *
1817  * Must be protected by read_mems_allowed_begin()
1818  */
1819 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1820 				gfp_t gfp_flags, struct mempolicy **mpol,
1821 				nodemask_t **nodemask)
1822 {
1823 	struct zonelist *zl;
1824 
1825 	*mpol = get_vma_policy(vma, addr);
1826 	*nodemask = NULL;	/* assume !MPOL_BIND */
1827 
1828 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1829 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1830 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1831 	} else {
1832 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1833 		if ((*mpol)->mode == MPOL_BIND)
1834 			*nodemask = &(*mpol)->v.nodes;
1835 	}
1836 	return zl;
1837 }
1838 
1839 /*
1840  * init_nodemask_of_mempolicy
1841  *
1842  * If the current task's mempolicy is "default" [NULL], return 'false'
1843  * to indicate default policy.  Otherwise, extract the policy nodemask
1844  * for 'bind' or 'interleave' policy into the argument nodemask, or
1845  * initialize the argument nodemask to contain the single node for
1846  * 'preferred' or 'local' policy and return 'true' to indicate presence
1847  * of non-default mempolicy.
1848  *
1849  * We don't bother with reference counting the mempolicy [mpol_get/put]
1850  * because the current task is examining its own mempolicy and a task's
1851  * mempolicy is only ever changed by the task itself.
1852  *
1853  * N.B., it is the caller's responsibility to free a returned nodemask.
1854  */
1855 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1856 {
1857 	struct mempolicy *mempolicy;
1858 	int nid;
1859 
1860 	if (!(mask && current->mempolicy))
1861 		return false;
1862 
1863 	task_lock(current);
1864 	mempolicy = current->mempolicy;
1865 	switch (mempolicy->mode) {
1866 	case MPOL_PREFERRED:
1867 		if (mempolicy->flags & MPOL_F_LOCAL)
1868 			nid = numa_node_id();
1869 		else
1870 			nid = mempolicy->v.preferred_node;
1871 		init_nodemask_of_node(mask, nid);
1872 		break;
1873 
1874 	case MPOL_BIND:
1875 		/* Fall through */
1876 	case MPOL_INTERLEAVE:
1877 		*mask = mempolicy->v.nodes;
1878 		break;
1879 
1880 	default:
1881 		BUG();
1882 	}
1883 	task_unlock(current);
1884 
1885 	return true;
1886 }
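
/*
 * Typical use (sketch): hugetlb's mempolicy-aware sysfs handlers
 * allocate a nodemask and fall back to all memory nodes when the task
 * has no non-default mempolicy:
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 */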
1887 #endif
1888 
1889 /*
1890  * mempolicy_nodemask_intersects
1891  *
1892  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1893  * policy.  Otherwise, check for intersection between mask and the policy
1894  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1895  * policy, always return true since it may allocate elsewhere on fallback.
1896  *
1897  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1898  */
1899 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1900 					const nodemask_t *mask)
1901 {
1902 	struct mempolicy *mempolicy;
1903 	bool ret = true;
1904 
1905 	if (!mask)
1906 		return ret;
1907 	task_lock(tsk);
1908 	mempolicy = tsk->mempolicy;
1909 	if (!mempolicy)
1910 		goto out;
1911 
1912 	switch (mempolicy->mode) {
1913 	case MPOL_PREFERRED:
1914 		/*
1915 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1916 		 * allocate from; the task may fall back to other nodes when OOM.
1917 		 * Thus, it's possible for tsk to have allocated memory from
1918 		 * nodes in mask.
1919 		 */
1920 		break;
1921 	case MPOL_BIND:
1922 	case MPOL_INTERLEAVE:
1923 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1924 		break;
1925 	default:
1926 		BUG();
1927 	}
1928 out:
1929 	task_unlock(tsk);
1930 	return ret;
1931 }
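
/*
 * The OOM killer uses this to judge whether a task could plausibly hold
 * memory on the nodes of a constrained allocation; roughly:
 *
 *	if (mask)
 *		eligible = mempolicy_nodemask_intersects(tsk, mask);
 *	else
 *		eligible = cpuset_mems_allowed_intersects(current, tsk);
 */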
1932 
1933 /* Allocate a page in interleaved policy.
1934    Own path because it needs to do special accounting. */
1935 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1936 					unsigned nid)
1937 {
1938 	struct zonelist *zl;
1939 	struct page *page;
1940 
1941 	zl = node_zonelist(nid, gfp);
1942 	page = __alloc_pages(gfp, order, zl);
1943 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1944 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1945 	return page;
1946 }
1947 
1948 /**
1949  * 	alloc_pages_vma	- Allocate a page for a VMA.
1950  *
1951  * 	@gfp:
1952  *      %GFP_USER    user allocation,
1953  *      %GFP_KERNEL  kernel allocation,
1954  *      %GFP_HIGHMEM highmem/user allocation,
1955  *      %GFP_FS      allocation should not call back into a file system,
1956  *      %GFP_ATOMIC  don't sleep.
1957  *
1958  *	@order: Order of the GFP allocation.
1959  * 	@vma:  Pointer to VMA or NULL if not available.
1960  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1961  *	@node: Which node to prefer for allocation (modulo policy).
1962  *	@hugepage: for hugepages try only the preferred node if possible
1963  *
1964  * 	This function allocates a page from the kernel page pool and applies
1965  *	a NUMA policy associated with the VMA or the current process.
1966  *	When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1967  *	mm_struct of the VMA to prevent it from going away. Should be used for
1968  *	all allocations for pages that will be mapped into user space. Returns
1969  *	NULL when no page can be allocated.
1970  */
1971 struct page *
1972 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1973 		unsigned long addr, int node, bool hugepage)
1974 {
1975 	struct mempolicy *pol;
1976 	struct page *page;
1977 	unsigned int cpuset_mems_cookie;
1978 	struct zonelist *zl;
1979 	nodemask_t *nmask;
1980 
1981 retry_cpuset:
1982 	pol = get_vma_policy(vma, addr);
1983 	cpuset_mems_cookie = read_mems_allowed_begin();
1984 
1985 	if (pol->mode == MPOL_INTERLEAVE) {
1986 		unsigned nid;
1987 
1988 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1989 		mpol_cond_put(pol);
1990 		page = alloc_page_interleave(gfp, order, nid);
1991 		goto out;
1992 	}
1993 
1994 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1995 		int hpage_node = node;
1996 
1997 		/*
1998 		 * For hugepage allocation and non-interleave policy which
1999 		 * allows the current node (or other explicitly preferred
2000 		 * node) we only try to allocate from the current/preferred
2001 		 * node and don't fall back to other nodes, as the cost of
2002 		 * remote accesses would likely offset THP benefits.
2003 		 *
2004 		 * If the policy is interleave, or does not allow the current
2005 		 * node in its nodemask, we allocate the standard way.
2006 		 */
2007 		if (pol->mode == MPOL_PREFERRED &&
2008 						!(pol->flags & MPOL_F_LOCAL))
2009 			hpage_node = pol->v.preferred_node;
2010 
2011 		nmask = policy_nodemask(gfp, pol);
2012 		if (!nmask || node_isset(hpage_node, *nmask)) {
2013 			mpol_cond_put(pol);
2014 			page = __alloc_pages_node(hpage_node,
2015 						gfp | __GFP_THISNODE, order);
2016 			goto out;
2017 		}
2018 	}
2019 
2020 	nmask = policy_nodemask(gfp, pol);
2021 	zl = policy_zonelist(gfp, pol, node);
2022 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2023 	mpol_cond_put(pol);
2024 out:
2025 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2026 		goto retry_cpuset;
2027 	return page;
2028 }
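
/*
 * Most users go through the alloc_page_vma() wrapper in <linux/gfp.h>;
 * a fault-path sketch (mmap_sem held for read, @address hypothetical):
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */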
2029 
2030 /**
2031  * 	alloc_pages_current - Allocate pages.
2032  *
2033  *	@gfp:
2034  *		%GFP_USER   user allocation,
2035  *      	%GFP_KERNEL kernel allocation,
2036  *      	%GFP_HIGHMEM highmem allocation,
2037  *      	%GFP_FS     don't call back into a file system,
2038  *      	%GFP_ATOMIC don't sleep.
2039  *	@order: Power of two of allocation size in pages. 0 is a single page.
2040  *
2041  *	Allocate a page from the kernel page pool.  When not in
2042  *	interrupt context, apply the current process's NUMA policy.
2043  *	Returns NULL when no page can be allocated.
2044  *
2045  *	Don't call cpuset_update_task_memory_state() unless
2046  *	1) it's ok to take cpuset_sem (can WAIT), and
2047  *	2) allocating for current task (not interrupt).
2048  */
2049 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2050 {
2051 	struct mempolicy *pol = &default_policy;
2052 	struct page *page;
2053 	unsigned int cpuset_mems_cookie;
2054 
2055 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2056 		pol = get_task_policy(current);
2057 
2058 retry_cpuset:
2059 	cpuset_mems_cookie = read_mems_allowed_begin();
2060 
2061 	/*
2062 	 * No reference counting needed for current->mempolicy
2063 	 * nor system default_policy
2064 	 */
2065 	if (pol->mode == MPOL_INTERLEAVE)
2066 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2067 	else
2068 		page = __alloc_pages_nodemask(gfp, order,
2069 				policy_zonelist(gfp, pol, numa_node_id()),
2070 				policy_nodemask(gfp, pol));
2071 
2072 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2073 		goto retry_cpuset;
2074 
2075 	return page;
2076 }
2077 EXPORT_SYMBOL(alloc_pages_current);
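
/*
 * On CONFIG_NUMA kernels the generic alloc_pages() helper in
 * <linux/gfp.h> resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * in process context already honours the calling task's mempolicy.
 */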
2078 
2079 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2080 {
2081 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2082 
2083 	if (IS_ERR(pol))
2084 		return PTR_ERR(pol);
2085 	dst->vm_policy = pol;
2086 	return 0;
2087 }
2088 
2089 /*
2090  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2091  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2092  * with the mems_allowed returned by cpuset_mems_allowed().  This
2093  * keeps mempolicies cpuset relative after its cpuset moves.  See
2094  * further kernel/cpuset.c update_nodemask().
2095  *
2096  * current's mempolicy may be rebound by the other task (the task that changes
2097  * the cpuset's mems), so we needn't do rebind work for the current task.
2098  */
2099 
2100 /* Slow path of a mempolicy duplicate */
2101 struct mempolicy *__mpol_dup(struct mempolicy *old)
2102 {
2103 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2104 
2105 	if (!new)
2106 		return ERR_PTR(-ENOMEM);
2107 
2108 	/* task's mempolicy is protected by alloc_lock */
2109 	if (old == current->mempolicy) {
2110 		task_lock(current);
2111 		*new = *old;
2112 		task_unlock(current);
2113 	} else
2114 		*new = *old;
2115 
2116 	if (current_cpuset_is_being_rebound()) {
2117 		nodemask_t mems = cpuset_mems_allowed(current);
2118 		if (new->flags & MPOL_F_REBINDING)
2119 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2120 		else
2121 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2122 	}
2123 	atomic_set(&new->refcnt, 1);
2124 	return new;
2125 }
2126 
2127 /* Slow path of a mempolicy comparison */
2128 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2129 {
2130 	if (!a || !b)
2131 		return false;
2132 	if (a->mode != b->mode)
2133 		return false;
2134 	if (a->flags != b->flags)
2135 		return false;
2136 	if (mpol_store_user_nodemask(a))
2137 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2138 			return false;
2139 
2140 	switch (a->mode) {
2141 	case MPOL_BIND:
2142 		/* Fall through */
2143 	case MPOL_INTERLEAVE:
2144 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2145 	case MPOL_PREFERRED:
2146 		return a->v.preferred_node == b->v.preferred_node;
2147 	default:
2148 		BUG();
2149 		return false;
2150 	}
2151 }
2152 
2153 /*
2154  * Shared memory backing store policy support.
2155  *
2156  * Remember policies even when nobody has shared memory mapped.
2157  * The policies are kept in Red-Black tree linked from the inode.
2158  * They are protected by the sp->lock rwlock, which should be held
2159  * for any accesses to the tree.
2160  */
2161 
2162 /*
2163  * lookup first element intersecting start-end.  Caller holds sp->lock for
2164  * reading or for writing
2165  */
2166 static struct sp_node *
2167 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2168 {
2169 	struct rb_node *n = sp->root.rb_node;
2170 
2171 	while (n) {
2172 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2173 
2174 		if (start >= p->end)
2175 			n = n->rb_right;
2176 		else if (end <= p->start)
2177 			n = n->rb_left;
2178 		else
2179 			break;
2180 	}
2181 	if (!n)
2182 		return NULL;
2183 	for (;;) {
2184 		struct sp_node *w = NULL;
2185 		struct rb_node *prev = rb_prev(n);
2186 		if (!prev)
2187 			break;
2188 		w = rb_entry(prev, struct sp_node, nd);
2189 		if (w->end <= start)
2190 			break;
2191 		n = prev;
2192 	}
2193 	return rb_entry(n, struct sp_node, nd);
2194 }
2195 
2196 /*
2197  * Insert a new shared policy into the list.  Caller holds sp->lock for
2198  * writing.
2199  */
2200 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2201 {
2202 	struct rb_node **p = &sp->root.rb_node;
2203 	struct rb_node *parent = NULL;
2204 	struct sp_node *nd;
2205 
2206 	while (*p) {
2207 		parent = *p;
2208 		nd = rb_entry(parent, struct sp_node, nd);
2209 		if (new->start < nd->start)
2210 			p = &(*p)->rb_left;
2211 		else if (new->end > nd->end)
2212 			p = &(*p)->rb_right;
2213 		else
2214 			BUG();
2215 	}
2216 	rb_link_node(&new->nd, parent, p);
2217 	rb_insert_color(&new->nd, &sp->root);
2218 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2219 		 new->policy ? new->policy->mode : 0);
2220 }
2221 
2222 /* Find shared policy intersecting idx */
2223 struct mempolicy *
2224 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2225 {
2226 	struct mempolicy *pol = NULL;
2227 	struct sp_node *sn;
2228 
2229 	if (!sp->root.rb_node)
2230 		return NULL;
2231 	read_lock(&sp->lock);
2232 	sn = sp_lookup(sp, idx, idx+1);
2233 	if (sn) {
2234 		mpol_get(sn->policy);
2235 		pol = sn->policy;
2236 	}
2237 	read_unlock(&sp->lock);
2238 	return pol;
2239 }
2240 
2241 static void sp_free(struct sp_node *n)
2242 {
2243 	mpol_put(n->policy);
2244 	kmem_cache_free(sn_cache, n);
2245 }
2246 
2247 /**
2248  * mpol_misplaced - check whether current page node is valid in policy
2249  *
2250  * @page: page to be checked
2251  * @vma: vm area where page mapped
2252  * @addr: virtual address where page mapped
2253  *
2254  * Lookup current policy node id for vma,addr and "compare to" page's
2255  * node id.
2256  *
2257  * Returns:
2258  *	-1	- not misplaced, page is in the right node
2259  *	node	- node id where the page should be
2260  *
2261  * Policy determination "mimics" alloc_page_vma().
2262  * Called from fault path where we know the vma and faulting address.
2263  */
2264 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2265 {
2266 	struct mempolicy *pol;
2267 	struct zoneref *z;
2268 	int curnid = page_to_nid(page);
2269 	unsigned long pgoff;
2270 	int thiscpu = raw_smp_processor_id();
2271 	int thisnid = cpu_to_node(thiscpu);
2272 	int polnid = -1;
2273 	int ret = -1;
2274 
2275 	BUG_ON(!vma);
2276 
2277 	pol = get_vma_policy(vma, addr);
2278 	if (!(pol->flags & MPOL_F_MOF))
2279 		goto out;
2280 
2281 	switch (pol->mode) {
2282 	case MPOL_INTERLEAVE:
2283 		BUG_ON(addr >= vma->vm_end);
2284 		BUG_ON(addr < vma->vm_start);
2285 
2286 		pgoff = vma->vm_pgoff;
2287 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2288 		polnid = offset_il_node(pol, vma, pgoff);
2289 		break;
2290 
2291 	case MPOL_PREFERRED:
2292 		if (pol->flags & MPOL_F_LOCAL)
2293 			polnid = numa_node_id();
2294 		else
2295 			polnid = pol->v.preferred_node;
2296 		break;
2297 
2298 	case MPOL_BIND:
2299 
2300 		/*
2301 		 * MPOL_BIND allows binding to multiple nodes.
2302 		 * Use the current page's node if it is in the policy nodemask,
2303 		 * else select the nearest allowed node, if any.
2304 		 * If there are no allowed nodes, use the current node [!misplaced].
2305 		 */
2306 		if (node_isset(curnid, pol->v.nodes))
2307 			goto out;
2308 		z = first_zones_zonelist(
2309 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2310 				gfp_zone(GFP_HIGHUSER),
2311 				&pol->v.nodes);
2312 		polnid = z->zone->node;
2313 		break;
2314 
2315 	default:
2316 		BUG();
2317 	}
2318 
2319 	/* Migrate the page towards the node whose CPU is referencing it */
2320 	if (pol->flags & MPOL_F_MORON) {
2321 		polnid = thisnid;
2322 
2323 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2324 			goto out;
2325 	}
2326 
2327 	if (curnid != polnid)
2328 		ret = polnid;
2329 out:
2330 	mpol_cond_put(pol);
2331 
2332 	return ret;
2333 }
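
/*
 * Sketch of how the NUMA hinting fault path consumes the result
 * (page reference handling omitted, out_not_misplaced hypothetical):
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		goto out_not_misplaced;
 *	migrated = migrate_misplaced_page(page, vma, target_nid);
 */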
2334 
2335 /*
2336  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2337  * dropped after task->mempolicy is set to NULL so that any allocation done as
2338  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2339  * policy.
2340  */
2341 void mpol_put_task_policy(struct task_struct *task)
2342 {
2343 	struct mempolicy *pol;
2344 
2345 	task_lock(task);
2346 	pol = task->mempolicy;
2347 	task->mempolicy = NULL;
2348 	task_unlock(task);
2349 	mpol_put(pol);
2350 }
2351 
2352 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2353 {
2354 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2355 	rb_erase(&n->nd, &sp->root);
2356 	sp_free(n);
2357 }
2358 
2359 static void sp_node_init(struct sp_node *node, unsigned long start,
2360 			unsigned long end, struct mempolicy *pol)
2361 {
2362 	node->start = start;
2363 	node->end = end;
2364 	node->policy = pol;
2365 }
2366 
2367 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2368 				struct mempolicy *pol)
2369 {
2370 	struct sp_node *n;
2371 	struct mempolicy *newpol;
2372 
2373 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2374 	if (!n)
2375 		return NULL;
2376 
2377 	newpol = mpol_dup(pol);
2378 	if (IS_ERR(newpol)) {
2379 		kmem_cache_free(sn_cache, n);
2380 		return NULL;
2381 	}
2382 	newpol->flags |= MPOL_F_SHARED;
2383 	sp_node_init(n, start, end, newpol);
2384 
2385 	return n;
2386 }
2387 
2388 /* Replace a policy range. */
2389 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2390 				 unsigned long end, struct sp_node *new)
2391 {
2392 	struct sp_node *n;
2393 	struct sp_node *n_new = NULL;
2394 	struct mempolicy *mpol_new = NULL;
2395 	int ret = 0;
2396 
2397 restart:
2398 	write_lock(&sp->lock);
2399 	n = sp_lookup(sp, start, end);
2400 	/* Take care of old policies in the same range. */
2401 	while (n && n->start < end) {
2402 		struct rb_node *next = rb_next(&n->nd);
2403 		if (n->start >= start) {
2404 			if (n->end <= end)
2405 				sp_delete(sp, n);
2406 			else
2407 				n->start = end;
2408 		} else {
2409 			/* Old policy spanning whole new range. */
2410 			if (n->end > end) {
2411 				if (!n_new)
2412 					goto alloc_new;
2413 
2414 				*mpol_new = *n->policy;
2415 				atomic_set(&mpol_new->refcnt, 1);
2416 				sp_node_init(n_new, end, n->end, mpol_new);
2417 				n->end = start;
2418 				sp_insert(sp, n_new);
2419 				n_new = NULL;
2420 				mpol_new = NULL;
2421 				break;
2422 			} else
2423 				n->end = start;
2424 		}
2425 		if (!next)
2426 			break;
2427 		n = rb_entry(next, struct sp_node, nd);
2428 	}
2429 	if (new)
2430 		sp_insert(sp, new);
2431 	write_unlock(&sp->lock);
2432 	ret = 0;
2433 
2434 err_out:
2435 	if (mpol_new)
2436 		mpol_put(mpol_new);
2437 	if (n_new)
2438 		kmem_cache_free(sn_cache, n_new);
2439 
2440 	return ret;
2441 
2442 alloc_new:
2443 	write_unlock(&sp->lock);
2444 	ret = -ENOMEM;
2445 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2446 	if (!n_new)
2447 		goto err_out;
2448 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2449 	if (!mpol_new)
2450 		goto err_out;
2451 	goto restart;
2452 }
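
/*
 * Example of the splitting above: with one existing range [0, 10) and a
 * replacement for [3, 6), the old node is truncated to [0, 3), a copy
 * of its policy is inserted for [6, 10), and the new node is inserted
 * for [3, 6).
 */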
2453 
2454 /**
2455  * mpol_shared_policy_init - initialize shared policy for inode
2456  * @sp: pointer to inode shared policy
2457  * @mpol:  struct mempolicy to install
2458  *
2459  * Install non-NULL @mpol in inode's shared policy rb-tree.
2460  * On entry, the current task has a reference on a non-NULL @mpol.
2461  * This must be released on exit.
2462  * This is called at get_inode() time, so we can use GFP_KERNEL.
2463  */
2464 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2465 {
2466 	int ret;
2467 
2468 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2469 	rwlock_init(&sp->lock);
2470 
2471 	if (mpol) {
2472 		struct vm_area_struct pvma;
2473 		struct mempolicy *new;
2474 		NODEMASK_SCRATCH(scratch);
2475 
2476 		if (!scratch)
2477 			goto put_mpol;
2478 		/* contextualize the tmpfs mount point mempolicy */
2479 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2480 		if (IS_ERR(new))
2481 			goto free_scratch; /* no valid nodemask intersection */
2482 
2483 		task_lock(current);
2484 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2485 		task_unlock(current);
2486 		if (ret)
2487 			goto put_new;
2488 
2489 		/* Create pseudo-vma that contains just the policy */
2490 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2491 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2492 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2493 
2494 put_new:
2495 		mpol_put(new);			/* drop initial ref */
2496 free_scratch:
2497 		NODEMASK_SCRATCH_FREE(scratch);
2498 put_mpol:
2499 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2500 	}
2501 }
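
/*
 * tmpfs uses this when instantiating an inode, handing over its
 * reference on the superblock's "mpol=" mount option policy; roughly:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 */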
2502 
2503 int mpol_set_shared_policy(struct shared_policy *info,
2504 			struct vm_area_struct *vma, struct mempolicy *npol)
2505 {
2506 	int err;
2507 	struct sp_node *new = NULL;
2508 	unsigned long sz = vma_pages(vma);
2509 
2510 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2511 		 vma->vm_pgoff,
2512 		 sz, npol ? npol->mode : -1,
2513 		 npol ? npol->flags : -1,
2514 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2515 
2516 	if (npol) {
2517 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2518 		if (!new)
2519 			return -ENOMEM;
2520 	}
2521 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2522 	if (err && new)
2523 		sp_free(new);
2524 	return err;
2525 }
2526 
2527 /* Free a backing policy store on inode delete. */
2528 void mpol_free_shared_policy(struct shared_policy *p)
2529 {
2530 	struct sp_node *n;
2531 	struct rb_node *next;
2532 
2533 	if (!p->root.rb_node)
2534 		return;
2535 	write_lock(&p->lock);
2536 	next = rb_first(&p->root);
2537 	while (next) {
2538 		n = rb_entry(next, struct sp_node, nd);
2539 		next = rb_next(&n->nd);
2540 		sp_delete(p, n);
2541 	}
2542 	write_unlock(&p->lock);
2543 }
2544 
2545 #ifdef CONFIG_NUMA_BALANCING
2546 static int __initdata numabalancing_override;
2547 
2548 static void __init check_numabalancing_enable(void)
2549 {
2550 	bool numabalancing_default = false;
2551 
2552 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2553 		numabalancing_default = true;
2554 
2555 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2556 	if (numabalancing_override)
2557 		set_numabalancing_state(numabalancing_override == 1);
2558 
2559 	if (num_online_nodes() > 1 && !numabalancing_override) {
2560 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2561 			numabalancing_default ? "Enabling" : "Disabling");
2562 		set_numabalancing_state(numabalancing_default);
2563 	}
2564 }
2565 
2566 static int __init setup_numabalancing(char *str)
2567 {
2568 	int ret = 0;
2569 	if (!str)
2570 		goto out;
2571 
2572 	if (!strcmp(str, "enable")) {
2573 		numabalancing_override = 1;
2574 		ret = 1;
2575 	} else if (!strcmp(str, "disable")) {
2576 		numabalancing_override = -1;
2577 		ret = 1;
2578 	}
2579 out:
2580 	if (!ret)
2581 		pr_warn("Unable to parse numa_balancing=\n");
2582 
2583 	return ret;
2584 }
2585 __setup("numa_balancing=", setup_numabalancing);
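
/*
 * The override comes from the kernel command line, e.g.
 * "numa_balancing=disable" keeps automatic balancing off even when
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set.
 */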
2586 #else
2587 static inline void __init check_numabalancing_enable(void)
2588 {
2589 }
2590 #endif /* CONFIG_NUMA_BALANCING */
2591 
2592 /* assumes fs == KERNEL_DS */
2593 void __init numa_policy_init(void)
2594 {
2595 	nodemask_t interleave_nodes;
2596 	unsigned long largest = 0;
2597 	int nid, prefer = 0;
2598 
2599 	policy_cache = kmem_cache_create("numa_policy",
2600 					 sizeof(struct mempolicy),
2601 					 0, SLAB_PANIC, NULL);
2602 
2603 	sn_cache = kmem_cache_create("shared_policy_node",
2604 				     sizeof(struct sp_node),
2605 				     0, SLAB_PANIC, NULL);
2606 
2607 	for_each_node(nid) {
2608 		preferred_node_policy[nid] = (struct mempolicy) {
2609 			.refcnt = ATOMIC_INIT(1),
2610 			.mode = MPOL_PREFERRED,
2611 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2612 			.v = { .preferred_node = nid, },
2613 		};
2614 	}
2615 
2616 	/*
2617 	 * Set interleaving policy for system init. Interleaving is only
2618 	 * enabled across suitably sized nodes (default is >= 16MB), or
2619 	 * fall back to the largest node if they're all smaller.
2620 	 */
2621 	nodes_clear(interleave_nodes);
2622 	for_each_node_state(nid, N_MEMORY) {
2623 		unsigned long total_pages = node_present_pages(nid);
2624 
2625 		/* Preserve the largest node */
2626 		if (largest < total_pages) {
2627 			largest = total_pages;
2628 			prefer = nid;
2629 		}
2630 
2631 		/* Interleave this node? */
2632 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2633 			node_set(nid, interleave_nodes);
2634 	}
2635 
2636 	/* All too small, use the largest */
2637 	if (unlikely(nodes_empty(interleave_nodes)))
2638 		node_set(prefer, interleave_nodes);
2639 
2640 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2641 		pr_err("%s: interleaving failed\n", __func__);
2642 
2643 	check_numabalancing_enable();
2644 }
2645 
2646 /* Reset policy of current process to default */
2647 void numa_default_policy(void)
2648 {
2649 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2650 }
2651 
2652 /*
2653  * Parse and format mempolicy from/to strings
2654  */
2655 
2656 /*
2657  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2658  */
2659 static const char * const policy_modes[] =
2660 {
2661 	[MPOL_DEFAULT]    = "default",
2662 	[MPOL_PREFERRED]  = "prefer",
2663 	[MPOL_BIND]       = "bind",
2664 	[MPOL_INTERLEAVE] = "interleave",
2665 	[MPOL_LOCAL]      = "local",
2666 };
2667 
2668 
2669 #ifdef CONFIG_TMPFS
2670 /**
2671  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2672  * @str:  string containing mempolicy to parse
2673  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2674  *
2675  * Format of input:
2676  *	<mode>[=<flags>][:<nodelist>]
2677  *
2678  * On success, returns 0; otherwise returns 1.
2679  */
2680 int mpol_parse_str(char *str, struct mempolicy **mpol)
2681 {
2682 	struct mempolicy *new = NULL;
2683 	unsigned short mode;
2684 	unsigned short mode_flags;
2685 	nodemask_t nodes;
2686 	char *nodelist = strchr(str, ':');
2687 	char *flags = strchr(str, '=');
2688 	int err = 1;
2689 
2690 	if (nodelist) {
2691 		/* NUL-terminate mode or flags string */
2692 		*nodelist++ = '\0';
2693 		if (nodelist_parse(nodelist, nodes))
2694 			goto out;
2695 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2696 			goto out;
2697 	} else
2698 		nodes_clear(nodes);
2699 
2700 	if (flags)
2701 		*flags++ = '\0';	/* terminate mode string */
2702 
2703 	for (mode = 0; mode < MPOL_MAX; mode++) {
2704 		if (!strcmp(str, policy_modes[mode])) {
2705 			break;
2706 		}
2707 	}
2708 	if (mode >= MPOL_MAX)
2709 		goto out;
2710 
2711 	switch (mode) {
2712 	case MPOL_PREFERRED:
2713 		/*
2714 		 * Insist on a nodelist of one node only
2715 		 */
2716 		if (nodelist) {
2717 			char *rest = nodelist;
2718 			while (isdigit(*rest))
2719 				rest++;
2720 			if (*rest)
2721 				goto out;
2722 		}
2723 		break;
2724 	case MPOL_INTERLEAVE:
2725 		/*
2726 		 * Default to online nodes with memory if no nodelist
2727 		 */
2728 		if (!nodelist)
2729 			nodes = node_states[N_MEMORY];
2730 		break;
2731 	case MPOL_LOCAL:
2732 		/*
2733 		 * Don't allow a nodelist;  mpol_new() checks flags
2734 		 */
2735 		if (nodelist)
2736 			goto out;
2737 		mode = MPOL_PREFERRED;
2738 		break;
2739 	case MPOL_DEFAULT:
2740 		/*
2741 		 * Insist on an empty nodelist
2742 		 */
2743 		if (!nodelist)
2744 			err = 0;
2745 		goto out;
2746 	case MPOL_BIND:
2747 		/*
2748 		 * Insist on a nodelist
2749 		 */
2750 		if (!nodelist)
2751 			goto out;
2752 	}
2753 
2754 	mode_flags = 0;
2755 	if (flags) {
2756 		/*
2757 		 * Currently, we only support two mutually exclusive
2758 		 * mode flags.
2759 		 */
2760 		if (!strcmp(flags, "static"))
2761 			mode_flags |= MPOL_F_STATIC_NODES;
2762 		else if (!strcmp(flags, "relative"))
2763 			mode_flags |= MPOL_F_RELATIVE_NODES;
2764 		else
2765 			goto out;
2766 	}
2767 
2768 	new = mpol_new(mode, mode_flags, &nodes);
2769 	if (IS_ERR(new))
2770 		goto out;
2771 
2772 	/*
2773 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2774 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2775 	 */
2776 	if (mode != MPOL_PREFERRED)
2777 		new->v.nodes = nodes;
2778 	else if (nodelist)
2779 		new->v.preferred_node = first_node(nodes);
2780 	else
2781 		new->flags |= MPOL_F_LOCAL;
2782 
2783 	/*
2784 	 * Save nodes for contextualization: this will be used to "clone"
2785 	 * the mempolicy in a specific context [cpuset] at a later time.
2786 	 */
2787 	new->w.user_nodemask = nodes;
2788 
2789 	err = 0;
2790 
2791 out:
2792 	/* Restore string for error message */
2793 	if (nodelist)
2794 		*--nodelist = ':';
2795 	if (flags)
2796 		*--flags = '=';
2797 	if (!err)
2798 		*mpol = new;
2799 	return err;
2800 }
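
/*
 * For example, the tmpfs mount option "mpol=interleave=static:0-3" is
 * split here into mode "interleave", flags "static" and nodelist "0-3".
 * shmem hands the option value over roughly as:
 *
 *	if (mpol_parse_str(value, &mpol))
 *		goto bad_val;
 */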
2801 #endif /* CONFIG_TMPFS */
2802 
2803 /**
2804  * mpol_to_str - format a mempolicy structure for printing
2805  * @buffer:  to contain formatted mempolicy string
2806  * @maxlen:  length of @buffer
2807  * @pol:  pointer to mempolicy to be formatted
2808  *
2809  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2810  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2811  * longest flag, "relative", and to display at least a few node ids.
2812  */
2813 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2814 {
2815 	char *p = buffer;
2816 	nodemask_t nodes = NODE_MASK_NONE;
2817 	unsigned short mode = MPOL_DEFAULT;
2818 	unsigned short flags = 0;
2819 
2820 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2821 		mode = pol->mode;
2822 		flags = pol->flags;
2823 	}
2824 
2825 	switch (mode) {
2826 	case MPOL_DEFAULT:
2827 		break;
2828 	case MPOL_PREFERRED:
2829 		if (flags & MPOL_F_LOCAL)
2830 			mode = MPOL_LOCAL;
2831 		else
2832 			node_set(pol->v.preferred_node, nodes);
2833 		break;
2834 	case MPOL_BIND:
2835 	case MPOL_INTERLEAVE:
2836 		nodes = pol->v.nodes;
2837 		break;
2838 	default:
2839 		WARN_ON_ONCE(1);
2840 		snprintf(p, maxlen, "unknown");
2841 		return;
2842 	}
2843 
2844 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2845 
2846 	if (flags & MPOL_MODE_FLAGS) {
2847 		p += snprintf(p, buffer + maxlen - p, "=");
2848 
2849 		/*
2850 		 * Currently, the only defined flags are mutually exclusive
2851 		 */
2852 		if (flags & MPOL_F_STATIC_NODES)
2853 			p += snprintf(p, buffer + maxlen - p, "static");
2854 		else if (flags & MPOL_F_RELATIVE_NODES)
2855 			p += snprintf(p, buffer + maxlen - p, "relative");
2856 	}
2857 
2858 	if (!nodes_empty(nodes))
2859 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2860 			       nodemask_pr_args(&nodes));
2861 }
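
/*
 * For example (sketch):
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *
 * produces strings such as "default", "local", "prefer=static:1" or
 * "interleave:0-3", the same format accepted by mpol_parse_str().
 */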
2862