xref: /openbmc/linux/mm/mempolicy.c (revision e6dec923)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/swap.h>
89 #include <linux/seq_file.h>
90 #include <linux/proc_fs.h>
91 #include <linux/migrate.h>
92 #include <linux/ksm.h>
93 #include <linux/rmap.h>
94 #include <linux/security.h>
95 #include <linux/syscalls.h>
96 #include <linux/ctype.h>
97 #include <linux/mm_inline.h>
98 #include <linux/mmu_notifier.h>
99 #include <linux/printk.h>
100 
101 #include <asm/tlbflush.h>
102 #include <linux/uaccess.h>
103 
104 #include "internal.h"
105 
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/* Slab cache that struct mempolicy objects are allocated from and freed to. */
static struct kmem_cache *policy_cache;
/* Second slab cache; its users are outside the part of the file seen here. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

/*
 * One policy per node id.  get_task_policy() returns the entry for the
 * current node when a task has no policy of its own and the entry has
 * already been initialised (non-zero mode).
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
127 
128 struct mempolicy *get_task_policy(struct task_struct *p)
129 {
130 	struct mempolicy *pol = p->mempolicy;
131 	int node;
132 
133 	if (pol)
134 		return pol;
135 
136 	node = numa_node_id();
137 	if (node != NUMA_NO_NODE) {
138 		pol = &preferred_node_policy[node];
139 		/* preferred_node_policy is not initialised early in boot */
140 		if (pol->mode)
141 			return pol;
142 	}
143 
144 	return &default_policy;
145 }
146 
/*
 * Per-mode callbacks, looked up through mpol_ops[mode]:
 *   create - install the (already filtered) nodemask in a new policy
 *   rebind - adapt an existing policy to a new set of allowed nodes
 * This is only the tentative declaration; the table is filled in further
 * down, once the callbacks have been defined.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
151 
152 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
153 {
154 	return pol->flags & MPOL_MODE_FLAGS;
155 }
156 
157 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
158 				   const nodemask_t *rel)
159 {
160 	nodemask_t tmp;
161 	nodes_fold(tmp, *orig, nodes_weight(*rel));
162 	nodes_onto(*ret, tmp, *rel);
163 }
164 
165 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
166 {
167 	if (nodes_empty(*nodes))
168 		return -EINVAL;
169 	pol->v.nodes = *nodes;
170 	return 0;
171 }
172 
173 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
174 {
175 	if (!nodes)
176 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
177 	else if (nodes_empty(*nodes))
178 		return -EINVAL;			/*  no allowed nodes */
179 	else
180 		pol->v.preferred_node = first_node(*nodes);
181 	return 0;
182 }
183 
184 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
185 {
186 	if (nodes_empty(*nodes))
187 		return -EINVAL;
188 	pol->v.nodes = *nodes;
189 	return 0;
190 }
191 
192 /*
193  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
194  * any, for the new policy.  mpol_new() has already validated the nodes
195  * parameter with respect to the policy mode and flags.  But, we need to
196  * handle an empty nodemask with MPOL_PREFERRED here.
197  *
198  * Must be called holding task's alloc_lock to protect task's mems_allowed
199  * and mempolicy.  May also be called holding the mmap_semaphore for write.
200  */
201 static int mpol_set_nodemask(struct mempolicy *pol,
202 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
203 {
204 	int ret;
205 
206 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
207 	if (pol == NULL)
208 		return 0;
209 	/* Check N_MEMORY */
210 	nodes_and(nsc->mask1,
211 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
212 
213 	VM_BUG_ON(!nodes);
214 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
215 		nodes = NULL;	/* explicit local allocation */
216 	else {
217 		if (pol->flags & MPOL_F_RELATIVE_NODES)
218 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
219 		else
220 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
221 
222 		if (mpol_store_user_nodemask(pol))
223 			pol->w.user_nodemask = *nodes;
224 		else
225 			pol->w.cpuset_mems_allowed =
226 						cpuset_current_mems_allowed;
227 	}
228 
229 	if (nodes)
230 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
231 	else
232 		ret = mpol_ops[pol->mode].create(pol, NULL);
233 	return ret;
234 }
235 
236 /*
237  * This function just creates a new policy, does some check and simple
238  * initialization. You must invoke mpol_set_nodemask() to set nodes.
239  */
240 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
241 				  nodemask_t *nodes)
242 {
243 	struct mempolicy *policy;
244 
245 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
246 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
247 
248 	if (mode == MPOL_DEFAULT) {
249 		if (nodes && !nodes_empty(*nodes))
250 			return ERR_PTR(-EINVAL);
251 		return NULL;
252 	}
253 	VM_BUG_ON(!nodes);
254 
255 	/*
256 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
257 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
258 	 * All other modes require a valid pointer to a non-empty nodemask.
259 	 */
260 	if (mode == MPOL_PREFERRED) {
261 		if (nodes_empty(*nodes)) {
262 			if (((flags & MPOL_F_STATIC_NODES) ||
263 			     (flags & MPOL_F_RELATIVE_NODES)))
264 				return ERR_PTR(-EINVAL);
265 		}
266 	} else if (mode == MPOL_LOCAL) {
267 		if (!nodes_empty(*nodes) ||
268 		    (flags & MPOL_F_STATIC_NODES) ||
269 		    (flags & MPOL_F_RELATIVE_NODES))
270 			return ERR_PTR(-EINVAL);
271 		mode = MPOL_PREFERRED;
272 	} else if (nodes_empty(*nodes))
273 		return ERR_PTR(-EINVAL);
274 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
275 	if (!policy)
276 		return ERR_PTR(-ENOMEM);
277 	atomic_set(&policy->refcnt, 1);
278 	policy->mode = mode;
279 	policy->flags = flags;
280 
281 	return policy;
282 }
283 
284 /* Slow path of a mpol destructor. */
285 void __mpol_put(struct mempolicy *p)
286 {
287 	if (!atomic_dec_and_test(&p->refcnt))
288 		return;
289 	kmem_cache_free(policy_cache, p);
290 }
291 
/* .rebind hook for MPOL_DEFAULT: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
295 
296 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
297 {
298 	nodemask_t tmp;
299 
300 	if (pol->flags & MPOL_F_STATIC_NODES)
301 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
302 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
303 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
304 	else {
305 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
306 								*nodes);
307 		pol->w.cpuset_mems_allowed = tmp;
308 	}
309 
310 	if (nodes_empty(tmp))
311 		tmp = *nodes;
312 
313 	pol->v.nodes = tmp;
314 }
315 
316 static void mpol_rebind_preferred(struct mempolicy *pol,
317 						const nodemask_t *nodes)
318 {
319 	nodemask_t tmp;
320 
321 	if (pol->flags & MPOL_F_STATIC_NODES) {
322 		int node = first_node(pol->w.user_nodemask);
323 
324 		if (node_isset(node, *nodes)) {
325 			pol->v.preferred_node = node;
326 			pol->flags &= ~MPOL_F_LOCAL;
327 		} else
328 			pol->flags |= MPOL_F_LOCAL;
329 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
330 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
331 		pol->v.preferred_node = first_node(tmp);
332 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
333 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
334 						   pol->w.cpuset_mems_allowed,
335 						   *nodes);
336 		pol->w.cpuset_mems_allowed = *nodes;
337 	}
338 }
339 
340 /*
341  * mpol_rebind_policy - Migrate a policy to a different set of nodes
342  *
343  * Per-vma policies are protected by mmap_sem. Allocations using per-task
344  * policies are protected by task->mems_allowed_seq to prevent a premature
345  * OOM/allocation failure due to parallel nodemask modification.
346  */
347 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
348 {
349 	if (!pol)
350 		return;
351 	if (!mpol_store_user_nodemask(pol) &&
352 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
353 		return;
354 
355 	mpol_ops[pol->mode].rebind(pol, newmask);
356 }
357 
358 /*
359  * Wrapper for mpol_rebind_policy() that just requires task
360  * pointer, and updates task mempolicy.
361  *
362  * Called with task's alloc_lock held.
363  */
364 
365 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
366 {
367 	mpol_rebind_policy(tsk->mempolicy, new);
368 }
369 
370 /*
371  * Rebind each vma in mm to new nodemask.
372  *
373  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
374  */
375 
376 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
377 {
378 	struct vm_area_struct *vma;
379 
380 	down_write(&mm->mmap_sem);
381 	for (vma = mm->mmap; vma; vma = vma->vm_next)
382 		mpol_rebind_policy(vma->vm_policy, new);
383 	up_write(&mm->mmap_sem);
384 }
385 
/*
 * Per-mode operation table, indexed by policy mode.  MPOL_DEFAULT has no
 * .create hook; MPOL_INTERLEAVE and MPOL_BIND share the nodemask-based
 * rebind helper.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
403 
/* Forward declaration; the definition depends on CONFIG_MIGRATION. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/*
 * State shared by the queue_pages_*() page-table walk callbacks; they
 * reach it through mm_walk->private.
 */
struct queue_pages {
	struct list_head *pagelist;	/* list that selected pages are queued on */
	unsigned long flags;		/* MPOL_MF_* flags steering the walk */
	nodemask_t *nmask;		/* nodes each page's node is tested against */
	struct vm_area_struct *prev;	/* vma last seen by queue_pages_test_walk() */
};
413 
414 /*
415  * Scan through pages checking if pages follow certain conditions,
416  * and move them to the pagelist if they do.
417  */
418 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
419 			unsigned long end, struct mm_walk *walk)
420 {
421 	struct vm_area_struct *vma = walk->vma;
422 	struct page *page;
423 	struct queue_pages *qp = walk->private;
424 	unsigned long flags = qp->flags;
425 	int nid, ret;
426 	pte_t *pte;
427 	spinlock_t *ptl;
428 
429 	if (pmd_trans_huge(*pmd)) {
430 		ptl = pmd_lock(walk->mm, pmd);
431 		if (pmd_trans_huge(*pmd)) {
432 			page = pmd_page(*pmd);
433 			if (is_huge_zero_page(page)) {
434 				spin_unlock(ptl);
435 				__split_huge_pmd(vma, pmd, addr, false, NULL);
436 			} else {
437 				get_page(page);
438 				spin_unlock(ptl);
439 				lock_page(page);
440 				ret = split_huge_page(page);
441 				unlock_page(page);
442 				put_page(page);
443 				if (ret)
444 					return 0;
445 			}
446 		} else {
447 			spin_unlock(ptl);
448 		}
449 	}
450 
451 	if (pmd_trans_unstable(pmd))
452 		return 0;
453 retry:
454 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
455 	for (; addr != end; pte++, addr += PAGE_SIZE) {
456 		if (!pte_present(*pte))
457 			continue;
458 		page = vm_normal_page(vma, addr, *pte);
459 		if (!page)
460 			continue;
461 		/*
462 		 * vm_normal_page() filters out zero pages, but there might
463 		 * still be PageReserved pages to skip, perhaps in a VDSO.
464 		 */
465 		if (PageReserved(page))
466 			continue;
467 		nid = page_to_nid(page);
468 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
469 			continue;
470 		if (PageTransCompound(page)) {
471 			get_page(page);
472 			pte_unmap_unlock(pte, ptl);
473 			lock_page(page);
474 			ret = split_huge_page(page);
475 			unlock_page(page);
476 			put_page(page);
477 			/* Failed to split -- skip. */
478 			if (ret) {
479 				pte = pte_offset_map_lock(walk->mm, pmd,
480 						addr, &ptl);
481 				continue;
482 			}
483 			goto retry;
484 		}
485 
486 		migrate_page_add(page, qp->pagelist, flags);
487 	}
488 	pte_unmap_unlock(pte - 1, ptl);
489 	cond_resched();
490 	return 0;
491 }
492 
493 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
494 			       unsigned long addr, unsigned long end,
495 			       struct mm_walk *walk)
496 {
497 #ifdef CONFIG_HUGETLB_PAGE
498 	struct queue_pages *qp = walk->private;
499 	unsigned long flags = qp->flags;
500 	int nid;
501 	struct page *page;
502 	spinlock_t *ptl;
503 	pte_t entry;
504 
505 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
506 	entry = huge_ptep_get(pte);
507 	if (!pte_present(entry))
508 		goto unlock;
509 	page = pte_page(entry);
510 	nid = page_to_nid(page);
511 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
512 		goto unlock;
513 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
514 	if (flags & (MPOL_MF_MOVE_ALL) ||
515 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
516 		isolate_huge_page(page, qp->pagelist);
517 unlock:
518 	spin_unlock(ptl);
519 #else
520 	BUG();
521 #endif
522 	return 0;
523 }
524 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the value reported by change_protection() for [addr, end) of
 * @vma; a non-zero value is also accounted as NUMA_PTE_UPDATES.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	/*
	 * Keep the count in the same width as the return type; going
	 * through an int would truncate (or sign-extend) large counts.
	 */
	unsigned long nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Without NUMA balancing there is nothing to mark; report no updates. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
553 
554 static int queue_pages_test_walk(unsigned long start, unsigned long end,
555 				struct mm_walk *walk)
556 {
557 	struct vm_area_struct *vma = walk->vma;
558 	struct queue_pages *qp = walk->private;
559 	unsigned long endvma = vma->vm_end;
560 	unsigned long flags = qp->flags;
561 
562 	if (!vma_migratable(vma))
563 		return 1;
564 
565 	if (endvma > end)
566 		endvma = end;
567 	if (vma->vm_start > start)
568 		start = vma->vm_start;
569 
570 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
571 		if (!vma->vm_next && vma->vm_end < end)
572 			return -EFAULT;
573 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
574 			return -EFAULT;
575 	}
576 
577 	qp->prev = vma;
578 
579 	if (flags & MPOL_MF_LAZY) {
580 		/* Similar to task_numa_work, skip inaccessible VMAs */
581 		if (!is_vm_hugetlb_page(vma) &&
582 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
583 			!(vma->vm_flags & VM_MIXEDMAP))
584 			change_prot_numa(vma, start, endvma);
585 		return 1;
586 	}
587 
588 	/* queue pages from current vma */
589 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
590 		return 0;
591 	return 1;
592 }
593 
594 /*
595  * Walk through page tables and collect pages to be migrated.
596  *
597  * If pages found in a given range are on a set of nodes (determined by
598  * @nodes and @flags,) it's isolated and queued to the pagelist which is
599  * passed via @private.)
600  */
601 static int
602 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
603 		nodemask_t *nodes, unsigned long flags,
604 		struct list_head *pagelist)
605 {
606 	struct queue_pages qp = {
607 		.pagelist = pagelist,
608 		.flags = flags,
609 		.nmask = nodes,
610 		.prev = NULL,
611 	};
612 	struct mm_walk queue_pages_walk = {
613 		.hugetlb_entry = queue_pages_hugetlb,
614 		.pmd_entry = queue_pages_pte_range,
615 		.test_walk = queue_pages_test_walk,
616 		.mm = mm,
617 		.private = &qp,
618 	};
619 
620 	return walk_page_range(start, end, &queue_pages_walk);
621 }
622 
623 /*
624  * Apply policy to a single VMA
625  * This must be called with the mmap_sem held for writing.
626  */
627 static int vma_replace_policy(struct vm_area_struct *vma,
628 						struct mempolicy *pol)
629 {
630 	int err;
631 	struct mempolicy *old;
632 	struct mempolicy *new;
633 
634 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
635 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
636 		 vma->vm_ops, vma->vm_file,
637 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
638 
639 	new = mpol_dup(pol);
640 	if (IS_ERR(new))
641 		return PTR_ERR(new);
642 
643 	if (vma->vm_ops && vma->vm_ops->set_policy) {
644 		err = vma->vm_ops->set_policy(vma, new);
645 		if (err)
646 			goto err_out;
647 	}
648 
649 	old = vma->vm_policy;
650 	vma->vm_policy = new; /* protected by mmap_sem */
651 	mpol_put(old);
652 
653 	return 0;
654  err_out:
655 	mpol_put(new);
656 	return err;
657 }
658 
659 /* Step 2: apply policy to a range and do splits. */
660 static int mbind_range(struct mm_struct *mm, unsigned long start,
661 		       unsigned long end, struct mempolicy *new_pol)
662 {
663 	struct vm_area_struct *next;
664 	struct vm_area_struct *prev;
665 	struct vm_area_struct *vma;
666 	int err = 0;
667 	pgoff_t pgoff;
668 	unsigned long vmstart;
669 	unsigned long vmend;
670 
671 	vma = find_vma(mm, start);
672 	if (!vma || vma->vm_start > start)
673 		return -EFAULT;
674 
675 	prev = vma->vm_prev;
676 	if (start > vma->vm_start)
677 		prev = vma;
678 
679 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
680 		next = vma->vm_next;
681 		vmstart = max(start, vma->vm_start);
682 		vmend   = min(end, vma->vm_end);
683 
684 		if (mpol_equal(vma_policy(vma), new_pol))
685 			continue;
686 
687 		pgoff = vma->vm_pgoff +
688 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
689 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
690 				 vma->anon_vma, vma->vm_file, pgoff,
691 				 new_pol, vma->vm_userfaultfd_ctx);
692 		if (prev) {
693 			vma = prev;
694 			next = vma->vm_next;
695 			if (mpol_equal(vma_policy(vma), new_pol))
696 				continue;
697 			/* vma_merge() joined vma && vma->next, case 8 */
698 			goto replace;
699 		}
700 		if (vma->vm_start != vmstart) {
701 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
702 			if (err)
703 				goto out;
704 		}
705 		if (vma->vm_end != vmend) {
706 			err = split_vma(vma->vm_mm, vma, vmend, 0);
707 			if (err)
708 				goto out;
709 		}
710  replace:
711 		err = vma_replace_policy(vma, new_pol);
712 		if (err)
713 			goto out;
714 	}
715 
716  out:
717 	return err;
718 }
719 
720 /* Set the process memory policy */
721 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
722 			     nodemask_t *nodes)
723 {
724 	struct mempolicy *new, *old;
725 	NODEMASK_SCRATCH(scratch);
726 	int ret;
727 
728 	if (!scratch)
729 		return -ENOMEM;
730 
731 	new = mpol_new(mode, flags, nodes);
732 	if (IS_ERR(new)) {
733 		ret = PTR_ERR(new);
734 		goto out;
735 	}
736 
737 	task_lock(current);
738 	ret = mpol_set_nodemask(new, nodes, scratch);
739 	if (ret) {
740 		task_unlock(current);
741 		mpol_put(new);
742 		goto out;
743 	}
744 	old = current->mempolicy;
745 	current->mempolicy = new;
746 	if (new && new->mode == MPOL_INTERLEAVE)
747 		current->il_prev = MAX_NUMNODES-1;
748 	task_unlock(current);
749 	mpol_put(old);
750 	ret = 0;
751 out:
752 	NODEMASK_SCRATCH_FREE(scratch);
753 	return ret;
754 }
755 
756 /*
757  * Return nodemask for policy for get_mempolicy() query
758  *
759  * Called with task's alloc_lock held
760  */
761 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
762 {
763 	nodes_clear(*nodes);
764 	if (p == &default_policy)
765 		return;
766 
767 	switch (p->mode) {
768 	case MPOL_BIND:
769 		/* Fall through */
770 	case MPOL_INTERLEAVE:
771 		*nodes = p->v.nodes;
772 		break;
773 	case MPOL_PREFERRED:
774 		if (!(p->flags & MPOL_F_LOCAL))
775 			node_set(p->v.preferred_node, *nodes);
776 		/* else return empty node mask for local allocation */
777 		break;
778 	default:
779 		BUG();
780 	}
781 }
782 
783 static int lookup_node(unsigned long addr)
784 {
785 	struct page *p;
786 	int err;
787 
788 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
789 	if (err >= 0) {
790 		err = page_to_nid(p);
791 		put_page(p);
792 	}
793 	return err;
794 }
795 
796 /* Retrieve NUMA policy */
797 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
798 			     unsigned long addr, unsigned long flags)
799 {
800 	int err;
801 	struct mm_struct *mm = current->mm;
802 	struct vm_area_struct *vma = NULL;
803 	struct mempolicy *pol = current->mempolicy;
804 
805 	if (flags &
806 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
807 		return -EINVAL;
808 
809 	if (flags & MPOL_F_MEMS_ALLOWED) {
810 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
811 			return -EINVAL;
812 		*policy = 0;	/* just so it's initialized */
813 		task_lock(current);
814 		*nmask  = cpuset_current_mems_allowed;
815 		task_unlock(current);
816 		return 0;
817 	}
818 
819 	if (flags & MPOL_F_ADDR) {
820 		/*
821 		 * Do NOT fall back to task policy if the
822 		 * vma/shared policy at addr is NULL.  We
823 		 * want to return MPOL_DEFAULT in this case.
824 		 */
825 		down_read(&mm->mmap_sem);
826 		vma = find_vma_intersection(mm, addr, addr+1);
827 		if (!vma) {
828 			up_read(&mm->mmap_sem);
829 			return -EFAULT;
830 		}
831 		if (vma->vm_ops && vma->vm_ops->get_policy)
832 			pol = vma->vm_ops->get_policy(vma, addr);
833 		else
834 			pol = vma->vm_policy;
835 	} else if (addr)
836 		return -EINVAL;
837 
838 	if (!pol)
839 		pol = &default_policy;	/* indicates default behavior */
840 
841 	if (flags & MPOL_F_NODE) {
842 		if (flags & MPOL_F_ADDR) {
843 			err = lookup_node(addr);
844 			if (err < 0)
845 				goto out;
846 			*policy = err;
847 		} else if (pol == current->mempolicy &&
848 				pol->mode == MPOL_INTERLEAVE) {
849 			*policy = next_node_in(current->il_prev, pol->v.nodes);
850 		} else {
851 			err = -EINVAL;
852 			goto out;
853 		}
854 	} else {
855 		*policy = pol == &default_policy ? MPOL_DEFAULT :
856 						pol->mode;
857 		/*
858 		 * Internal mempolicy flags must be masked off before exposing
859 		 * the policy to userspace.
860 		 */
861 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
862 	}
863 
864 	if (vma) {
865 		up_read(&current->mm->mmap_sem);
866 		vma = NULL;
867 	}
868 
869 	err = 0;
870 	if (nmask) {
871 		if (mpol_store_user_nodemask(pol)) {
872 			*nmask = pol->w.user_nodemask;
873 		} else {
874 			task_lock(current);
875 			get_policy_nodemask(pol, nmask);
876 			task_unlock(current);
877 		}
878 	}
879 
880  out:
881 	mpol_cond_put(pol);
882 	if (vma)
883 		up_read(&current->mm->mmap_sem);
884 	return err;
885 }
886 
887 #ifdef CONFIG_MIGRATION
888 /*
889  * page migration
890  */
891 static void migrate_page_add(struct page *page, struct list_head *pagelist,
892 				unsigned long flags)
893 {
894 	/*
895 	 * Avoid migrating a page that is shared with others.
896 	 */
897 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
898 		if (!isolate_lru_page(page)) {
899 			list_add_tail(&page->lru, pagelist);
900 			inc_node_page_state(page, NR_ISOLATED_ANON +
901 					    page_is_file_cache(page));
902 		}
903 	}
904 }
905 
906 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
907 {
908 	if (PageHuge(page))
909 		return alloc_huge_page_node(page_hstate(compound_head(page)),
910 					node);
911 	else
912 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
913 						    __GFP_THISNODE, 0);
914 }
915 
916 /*
917  * Migrate pages from one node to a target node.
918  * Returns error or the number of pages not migrated.
919  */
920 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
921 			   int flags)
922 {
923 	nodemask_t nmask;
924 	LIST_HEAD(pagelist);
925 	int err = 0;
926 
927 	nodes_clear(nmask);
928 	node_set(source, nmask);
929 
930 	/*
931 	 * This does not "check" the range but isolates all pages that
932 	 * need migration.  Between passing in the full user address
933 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
934 	 */
935 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
936 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
937 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
938 
939 	if (!list_empty(&pagelist)) {
940 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
941 					MIGRATE_SYNC, MR_SYSCALL);
942 		if (err)
943 			putback_movable_pages(&pagelist);
944 	}
945 
946 	return err;
947 }
948 
949 /*
950  * Move pages between the two nodesets so as to preserve the physical
951  * layout as much as possible.
952  *
953  * Returns the number of page that could not be moved.
954  */
955 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
956 		     const nodemask_t *to, int flags)
957 {
958 	int busy = 0;
959 	int err;
960 	nodemask_t tmp;
961 
962 	err = migrate_prep();
963 	if (err)
964 		return err;
965 
966 	down_read(&mm->mmap_sem);
967 
968 	/*
969 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
970 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
971 	 * bit in 'tmp', and return that <source, dest> pair for migration.
972 	 * The pair of nodemasks 'to' and 'from' define the map.
973 	 *
974 	 * If no pair of bits is found that way, fallback to picking some
975 	 * pair of 'source' and 'dest' bits that are not the same.  If the
976 	 * 'source' and 'dest' bits are the same, this represents a node
977 	 * that will be migrating to itself, so no pages need move.
978 	 *
979 	 * If no bits are left in 'tmp', or if all remaining bits left
980 	 * in 'tmp' correspond to the same bit in 'to', return false
981 	 * (nothing left to migrate).
982 	 *
983 	 * This lets us pick a pair of nodes to migrate between, such that
984 	 * if possible the dest node is not already occupied by some other
985 	 * source node, minimizing the risk of overloading the memory on a
986 	 * node that would happen if we migrated incoming memory to a node
987 	 * before migrating outgoing memory source that same node.
988 	 *
989 	 * A single scan of tmp is sufficient.  As we go, we remember the
990 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
991 	 * that not only moved, but what's better, moved to an empty slot
992 	 * (d is not set in tmp), then we break out then, with that pair.
993 	 * Otherwise when we finish scanning from_tmp, we at least have the
994 	 * most recent <s, d> pair that moved.  If we get all the way through
995 	 * the scan of tmp without finding any node that moved, much less
996 	 * moved to an empty node, then there is nothing left worth migrating.
997 	 */
998 
999 	tmp = *from;
1000 	while (!nodes_empty(tmp)) {
1001 		int s,d;
1002 		int source = NUMA_NO_NODE;
1003 		int dest = 0;
1004 
1005 		for_each_node_mask(s, tmp) {
1006 
1007 			/*
1008 			 * do_migrate_pages() tries to maintain the relative
1009 			 * node relationship of the pages established between
1010 			 * threads and memory areas.
1011                          *
1012 			 * However if the number of source nodes is not equal to
1013 			 * the number of destination nodes we can not preserve
1014 			 * this node relative relationship.  In that case, skip
1015 			 * copying memory from a node that is in the destination
1016 			 * mask.
1017 			 *
1018 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1019 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1020 			 */
1021 
1022 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1023 						(node_isset(s, *to)))
1024 				continue;
1025 
1026 			d = node_remap(s, *from, *to);
1027 			if (s == d)
1028 				continue;
1029 
1030 			source = s;	/* Node moved. Memorize */
1031 			dest = d;
1032 
1033 			/* dest not in remaining from nodes? */
1034 			if (!node_isset(dest, tmp))
1035 				break;
1036 		}
1037 		if (source == NUMA_NO_NODE)
1038 			break;
1039 
1040 		node_clear(source, tmp);
1041 		err = migrate_to_node(mm, source, dest, flags);
1042 		if (err > 0)
1043 			busy += err;
1044 		if (err < 0)
1045 			break;
1046 	}
1047 	up_read(&mm->mmap_sem);
1048 	if (err < 0)
1049 		return err;
1050 	return busy;
1051 
1052 }
1053 
1054 /*
1055  * Allocate a new page for page migration based on vma policy.
1056  * Start by assuming the page is mapped by the same vma as contains @start.
1057  * Search forward from there, if not.  N.B., this assumes that the
1058  * list of pages handed to migrate_pages()--which is how we get here--
1059  * is in virtual address order.
1060  */
1061 static struct page *new_page(struct page *page, unsigned long start, int **x)
1062 {
1063 	struct vm_area_struct *vma;
1064 	unsigned long uninitialized_var(address);
1065 
1066 	vma = find_vma(current->mm, start);
1067 	while (vma) {
1068 		address = page_address_in_vma(page, vma);
1069 		if (address != -EFAULT)
1070 			break;
1071 		vma = vma->vm_next;
1072 	}
1073 
1074 	if (PageHuge(page)) {
1075 		BUG_ON(!vma);
1076 		return alloc_huge_page_noerr(vma, address, 1);
1077 	}
1078 	/*
1079 	 * if !vma, alloc_page_vma() will use task or system default policy
1080 	 */
1081 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1082 			vma, address);
1083 }
1084 #else
1085 
/*
 * Stub used in the #else branch of the surrounding conditional: nothing is
 * queued on @pagelist.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags)
{
	/* intentionally empty */
}
1090 
1091 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1092 		     const nodemask_t *to, int flags)
1093 {
1094 	return -ENOSYS;
1095 }
1096 
1097 static struct page *new_page(struct page *page, unsigned long start, int **x)
1098 {
1099 	return NULL;
1100 }
1101 #endif
1102 
1103 static long do_mbind(unsigned long start, unsigned long len,
1104 		     unsigned short mode, unsigned short mode_flags,
1105 		     nodemask_t *nmask, unsigned long flags)
1106 {
1107 	struct mm_struct *mm = current->mm;
1108 	struct mempolicy *new;
1109 	unsigned long end;
1110 	int err;
1111 	LIST_HEAD(pagelist);
1112 
1113 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1114 		return -EINVAL;
1115 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1116 		return -EPERM;
1117 
1118 	if (start & ~PAGE_MASK)
1119 		return -EINVAL;
1120 
1121 	if (mode == MPOL_DEFAULT)
1122 		flags &= ~MPOL_MF_STRICT;
1123 
1124 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1125 	end = start + len;
1126 
1127 	if (end < start)
1128 		return -EINVAL;
1129 	if (end == start)
1130 		return 0;
1131 
1132 	new = mpol_new(mode, mode_flags, nmask);
1133 	if (IS_ERR(new))
1134 		return PTR_ERR(new);
1135 
1136 	if (flags & MPOL_MF_LAZY)
1137 		new->flags |= MPOL_F_MOF;
1138 
1139 	/*
1140 	 * If we are using the default policy then operation
1141 	 * on discontinuous address spaces is okay after all
1142 	 */
1143 	if (!new)
1144 		flags |= MPOL_MF_DISCONTIG_OK;
1145 
1146 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1147 		 start, start + len, mode, mode_flags,
1148 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1149 
1150 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1151 
1152 		err = migrate_prep();
1153 		if (err)
1154 			goto mpol_out;
1155 	}
1156 	{
1157 		NODEMASK_SCRATCH(scratch);
1158 		if (scratch) {
1159 			down_write(&mm->mmap_sem);
1160 			task_lock(current);
1161 			err = mpol_set_nodemask(new, nmask, scratch);
1162 			task_unlock(current);
1163 			if (err)
1164 				up_write(&mm->mmap_sem);
1165 		} else
1166 			err = -ENOMEM;
1167 		NODEMASK_SCRATCH_FREE(scratch);
1168 	}
1169 	if (err)
1170 		goto mpol_out;
1171 
1172 	err = queue_pages_range(mm, start, end, nmask,
1173 			  flags | MPOL_MF_INVERT, &pagelist);
1174 	if (!err)
1175 		err = mbind_range(mm, start, end, new);
1176 
1177 	if (!err) {
1178 		int nr_failed = 0;
1179 
1180 		if (!list_empty(&pagelist)) {
1181 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1182 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1183 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1184 			if (nr_failed)
1185 				putback_movable_pages(&pagelist);
1186 		}
1187 
1188 		if (nr_failed && (flags & MPOL_MF_STRICT))
1189 			err = -EIO;
1190 	} else
1191 		putback_movable_pages(&pagelist);
1192 
1193 	up_write(&mm->mmap_sem);
1194  mpol_out:
1195 	mpol_put(new);
1196 	return err;
1197 }
1198 
1199 /*
1200  * User space interface with variable sized bitmaps for nodelists.
1201  */
1202 
1203 /* Copy a node mask from user space. */
1204 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1205 		     unsigned long maxnode)
1206 {
1207 	unsigned long k;
1208 	unsigned long nlongs;
1209 	unsigned long endmask;
1210 
1211 	--maxnode;
1212 	nodes_clear(*nodes);
1213 	if (maxnode == 0 || !nmask)
1214 		return 0;
1215 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1216 		return -EINVAL;
1217 
1218 	nlongs = BITS_TO_LONGS(maxnode);
1219 	if ((maxnode % BITS_PER_LONG) == 0)
1220 		endmask = ~0UL;
1221 	else
1222 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1223 
1224 	/* When the user specified more nodes than supported just check
1225 	   if the non supported part is all zero. */
1226 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1227 		if (nlongs > PAGE_SIZE/sizeof(long))
1228 			return -EINVAL;
1229 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1230 			unsigned long t;
1231 			if (get_user(t, nmask + k))
1232 				return -EFAULT;
1233 			if (k == nlongs - 1) {
1234 				if (t & endmask)
1235 					return -EINVAL;
1236 			} else if (t)
1237 				return -EINVAL;
1238 		}
1239 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1240 		endmask = ~0UL;
1241 	}
1242 
1243 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1244 		return -EFAULT;
1245 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1246 	return 0;
1247 }
1248 
1249 /* Copy a kernel node mask to user space */
1250 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1251 			      nodemask_t *nodes)
1252 {
1253 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1254 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1255 
1256 	if (copy > nbytes) {
1257 		if (copy > PAGE_SIZE)
1258 			return -EINVAL;
1259 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1260 			return -EFAULT;
1261 		copy = nbytes;
1262 	}
1263 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1264 }
1265 
1266 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1267 		unsigned long, mode, const unsigned long __user *, nmask,
1268 		unsigned long, maxnode, unsigned, flags)
1269 {
1270 	nodemask_t nodes;
1271 	int err;
1272 	unsigned short mode_flags;
1273 
1274 	mode_flags = mode & MPOL_MODE_FLAGS;
1275 	mode &= ~MPOL_MODE_FLAGS;
1276 	if (mode >= MPOL_MAX)
1277 		return -EINVAL;
1278 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1279 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1280 		return -EINVAL;
1281 	err = get_nodes(&nodes, nmask, maxnode);
1282 	if (err)
1283 		return err;
1284 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1285 }
1286 
1287 /* Set the process memory policy */
1288 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1289 		unsigned long, maxnode)
1290 {
1291 	int err;
1292 	nodemask_t nodes;
1293 	unsigned short flags;
1294 
1295 	flags = mode & MPOL_MODE_FLAGS;
1296 	mode &= ~MPOL_MODE_FLAGS;
1297 	if ((unsigned int)mode >= MPOL_MAX)
1298 		return -EINVAL;
1299 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1300 		return -EINVAL;
1301 	err = get_nodes(&nodes, nmask, maxnode);
1302 	if (err)
1303 		return err;
1304 	return do_set_mempolicy(mode, flags, &nodes);
1305 }
1306 
1307 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1308 		const unsigned long __user *, old_nodes,
1309 		const unsigned long __user *, new_nodes)
1310 {
1311 	const struct cred *cred = current_cred(), *tcred;
1312 	struct mm_struct *mm = NULL;
1313 	struct task_struct *task;
1314 	nodemask_t task_nodes;
1315 	int err;
1316 	nodemask_t *old;
1317 	nodemask_t *new;
1318 	NODEMASK_SCRATCH(scratch);
1319 
1320 	if (!scratch)
1321 		return -ENOMEM;
1322 
1323 	old = &scratch->mask1;
1324 	new = &scratch->mask2;
1325 
1326 	err = get_nodes(old, old_nodes, maxnode);
1327 	if (err)
1328 		goto out;
1329 
1330 	err = get_nodes(new, new_nodes, maxnode);
1331 	if (err)
1332 		goto out;
1333 
1334 	/* Find the mm_struct */
1335 	rcu_read_lock();
1336 	task = pid ? find_task_by_vpid(pid) : current;
1337 	if (!task) {
1338 		rcu_read_unlock();
1339 		err = -ESRCH;
1340 		goto out;
1341 	}
1342 	get_task_struct(task);
1343 
1344 	err = -EINVAL;
1345 
1346 	/*
1347 	 * Check if this process has the right to modify the specified
1348 	 * process. The right exists if the process has administrative
1349 	 * capabilities, superuser privileges or the same
1350 	 * userid as the target process.
1351 	 */
1352 	tcred = __task_cred(task);
1353 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1354 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1355 	    !capable(CAP_SYS_NICE)) {
1356 		rcu_read_unlock();
1357 		err = -EPERM;
1358 		goto out_put;
1359 	}
1360 	rcu_read_unlock();
1361 
1362 	task_nodes = cpuset_mems_allowed(task);
1363 	/* Is the user allowed to access the target nodes? */
1364 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1365 		err = -EPERM;
1366 		goto out_put;
1367 	}
1368 
1369 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1370 		err = -EINVAL;
1371 		goto out_put;
1372 	}
1373 
1374 	err = security_task_movememory(task);
1375 	if (err)
1376 		goto out_put;
1377 
1378 	mm = get_task_mm(task);
1379 	put_task_struct(task);
1380 
1381 	if (!mm) {
1382 		err = -EINVAL;
1383 		goto out;
1384 	}
1385 
1386 	err = do_migrate_pages(mm, old, new,
1387 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1388 
1389 	mmput(mm);
1390 out:
1391 	NODEMASK_SCRATCH_FREE(scratch);
1392 
1393 	return err;
1394 
1395 out_put:
1396 	put_task_struct(task);
1397 	goto out;
1398 
1399 }
1400 
1401 
1402 /* Retrieve NUMA policy */
1403 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1404 		unsigned long __user *, nmask, unsigned long, maxnode,
1405 		unsigned long, addr, unsigned long, flags)
1406 {
1407 	int err;
1408 	int uninitialized_var(pval);
1409 	nodemask_t nodes;
1410 
1411 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1412 		return -EINVAL;
1413 
1414 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1415 
1416 	if (err)
1417 		return err;
1418 
1419 	if (policy && put_user(pval, policy))
1420 		return -EFAULT;
1421 
1422 	if (nmask)
1423 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1424 
1425 	return err;
1426 }
1427 
1428 #ifdef CONFIG_COMPAT
1429 
1430 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1431 		       compat_ulong_t __user *, nmask,
1432 		       compat_ulong_t, maxnode,
1433 		       compat_ulong_t, addr, compat_ulong_t, flags)
1434 {
1435 	long err;
1436 	unsigned long __user *nm = NULL;
1437 	unsigned long nr_bits, alloc_size;
1438 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1439 
1440 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1441 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1442 
1443 	if (nmask)
1444 		nm = compat_alloc_user_space(alloc_size);
1445 
1446 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1447 
1448 	if (!err && nmask) {
1449 		unsigned long copy_size;
1450 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1451 		err = copy_from_user(bm, nm, copy_size);
1452 		/* ensure entire bitmap is zeroed */
1453 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1454 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1455 	}
1456 
1457 	return err;
1458 }
1459 
1460 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1461 		       compat_ulong_t, maxnode)
1462 {
1463 	unsigned long __user *nm = NULL;
1464 	unsigned long nr_bits, alloc_size;
1465 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1466 
1467 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1468 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1469 
1470 	if (nmask) {
1471 		if (compat_get_bitmap(bm, nmask, nr_bits))
1472 			return -EFAULT;
1473 		nm = compat_alloc_user_space(alloc_size);
1474 		if (copy_to_user(nm, bm, alloc_size))
1475 			return -EFAULT;
1476 	}
1477 
1478 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1479 }
1480 
1481 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1482 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1483 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1484 {
1485 	unsigned long __user *nm = NULL;
1486 	unsigned long nr_bits, alloc_size;
1487 	nodemask_t bm;
1488 
1489 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1490 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1491 
1492 	if (nmask) {
1493 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1494 			return -EFAULT;
1495 		nm = compat_alloc_user_space(alloc_size);
1496 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1497 			return -EFAULT;
1498 	}
1499 
1500 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1501 }
1502 
1503 #endif
1504 
1505 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1506 						unsigned long addr)
1507 {
1508 	struct mempolicy *pol = NULL;
1509 
1510 	if (vma) {
1511 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1512 			pol = vma->vm_ops->get_policy(vma, addr);
1513 		} else if (vma->vm_policy) {
1514 			pol = vma->vm_policy;
1515 
1516 			/*
1517 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1518 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1519 			 * count on these policies which will be dropped by
1520 			 * mpol_cond_put() later
1521 			 */
1522 			if (mpol_needs_cond_ref(pol))
1523 				mpol_get(pol);
1524 		}
1525 	}
1526 
1527 	return pol;
1528 }
1529 
1530 /*
1531  * get_vma_policy(@vma, @addr)
1532  * @vma: virtual memory area whose policy is sought
1533  * @addr: address in @vma for shared policy lookup
1534  *
1535  * Returns effective policy for a VMA at specified address.
1536  * Falls back to current->mempolicy or system default policy, as necessary.
1537  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1538  * count--added by the get_policy() vm_op, as appropriate--to protect against
1539  * freeing by another task.  It is the caller's responsibility to free the
1540  * extra reference for shared policies.
1541  */
1542 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1543 						unsigned long addr)
1544 {
1545 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1546 
1547 	if (!pol)
1548 		pol = get_task_policy(current);
1549 
1550 	return pol;
1551 }
1552 
1553 bool vma_policy_mof(struct vm_area_struct *vma)
1554 {
1555 	struct mempolicy *pol;
1556 
1557 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1558 		bool ret = false;
1559 
1560 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1561 		if (pol && (pol->flags & MPOL_F_MOF))
1562 			ret = true;
1563 		mpol_cond_put(pol);
1564 
1565 		return ret;
1566 	}
1567 
1568 	pol = vma->vm_policy;
1569 	if (!pol)
1570 		pol = get_task_policy(current);
1571 
1572 	return pol->flags & MPOL_F_MOF;
1573 }
1574 
1575 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1576 {
1577 	enum zone_type dynamic_policy_zone = policy_zone;
1578 
1579 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1580 
1581 	/*
1582 	 * if policy->v.nodes has movable memory only,
1583 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1584 	 *
1585 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1586 	 * so if the following test faile, it implies
1587 	 * policy->v.nodes has movable memory only.
1588 	 */
1589 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1590 		dynamic_policy_zone = ZONE_MOVABLE;
1591 
1592 	return zone >= dynamic_policy_zone;
1593 }
1594 
1595 /*
1596  * Return a nodemask representing a mempolicy for filtering nodes for
1597  * page allocation
1598  */
1599 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1600 {
1601 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1602 	if (unlikely(policy->mode == MPOL_BIND) &&
1603 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1604 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1605 		return &policy->v.nodes;
1606 
1607 	return NULL;
1608 }
1609 
1610 /* Return the node id preferred by the given mempolicy, or the given id */
1611 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1612 								int nd)
1613 {
1614 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1615 		nd = policy->v.preferred_node;
1616 	else {
1617 		/*
1618 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1619 		 * because we might easily break the expectation to stay on the
1620 		 * requested node and not break the policy.
1621 		 */
1622 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1623 	}
1624 
1625 	return nd;
1626 }
1627 
1628 /* Do dynamic interleaving for a process */
1629 static unsigned interleave_nodes(struct mempolicy *policy)
1630 {
1631 	unsigned next;
1632 	struct task_struct *me = current;
1633 
1634 	next = next_node_in(me->il_prev, policy->v.nodes);
1635 	if (next < MAX_NUMNODES)
1636 		me->il_prev = next;
1637 	return next;
1638 }
1639 
1640 /*
1641  * Depending on the memory policy provide a node from which to allocate the
1642  * next slab entry.
1643  */
1644 unsigned int mempolicy_slab_node(void)
1645 {
1646 	struct mempolicy *policy;
1647 	int node = numa_mem_id();
1648 
1649 	if (in_interrupt())
1650 		return node;
1651 
1652 	policy = current->mempolicy;
1653 	if (!policy || policy->flags & MPOL_F_LOCAL)
1654 		return node;
1655 
1656 	switch (policy->mode) {
1657 	case MPOL_PREFERRED:
1658 		/*
1659 		 * handled MPOL_F_LOCAL above
1660 		 */
1661 		return policy->v.preferred_node;
1662 
1663 	case MPOL_INTERLEAVE:
1664 		return interleave_nodes(policy);
1665 
1666 	case MPOL_BIND: {
1667 		struct zoneref *z;
1668 
1669 		/*
1670 		 * Follow bind policy behavior and start allocation at the
1671 		 * first node.
1672 		 */
1673 		struct zonelist *zonelist;
1674 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1675 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1676 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1677 							&policy->v.nodes);
1678 		return z->zone ? z->zone->node : node;
1679 	}
1680 
1681 	default:
1682 		BUG();
1683 	}
1684 }
1685 
1686 /*
1687  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1688  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1689  * number of present nodes.
1690  */
1691 static unsigned offset_il_node(struct mempolicy *pol,
1692 			       struct vm_area_struct *vma, unsigned long n)
1693 {
1694 	unsigned nnodes = nodes_weight(pol->v.nodes);
1695 	unsigned target;
1696 	int i;
1697 	int nid;
1698 
1699 	if (!nnodes)
1700 		return numa_node_id();
1701 	target = (unsigned int)n % nnodes;
1702 	nid = first_node(pol->v.nodes);
1703 	for (i = 0; i < target; i++)
1704 		nid = next_node(nid, pol->v.nodes);
1705 	return nid;
1706 }
1707 
1708 /* Determine a node number for interleave */
1709 static inline unsigned interleave_nid(struct mempolicy *pol,
1710 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1711 {
1712 	if (vma) {
1713 		unsigned long off;
1714 
1715 		/*
1716 		 * for small pages, there is no difference between
1717 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1718 		 * for huge pages, since vm_pgoff is in units of small
1719 		 * pages, we need to shift off the always 0 bits to get
1720 		 * a useful offset.
1721 		 */
1722 		BUG_ON(shift < PAGE_SHIFT);
1723 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1724 		off += (addr - vma->vm_start) >> shift;
1725 		return offset_il_node(pol, vma, off);
1726 	} else
1727 		return interleave_nodes(pol);
1728 }
1729 
1730 #ifdef CONFIG_HUGETLBFS
1731 /*
1732  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1733  * @vma: virtual memory area whose policy is sought
1734  * @addr: address in @vma for shared policy lookup and interleave policy
1735  * @gfp_flags: for requested zone
1736  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1737  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1738  *
1739  * Returns a nid suitable for a huge page allocation and a pointer
1740  * to the struct mempolicy for conditional unref after allocation.
1741  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1742  * @nodemask for filtering the zonelist.
1743  *
1744  * Must be protected by read_mems_allowed_begin()
1745  */
1746 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1747 				struct mempolicy **mpol, nodemask_t **nodemask)
1748 {
1749 	int nid;
1750 
1751 	*mpol = get_vma_policy(vma, addr);
1752 	*nodemask = NULL;	/* assume !MPOL_BIND */
1753 
1754 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1755 		nid = interleave_nid(*mpol, vma, addr,
1756 					huge_page_shift(hstate_vma(vma)));
1757 	} else {
1758 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1759 		if ((*mpol)->mode == MPOL_BIND)
1760 			*nodemask = &(*mpol)->v.nodes;
1761 	}
1762 	return nid;
1763 }
1764 
1765 /*
1766  * init_nodemask_of_mempolicy
1767  *
1768  * If the current task's mempolicy is "default" [NULL], return 'false'
1769  * to indicate default policy.  Otherwise, extract the policy nodemask
1770  * for 'bind' or 'interleave' policy into the argument nodemask, or
1771  * initialize the argument nodemask to contain the single node for
1772  * 'preferred' or 'local' policy and return 'true' to indicate presence
1773  * of non-default mempolicy.
1774  *
1775  * We don't bother with reference counting the mempolicy [mpol_get/put]
1776  * because the current task is examining it's own mempolicy and a task's
1777  * mempolicy is only ever changed by the task itself.
1778  *
1779  * N.B., it is the caller's responsibility to free a returned nodemask.
1780  */
1781 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1782 {
1783 	struct mempolicy *mempolicy;
1784 	int nid;
1785 
1786 	if (!(mask && current->mempolicy))
1787 		return false;
1788 
1789 	task_lock(current);
1790 	mempolicy = current->mempolicy;
1791 	switch (mempolicy->mode) {
1792 	case MPOL_PREFERRED:
1793 		if (mempolicy->flags & MPOL_F_LOCAL)
1794 			nid = numa_node_id();
1795 		else
1796 			nid = mempolicy->v.preferred_node;
1797 		init_nodemask_of_node(mask, nid);
1798 		break;
1799 
1800 	case MPOL_BIND:
1801 		/* Fall through */
1802 	case MPOL_INTERLEAVE:
1803 		*mask =  mempolicy->v.nodes;
1804 		break;
1805 
1806 	default:
1807 		BUG();
1808 	}
1809 	task_unlock(current);
1810 
1811 	return true;
1812 }
1813 #endif
1814 
1815 /*
1816  * mempolicy_nodemask_intersects
1817  *
1818  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1819  * policy.  Otherwise, check for intersection between mask and the policy
1820  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1821  * policy, always return true since it may allocate elsewhere on fallback.
1822  *
1823  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1824  */
1825 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1826 					const nodemask_t *mask)
1827 {
1828 	struct mempolicy *mempolicy;
1829 	bool ret = true;
1830 
1831 	if (!mask)
1832 		return ret;
1833 	task_lock(tsk);
1834 	mempolicy = tsk->mempolicy;
1835 	if (!mempolicy)
1836 		goto out;
1837 
1838 	switch (mempolicy->mode) {
1839 	case MPOL_PREFERRED:
1840 		/*
1841 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1842 		 * allocate from, they may fallback to other nodes when oom.
1843 		 * Thus, it's possible for tsk to have allocated memory from
1844 		 * nodes in mask.
1845 		 */
1846 		break;
1847 	case MPOL_BIND:
1848 	case MPOL_INTERLEAVE:
1849 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1850 		break;
1851 	default:
1852 		BUG();
1853 	}
1854 out:
1855 	task_unlock(tsk);
1856 	return ret;
1857 }
1858 
1859 /* Allocate a page in interleaved policy.
1860    Own path because it needs to do special accounting. */
1861 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1862 					unsigned nid)
1863 {
1864 	struct page *page;
1865 
1866 	page = __alloc_pages(gfp, order, nid);
1867 	if (page && page_to_nid(page) == nid)
1868 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1869 	return page;
1870 }
1871 
1872 /**
1873  * 	alloc_pages_vma	- Allocate a page for a VMA.
1874  *
1875  * 	@gfp:
1876  *      %GFP_USER    user allocation.
1877  *      %GFP_KERNEL  kernel allocations,
1878  *      %GFP_HIGHMEM highmem/user allocations,
1879  *      %GFP_FS      allocation should not call back into a file system.
1880  *      %GFP_ATOMIC  don't sleep.
1881  *
1882  *	@order:Order of the GFP allocation.
1883  * 	@vma:  Pointer to VMA or NULL if not available.
1884  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1885  *	@node: Which node to prefer for allocation (modulo policy).
1886  *	@hugepage: for hugepages try only the preferred node if possible
1887  *
1888  * 	This function allocates a page from the kernel page pool and applies
1889  *	a NUMA policy associated with the VMA or the current process.
1890  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1891  *	mm_struct of the VMA to prevent it from going away. Should be used for
1892  *	all allocations for pages that will be mapped into user space. Returns
1893  *	NULL when no page can be allocated.
1894  */
1895 struct page *
1896 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1897 		unsigned long addr, int node, bool hugepage)
1898 {
1899 	struct mempolicy *pol;
1900 	struct page *page;
1901 	int preferred_nid;
1902 	nodemask_t *nmask;
1903 
1904 	pol = get_vma_policy(vma, addr);
1905 
1906 	if (pol->mode == MPOL_INTERLEAVE) {
1907 		unsigned nid;
1908 
1909 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1910 		mpol_cond_put(pol);
1911 		page = alloc_page_interleave(gfp, order, nid);
1912 		goto out;
1913 	}
1914 
1915 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1916 		int hpage_node = node;
1917 
1918 		/*
1919 		 * For hugepage allocation and non-interleave policy which
1920 		 * allows the current node (or other explicitly preferred
1921 		 * node) we only try to allocate from the current/preferred
1922 		 * node and don't fall back to other nodes, as the cost of
1923 		 * remote accesses would likely offset THP benefits.
1924 		 *
1925 		 * If the policy is interleave, or does not allow the current
1926 		 * node in its nodemask, we allocate the standard way.
1927 		 */
1928 		if (pol->mode == MPOL_PREFERRED &&
1929 						!(pol->flags & MPOL_F_LOCAL))
1930 			hpage_node = pol->v.preferred_node;
1931 
1932 		nmask = policy_nodemask(gfp, pol);
1933 		if (!nmask || node_isset(hpage_node, *nmask)) {
1934 			mpol_cond_put(pol);
1935 			page = __alloc_pages_node(hpage_node,
1936 						gfp | __GFP_THISNODE, order);
1937 			goto out;
1938 		}
1939 	}
1940 
1941 	nmask = policy_nodemask(gfp, pol);
1942 	preferred_nid = policy_node(gfp, pol, node);
1943 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
1944 	mpol_cond_put(pol);
1945 out:
1946 	return page;
1947 }
1948 
1949 /**
1950  * 	alloc_pages_current - Allocate pages.
1951  *
1952  *	@gfp:
1953  *		%GFP_USER   user allocation,
1954  *      	%GFP_KERNEL kernel allocation,
1955  *      	%GFP_HIGHMEM highmem allocation,
1956  *      	%GFP_FS     don't call back into a file system.
1957  *      	%GFP_ATOMIC don't sleep.
1958  *	@order: Power of two of allocation size in pages. 0 is a single page.
1959  *
1960  *	Allocate a page from the kernel page pool.  When not in
1961  *	interrupt context and apply the current process NUMA policy.
1962  *	Returns NULL when no page can be allocated.
1963  */
1964 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1965 {
1966 	struct mempolicy *pol = &default_policy;
1967 	struct page *page;
1968 
1969 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
1970 		pol = get_task_policy(current);
1971 
1972 	/*
1973 	 * No reference counting needed for current->mempolicy
1974 	 * nor system default_policy
1975 	 */
1976 	if (pol->mode == MPOL_INTERLEAVE)
1977 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1978 	else
1979 		page = __alloc_pages_nodemask(gfp, order,
1980 				policy_node(gfp, pol, numa_node_id()),
1981 				policy_nodemask(gfp, pol));
1982 
1983 	return page;
1984 }
1985 EXPORT_SYMBOL(alloc_pages_current);
1986 
1987 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
1988 {
1989 	struct mempolicy *pol = mpol_dup(vma_policy(src));
1990 
1991 	if (IS_ERR(pol))
1992 		return PTR_ERR(pol);
1993 	dst->vm_policy = pol;
1994 	return 0;
1995 }
1996 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2007 
2008 /* Slow path of a mempolicy duplicate */
2009 struct mempolicy *__mpol_dup(struct mempolicy *old)
2010 {
2011 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2012 
2013 	if (!new)
2014 		return ERR_PTR(-ENOMEM);
2015 
2016 	/* task's mempolicy is protected by alloc_lock */
2017 	if (old == current->mempolicy) {
2018 		task_lock(current);
2019 		*new = *old;
2020 		task_unlock(current);
2021 	} else
2022 		*new = *old;
2023 
2024 	if (current_cpuset_is_being_rebound()) {
2025 		nodemask_t mems = cpuset_mems_allowed(current);
2026 		mpol_rebind_policy(new, &mems);
2027 	}
2028 	atomic_set(&new->refcnt, 1);
2029 	return new;
2030 }
2031 
2032 /* Slow path of a mempolicy comparison */
2033 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2034 {
2035 	if (!a || !b)
2036 		return false;
2037 	if (a->mode != b->mode)
2038 		return false;
2039 	if (a->flags != b->flags)
2040 		return false;
2041 	if (mpol_store_user_nodemask(a))
2042 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2043 			return false;
2044 
2045 	switch (a->mode) {
2046 	case MPOL_BIND:
2047 		/* Fall through */
2048 	case MPOL_INTERLEAVE:
2049 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2050 	case MPOL_PREFERRED:
2051 		return a->v.preferred_node == b->v.preferred_node;
2052 	default:
2053 		BUG();
2054 		return false;
2055 	}
2056 }
2057 
2058 /*
2059  * Shared memory backing store policy support.
2060  *
2061  * Remember policies even when nobody has shared memory mapped.
2062  * The policies are kept in Red-Black tree linked from the inode.
2063  * They are protected by the sp->lock rwlock, which should be held
2064  * for any accesses to the tree.
2065  */
2066 
2067 /*
2068  * lookup first element intersecting start-end.  Caller holds sp->lock for
2069  * reading or for writing
2070  */
2071 static struct sp_node *
2072 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2073 {
2074 	struct rb_node *n = sp->root.rb_node;
2075 
2076 	while (n) {
2077 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2078 
2079 		if (start >= p->end)
2080 			n = n->rb_right;
2081 		else if (end <= p->start)
2082 			n = n->rb_left;
2083 		else
2084 			break;
2085 	}
2086 	if (!n)
2087 		return NULL;
2088 	for (;;) {
2089 		struct sp_node *w = NULL;
2090 		struct rb_node *prev = rb_prev(n);
2091 		if (!prev)
2092 			break;
2093 		w = rb_entry(prev, struct sp_node, nd);
2094 		if (w->end <= start)
2095 			break;
2096 		n = prev;
2097 	}
2098 	return rb_entry(n, struct sp_node, nd);
2099 }
2100 
2101 /*
2102  * Insert a new shared policy into the list.  Caller holds sp->lock for
2103  * writing.
2104  */
2105 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2106 {
2107 	struct rb_node **p = &sp->root.rb_node;
2108 	struct rb_node *parent = NULL;
2109 	struct sp_node *nd;
2110 
2111 	while (*p) {
2112 		parent = *p;
2113 		nd = rb_entry(parent, struct sp_node, nd);
2114 		if (new->start < nd->start)
2115 			p = &(*p)->rb_left;
2116 		else if (new->end > nd->end)
2117 			p = &(*p)->rb_right;
2118 		else
2119 			BUG();
2120 	}
2121 	rb_link_node(&new->nd, parent, p);
2122 	rb_insert_color(&new->nd, &sp->root);
2123 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2124 		 new->policy ? new->policy->mode : 0);
2125 }
2126 
2127 /* Find shared policy intersecting idx */
2128 struct mempolicy *
2129 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2130 {
2131 	struct mempolicy *pol = NULL;
2132 	struct sp_node *sn;
2133 
2134 	if (!sp->root.rb_node)
2135 		return NULL;
2136 	read_lock(&sp->lock);
2137 	sn = sp_lookup(sp, idx, idx+1);
2138 	if (sn) {
2139 		mpol_get(sn->policy);
2140 		pol = sn->policy;
2141 	}
2142 	read_unlock(&sp->lock);
2143 	return pol;
2144 }
2145 
2146 static void sp_free(struct sp_node *n)
2147 {
2148 	mpol_put(n->policy);
2149 	kmem_cache_free(sn_cache, n);
2150 }
2151 
2152 /**
2153  * mpol_misplaced - check whether current page node is valid in policy
2154  *
2155  * @page: page to be checked
2156  * @vma: vm area where page mapped
2157  * @addr: virtual address where page mapped
2158  *
2159  * Lookup current policy node id for vma,addr and "compare to" page's
2160  * node id.
2161  *
2162  * Returns:
2163  *	-1	- not misplaced, page is in the right node
2164  *	node	- node id where the page should be
2165  *
2166  * Policy determination "mimics" alloc_page_vma().
2167  * Called from fault path where we know the vma and faulting address.
2168  */
2169 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2170 {
2171 	struct mempolicy *pol;
2172 	struct zoneref *z;
2173 	int curnid = page_to_nid(page);
2174 	unsigned long pgoff;
2175 	int thiscpu = raw_smp_processor_id();
2176 	int thisnid = cpu_to_node(thiscpu);
2177 	int polnid = -1;
2178 	int ret = -1;
2179 
2180 	BUG_ON(!vma);
2181 
2182 	pol = get_vma_policy(vma, addr);
2183 	if (!(pol->flags & MPOL_F_MOF))
2184 		goto out;
2185 
2186 	switch (pol->mode) {
2187 	case MPOL_INTERLEAVE:
2188 		BUG_ON(addr >= vma->vm_end);
2189 		BUG_ON(addr < vma->vm_start);
2190 
2191 		pgoff = vma->vm_pgoff;
2192 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2193 		polnid = offset_il_node(pol, vma, pgoff);
2194 		break;
2195 
2196 	case MPOL_PREFERRED:
2197 		if (pol->flags & MPOL_F_LOCAL)
2198 			polnid = numa_node_id();
2199 		else
2200 			polnid = pol->v.preferred_node;
2201 		break;
2202 
2203 	case MPOL_BIND:
2204 
2205 		/*
2206 		 * allows binding to multiple nodes.
2207 		 * use current page if in policy nodemask,
2208 		 * else select nearest allowed node, if any.
2209 		 * If no allowed nodes, use current [!misplaced].
2210 		 */
2211 		if (node_isset(curnid, pol->v.nodes))
2212 			goto out;
2213 		z = first_zones_zonelist(
2214 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2215 				gfp_zone(GFP_HIGHUSER),
2216 				&pol->v.nodes);
2217 		polnid = z->zone->node;
2218 		break;
2219 
2220 	default:
2221 		BUG();
2222 	}
2223 
2224 	/* Migrate the page towards the node whose CPU is referencing it */
2225 	if (pol->flags & MPOL_F_MORON) {
2226 		polnid = thisnid;
2227 
2228 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2229 			goto out;
2230 	}
2231 
2232 	if (curnid != polnid)
2233 		ret = polnid;
2234 out:
2235 	mpol_cond_put(pol);
2236 
2237 	return ret;
2238 }
2239 
2240 /*
2241  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2242  * dropped after task->mempolicy is set to NULL so that any allocation done as
2243  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2244  * policy.
2245  */
2246 void mpol_put_task_policy(struct task_struct *task)
2247 {
2248 	struct mempolicy *pol;
2249 
2250 	task_lock(task);
2251 	pol = task->mempolicy;
2252 	task->mempolicy = NULL;
2253 	task_unlock(task);
2254 	mpol_put(pol);
2255 }
2256 
2257 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2258 {
2259 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2260 	rb_erase(&n->nd, &sp->root);
2261 	sp_free(n);
2262 }
2263 
2264 static void sp_node_init(struct sp_node *node, unsigned long start,
2265 			unsigned long end, struct mempolicy *pol)
2266 {
2267 	node->start = start;
2268 	node->end = end;
2269 	node->policy = pol;
2270 }
2271 
2272 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2273 				struct mempolicy *pol)
2274 {
2275 	struct sp_node *n;
2276 	struct mempolicy *newpol;
2277 
2278 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2279 	if (!n)
2280 		return NULL;
2281 
2282 	newpol = mpol_dup(pol);
2283 	if (IS_ERR(newpol)) {
2284 		kmem_cache_free(sn_cache, n);
2285 		return NULL;
2286 	}
2287 	newpol->flags |= MPOL_F_SHARED;
2288 	sp_node_init(n, start, end, newpol);
2289 
2290 	return n;
2291 }
2292 
2293 /* Replace a policy range. */
2294 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2295 				 unsigned long end, struct sp_node *new)
2296 {
2297 	struct sp_node *n;
2298 	struct sp_node *n_new = NULL;
2299 	struct mempolicy *mpol_new = NULL;
2300 	int ret = 0;
2301 
2302 restart:
2303 	write_lock(&sp->lock);
2304 	n = sp_lookup(sp, start, end);
2305 	/* Take care of old policies in the same range. */
2306 	while (n && n->start < end) {
2307 		struct rb_node *next = rb_next(&n->nd);
2308 		if (n->start >= start) {
2309 			if (n->end <= end)
2310 				sp_delete(sp, n);
2311 			else
2312 				n->start = end;
2313 		} else {
2314 			/* Old policy spanning whole new range. */
2315 			if (n->end > end) {
2316 				if (!n_new)
2317 					goto alloc_new;
2318 
2319 				*mpol_new = *n->policy;
2320 				atomic_set(&mpol_new->refcnt, 1);
2321 				sp_node_init(n_new, end, n->end, mpol_new);
2322 				n->end = start;
2323 				sp_insert(sp, n_new);
2324 				n_new = NULL;
2325 				mpol_new = NULL;
2326 				break;
2327 			} else
2328 				n->end = start;
2329 		}
2330 		if (!next)
2331 			break;
2332 		n = rb_entry(next, struct sp_node, nd);
2333 	}
2334 	if (new)
2335 		sp_insert(sp, new);
2336 	write_unlock(&sp->lock);
2337 	ret = 0;
2338 
2339 err_out:
2340 	if (mpol_new)
2341 		mpol_put(mpol_new);
2342 	if (n_new)
2343 		kmem_cache_free(sn_cache, n_new);
2344 
2345 	return ret;
2346 
2347 alloc_new:
2348 	write_unlock(&sp->lock);
2349 	ret = -ENOMEM;
2350 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2351 	if (!n_new)
2352 		goto err_out;
2353 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2354 	if (!mpol_new)
2355 		goto err_out;
2356 	goto restart;
2357 }
2358 
2359 /**
2360  * mpol_shared_policy_init - initialize shared policy for inode
2361  * @sp: pointer to inode shared policy
2362  * @mpol:  struct mempolicy to install
2363  *
2364  * Install non-NULL @mpol in inode's shared policy rb-tree.
2365  * On entry, the current task has a reference on a non-NULL @mpol.
2366  * This must be released on exit.
2367  * This is called at get_inode() calls and we can use GFP_KERNEL.
2368  */
2369 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2370 {
2371 	int ret;
2372 
2373 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2374 	rwlock_init(&sp->lock);
2375 
2376 	if (mpol) {
2377 		struct vm_area_struct pvma;
2378 		struct mempolicy *new;
2379 		NODEMASK_SCRATCH(scratch);
2380 
2381 		if (!scratch)
2382 			goto put_mpol;
2383 		/* contextualize the tmpfs mount point mempolicy */
2384 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2385 		if (IS_ERR(new))
2386 			goto free_scratch; /* no valid nodemask intersection */
2387 
2388 		task_lock(current);
2389 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2390 		task_unlock(current);
2391 		if (ret)
2392 			goto put_new;
2393 
2394 		/* Create pseudo-vma that contains just the policy */
2395 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2396 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2397 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2398 
2399 put_new:
2400 		mpol_put(new);			/* drop initial ref */
2401 free_scratch:
2402 		NODEMASK_SCRATCH_FREE(scratch);
2403 put_mpol:
2404 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2405 	}
2406 }
2407 
2408 int mpol_set_shared_policy(struct shared_policy *info,
2409 			struct vm_area_struct *vma, struct mempolicy *npol)
2410 {
2411 	int err;
2412 	struct sp_node *new = NULL;
2413 	unsigned long sz = vma_pages(vma);
2414 
2415 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2416 		 vma->vm_pgoff,
2417 		 sz, npol ? npol->mode : -1,
2418 		 npol ? npol->flags : -1,
2419 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2420 
2421 	if (npol) {
2422 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2423 		if (!new)
2424 			return -ENOMEM;
2425 	}
2426 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2427 	if (err && new)
2428 		sp_free(new);
2429 	return err;
2430 }
2431 
2432 /* Free a backing policy store on inode delete. */
2433 void mpol_free_shared_policy(struct shared_policy *p)
2434 {
2435 	struct sp_node *n;
2436 	struct rb_node *next;
2437 
2438 	if (!p->root.rb_node)
2439 		return;
2440 	write_lock(&p->lock);
2441 	next = rb_first(&p->root);
2442 	while (next) {
2443 		n = rb_entry(next, struct sp_node, nd);
2444 		next = rb_next(&n->nd);
2445 		sp_delete(p, n);
2446 	}
2447 	write_unlock(&p->lock);
2448 }
2449 
2450 #ifdef CONFIG_NUMA_BALANCING
2451 static int __initdata numabalancing_override;
2452 
2453 static void __init check_numabalancing_enable(void)
2454 {
2455 	bool numabalancing_default = false;
2456 
2457 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2458 		numabalancing_default = true;
2459 
2460 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2461 	if (numabalancing_override)
2462 		set_numabalancing_state(numabalancing_override == 1);
2463 
2464 	if (num_online_nodes() > 1 && !numabalancing_override) {
2465 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2466 			numabalancing_default ? "Enabling" : "Disabling");
2467 		set_numabalancing_state(numabalancing_default);
2468 	}
2469 }
2470 
2471 static int __init setup_numabalancing(char *str)
2472 {
2473 	int ret = 0;
2474 	if (!str)
2475 		goto out;
2476 
2477 	if (!strcmp(str, "enable")) {
2478 		numabalancing_override = 1;
2479 		ret = 1;
2480 	} else if (!strcmp(str, "disable")) {
2481 		numabalancing_override = -1;
2482 		ret = 1;
2483 	}
2484 out:
2485 	if (!ret)
2486 		pr_warn("Unable to parse numa_balancing=\n");
2487 
2488 	return ret;
2489 }
2490 __setup("numa_balancing=", setup_numabalancing);
2491 #else
2492 static inline void __init check_numabalancing_enable(void)
2493 {
2494 }
2495 #endif /* CONFIG_NUMA_BALANCING */
2496 
2497 /* assumes fs == KERNEL_DS */
2498 void __init numa_policy_init(void)
2499 {
2500 	nodemask_t interleave_nodes;
2501 	unsigned long largest = 0;
2502 	int nid, prefer = 0;
2503 
2504 	policy_cache = kmem_cache_create("numa_policy",
2505 					 sizeof(struct mempolicy),
2506 					 0, SLAB_PANIC, NULL);
2507 
2508 	sn_cache = kmem_cache_create("shared_policy_node",
2509 				     sizeof(struct sp_node),
2510 				     0, SLAB_PANIC, NULL);
2511 
2512 	for_each_node(nid) {
2513 		preferred_node_policy[nid] = (struct mempolicy) {
2514 			.refcnt = ATOMIC_INIT(1),
2515 			.mode = MPOL_PREFERRED,
2516 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2517 			.v = { .preferred_node = nid, },
2518 		};
2519 	}
2520 
2521 	/*
2522 	 * Set interleaving policy for system init. Interleaving is only
2523 	 * enabled across suitably sized nodes (default is >= 16MB), or
2524 	 * fall back to the largest node if they're all smaller.
2525 	 */
2526 	nodes_clear(interleave_nodes);
2527 	for_each_node_state(nid, N_MEMORY) {
2528 		unsigned long total_pages = node_present_pages(nid);
2529 
2530 		/* Preserve the largest node */
2531 		if (largest < total_pages) {
2532 			largest = total_pages;
2533 			prefer = nid;
2534 		}
2535 
2536 		/* Interleave this node? */
2537 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2538 			node_set(nid, interleave_nodes);
2539 	}
2540 
2541 	/* All too small, use the largest */
2542 	if (unlikely(nodes_empty(interleave_nodes)))
2543 		node_set(prefer, interleave_nodes);
2544 
2545 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2546 		pr_err("%s: interleaving failed\n", __func__);
2547 
2548 	check_numabalancing_enable();
2549 }
2550 
2551 /* Reset policy of current process to default */
2552 void numa_default_policy(void)
2553 {
2554 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2555 }
2556 
2557 /*
2558  * Parse and format mempolicy from/to strings
2559  */
2560 
2561 /*
2562  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2563  */
2564 static const char * const policy_modes[] =
2565 {
2566 	[MPOL_DEFAULT]    = "default",
2567 	[MPOL_PREFERRED]  = "prefer",
2568 	[MPOL_BIND]       = "bind",
2569 	[MPOL_INTERLEAVE] = "interleave",
2570 	[MPOL_LOCAL]      = "local",
2571 };
2572 
2573 
2574 #ifdef CONFIG_TMPFS
2575 /**
2576  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2577  * @str:  string containing mempolicy to parse
2578  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2579  *
2580  * Format of input:
2581  *	<mode>[=<flags>][:<nodelist>]
2582  *
2583  * On success, returns 0, else 1
2584  */
2585 int mpol_parse_str(char *str, struct mempolicy **mpol)
2586 {
2587 	struct mempolicy *new = NULL;
2588 	unsigned short mode;
2589 	unsigned short mode_flags;
2590 	nodemask_t nodes;
2591 	char *nodelist = strchr(str, ':');
2592 	char *flags = strchr(str, '=');
2593 	int err = 1;
2594 
2595 	if (nodelist) {
2596 		/* NUL-terminate mode or flags string */
2597 		*nodelist++ = '\0';
2598 		if (nodelist_parse(nodelist, nodes))
2599 			goto out;
2600 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2601 			goto out;
2602 	} else
2603 		nodes_clear(nodes);
2604 
2605 	if (flags)
2606 		*flags++ = '\0';	/* terminate mode string */
2607 
2608 	for (mode = 0; mode < MPOL_MAX; mode++) {
2609 		if (!strcmp(str, policy_modes[mode])) {
2610 			break;
2611 		}
2612 	}
2613 	if (mode >= MPOL_MAX)
2614 		goto out;
2615 
2616 	switch (mode) {
2617 	case MPOL_PREFERRED:
2618 		/*
2619 		 * Insist on a nodelist of one node only
2620 		 */
2621 		if (nodelist) {
2622 			char *rest = nodelist;
2623 			while (isdigit(*rest))
2624 				rest++;
2625 			if (*rest)
2626 				goto out;
2627 		}
2628 		break;
2629 	case MPOL_INTERLEAVE:
2630 		/*
2631 		 * Default to online nodes with memory if no nodelist
2632 		 */
2633 		if (!nodelist)
2634 			nodes = node_states[N_MEMORY];
2635 		break;
2636 	case MPOL_LOCAL:
2637 		/*
2638 		 * Don't allow a nodelist;  mpol_new() checks flags
2639 		 */
2640 		if (nodelist)
2641 			goto out;
2642 		mode = MPOL_PREFERRED;
2643 		break;
2644 	case MPOL_DEFAULT:
2645 		/*
2646 		 * Insist on a empty nodelist
2647 		 */
2648 		if (!nodelist)
2649 			err = 0;
2650 		goto out;
2651 	case MPOL_BIND:
2652 		/*
2653 		 * Insist on a nodelist
2654 		 */
2655 		if (!nodelist)
2656 			goto out;
2657 	}
2658 
2659 	mode_flags = 0;
2660 	if (flags) {
2661 		/*
2662 		 * Currently, we only support two mutually exclusive
2663 		 * mode flags.
2664 		 */
2665 		if (!strcmp(flags, "static"))
2666 			mode_flags |= MPOL_F_STATIC_NODES;
2667 		else if (!strcmp(flags, "relative"))
2668 			mode_flags |= MPOL_F_RELATIVE_NODES;
2669 		else
2670 			goto out;
2671 	}
2672 
2673 	new = mpol_new(mode, mode_flags, &nodes);
2674 	if (IS_ERR(new))
2675 		goto out;
2676 
2677 	/*
2678 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2679 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2680 	 */
2681 	if (mode != MPOL_PREFERRED)
2682 		new->v.nodes = nodes;
2683 	else if (nodelist)
2684 		new->v.preferred_node = first_node(nodes);
2685 	else
2686 		new->flags |= MPOL_F_LOCAL;
2687 
2688 	/*
2689 	 * Save nodes for contextualization: this will be used to "clone"
2690 	 * the mempolicy in a specific context [cpuset] at a later time.
2691 	 */
2692 	new->w.user_nodemask = nodes;
2693 
2694 	err = 0;
2695 
2696 out:
2697 	/* Restore string for error message */
2698 	if (nodelist)
2699 		*--nodelist = ':';
2700 	if (flags)
2701 		*--flags = '=';
2702 	if (!err)
2703 		*mpol = new;
2704 	return err;
2705 }
2706 #endif /* CONFIG_TMPFS */
2707 
2708 /**
2709  * mpol_to_str - format a mempolicy structure for printing
2710  * @buffer:  to contain formatted mempolicy string
2711  * @maxlen:  length of @buffer
2712  * @pol:  pointer to mempolicy to be formatted
2713  *
2714  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2715  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2716  * longest flag, "relative", and to display at least a few node ids.
2717  */
2718 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2719 {
2720 	char *p = buffer;
2721 	nodemask_t nodes = NODE_MASK_NONE;
2722 	unsigned short mode = MPOL_DEFAULT;
2723 	unsigned short flags = 0;
2724 
2725 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2726 		mode = pol->mode;
2727 		flags = pol->flags;
2728 	}
2729 
2730 	switch (mode) {
2731 	case MPOL_DEFAULT:
2732 		break;
2733 	case MPOL_PREFERRED:
2734 		if (flags & MPOL_F_LOCAL)
2735 			mode = MPOL_LOCAL;
2736 		else
2737 			node_set(pol->v.preferred_node, nodes);
2738 		break;
2739 	case MPOL_BIND:
2740 	case MPOL_INTERLEAVE:
2741 		nodes = pol->v.nodes;
2742 		break;
2743 	default:
2744 		WARN_ON_ONCE(1);
2745 		snprintf(p, maxlen, "unknown");
2746 		return;
2747 	}
2748 
2749 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2750 
2751 	if (flags & MPOL_MODE_FLAGS) {
2752 		p += snprintf(p, buffer + maxlen - p, "=");
2753 
2754 		/*
2755 		 * Currently, the only defined flags are mutually exclusive
2756 		 */
2757 		if (flags & MPOL_F_STATIC_NODES)
2758 			p += snprintf(p, buffer + maxlen - p, "static");
2759 		else if (flags & MPOL_F_RELATIVE_NODES)
2760 			p += snprintf(p, buffer + maxlen - p, "relative");
2761 	}
2762 
2763 	if (!nodes_empty(nodes))
2764 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2765 			       nodemask_pr_args(&nodes));
2766 }
2767