xref: /openbmc/linux/mm/mempolicy.c (revision 12a5b00a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same applies to GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
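/*
 * Illustrative userspace sketch (editorial addition, not part of this file):
 * one way a task could ask for the policies described above through the
 * set_mempolicy(2)/mbind(2) wrappers in <numaif.h> from libnuma.  The node
 * numbers, the mapping and the omitted error handling are assumptions made
 * purely for illustration.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *	size_t len = 1UL << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	Interleave this task's future allocations over nodes 0 and 1; the
 *	"+ 1" accounts for the --maxnode done in get_nodes() below:
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, sizeof(nodes01) * 8 + 1);
 *
 *	Bind an existing range to node 0 only and move misplaced pages:
 *	mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8 + 1,
 *	      MPOL_MF_MOVE);
 */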
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. A specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
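/*
 * Worked example (illustrative): a user-supplied relative mask of {0, 2}
 * mapped onto an allowed mask of {4, 5, 6} (weight 3) folds to {0, 2} and
 * is then laid onto the allowed set, so bit 0 becomes node 4 and bit 2
 * becomes node 6, giving *ret == {4, 6}.  This is the MPOL_F_RELATIVE_NODES
 * remapping described in the NUMA memory policy documentation.
 */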
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some checks and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	unsigned long start;
414 	unsigned long end;
415 	struct vm_area_struct *first;
416 };
417 
418 /*
419  * Check if the page's nid is in qp->nmask.
420  *
421  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
422  * in the complement of qp->nmask.
423  */
424 static inline bool queue_pages_required(struct page *page,
425 					struct queue_pages *qp)
426 {
427 	int nid = page_to_nid(page);
428 	unsigned long flags = qp->flags;
429 
430 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431 }
432 
433 /*
434  * queue_pages_pmd() has four possible return values:
435  * 0 - pages are placed on the right node or queued successfully.
436  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
437  *     specified.
438  * 2 - THP was split.
439  * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was
440  *        specified and an existing page was already on a node that does
441  *        not follow the policy.
442  */
443 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 				unsigned long end, struct mm_walk *walk)
445 	__releases(ptl)
446 {
447 	int ret = 0;
448 	struct page *page;
449 	struct queue_pages *qp = walk->private;
450 	unsigned long flags;
451 
452 	if (unlikely(is_pmd_migration_entry(*pmd))) {
453 		ret = -EIO;
454 		goto unlock;
455 	}
456 	page = pmd_page(*pmd);
457 	if (is_huge_zero_page(page)) {
458 		spin_unlock(ptl);
459 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
460 		ret = 2;
461 		goto out;
462 	}
463 	if (!queue_pages_required(page, qp))
464 		goto unlock;
465 
466 	flags = qp->flags;
467 	/* go to thp migration */
468 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
469 		if (!vma_migratable(walk->vma) ||
470 		    migrate_page_add(page, qp->pagelist, flags)) {
471 			ret = 1;
472 			goto unlock;
473 		}
474 	} else
475 		ret = -EIO;
476 unlock:
477 	spin_unlock(ptl);
478 out:
479 	return ret;
480 }
481 
482 /*
483  * Scan through pages checking if pages follow certain conditions,
484  * and move them to the pagelist if they do.
485  *
486  * queue_pages_pte_range() has three possible return values:
487  * 0 - pages are placed on the right node or queued successfully.
488  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
489  *     specified.
490  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
491  *        on a node that does not follow the policy.
492  */
493 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
494 			unsigned long end, struct mm_walk *walk)
495 {
496 	struct vm_area_struct *vma = walk->vma;
497 	struct page *page;
498 	struct queue_pages *qp = walk->private;
499 	unsigned long flags = qp->flags;
500 	int ret;
501 	bool has_unmovable = false;
502 	pte_t *pte;
503 	spinlock_t *ptl;
504 
505 	ptl = pmd_trans_huge_lock(pmd, vma);
506 	if (ptl) {
507 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
508 		if (ret != 2)
509 			return ret;
510 	}
511 	/* THP was split, fall through to pte walk */
512 
513 	if (pmd_trans_unstable(pmd))
514 		return 0;
515 
516 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
517 	for (; addr != end; pte++, addr += PAGE_SIZE) {
518 		if (!pte_present(*pte))
519 			continue;
520 		page = vm_normal_page(vma, addr, *pte);
521 		if (!page)
522 			continue;
523 		/*
524 		 * vm_normal_page() filters out zero pages, but there might
525 		 * still be PageReserved pages to skip, perhaps in a VDSO.
526 		 */
527 		if (PageReserved(page))
528 			continue;
529 		if (!queue_pages_required(page, qp))
530 			continue;
531 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
532 			/* MPOL_MF_STRICT must be specified if we get here */
533 			if (!vma_migratable(vma)) {
534 				has_unmovable = true;
535 				break;
536 			}
537 
538 			/*
539 			 * Do not abort immediately since there may be
540 			 * pages temporarily off the LRU in the range.  We
541 			 * still need to migrate the other LRU pages.
542 			 */
543 			if (migrate_page_add(page, qp->pagelist, flags))
544 				has_unmovable = true;
545 		} else
546 			break;
547 	}
548 	pte_unmap_unlock(pte - 1, ptl);
549 	cond_resched();
550 
551 	if (has_unmovable)
552 		return 1;
553 
554 	return addr != end ? -EIO : 0;
555 }
556 
557 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
558 			       unsigned long addr, unsigned long end,
559 			       struct mm_walk *walk)
560 {
561 	int ret = 0;
562 #ifdef CONFIG_HUGETLB_PAGE
563 	struct queue_pages *qp = walk->private;
564 	unsigned long flags = (qp->flags & MPOL_MF_VALID);
565 	struct page *page;
566 	spinlock_t *ptl;
567 	pte_t entry;
568 
569 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
570 	entry = huge_ptep_get(pte);
571 	if (!pte_present(entry))
572 		goto unlock;
573 	page = pte_page(entry);
574 	if (!queue_pages_required(page, qp))
575 		goto unlock;
576 
577 	if (flags == MPOL_MF_STRICT) {
578 		/*
579 		 * STRICT alone means only detecting misplaced pages and no
580 		 * need to further check other VMAs.
581 		 */
582 		ret = -EIO;
583 		goto unlock;
584 	}
585 
586 	if (!vma_migratable(walk->vma)) {
587 		/*
588 		 * Must be STRICT with MOVE*, otherwise .test_walk() would
589 		 * have stopped walking the current vma.
590 		 * Detect the misplaced page but allow migrating pages which
591 		 * have been queued.
592 		 */
593 		ret = 1;
594 		goto unlock;
595 	}
596 
597 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
598 	if (flags & (MPOL_MF_MOVE_ALL) ||
599 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
600 		if (!isolate_huge_page(page, qp->pagelist) &&
601 			(flags & MPOL_MF_STRICT))
602 			/*
603 			 * Failed to isolate the page, but allow migrating pages
604 			 * which have been queued.
605 			 */
606 			ret = 1;
607 	}
608 unlock:
609 	spin_unlock(ptl);
610 #else
611 	BUG();
612 #endif
613 	return ret;
614 }
615 
616 #ifdef CONFIG_NUMA_BALANCING
617 /*
618  * This is used to mark a range of virtual addresses to be inaccessible.
619  * These are later cleared by a NUMA hinting fault. Depending on these
620  * faults, pages may be migrated for better NUMA placement.
621  *
622  * This is assuming that NUMA faults are handled using PROT_NONE. If
623  * an architecture makes a different choice, it will need further
624  * changes to the core.
625  */
626 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 			unsigned long addr, unsigned long end)
628 {
629 	int nr_updated;
630 
631 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
632 	if (nr_updated)
633 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 
635 	return nr_updated;
636 }
637 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 			unsigned long addr, unsigned long end)
640 {
641 	return 0;
642 }
643 #endif /* CONFIG_NUMA_BALANCING */
644 
645 static int queue_pages_test_walk(unsigned long start, unsigned long end,
646 				struct mm_walk *walk)
647 {
648 	struct vm_area_struct *vma = walk->vma;
649 	struct queue_pages *qp = walk->private;
650 	unsigned long endvma = vma->vm_end;
651 	unsigned long flags = qp->flags;
652 
653 	/* range check first */
654 	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
655 
656 	if (!qp->first) {
657 		qp->first = vma;
658 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
659 			(qp->start < vma->vm_start))
660 			/* hole at head side of range */
661 			return -EFAULT;
662 	}
663 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
664 		((vma->vm_end < qp->end) &&
665 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
666 		/* hole at middle or tail of range */
667 		return -EFAULT;
668 
669 	/*
670 	 * Need to check MPOL_MF_STRICT to return -EIO if possible,
671 	 * regardless of vma_migratable.
672 	 */
673 	if (!vma_migratable(vma) &&
674 	    !(flags & MPOL_MF_STRICT))
675 		return 1;
676 
677 	if (endvma > end)
678 		endvma = end;
679 
680 	if (flags & MPOL_MF_LAZY) {
681 		/* Similar to task_numa_work, skip inaccessible VMAs */
682 		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
683 			!(vma->vm_flags & VM_MIXEDMAP))
684 			change_prot_numa(vma, start, endvma);
685 		return 1;
686 	}
687 
688 	/* queue pages from current vma */
689 	if (flags & MPOL_MF_VALID)
690 		return 0;
691 	return 1;
692 }
693 
694 static const struct mm_walk_ops queue_pages_walk_ops = {
695 	.hugetlb_entry		= queue_pages_hugetlb,
696 	.pmd_entry		= queue_pages_pte_range,
697 	.test_walk		= queue_pages_test_walk,
698 };
699 
700 /*
701  * Walk through page tables and collect pages to be migrated.
702  *
703  * If pages found in a given range are on a set of nodes (determined by
704  * @nodes and @flags), they are isolated and queued to the pagelist, which
705  * is passed via @private.
706  *
707  * queue_pages_range() has three possible return values:
708  * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
709  *     specified.
710  * 0 - queue pages successfully or no misplaced page.
711  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
712  *         the memory range specified by nodemask and maxnode points
713  *         outside your accessible address space (-EFAULT)
714  */
715 static int
716 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
717 		nodemask_t *nodes, unsigned long flags,
718 		struct list_head *pagelist)
719 {
720 	int err;
721 	struct queue_pages qp = {
722 		.pagelist = pagelist,
723 		.flags = flags,
724 		.nmask = nodes,
725 		.start = start,
726 		.end = end,
727 		.first = NULL,
728 	};
729 
730 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
731 
732 	if (!qp.first)
733 		/* whole range in hole */
734 		err = -EFAULT;
735 
736 	return err;
737 }
738 
739 /*
740  * Apply policy to a single VMA
741  * This must be called with the mmap_sem held for writing.
742  */
743 static int vma_replace_policy(struct vm_area_struct *vma,
744 						struct mempolicy *pol)
745 {
746 	int err;
747 	struct mempolicy *old;
748 	struct mempolicy *new;
749 
750 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
751 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
752 		 vma->vm_ops, vma->vm_file,
753 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
754 
755 	new = mpol_dup(pol);
756 	if (IS_ERR(new))
757 		return PTR_ERR(new);
758 
759 	if (vma->vm_ops && vma->vm_ops->set_policy) {
760 		err = vma->vm_ops->set_policy(vma, new);
761 		if (err)
762 			goto err_out;
763 	}
764 
765 	old = vma->vm_policy;
766 	vma->vm_policy = new; /* protected by mmap_sem */
767 	mpol_put(old);
768 
769 	return 0;
770  err_out:
771 	mpol_put(new);
772 	return err;
773 }
774 
775 /* Step 2: apply policy to a range and do splits. */
776 static int mbind_range(struct mm_struct *mm, unsigned long start,
777 		       unsigned long end, struct mempolicy *new_pol)
778 {
779 	struct vm_area_struct *next;
780 	struct vm_area_struct *prev;
781 	struct vm_area_struct *vma;
782 	int err = 0;
783 	pgoff_t pgoff;
784 	unsigned long vmstart;
785 	unsigned long vmend;
786 
787 	vma = find_vma(mm, start);
788 	VM_BUG_ON(!vma);
789 
790 	prev = vma->vm_prev;
791 	if (start > vma->vm_start)
792 		prev = vma;
793 
794 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
795 		next = vma->vm_next;
796 		vmstart = max(start, vma->vm_start);
797 		vmend   = min(end, vma->vm_end);
798 
799 		if (mpol_equal(vma_policy(vma), new_pol))
800 			continue;
801 
802 		pgoff = vma->vm_pgoff +
803 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
804 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
805 				 vma->anon_vma, vma->vm_file, pgoff,
806 				 new_pol, vma->vm_userfaultfd_ctx);
807 		if (prev) {
808 			vma = prev;
809 			next = vma->vm_next;
810 			if (mpol_equal(vma_policy(vma), new_pol))
811 				continue;
812 			/* vma_merge() joined vma && vma->next, case 8 */
813 			goto replace;
814 		}
815 		if (vma->vm_start != vmstart) {
816 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
817 			if (err)
818 				goto out;
819 		}
820 		if (vma->vm_end != vmend) {
821 			err = split_vma(vma->vm_mm, vma, vmend, 0);
822 			if (err)
823 				goto out;
824 		}
825  replace:
826 		err = vma_replace_policy(vma, new_pol);
827 		if (err)
828 			goto out;
829 	}
830 
831  out:
832 	return err;
833 }
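/*
 * Illustrative example (addresses are hypothetical): mbind()ing
 * [0x3000, 0x5000) with a new policy inside a single VMA that covers
 * [0x1000, 0x9000) leaves, after the split_vma() calls above, three VMAs:
 * [0x1000, 0x3000), [0x3000, 0x5000) carrying the new policy, and
 * [0x5000, 0x9000), unless vma_merge() can fold a piece into a compatible
 * neighbour first.
 */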
834 
835 /* Set the process memory policy */
836 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
837 			     nodemask_t *nodes)
838 {
839 	struct mempolicy *new, *old;
840 	NODEMASK_SCRATCH(scratch);
841 	int ret;
842 
843 	if (!scratch)
844 		return -ENOMEM;
845 
846 	new = mpol_new(mode, flags, nodes);
847 	if (IS_ERR(new)) {
848 		ret = PTR_ERR(new);
849 		goto out;
850 	}
851 
852 	task_lock(current);
853 	ret = mpol_set_nodemask(new, nodes, scratch);
854 	if (ret) {
855 		task_unlock(current);
856 		mpol_put(new);
857 		goto out;
858 	}
859 	old = current->mempolicy;
860 	current->mempolicy = new;
861 	if (new && new->mode == MPOL_INTERLEAVE)
862 		current->il_prev = MAX_NUMNODES-1;
863 	task_unlock(current);
864 	mpol_put(old);
865 	ret = 0;
866 out:
867 	NODEMASK_SCRATCH_FREE(scratch);
868 	return ret;
869 }
870 
871 /*
872  * Return nodemask for policy for get_mempolicy() query
873  *
874  * Called with task's alloc_lock held
875  */
876 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
877 {
878 	nodes_clear(*nodes);
879 	if (p == &default_policy)
880 		return;
881 
882 	switch (p->mode) {
883 	case MPOL_BIND:
884 	case MPOL_INTERLEAVE:
885 		*nodes = p->v.nodes;
886 		break;
887 	case MPOL_PREFERRED:
888 		if (!(p->flags & MPOL_F_LOCAL))
889 			node_set(p->v.preferred_node, *nodes);
890 		/* else return empty node mask for local allocation */
891 		break;
892 	default:
893 		BUG();
894 	}
895 }
896 
897 static int lookup_node(struct mm_struct *mm, unsigned long addr)
898 {
899 	struct page *p;
900 	int err;
901 
902 	int locked = 1;
903 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
904 	if (err >= 0) {
905 		err = page_to_nid(p);
906 		put_page(p);
907 	}
908 	if (locked)
909 		up_read(&mm->mmap_sem);
910 	return err;
911 }
912 
913 /* Retrieve NUMA policy */
914 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
915 			     unsigned long addr, unsigned long flags)
916 {
917 	int err;
918 	struct mm_struct *mm = current->mm;
919 	struct vm_area_struct *vma = NULL;
920 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
921 
922 	if (flags &
923 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
924 		return -EINVAL;
925 
926 	if (flags & MPOL_F_MEMS_ALLOWED) {
927 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
928 			return -EINVAL;
929 		*policy = 0;	/* just so it's initialized */
930 		task_lock(current);
931 		*nmask  = cpuset_current_mems_allowed;
932 		task_unlock(current);
933 		return 0;
934 	}
935 
936 	if (flags & MPOL_F_ADDR) {
937 		/*
938 		 * Do NOT fall back to task policy if the
939 		 * vma/shared policy at addr is NULL.  We
940 		 * want to return MPOL_DEFAULT in this case.
941 		 */
942 		down_read(&mm->mmap_sem);
943 		vma = find_vma_intersection(mm, addr, addr+1);
944 		if (!vma) {
945 			up_read(&mm->mmap_sem);
946 			return -EFAULT;
947 		}
948 		if (vma->vm_ops && vma->vm_ops->get_policy)
949 			pol = vma->vm_ops->get_policy(vma, addr);
950 		else
951 			pol = vma->vm_policy;
952 	} else if (addr)
953 		return -EINVAL;
954 
955 	if (!pol)
956 		pol = &default_policy;	/* indicates default behavior */
957 
958 	if (flags & MPOL_F_NODE) {
959 		if (flags & MPOL_F_ADDR) {
960 			/*
961 			 * Take a refcount on the mpol, lookup_node()
962 			 * will drop the mmap_sem, so after calling
963 			 * lookup_node() only "pol" remains valid, "vma"
964 			 * is stale.
965 			 */
966 			pol_refcount = pol;
967 			vma = NULL;
968 			mpol_get(pol);
969 			err = lookup_node(mm, addr);
970 			if (err < 0)
971 				goto out;
972 			*policy = err;
973 		} else if (pol == current->mempolicy &&
974 				pol->mode == MPOL_INTERLEAVE) {
975 			*policy = next_node_in(current->il_prev, pol->v.nodes);
976 		} else {
977 			err = -EINVAL;
978 			goto out;
979 		}
980 	} else {
981 		*policy = pol == &default_policy ? MPOL_DEFAULT :
982 						pol->mode;
983 		/*
984 		 * Internal mempolicy flags must be masked off before exposing
985 		 * the policy to userspace.
986 		 */
987 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
988 	}
989 
990 	err = 0;
991 	if (nmask) {
992 		if (mpol_store_user_nodemask(pol)) {
993 			*nmask = pol->w.user_nodemask;
994 		} else {
995 			task_lock(current);
996 			get_policy_nodemask(pol, nmask);
997 			task_unlock(current);
998 		}
999 	}
1000 
1001  out:
1002 	mpol_cond_put(pol);
1003 	if (vma)
1004 		up_read(&mm->mmap_sem);
1005 	if (pol_refcount)
1006 		mpol_put(pol_refcount);
1007 	return err;
1008 }
1009 
1010 #ifdef CONFIG_MIGRATION
1011 /*
1012  * page migration; THP tail pages can be passed.
1013  */
1014 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1015 				unsigned long flags)
1016 {
1017 	struct page *head = compound_head(page);
1018 	/*
1019 	 * Avoid migrating a page that is shared with others.
1020 	 */
1021 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1022 		if (!isolate_lru_page(head)) {
1023 			list_add_tail(&head->lru, pagelist);
1024 			mod_node_page_state(page_pgdat(head),
1025 				NR_ISOLATED_ANON + page_is_file_lru(head),
1026 				hpage_nr_pages(head));
1027 		} else if (flags & MPOL_MF_STRICT) {
1028 			/*
1029 			 * A non-movable page may reach here.  And there may be
1030 			 * pages temporarily off the LRU or non-LRU movable pages.
1031 			 * Treat them as unmovable pages since they can't be
1032 			 * isolated, so they can't be moved at the moment.  It
1033 			 * should return -EIO for this case too.
1034 			 */
1035 			return -EIO;
1036 		}
1037 	}
1038 
1039 	return 0;
1040 }
1041 
1042 /* page allocation callback for NUMA node migration */
1043 struct page *alloc_new_node_page(struct page *page, unsigned long node)
1044 {
1045 	if (PageHuge(page))
1046 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1047 					node);
1048 	else if (PageTransHuge(page)) {
1049 		struct page *thp;
1050 
1051 		thp = alloc_pages_node(node,
1052 			(GFP_TRANSHUGE | __GFP_THISNODE),
1053 			HPAGE_PMD_ORDER);
1054 		if (!thp)
1055 			return NULL;
1056 		prep_transhuge_page(thp);
1057 		return thp;
1058 	} else
1059 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1060 						    __GFP_THISNODE, 0);
1061 }
1062 
1063 /*
1064  * Migrate pages from one node to a target node.
1065  * Returns error or the number of pages not migrated.
1066  */
1067 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1068 			   int flags)
1069 {
1070 	nodemask_t nmask;
1071 	LIST_HEAD(pagelist);
1072 	int err = 0;
1073 
1074 	nodes_clear(nmask);
1075 	node_set(source, nmask);
1076 
1077 	/*
1078 	 * This does not "check" the range but isolates all pages that
1079 	 * need migration.  Between passing in the full user address
1080 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1081 	 */
1082 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1083 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1084 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1085 
1086 	if (!list_empty(&pagelist)) {
1087 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1088 					MIGRATE_SYNC, MR_SYSCALL);
1089 		if (err)
1090 			putback_movable_pages(&pagelist);
1091 	}
1092 
1093 	return err;
1094 }
1095 
1096 /*
1097  * Move pages between the two nodesets so as to preserve the physical
1098  * layout as much as possible.
1099  *
1100  * Returns the number of pages that could not be moved.
1101  */
1102 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1103 		     const nodemask_t *to, int flags)
1104 {
1105 	int busy = 0;
1106 	int err;
1107 	nodemask_t tmp;
1108 
1109 	err = migrate_prep();
1110 	if (err)
1111 		return err;
1112 
1113 	down_read(&mm->mmap_sem);
1114 
1115 	/*
1116 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1117 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1118 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1119 	 * The pair of nodemasks 'to' and 'from' define the map.
1120 	 *
1121 	 * If no pair of bits is found that way, fallback to picking some
1122 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1123 	 * 'source' and 'dest' bits are the same, this represents a node
1124 	 * that will be migrating to itself, so no pages need move.
1125 	 *
1126 	 * If no bits are left in 'tmp', or if all remaining bits left
1127 	 * in 'tmp' correspond to the same bit in 'to', return false
1128 	 * (nothing left to migrate).
1129 	 *
1130 	 * This lets us pick a pair of nodes to migrate between, such that
1131 	 * if possible the dest node is not already occupied by some other
1132 	 * source node, minimizing the risk of overloading the memory on a
1133 	 * node that would happen if we migrated incoming memory to a node
1134  * before migrating outgoing memory sourced from that same node.
1135 	 *
1136 	 * A single scan of tmp is sufficient.  As we go, we remember the
1137 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1138 	 * that not only moved, but what's better, moved to an empty slot
1139 	 * (d is not set in tmp), then we break out then, with that pair.
1140  * Otherwise when we finish scanning tmp, we at least have the
1141 	 * most recent <s, d> pair that moved.  If we get all the way through
1142 	 * the scan of tmp without finding any node that moved, much less
1143 	 * moved to an empty node, then there is nothing left worth migrating.
1144 	 */
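/*
 * Worked example (illustrative) for the "[0-7] -> [3,4,5]" case above:
 * sources 3, 4 and 5 are skipped because they already sit in the
 * destination mask, and node_remap() pairs the remaining sources by
 * position modulo the destination weight, giving the migrations
 * 0->3, 1->4, 2->5, 6->3 and 7->4.
 */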
1145 
1146 	tmp = *from;
1147 	while (!nodes_empty(tmp)) {
1148 		int s, d;
1149 		int source = NUMA_NO_NODE;
1150 		int dest = 0;
1151 
1152 		for_each_node_mask(s, tmp) {
1153 
1154 			/*
1155 			 * do_migrate_pages() tries to maintain the relative
1156 			 * node relationship of the pages established between
1157 			 * threads and memory areas.
1158 			 *
1159 			 * However, if the number of source nodes is not equal to
1160 			 * the number of destination nodes we cannot preserve
1161 			 * this node relative relationship.  In that case, skip
1162 			 * copying memory from a node that is in the destination
1163 			 * mask.
1164 			 *
1165 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1166 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1167 			 */
1168 
1169 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1170 						(node_isset(s, *to)))
1171 				continue;
1172 
1173 			d = node_remap(s, *from, *to);
1174 			if (s == d)
1175 				continue;
1176 
1177 			source = s;	/* Node moved. Memorize */
1178 			dest = d;
1179 
1180 			/* dest not in remaining from nodes? */
1181 			if (!node_isset(dest, tmp))
1182 				break;
1183 		}
1184 		if (source == NUMA_NO_NODE)
1185 			break;
1186 
1187 		node_clear(source, tmp);
1188 		err = migrate_to_node(mm, source, dest, flags);
1189 		if (err > 0)
1190 			busy += err;
1191 		if (err < 0)
1192 			break;
1193 	}
1194 	up_read(&mm->mmap_sem);
1195 	if (err < 0)
1196 		return err;
1197 	return busy;
1198 
1199 }
1200 
1201 /*
1202  * Allocate a new page for page migration based on vma policy.
1203  * Start by assuming the page is mapped by the same vma as contains @start.
1204  * Search forward from there, if not.  N.B., this assumes that the
1205  * list of pages handed to migrate_pages()--which is how we get here--
1206  * is in virtual address order.
1207  */
1208 static struct page *new_page(struct page *page, unsigned long start)
1209 {
1210 	struct vm_area_struct *vma;
1211 	unsigned long uninitialized_var(address);
1212 
1213 	vma = find_vma(current->mm, start);
1214 	while (vma) {
1215 		address = page_address_in_vma(page, vma);
1216 		if (address != -EFAULT)
1217 			break;
1218 		vma = vma->vm_next;
1219 	}
1220 
1221 	if (PageHuge(page)) {
1222 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1223 				vma, address);
1224 	} else if (PageTransHuge(page)) {
1225 		struct page *thp;
1226 
1227 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1228 					 HPAGE_PMD_ORDER);
1229 		if (!thp)
1230 			return NULL;
1231 		prep_transhuge_page(thp);
1232 		return thp;
1233 	}
1234 	/*
1235 	 * if !vma, alloc_page_vma() will use task or system default policy
1236 	 */
1237 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1238 			vma, address);
1239 }
1240 #else
1241 
1242 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1243 				unsigned long flags)
1244 {
1245 	return -EIO;
1246 }
1247 
1248 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1249 		     const nodemask_t *to, int flags)
1250 {
1251 	return -ENOSYS;
1252 }
1253 
1254 static struct page *new_page(struct page *page, unsigned long start)
1255 {
1256 	return NULL;
1257 }
1258 #endif
1259 
1260 static long do_mbind(unsigned long start, unsigned long len,
1261 		     unsigned short mode, unsigned short mode_flags,
1262 		     nodemask_t *nmask, unsigned long flags)
1263 {
1264 	struct mm_struct *mm = current->mm;
1265 	struct mempolicy *new;
1266 	unsigned long end;
1267 	int err;
1268 	int ret;
1269 	LIST_HEAD(pagelist);
1270 
1271 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1272 		return -EINVAL;
1273 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1274 		return -EPERM;
1275 
1276 	if (start & ~PAGE_MASK)
1277 		return -EINVAL;
1278 
1279 	if (mode == MPOL_DEFAULT)
1280 		flags &= ~MPOL_MF_STRICT;
1281 
1282 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1283 	end = start + len;
1284 
1285 	if (end < start)
1286 		return -EINVAL;
1287 	if (end == start)
1288 		return 0;
1289 
1290 	new = mpol_new(mode, mode_flags, nmask);
1291 	if (IS_ERR(new))
1292 		return PTR_ERR(new);
1293 
1294 	if (flags & MPOL_MF_LAZY)
1295 		new->flags |= MPOL_F_MOF;
1296 
1297 	/*
1298 	 * If we are using the default policy then operations
1299 	 * on discontinuous address spaces are okay after all.
1300 	 */
1301 	if (!new)
1302 		flags |= MPOL_MF_DISCONTIG_OK;
1303 
1304 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1305 		 start, start + len, mode, mode_flags,
1306 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1307 
1308 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1309 
1310 		err = migrate_prep();
1311 		if (err)
1312 			goto mpol_out;
1313 	}
1314 	{
1315 		NODEMASK_SCRATCH(scratch);
1316 		if (scratch) {
1317 			down_write(&mm->mmap_sem);
1318 			task_lock(current);
1319 			err = mpol_set_nodemask(new, nmask, scratch);
1320 			task_unlock(current);
1321 			if (err)
1322 				up_write(&mm->mmap_sem);
1323 		} else
1324 			err = -ENOMEM;
1325 		NODEMASK_SCRATCH_FREE(scratch);
1326 	}
1327 	if (err)
1328 		goto mpol_out;
1329 
1330 	ret = queue_pages_range(mm, start, end, nmask,
1331 			  flags | MPOL_MF_INVERT, &pagelist);
1332 
1333 	if (ret < 0) {
1334 		err = ret;
1335 		goto up_out;
1336 	}
1337 
1338 	err = mbind_range(mm, start, end, new);
1339 
1340 	if (!err) {
1341 		int nr_failed = 0;
1342 
1343 		if (!list_empty(&pagelist)) {
1344 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1345 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1346 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1347 			if (nr_failed)
1348 				putback_movable_pages(&pagelist);
1349 		}
1350 
1351 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1352 			err = -EIO;
1353 	} else {
1354 up_out:
1355 		if (!list_empty(&pagelist))
1356 			putback_movable_pages(&pagelist);
1357 	}
1358 
1359 	up_write(&mm->mmap_sem);
1360 mpol_out:
1361 	mpol_put(new);
1362 	return err;
1363 }
1364 
1365 /*
1366  * User space interface with variable-sized bitmaps for nodelists.
1367  */
1368 
1369 /* Copy a node mask from user space. */
1370 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1371 		     unsigned long maxnode)
1372 {
1373 	unsigned long k;
1374 	unsigned long t;
1375 	unsigned long nlongs;
1376 	unsigned long endmask;
1377 
1378 	--maxnode;
1379 	nodes_clear(*nodes);
1380 	if (maxnode == 0 || !nmask)
1381 		return 0;
1382 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1383 		return -EINVAL;
1384 
1385 	nlongs = BITS_TO_LONGS(maxnode);
1386 	if ((maxnode % BITS_PER_LONG) == 0)
1387 		endmask = ~0UL;
1388 	else
1389 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1390 
1391 	/*
1392 	 * When the user specified more nodes than supported, just check
1393 	 * if the non-supported part is all zero.
1394 	 *
1395 	 * If maxnode covers more longs than MAX_NUMNODES, check
1396 	 * the bits in that area first, and then go through to check
1397 	 * the remaining bits which are equal to or bigger than MAX_NUMNODES.
1398 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1399 	 */
1400 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1401 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1402 			if (get_user(t, nmask + k))
1403 				return -EFAULT;
1404 			if (k == nlongs - 1) {
1405 				if (t & endmask)
1406 					return -EINVAL;
1407 			} else if (t)
1408 				return -EINVAL;
1409 		}
1410 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1411 		endmask = ~0UL;
1412 	}
1413 
1414 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1415 		unsigned long valid_mask = endmask;
1416 
1417 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1418 		if (get_user(t, nmask + nlongs - 1))
1419 			return -EFAULT;
1420 		if (t & valid_mask)
1421 			return -EINVAL;
1422 	}
1423 
1424 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1425 		return -EFAULT;
1426 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1427 	return 0;
1428 }
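/*
 * Illustrative example (MAX_NUMNODES == 64 is an assumed configuration):
 * a caller may pass maxnode == 1024 as long as every long beyond the
 * first is zero; a set bit up there yields -EINVAL, and only the low
 * BITS_TO_LONGS(MAX_NUMNODES) longs are actually copied into *nodes.
 */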
1429 
1430 /* Copy a kernel node mask to user space */
1431 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1432 			      nodemask_t *nodes)
1433 {
1434 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1435 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1436 
1437 	if (copy > nbytes) {
1438 		if (copy > PAGE_SIZE)
1439 			return -EINVAL;
1440 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1441 			return -EFAULT;
1442 		copy = nbytes;
1443 	}
1444 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1445 }
1446 
1447 static long kernel_mbind(unsigned long start, unsigned long len,
1448 			 unsigned long mode, const unsigned long __user *nmask,
1449 			 unsigned long maxnode, unsigned int flags)
1450 {
1451 	nodemask_t nodes;
1452 	int err;
1453 	unsigned short mode_flags;
1454 
1455 	start = untagged_addr(start);
1456 	mode_flags = mode & MPOL_MODE_FLAGS;
1457 	mode &= ~MPOL_MODE_FLAGS;
1458 	if (mode >= MPOL_MAX)
1459 		return -EINVAL;
1460 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1461 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1462 		return -EINVAL;
1463 	err = get_nodes(&nodes, nmask, maxnode);
1464 	if (err)
1465 		return err;
1466 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1467 }
1468 
1469 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1470 		unsigned long, mode, const unsigned long __user *, nmask,
1471 		unsigned long, maxnode, unsigned int, flags)
1472 {
1473 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1474 }
1475 
1476 /* Set the process memory policy */
1477 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1478 				 unsigned long maxnode)
1479 {
1480 	int err;
1481 	nodemask_t nodes;
1482 	unsigned short flags;
1483 
1484 	flags = mode & MPOL_MODE_FLAGS;
1485 	mode &= ~MPOL_MODE_FLAGS;
1486 	if ((unsigned int)mode >= MPOL_MAX)
1487 		return -EINVAL;
1488 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1489 		return -EINVAL;
1490 	err = get_nodes(&nodes, nmask, maxnode);
1491 	if (err)
1492 		return err;
1493 	return do_set_mempolicy(mode, flags, &nodes);
1494 }
1495 
1496 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1497 		unsigned long, maxnode)
1498 {
1499 	return kernel_set_mempolicy(mode, nmask, maxnode);
1500 }
1501 
1502 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1503 				const unsigned long __user *old_nodes,
1504 				const unsigned long __user *new_nodes)
1505 {
1506 	struct mm_struct *mm = NULL;
1507 	struct task_struct *task;
1508 	nodemask_t task_nodes;
1509 	int err;
1510 	nodemask_t *old;
1511 	nodemask_t *new;
1512 	NODEMASK_SCRATCH(scratch);
1513 
1514 	if (!scratch)
1515 		return -ENOMEM;
1516 
1517 	old = &scratch->mask1;
1518 	new = &scratch->mask2;
1519 
1520 	err = get_nodes(old, old_nodes, maxnode);
1521 	if (err)
1522 		goto out;
1523 
1524 	err = get_nodes(new, new_nodes, maxnode);
1525 	if (err)
1526 		goto out;
1527 
1528 	/* Find the mm_struct */
1529 	rcu_read_lock();
1530 	task = pid ? find_task_by_vpid(pid) : current;
1531 	if (!task) {
1532 		rcu_read_unlock();
1533 		err = -ESRCH;
1534 		goto out;
1535 	}
1536 	get_task_struct(task);
1537 
1538 	err = -EINVAL;
1539 
1540 	/*
1541 	 * Check if this process has the right to modify the specified process.
1542 	 * Use the regular "ptrace_may_access()" checks.
1543 	 */
1544 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1545 		rcu_read_unlock();
1546 		err = -EPERM;
1547 		goto out_put;
1548 	}
1549 	rcu_read_unlock();
1550 
1551 	task_nodes = cpuset_mems_allowed(task);
1552 	/* Is the user allowed to access the target nodes? */
1553 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1554 		err = -EPERM;
1555 		goto out_put;
1556 	}
1557 
1558 	task_nodes = cpuset_mems_allowed(current);
1559 	nodes_and(*new, *new, task_nodes);
1560 	if (nodes_empty(*new))
1561 		goto out_put;
1562 
1563 	err = security_task_movememory(task);
1564 	if (err)
1565 		goto out_put;
1566 
1567 	mm = get_task_mm(task);
1568 	put_task_struct(task);
1569 
1570 	if (!mm) {
1571 		err = -EINVAL;
1572 		goto out;
1573 	}
1574 
1575 	err = do_migrate_pages(mm, old, new,
1576 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1577 
1578 	mmput(mm);
1579 out:
1580 	NODEMASK_SCRATCH_FREE(scratch);
1581 
1582 	return err;
1583 
1584 out_put:
1585 	put_task_struct(task);
1586 	goto out;
1587 
1588 }
1589 
1590 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1591 		const unsigned long __user *, old_nodes,
1592 		const unsigned long __user *, new_nodes)
1593 {
1594 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1595 }
1596 
1597 
1598 /* Retrieve NUMA policy */
1599 static int kernel_get_mempolicy(int __user *policy,
1600 				unsigned long __user *nmask,
1601 				unsigned long maxnode,
1602 				unsigned long addr,
1603 				unsigned long flags)
1604 {
1605 	int err;
1606 	int uninitialized_var(pval);
1607 	nodemask_t nodes;
1608 
1609 	addr = untagged_addr(addr);
1610 
1611 	if (nmask != NULL && maxnode < nr_node_ids)
1612 		return -EINVAL;
1613 
1614 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1615 
1616 	if (err)
1617 		return err;
1618 
1619 	if (policy && put_user(pval, policy))
1620 		return -EFAULT;
1621 
1622 	if (nmask)
1623 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1624 
1625 	return err;
1626 }
1627 
1628 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1629 		unsigned long __user *, nmask, unsigned long, maxnode,
1630 		unsigned long, addr, unsigned long, flags)
1631 {
1632 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1633 }
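/*
 * Illustrative userspace sketch (editorial addition, not part of this
 * file): querying the policy and current node of a mapped address via the
 * get_mempolicy(2) wrapper in <numaif.h>.  "addr" is assumed to point
 * into an existing mapping; error handling is omitted.
 *
 *	int mode;
 *	unsigned long mask[16] = { 0 };
 *
 *	Which policy governs the VMA containing addr?
 *	get_mempolicy(&mode, mask, sizeof(mask) * 8, addr, MPOL_F_ADDR);
 *
 *	On which node does the page backing addr currently live?
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */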
1634 
1635 #ifdef CONFIG_COMPAT
1636 
1637 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1638 		       compat_ulong_t __user *, nmask,
1639 		       compat_ulong_t, maxnode,
1640 		       compat_ulong_t, addr, compat_ulong_t, flags)
1641 {
1642 	long err;
1643 	unsigned long __user *nm = NULL;
1644 	unsigned long nr_bits, alloc_size;
1645 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1646 
1647 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1648 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1649 
1650 	if (nmask)
1651 		nm = compat_alloc_user_space(alloc_size);
1652 
1653 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1654 
1655 	if (!err && nmask) {
1656 		unsigned long copy_size;
1657 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1658 		err = copy_from_user(bm, nm, copy_size);
1659 		/* ensure entire bitmap is zeroed */
1660 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1661 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1662 	}
1663 
1664 	return err;
1665 }
1666 
1667 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1668 		       compat_ulong_t, maxnode)
1669 {
1670 	unsigned long __user *nm = NULL;
1671 	unsigned long nr_bits, alloc_size;
1672 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1673 
1674 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1675 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1676 
1677 	if (nmask) {
1678 		if (compat_get_bitmap(bm, nmask, nr_bits))
1679 			return -EFAULT;
1680 		nm = compat_alloc_user_space(alloc_size);
1681 		if (copy_to_user(nm, bm, alloc_size))
1682 			return -EFAULT;
1683 	}
1684 
1685 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1686 }
1687 
1688 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1689 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1690 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1691 {
1692 	unsigned long __user *nm = NULL;
1693 	unsigned long nr_bits, alloc_size;
1694 	nodemask_t bm;
1695 
1696 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1697 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1698 
1699 	if (nmask) {
1700 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1701 			return -EFAULT;
1702 		nm = compat_alloc_user_space(alloc_size);
1703 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1704 			return -EFAULT;
1705 	}
1706 
1707 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1708 }
1709 
1710 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1711 		       compat_ulong_t, maxnode,
1712 		       const compat_ulong_t __user *, old_nodes,
1713 		       const compat_ulong_t __user *, new_nodes)
1714 {
1715 	unsigned long __user *old = NULL;
1716 	unsigned long __user *new = NULL;
1717 	nodemask_t tmp_mask;
1718 	unsigned long nr_bits;
1719 	unsigned long size;
1720 
1721 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1722 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1723 	if (old_nodes) {
1724 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1725 			return -EFAULT;
1726 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1727 		if (new_nodes)
1728 			new = old + size / sizeof(unsigned long);
1729 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1730 			return -EFAULT;
1731 	}
1732 	if (new_nodes) {
1733 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1734 			return -EFAULT;
1735 		if (new == NULL)
1736 			new = compat_alloc_user_space(size);
1737 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1738 			return -EFAULT;
1739 	}
1740 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1741 }
1742 
1743 #endif /* CONFIG_COMPAT */
1744 
1745 bool vma_migratable(struct vm_area_struct *vma)
1746 {
1747 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1748 		return false;
1749 
1750 	/*
1751 	 * DAX device mappings require predictable access latency, so avoid
1752 	 * incurring periodic faults.
1753 	 */
1754 	if (vma_is_dax(vma))
1755 		return false;
1756 
1757 	if (is_vm_hugetlb_page(vma) &&
1758 		!hugepage_migration_supported(hstate_vma(vma)))
1759 		return false;
1760 
1761 	/*
1762 	 * Migration allocates pages in the highest zone. If we cannot
1763 	 * do so then migration (at least from node to node) is not
1764 	 * possible.
1765 	 */
1766 	if (vma->vm_file &&
1767 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1768 			< policy_zone)
1769 		return false;
1770 	return true;
1771 }
1772 
1773 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1774 						unsigned long addr)
1775 {
1776 	struct mempolicy *pol = NULL;
1777 
1778 	if (vma) {
1779 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1780 			pol = vma->vm_ops->get_policy(vma, addr);
1781 		} else if (vma->vm_policy) {
1782 			pol = vma->vm_policy;
1783 
1784 			/*
1785 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1786 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1787 			 * count on these policies which will be dropped by
1788 			 * mpol_cond_put() later
1789 			 */
1790 			if (mpol_needs_cond_ref(pol))
1791 				mpol_get(pol);
1792 		}
1793 	}
1794 
1795 	return pol;
1796 }
1797 
1798 /*
1799  * get_vma_policy(@vma, @addr)
1800  * @vma: virtual memory area whose policy is sought
1801  * @addr: address in @vma for shared policy lookup
1802  *
1803  * Returns effective policy for a VMA at specified address.
1804  * Falls back to current->mempolicy or system default policy, as necessary.
1805  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1806  * count--added by the get_policy() vm_op, as appropriate--to protect against
1807  * freeing by another task.  It is the caller's responsibility to free the
1808  * extra reference for shared policies.
1809  */
1810 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1811 						unsigned long addr)
1812 {
1813 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1814 
1815 	if (!pol)
1816 		pol = get_task_policy(current);
1817 
1818 	return pol;
1819 }
1820 
1821 bool vma_policy_mof(struct vm_area_struct *vma)
1822 {
1823 	struct mempolicy *pol;
1824 
1825 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1826 		bool ret = false;
1827 
1828 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1829 		if (pol && (pol->flags & MPOL_F_MOF))
1830 			ret = true;
1831 		mpol_cond_put(pol);
1832 
1833 		return ret;
1834 	}
1835 
1836 	pol = vma->vm_policy;
1837 	if (!pol)
1838 		pol = get_task_policy(current);
1839 
1840 	return pol->flags & MPOL_F_MOF;
1841 }
1842 
1843 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1844 {
1845 	enum zone_type dynamic_policy_zone = policy_zone;
1846 
1847 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1848 
1849 	/*
1850 	 * If policy->v.nodes has movable memory only,
1851 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1852 	 *
1853 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1854 	 * so if the following test fails, it implies
1855 	 * policy->v.nodes has movable memory only.
1856 	 */
1857 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1858 		dynamic_policy_zone = ZONE_MOVABLE;
1859 
1860 	return zone >= dynamic_policy_zone;
1861 }
1862 
1863 /*
1864  * Return a nodemask representing a mempolicy for filtering nodes for
1865  * page allocation
1866  */
1867 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1868 {
1869 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1870 	if (unlikely(policy->mode == MPOL_BIND) &&
1871 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1872 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1873 		return &policy->v.nodes;
1874 
1875 	return NULL;
1876 }
1877 
1878 /* Return the node id preferred by the given mempolicy, or the given id */
1879 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1880 								int nd)
1881 {
1882 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1883 		nd = policy->v.preferred_node;
1884 	else {
1885 		/*
1886 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1887 		 * because we might easily break the expectation to stay on the
1888 		 * requested node and not break the policy.
1889 		 */
1890 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1891 	}
1892 
1893 	return nd;
1894 }
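
/*
 * Example usage (sketch): policy_nodemask() and policy_node() are meant
 * to be used together to feed the page allocator, as alloc_pages_vma()
 * does below:
 *
 *	nodemask_t *nmask = policy_nodemask(gfp, pol);
 *	int nid = policy_node(gfp, pol, numa_node_id());
 *	struct page *page = __alloc_pages_nodemask(gfp, order, nid, nmask);
 */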
1895 
1896 /* Do dynamic interleaving for a process */
1897 static unsigned interleave_nodes(struct mempolicy *policy)
1898 {
1899 	unsigned next;
1900 	struct task_struct *me = current;
1901 
1902 	next = next_node_in(me->il_prev, policy->v.nodes);
1903 	if (next < MAX_NUMNODES)
1904 		me->il_prev = next;
1905 	return next;
1906 }
1907 
1908 /*
1909  * Depending on the memory policy provide a node from which to allocate the
1910  * next slab entry.
1911  */
1912 unsigned int mempolicy_slab_node(void)
1913 {
1914 	struct mempolicy *policy;
1915 	int node = numa_mem_id();
1916 
1917 	if (in_interrupt())
1918 		return node;
1919 
1920 	policy = current->mempolicy;
1921 	if (!policy || policy->flags & MPOL_F_LOCAL)
1922 		return node;
1923 
1924 	switch (policy->mode) {
1925 	case MPOL_PREFERRED:
1926 		/*
1927 		 * handled MPOL_F_LOCAL above
1928 		 */
1929 		return policy->v.preferred_node;
1930 
1931 	case MPOL_INTERLEAVE:
1932 		return interleave_nodes(policy);
1933 
1934 	case MPOL_BIND: {
1935 		struct zoneref *z;
1936 
1937 		/*
1938 		 * Follow bind policy behavior and start allocation at the
1939 		 * first node.
1940 		 */
1941 		struct zonelist *zonelist;
1942 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1943 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1944 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1945 							&policy->v.nodes);
1946 		return z->zone ? zone_to_nid(z->zone) : node;
1947 	}
1948 
1949 	default:
1950 		BUG();
1951 	}
1952 }
1953 
1954 /*
1955  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1956  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1957  * number of present nodes.
1958  */
1959 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1960 {
1961 	unsigned nnodes = nodes_weight(pol->v.nodes);
1962 	unsigned target;
1963 	int i;
1964 	int nid;
1965 
1966 	if (!nnodes)
1967 		return numa_node_id();
1968 	target = (unsigned int)n % nnodes;
1969 	nid = first_node(pol->v.nodes);
1970 	for (i = 0; i < target; i++)
1971 		nid = next_node(nid, pol->v.nodes);
1972 	return nid;
1973 }
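
/*
 * Worked example: with pol->v.nodes = {0,2,5} and n = 7, nnodes = 3 and
 * target = 7 % 3 = 1, so the walk starts at node 0, takes one
 * next_node() step and returns node 2.
 */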
1974 
1975 /* Determine a node number for interleave */
1976 static inline unsigned interleave_nid(struct mempolicy *pol,
1977 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1978 {
1979 	if (vma) {
1980 		unsigned long off;
1981 
1982 		/*
1983 		 * For small pages, there is no difference between
1984 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1985 		 * For huge pages, since vm_pgoff is in units of small
1986 		 * pages, we need to shift off the always-zero bits to get
1987 		 * a useful offset.
1988 		 */
1989 		BUG_ON(shift < PAGE_SHIFT);
1990 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1991 		off += (addr - vma->vm_start) >> shift;
1992 		return offset_il_node(pol, off);
1993 	} else
1994 		return interleave_nodes(pol);
1995 }
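
/*
 * Worked example (2MB huge pages, so shift = 21 and PAGE_SHIFT = 12):
 * with vm_pgoff = 0x800 small pages and a fault 6MB into the VMA,
 *
 *	off  = 0x800 >> (21 - 12);		// 4 huge pages of file offset
 *	off += (6 << 20) >> 21;			// + 3 huge pages into the VMA
 *
 * giving off = 7, which offset_il_node() maps onto the interleave set.
 */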
1996 
1997 #ifdef CONFIG_HUGETLBFS
1998 /*
1999  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2000  * @vma: virtual memory area whose policy is sought
2001  * @addr: address in @vma for shared policy lookup and interleave policy
2002  * @gfp_flags: for requested zone
2003  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2004  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2005  *
2006  * Returns a nid suitable for a huge page allocation and a pointer
2007  * to the struct mempolicy for conditional unref after allocation.
2008  * If the effective policy is 'bind', returns a pointer to the mempolicy's
2009  * @nodemask for filtering the zonelist.
2010  *
2011  * Must be protected by read_mems_allowed_begin()
2012  */
2013 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2014 				struct mempolicy **mpol, nodemask_t **nodemask)
2015 {
2016 	int nid;
2017 
2018 	*mpol = get_vma_policy(vma, addr);
2019 	*nodemask = NULL;	/* assume !MPOL_BIND */
2020 
2021 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2022 		nid = interleave_nid(*mpol, vma, addr,
2023 					huge_page_shift(hstate_vma(vma)));
2024 	} else {
2025 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2026 		if ((*mpol)->mode == MPOL_BIND)
2027 			*nodemask = &(*mpol)->v.nodes;
2028 	}
2029 	return nid;
2030 }
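
/*
 * Example usage (sketch of the hugetlb allocation path; the allocation
 * itself is elided):
 *
 *	struct page *page = NULL;
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	unsigned int cookie;
 *	int nid;
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *		// ... page = huge page allocated on nid, filtered by nodemask ...
 *		mpol_cond_put(mpol);
 *	} while (!page && read_mems_allowed_retry(cookie));
 */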
2031 
2032 /*
2033  * init_nodemask_of_mempolicy
2034  *
2035  * If the current task's mempolicy is "default" [NULL], return 'false'
2036  * to indicate default policy.  Otherwise, extract the policy nodemask
2037  * for 'bind' or 'interleave' policy into the argument nodemask, or
2038  * initialize the argument nodemask to contain the single node for
2039  * 'preferred' or 'local' policy and return 'true' to indicate presence
2040  * of non-default mempolicy.
2041  *
2042  * We don't bother with reference counting the mempolicy [mpol_get/put]
2043  * because the current task is examining its own mempolicy and a task's
2044  * mempolicy is only ever changed by the task itself.
2045  *
2046  * N.B., it is the caller's responsibility to free a returned nodemask.
2047  */
2048 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2049 {
2050 	struct mempolicy *mempolicy;
2051 	int nid;
2052 
2053 	if (!(mask && current->mempolicy))
2054 		return false;
2055 
2056 	task_lock(current);
2057 	mempolicy = current->mempolicy;
2058 	switch (mempolicy->mode) {
2059 	case MPOL_PREFERRED:
2060 		if (mempolicy->flags & MPOL_F_LOCAL)
2061 			nid = numa_node_id();
2062 		else
2063 			nid = mempolicy->v.preferred_node;
2064 		init_nodemask_of_node(mask, nid);
2065 		break;
2066 
2067 	case MPOL_BIND:
2068 	case MPOL_INTERLEAVE:
2069 		*mask = mempolicy->v.nodes;
2070 		break;
2071 
2072 	default:
2073 		BUG();
2074 	}
2075 	task_unlock(current);
2076 
2077 	return true;
2078 }
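
/*
 * Example usage (sketch): a caller such as the hugetlb sysctl handler
 * can honour the current task's mempolicy and fall back to all memory
 * nodes for the default policy:
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 */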
2079 #endif
2080 
2081 /*
2082  * mempolicy_nodemask_intersects
2083  *
2084  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2085  * policy.  Otherwise, check for intersection between mask and the policy
2086  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2087  * policy, always return true since it may allocate elsewhere on fallback.
2088  *
2089  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2090  */
2091 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2092 					const nodemask_t *mask)
2093 {
2094 	struct mempolicy *mempolicy;
2095 	bool ret = true;
2096 
2097 	if (!mask)
2098 		return ret;
2099 	task_lock(tsk);
2100 	mempolicy = tsk->mempolicy;
2101 	if (!mempolicy)
2102 		goto out;
2103 
2104 	switch (mempolicy->mode) {
2105 	case MPOL_PREFERRED:
2106 		/*
2107 		 * MPOL_PREFERRED and MPOL_F_LOCAL only specify preferred nodes to
2108 		 * allocate from; the task may fall back to other nodes when OOM.
2109 		 * Thus, it's possible for tsk to have allocated memory from
2110 		 * nodes in mask.
2111 		 */
2112 		break;
2113 	case MPOL_BIND:
2114 	case MPOL_INTERLEAVE:
2115 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2116 		break;
2117 	default:
2118 		BUG();
2119 	}
2120 out:
2121 	task_unlock(tsk);
2122 	return ret;
2123 }
2124 
2125 /* Allocate a page in interleaved policy.
2126    Own path because it needs to do special accounting. */
2127 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2128 					unsigned nid)
2129 {
2130 	struct page *page;
2131 
2132 	page = __alloc_pages(gfp, order, nid);
2133 	/* skip NUMA_INTERLEAVE_HIT counter update if NUMA stats are disabled */
2134 	if (!static_branch_likely(&vm_numa_stat_key))
2135 		return page;
2136 	if (page && page_to_nid(page) == nid) {
2137 		preempt_disable();
2138 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2139 		preempt_enable();
2140 	}
2141 	return page;
2142 }
2143 
2144 /**
2145  * 	alloc_pages_vma	- Allocate a page for a VMA.
2146  *
2147  * 	@gfp:
2148  *      %GFP_USER    user allocation.
2149  *      %GFP_KERNEL  kernel allocations,
2150  *      %GFP_HIGHMEM highmem/user allocations,
2151  *      %GFP_FS      allocation should not call back into a file system.
2152  *      %GFP_ATOMIC  don't sleep.
2153  *
2154  *	@order:Order of the GFP allocation.
2155  * 	@vma:  Pointer to VMA or NULL if not available.
2156  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2157  *	@node: Which node to prefer for allocation (modulo policy).
2158  *	@hugepage: for hugepages try only the preferred node if possible
2159  *
2160  * 	This function allocates a page from the kernel page pool and applies
2161  *	a NUMA policy associated with the VMA or the current process.
2162  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2163  *	mm_struct of the VMA to prevent it from going away. Should be used for
2164  *	all allocations for pages that will be mapped into user space. Returns
2165  *	NULL when no page can be allocated.
2166  */
2167 struct page *
2168 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2169 		unsigned long addr, int node, bool hugepage)
2170 {
2171 	struct mempolicy *pol;
2172 	struct page *page;
2173 	int preferred_nid;
2174 	nodemask_t *nmask;
2175 
2176 	pol = get_vma_policy(vma, addr);
2177 
2178 	if (pol->mode == MPOL_INTERLEAVE) {
2179 		unsigned nid;
2180 
2181 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2182 		mpol_cond_put(pol);
2183 		page = alloc_page_interleave(gfp, order, nid);
2184 		goto out;
2185 	}
2186 
2187 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2188 		int hpage_node = node;
2189 
2190 		/*
2191 		 * For hugepage allocation and non-interleave policy which
2192 		 * allows the current node (or other explicitly preferred
2193 		 * node) we only try to allocate from the current/preferred
2194 		 * node and don't fall back to other nodes, as the cost of
2195 		 * remote accesses would likely offset THP benefits.
2196 		 *
2197 		 * If the policy is interleave, or does not allow the current
2198 		 * node in its nodemask, we allocate the standard way.
2199 		 */
2200 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2201 			hpage_node = pol->v.preferred_node;
2202 
2203 		nmask = policy_nodemask(gfp, pol);
2204 		if (!nmask || node_isset(hpage_node, *nmask)) {
2205 			mpol_cond_put(pol);
2206 			/*
2207 			 * First, try to allocate THP only on local node, but
2208 			 * don't reclaim unnecessarily, just compact.
2209 			 */
2210 			page = __alloc_pages_node(hpage_node,
2211 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2212 
2213 			/*
2214 			 * If hugepage allocations are configured to always use
2215 			 * synchronous compaction, or the vma has been madvised
2216 			 * to prefer hugepage backing, retry, allowing remote
2217 			 * memory with both reclaim and compaction as well.
2218 			 */
2219 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2220 				page = __alloc_pages_node(hpage_node,
2221 								gfp, order);
2222 
2223 			goto out;
2224 		}
2225 	}
2226 
2227 	nmask = policy_nodemask(gfp, pol);
2228 	preferred_nid = policy_node(gfp, pol, node);
2229 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2230 	mpol_cond_put(pol);
2231 out:
2232 	return page;
2233 }
2234 EXPORT_SYMBOL(alloc_pages_vma);
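
/*
 * Example usage (sketch): a fault handler allocating a single user page
 * for the faulting address, where vmf is a struct vm_fault pointer:
 *
 *	struct page *page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0,
 *					    vmf->vma, vmf->address,
 *					    numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */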
2235 
2236 /**
2237  * 	alloc_pages_current - Allocate pages.
2238  *
2239  *	@gfp:
2240  *		%GFP_USER   user allocation,
2241  *      	%GFP_KERNEL kernel allocation,
2242  *      	%GFP_HIGHMEM highmem allocation,
2243  *      	%GFP_FS     don't call back into a file system.
2244  *      	%GFP_ATOMIC don't sleep.
2245  *	@order: Power of two of allocation size in pages. 0 is a single page.
2246  *
2247  *	Allocate a page from the kernel page pool, applying the current
2248  *	process' NUMA policy when not in interrupt context.
2249  *	Returns NULL when no page can be allocated.
2250  */
2251 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2252 {
2253 	struct mempolicy *pol = &default_policy;
2254 	struct page *page;
2255 
2256 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2257 		pol = get_task_policy(current);
2258 
2259 	/*
2260 	 * No reference counting needed for current->mempolicy
2261 	 * nor system default_policy
2262 	 */
2263 	if (pol->mode == MPOL_INTERLEAVE)
2264 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2265 	else
2266 		page = __alloc_pages_nodemask(gfp, order,
2267 				policy_node(gfp, pol, numa_node_id()),
2268 				policy_nodemask(gfp, pol));
2269 
2270 	return page;
2271 }
2272 EXPORT_SYMBOL(alloc_pages_current);
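
/*
 * Example usage (sketch): on NUMA kernels the generic alloc_pages() and
 * alloc_page() helpers resolve to alloc_pages_current(), so a plain
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *
 * already follows the current task's interleave or preferred policy.
 */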
2273 
2274 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2275 {
2276 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2277 
2278 	if (IS_ERR(pol))
2279 		return PTR_ERR(pol);
2280 	dst->vm_policy = pol;
2281 	return 0;
2282 }
2283 
2284 /*
2285  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2286  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2287  * with the mems_allowed returned by cpuset_mems_allowed().  This
2288  * keeps mempolicies cpuset relative after its cpuset moves.  See
2289  * further kernel/cpuset.c update_nodemask().
2290  *
2291  * current's mempolicy may be rebound by another task (the task that changes
2292  * the cpuset's mems), so we need not do the rebind work for the current task.
2293  */
2294 
2295 /* Slow path of a mempolicy duplicate */
2296 struct mempolicy *__mpol_dup(struct mempolicy *old)
2297 {
2298 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2299 
2300 	if (!new)
2301 		return ERR_PTR(-ENOMEM);
2302 
2303 	/* task's mempolicy is protected by alloc_lock */
2304 	if (old == current->mempolicy) {
2305 		task_lock(current);
2306 		*new = *old;
2307 		task_unlock(current);
2308 	} else
2309 		*new = *old;
2310 
2311 	if (current_cpuset_is_being_rebound()) {
2312 		nodemask_t mems = cpuset_mems_allowed(current);
2313 		mpol_rebind_policy(new, &mems);
2314 	}
2315 	atomic_set(&new->refcnt, 1);
2316 	return new;
2317 }
2318 
2319 /* Slow path of a mempolicy comparison */
2320 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2321 {
2322 	if (!a || !b)
2323 		return false;
2324 	if (a->mode != b->mode)
2325 		return false;
2326 	if (a->flags != b->flags)
2327 		return false;
2328 	if (mpol_store_user_nodemask(a))
2329 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2330 			return false;
2331 
2332 	switch (a->mode) {
2333 	case MPOL_BIND:
2334 	case MPOL_INTERLEAVE:
2335 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2336 	case MPOL_PREFERRED:
2337 		/* a's ->flags is the same as b's */
2338 		if (a->flags & MPOL_F_LOCAL)
2339 			return true;
2340 		return a->v.preferred_node == b->v.preferred_node;
2341 	default:
2342 		BUG();
2343 		return false;
2344 	}
2345 }
2346 
2347 /*
2348  * Shared memory backing store policy support.
2349  *
2350  * Remember policies even when nobody has shared memory mapped.
2351  * The policies are kept in Red-Black tree linked from the inode.
2352  * They are protected by the sp->lock rwlock, which should be held
2353  * for any accesses to the tree.
2354  */
2355 
2356 /*
2357  * lookup first element intersecting start-end.  Caller holds sp->lock for
2358  * reading or for writing
2359  */
2360 static struct sp_node *
2361 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2362 {
2363 	struct rb_node *n = sp->root.rb_node;
2364 
2365 	while (n) {
2366 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2367 
2368 		if (start >= p->end)
2369 			n = n->rb_right;
2370 		else if (end <= p->start)
2371 			n = n->rb_left;
2372 		else
2373 			break;
2374 	}
2375 	if (!n)
2376 		return NULL;
2377 	for (;;) {
2378 		struct sp_node *w = NULL;
2379 		struct rb_node *prev = rb_prev(n);
2380 		if (!prev)
2381 			break;
2382 		w = rb_entry(prev, struct sp_node, nd);
2383 		if (w->end <= start)
2384 			break;
2385 		n = prev;
2386 	}
2387 	return rb_entry(n, struct sp_node, nd);
2388 }
2389 
2390 /*
2391  * Insert a new shared policy into the list.  Caller holds sp->lock for
2392  * writing.
2393  */
2394 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2395 {
2396 	struct rb_node **p = &sp->root.rb_node;
2397 	struct rb_node *parent = NULL;
2398 	struct sp_node *nd;
2399 
2400 	while (*p) {
2401 		parent = *p;
2402 		nd = rb_entry(parent, struct sp_node, nd);
2403 		if (new->start < nd->start)
2404 			p = &(*p)->rb_left;
2405 		else if (new->end > nd->end)
2406 			p = &(*p)->rb_right;
2407 		else
2408 			BUG();
2409 	}
2410 	rb_link_node(&new->nd, parent, p);
2411 	rb_insert_color(&new->nd, &sp->root);
2412 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2413 		 new->policy ? new->policy->mode : 0);
2414 }
2415 
2416 /* Find shared policy intersecting idx */
2417 struct mempolicy *
2418 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2419 {
2420 	struct mempolicy *pol = NULL;
2421 	struct sp_node *sn;
2422 
2423 	if (!sp->root.rb_node)
2424 		return NULL;
2425 	read_lock(&sp->lock);
2426 	sn = sp_lookup(sp, idx, idx+1);
2427 	if (sn) {
2428 		mpol_get(sn->policy);
2429 		pol = sn->policy;
2430 	}
2431 	read_unlock(&sp->lock);
2432 	return pol;
2433 }
2434 
2435 static void sp_free(struct sp_node *n)
2436 {
2437 	mpol_put(n->policy);
2438 	kmem_cache_free(sn_cache, n);
2439 }
2440 
2441 /**
2442  * mpol_misplaced - check whether current page node is valid in policy
2443  *
2444  * @page: page to be checked
2445  * @vma: vm area where page mapped
2446  * @addr: virtual address where page mapped
2447  *
2448  * Lookup current policy node id for vma,addr and "compare to" page's
2449  * node id.
2450  *
2451  * Returns:
2452  *	-1	- not misplaced, page is in the right node
2453  *	node	- node id where the page should be
2454  *
2455  * Policy determination "mimics" alloc_page_vma().
2456  * Called from fault path where we know the vma and faulting address.
2457  */
2458 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2459 {
2460 	struct mempolicy *pol;
2461 	struct zoneref *z;
2462 	int curnid = page_to_nid(page);
2463 	unsigned long pgoff;
2464 	int thiscpu = raw_smp_processor_id();
2465 	int thisnid = cpu_to_node(thiscpu);
2466 	int polnid = NUMA_NO_NODE;
2467 	int ret = -1;
2468 
2469 	pol = get_vma_policy(vma, addr);
2470 	if (!(pol->flags & MPOL_F_MOF))
2471 		goto out;
2472 
2473 	switch (pol->mode) {
2474 	case MPOL_INTERLEAVE:
2475 		pgoff = vma->vm_pgoff;
2476 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2477 		polnid = offset_il_node(pol, pgoff);
2478 		break;
2479 
2480 	case MPOL_PREFERRED:
2481 		if (pol->flags & MPOL_F_LOCAL)
2482 			polnid = numa_node_id();
2483 		else
2484 			polnid = pol->v.preferred_node;
2485 		break;
2486 
2487 	case MPOL_BIND:
2488 
2489 		/*
2490 		 * MPOL_BIND allows binding to multiple nodes.
2491 		 * Use the current page's node if it is in the policy nodemask,
2492 		 * else select the nearest allowed node, if any.
2493 		 * If there are no allowed nodes, use the current node [!misplaced].
2494 		 */
2495 		if (node_isset(curnid, pol->v.nodes))
2496 			goto out;
2497 		z = first_zones_zonelist(
2498 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2499 				gfp_zone(GFP_HIGHUSER),
2500 				&pol->v.nodes);
2501 		polnid = zone_to_nid(z->zone);
2502 		break;
2503 
2504 	default:
2505 		BUG();
2506 	}
2507 
2508 	/* Migrate the page towards the node whose CPU is referencing it */
2509 	if (pol->flags & MPOL_F_MORON) {
2510 		polnid = thisnid;
2511 
2512 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2513 			goto out;
2514 	}
2515 
2516 	if (curnid != polnid)
2517 		ret = polnid;
2518 out:
2519 	mpol_cond_put(pol);
2520 
2521 	return ret;
2522 }
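
/*
 * Example usage (sketch): the NUMA hinting fault path uses the return
 * value to decide whether to migrate the page:
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid != -1)
 *		migrate_misplaced_page(page, vma, target_nid);
 */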
2523 
2524 /*
2525  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2526  * dropped after task->mempolicy is set to NULL so that any allocation done as
2527  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2528  * policy.
2529  */
2530 void mpol_put_task_policy(struct task_struct *task)
2531 {
2532 	struct mempolicy *pol;
2533 
2534 	task_lock(task);
2535 	pol = task->mempolicy;
2536 	task->mempolicy = NULL;
2537 	task_unlock(task);
2538 	mpol_put(pol);
2539 }
2540 
2541 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2542 {
2543 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2544 	rb_erase(&n->nd, &sp->root);
2545 	sp_free(n);
2546 }
2547 
2548 static void sp_node_init(struct sp_node *node, unsigned long start,
2549 			unsigned long end, struct mempolicy *pol)
2550 {
2551 	node->start = start;
2552 	node->end = end;
2553 	node->policy = pol;
2554 }
2555 
2556 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2557 				struct mempolicy *pol)
2558 {
2559 	struct sp_node *n;
2560 	struct mempolicy *newpol;
2561 
2562 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2563 	if (!n)
2564 		return NULL;
2565 
2566 	newpol = mpol_dup(pol);
2567 	if (IS_ERR(newpol)) {
2568 		kmem_cache_free(sn_cache, n);
2569 		return NULL;
2570 	}
2571 	newpol->flags |= MPOL_F_SHARED;
2572 	sp_node_init(n, start, end, newpol);
2573 
2574 	return n;
2575 }
2576 
2577 /* Replace a policy range. */
2578 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2579 				 unsigned long end, struct sp_node *new)
2580 {
2581 	struct sp_node *n;
2582 	struct sp_node *n_new = NULL;
2583 	struct mempolicy *mpol_new = NULL;
2584 	int ret = 0;
2585 
2586 restart:
2587 	write_lock(&sp->lock);
2588 	n = sp_lookup(sp, start, end);
2589 	/* Take care of old policies in the same range. */
2590 	while (n && n->start < end) {
2591 		struct rb_node *next = rb_next(&n->nd);
2592 		if (n->start >= start) {
2593 			if (n->end <= end)
2594 				sp_delete(sp, n);
2595 			else
2596 				n->start = end;
2597 		} else {
2598 			/* Old policy spanning whole new range. */
2599 			if (n->end > end) {
2600 				if (!n_new)
2601 					goto alloc_new;
2602 
2603 				*mpol_new = *n->policy;
2604 				atomic_set(&mpol_new->refcnt, 1);
2605 				sp_node_init(n_new, end, n->end, mpol_new);
2606 				n->end = start;
2607 				sp_insert(sp, n_new);
2608 				n_new = NULL;
2609 				mpol_new = NULL;
2610 				break;
2611 			} else
2612 				n->end = start;
2613 		}
2614 		if (!next)
2615 			break;
2616 		n = rb_entry(next, struct sp_node, nd);
2617 	}
2618 	if (new)
2619 		sp_insert(sp, new);
2620 	write_unlock(&sp->lock);
2621 	ret = 0;
2622 
2623 err_out:
2624 	if (mpol_new)
2625 		mpol_put(mpol_new);
2626 	if (n_new)
2627 		kmem_cache_free(sn_cache, n_new);
2628 
2629 	return ret;
2630 
2631 alloc_new:
2632 	write_unlock(&sp->lock);
2633 	ret = -ENOMEM;
2634 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2635 	if (!n_new)
2636 		goto err_out;
2637 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2638 	if (!mpol_new)
2639 		goto err_out;
2640 	goto restart;
2641 }
2642 
2643 /**
2644  * mpol_shared_policy_init - initialize shared policy for inode
2645  * @sp: pointer to inode shared policy
2646  * @mpol:  struct mempolicy to install
2647  *
2648  * Install non-NULL @mpol in inode's shared policy rb-tree.
2649  * On entry, the current task has a reference on a non-NULL @mpol.
2650  * This must be released on exit.
2651  * This is called during get_inode(), so we can use GFP_KERNEL.
2652  */
2653 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2654 {
2655 	int ret;
2656 
2657 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2658 	rwlock_init(&sp->lock);
2659 
2660 	if (mpol) {
2661 		struct vm_area_struct pvma;
2662 		struct mempolicy *new;
2663 		NODEMASK_SCRATCH(scratch);
2664 
2665 		if (!scratch)
2666 			goto put_mpol;
2667 		/* contextualize the tmpfs mount point mempolicy */
2668 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2669 		if (IS_ERR(new))
2670 			goto free_scratch; /* no valid nodemask intersection */
2671 
2672 		task_lock(current);
2673 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2674 		task_unlock(current);
2675 		if (ret)
2676 			goto put_new;
2677 
2678 		/* Create pseudo-vma that contains just the policy */
2679 		vma_init(&pvma, NULL);
2680 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2681 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2682 
2683 put_new:
2684 		mpol_put(new);			/* drop initial ref */
2685 free_scratch:
2686 		NODEMASK_SCRATCH_FREE(scratch);
2687 put_mpol:
2688 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2689 	}
2690 }
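
/*
 * Example usage (sketch): a tmpfs-like filesystem wires this up at inode
 * creation time; "info" and "sb_mpol" are placeholder names for the
 * per-inode data and the mempolicy referenced by the superblock's
 * "mpol=" mount option:
 *
 *	mpol_shared_policy_init(&info->policy, sb_mpol);	// consumes the ref
 */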
2691 
2692 int mpol_set_shared_policy(struct shared_policy *info,
2693 			struct vm_area_struct *vma, struct mempolicy *npol)
2694 {
2695 	int err;
2696 	struct sp_node *new = NULL;
2697 	unsigned long sz = vma_pages(vma);
2698 
2699 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2700 		 vma->vm_pgoff,
2701 		 sz, npol ? npol->mode : -1,
2702 		 npol ? npol->flags : -1,
2703 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2704 
2705 	if (npol) {
2706 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2707 		if (!new)
2708 			return -ENOMEM;
2709 	}
2710 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2711 	if (err && new)
2712 		sp_free(new);
2713 	return err;
2714 }
2715 
2716 /* Free a backing policy store on inode delete. */
2717 void mpol_free_shared_policy(struct shared_policy *p)
2718 {
2719 	struct sp_node *n;
2720 	struct rb_node *next;
2721 
2722 	if (!p->root.rb_node)
2723 		return;
2724 	write_lock(&p->lock);
2725 	next = rb_first(&p->root);
2726 	while (next) {
2727 		n = rb_entry(next, struct sp_node, nd);
2728 		next = rb_next(&n->nd);
2729 		sp_delete(p, n);
2730 	}
2731 	write_unlock(&p->lock);
2732 }
2733 
2734 #ifdef CONFIG_NUMA_BALANCING
2735 static int __initdata numabalancing_override;
2736 
2737 static void __init check_numabalancing_enable(void)
2738 {
2739 	bool numabalancing_default = false;
2740 
2741 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2742 		numabalancing_default = true;
2743 
2744 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2745 	if (numabalancing_override)
2746 		set_numabalancing_state(numabalancing_override == 1);
2747 
2748 	if (num_online_nodes() > 1 && !numabalancing_override) {
2749 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2750 			numabalancing_default ? "Enabling" : "Disabling");
2751 		set_numabalancing_state(numabalancing_default);
2752 	}
2753 }
2754 
2755 static int __init setup_numabalancing(char *str)
2756 {
2757 	int ret = 0;
2758 	if (!str)
2759 		goto out;
2760 
2761 	if (!strcmp(str, "enable")) {
2762 		numabalancing_override = 1;
2763 		ret = 1;
2764 	} else if (!strcmp(str, "disable")) {
2765 		numabalancing_override = -1;
2766 		ret = 1;
2767 	}
2768 out:
2769 	if (!ret)
2770 		pr_warn("Unable to parse numa_balancing=\n");
2771 
2772 	return ret;
2773 }
2774 __setup("numa_balancing=", setup_numabalancing);
2775 #else
2776 static inline void __init check_numabalancing_enable(void)
2777 {
2778 }
2779 #endif /* CONFIG_NUMA_BALANCING */
2780 
2781 /* assumes fs == KERNEL_DS */
2782 void __init numa_policy_init(void)
2783 {
2784 	nodemask_t interleave_nodes;
2785 	unsigned long largest = 0;
2786 	int nid, prefer = 0;
2787 
2788 	policy_cache = kmem_cache_create("numa_policy",
2789 					 sizeof(struct mempolicy),
2790 					 0, SLAB_PANIC, NULL);
2791 
2792 	sn_cache = kmem_cache_create("shared_policy_node",
2793 				     sizeof(struct sp_node),
2794 				     0, SLAB_PANIC, NULL);
2795 
2796 	for_each_node(nid) {
2797 		preferred_node_policy[nid] = (struct mempolicy) {
2798 			.refcnt = ATOMIC_INIT(1),
2799 			.mode = MPOL_PREFERRED,
2800 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2801 			.v = { .preferred_node = nid, },
2802 		};
2803 	}
2804 
2805 	/*
2806 	 * Set interleaving policy for system init. Interleaving is only
2807 	 * enabled across suitably sized nodes (default is >= 16MB); otherwise
2808 	 * fall back to the largest node if they are all smaller.
2809 	 */
2810 	nodes_clear(interleave_nodes);
2811 	for_each_node_state(nid, N_MEMORY) {
2812 		unsigned long total_pages = node_present_pages(nid);
2813 
2814 		/* Preserve the largest node */
2815 		if (largest < total_pages) {
2816 			largest = total_pages;
2817 			prefer = nid;
2818 		}
2819 
2820 		/* Interleave this node? */
2821 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2822 			node_set(nid, interleave_nodes);
2823 	}
2824 
2825 	/* All too small, use the largest */
2826 	if (unlikely(nodes_empty(interleave_nodes)))
2827 		node_set(prefer, interleave_nodes);
2828 
2829 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2830 		pr_err("%s: interleaving failed\n", __func__);
2831 
2832 	check_numabalancing_enable();
2833 }
2834 
2835 /* Reset policy of current process to default */
2836 void numa_default_policy(void)
2837 {
2838 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2839 }
2840 
2841 /*
2842  * Parse and format mempolicy from/to strings
2843  */
2844 
2845 /*
2846  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2847  */
2848 static const char * const policy_modes[] =
2849 {
2850 	[MPOL_DEFAULT]    = "default",
2851 	[MPOL_PREFERRED]  = "prefer",
2852 	[MPOL_BIND]       = "bind",
2853 	[MPOL_INTERLEAVE] = "interleave",
2854 	[MPOL_LOCAL]      = "local",
2855 };
2856 
2857 
2858 #ifdef CONFIG_TMPFS
2859 /**
2860  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2861  * @str:  string containing mempolicy to parse
2862  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2863  *
2864  * Format of input:
2865  *	<mode>[=<flags>][:<nodelist>]
2866  *
2867  * On success, returns 0, else 1
2868  */
2869 int mpol_parse_str(char *str, struct mempolicy **mpol)
2870 {
2871 	struct mempolicy *new = NULL;
2872 	unsigned short mode_flags;
2873 	nodemask_t nodes;
2874 	char *nodelist = strchr(str, ':');
2875 	char *flags = strchr(str, '=');
2876 	int err = 1, mode;
2877 
2878 	if (flags)
2879 		*flags++ = '\0';	/* terminate mode string */
2880 
2881 	if (nodelist) {
2882 		/* NUL-terminate mode or flags string */
2883 		*nodelist++ = '\0';
2884 		if (nodelist_parse(nodelist, nodes))
2885 			goto out;
2886 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2887 			goto out;
2888 	} else
2889 		nodes_clear(nodes);
2890 
2891 	mode = match_string(policy_modes, MPOL_MAX, str);
2892 	if (mode < 0)
2893 		goto out;
2894 
2895 	switch (mode) {
2896 	case MPOL_PREFERRED:
2897 		/*
2898 		 * Insist on a nodelist of one node only, although later
2899 		 * we use first_node(nodes) to grab a single node, so here
2900 		 * nodelist (or nodes) cannot be empty.
2901 		 */
2902 		if (nodelist) {
2903 			char *rest = nodelist;
2904 			while (isdigit(*rest))
2905 				rest++;
2906 			if (*rest)
2907 				goto out;
2908 			if (nodes_empty(nodes))
2909 				goto out;
2910 		}
2911 		break;
2912 	case MPOL_INTERLEAVE:
2913 		/*
2914 		 * Default to online nodes with memory if no nodelist
2915 		 */
2916 		if (!nodelist)
2917 			nodes = node_states[N_MEMORY];
2918 		break;
2919 	case MPOL_LOCAL:
2920 		/*
2921 		 * Don't allow a nodelist;  mpol_new() checks flags
2922 		 */
2923 		if (nodelist)
2924 			goto out;
2925 		mode = MPOL_PREFERRED;
2926 		break;
2927 	case MPOL_DEFAULT:
2928 		/*
2929 		 * Insist on an empty nodelist
2930 		 */
2931 		if (!nodelist)
2932 			err = 0;
2933 		goto out;
2934 	case MPOL_BIND:
2935 		/*
2936 		 * Insist on a nodelist
2937 		 */
2938 		if (!nodelist)
2939 			goto out;
2940 	}
2941 
2942 	mode_flags = 0;
2943 	if (flags) {
2944 		/*
2945 		 * Currently, we only support two mutually exclusive
2946 		 * mode flags.
2947 		 */
2948 		if (!strcmp(flags, "static"))
2949 			mode_flags |= MPOL_F_STATIC_NODES;
2950 		else if (!strcmp(flags, "relative"))
2951 			mode_flags |= MPOL_F_RELATIVE_NODES;
2952 		else
2953 			goto out;
2954 	}
2955 
2956 	new = mpol_new(mode, mode_flags, &nodes);
2957 	if (IS_ERR(new))
2958 		goto out;
2959 
2960 	/*
2961 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2962 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2963 	 */
2964 	if (mode != MPOL_PREFERRED)
2965 		new->v.nodes = nodes;
2966 	else if (nodelist)
2967 		new->v.preferred_node = first_node(nodes);
2968 	else
2969 		new->flags |= MPOL_F_LOCAL;
2970 
2971 	/*
2972 	 * Save nodes for contextualization: this will be used to "clone"
2973 	 * the mempolicy in a specific context [cpuset] at a later time.
2974 	 */
2975 	new->w.user_nodemask = nodes;
2976 
2977 	err = 0;
2978 
2979 out:
2980 	/* Restore string for error message */
2981 	if (nodelist)
2982 		*--nodelist = ':';
2983 	if (flags)
2984 		*--flags = '=';
2985 	if (!err)
2986 		*mpol = new;
2987 	return err;
2988 }
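
/*
 * Example usage (sketch, assuming nodes 0-3 are online with memory); the
 * string must be writable because separators are NUL-terminated in place:
 *
 *	char buf[] = "interleave=static:0-3";
 *	struct mempolicy *mpol;
 *
 *	if (!mpol_parse_str(buf, &mpol)) {
 *		// ... use mpol ...
 *		mpol_put(mpol);		// drop the reference when done
 *	}
 */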
2989 #endif /* CONFIG_TMPFS */
2990 
2991 /**
2992  * mpol_to_str - format a mempolicy structure for printing
2993  * @buffer:  to contain formatted mempolicy string
2994  * @maxlen:  length of @buffer
2995  * @pol:  pointer to mempolicy to be formatted
2996  *
2997  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2998  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2999  * longest flag, "relative", and to display at least a few node ids.
3000  */
3001 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3002 {
3003 	char *p = buffer;
3004 	nodemask_t nodes = NODE_MASK_NONE;
3005 	unsigned short mode = MPOL_DEFAULT;
3006 	unsigned short flags = 0;
3007 
3008 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3009 		mode = pol->mode;
3010 		flags = pol->flags;
3011 	}
3012 
3013 	switch (mode) {
3014 	case MPOL_DEFAULT:
3015 		break;
3016 	case MPOL_PREFERRED:
3017 		if (flags & MPOL_F_LOCAL)
3018 			mode = MPOL_LOCAL;
3019 		else
3020 			node_set(pol->v.preferred_node, nodes);
3021 		break;
3022 	case MPOL_BIND:
3023 	case MPOL_INTERLEAVE:
3024 		nodes = pol->v.nodes;
3025 		break;
3026 	default:
3027 		WARN_ON_ONCE(1);
3028 		snprintf(p, maxlen, "unknown");
3029 		return;
3030 	}
3031 
3032 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3033 
3034 	if (flags & MPOL_MODE_FLAGS) {
3035 		p += snprintf(p, buffer + maxlen - p, "=");
3036 
3037 		/*
3038 		 * Currently, the only defined flags are mutually exclusive
3039 		 */
3040 		if (flags & MPOL_F_STATIC_NODES)
3041 			p += snprintf(p, buffer + maxlen - p, "static");
3042 		else if (flags & MPOL_F_RELATIVE_NODES)
3043 			p += snprintf(p, buffer + maxlen - p, "relative");
3044 	}
3045 
3046 	if (!nodes_empty(nodes))
3047 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3048 			       nodemask_pr_args(&nodes));
3049 }
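
/*
 * Example usage (sketch): with a buffer comfortably above the recommended
 * 32 bytes, an interleave policy over nodes 0-3 formats as "interleave:0-3":
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 */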
3050