xref: /openbmc/linux/mm/mempolicy.c (revision 87fcfa7b7fe6bf819033fe827a27f710e38639b5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem kernel lowmem allocation don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
/*
 * Internal flags.  Both are built by shifting MPOL_MF_INTERNAL and are
 * passed around in the same flags word as the MPOL_MF_* flags.
 */

/* Skip checks for continuous vmas */
#define MPOL_MF_DISCONTIG_OK	(MPOL_MF_INTERNAL << 0)

/* Invert check for nodemask */
#define MPOL_MF_INVERT		(MPOL_MF_INTERNAL << 1)
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. An specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some check and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	unsigned long start;
414 	unsigned long end;
415 	struct vm_area_struct *first;
416 };
417 
418 /*
419  * Check if the page's nid is in qp->nmask.
420  *
421  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
422  * in the invert of qp->nmask.
423  */
424 static inline bool queue_pages_required(struct page *page,
425 					struct queue_pages *qp)
426 {
427 	int nid = page_to_nid(page);
428 	unsigned long flags = qp->flags;
429 
430 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431 }
432 
433 /*
434  * queue_pages_pmd() has four possible return values:
435  * 0 - pages are placed on the right node or queued successfully.
436  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
437  *     specified.
438  * 2 - THP was split.
439  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
440  *        existing page was already on a node that does not follow the
441  *        policy.
442  */
443 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 				unsigned long end, struct mm_walk *walk)
445 {
446 	int ret = 0;
447 	struct page *page;
448 	struct queue_pages *qp = walk->private;
449 	unsigned long flags;
450 
451 	if (unlikely(is_pmd_migration_entry(*pmd))) {
452 		ret = -EIO;
453 		goto unlock;
454 	}
455 	page = pmd_page(*pmd);
456 	if (is_huge_zero_page(page)) {
457 		spin_unlock(ptl);
458 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
459 		ret = 2;
460 		goto out;
461 	}
462 	if (!queue_pages_required(page, qp))
463 		goto unlock;
464 
465 	flags = qp->flags;
466 	/* go to thp migration */
467 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
468 		if (!vma_migratable(walk->vma) ||
469 		    migrate_page_add(page, qp->pagelist, flags)) {
470 			ret = 1;
471 			goto unlock;
472 		}
473 	} else
474 		ret = -EIO;
475 unlock:
476 	spin_unlock(ptl);
477 out:
478 	return ret;
479 }
480 
481 /*
482  * Scan through pages checking if pages follow certain conditions,
483  * and move them to the pagelist if they do.
484  *
485  * queue_pages_pte_range() has three possible return values:
486  * 0 - pages are placed on the right node or queued successfully.
487  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
488  *     specified.
489  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
490  *        on a node that does not follow the policy.
491  */
492 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
493 			unsigned long end, struct mm_walk *walk)
494 {
495 	struct vm_area_struct *vma = walk->vma;
496 	struct page *page;
497 	struct queue_pages *qp = walk->private;
498 	unsigned long flags = qp->flags;
499 	int ret;
500 	bool has_unmovable = false;
501 	pte_t *pte;
502 	spinlock_t *ptl;
503 
504 	ptl = pmd_trans_huge_lock(pmd, vma);
505 	if (ptl) {
506 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
507 		if (ret != 2)
508 			return ret;
509 	}
510 	/* THP was split, fall through to pte walk */
511 
512 	if (pmd_trans_unstable(pmd))
513 		return 0;
514 
515 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
516 	for (; addr != end; pte++, addr += PAGE_SIZE) {
517 		if (!pte_present(*pte))
518 			continue;
519 		page = vm_normal_page(vma, addr, *pte);
520 		if (!page)
521 			continue;
522 		/*
523 		 * vm_normal_page() filters out zero pages, but there might
524 		 * still be PageReserved pages to skip, perhaps in a VDSO.
525 		 */
526 		if (PageReserved(page))
527 			continue;
528 		if (!queue_pages_required(page, qp))
529 			continue;
530 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
531 			/* MPOL_MF_STRICT must be specified if we get here */
532 			if (!vma_migratable(vma)) {
533 				has_unmovable = true;
534 				break;
535 			}
536 
537 			/*
538 			 * Do not abort immediately since there may be
539 			 * temporary off LRU pages in the range.  Still
540 			 * need migrate other LRU pages.
541 			 */
542 			if (migrate_page_add(page, qp->pagelist, flags))
543 				has_unmovable = true;
544 		} else
545 			break;
546 	}
547 	pte_unmap_unlock(pte - 1, ptl);
548 	cond_resched();
549 
550 	if (has_unmovable)
551 		return 1;
552 
553 	return addr != end ? -EIO : 0;
554 }
555 
556 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
557 			       unsigned long addr, unsigned long end,
558 			       struct mm_walk *walk)
559 {
560 #ifdef CONFIG_HUGETLB_PAGE
561 	struct queue_pages *qp = walk->private;
562 	unsigned long flags = qp->flags;
563 	struct page *page;
564 	spinlock_t *ptl;
565 	pte_t entry;
566 
567 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
568 	entry = huge_ptep_get(pte);
569 	if (!pte_present(entry))
570 		goto unlock;
571 	page = pte_page(entry);
572 	if (!queue_pages_required(page, qp))
573 		goto unlock;
574 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
575 	if (flags & (MPOL_MF_MOVE_ALL) ||
576 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
577 		isolate_huge_page(page, qp->pagelist);
578 unlock:
579 	spin_unlock(ptl);
580 #else
581 	BUG();
582 #endif
583 	return 0;
584 }
585 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the count reported by change_protection(); a non-zero count is
 * also added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	/*
	 * Same type as this function's return value, so a large count is
	 * not truncated or sign-extended on its way through.
	 */
	unsigned long nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Without NUMA balancing there is nothing to mark; report no updates. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
614 
615 static int queue_pages_test_walk(unsigned long start, unsigned long end,
616 				struct mm_walk *walk)
617 {
618 	struct vm_area_struct *vma = walk->vma;
619 	struct queue_pages *qp = walk->private;
620 	unsigned long endvma = vma->vm_end;
621 	unsigned long flags = qp->flags;
622 
623 	/* range check first */
624 	VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
625 
626 	if (!qp->first) {
627 		qp->first = vma;
628 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
629 			(qp->start < vma->vm_start))
630 			/* hole at head side of range */
631 			return -EFAULT;
632 	}
633 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
634 		((vma->vm_end < qp->end) &&
635 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
636 		/* hole at middle or tail of range */
637 		return -EFAULT;
638 
639 	/*
640 	 * Need check MPOL_MF_STRICT to return -EIO if possible
641 	 * regardless of vma_migratable
642 	 */
643 	if (!vma_migratable(vma) &&
644 	    !(flags & MPOL_MF_STRICT))
645 		return 1;
646 
647 	if (endvma > end)
648 		endvma = end;
649 
650 	if (flags & MPOL_MF_LAZY) {
651 		/* Similar to task_numa_work, skip inaccessible VMAs */
652 		if (!is_vm_hugetlb_page(vma) &&
653 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
654 			!(vma->vm_flags & VM_MIXEDMAP))
655 			change_prot_numa(vma, start, endvma);
656 		return 1;
657 	}
658 
659 	/* queue pages from current vma */
660 	if (flags & MPOL_MF_VALID)
661 		return 0;
662 	return 1;
663 }
664 
665 static const struct mm_walk_ops queue_pages_walk_ops = {
666 	.hugetlb_entry		= queue_pages_hugetlb,
667 	.pmd_entry		= queue_pages_pte_range,
668 	.test_walk		= queue_pages_test_walk,
669 };
670 
671 /*
672  * Walk through page tables and collect pages to be migrated.
673  *
674  * If pages found in a given range are on a set of nodes (determined by
675  * @nodes and @flags,) it's isolated and queued to the pagelist which is
676  * passed via @private.
677  *
678  * queue_pages_range() has three possible return values:
679  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
680  *     specified.
681  * 0 - queue pages successfully or no misplaced page.
682  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
683  *         memory range specified by nodemask and maxnode points outside
684  *         your accessible address space (-EFAULT)
685  */
686 static int
687 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
688 		nodemask_t *nodes, unsigned long flags,
689 		struct list_head *pagelist)
690 {
691 	int err;
692 	struct queue_pages qp = {
693 		.pagelist = pagelist,
694 		.flags = flags,
695 		.nmask = nodes,
696 		.start = start,
697 		.end = end,
698 		.first = NULL,
699 	};
700 
701 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
702 
703 	if (!qp.first)
704 		/* whole range in hole */
705 		err = -EFAULT;
706 
707 	return err;
708 }
709 
710 /*
711  * Apply policy to a single VMA
712  * This must be called with the mmap_sem held for writing.
713  */
714 static int vma_replace_policy(struct vm_area_struct *vma,
715 						struct mempolicy *pol)
716 {
717 	int err;
718 	struct mempolicy *old;
719 	struct mempolicy *new;
720 
721 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
722 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
723 		 vma->vm_ops, vma->vm_file,
724 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
725 
726 	new = mpol_dup(pol);
727 	if (IS_ERR(new))
728 		return PTR_ERR(new);
729 
730 	if (vma->vm_ops && vma->vm_ops->set_policy) {
731 		err = vma->vm_ops->set_policy(vma, new);
732 		if (err)
733 			goto err_out;
734 	}
735 
736 	old = vma->vm_policy;
737 	vma->vm_policy = new; /* protected by mmap_sem */
738 	mpol_put(old);
739 
740 	return 0;
741  err_out:
742 	mpol_put(new);
743 	return err;
744 }
745 
746 /* Step 2: apply policy to a range and do splits. */
747 static int mbind_range(struct mm_struct *mm, unsigned long start,
748 		       unsigned long end, struct mempolicy *new_pol)
749 {
750 	struct vm_area_struct *next;
751 	struct vm_area_struct *prev;
752 	struct vm_area_struct *vma;
753 	int err = 0;
754 	pgoff_t pgoff;
755 	unsigned long vmstart;
756 	unsigned long vmend;
757 
758 	vma = find_vma(mm, start);
759 	VM_BUG_ON(!vma);
760 
761 	prev = vma->vm_prev;
762 	if (start > vma->vm_start)
763 		prev = vma;
764 
765 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
766 		next = vma->vm_next;
767 		vmstart = max(start, vma->vm_start);
768 		vmend   = min(end, vma->vm_end);
769 
770 		if (mpol_equal(vma_policy(vma), new_pol))
771 			continue;
772 
773 		pgoff = vma->vm_pgoff +
774 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
775 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
776 				 vma->anon_vma, vma->vm_file, pgoff,
777 				 new_pol, vma->vm_userfaultfd_ctx);
778 		if (prev) {
779 			vma = prev;
780 			next = vma->vm_next;
781 			if (mpol_equal(vma_policy(vma), new_pol))
782 				continue;
783 			/* vma_merge() joined vma && vma->next, case 8 */
784 			goto replace;
785 		}
786 		if (vma->vm_start != vmstart) {
787 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
788 			if (err)
789 				goto out;
790 		}
791 		if (vma->vm_end != vmend) {
792 			err = split_vma(vma->vm_mm, vma, vmend, 0);
793 			if (err)
794 				goto out;
795 		}
796  replace:
797 		err = vma_replace_policy(vma, new_pol);
798 		if (err)
799 			goto out;
800 	}
801 
802  out:
803 	return err;
804 }
805 
806 /* Set the process memory policy */
807 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
808 			     nodemask_t *nodes)
809 {
810 	struct mempolicy *new, *old;
811 	NODEMASK_SCRATCH(scratch);
812 	int ret;
813 
814 	if (!scratch)
815 		return -ENOMEM;
816 
817 	new = mpol_new(mode, flags, nodes);
818 	if (IS_ERR(new)) {
819 		ret = PTR_ERR(new);
820 		goto out;
821 	}
822 
823 	task_lock(current);
824 	ret = mpol_set_nodemask(new, nodes, scratch);
825 	if (ret) {
826 		task_unlock(current);
827 		mpol_put(new);
828 		goto out;
829 	}
830 	old = current->mempolicy;
831 	current->mempolicy = new;
832 	if (new && new->mode == MPOL_INTERLEAVE)
833 		current->il_prev = MAX_NUMNODES-1;
834 	task_unlock(current);
835 	mpol_put(old);
836 	ret = 0;
837 out:
838 	NODEMASK_SCRATCH_FREE(scratch);
839 	return ret;
840 }
841 
842 /*
843  * Return nodemask for policy for get_mempolicy() query
844  *
845  * Called with task's alloc_lock held
846  */
847 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
848 {
849 	nodes_clear(*nodes);
850 	if (p == &default_policy)
851 		return;
852 
853 	switch (p->mode) {
854 	case MPOL_BIND:
855 		/* Fall through */
856 	case MPOL_INTERLEAVE:
857 		*nodes = p->v.nodes;
858 		break;
859 	case MPOL_PREFERRED:
860 		if (!(p->flags & MPOL_F_LOCAL))
861 			node_set(p->v.preferred_node, *nodes);
862 		/* else return empty node mask for local allocation */
863 		break;
864 	default:
865 		BUG();
866 	}
867 }
868 
869 static int lookup_node(struct mm_struct *mm, unsigned long addr)
870 {
871 	struct page *p;
872 	int err;
873 
874 	int locked = 1;
875 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
876 	if (err >= 0) {
877 		err = page_to_nid(p);
878 		put_page(p);
879 	}
880 	if (locked)
881 		up_read(&mm->mmap_sem);
882 	return err;
883 }
884 
885 /* Retrieve NUMA policy */
886 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
887 			     unsigned long addr, unsigned long flags)
888 {
889 	int err;
890 	struct mm_struct *mm = current->mm;
891 	struct vm_area_struct *vma = NULL;
892 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
893 
894 	if (flags &
895 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
896 		return -EINVAL;
897 
898 	if (flags & MPOL_F_MEMS_ALLOWED) {
899 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
900 			return -EINVAL;
901 		*policy = 0;	/* just so it's initialized */
902 		task_lock(current);
903 		*nmask  = cpuset_current_mems_allowed;
904 		task_unlock(current);
905 		return 0;
906 	}
907 
908 	if (flags & MPOL_F_ADDR) {
909 		/*
910 		 * Do NOT fall back to task policy if the
911 		 * vma/shared policy at addr is NULL.  We
912 		 * want to return MPOL_DEFAULT in this case.
913 		 */
914 		down_read(&mm->mmap_sem);
915 		vma = find_vma_intersection(mm, addr, addr+1);
916 		if (!vma) {
917 			up_read(&mm->mmap_sem);
918 			return -EFAULT;
919 		}
920 		if (vma->vm_ops && vma->vm_ops->get_policy)
921 			pol = vma->vm_ops->get_policy(vma, addr);
922 		else
923 			pol = vma->vm_policy;
924 	} else if (addr)
925 		return -EINVAL;
926 
927 	if (!pol)
928 		pol = &default_policy;	/* indicates default behavior */
929 
930 	if (flags & MPOL_F_NODE) {
931 		if (flags & MPOL_F_ADDR) {
932 			/*
933 			 * Take a refcount on the mpol, lookup_node()
934 			 * wil drop the mmap_sem, so after calling
935 			 * lookup_node() only "pol" remains valid, "vma"
936 			 * is stale.
937 			 */
938 			pol_refcount = pol;
939 			vma = NULL;
940 			mpol_get(pol);
941 			err = lookup_node(mm, addr);
942 			if (err < 0)
943 				goto out;
944 			*policy = err;
945 		} else if (pol == current->mempolicy &&
946 				pol->mode == MPOL_INTERLEAVE) {
947 			*policy = next_node_in(current->il_prev, pol->v.nodes);
948 		} else {
949 			err = -EINVAL;
950 			goto out;
951 		}
952 	} else {
953 		*policy = pol == &default_policy ? MPOL_DEFAULT :
954 						pol->mode;
955 		/*
956 		 * Internal mempolicy flags must be masked off before exposing
957 		 * the policy to userspace.
958 		 */
959 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
960 	}
961 
962 	err = 0;
963 	if (nmask) {
964 		if (mpol_store_user_nodemask(pol)) {
965 			*nmask = pol->w.user_nodemask;
966 		} else {
967 			task_lock(current);
968 			get_policy_nodemask(pol, nmask);
969 			task_unlock(current);
970 		}
971 	}
972 
973  out:
974 	mpol_cond_put(pol);
975 	if (vma)
976 		up_read(&mm->mmap_sem);
977 	if (pol_refcount)
978 		mpol_put(pol_refcount);
979 	return err;
980 }
981 
982 #ifdef CONFIG_MIGRATION
983 /*
984  * page migration, thp tail pages can be passed.
985  */
986 static int migrate_page_add(struct page *page, struct list_head *pagelist,
987 				unsigned long flags)
988 {
989 	struct page *head = compound_head(page);
990 	/*
991 	 * Avoid migrating a page that is shared with others.
992 	 */
993 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
994 		if (!isolate_lru_page(head)) {
995 			list_add_tail(&head->lru, pagelist);
996 			mod_node_page_state(page_pgdat(head),
997 				NR_ISOLATED_ANON + page_is_file_cache(head),
998 				hpage_nr_pages(head));
999 		} else if (flags & MPOL_MF_STRICT) {
1000 			/*
1001 			 * Non-movable page may reach here.  And, there may be
1002 			 * temporary off LRU pages or non-LRU movable pages.
1003 			 * Treat them as unmovable pages since they can't be
1004 			 * isolated, so they can't be moved at the moment.  It
1005 			 * should return -EIO for this case too.
1006 			 */
1007 			return -EIO;
1008 		}
1009 	}
1010 
1011 	return 0;
1012 }
1013 
1014 /* page allocation callback for NUMA node migration */
1015 struct page *alloc_new_node_page(struct page *page, unsigned long node)
1016 {
1017 	if (PageHuge(page))
1018 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1019 					node);
1020 	else if (PageTransHuge(page)) {
1021 		struct page *thp;
1022 
1023 		thp = alloc_pages_node(node,
1024 			(GFP_TRANSHUGE | __GFP_THISNODE),
1025 			HPAGE_PMD_ORDER);
1026 		if (!thp)
1027 			return NULL;
1028 		prep_transhuge_page(thp);
1029 		return thp;
1030 	} else
1031 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1032 						    __GFP_THISNODE, 0);
1033 }
1034 
1035 /*
1036  * Migrate pages from one node to a target node.
1037  * Returns error or the number of pages not migrated.
1038  */
1039 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1040 			   int flags)
1041 {
1042 	nodemask_t nmask;
1043 	LIST_HEAD(pagelist);
1044 	int err = 0;
1045 
1046 	nodes_clear(nmask);
1047 	node_set(source, nmask);
1048 
1049 	/*
1050 	 * This does not "check" the range but isolates all pages that
1051 	 * need migration.  Between passing in the full user address
1052 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1053 	 */
1054 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1055 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1056 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1057 
1058 	if (!list_empty(&pagelist)) {
1059 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1060 					MIGRATE_SYNC, MR_SYSCALL);
1061 		if (err)
1062 			putback_movable_pages(&pagelist);
1063 	}
1064 
1065 	return err;
1066 }
1067 
1068 /*
1069  * Move pages between the two nodesets so as to preserve the physical
1070  * layout as much as possible.
1071  *
1072  * Returns the number of page that could not be moved.
1073  */
1074 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1075 		     const nodemask_t *to, int flags)
1076 {
1077 	int busy = 0;
1078 	int err;
1079 	nodemask_t tmp;
1080 
1081 	err = migrate_prep();
1082 	if (err)
1083 		return err;
1084 
1085 	down_read(&mm->mmap_sem);
1086 
1087 	/*
1088 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1089 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1090 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1091 	 * The pair of nodemasks 'to' and 'from' define the map.
1092 	 *
1093 	 * If no pair of bits is found that way, fallback to picking some
1094 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1095 	 * 'source' and 'dest' bits are the same, this represents a node
1096 	 * that will be migrating to itself, so no pages need move.
1097 	 *
1098 	 * If no bits are left in 'tmp', or if all remaining bits left
1099 	 * in 'tmp' correspond to the same bit in 'to', return false
1100 	 * (nothing left to migrate).
1101 	 *
1102 	 * This lets us pick a pair of nodes to migrate between, such that
1103 	 * if possible the dest node is not already occupied by some other
1104 	 * source node, minimizing the risk of overloading the memory on a
1105 	 * node that would happen if we migrated incoming memory to a node
1106 	 * before migrating outgoing memory source that same node.
1107 	 *
1108 	 * A single scan of tmp is sufficient.  As we go, we remember the
1109 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1110 	 * that not only moved, but what's better, moved to an empty slot
1111 	 * (d is not set in tmp), then we break out then, with that pair.
1112 	 * Otherwise when we finish scanning from_tmp, we at least have the
1113 	 * most recent <s, d> pair that moved.  If we get all the way through
1114 	 * the scan of tmp without finding any node that moved, much less
1115 	 * moved to an empty node, then there is nothing left worth migrating.
1116 	 */
1117 
1118 	tmp = *from;
1119 	while (!nodes_empty(tmp)) {
1120 		int s,d;
1121 		int source = NUMA_NO_NODE;
1122 		int dest = 0;
1123 
1124 		for_each_node_mask(s, tmp) {
1125 
1126 			/*
1127 			 * do_migrate_pages() tries to maintain the relative
1128 			 * node relationship of the pages established between
1129 			 * threads and memory areas.
1130                          *
1131 			 * However if the number of source nodes is not equal to
1132 			 * the number of destination nodes we can not preserve
1133 			 * this node relative relationship.  In that case, skip
1134 			 * copying memory from a node that is in the destination
1135 			 * mask.
1136 			 *
1137 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1138 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1139 			 */
1140 
1141 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1142 						(node_isset(s, *to)))
1143 				continue;
1144 
1145 			d = node_remap(s, *from, *to);
1146 			if (s == d)
1147 				continue;
1148 
1149 			source = s;	/* Node moved. Memorize */
1150 			dest = d;
1151 
1152 			/* dest not in remaining from nodes? */
1153 			if (!node_isset(dest, tmp))
1154 				break;
1155 		}
1156 		if (source == NUMA_NO_NODE)
1157 			break;
1158 
1159 		node_clear(source, tmp);
1160 		err = migrate_to_node(mm, source, dest, flags);
1161 		if (err > 0)
1162 			busy += err;
1163 		if (err < 0)
1164 			break;
1165 	}
1166 	up_read(&mm->mmap_sem);
1167 	if (err < 0)
1168 		return err;
1169 	return busy;
1170 
1171 }
1172 
1173 /*
1174  * Allocate a new page for page migration based on vma policy.
1175  * Start by assuming the page is mapped by the same vma as contains @start.
1176  * Search forward from there, if not.  N.B., this assumes that the
1177  * list of pages handed to migrate_pages()--which is how we get here--
1178  * is in virtual address order.
1179  */
1180 static struct page *new_page(struct page *page, unsigned long start)
1181 {
1182 	struct vm_area_struct *vma;
1183 	unsigned long uninitialized_var(address);
1184 
1185 	vma = find_vma(current->mm, start);
1186 	while (vma) {
1187 		address = page_address_in_vma(page, vma);
1188 		if (address != -EFAULT)
1189 			break;
1190 		vma = vma->vm_next;
1191 	}
1192 
1193 	if (PageHuge(page)) {
1194 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1195 				vma, address);
1196 	} else if (PageTransHuge(page)) {
1197 		struct page *thp;
1198 
1199 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1200 					 HPAGE_PMD_ORDER);
1201 		if (!thp)
1202 			return NULL;
1203 		prep_transhuge_page(thp);
1204 		return thp;
1205 	}
1206 	/*
1207 	 * if !vma, alloc_page_vma() will use task or system default policy
1208 	 */
1209 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1210 			vma, address);
1211 }
1212 #else
1213 
1214 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1215 				unsigned long flags)
1216 {
1217 	return -EIO;
1218 }
1219 
1220 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1221 		     const nodemask_t *to, int flags)
1222 {
1223 	return -ENOSYS;
1224 }
1225 
1226 static struct page *new_page(struct page *page, unsigned long start)
1227 {
1228 	return NULL;
1229 }
1230 #endif
1231 
1232 static long do_mbind(unsigned long start, unsigned long len,
1233 		     unsigned short mode, unsigned short mode_flags,
1234 		     nodemask_t *nmask, unsigned long flags)
1235 {
1236 	struct mm_struct *mm = current->mm;
1237 	struct mempolicy *new;
1238 	unsigned long end;
1239 	int err;
1240 	int ret;
1241 	LIST_HEAD(pagelist);
1242 
1243 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1244 		return -EINVAL;
1245 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1246 		return -EPERM;
1247 
1248 	if (start & ~PAGE_MASK)
1249 		return -EINVAL;
1250 
1251 	if (mode == MPOL_DEFAULT)
1252 		flags &= ~MPOL_MF_STRICT;
1253 
1254 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1255 	end = start + len;
1256 
1257 	if (end < start)
1258 		return -EINVAL;
1259 	if (end == start)
1260 		return 0;
1261 
1262 	new = mpol_new(mode, mode_flags, nmask);
1263 	if (IS_ERR(new))
1264 		return PTR_ERR(new);
1265 
1266 	if (flags & MPOL_MF_LAZY)
1267 		new->flags |= MPOL_F_MOF;
1268 
1269 	/*
1270 	 * If we are using the default policy then operation
1271 	 * on discontinuous address spaces is okay after all
1272 	 */
1273 	if (!new)
1274 		flags |= MPOL_MF_DISCONTIG_OK;
1275 
1276 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1277 		 start, start + len, mode, mode_flags,
1278 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1279 
1280 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1281 
1282 		err = migrate_prep();
1283 		if (err)
1284 			goto mpol_out;
1285 	}
1286 	{
1287 		NODEMASK_SCRATCH(scratch);
1288 		if (scratch) {
1289 			down_write(&mm->mmap_sem);
1290 			task_lock(current);
1291 			err = mpol_set_nodemask(new, nmask, scratch);
1292 			task_unlock(current);
1293 			if (err)
1294 				up_write(&mm->mmap_sem);
1295 		} else
1296 			err = -ENOMEM;
1297 		NODEMASK_SCRATCH_FREE(scratch);
1298 	}
1299 	if (err)
1300 		goto mpol_out;
1301 
1302 	ret = queue_pages_range(mm, start, end, nmask,
1303 			  flags | MPOL_MF_INVERT, &pagelist);
1304 
1305 	if (ret < 0) {
1306 		err = ret;
1307 		goto up_out;
1308 	}
1309 
1310 	err = mbind_range(mm, start, end, new);
1311 
1312 	if (!err) {
1313 		int nr_failed = 0;
1314 
1315 		if (!list_empty(&pagelist)) {
1316 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1317 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1318 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1319 			if (nr_failed)
1320 				putback_movable_pages(&pagelist);
1321 		}
1322 
1323 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1324 			err = -EIO;
1325 	} else {
1326 up_out:
1327 		if (!list_empty(&pagelist))
1328 			putback_movable_pages(&pagelist);
1329 	}
1330 
1331 	up_write(&mm->mmap_sem);
1332 mpol_out:
1333 	mpol_put(new);
1334 	return err;
1335 }
1336 
1337 /*
1338  * User space interface with variable sized bitmaps for nodelists.
1339  */
1340 
1341 /* Copy a node mask from user space. */
1342 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1343 		     unsigned long maxnode)
1344 {
1345 	unsigned long k;
1346 	unsigned long t;
1347 	unsigned long nlongs;
1348 	unsigned long endmask;
1349 
1350 	--maxnode;
1351 	nodes_clear(*nodes);
1352 	if (maxnode == 0 || !nmask)
1353 		return 0;
1354 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1355 		return -EINVAL;
1356 
1357 	nlongs = BITS_TO_LONGS(maxnode);
1358 	if ((maxnode % BITS_PER_LONG) == 0)
1359 		endmask = ~0UL;
1360 	else
1361 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1362 
1363 	/*
1364 	 * When the user specified more nodes than supported just check
1365 	 * if the non supported part is all zero.
1366 	 *
1367 	 * If maxnode have more longs than MAX_NUMNODES, check
1368 	 * the bits in that area first. And then go through to
1369 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1370 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1371 	 */
1372 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1373 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1374 			if (get_user(t, nmask + k))
1375 				return -EFAULT;
1376 			if (k == nlongs - 1) {
1377 				if (t & endmask)
1378 					return -EINVAL;
1379 			} else if (t)
1380 				return -EINVAL;
1381 		}
1382 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1383 		endmask = ~0UL;
1384 	}
1385 
1386 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1387 		unsigned long valid_mask = endmask;
1388 
1389 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1390 		if (get_user(t, nmask + nlongs - 1))
1391 			return -EFAULT;
1392 		if (t & valid_mask)
1393 			return -EINVAL;
1394 	}
1395 
1396 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1397 		return -EFAULT;
1398 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1399 	return 0;
1400 }
1401 
1402 /* Copy a kernel node mask to user space */
1403 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1404 			      nodemask_t *nodes)
1405 {
1406 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1407 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1408 
1409 	if (copy > nbytes) {
1410 		if (copy > PAGE_SIZE)
1411 			return -EINVAL;
1412 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1413 			return -EFAULT;
1414 		copy = nbytes;
1415 	}
1416 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1417 }
1418 
1419 static long kernel_mbind(unsigned long start, unsigned long len,
1420 			 unsigned long mode, const unsigned long __user *nmask,
1421 			 unsigned long maxnode, unsigned int flags)
1422 {
1423 	nodemask_t nodes;
1424 	int err;
1425 	unsigned short mode_flags;
1426 
1427 	start = untagged_addr(start);
1428 	mode_flags = mode & MPOL_MODE_FLAGS;
1429 	mode &= ~MPOL_MODE_FLAGS;
1430 	if (mode >= MPOL_MAX)
1431 		return -EINVAL;
1432 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1433 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1434 		return -EINVAL;
1435 	err = get_nodes(&nodes, nmask, maxnode);
1436 	if (err)
1437 		return err;
1438 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1439 }
1440 
1441 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1442 		unsigned long, mode, const unsigned long __user *, nmask,
1443 		unsigned long, maxnode, unsigned int, flags)
1444 {
1445 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1446 }
1447 
1448 /* Set the process memory policy */
1449 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1450 				 unsigned long maxnode)
1451 {
1452 	int err;
1453 	nodemask_t nodes;
1454 	unsigned short flags;
1455 
1456 	flags = mode & MPOL_MODE_FLAGS;
1457 	mode &= ~MPOL_MODE_FLAGS;
1458 	if ((unsigned int)mode >= MPOL_MAX)
1459 		return -EINVAL;
1460 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1461 		return -EINVAL;
1462 	err = get_nodes(&nodes, nmask, maxnode);
1463 	if (err)
1464 		return err;
1465 	return do_set_mempolicy(mode, flags, &nodes);
1466 }
1467 
1468 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1469 		unsigned long, maxnode)
1470 {
1471 	return kernel_set_mempolicy(mode, nmask, maxnode);
1472 }
1473 
1474 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1475 				const unsigned long __user *old_nodes,
1476 				const unsigned long __user *new_nodes)
1477 {
1478 	struct mm_struct *mm = NULL;
1479 	struct task_struct *task;
1480 	nodemask_t task_nodes;
1481 	int err;
1482 	nodemask_t *old;
1483 	nodemask_t *new;
1484 	NODEMASK_SCRATCH(scratch);
1485 
1486 	if (!scratch)
1487 		return -ENOMEM;
1488 
1489 	old = &scratch->mask1;
1490 	new = &scratch->mask2;
1491 
1492 	err = get_nodes(old, old_nodes, maxnode);
1493 	if (err)
1494 		goto out;
1495 
1496 	err = get_nodes(new, new_nodes, maxnode);
1497 	if (err)
1498 		goto out;
1499 
1500 	/* Find the mm_struct */
1501 	rcu_read_lock();
1502 	task = pid ? find_task_by_vpid(pid) : current;
1503 	if (!task) {
1504 		rcu_read_unlock();
1505 		err = -ESRCH;
1506 		goto out;
1507 	}
1508 	get_task_struct(task);
1509 
1510 	err = -EINVAL;
1511 
1512 	/*
1513 	 * Check if this process has the right to modify the specified process.
1514 	 * Use the regular "ptrace_may_access()" checks.
1515 	 */
1516 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1517 		rcu_read_unlock();
1518 		err = -EPERM;
1519 		goto out_put;
1520 	}
1521 	rcu_read_unlock();
1522 
1523 	task_nodes = cpuset_mems_allowed(task);
1524 	/* Is the user allowed to access the target nodes? */
1525 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1526 		err = -EPERM;
1527 		goto out_put;
1528 	}
1529 
1530 	task_nodes = cpuset_mems_allowed(current);
1531 	nodes_and(*new, *new, task_nodes);
1532 	if (nodes_empty(*new))
1533 		goto out_put;
1534 
1535 	err = security_task_movememory(task);
1536 	if (err)
1537 		goto out_put;
1538 
1539 	mm = get_task_mm(task);
1540 	put_task_struct(task);
1541 
1542 	if (!mm) {
1543 		err = -EINVAL;
1544 		goto out;
1545 	}
1546 
1547 	err = do_migrate_pages(mm, old, new,
1548 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1549 
1550 	mmput(mm);
1551 out:
1552 	NODEMASK_SCRATCH_FREE(scratch);
1553 
1554 	return err;
1555 
1556 out_put:
1557 	put_task_struct(task);
1558 	goto out;
1559 
1560 }
1561 
1562 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1563 		const unsigned long __user *, old_nodes,
1564 		const unsigned long __user *, new_nodes)
1565 {
1566 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1567 }
1568 
1569 
1570 /* Retrieve NUMA policy */
1571 static int kernel_get_mempolicy(int __user *policy,
1572 				unsigned long __user *nmask,
1573 				unsigned long maxnode,
1574 				unsigned long addr,
1575 				unsigned long flags)
1576 {
1577 	int err;
1578 	int uninitialized_var(pval);
1579 	nodemask_t nodes;
1580 
1581 	addr = untagged_addr(addr);
1582 
1583 	if (nmask != NULL && maxnode < nr_node_ids)
1584 		return -EINVAL;
1585 
1586 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1587 
1588 	if (err)
1589 		return err;
1590 
1591 	if (policy && put_user(pval, policy))
1592 		return -EFAULT;
1593 
1594 	if (nmask)
1595 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1596 
1597 	return err;
1598 }
1599 
1600 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1601 		unsigned long __user *, nmask, unsigned long, maxnode,
1602 		unsigned long, addr, unsigned long, flags)
1603 {
1604 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1605 }
1606 
1607 #ifdef CONFIG_COMPAT
1608 
1609 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1610 		       compat_ulong_t __user *, nmask,
1611 		       compat_ulong_t, maxnode,
1612 		       compat_ulong_t, addr, compat_ulong_t, flags)
1613 {
1614 	long err;
1615 	unsigned long __user *nm = NULL;
1616 	unsigned long nr_bits, alloc_size;
1617 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1618 
1619 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1620 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1621 
1622 	if (nmask)
1623 		nm = compat_alloc_user_space(alloc_size);
1624 
1625 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1626 
1627 	if (!err && nmask) {
1628 		unsigned long copy_size;
1629 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1630 		err = copy_from_user(bm, nm, copy_size);
1631 		/* ensure entire bitmap is zeroed */
1632 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1633 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1634 	}
1635 
1636 	return err;
1637 }
1638 
1639 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1640 		       compat_ulong_t, maxnode)
1641 {
1642 	unsigned long __user *nm = NULL;
1643 	unsigned long nr_bits, alloc_size;
1644 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1645 
1646 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1647 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1648 
1649 	if (nmask) {
1650 		if (compat_get_bitmap(bm, nmask, nr_bits))
1651 			return -EFAULT;
1652 		nm = compat_alloc_user_space(alloc_size);
1653 		if (copy_to_user(nm, bm, alloc_size))
1654 			return -EFAULT;
1655 	}
1656 
1657 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1658 }
1659 
1660 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1661 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1662 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1663 {
1664 	unsigned long __user *nm = NULL;
1665 	unsigned long nr_bits, alloc_size;
1666 	nodemask_t bm;
1667 
1668 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1669 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1670 
1671 	if (nmask) {
1672 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1673 			return -EFAULT;
1674 		nm = compat_alloc_user_space(alloc_size);
1675 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1676 			return -EFAULT;
1677 	}
1678 
1679 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1680 }
1681 
1682 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1683 		       compat_ulong_t, maxnode,
1684 		       const compat_ulong_t __user *, old_nodes,
1685 		       const compat_ulong_t __user *, new_nodes)
1686 {
1687 	unsigned long __user *old = NULL;
1688 	unsigned long __user *new = NULL;
1689 	nodemask_t tmp_mask;
1690 	unsigned long nr_bits;
1691 	unsigned long size;
1692 
1693 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1694 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1695 	if (old_nodes) {
1696 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1697 			return -EFAULT;
1698 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1699 		if (new_nodes)
1700 			new = old + size / sizeof(unsigned long);
1701 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1702 			return -EFAULT;
1703 	}
1704 	if (new_nodes) {
1705 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1706 			return -EFAULT;
1707 		if (new == NULL)
1708 			new = compat_alloc_user_space(size);
1709 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1710 			return -EFAULT;
1711 	}
1712 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1713 }
1714 
1715 #endif /* CONFIG_COMPAT */
1716 
1717 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1718 						unsigned long addr)
1719 {
1720 	struct mempolicy *pol = NULL;
1721 
1722 	if (vma) {
1723 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1724 			pol = vma->vm_ops->get_policy(vma, addr);
1725 		} else if (vma->vm_policy) {
1726 			pol = vma->vm_policy;
1727 
1728 			/*
1729 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1730 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1731 			 * count on these policies which will be dropped by
1732 			 * mpol_cond_put() later
1733 			 */
1734 			if (mpol_needs_cond_ref(pol))
1735 				mpol_get(pol);
1736 		}
1737 	}
1738 
1739 	return pol;
1740 }
1741 
1742 /*
1743  * get_vma_policy(@vma, @addr)
1744  * @vma: virtual memory area whose policy is sought
1745  * @addr: address in @vma for shared policy lookup
1746  *
1747  * Returns effective policy for a VMA at specified address.
1748  * Falls back to current->mempolicy or system default policy, as necessary.
1749  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1750  * count--added by the get_policy() vm_op, as appropriate--to protect against
1751  * freeing by another task.  It is the caller's responsibility to free the
1752  * extra reference for shared policies.
1753  */
1754 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1755 						unsigned long addr)
1756 {
1757 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1758 
1759 	if (!pol)
1760 		pol = get_task_policy(current);
1761 
1762 	return pol;
1763 }
1764 
1765 bool vma_policy_mof(struct vm_area_struct *vma)
1766 {
1767 	struct mempolicy *pol;
1768 
1769 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1770 		bool ret = false;
1771 
1772 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1773 		if (pol && (pol->flags & MPOL_F_MOF))
1774 			ret = true;
1775 		mpol_cond_put(pol);
1776 
1777 		return ret;
1778 	}
1779 
1780 	pol = vma->vm_policy;
1781 	if (!pol)
1782 		pol = get_task_policy(current);
1783 
1784 	return pol->flags & MPOL_F_MOF;
1785 }
1786 
1787 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1788 {
1789 	enum zone_type dynamic_policy_zone = policy_zone;
1790 
1791 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1792 
1793 	/*
1794 	 * if policy->v.nodes has movable memory only,
1795 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1796 	 *
1797 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1798 	 * so if the following test faile, it implies
1799 	 * policy->v.nodes has movable memory only.
1800 	 */
1801 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1802 		dynamic_policy_zone = ZONE_MOVABLE;
1803 
1804 	return zone >= dynamic_policy_zone;
1805 }
1806 
1807 /*
1808  * Return a nodemask representing a mempolicy for filtering nodes for
1809  * page allocation
1810  */
1811 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1812 {
1813 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1814 	if (unlikely(policy->mode == MPOL_BIND) &&
1815 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1816 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1817 		return &policy->v.nodes;
1818 
1819 	return NULL;
1820 }
1821 
1822 /* Return the node id preferred by the given mempolicy, or the given id */
1823 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1824 								int nd)
1825 {
1826 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1827 		nd = policy->v.preferred_node;
1828 	else {
1829 		/*
1830 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1831 		 * because we might easily break the expectation to stay on the
1832 		 * requested node and not break the policy.
1833 		 */
1834 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1835 	}
1836 
1837 	return nd;
1838 }
1839 
1840 /* Do dynamic interleaving for a process */
1841 static unsigned interleave_nodes(struct mempolicy *policy)
1842 {
1843 	unsigned next;
1844 	struct task_struct *me = current;
1845 
1846 	next = next_node_in(me->il_prev, policy->v.nodes);
1847 	if (next < MAX_NUMNODES)
1848 		me->il_prev = next;
1849 	return next;
1850 }
1851 
1852 /*
1853  * Depending on the memory policy provide a node from which to allocate the
1854  * next slab entry.
1855  */
1856 unsigned int mempolicy_slab_node(void)
1857 {
1858 	struct mempolicy *policy;
1859 	int node = numa_mem_id();
1860 
1861 	if (in_interrupt())
1862 		return node;
1863 
1864 	policy = current->mempolicy;
1865 	if (!policy || policy->flags & MPOL_F_LOCAL)
1866 		return node;
1867 
1868 	switch (policy->mode) {
1869 	case MPOL_PREFERRED:
1870 		/*
1871 		 * handled MPOL_F_LOCAL above
1872 		 */
1873 		return policy->v.preferred_node;
1874 
1875 	case MPOL_INTERLEAVE:
1876 		return interleave_nodes(policy);
1877 
1878 	case MPOL_BIND: {
1879 		struct zoneref *z;
1880 
1881 		/*
1882 		 * Follow bind policy behavior and start allocation at the
1883 		 * first node.
1884 		 */
1885 		struct zonelist *zonelist;
1886 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1887 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1888 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1889 							&policy->v.nodes);
1890 		return z->zone ? zone_to_nid(z->zone) : node;
1891 	}
1892 
1893 	default:
1894 		BUG();
1895 	}
1896 }
1897 
1898 /*
1899  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1900  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1901  * number of present nodes.
1902  */
1903 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1904 {
1905 	unsigned nnodes = nodes_weight(pol->v.nodes);
1906 	unsigned target;
1907 	int i;
1908 	int nid;
1909 
1910 	if (!nnodes)
1911 		return numa_node_id();
1912 	target = (unsigned int)n % nnodes;
1913 	nid = first_node(pol->v.nodes);
1914 	for (i = 0; i < target; i++)
1915 		nid = next_node(nid, pol->v.nodes);
1916 	return nid;
1917 }
1918 
1919 /* Determine a node number for interleave */
1920 static inline unsigned interleave_nid(struct mempolicy *pol,
1921 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1922 {
1923 	if (vma) {
1924 		unsigned long off;
1925 
1926 		/*
1927 		 * for small pages, there is no difference between
1928 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1929 		 * for huge pages, since vm_pgoff is in units of small
1930 		 * pages, we need to shift off the always 0 bits to get
1931 		 * a useful offset.
1932 		 */
1933 		BUG_ON(shift < PAGE_SHIFT);
1934 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1935 		off += (addr - vma->vm_start) >> shift;
1936 		return offset_il_node(pol, off);
1937 	} else
1938 		return interleave_nodes(pol);
1939 }
1940 
1941 #ifdef CONFIG_HUGETLBFS
1942 /*
1943  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1944  * @vma: virtual memory area whose policy is sought
1945  * @addr: address in @vma for shared policy lookup and interleave policy
1946  * @gfp_flags: for requested zone
1947  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1948  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1949  *
1950  * Returns a nid suitable for a huge page allocation and a pointer
1951  * to the struct mempolicy for conditional unref after allocation.
1952  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1953  * @nodemask for filtering the zonelist.
1954  *
1955  * Must be protected by read_mems_allowed_begin()
1956  */
1957 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1958 				struct mempolicy **mpol, nodemask_t **nodemask)
1959 {
1960 	int nid;
1961 
1962 	*mpol = get_vma_policy(vma, addr);
1963 	*nodemask = NULL;	/* assume !MPOL_BIND */
1964 
1965 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1966 		nid = interleave_nid(*mpol, vma, addr,
1967 					huge_page_shift(hstate_vma(vma)));
1968 	} else {
1969 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1970 		if ((*mpol)->mode == MPOL_BIND)
1971 			*nodemask = &(*mpol)->v.nodes;
1972 	}
1973 	return nid;
1974 }
1975 
1976 /*
1977  * init_nodemask_of_mempolicy
1978  *
1979  * If the current task's mempolicy is "default" [NULL], return 'false'
1980  * to indicate default policy.  Otherwise, extract the policy nodemask
1981  * for 'bind' or 'interleave' policy into the argument nodemask, or
1982  * initialize the argument nodemask to contain the single node for
1983  * 'preferred' or 'local' policy and return 'true' to indicate presence
1984  * of non-default mempolicy.
1985  *
1986  * We don't bother with reference counting the mempolicy [mpol_get/put]
1987  * because the current task is examining it's own mempolicy and a task's
1988  * mempolicy is only ever changed by the task itself.
1989  *
1990  * N.B., it is the caller's responsibility to free a returned nodemask.
1991  */
1992 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1993 {
1994 	struct mempolicy *mempolicy;
1995 	int nid;
1996 
1997 	if (!(mask && current->mempolicy))
1998 		return false;
1999 
2000 	task_lock(current);
2001 	mempolicy = current->mempolicy;
2002 	switch (mempolicy->mode) {
2003 	case MPOL_PREFERRED:
2004 		if (mempolicy->flags & MPOL_F_LOCAL)
2005 			nid = numa_node_id();
2006 		else
2007 			nid = mempolicy->v.preferred_node;
2008 		init_nodemask_of_node(mask, nid);
2009 		break;
2010 
2011 	case MPOL_BIND:
2012 		/* Fall through */
2013 	case MPOL_INTERLEAVE:
2014 		*mask =  mempolicy->v.nodes;
2015 		break;
2016 
2017 	default:
2018 		BUG();
2019 	}
2020 	task_unlock(current);
2021 
2022 	return true;
2023 }
2024 #endif
2025 
2026 /*
2027  * mempolicy_nodemask_intersects
2028  *
2029  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2030  * policy.  Otherwise, check for intersection between mask and the policy
2031  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
2032  * policy, always return true since it may allocate elsewhere on fallback.
2033  *
2034  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2035  */
2036 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2037 					const nodemask_t *mask)
2038 {
2039 	struct mempolicy *mempolicy;
2040 	bool ret = true;
2041 
2042 	if (!mask)
2043 		return ret;
2044 	task_lock(tsk);
2045 	mempolicy = tsk->mempolicy;
2046 	if (!mempolicy)
2047 		goto out;
2048 
2049 	switch (mempolicy->mode) {
2050 	case MPOL_PREFERRED:
2051 		/*
2052 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2053 		 * allocate from, they may fallback to other nodes when oom.
2054 		 * Thus, it's possible for tsk to have allocated memory from
2055 		 * nodes in mask.
2056 		 */
2057 		break;
2058 	case MPOL_BIND:
2059 	case MPOL_INTERLEAVE:
2060 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2061 		break;
2062 	default:
2063 		BUG();
2064 	}
2065 out:
2066 	task_unlock(tsk);
2067 	return ret;
2068 }
2069 
2070 /* Allocate a page in interleaved policy.
2071    Own path because it needs to do special accounting. */
2072 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2073 					unsigned nid)
2074 {
2075 	struct page *page;
2076 
2077 	page = __alloc_pages(gfp, order, nid);
2078 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2079 	if (!static_branch_likely(&vm_numa_stat_key))
2080 		return page;
2081 	if (page && page_to_nid(page) == nid) {
2082 		preempt_disable();
2083 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2084 		preempt_enable();
2085 	}
2086 	return page;
2087 }
2088 
2089 /**
2090  * 	alloc_pages_vma	- Allocate a page for a VMA.
2091  *
2092  * 	@gfp:
2093  *      %GFP_USER    user allocation.
2094  *      %GFP_KERNEL  kernel allocations,
2095  *      %GFP_HIGHMEM highmem/user allocations,
2096  *      %GFP_FS      allocation should not call back into a file system.
2097  *      %GFP_ATOMIC  don't sleep.
2098  *
2099  *	@order:Order of the GFP allocation.
2100  * 	@vma:  Pointer to VMA or NULL if not available.
2101  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2102  *	@node: Which node to prefer for allocation (modulo policy).
2103  *	@hugepage: for hugepages try only the preferred node if possible
2104  *
2105  * 	This function allocates a page from the kernel page pool and applies
2106  *	a NUMA policy associated with the VMA or the current process.
2107  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2108  *	mm_struct of the VMA to prevent it from going away. Should be used for
2109  *	all allocations for pages that will be mapped into user space. Returns
2110  *	NULL when no page can be allocated.
2111  */
2112 struct page *
2113 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2114 		unsigned long addr, int node, bool hugepage)
2115 {
2116 	struct mempolicy *pol;
2117 	struct page *page;
2118 	int preferred_nid;
2119 	nodemask_t *nmask;
2120 
2121 	pol = get_vma_policy(vma, addr);
2122 
2123 	if (pol->mode == MPOL_INTERLEAVE) {
2124 		unsigned nid;
2125 
2126 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2127 		mpol_cond_put(pol);
2128 		page = alloc_page_interleave(gfp, order, nid);
2129 		goto out;
2130 	}
2131 
2132 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2133 		int hpage_node = node;
2134 
2135 		/*
2136 		 * For hugepage allocation and non-interleave policy which
2137 		 * allows the current node (or other explicitly preferred
2138 		 * node) we only try to allocate from the current/preferred
2139 		 * node and don't fall back to other nodes, as the cost of
2140 		 * remote accesses would likely offset THP benefits.
2141 		 *
2142 		 * If the policy is interleave, or does not allow the current
2143 		 * node in its nodemask, we allocate the standard way.
2144 		 */
2145 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2146 			hpage_node = pol->v.preferred_node;
2147 
2148 		nmask = policy_nodemask(gfp, pol);
2149 		if (!nmask || node_isset(hpage_node, *nmask)) {
2150 			mpol_cond_put(pol);
2151 			/*
2152 			 * First, try to allocate THP only on local node, but
2153 			 * don't reclaim unnecessarily, just compact.
2154 			 */
2155 			page = __alloc_pages_node(hpage_node,
2156 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2157 
2158 			/*
2159 			 * If hugepage allocations are configured to always
2160 			 * synchronous compact or the vma has been madvised
2161 			 * to prefer hugepage backing, retry allowing remote
2162 			 * memory with both reclaim and compact as well.
2163 			 */
2164 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2165 				page = __alloc_pages_node(hpage_node,
2166 								gfp, order);
2167 
2168 			goto out;
2169 		}
2170 	}
2171 
2172 	nmask = policy_nodemask(gfp, pol);
2173 	preferred_nid = policy_node(gfp, pol, node);
2174 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2175 	mpol_cond_put(pol);
2176 out:
2177 	return page;
2178 }
2179 EXPORT_SYMBOL(alloc_pages_vma);
2180 
2181 /**
2182  * 	alloc_pages_current - Allocate pages.
2183  *
2184  *	@gfp:
2185  *		%GFP_USER   user allocation,
2186  *      	%GFP_KERNEL kernel allocation,
2187  *      	%GFP_HIGHMEM highmem allocation,
2188  *      	%GFP_FS     don't call back into a file system.
2189  *      	%GFP_ATOMIC don't sleep.
2190  *	@order: Power of two of allocation size in pages. 0 is a single page.
2191  *
2192  *	Allocate a page from the kernel page pool.  When not in
2193  *	interrupt context and apply the current process NUMA policy.
2194  *	Returns NULL when no page can be allocated.
2195  */
2196 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2197 {
2198 	struct mempolicy *pol = &default_policy;
2199 	struct page *page;
2200 
2201 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2202 		pol = get_task_policy(current);
2203 
2204 	/*
2205 	 * No reference counting needed for current->mempolicy
2206 	 * nor system default_policy
2207 	 */
2208 	if (pol->mode == MPOL_INTERLEAVE)
2209 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2210 	else
2211 		page = __alloc_pages_nodemask(gfp, order,
2212 				policy_node(gfp, pol, numa_node_id()),
2213 				policy_nodemask(gfp, pol));
2214 
2215 	return page;
2216 }
2217 EXPORT_SYMBOL(alloc_pages_current);
2218 
2219 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2220 {
2221 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2222 
2223 	if (IS_ERR(pol))
2224 		return PTR_ERR(pol);
2225 	dst->vm_policy = pol;
2226 	return 0;
2227 }
2228 
2229 /*
2230  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2231  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2232  * with the mems_allowed returned by cpuset_mems_allowed().  This
2233  * keeps mempolicies cpuset relative after its cpuset moves.  See
2234  * further kernel/cpuset.c update_nodemask().
2235  *
2236  * current's mempolicy may be rebinded by the other task(the task that changes
2237  * cpuset's mems), so we needn't do rebind work for current task.
2238  */
2239 
2240 /* Slow path of a mempolicy duplicate */
2241 struct mempolicy *__mpol_dup(struct mempolicy *old)
2242 {
2243 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2244 
2245 	if (!new)
2246 		return ERR_PTR(-ENOMEM);
2247 
2248 	/* task's mempolicy is protected by alloc_lock */
2249 	if (old == current->mempolicy) {
2250 		task_lock(current);
2251 		*new = *old;
2252 		task_unlock(current);
2253 	} else
2254 		*new = *old;
2255 
2256 	if (current_cpuset_is_being_rebound()) {
2257 		nodemask_t mems = cpuset_mems_allowed(current);
2258 		mpol_rebind_policy(new, &mems);
2259 	}
2260 	atomic_set(&new->refcnt, 1);
2261 	return new;
2262 }
2263 
2264 /* Slow path of a mempolicy comparison */
2265 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2266 {
2267 	if (!a || !b)
2268 		return false;
2269 	if (a->mode != b->mode)
2270 		return false;
2271 	if (a->flags != b->flags)
2272 		return false;
2273 	if (mpol_store_user_nodemask(a))
2274 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2275 			return false;
2276 
2277 	switch (a->mode) {
2278 	case MPOL_BIND:
2279 		/* Fall through */
2280 	case MPOL_INTERLEAVE:
2281 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2282 	case MPOL_PREFERRED:
2283 		/* a's ->flags is the same as b's */
2284 		if (a->flags & MPOL_F_LOCAL)
2285 			return true;
2286 		return a->v.preferred_node == b->v.preferred_node;
2287 	default:
2288 		BUG();
2289 		return false;
2290 	}
2291 }
2292 
2293 /*
2294  * Shared memory backing store policy support.
2295  *
2296  * Remember policies even when nobody has shared memory mapped.
2297  * The policies are kept in Red-Black tree linked from the inode.
2298  * They are protected by the sp->lock rwlock, which should be held
2299  * for any accesses to the tree.
2300  */
2301 
2302 /*
2303  * lookup first element intersecting start-end.  Caller holds sp->lock for
2304  * reading or for writing
2305  */
2306 static struct sp_node *
2307 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2308 {
2309 	struct rb_node *n = sp->root.rb_node;
2310 
2311 	while (n) {
2312 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2313 
2314 		if (start >= p->end)
2315 			n = n->rb_right;
2316 		else if (end <= p->start)
2317 			n = n->rb_left;
2318 		else
2319 			break;
2320 	}
2321 	if (!n)
2322 		return NULL;
2323 	for (;;) {
2324 		struct sp_node *w = NULL;
2325 		struct rb_node *prev = rb_prev(n);
2326 		if (!prev)
2327 			break;
2328 		w = rb_entry(prev, struct sp_node, nd);
2329 		if (w->end <= start)
2330 			break;
2331 		n = prev;
2332 	}
2333 	return rb_entry(n, struct sp_node, nd);
2334 }
2335 
2336 /*
2337  * Insert a new shared policy into the list.  Caller holds sp->lock for
2338  * writing.
2339  */
2340 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2341 {
2342 	struct rb_node **p = &sp->root.rb_node;
2343 	struct rb_node *parent = NULL;
2344 	struct sp_node *nd;
2345 
2346 	while (*p) {
2347 		parent = *p;
2348 		nd = rb_entry(parent, struct sp_node, nd);
2349 		if (new->start < nd->start)
2350 			p = &(*p)->rb_left;
2351 		else if (new->end > nd->end)
2352 			p = &(*p)->rb_right;
2353 		else
2354 			BUG();
2355 	}
2356 	rb_link_node(&new->nd, parent, p);
2357 	rb_insert_color(&new->nd, &sp->root);
2358 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2359 		 new->policy ? new->policy->mode : 0);
2360 }
2361 
2362 /* Find shared policy intersecting idx */
2363 struct mempolicy *
2364 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2365 {
2366 	struct mempolicy *pol = NULL;
2367 	struct sp_node *sn;
2368 
2369 	if (!sp->root.rb_node)
2370 		return NULL;
2371 	read_lock(&sp->lock);
2372 	sn = sp_lookup(sp, idx, idx+1);
2373 	if (sn) {
2374 		mpol_get(sn->policy);
2375 		pol = sn->policy;
2376 	}
2377 	read_unlock(&sp->lock);
2378 	return pol;
2379 }
2380 
2381 static void sp_free(struct sp_node *n)
2382 {
2383 	mpol_put(n->policy);
2384 	kmem_cache_free(sn_cache, n);
2385 }
2386 
2387 /**
2388  * mpol_misplaced - check whether current page node is valid in policy
2389  *
2390  * @page: page to be checked
2391  * @vma: vm area where page mapped
2392  * @addr: virtual address where page mapped
2393  *
2394  * Lookup current policy node id for vma,addr and "compare to" page's
2395  * node id.
2396  *
2397  * Returns:
2398  *	-1	- not misplaced, page is in the right node
2399  *	node	- node id where the page should be
2400  *
2401  * Policy determination "mimics" alloc_page_vma().
2402  * Called from fault path where we know the vma and faulting address.
2403  */
2404 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2405 {
2406 	struct mempolicy *pol;
2407 	struct zoneref *z;
2408 	int curnid = page_to_nid(page);
2409 	unsigned long pgoff;
2410 	int thiscpu = raw_smp_processor_id();
2411 	int thisnid = cpu_to_node(thiscpu);
2412 	int polnid = NUMA_NO_NODE;
2413 	int ret = -1;
2414 
2415 	pol = get_vma_policy(vma, addr);
2416 	if (!(pol->flags & MPOL_F_MOF))
2417 		goto out;
2418 
2419 	switch (pol->mode) {
2420 	case MPOL_INTERLEAVE:
2421 		pgoff = vma->vm_pgoff;
2422 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2423 		polnid = offset_il_node(pol, pgoff);
2424 		break;
2425 
2426 	case MPOL_PREFERRED:
2427 		if (pol->flags & MPOL_F_LOCAL)
2428 			polnid = numa_node_id();
2429 		else
2430 			polnid = pol->v.preferred_node;
2431 		break;
2432 
2433 	case MPOL_BIND:
2434 
2435 		/*
2436 		 * allows binding to multiple nodes.
2437 		 * use current page if in policy nodemask,
2438 		 * else select nearest allowed node, if any.
2439 		 * If no allowed nodes, use current [!misplaced].
2440 		 */
2441 		if (node_isset(curnid, pol->v.nodes))
2442 			goto out;
2443 		z = first_zones_zonelist(
2444 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2445 				gfp_zone(GFP_HIGHUSER),
2446 				&pol->v.nodes);
2447 		polnid = zone_to_nid(z->zone);
2448 		break;
2449 
2450 	default:
2451 		BUG();
2452 	}
2453 
2454 	/* Migrate the page towards the node whose CPU is referencing it */
2455 	if (pol->flags & MPOL_F_MORON) {
2456 		polnid = thisnid;
2457 
2458 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2459 			goto out;
2460 	}
2461 
2462 	if (curnid != polnid)
2463 		ret = polnid;
2464 out:
2465 	mpol_cond_put(pol);
2466 
2467 	return ret;
2468 }
2469 
2470 /*
2471  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2472  * dropped after task->mempolicy is set to NULL so that any allocation done as
2473  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2474  * policy.
2475  */
2476 void mpol_put_task_policy(struct task_struct *task)
2477 {
2478 	struct mempolicy *pol;
2479 
2480 	task_lock(task);
2481 	pol = task->mempolicy;
2482 	task->mempolicy = NULL;
2483 	task_unlock(task);
2484 	mpol_put(pol);
2485 }
2486 
2487 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2488 {
2489 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2490 	rb_erase(&n->nd, &sp->root);
2491 	sp_free(n);
2492 }
2493 
2494 static void sp_node_init(struct sp_node *node, unsigned long start,
2495 			unsigned long end, struct mempolicy *pol)
2496 {
2497 	node->start = start;
2498 	node->end = end;
2499 	node->policy = pol;
2500 }
2501 
2502 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2503 				struct mempolicy *pol)
2504 {
2505 	struct sp_node *n;
2506 	struct mempolicy *newpol;
2507 
2508 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2509 	if (!n)
2510 		return NULL;
2511 
2512 	newpol = mpol_dup(pol);
2513 	if (IS_ERR(newpol)) {
2514 		kmem_cache_free(sn_cache, n);
2515 		return NULL;
2516 	}
2517 	newpol->flags |= MPOL_F_SHARED;
2518 	sp_node_init(n, start, end, newpol);
2519 
2520 	return n;
2521 }
2522 
2523 /* Replace a policy range. */
2524 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2525 				 unsigned long end, struct sp_node *new)
2526 {
2527 	struct sp_node *n;
2528 	struct sp_node *n_new = NULL;
2529 	struct mempolicy *mpol_new = NULL;
2530 	int ret = 0;
2531 
2532 restart:
2533 	write_lock(&sp->lock);
2534 	n = sp_lookup(sp, start, end);
2535 	/* Take care of old policies in the same range. */
2536 	while (n && n->start < end) {
2537 		struct rb_node *next = rb_next(&n->nd);
2538 		if (n->start >= start) {
2539 			if (n->end <= end)
2540 				sp_delete(sp, n);
2541 			else
2542 				n->start = end;
2543 		} else {
2544 			/* Old policy spanning whole new range. */
2545 			if (n->end > end) {
2546 				if (!n_new)
2547 					goto alloc_new;
2548 
2549 				*mpol_new = *n->policy;
2550 				atomic_set(&mpol_new->refcnt, 1);
2551 				sp_node_init(n_new, end, n->end, mpol_new);
2552 				n->end = start;
2553 				sp_insert(sp, n_new);
2554 				n_new = NULL;
2555 				mpol_new = NULL;
2556 				break;
2557 			} else
2558 				n->end = start;
2559 		}
2560 		if (!next)
2561 			break;
2562 		n = rb_entry(next, struct sp_node, nd);
2563 	}
2564 	if (new)
2565 		sp_insert(sp, new);
2566 	write_unlock(&sp->lock);
2567 	ret = 0;
2568 
2569 err_out:
2570 	if (mpol_new)
2571 		mpol_put(mpol_new);
2572 	if (n_new)
2573 		kmem_cache_free(sn_cache, n_new);
2574 
2575 	return ret;
2576 
2577 alloc_new:
2578 	write_unlock(&sp->lock);
2579 	ret = -ENOMEM;
2580 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2581 	if (!n_new)
2582 		goto err_out;
2583 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2584 	if (!mpol_new)
2585 		goto err_out;
2586 	goto restart;
2587 }
2588 
2589 /**
2590  * mpol_shared_policy_init - initialize shared policy for inode
2591  * @sp: pointer to inode shared policy
2592  * @mpol:  struct mempolicy to install
2593  *
2594  * Install non-NULL @mpol in inode's shared policy rb-tree.
2595  * On entry, the current task has a reference on a non-NULL @mpol.
2596  * This must be released on exit.
2597  * This is called at get_inode() calls and we can use GFP_KERNEL.
2598  */
2599 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2600 {
2601 	int ret;
2602 
2603 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2604 	rwlock_init(&sp->lock);
2605 
2606 	if (mpol) {
2607 		struct vm_area_struct pvma;
2608 		struct mempolicy *new;
2609 		NODEMASK_SCRATCH(scratch);
2610 
2611 		if (!scratch)
2612 			goto put_mpol;
2613 		/* contextualize the tmpfs mount point mempolicy */
2614 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2615 		if (IS_ERR(new))
2616 			goto free_scratch; /* no valid nodemask intersection */
2617 
2618 		task_lock(current);
2619 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2620 		task_unlock(current);
2621 		if (ret)
2622 			goto put_new;
2623 
2624 		/* Create pseudo-vma that contains just the policy */
2625 		vma_init(&pvma, NULL);
2626 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2627 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2628 
2629 put_new:
2630 		mpol_put(new);			/* drop initial ref */
2631 free_scratch:
2632 		NODEMASK_SCRATCH_FREE(scratch);
2633 put_mpol:
2634 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2635 	}
2636 }
2637 
2638 int mpol_set_shared_policy(struct shared_policy *info,
2639 			struct vm_area_struct *vma, struct mempolicy *npol)
2640 {
2641 	int err;
2642 	struct sp_node *new = NULL;
2643 	unsigned long sz = vma_pages(vma);
2644 
2645 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2646 		 vma->vm_pgoff,
2647 		 sz, npol ? npol->mode : -1,
2648 		 npol ? npol->flags : -1,
2649 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2650 
2651 	if (npol) {
2652 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2653 		if (!new)
2654 			return -ENOMEM;
2655 	}
2656 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2657 	if (err && new)
2658 		sp_free(new);
2659 	return err;
2660 }
2661 
2662 /* Free a backing policy store on inode delete. */
2663 void mpol_free_shared_policy(struct shared_policy *p)
2664 {
2665 	struct sp_node *n;
2666 	struct rb_node *next;
2667 
2668 	if (!p->root.rb_node)
2669 		return;
2670 	write_lock(&p->lock);
2671 	next = rb_first(&p->root);
2672 	while (next) {
2673 		n = rb_entry(next, struct sp_node, nd);
2674 		next = rb_next(&n->nd);
2675 		sp_delete(p, n);
2676 	}
2677 	write_unlock(&p->lock);
2678 }
2679 
2680 #ifdef CONFIG_NUMA_BALANCING
2681 static int __initdata numabalancing_override;
2682 
2683 static void __init check_numabalancing_enable(void)
2684 {
2685 	bool numabalancing_default = false;
2686 
2687 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2688 		numabalancing_default = true;
2689 
2690 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2691 	if (numabalancing_override)
2692 		set_numabalancing_state(numabalancing_override == 1);
2693 
2694 	if (num_online_nodes() > 1 && !numabalancing_override) {
2695 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2696 			numabalancing_default ? "Enabling" : "Disabling");
2697 		set_numabalancing_state(numabalancing_default);
2698 	}
2699 }
2700 
2701 static int __init setup_numabalancing(char *str)
2702 {
2703 	int ret = 0;
2704 	if (!str)
2705 		goto out;
2706 
2707 	if (!strcmp(str, "enable")) {
2708 		numabalancing_override = 1;
2709 		ret = 1;
2710 	} else if (!strcmp(str, "disable")) {
2711 		numabalancing_override = -1;
2712 		ret = 1;
2713 	}
2714 out:
2715 	if (!ret)
2716 		pr_warn("Unable to parse numa_balancing=\n");
2717 
2718 	return ret;
2719 }
2720 __setup("numa_balancing=", setup_numabalancing);
2721 #else
2722 static inline void __init check_numabalancing_enable(void)
2723 {
2724 }
2725 #endif /* CONFIG_NUMA_BALANCING */
2726 
2727 /* assumes fs == KERNEL_DS */
2728 void __init numa_policy_init(void)
2729 {
2730 	nodemask_t interleave_nodes;
2731 	unsigned long largest = 0;
2732 	int nid, prefer = 0;
2733 
2734 	policy_cache = kmem_cache_create("numa_policy",
2735 					 sizeof(struct mempolicy),
2736 					 0, SLAB_PANIC, NULL);
2737 
2738 	sn_cache = kmem_cache_create("shared_policy_node",
2739 				     sizeof(struct sp_node),
2740 				     0, SLAB_PANIC, NULL);
2741 
2742 	for_each_node(nid) {
2743 		preferred_node_policy[nid] = (struct mempolicy) {
2744 			.refcnt = ATOMIC_INIT(1),
2745 			.mode = MPOL_PREFERRED,
2746 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2747 			.v = { .preferred_node = nid, },
2748 		};
2749 	}
2750 
2751 	/*
2752 	 * Set interleaving policy for system init. Interleaving is only
2753 	 * enabled across suitably sized nodes (default is >= 16MB), or
2754 	 * fall back to the largest node if they're all smaller.
2755 	 */
2756 	nodes_clear(interleave_nodes);
2757 	for_each_node_state(nid, N_MEMORY) {
2758 		unsigned long total_pages = node_present_pages(nid);
2759 
2760 		/* Preserve the largest node */
2761 		if (largest < total_pages) {
2762 			largest = total_pages;
2763 			prefer = nid;
2764 		}
2765 
2766 		/* Interleave this node? */
2767 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2768 			node_set(nid, interleave_nodes);
2769 	}
2770 
2771 	/* All too small, use the largest */
2772 	if (unlikely(nodes_empty(interleave_nodes)))
2773 		node_set(prefer, interleave_nodes);
2774 
2775 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2776 		pr_err("%s: interleaving failed\n", __func__);
2777 
2778 	check_numabalancing_enable();
2779 }
2780 
2781 /* Reset policy of current process to default */
2782 void numa_default_policy(void)
2783 {
2784 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2785 }
2786 
2787 /*
2788  * Parse and format mempolicy from/to strings
2789  */
2790 
2791 /*
2792  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2793  */
2794 static const char * const policy_modes[] =
2795 {
2796 	[MPOL_DEFAULT]    = "default",
2797 	[MPOL_PREFERRED]  = "prefer",
2798 	[MPOL_BIND]       = "bind",
2799 	[MPOL_INTERLEAVE] = "interleave",
2800 	[MPOL_LOCAL]      = "local",
2801 };
2802 
2803 
2804 #ifdef CONFIG_TMPFS
2805 /**
2806  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2807  * @str:  string containing mempolicy to parse
2808  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2809  *
2810  * Format of input:
2811  *	<mode>[=<flags>][:<nodelist>]
2812  *
2813  * On success, returns 0, else 1
2814  */
2815 int mpol_parse_str(char *str, struct mempolicy **mpol)
2816 {
2817 	struct mempolicy *new = NULL;
2818 	unsigned short mode_flags;
2819 	nodemask_t nodes;
2820 	char *nodelist = strchr(str, ':');
2821 	char *flags = strchr(str, '=');
2822 	int err = 1, mode;
2823 
2824 	if (flags)
2825 		*flags++ = '\0';	/* terminate mode string */
2826 
2827 	if (nodelist) {
2828 		/* NUL-terminate mode or flags string */
2829 		*nodelist++ = '\0';
2830 		if (nodelist_parse(nodelist, nodes))
2831 			goto out;
2832 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2833 			goto out;
2834 	} else
2835 		nodes_clear(nodes);
2836 
2837 	mode = match_string(policy_modes, MPOL_MAX, str);
2838 	if (mode < 0)
2839 		goto out;
2840 
2841 	switch (mode) {
2842 	case MPOL_PREFERRED:
2843 		/*
2844 		 * Insist on a nodelist of one node only
2845 		 */
2846 		if (nodelist) {
2847 			char *rest = nodelist;
2848 			while (isdigit(*rest))
2849 				rest++;
2850 			if (*rest)
2851 				goto out;
2852 		}
2853 		break;
2854 	case MPOL_INTERLEAVE:
2855 		/*
2856 		 * Default to online nodes with memory if no nodelist
2857 		 */
2858 		if (!nodelist)
2859 			nodes = node_states[N_MEMORY];
2860 		break;
2861 	case MPOL_LOCAL:
2862 		/*
2863 		 * Don't allow a nodelist;  mpol_new() checks flags
2864 		 */
2865 		if (nodelist)
2866 			goto out;
2867 		mode = MPOL_PREFERRED;
2868 		break;
2869 	case MPOL_DEFAULT:
2870 		/*
2871 		 * Insist on a empty nodelist
2872 		 */
2873 		if (!nodelist)
2874 			err = 0;
2875 		goto out;
2876 	case MPOL_BIND:
2877 		/*
2878 		 * Insist on a nodelist
2879 		 */
2880 		if (!nodelist)
2881 			goto out;
2882 	}
2883 
2884 	mode_flags = 0;
2885 	if (flags) {
2886 		/*
2887 		 * Currently, we only support two mutually exclusive
2888 		 * mode flags.
2889 		 */
2890 		if (!strcmp(flags, "static"))
2891 			mode_flags |= MPOL_F_STATIC_NODES;
2892 		else if (!strcmp(flags, "relative"))
2893 			mode_flags |= MPOL_F_RELATIVE_NODES;
2894 		else
2895 			goto out;
2896 	}
2897 
2898 	new = mpol_new(mode, mode_flags, &nodes);
2899 	if (IS_ERR(new))
2900 		goto out;
2901 
2902 	/*
2903 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2904 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2905 	 */
2906 	if (mode != MPOL_PREFERRED)
2907 		new->v.nodes = nodes;
2908 	else if (nodelist)
2909 		new->v.preferred_node = first_node(nodes);
2910 	else
2911 		new->flags |= MPOL_F_LOCAL;
2912 
2913 	/*
2914 	 * Save nodes for contextualization: this will be used to "clone"
2915 	 * the mempolicy in a specific context [cpuset] at a later time.
2916 	 */
2917 	new->w.user_nodemask = nodes;
2918 
2919 	err = 0;
2920 
2921 out:
2922 	/* Restore string for error message */
2923 	if (nodelist)
2924 		*--nodelist = ':';
2925 	if (flags)
2926 		*--flags = '=';
2927 	if (!err)
2928 		*mpol = new;
2929 	return err;
2930 }
2931 #endif /* CONFIG_TMPFS */
2932 
2933 /**
2934  * mpol_to_str - format a mempolicy structure for printing
2935  * @buffer:  to contain formatted mempolicy string
2936  * @maxlen:  length of @buffer
2937  * @pol:  pointer to mempolicy to be formatted
2938  *
2939  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2940  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2941  * longest flag, "relative", and to display at least a few node ids.
2942  */
2943 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2944 {
2945 	char *p = buffer;
2946 	nodemask_t nodes = NODE_MASK_NONE;
2947 	unsigned short mode = MPOL_DEFAULT;
2948 	unsigned short flags = 0;
2949 
2950 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2951 		mode = pol->mode;
2952 		flags = pol->flags;
2953 	}
2954 
2955 	switch (mode) {
2956 	case MPOL_DEFAULT:
2957 		break;
2958 	case MPOL_PREFERRED:
2959 		if (flags & MPOL_F_LOCAL)
2960 			mode = MPOL_LOCAL;
2961 		else
2962 			node_set(pol->v.preferred_node, nodes);
2963 		break;
2964 	case MPOL_BIND:
2965 	case MPOL_INTERLEAVE:
2966 		nodes = pol->v.nodes;
2967 		break;
2968 	default:
2969 		WARN_ON_ONCE(1);
2970 		snprintf(p, maxlen, "unknown");
2971 		return;
2972 	}
2973 
2974 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2975 
2976 	if (flags & MPOL_MODE_FLAGS) {
2977 		p += snprintf(p, buffer + maxlen - p, "=");
2978 
2979 		/*
2980 		 * Currently, the only defined flags are mutually exclusive
2981 		 */
2982 		if (flags & MPOL_F_STATIC_NODES)
2983 			p += snprintf(p, buffer + maxlen - p, "static");
2984 		else if (flags & MPOL_F_RELATIVE_NODES)
2985 			p += snprintf(p, buffer + maxlen - p, "relative");
2986 	}
2987 
2988 	if (!nodes_empty(nodes))
2989 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2990 			       nodemask_pr_args(&nodes));
2991 }
2992