xref: /openbmc/linux/mm/mempolicy.c (revision 3098f5eb)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                for anonymous memory. For process policy a per-process counter
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the given memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                but useful to set in a VMA when you have a non-default
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
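
/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * assuming the libnuma <numaif.h> syscall wrappers, a process could ask for
 * the interleave policy over nodes 0 and 1 roughly like this:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *			  sizeof(nodemask) * 8 + 1) != 0)
 *		perror("set_mempolicy");
 *
 * Subsequent page faults in this task then spread new pages across the two
 * nodes, subject to the VMA-policy precedence described above.
 */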
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. A specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
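
/*
 * Worked example (editorial note): MPOL_F_RELATIVE_NODES interprets the
 * user's mask relative to the allowed set.  If *orig = {0,2} and
 * *rel = {4,5,6}, then nodes_weight(*rel) = 3, nodes_fold() leaves {0,2}
 * unchanged (both bits are below 3), and nodes_onto() maps relative bit 0
 * to node 4 and relative bit 2 to node 6, giving *ret = {4,6}.
 */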
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some checks and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	unsigned long start;
414 	unsigned long end;
415 	struct vm_area_struct *first;
416 };
417 
418 /*
419  * Check if the page's nid is in qp->nmask.
420  *
421  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
422  * in the inverse of qp->nmask.
423  */
424 static inline bool queue_pages_required(struct page *page,
425 					struct queue_pages *qp)
426 {
427 	int nid = page_to_nid(page);
428 	unsigned long flags = qp->flags;
429 
430 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431 }
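
/*
 * Example (editorial note): with qp->nmask = {1} and MPOL_MF_INVERT set
 * (as do_mbind() does), a page on node 0 returns true (it is misplaced and
 * should be queued) while a page on node 1 returns false.  Without
 * MPOL_MF_INVERT the results are reversed.
 */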
432 
433 /*
434  * queue_pages_pmd() has four possible return values:
435  * 0 - pages are placed on the right node or queued successfully.
436  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
437  *     specified.
438  * 2 - THP was split.
439  * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was
440  *        specified and an existing page was already on a node that
441  *        does not follow the policy.
442  */
443 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 				unsigned long end, struct mm_walk *walk)
445 {
446 	int ret = 0;
447 	struct page *page;
448 	struct queue_pages *qp = walk->private;
449 	unsigned long flags;
450 
451 	if (unlikely(is_pmd_migration_entry(*pmd))) {
452 		ret = -EIO;
453 		goto unlock;
454 	}
455 	page = pmd_page(*pmd);
456 	if (is_huge_zero_page(page)) {
457 		spin_unlock(ptl);
458 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
459 		ret = 2;
460 		goto out;
461 	}
462 	if (!queue_pages_required(page, qp))
463 		goto unlock;
464 
465 	flags = qp->flags;
466 	/* go to thp migration */
467 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
468 		if (!vma_migratable(walk->vma) ||
469 		    migrate_page_add(page, qp->pagelist, flags)) {
470 			ret = 1;
471 			goto unlock;
472 		}
473 	} else
474 		ret = -EIO;
475 unlock:
476 	spin_unlock(ptl);
477 out:
478 	return ret;
479 }
480 
481 /*
482  * Scan through pages checking if pages follow certain conditions,
483  * and move them to the pagelist if they do.
484  *
485  * queue_pages_pte_range() has three possible return values:
486  * 0 - pages are placed on the right node or queued successfully.
487  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
488  *     specified.
489  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
490  *        on a node that does not follow the policy.
491  */
492 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
493 			unsigned long end, struct mm_walk *walk)
494 {
495 	struct vm_area_struct *vma = walk->vma;
496 	struct page *page;
497 	struct queue_pages *qp = walk->private;
498 	unsigned long flags = qp->flags;
499 	int ret;
500 	bool has_unmovable = false;
501 	pte_t *pte;
502 	spinlock_t *ptl;
503 
504 	ptl = pmd_trans_huge_lock(pmd, vma);
505 	if (ptl) {
506 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
507 		if (ret != 2)
508 			return ret;
509 	}
510 	/* THP was split, fall through to pte walk */
511 
512 	if (pmd_trans_unstable(pmd))
513 		return 0;
514 
515 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
516 	for (; addr != end; pte++, addr += PAGE_SIZE) {
517 		if (!pte_present(*pte))
518 			continue;
519 		page = vm_normal_page(vma, addr, *pte);
520 		if (!page)
521 			continue;
522 		/*
523 		 * vm_normal_page() filters out zero pages, but there might
524 		 * still be PageReserved pages to skip, perhaps in a VDSO.
525 		 */
526 		if (PageReserved(page))
527 			continue;
528 		if (!queue_pages_required(page, qp))
529 			continue;
530 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
531 			/* MPOL_MF_STRICT must be specified if we get here */
532 			if (!vma_migratable(vma)) {
533 				has_unmovable = true;
534 				break;
535 			}
536 
537 			/*
538 			 * Do not abort immediately since there may be
539 			 * temporarily off-LRU pages in the range.  We still
540 			 * need to migrate the other LRU pages.
541 			 */
542 			if (migrate_page_add(page, qp->pagelist, flags))
543 				has_unmovable = true;
544 		} else
545 			break;
546 	}
547 	pte_unmap_unlock(pte - 1, ptl);
548 	cond_resched();
549 
550 	if (has_unmovable)
551 		return 1;
552 
553 	return addr != end ? -EIO : 0;
554 }
555 
556 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
557 			       unsigned long addr, unsigned long end,
558 			       struct mm_walk *walk)
559 {
560 #ifdef CONFIG_HUGETLB_PAGE
561 	struct queue_pages *qp = walk->private;
562 	unsigned long flags = qp->flags;
563 	struct page *page;
564 	spinlock_t *ptl;
565 	pte_t entry;
566 
567 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
568 	entry = huge_ptep_get(pte);
569 	if (!pte_present(entry))
570 		goto unlock;
571 	page = pte_page(entry);
572 	if (!queue_pages_required(page, qp))
573 		goto unlock;
574 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
575 	if (flags & (MPOL_MF_MOVE_ALL) ||
576 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
577 		isolate_huge_page(page, qp->pagelist);
578 unlock:
579 	spin_unlock(ptl);
580 #else
581 	BUG();
582 #endif
583 	return 0;
584 }
585 
586 #ifdef CONFIG_NUMA_BALANCING
587 /*
588  * This is used to mark a range of virtual addresses as inaccessible.
589  * These are later cleared by a NUMA hinting fault. Depending on these
590  * faults, pages may be migrated for better NUMA placement.
591  *
592  * This is assuming that NUMA faults are handled using PROT_NONE. If
593  * an architecture makes a different choice, it will need further
594  * changes to the core.
595  */
596 unsigned long change_prot_numa(struct vm_area_struct *vma,
597 			unsigned long addr, unsigned long end)
598 {
599 	int nr_updated;
600 
601 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
602 	if (nr_updated)
603 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
604 
605 	return nr_updated;
606 }
607 #else
608 static unsigned long change_prot_numa(struct vm_area_struct *vma,
609 			unsigned long addr, unsigned long end)
610 {
611 	return 0;
612 }
613 #endif /* CONFIG_NUMA_BALANCING */
614 
615 static int queue_pages_test_walk(unsigned long start, unsigned long end,
616 				struct mm_walk *walk)
617 {
618 	struct vm_area_struct *vma = walk->vma;
619 	struct queue_pages *qp = walk->private;
620 	unsigned long endvma = vma->vm_end;
621 	unsigned long flags = qp->flags;
622 
623 	/* range check first */
624 	VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
625 
626 	if (!qp->first) {
627 		qp->first = vma;
628 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
629 			(qp->start < vma->vm_start))
630 			/* hole at head side of range */
631 			return -EFAULT;
632 	}
633 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
634 		((vma->vm_end < qp->end) &&
635 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
636 		/* hole at middle or tail of range */
637 		return -EFAULT;
638 
639 	/*
640 	 * Need to check MPOL_MF_STRICT to return -EIO if possible
641 	 * regardless of vma_migratable()
642 	 */
643 	if (!vma_migratable(vma) &&
644 	    !(flags & MPOL_MF_STRICT))
645 		return 1;
646 
647 	if (endvma > end)
648 		endvma = end;
649 
650 	if (flags & MPOL_MF_LAZY) {
651 		/* Similar to task_numa_work, skip inaccessible VMAs */
652 		if (!is_vm_hugetlb_page(vma) &&
653 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
654 			!(vma->vm_flags & VM_MIXEDMAP))
655 			change_prot_numa(vma, start, endvma);
656 		return 1;
657 	}
658 
659 	/* queue pages from current vma */
660 	if (flags & MPOL_MF_VALID)
661 		return 0;
662 	return 1;
663 }
664 
665 static const struct mm_walk_ops queue_pages_walk_ops = {
666 	.hugetlb_entry		= queue_pages_hugetlb,
667 	.pmd_entry		= queue_pages_pte_range,
668 	.test_walk		= queue_pages_test_walk,
669 };
670 
671 /*
672  * Walk through page tables and collect pages to be migrated.
673  *
674  * If pages found in a given range are on a set of nodes (determined by
675  * @nodes and @flags,) it's isolated and queued to the pagelist which is
676  * passed via @private.
677  *
678  * queue_pages_range() has three possible return values:
679  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
680  *     specified.
681  * 0 - queue pages successfully or no misplaced page.
682  * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
683  *         memory range specified by nodemask and maxnode points outside
684  *         the accessible address space (-EFAULT)
685  */
686 static int
687 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
688 		nodemask_t *nodes, unsigned long flags,
689 		struct list_head *pagelist)
690 {
691 	int err;
692 	struct queue_pages qp = {
693 		.pagelist = pagelist,
694 		.flags = flags,
695 		.nmask = nodes,
696 		.start = start,
697 		.end = end,
698 		.first = NULL,
699 	};
700 
701 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
702 
703 	if (!qp.first)
704 		/* whole range in hole */
705 		err = -EFAULT;
706 
707 	return err;
708 }
709 
710 /*
711  * Apply policy to a single VMA
712  * This must be called with the mmap_sem held for writing.
713  */
714 static int vma_replace_policy(struct vm_area_struct *vma,
715 						struct mempolicy *pol)
716 {
717 	int err;
718 	struct mempolicy *old;
719 	struct mempolicy *new;
720 
721 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
722 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
723 		 vma->vm_ops, vma->vm_file,
724 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
725 
726 	new = mpol_dup(pol);
727 	if (IS_ERR(new))
728 		return PTR_ERR(new);
729 
730 	if (vma->vm_ops && vma->vm_ops->set_policy) {
731 		err = vma->vm_ops->set_policy(vma, new);
732 		if (err)
733 			goto err_out;
734 	}
735 
736 	old = vma->vm_policy;
737 	vma->vm_policy = new; /* protected by mmap_sem */
738 	mpol_put(old);
739 
740 	return 0;
741  err_out:
742 	mpol_put(new);
743 	return err;
744 }
745 
746 /* Step 2: apply policy to a range and do splits. */
747 static int mbind_range(struct mm_struct *mm, unsigned long start,
748 		       unsigned long end, struct mempolicy *new_pol)
749 {
750 	struct vm_area_struct *next;
751 	struct vm_area_struct *prev;
752 	struct vm_area_struct *vma;
753 	int err = 0;
754 	pgoff_t pgoff;
755 	unsigned long vmstart;
756 	unsigned long vmend;
757 
758 	vma = find_vma(mm, start);
759 	VM_BUG_ON(!vma);
760 
761 	prev = vma->vm_prev;
762 	if (start > vma->vm_start)
763 		prev = vma;
764 
765 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
766 		next = vma->vm_next;
767 		vmstart = max(start, vma->vm_start);
768 		vmend   = min(end, vma->vm_end);
769 
770 		if (mpol_equal(vma_policy(vma), new_pol))
771 			continue;
772 
773 		pgoff = vma->vm_pgoff +
774 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
775 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
776 				 vma->anon_vma, vma->vm_file, pgoff,
777 				 new_pol, vma->vm_userfaultfd_ctx);
778 		if (prev) {
779 			vma = prev;
780 			next = vma->vm_next;
781 			if (mpol_equal(vma_policy(vma), new_pol))
782 				continue;
783 			/* vma_merge() joined vma && vma->next, case 8 */
784 			goto replace;
785 		}
786 		if (vma->vm_start != vmstart) {
787 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
788 			if (err)
789 				goto out;
790 		}
791 		if (vma->vm_end != vmend) {
792 			err = split_vma(vma->vm_mm, vma, vmend, 0);
793 			if (err)
794 				goto out;
795 		}
796  replace:
797 		err = vma_replace_policy(vma, new_pol);
798 		if (err)
799 			goto out;
800 	}
801 
802  out:
803 	return err;
804 }
805 
806 /* Set the process memory policy */
807 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
808 			     nodemask_t *nodes)
809 {
810 	struct mempolicy *new, *old;
811 	NODEMASK_SCRATCH(scratch);
812 	int ret;
813 
814 	if (!scratch)
815 		return -ENOMEM;
816 
817 	new = mpol_new(mode, flags, nodes);
818 	if (IS_ERR(new)) {
819 		ret = PTR_ERR(new);
820 		goto out;
821 	}
822 
823 	task_lock(current);
824 	ret = mpol_set_nodemask(new, nodes, scratch);
825 	if (ret) {
826 		task_unlock(current);
827 		mpol_put(new);
828 		goto out;
829 	}
830 	old = current->mempolicy;
831 	current->mempolicy = new;
832 	if (new && new->mode == MPOL_INTERLEAVE)
833 		current->il_prev = MAX_NUMNODES-1;
834 	task_unlock(current);
835 	mpol_put(old);
836 	ret = 0;
837 out:
838 	NODEMASK_SCRATCH_FREE(scratch);
839 	return ret;
840 }
841 
842 /*
843  * Return nodemask for policy for get_mempolicy() query
844  *
845  * Called with task's alloc_lock held
846  */
847 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
848 {
849 	nodes_clear(*nodes);
850 	if (p == &default_policy)
851 		return;
852 
853 	switch (p->mode) {
854 	case MPOL_BIND:
855 		/* Fall through */
856 	case MPOL_INTERLEAVE:
857 		*nodes = p->v.nodes;
858 		break;
859 	case MPOL_PREFERRED:
860 		if (!(p->flags & MPOL_F_LOCAL))
861 			node_set(p->v.preferred_node, *nodes);
862 		/* else return empty node mask for local allocation */
863 		break;
864 	default:
865 		BUG();
866 	}
867 }
868 
869 static int lookup_node(struct mm_struct *mm, unsigned long addr)
870 {
871 	struct page *p;
872 	int err;
873 
874 	int locked = 1;
875 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
876 	if (err >= 0) {
877 		err = page_to_nid(p);
878 		put_page(p);
879 	}
880 	if (locked)
881 		up_read(&mm->mmap_sem);
882 	return err;
883 }
884 
885 /* Retrieve NUMA policy */
886 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
887 			     unsigned long addr, unsigned long flags)
888 {
889 	int err;
890 	struct mm_struct *mm = current->mm;
891 	struct vm_area_struct *vma = NULL;
892 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
893 
894 	if (flags &
895 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
896 		return -EINVAL;
897 
898 	if (flags & MPOL_F_MEMS_ALLOWED) {
899 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
900 			return -EINVAL;
901 		*policy = 0;	/* just so it's initialized */
902 		task_lock(current);
903 		*nmask  = cpuset_current_mems_allowed;
904 		task_unlock(current);
905 		return 0;
906 	}
907 
908 	if (flags & MPOL_F_ADDR) {
909 		/*
910 		 * Do NOT fall back to task policy if the
911 		 * vma/shared policy at addr is NULL.  We
912 		 * want to return MPOL_DEFAULT in this case.
913 		 */
914 		down_read(&mm->mmap_sem);
915 		vma = find_vma_intersection(mm, addr, addr+1);
916 		if (!vma) {
917 			up_read(&mm->mmap_sem);
918 			return -EFAULT;
919 		}
920 		if (vma->vm_ops && vma->vm_ops->get_policy)
921 			pol = vma->vm_ops->get_policy(vma, addr);
922 		else
923 			pol = vma->vm_policy;
924 	} else if (addr)
925 		return -EINVAL;
926 
927 	if (!pol)
928 		pol = &default_policy;	/* indicates default behavior */
929 
930 	if (flags & MPOL_F_NODE) {
931 		if (flags & MPOL_F_ADDR) {
932 			/*
933 			 * Take a refcount on the mpol, lookup_node()
934 			 * will drop the mmap_sem, so after calling
935 			 * lookup_node() only "pol" remains valid, "vma"
936 			 * is stale.
937 			 */
938 			pol_refcount = pol;
939 			vma = NULL;
940 			mpol_get(pol);
941 			err = lookup_node(mm, addr);
942 			if (err < 0)
943 				goto out;
944 			*policy = err;
945 		} else if (pol == current->mempolicy &&
946 				pol->mode == MPOL_INTERLEAVE) {
947 			*policy = next_node_in(current->il_prev, pol->v.nodes);
948 		} else {
949 			err = -EINVAL;
950 			goto out;
951 		}
952 	} else {
953 		*policy = pol == &default_policy ? MPOL_DEFAULT :
954 						pol->mode;
955 		/*
956 		 * Internal mempolicy flags must be masked off before exposing
957 		 * the policy to userspace.
958 		 */
959 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
960 	}
961 
962 	err = 0;
963 	if (nmask) {
964 		if (mpol_store_user_nodemask(pol)) {
965 			*nmask = pol->w.user_nodemask;
966 		} else {
967 			task_lock(current);
968 			get_policy_nodemask(pol, nmask);
969 			task_unlock(current);
970 		}
971 	}
972 
973  out:
974 	mpol_cond_put(pol);
975 	if (vma)
976 		up_read(&mm->mmap_sem);
977 	if (pol_refcount)
978 		mpol_put(pol_refcount);
979 	return err;
980 }
981 
982 #ifdef CONFIG_MIGRATION
983 /*
984  * page migration, thp tail pages can be passed.
985  */
986 static int migrate_page_add(struct page *page, struct list_head *pagelist,
987 				unsigned long flags)
988 {
989 	struct page *head = compound_head(page);
990 	/*
991 	 * Avoid migrating a page that is shared with others.
992 	 */
993 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
994 		if (!isolate_lru_page(head)) {
995 			list_add_tail(&head->lru, pagelist);
996 			mod_node_page_state(page_pgdat(head),
997 				NR_ISOLATED_ANON + page_is_file_cache(head),
998 				hpage_nr_pages(head));
999 		} else if (flags & MPOL_MF_STRICT) {
1000 			/*
1001 			 * Non-movable pages may reach here.  Also, there may be
1002 			 * temporarily off-LRU pages or non-LRU movable pages.
1003 			 * Treat them as unmovable pages since they can't be
1004 			 * isolated, so they can't be moved at the moment.  It
1005 			 * should return -EIO for this case too.
1006 			 */
1007 			return -EIO;
1008 		}
1009 	}
1010 
1011 	return 0;
1012 }
1013 
1014 /* page allocation callback for NUMA node migration */
1015 struct page *alloc_new_node_page(struct page *page, unsigned long node)
1016 {
1017 	if (PageHuge(page))
1018 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1019 					node);
1020 	else if (PageTransHuge(page)) {
1021 		struct page *thp;
1022 
1023 		thp = alloc_pages_node(node,
1024 			(GFP_TRANSHUGE | __GFP_THISNODE),
1025 			HPAGE_PMD_ORDER);
1026 		if (!thp)
1027 			return NULL;
1028 		prep_transhuge_page(thp);
1029 		return thp;
1030 	} else
1031 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1032 						    __GFP_THISNODE, 0);
1033 }
1034 
1035 /*
1036  * Migrate pages from one node to a target node.
1037  * Returns error or the number of pages not migrated.
1038  */
1039 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1040 			   int flags)
1041 {
1042 	nodemask_t nmask;
1043 	LIST_HEAD(pagelist);
1044 	int err = 0;
1045 
1046 	nodes_clear(nmask);
1047 	node_set(source, nmask);
1048 
1049 	/*
1050 	 * This does not "check" the range but isolates all pages that
1051 	 * need migration.  Between passing in the full user address
1052 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1053 	 */
1054 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1055 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1056 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1057 
1058 	if (!list_empty(&pagelist)) {
1059 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1060 					MIGRATE_SYNC, MR_SYSCALL);
1061 		if (err)
1062 			putback_movable_pages(&pagelist);
1063 	}
1064 
1065 	return err;
1066 }
1067 
1068 /*
1069  * Move pages between the two nodesets so as to preserve the physical
1070  * layout as much as possible.
1071  *
1072  * Returns the number of pages that could not be moved.
1073  */
1074 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1075 		     const nodemask_t *to, int flags)
1076 {
1077 	int busy = 0;
1078 	int err;
1079 	nodemask_t tmp;
1080 
1081 	err = migrate_prep();
1082 	if (err)
1083 		return err;
1084 
1085 	down_read(&mm->mmap_sem);
1086 
1087 	/*
1088 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1089 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1090 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1091 	 * The pair of nodemasks 'to' and 'from' define the map.
1092 	 *
1093 	 * If no pair of bits is found that way, fallback to picking some
1094 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1095 	 * 'source' and 'dest' bits are the same, this represents a node
1096 	 * that will be migrating to itself, so no pages need move.
1097 	 *
1098 	 * If no bits are left in 'tmp', or if all remaining bits left
1099 	 * in 'tmp' correspond to the same bit in 'to', return false
1100 	 * (nothing left to migrate).
1101 	 *
1102 	 * This lets us pick a pair of nodes to migrate between, such that
1103 	 * if possible the dest node is not already occupied by some other
1104 	 * source node, minimizing the risk of overloading the memory on a
1105 	 * node that would happen if we migrated incoming memory to a node
1106 	 * before migrating outgoing memory sourced from that same node.
1107 	 *
1108 	 * A single scan of tmp is sufficient.  As we go, we remember the
1109 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1110 	 * that not only moved, but what's better, moved to an empty slot
1111 	 * (d is not set in tmp), then we break out then, with that pair.
1112 	 * Otherwise when we finish scanning tmp, we at least have the
1113 	 * most recent <s, d> pair that moved.  If we get all the way through
1114 	 * the scan of tmp without finding any node that moved, much less
1115 	 * moved to an empty node, then there is nothing left worth migrating.
1116 	 */
1117 
1118 	tmp = *from;
1119 	while (!nodes_empty(tmp)) {
1120 		int s,d;
1121 		int source = NUMA_NO_NODE;
1122 		int dest = 0;
1123 
1124 		for_each_node_mask(s, tmp) {
1125 
1126 			/*
1127 			 * do_migrate_pages() tries to maintain the relative
1128 			 * node relationship of the pages established between
1129 			 * threads and memory areas.
1130 			 *
1131 			 * However, if the number of source nodes is not equal to
1132 			 * the number of destination nodes, we cannot preserve
1133 			 * this relative node relationship.  In that case, skip
1134 			 * copying memory from a node that is in the destination
1135 			 * mask.
1136 			 *
1137 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1138 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1139 			 */
1140 
1141 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1142 						(node_isset(s, *to)))
1143 				continue;
1144 
1145 			d = node_remap(s, *from, *to);
1146 			if (s == d)
1147 				continue;
1148 
1149 			source = s;	/* Node moved. Memorize */
1150 			dest = d;
1151 
1152 			/* dest not in remaining from nodes? */
1153 			if (!node_isset(dest, tmp))
1154 				break;
1155 		}
1156 		if (source == NUMA_NO_NODE)
1157 			break;
1158 
1159 		node_clear(source, tmp);
1160 		err = migrate_to_node(mm, source, dest, flags);
1161 		if (err > 0)
1162 			busy += err;
1163 		if (err < 0)
1164 			break;
1165 	}
1166 	up_read(&mm->mmap_sem);
1167 	if (err < 0)
1168 		return err;
1169 	return busy;
1170 
1171 }
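
/*
 * Worked example (editorial note): with *from = {0,1} and *to = {2,3},
 * the first pass over tmp = {0,1} picks s = 0 and d = node_remap(0, ...)
 * = 2; since node 2 is not set in tmp the scan breaks out with the pair
 * <0,2>, migrates node 0 to node 2 and clears bit 0.  The second pass
 * yields <1,3>, after which tmp is empty and the accumulated count of
 * pages that could not be moved is returned.
 */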
1172 
1173 /*
1174  * Allocate a new page for page migration based on vma policy.
1175  * Start by assuming the page is mapped by the same vma as contains @start.
1176  * Search forward from there, if not.  N.B., this assumes that the
1177  * list of pages handed to migrate_pages()--which is how we get here--
1178  * is in virtual address order.
1179  */
1180 static struct page *new_page(struct page *page, unsigned long start)
1181 {
1182 	struct vm_area_struct *vma;
1183 	unsigned long uninitialized_var(address);
1184 
1185 	vma = find_vma(current->mm, start);
1186 	while (vma) {
1187 		address = page_address_in_vma(page, vma);
1188 		if (address != -EFAULT)
1189 			break;
1190 		vma = vma->vm_next;
1191 	}
1192 
1193 	if (PageHuge(page)) {
1194 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1195 				vma, address);
1196 	} else if (PageTransHuge(page)) {
1197 		struct page *thp;
1198 
1199 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1200 					 HPAGE_PMD_ORDER);
1201 		if (!thp)
1202 			return NULL;
1203 		prep_transhuge_page(thp);
1204 		return thp;
1205 	}
1206 	/*
1207 	 * if !vma, alloc_page_vma() will use task or system default policy
1208 	 */
1209 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1210 			vma, address);
1211 }
1212 #else
1213 
1214 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1215 				unsigned long flags)
1216 {
1217 	return -EIO;
1218 }
1219 
1220 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1221 		     const nodemask_t *to, int flags)
1222 {
1223 	return -ENOSYS;
1224 }
1225 
1226 static struct page *new_page(struct page *page, unsigned long start)
1227 {
1228 	return NULL;
1229 }
1230 #endif
1231 
1232 static long do_mbind(unsigned long start, unsigned long len,
1233 		     unsigned short mode, unsigned short mode_flags,
1234 		     nodemask_t *nmask, unsigned long flags)
1235 {
1236 	struct mm_struct *mm = current->mm;
1237 	struct mempolicy *new;
1238 	unsigned long end;
1239 	int err;
1240 	int ret;
1241 	LIST_HEAD(pagelist);
1242 
1243 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1244 		return -EINVAL;
1245 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1246 		return -EPERM;
1247 
1248 	if (start & ~PAGE_MASK)
1249 		return -EINVAL;
1250 
1251 	if (mode == MPOL_DEFAULT)
1252 		flags &= ~MPOL_MF_STRICT;
1253 
1254 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1255 	end = start + len;
1256 
1257 	if (end < start)
1258 		return -EINVAL;
1259 	if (end == start)
1260 		return 0;
1261 
1262 	new = mpol_new(mode, mode_flags, nmask);
1263 	if (IS_ERR(new))
1264 		return PTR_ERR(new);
1265 
1266 	if (flags & MPOL_MF_LAZY)
1267 		new->flags |= MPOL_F_MOF;
1268 
1269 	/*
1270 	 * If we are using the default policy then operating
1271 	 * on discontinuous address ranges is okay after all.
1272 	 */
1273 	if (!new)
1274 		flags |= MPOL_MF_DISCONTIG_OK;
1275 
1276 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1277 		 start, start + len, mode, mode_flags,
1278 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1279 
1280 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1281 
1282 		err = migrate_prep();
1283 		if (err)
1284 			goto mpol_out;
1285 	}
1286 	{
1287 		NODEMASK_SCRATCH(scratch);
1288 		if (scratch) {
1289 			down_write(&mm->mmap_sem);
1290 			task_lock(current);
1291 			err = mpol_set_nodemask(new, nmask, scratch);
1292 			task_unlock(current);
1293 			if (err)
1294 				up_write(&mm->mmap_sem);
1295 		} else
1296 			err = -ENOMEM;
1297 		NODEMASK_SCRATCH_FREE(scratch);
1298 	}
1299 	if (err)
1300 		goto mpol_out;
1301 
1302 	ret = queue_pages_range(mm, start, end, nmask,
1303 			  flags | MPOL_MF_INVERT, &pagelist);
1304 
1305 	if (ret < 0) {
1306 		err = ret;
1307 		goto up_out;
1308 	}
1309 
1310 	err = mbind_range(mm, start, end, new);
1311 
1312 	if (!err) {
1313 		int nr_failed = 0;
1314 
1315 		if (!list_empty(&pagelist)) {
1316 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1317 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1318 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1319 			if (nr_failed)
1320 				putback_movable_pages(&pagelist);
1321 		}
1322 
1323 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1324 			err = -EIO;
1325 	} else {
1326 up_out:
1327 		if (!list_empty(&pagelist))
1328 			putback_movable_pages(&pagelist);
1329 	}
1330 
1331 	up_write(&mm->mmap_sem);
1332 mpol_out:
1333 	mpol_put(new);
1334 	return err;
1335 }
1336 
1337 /*
1338  * User space interface with variable sized bitmaps for nodelists.
1339  */
1340 
1341 /* Copy a node mask from user space. */
1342 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1343 		     unsigned long maxnode)
1344 {
1345 	unsigned long k;
1346 	unsigned long t;
1347 	unsigned long nlongs;
1348 	unsigned long endmask;
1349 
1350 	--maxnode;
1351 	nodes_clear(*nodes);
1352 	if (maxnode == 0 || !nmask)
1353 		return 0;
1354 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1355 		return -EINVAL;
1356 
1357 	nlongs = BITS_TO_LONGS(maxnode);
1358 	if ((maxnode % BITS_PER_LONG) == 0)
1359 		endmask = ~0UL;
1360 	else
1361 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1362 
1363 	/*
1364 	 * When the user specifies more nodes than supported, just check
1365 	 * that the unsupported part is all zero.
1366 	 *
1367 	 * If maxnode has more longs than MAX_NUMNODES, check
1368 	 * the bits in that area first, and then go on to check
1369 	 * the remaining bits which are equal to or bigger than MAX_NUMNODES.
1370 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1371 	 */
1372 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1373 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1374 			if (get_user(t, nmask + k))
1375 				return -EFAULT;
1376 			if (k == nlongs - 1) {
1377 				if (t & endmask)
1378 					return -EINVAL;
1379 			} else if (t)
1380 				return -EINVAL;
1381 		}
1382 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1383 		endmask = ~0UL;
1384 	}
1385 
1386 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1387 		unsigned long valid_mask = endmask;
1388 
1389 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1390 		if (get_user(t, nmask + nlongs - 1))
1391 			return -EFAULT;
1392 		if (t & valid_mask)
1393 			return -EINVAL;
1394 	}
1395 
1396 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1397 		return -EFAULT;
1398 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1399 	return 0;
1400 }
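
/*
 * Worked example (editorial note): on a 64-bit kernel a caller passing
 * maxnode = 17 describes bits 0..15 after the --maxnode adjustment, so
 * nlongs = 1 and endmask = (1UL << 16) - 1; any user bits at position 16
 * or above within the copied long are cleared by the final "&= endmask",
 * while, for larger maxnode values, set bits at or above MAX_NUMNODES
 * cause -EINVAL instead.
 */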
1401 
1402 /* Copy a kernel node mask to user space */
1403 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1404 			      nodemask_t *nodes)
1405 {
1406 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1407 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1408 
1409 	if (copy > nbytes) {
1410 		if (copy > PAGE_SIZE)
1411 			return -EINVAL;
1412 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1413 			return -EFAULT;
1414 		copy = nbytes;
1415 	}
1416 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1417 }
1418 
1419 static long kernel_mbind(unsigned long start, unsigned long len,
1420 			 unsigned long mode, const unsigned long __user *nmask,
1421 			 unsigned long maxnode, unsigned int flags)
1422 {
1423 	nodemask_t nodes;
1424 	int err;
1425 	unsigned short mode_flags;
1426 
1427 	start = untagged_addr(start);
1428 	mode_flags = mode & MPOL_MODE_FLAGS;
1429 	mode &= ~MPOL_MODE_FLAGS;
1430 	if (mode >= MPOL_MAX)
1431 		return -EINVAL;
1432 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1433 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1434 		return -EINVAL;
1435 	err = get_nodes(&nodes, nmask, maxnode);
1436 	if (err)
1437 		return err;
1438 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1439 }
1440 
1441 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1442 		unsigned long, mode, const unsigned long __user *, nmask,
1443 		unsigned long, maxnode, unsigned int, flags)
1444 {
1445 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1446 }
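
/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * from user space, assuming the libnuma <numaif.h> wrapper, an anonymous
 * mapping could be bound to node 0 and already-faulted pages migrated
 * roughly like this:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodemask = 1UL << 0;
 *	size_t len = 1UL << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	if (mbind(buf, len, MPOL_BIND, &nodemask,
 *		  sizeof(nodemask) * 8 + 1, MPOL_MF_MOVE) != 0)
 *		perror("mbind");
 */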
1447 
1448 /* Set the process memory policy */
1449 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1450 				 unsigned long maxnode)
1451 {
1452 	int err;
1453 	nodemask_t nodes;
1454 	unsigned short flags;
1455 
1456 	flags = mode & MPOL_MODE_FLAGS;
1457 	mode &= ~MPOL_MODE_FLAGS;
1458 	if ((unsigned int)mode >= MPOL_MAX)
1459 		return -EINVAL;
1460 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1461 		return -EINVAL;
1462 	err = get_nodes(&nodes, nmask, maxnode);
1463 	if (err)
1464 		return err;
1465 	return do_set_mempolicy(mode, flags, &nodes);
1466 }
1467 
1468 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1469 		unsigned long, maxnode)
1470 {
1471 	return kernel_set_mempolicy(mode, nmask, maxnode);
1472 }
1473 
1474 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1475 				const unsigned long __user *old_nodes,
1476 				const unsigned long __user *new_nodes)
1477 {
1478 	struct mm_struct *mm = NULL;
1479 	struct task_struct *task;
1480 	nodemask_t task_nodes;
1481 	int err;
1482 	nodemask_t *old;
1483 	nodemask_t *new;
1484 	NODEMASK_SCRATCH(scratch);
1485 
1486 	if (!scratch)
1487 		return -ENOMEM;
1488 
1489 	old = &scratch->mask1;
1490 	new = &scratch->mask2;
1491 
1492 	err = get_nodes(old, old_nodes, maxnode);
1493 	if (err)
1494 		goto out;
1495 
1496 	err = get_nodes(new, new_nodes, maxnode);
1497 	if (err)
1498 		goto out;
1499 
1500 	/* Find the mm_struct */
1501 	rcu_read_lock();
1502 	task = pid ? find_task_by_vpid(pid) : current;
1503 	if (!task) {
1504 		rcu_read_unlock();
1505 		err = -ESRCH;
1506 		goto out;
1507 	}
1508 	get_task_struct(task);
1509 
1510 	err = -EINVAL;
1511 
1512 	/*
1513 	 * Check if this process has the right to modify the specified process.
1514 	 * Use the regular "ptrace_may_access()" checks.
1515 	 */
1516 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1517 		rcu_read_unlock();
1518 		err = -EPERM;
1519 		goto out_put;
1520 	}
1521 	rcu_read_unlock();
1522 
1523 	task_nodes = cpuset_mems_allowed(task);
1524 	/* Is the user allowed to access the target nodes? */
1525 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1526 		err = -EPERM;
1527 		goto out_put;
1528 	}
1529 
1530 	task_nodes = cpuset_mems_allowed(current);
1531 	nodes_and(*new, *new, task_nodes);
1532 	if (nodes_empty(*new))
1533 		goto out_put;
1534 
1535 	err = security_task_movememory(task);
1536 	if (err)
1537 		goto out_put;
1538 
1539 	mm = get_task_mm(task);
1540 	put_task_struct(task);
1541 
1542 	if (!mm) {
1543 		err = -EINVAL;
1544 		goto out;
1545 	}
1546 
1547 	err = do_migrate_pages(mm, old, new,
1548 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1549 
1550 	mmput(mm);
1551 out:
1552 	NODEMASK_SCRATCH_FREE(scratch);
1553 
1554 	return err;
1555 
1556 out_put:
1557 	put_task_struct(task);
1558 	goto out;
1559 
1560 }
1561 
1562 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1563 		const unsigned long __user *, old_nodes,
1564 		const unsigned long __user *, new_nodes)
1565 {
1566 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1567 }
1568 
1569 
1570 /* Retrieve NUMA policy */
1571 static int kernel_get_mempolicy(int __user *policy,
1572 				unsigned long __user *nmask,
1573 				unsigned long maxnode,
1574 				unsigned long addr,
1575 				unsigned long flags)
1576 {
1577 	int err;
1578 	int uninitialized_var(pval);
1579 	nodemask_t nodes;
1580 
1581 	addr = untagged_addr(addr);
1582 
1583 	if (nmask != NULL && maxnode < nr_node_ids)
1584 		return -EINVAL;
1585 
1586 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1587 
1588 	if (err)
1589 		return err;
1590 
1591 	if (policy && put_user(pval, policy))
1592 		return -EFAULT;
1593 
1594 	if (nmask)
1595 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1596 
1597 	return err;
1598 }
1599 
1600 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1601 		unsigned long __user *, nmask, unsigned long, maxnode,
1602 		unsigned long, addr, unsigned long, flags)
1603 {
1604 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1605 }
1606 
1607 #ifdef CONFIG_COMPAT
1608 
1609 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1610 		       compat_ulong_t __user *, nmask,
1611 		       compat_ulong_t, maxnode,
1612 		       compat_ulong_t, addr, compat_ulong_t, flags)
1613 {
1614 	long err;
1615 	unsigned long __user *nm = NULL;
1616 	unsigned long nr_bits, alloc_size;
1617 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1618 
1619 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1620 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1621 
1622 	if (nmask)
1623 		nm = compat_alloc_user_space(alloc_size);
1624 
1625 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1626 
1627 	if (!err && nmask) {
1628 		unsigned long copy_size;
1629 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1630 		err = copy_from_user(bm, nm, copy_size);
1631 		/* ensure entire bitmap is zeroed */
1632 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1633 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1634 	}
1635 
1636 	return err;
1637 }
1638 
1639 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1640 		       compat_ulong_t, maxnode)
1641 {
1642 	unsigned long __user *nm = NULL;
1643 	unsigned long nr_bits, alloc_size;
1644 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1645 
1646 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1647 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1648 
1649 	if (nmask) {
1650 		if (compat_get_bitmap(bm, nmask, nr_bits))
1651 			return -EFAULT;
1652 		nm = compat_alloc_user_space(alloc_size);
1653 		if (copy_to_user(nm, bm, alloc_size))
1654 			return -EFAULT;
1655 	}
1656 
1657 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1658 }
1659 
1660 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1661 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1662 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1663 {
1664 	unsigned long __user *nm = NULL;
1665 	unsigned long nr_bits, alloc_size;
1666 	nodemask_t bm;
1667 
1668 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1669 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1670 
1671 	if (nmask) {
1672 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1673 			return -EFAULT;
1674 		nm = compat_alloc_user_space(alloc_size);
1675 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1676 			return -EFAULT;
1677 	}
1678 
1679 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1680 }
1681 
1682 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1683 		       compat_ulong_t, maxnode,
1684 		       const compat_ulong_t __user *, old_nodes,
1685 		       const compat_ulong_t __user *, new_nodes)
1686 {
1687 	unsigned long __user *old = NULL;
1688 	unsigned long __user *new = NULL;
1689 	nodemask_t tmp_mask;
1690 	unsigned long nr_bits;
1691 	unsigned long size;
1692 
1693 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1694 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1695 	if (old_nodes) {
1696 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1697 			return -EFAULT;
1698 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1699 		if (new_nodes)
1700 			new = old + size / sizeof(unsigned long);
1701 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1702 			return -EFAULT;
1703 	}
1704 	if (new_nodes) {
1705 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1706 			return -EFAULT;
1707 		if (new == NULL)
1708 			new = compat_alloc_user_space(size);
1709 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1710 			return -EFAULT;
1711 	}
1712 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1713 }
1714 
1715 #endif /* CONFIG_COMPAT */
1716 
1717 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1718 						unsigned long addr)
1719 {
1720 	struct mempolicy *pol = NULL;
1721 
1722 	if (vma) {
1723 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1724 			pol = vma->vm_ops->get_policy(vma, addr);
1725 		} else if (vma->vm_policy) {
1726 			pol = vma->vm_policy;
1727 
1728 			/*
1729 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1730 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1731 			 * count on these policies which will be dropped by
1732 			 * mpol_cond_put() later
1733 			 */
1734 			if (mpol_needs_cond_ref(pol))
1735 				mpol_get(pol);
1736 		}
1737 	}
1738 
1739 	return pol;
1740 }
1741 
1742 /*
1743  * get_vma_policy(@vma, @addr)
1744  * @vma: virtual memory area whose policy is sought
1745  * @addr: address in @vma for shared policy lookup
1746  *
1747  * Returns effective policy for a VMA at specified address.
1748  * Falls back to current->mempolicy or system default policy, as necessary.
1749  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1750  * count--added by the get_policy() vm_op, as appropriate--to protect against
1751  * freeing by another task.  It is the caller's responsibility to free the
1752  * extra reference for shared policies.
1753  */
1754 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1755 						unsigned long addr)
1756 {
1757 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1758 
1759 	if (!pol)
1760 		pol = get_task_policy(current);
1761 
1762 	return pol;
1763 }
1764 
1765 bool vma_policy_mof(struct vm_area_struct *vma)
1766 {
1767 	struct mempolicy *pol;
1768 
1769 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1770 		bool ret = false;
1771 
1772 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1773 		if (pol && (pol->flags & MPOL_F_MOF))
1774 			ret = true;
1775 		mpol_cond_put(pol);
1776 
1777 		return ret;
1778 	}
1779 
1780 	pol = vma->vm_policy;
1781 	if (!pol)
1782 		pol = get_task_policy(current);
1783 
1784 	return pol->flags & MPOL_F_MOF;
1785 }
1786 
1787 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1788 {
1789 	enum zone_type dynamic_policy_zone = policy_zone;
1790 
1791 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1792 
1793 	/*
1794 	 * if policy->v.nodes has movable memory only,
1795 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1796 	 *
1797 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1798 	 * so if the following test faile, it implies
1799 	 * policy->v.nodes has movable memory only.
1800 	 */
1801 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1802 		dynamic_policy_zone = ZONE_MOVABLE;
1803 
1804 	return zone >= dynamic_policy_zone;
1805 }
1806 
1807 /*
1808  * Return a nodemask representing a mempolicy for filtering nodes for
1809  * page allocation
1810  */
1811 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1812 {
1813 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1814 	if (unlikely(policy->mode == MPOL_BIND) &&
1815 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1816 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1817 		return &policy->v.nodes;
1818 
1819 	return NULL;
1820 }
1821 
1822 /* Return the node id preferred by the given mempolicy, or the given id */
1823 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1824 								int nd)
1825 {
1826 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1827 		nd = policy->v.preferred_node;
1828 	else {
1829 		/*
1830 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1831 		 * because we might easily break the expectation to stay on the
1832 		 * requested node and not break the policy.
1833 		 */
1834 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1835 	}
1836 
1837 	return nd;
1838 }
1839 
1840 /* Do dynamic interleaving for a process */
1841 static unsigned interleave_nodes(struct mempolicy *policy)
1842 {
1843 	unsigned next;
1844 	struct task_struct *me = current;
1845 
1846 	next = next_node_in(me->il_prev, policy->v.nodes);
1847 	if (next < MAX_NUMNODES)
1848 		me->il_prev = next;
1849 	return next;
1850 }
1851 
1852 /*
1853  * Depending on the memory policy provide a node from which to allocate the
1854  * next slab entry.
1855  */
1856 unsigned int mempolicy_slab_node(void)
1857 {
1858 	struct mempolicy *policy;
1859 	int node = numa_mem_id();
1860 
1861 	if (in_interrupt())
1862 		return node;
1863 
1864 	policy = current->mempolicy;
1865 	if (!policy || policy->flags & MPOL_F_LOCAL)
1866 		return node;
1867 
1868 	switch (policy->mode) {
1869 	case MPOL_PREFERRED:
1870 		/*
1871 		 * handled MPOL_F_LOCAL above
1872 		 */
1873 		return policy->v.preferred_node;
1874 
1875 	case MPOL_INTERLEAVE:
1876 		return interleave_nodes(policy);
1877 
1878 	case MPOL_BIND: {
1879 		struct zoneref *z;
1880 
1881 		/*
1882 		 * Follow bind policy behavior and start allocation at the
1883 		 * first node.
1884 		 */
1885 		struct zonelist *zonelist;
1886 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1887 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1888 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1889 							&policy->v.nodes);
1890 		return z->zone ? zone_to_nid(z->zone) : node;
1891 	}
1892 
1893 	default:
1894 		BUG();
1895 	}
1896 }
1897 
1898 /*
1899  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1900  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1901  * number of present nodes.
1902  */
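/*
 * Worked example (added, not in the original source): with
 * pol->v.nodes = {0,2,5} and n = 7, nnodes = 3 and target = 7 % 3 = 1,
 * so the walk starts at node 0 and advances once, returning node 2.
 */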
1903 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1904 {
1905 	unsigned nnodes = nodes_weight(pol->v.nodes);
1906 	unsigned target;
1907 	int i;
1908 	int nid;
1909 
1910 	if (!nnodes)
1911 		return numa_node_id();
1912 	target = (unsigned int)n % nnodes;
1913 	nid = first_node(pol->v.nodes);
1914 	for (i = 0; i < target; i++)
1915 		nid = next_node(nid, pol->v.nodes);
1916 	return nid;
1917 }
1918 
1919 /* Determine a node number for interleave */
1920 static inline unsigned interleave_nid(struct mempolicy *pol,
1921 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1922 {
1923 	if (vma) {
1924 		unsigned long off;
1925 
1926 		/*
1927 		 * for small pages, there is no difference between
1928 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1929 		 * for huge pages, since vm_pgoff is in units of small
1930 		 * pages, we need to shift off the always 0 bits to get
1931 		 * a useful offset.
1932 		 */
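		/*
		 * Added example (not in the original source): for a 2 MiB
		 * huge page, shift = 21 and PAGE_SHIFT = 12, so vm_pgoff is
		 * shifted right by 9 to convert the small-page file offset
		 * into huge-page units before adding the huge-page index of
		 * @addr within the VMA.
		 */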
1933 		BUG_ON(shift < PAGE_SHIFT);
1934 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1935 		off += (addr - vma->vm_start) >> shift;
1936 		return offset_il_node(pol, off);
1937 	} else
1938 		return interleave_nodes(pol);
1939 }
1940 
1941 #ifdef CONFIG_HUGETLBFS
1942 /*
1943  * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1944  * @vma: virtual memory area whose policy is sought
1945  * @addr: address in @vma for shared policy lookup and interleave policy
1946  * @gfp_flags: for requested zone
1947  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1948  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1949  *
1950  * Returns a nid suitable for a huge page allocation and a pointer
1951  * to the struct mempolicy for conditional unref after allocation.
1952  * If the effective policy is 'bind', returns a pointer to the mempolicy's
1953  * @nodemask for filtering the zonelist.
1954  *
1955  * Must be protected by read_mems_allowed_begin()
1956  */
1957 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1958 				struct mempolicy **mpol, nodemask_t **nodemask)
1959 {
1960 	int nid;
1961 
1962 	*mpol = get_vma_policy(vma, addr);
1963 	*nodemask = NULL;	/* assume !MPOL_BIND */
1964 
1965 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1966 		nid = interleave_nid(*mpol, vma, addr,
1967 					huge_page_shift(hstate_vma(vma)));
1968 	} else {
1969 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1970 		if ((*mpol)->mode == MPOL_BIND)
1971 			*nodemask = &(*mpol)->v.nodes;
1972 	}
1973 	return nid;
1974 }
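/*
 * Added usage sketch (not in the original source): hugetlb callers are
 * expected to wrap huge_node() in a cpuset seqcount retry loop and drop the
 * conditional policy reference afterwards, roughly:
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
 *		... allocate a huge page from nid/nodemask ...
 *		mpol_cond_put(mpol);
 *	} while (!page && read_mems_allowed_retry(cookie));
 *
 * The variable names above are illustrative only.
 */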
1975 
1976 /*
1977  * init_nodemask_of_mempolicy
1978  *
1979  * If the current task's mempolicy is "default" [NULL], return 'false'
1980  * to indicate default policy.  Otherwise, extract the policy nodemask
1981  * for 'bind' or 'interleave' policy into the argument nodemask, or
1982  * initialize the argument nodemask to contain the single node for
1983  * 'preferred' or 'local' policy and return 'true' to indicate presence
1984  * of non-default mempolicy.
1985  *
1986  * We don't bother with reference counting the mempolicy [mpol_get/put]
1987  * because the current task is examining its own mempolicy and a task's
1988  * mempolicy is only ever changed by the task itself.
1989  *
1990  * N.B., it is the caller's responsibility to free a returned nodemask.
1991  */
1992 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1993 {
1994 	struct mempolicy *mempolicy;
1995 	int nid;
1996 
1997 	if (!(mask && current->mempolicy))
1998 		return false;
1999 
2000 	task_lock(current);
2001 	mempolicy = current->mempolicy;
2002 	switch (mempolicy->mode) {
2003 	case MPOL_PREFERRED:
2004 		if (mempolicy->flags & MPOL_F_LOCAL)
2005 			nid = numa_node_id();
2006 		else
2007 			nid = mempolicy->v.preferred_node;
2008 		init_nodemask_of_node(mask, nid);
2009 		break;
2010 
2011 	case MPOL_BIND:
2012 		/* Fall through */
2013 	case MPOL_INTERLEAVE:
2014 		*mask = mempolicy->v.nodes;
2015 		break;
2016 
2017 	default:
2018 		BUG();
2019 	}
2020 	task_unlock(current);
2021 
2022 	return true;
2023 }
2024 #endif
2025 
2026 /*
2027  * mempolicy_nodemask_intersects
2028  *
2029  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2030  * policy.  Otherwise, check for intersection between mask and the policy
2031  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2032  * policy, always return true since it may allocate elsewhere on fallback.
2033  *
2034  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2035  */
2036 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2037 					const nodemask_t *mask)
2038 {
2039 	struct mempolicy *mempolicy;
2040 	bool ret = true;
2041 
2042 	if (!mask)
2043 		return ret;
2044 	task_lock(tsk);
2045 	mempolicy = tsk->mempolicy;
2046 	if (!mempolicy)
2047 		goto out;
2048 
2049 	switch (mempolicy->mode) {
2050 	case MPOL_PREFERRED:
2051 		/*
2052 		 * MPOL_PREFERRED and MPOL_F_LOCAL only specify preferred nodes to
2053 		 * allocate from; they may fall back to other nodes when OOM.
2054 		 * Thus, it's possible for tsk to have allocated memory from
2055 		 * nodes in mask.
2056 		 */
2057 		break;
2058 	case MPOL_BIND:
2059 	case MPOL_INTERLEAVE:
2060 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2061 		break;
2062 	default:
2063 		BUG();
2064 	}
2065 out:
2066 	task_unlock(tsk);
2067 	return ret;
2068 }
2069 
2070 /* Allocate a page in interleaved policy.
2071    Own path because it needs to do special accounting. */
2072 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2073 					unsigned nid)
2074 {
2075 	struct page *page;
2076 
2077 	page = __alloc_pages(gfp, order, nid);
2078 	/* skip NUMA_INTERLEAVE_HIT counter update if NUMA stats are disabled */
2079 	if (!static_branch_likely(&vm_numa_stat_key))
2080 		return page;
2081 	if (page && page_to_nid(page) == nid) {
2082 		preempt_disable();
2083 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2084 		preempt_enable();
2085 	}
2086 	return page;
2087 }
2088 
2089 /**
2090  * 	alloc_pages_vma	- Allocate a page for a VMA.
2091  *
2092  * 	@gfp:
2093  *      %GFP_USER    user allocation.
2094  *      %GFP_KERNEL  kernel allocations,
2095  *      %GFP_HIGHMEM highmem/user allocations,
2096  *      %GFP_FS      allocation should not call back into a file system.
2097  *      %GFP_ATOMIC  don't sleep.
2098  *
2099  *	@order: Order of the GFP allocation.
2100  * 	@vma:  Pointer to VMA or NULL if not available.
2101  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2102  *	@node: Which node to prefer for allocation (modulo policy).
2103  *	@hugepage: for hugepages, try only the preferred node if possible
2104  *
2105  * 	This function allocates a page from the kernel page pool and applies
2106  *	a NUMA policy associated with the VMA or the current process.
2107  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2108  *	mm_struct of the VMA to prevent it from going away. Should be used for
2109  *	all allocations for pages that will be mapped into user space. Returns
2110  *	NULL when no page can be allocated.
2111  */
2112 struct page *
2113 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2114 		unsigned long addr, int node, bool hugepage)
2115 {
2116 	struct mempolicy *pol;
2117 	struct page *page;
2118 	int preferred_nid;
2119 	nodemask_t *nmask;
2120 
2121 	pol = get_vma_policy(vma, addr);
2122 
2123 	if (pol->mode == MPOL_INTERLEAVE) {
2124 		unsigned nid;
2125 
2126 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2127 		mpol_cond_put(pol);
2128 		page = alloc_page_interleave(gfp, order, nid);
2129 		goto out;
2130 	}
2131 
2132 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2133 		int hpage_node = node;
2134 
2135 		/*
2136 		 * For hugepage allocation and non-interleave policy which
2137 		 * allows the current node (or other explicitly preferred
2138 		 * node) we only try to allocate from the current/preferred
2139 		 * node and don't fall back to other nodes, as the cost of
2140 		 * remote accesses would likely offset THP benefits.
2141 		 *
2142 		 * If the policy is interleave, or does not allow the current
2143 		 * node in its nodemask, we allocate the standard way.
2144 		 */
2145 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2146 			hpage_node = pol->v.preferred_node;
2147 
2148 		nmask = policy_nodemask(gfp, pol);
2149 		if (!nmask || node_isset(hpage_node, *nmask)) {
2150 			mpol_cond_put(pol);
2151 			page = __alloc_pages_node(hpage_node,
2152 						gfp | __GFP_THISNODE, order);
2153 
2154 			/*
2155 			 * If hugepage allocations are configured to always use
2156 			 * synchronous compaction, or the VMA has been madvised
2157 			 * to prefer hugepage backing, retry while allowing
2158 			 * remote memory as well.
2159 			 */
2160 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2161 				page = __alloc_pages_node(hpage_node,
2162 						gfp | __GFP_NORETRY, order);
2163 
2164 			goto out;
2165 		}
2166 	}
2167 
2168 	nmask = policy_nodemask(gfp, pol);
2169 	preferred_nid = policy_node(gfp, pol, node);
2170 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2171 	mpol_cond_put(pol);
2172 out:
2173 	return page;
2174 }
2175 EXPORT_SYMBOL(alloc_pages_vma);
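/*
 * Added note (not in the original source): most callers reach this through
 * the alloc_page_vma() wrapper in include/linux/gfp.h, which (at least in
 * kernels of this vintage) passes order 0, numa_node_id() and
 * hugepage == false, e.g.:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 */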
2176 
2177 /**
2178  * 	alloc_pages_current - Allocate pages.
2179  *
2180  *	@gfp:
2181  *		%GFP_USER   user allocation,
2182  *      	%GFP_KERNEL kernel allocation,
2183  *      	%GFP_HIGHMEM highmem allocation,
2184  *      	%GFP_FS     don't call back into a file system.
2185  *      	%GFP_ATOMIC don't sleep.
2186  *	@order: Power of two of allocation size in pages. 0 is a single page.
2187  *
2188  *	Allocate a page from the kernel page pool.  When not in
2189  *	interrupt context, apply the current process' NUMA policy.
2190  *	Returns NULL when no page can be allocated.
2191  */
2192 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2193 {
2194 	struct mempolicy *pol = &default_policy;
2195 	struct page *page;
2196 
2197 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2198 		pol = get_task_policy(current);
2199 
2200 	/*
2201 	 * No reference counting needed for current->mempolicy
2202 	 * nor system default_policy
2203 	 */
2204 	if (pol->mode == MPOL_INTERLEAVE)
2205 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2206 	else
2207 		page = __alloc_pages_nodemask(gfp, order,
2208 				policy_node(gfp, pol, numa_node_id()),
2209 				policy_nodemask(gfp, pol));
2210 
2211 	return page;
2212 }
2213 EXPORT_SYMBOL(alloc_pages_current);
2214 
2215 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2216 {
2217 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2218 
2219 	if (IS_ERR(pol))
2220 		return PTR_ERR(pol);
2221 	dst->vm_policy = pol;
2222 	return 0;
2223 }
2224 
2225 /*
2226  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2227  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2228  * with the mems_allowed returned by cpuset_mems_allowed().  This
2229  * keeps mempolicies cpuset relative after its cpuset moves.  See
2230  * further kernel/cpuset.c update_nodemask().
2231  *
2232  * current's mempolicy may be rebound by the other task (the task that changes
2233  * the cpuset's mems), so we needn't do rebind work for the current task.
2234  */
2235 
2236 /* Slow path of a mempolicy duplicate */
2237 struct mempolicy *__mpol_dup(struct mempolicy *old)
2238 {
2239 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2240 
2241 	if (!new)
2242 		return ERR_PTR(-ENOMEM);
2243 
2244 	/* task's mempolicy is protected by alloc_lock */
2245 	if (old == current->mempolicy) {
2246 		task_lock(current);
2247 		*new = *old;
2248 		task_unlock(current);
2249 	} else
2250 		*new = *old;
2251 
2252 	if (current_cpuset_is_being_rebound()) {
2253 		nodemask_t mems = cpuset_mems_allowed(current);
2254 		mpol_rebind_policy(new, &mems);
2255 	}
2256 	atomic_set(&new->refcnt, 1);
2257 	return new;
2258 }
2259 
2260 /* Slow path of a mempolicy comparison */
2261 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2262 {
2263 	if (!a || !b)
2264 		return false;
2265 	if (a->mode != b->mode)
2266 		return false;
2267 	if (a->flags != b->flags)
2268 		return false;
2269 	if (mpol_store_user_nodemask(a))
2270 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2271 			return false;
2272 
2273 	switch (a->mode) {
2274 	case MPOL_BIND:
2275 		/* Fall through */
2276 	case MPOL_INTERLEAVE:
2277 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2278 	case MPOL_PREFERRED:
2279 		/* a's ->flags is the same as b's */
2280 		if (a->flags & MPOL_F_LOCAL)
2281 			return true;
2282 		return a->v.preferred_node == b->v.preferred_node;
2283 	default:
2284 		BUG();
2285 		return false;
2286 	}
2287 }
2288 
2289 /*
2290  * Shared memory backing store policy support.
2291  *
2292  * Remember policies even when nobody has shared memory mapped.
2293  * The policies are kept in a red-black tree linked from the inode.
2294  * They are protected by the sp->lock rwlock, which should be held
2295  * for any accesses to the tree.
2296  */
2297 
2298 /*
2299  * Look up the first element intersecting start-end.  Caller holds sp->lock for
2300  * reading or for writing
2301  */
2302 static struct sp_node *
2303 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2304 {
2305 	struct rb_node *n = sp->root.rb_node;
2306 
2307 	while (n) {
2308 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2309 
2310 		if (start >= p->end)
2311 			n = n->rb_right;
2312 		else if (end <= p->start)
2313 			n = n->rb_left;
2314 		else
2315 			break;
2316 	}
2317 	if (!n)
2318 		return NULL;
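	/*
	 * Added explanatory comment (not in the original source): the loop
	 * below walks backwards from the matching node, because earlier
	 * entries in the rb-tree may also overlap [start, end); we return
	 * the first (lowest) intersecting range.
	 */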
2319 	for (;;) {
2320 		struct sp_node *w = NULL;
2321 		struct rb_node *prev = rb_prev(n);
2322 		if (!prev)
2323 			break;
2324 		w = rb_entry(prev, struct sp_node, nd);
2325 		if (w->end <= start)
2326 			break;
2327 		n = prev;
2328 	}
2329 	return rb_entry(n, struct sp_node, nd);
2330 }
2331 
2332 /*
2333  * Insert a new shared policy into the list.  Caller holds sp->lock for
2334  * writing.
2335  */
2336 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2337 {
2338 	struct rb_node **p = &sp->root.rb_node;
2339 	struct rb_node *parent = NULL;
2340 	struct sp_node *nd;
2341 
2342 	while (*p) {
2343 		parent = *p;
2344 		nd = rb_entry(parent, struct sp_node, nd);
2345 		if (new->start < nd->start)
2346 			p = &(*p)->rb_left;
2347 		else if (new->end > nd->end)
2348 			p = &(*p)->rb_right;
2349 		else
2350 			BUG();
2351 	}
2352 	rb_link_node(&new->nd, parent, p);
2353 	rb_insert_color(&new->nd, &sp->root);
2354 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2355 		 new->policy ? new->policy->mode : 0);
2356 }
2357 
2358 /* Find shared policy intersecting idx */
2359 struct mempolicy *
2360 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2361 {
2362 	struct mempolicy *pol = NULL;
2363 	struct sp_node *sn;
2364 
2365 	if (!sp->root.rb_node)
2366 		return NULL;
2367 	read_lock(&sp->lock);
2368 	sn = sp_lookup(sp, idx, idx+1);
2369 	if (sn) {
2370 		mpol_get(sn->policy);
2371 		pol = sn->policy;
2372 	}
2373 	read_unlock(&sp->lock);
2374 	return pol;
2375 }
2376 
2377 static void sp_free(struct sp_node *n)
2378 {
2379 	mpol_put(n->policy);
2380 	kmem_cache_free(sn_cache, n);
2381 }
2382 
2383 /**
2384  * mpol_misplaced - check whether current page node is valid in policy
2385  *
2386  * @page: page to be checked
2387  * @vma: vm area where page mapped
2388  * @addr: virtual address where page mapped
2389  *
2390  * Lookup current policy node id for vma,addr and "compare to" page's
2391  * node id.
2392  *
2393  * Returns:
2394  *	-1	- not misplaced, page is in the right node
2395  *	node	- node id where the page should be
2396  *
2397  * Policy determination "mimics" alloc_page_vma().
2398  * Called from fault path where we know the vma and faulting address.
2399  */
2400 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2401 {
2402 	struct mempolicy *pol;
2403 	struct zoneref *z;
2404 	int curnid = page_to_nid(page);
2405 	unsigned long pgoff;
2406 	int thiscpu = raw_smp_processor_id();
2407 	int thisnid = cpu_to_node(thiscpu);
2408 	int polnid = NUMA_NO_NODE;
2409 	int ret = -1;
2410 
2411 	pol = get_vma_policy(vma, addr);
2412 	if (!(pol->flags & MPOL_F_MOF))
2413 		goto out;
2414 
2415 	switch (pol->mode) {
2416 	case MPOL_INTERLEAVE:
2417 		pgoff = vma->vm_pgoff;
2418 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2419 		polnid = offset_il_node(pol, pgoff);
2420 		break;
2421 
2422 	case MPOL_PREFERRED:
2423 		if (pol->flags & MPOL_F_LOCAL)
2424 			polnid = numa_node_id();
2425 		else
2426 			polnid = pol->v.preferred_node;
2427 		break;
2428 
2429 	case MPOL_BIND:
2430 
2431 		/*
2432 		 * MPOL_BIND allows binding to multiple nodes.
2433 		 * Use the current page's node if it is in the policy nodemask,
2434 		 * else select the nearest allowed node, if any.
2435 		 * If there are no allowed nodes, use the current node [!misplaced].
2436 		 */
2437 		if (node_isset(curnid, pol->v.nodes))
2438 			goto out;
2439 		z = first_zones_zonelist(
2440 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2441 				gfp_zone(GFP_HIGHUSER),
2442 				&pol->v.nodes);
2443 		polnid = zone_to_nid(z->zone);
2444 		break;
2445 
2446 	default:
2447 		BUG();
2448 	}
2449 
2450 	/* Migrate the page towards the node whose CPU is referencing it */
2451 	if (pol->flags & MPOL_F_MORON) {
2452 		polnid = thisnid;
2453 
2454 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2455 			goto out;
2456 	}
2457 
2458 	if (curnid != polnid)
2459 		ret = polnid;
2460 out:
2461 	mpol_cond_put(pol);
2462 
2463 	return ret;
2464 }
2465 
2466 /*
2467  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2468  * dropped after task->mempolicy is set to NULL so that any allocation done as
2469  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2470  * policy.
2471  */
2472 void mpol_put_task_policy(struct task_struct *task)
2473 {
2474 	struct mempolicy *pol;
2475 
2476 	task_lock(task);
2477 	pol = task->mempolicy;
2478 	task->mempolicy = NULL;
2479 	task_unlock(task);
2480 	mpol_put(pol);
2481 }
2482 
2483 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2484 {
2485 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2486 	rb_erase(&n->nd, &sp->root);
2487 	sp_free(n);
2488 }
2489 
2490 static void sp_node_init(struct sp_node *node, unsigned long start,
2491 			unsigned long end, struct mempolicy *pol)
2492 {
2493 	node->start = start;
2494 	node->end = end;
2495 	node->policy = pol;
2496 }
2497 
2498 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2499 				struct mempolicy *pol)
2500 {
2501 	struct sp_node *n;
2502 	struct mempolicy *newpol;
2503 
2504 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2505 	if (!n)
2506 		return NULL;
2507 
2508 	newpol = mpol_dup(pol);
2509 	if (IS_ERR(newpol)) {
2510 		kmem_cache_free(sn_cache, n);
2511 		return NULL;
2512 	}
2513 	newpol->flags |= MPOL_F_SHARED;
2514 	sp_node_init(n, start, end, newpol);
2515 
2516 	return n;
2517 }
2518 
2519 /* Replace a policy range. */
2520 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2521 				 unsigned long end, struct sp_node *new)
2522 {
2523 	struct sp_node *n;
2524 	struct sp_node *n_new = NULL;
2525 	struct mempolicy *mpol_new = NULL;
2526 	int ret = 0;
2527 
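	/*
	 * Added explanatory comment (not in the original source): sp->lock
	 * is a rwlock and cannot be held across sleeping GFP_KERNEL
	 * allocations.  When an existing range must be split, we drop the
	 * lock, preallocate a new sp_node and mempolicy at alloc_new, and
	 * restart the lookup from scratch.
	 */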
2528 restart:
2529 	write_lock(&sp->lock);
2530 	n = sp_lookup(sp, start, end);
2531 	/* Take care of old policies in the same range. */
2532 	while (n && n->start < end) {
2533 		struct rb_node *next = rb_next(&n->nd);
2534 		if (n->start >= start) {
2535 			if (n->end <= end)
2536 				sp_delete(sp, n);
2537 			else
2538 				n->start = end;
2539 		} else {
2540 			/* Old policy spanning whole new range. */
2541 			if (n->end > end) {
2542 				if (!n_new)
2543 					goto alloc_new;
2544 
2545 				*mpol_new = *n->policy;
2546 				atomic_set(&mpol_new->refcnt, 1);
2547 				sp_node_init(n_new, end, n->end, mpol_new);
2548 				n->end = start;
2549 				sp_insert(sp, n_new);
2550 				n_new = NULL;
2551 				mpol_new = NULL;
2552 				break;
2553 			} else
2554 				n->end = start;
2555 		}
2556 		if (!next)
2557 			break;
2558 		n = rb_entry(next, struct sp_node, nd);
2559 	}
2560 	if (new)
2561 		sp_insert(sp, new);
2562 	write_unlock(&sp->lock);
2563 	ret = 0;
2564 
2565 err_out:
2566 	if (mpol_new)
2567 		mpol_put(mpol_new);
2568 	if (n_new)
2569 		kmem_cache_free(sn_cache, n_new);
2570 
2571 	return ret;
2572 
2573 alloc_new:
2574 	write_unlock(&sp->lock);
2575 	ret = -ENOMEM;
2576 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2577 	if (!n_new)
2578 		goto err_out;
2579 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2580 	if (!mpol_new)
2581 		goto err_out;
2582 	goto restart;
2583 }
2584 
2585 /**
2586  * mpol_shared_policy_init - initialize shared policy for inode
2587  * @sp: pointer to inode shared policy
2588  * @mpol:  struct mempolicy to install
2589  *
2590  * Install non-NULL @mpol in inode's shared policy rb-tree.
2591  * On entry, the current task has a reference on a non-NULL @mpol.
2592  * This must be released on exit.
2593  * This is called from get_inode() calls, so we can use GFP_KERNEL.
2594  */
2595 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2596 {
2597 	int ret;
2598 
2599 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2600 	rwlock_init(&sp->lock);
2601 
2602 	if (mpol) {
2603 		struct vm_area_struct pvma;
2604 		struct mempolicy *new;
2605 		NODEMASK_SCRATCH(scratch);
2606 
2607 		if (!scratch)
2608 			goto put_mpol;
2609 		/* contextualize the tmpfs mount point mempolicy */
2610 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2611 		if (IS_ERR(new))
2612 			goto free_scratch; /* no valid nodemask intersection */
2613 
2614 		task_lock(current);
2615 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2616 		task_unlock(current);
2617 		if (ret)
2618 			goto put_new;
2619 
2620 		/* Create pseudo-vma that contains just the policy */
2621 		vma_init(&pvma, NULL);
2622 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2623 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2624 
2625 put_new:
2626 		mpol_put(new);			/* drop initial ref */
2627 free_scratch:
2628 		NODEMASK_SCRATCH_FREE(scratch);
2629 put_mpol:
2630 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2631 	}
2632 }
2633 
2634 int mpol_set_shared_policy(struct shared_policy *info,
2635 			struct vm_area_struct *vma, struct mempolicy *npol)
2636 {
2637 	int err;
2638 	struct sp_node *new = NULL;
2639 	unsigned long sz = vma_pages(vma);
2640 
2641 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2642 		 vma->vm_pgoff,
2643 		 sz, npol ? npol->mode : -1,
2644 		 npol ? npol->flags : -1,
2645 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2646 
2647 	if (npol) {
2648 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2649 		if (!new)
2650 			return -ENOMEM;
2651 	}
2652 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2653 	if (err && new)
2654 		sp_free(new);
2655 	return err;
2656 }
2657 
2658 /* Free a backing policy store on inode delete. */
2659 void mpol_free_shared_policy(struct shared_policy *p)
2660 {
2661 	struct sp_node *n;
2662 	struct rb_node *next;
2663 
2664 	if (!p->root.rb_node)
2665 		return;
2666 	write_lock(&p->lock);
2667 	next = rb_first(&p->root);
2668 	while (next) {
2669 		n = rb_entry(next, struct sp_node, nd);
2670 		next = rb_next(&n->nd);
2671 		sp_delete(p, n);
2672 	}
2673 	write_unlock(&p->lock);
2674 }
2675 
2676 #ifdef CONFIG_NUMA_BALANCING
2677 static int __initdata numabalancing_override;
2678 
2679 static void __init check_numabalancing_enable(void)
2680 {
2681 	bool numabalancing_default = false;
2682 
2683 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2684 		numabalancing_default = true;
2685 
2686 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2687 	if (numabalancing_override)
2688 		set_numabalancing_state(numabalancing_override == 1);
2689 
2690 	if (num_online_nodes() > 1 && !numabalancing_override) {
2691 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2692 			numabalancing_default ? "Enabling" : "Disabling");
2693 		set_numabalancing_state(numabalancing_default);
2694 	}
2695 }
2696 
2697 static int __init setup_numabalancing(char *str)
2698 {
2699 	int ret = 0;
2700 	if (!str)
2701 		goto out;
2702 
2703 	if (!strcmp(str, "enable")) {
2704 		numabalancing_override = 1;
2705 		ret = 1;
2706 	} else if (!strcmp(str, "disable")) {
2707 		numabalancing_override = -1;
2708 		ret = 1;
2709 	}
2710 out:
2711 	if (!ret)
2712 		pr_warn("Unable to parse numa_balancing=\n");
2713 
2714 	return ret;
2715 }
2716 __setup("numa_balancing=", setup_numabalancing);
2717 #else
2718 static inline void __init check_numabalancing_enable(void)
2719 {
2720 }
2721 #endif /* CONFIG_NUMA_BALANCING */
2722 
2723 /* assumes fs == KERNEL_DS */
2724 void __init numa_policy_init(void)
2725 {
2726 	nodemask_t interleave_nodes;
2727 	unsigned long largest = 0;
2728 	int nid, prefer = 0;
2729 
2730 	policy_cache = kmem_cache_create("numa_policy",
2731 					 sizeof(struct mempolicy),
2732 					 0, SLAB_PANIC, NULL);
2733 
2734 	sn_cache = kmem_cache_create("shared_policy_node",
2735 				     sizeof(struct sp_node),
2736 				     0, SLAB_PANIC, NULL);
2737 
2738 	for_each_node(nid) {
2739 		preferred_node_policy[nid] = (struct mempolicy) {
2740 			.refcnt = ATOMIC_INIT(1),
2741 			.mode = MPOL_PREFERRED,
2742 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2743 			.v = { .preferred_node = nid, },
2744 		};
2745 	}
2746 
2747 	/*
2748 	 * Set interleaving policy for system init. Interleaving is only
2749 	 * enabled across suitably sized nodes (default is >= 16MB), or
2750 	 * fall back to the largest node if they're all smaller.
2751 	 */
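	/*
	 * Added note (not in the original source): with 4 KiB pages, the
	 * 16 MB threshold below corresponds to node_present_pages(nid)
	 * of at least 4096 pages.
	 */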
2752 	nodes_clear(interleave_nodes);
2753 	for_each_node_state(nid, N_MEMORY) {
2754 		unsigned long total_pages = node_present_pages(nid);
2755 
2756 		/* Preserve the largest node */
2757 		if (largest < total_pages) {
2758 			largest = total_pages;
2759 			prefer = nid;
2760 		}
2761 
2762 		/* Interleave this node? */
2763 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2764 			node_set(nid, interleave_nodes);
2765 	}
2766 
2767 	/* All too small, use the largest */
2768 	if (unlikely(nodes_empty(interleave_nodes)))
2769 		node_set(prefer, interleave_nodes);
2770 
2771 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2772 		pr_err("%s: interleaving failed\n", __func__);
2773 
2774 	check_numabalancing_enable();
2775 }
2776 
2777 /* Reset policy of current process to default */
2778 void numa_default_policy(void)
2779 {
2780 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2781 }
2782 
2783 /*
2784  * Parse and format mempolicy from/to strings
2785  */
2786 
2787 /*
2788  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2789  */
2790 static const char * const policy_modes[] =
2791 {
2792 	[MPOL_DEFAULT]    = "default",
2793 	[MPOL_PREFERRED]  = "prefer",
2794 	[MPOL_BIND]       = "bind",
2795 	[MPOL_INTERLEAVE] = "interleave",
2796 	[MPOL_LOCAL]      = "local",
2797 };
2798 
2799 
2800 #ifdef CONFIG_TMPFS
2801 /**
2802  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2803  * @str:  string containing mempolicy to parse
2804  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2805  *
2806  * Format of input:
2807  *	<mode>[=<flags>][:<nodelist>]
2808  *
2809  * On success, returns 0, else 1
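 *
 * Illustrative examples (added, not in the original comment):
 *	"bind:0,2"			MPOL_BIND over nodes 0 and 2
 *	"interleave=relative:0-3"	MPOL_INTERLEAVE, MPOL_F_RELATIVE_NODES
 *	"prefer=static:1"		MPOL_PREFERRED node 1, MPOL_F_STATIC_NODES
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL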
2810  */
2811 int mpol_parse_str(char *str, struct mempolicy **mpol)
2812 {
2813 	struct mempolicy *new = NULL;
2814 	unsigned short mode_flags;
2815 	nodemask_t nodes;
2816 	char *nodelist = strchr(str, ':');
2817 	char *flags = strchr(str, '=');
2818 	int err = 1, mode;
2819 
2820 	if (nodelist) {
2821 		/* NUL-terminate mode or flags string */
2822 		*nodelist++ = '\0';
2823 		if (nodelist_parse(nodelist, nodes))
2824 			goto out;
2825 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2826 			goto out;
2827 	} else
2828 		nodes_clear(nodes);
2829 
2830 	if (flags)
2831 		*flags++ = '\0';	/* terminate mode string */
2832 
2833 	mode = match_string(policy_modes, MPOL_MAX, str);
2834 	if (mode < 0)
2835 		goto out;
2836 
2837 	switch (mode) {
2838 	case MPOL_PREFERRED:
2839 		/*
2840 		 * Insist on a nodelist of one node only
2841 		 */
2842 		if (nodelist) {
2843 			char *rest = nodelist;
2844 			while (isdigit(*rest))
2845 				rest++;
2846 			if (*rest)
2847 				goto out;
2848 		}
2849 		break;
2850 	case MPOL_INTERLEAVE:
2851 		/*
2852 		 * Default to online nodes with memory if no nodelist
2853 		 */
2854 		if (!nodelist)
2855 			nodes = node_states[N_MEMORY];
2856 		break;
2857 	case MPOL_LOCAL:
2858 		/*
2859 		 * Don't allow a nodelist;  mpol_new() checks flags
2860 		 */
2861 		if (nodelist)
2862 			goto out;
2863 		mode = MPOL_PREFERRED;
2864 		break;
2865 	case MPOL_DEFAULT:
2866 		/*
2867 		 * Insist on an empty nodelist
2868 		 */
2869 		if (!nodelist)
2870 			err = 0;
2871 		goto out;
2872 	case MPOL_BIND:
2873 		/*
2874 		 * Insist on a nodelist
2875 		 */
2876 		if (!nodelist)
2877 			goto out;
2878 	}
2879 
2880 	mode_flags = 0;
2881 	if (flags) {
2882 		/*
2883 		 * Currently, we only support two mutually exclusive
2884 		 * mode flags.
2885 		 */
2886 		if (!strcmp(flags, "static"))
2887 			mode_flags |= MPOL_F_STATIC_NODES;
2888 		else if (!strcmp(flags, "relative"))
2889 			mode_flags |= MPOL_F_RELATIVE_NODES;
2890 		else
2891 			goto out;
2892 	}
2893 
2894 	new = mpol_new(mode, mode_flags, &nodes);
2895 	if (IS_ERR(new))
2896 		goto out;
2897 
2898 	/*
2899 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2900 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2901 	 */
2902 	if (mode != MPOL_PREFERRED)
2903 		new->v.nodes = nodes;
2904 	else if (nodelist)
2905 		new->v.preferred_node = first_node(nodes);
2906 	else
2907 		new->flags |= MPOL_F_LOCAL;
2908 
2909 	/*
2910 	 * Save nodes for contextualization: this will be used to "clone"
2911 	 * the mempolicy in a specific context [cpuset] at a later time.
2912 	 */
2913 	new->w.user_nodemask = nodes;
2914 
2915 	err = 0;
2916 
2917 out:
2918 	/* Restore string for error message */
2919 	if (nodelist)
2920 		*--nodelist = ':';
2921 	if (flags)
2922 		*--flags = '=';
2923 	if (!err)
2924 		*mpol = new;
2925 	return err;
2926 }
2927 #endif /* CONFIG_TMPFS */
2928 
2929 /**
2930  * mpol_to_str - format a mempolicy structure for printing
2931  * @buffer:  to contain formatted mempolicy string
2932  * @maxlen:  length of @buffer
2933  * @pol:  pointer to mempolicy to be formatted
2934  *
2935  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2936  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2937  * longest flag, "relative", and to display at least a few node ids.
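 *
 * Example output (added, not in the original comment): an interleave
 * policy with MPOL_F_STATIC_NODES over nodes 0-3 is formatted as
 * "interleave=static:0-3"; a local policy is formatted simply as "local".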
2938  */
2939 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2940 {
2941 	char *p = buffer;
2942 	nodemask_t nodes = NODE_MASK_NONE;
2943 	unsigned short mode = MPOL_DEFAULT;
2944 	unsigned short flags = 0;
2945 
2946 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2947 		mode = pol->mode;
2948 		flags = pol->flags;
2949 	}
2950 
2951 	switch (mode) {
2952 	case MPOL_DEFAULT:
2953 		break;
2954 	case MPOL_PREFERRED:
2955 		if (flags & MPOL_F_LOCAL)
2956 			mode = MPOL_LOCAL;
2957 		else
2958 			node_set(pol->v.preferred_node, nodes);
2959 		break;
2960 	case MPOL_BIND:
2961 	case MPOL_INTERLEAVE:
2962 		nodes = pol->v.nodes;
2963 		break;
2964 	default:
2965 		WARN_ON_ONCE(1);
2966 		snprintf(p, maxlen, "unknown");
2967 		return;
2968 	}
2969 
2970 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2971 
2972 	if (flags & MPOL_MODE_FLAGS) {
2973 		p += snprintf(p, buffer + maxlen - p, "=");
2974 
2975 		/*
2976 		 * Currently, the only defined flags are mutually exclusive
2977 		 */
2978 		if (flags & MPOL_F_STATIC_NODES)
2979 			p += snprintf(p, buffer + maxlen - p, "static");
2980 		else if (flags & MPOL_F_RELATIVE_NODES)
2981 			p += snprintf(p, buffer + maxlen - p, "relative");
2982 	}
2983 
2984 	if (!nodes_empty(nodes))
2985 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2986 			       nodemask_pr_args(&nodes));
2987 }
2988