xref: /openbmc/linux/mm/mempolicy.c (revision a1b2f04ea527397fcacacd09e0d690927feef429)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
55 
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful about that.
*/
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. An specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some check and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static void migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	struct vm_area_struct *prev;
414 };
415 
416 /*
417  * Check if the page's nid is in qp->nmask.
418  *
419  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
420  * in the invert of qp->nmask.
421  */
422 static inline bool queue_pages_required(struct page *page,
423 					struct queue_pages *qp)
424 {
425 	int nid = page_to_nid(page);
426 	unsigned long flags = qp->flags;
427 
428 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
429 }
430 
431 /*
432  * queue_pages_pmd() has three possible return values:
433  * 1 - pages are placed on the right node or queued successfully.
434  * 0 - THP was split.
435  * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
436  *        page was already on a node that does not follow the policy.
437  */
438 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
439 				unsigned long end, struct mm_walk *walk)
440 {
441 	int ret = 0;
442 	struct page *page;
443 	struct queue_pages *qp = walk->private;
444 	unsigned long flags;
445 
446 	if (unlikely(is_pmd_migration_entry(*pmd))) {
447 		ret = -EIO;
448 		goto unlock;
449 	}
450 	page = pmd_page(*pmd);
451 	if (is_huge_zero_page(page)) {
452 		spin_unlock(ptl);
453 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
454 		goto out;
455 	}
456 	if (!queue_pages_required(page, qp)) {
457 		ret = 1;
458 		goto unlock;
459 	}
460 
461 	ret = 1;
462 	flags = qp->flags;
463 	/* go to thp migration */
464 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
465 		if (!vma_migratable(walk->vma)) {
466 			ret = -EIO;
467 			goto unlock;
468 		}
469 
470 		migrate_page_add(page, qp->pagelist, flags);
471 	} else
472 		ret = -EIO;
473 unlock:
474 	spin_unlock(ptl);
475 out:
476 	return ret;
477 }
478 
479 /*
480  * Scan through pages checking if pages follow certain conditions,
481  * and move them to the pagelist if they do.
482  */
483 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
484 			unsigned long end, struct mm_walk *walk)
485 {
486 	struct vm_area_struct *vma = walk->vma;
487 	struct page *page;
488 	struct queue_pages *qp = walk->private;
489 	unsigned long flags = qp->flags;
490 	int ret;
491 	pte_t *pte;
492 	spinlock_t *ptl;
493 
494 	ptl = pmd_trans_huge_lock(pmd, vma);
495 	if (ptl) {
496 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
497 		if (ret > 0)
498 			return 0;
499 		else if (ret < 0)
500 			return ret;
501 	}
502 
503 	if (pmd_trans_unstable(pmd))
504 		return 0;
505 
506 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
507 	for (; addr != end; pte++, addr += PAGE_SIZE) {
508 		if (!pte_present(*pte))
509 			continue;
510 		page = vm_normal_page(vma, addr, *pte);
511 		if (!page)
512 			continue;
513 		/*
514 		 * vm_normal_page() filters out zero pages, but there might
515 		 * still be PageReserved pages to skip, perhaps in a VDSO.
516 		 */
517 		if (PageReserved(page))
518 			continue;
519 		if (!queue_pages_required(page, qp))
520 			continue;
521 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
522 			if (!vma_migratable(vma))
523 				break;
524 			migrate_page_add(page, qp->pagelist, flags);
525 		} else
526 			break;
527 	}
528 	pte_unmap_unlock(pte - 1, ptl);
529 	cond_resched();
530 	return addr != end ? -EIO : 0;
531 }
532 
533 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
534 			       unsigned long addr, unsigned long end,
535 			       struct mm_walk *walk)
536 {
537 #ifdef CONFIG_HUGETLB_PAGE
538 	struct queue_pages *qp = walk->private;
539 	unsigned long flags = qp->flags;
540 	struct page *page;
541 	spinlock_t *ptl;
542 	pte_t entry;
543 
544 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
545 	entry = huge_ptep_get(pte);
546 	if (!pte_present(entry))
547 		goto unlock;
548 	page = pte_page(entry);
549 	if (!queue_pages_required(page, qp))
550 		goto unlock;
551 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
552 	if (flags & (MPOL_MF_MOVE_ALL) ||
553 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
554 		isolate_huge_page(page, qp->pagelist);
555 unlock:
556 	spin_unlock(ptl);
557 #else
558 	BUG();
559 #endif
560 	return 0;
561 }
562 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the count reported by change_protection(); a non-zero count is
 * also added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

	if (updated != 0)
		count_vm_numa_events(NUMA_PTE_UPDATES, updated);

	return updated;
}
#else
/* Without NUMA balancing nothing is ever updated. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
591 
592 static int queue_pages_test_walk(unsigned long start, unsigned long end,
593 				struct mm_walk *walk)
594 {
595 	struct vm_area_struct *vma = walk->vma;
596 	struct queue_pages *qp = walk->private;
597 	unsigned long endvma = vma->vm_end;
598 	unsigned long flags = qp->flags;
599 
600 	/*
601 	 * Need check MPOL_MF_STRICT to return -EIO if possible
602 	 * regardless of vma_migratable
603 	 */
604 	if (!vma_migratable(vma) &&
605 	    !(flags & MPOL_MF_STRICT))
606 		return 1;
607 
608 	if (endvma > end)
609 		endvma = end;
610 	if (vma->vm_start > start)
611 		start = vma->vm_start;
612 
613 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
614 		if (!vma->vm_next && vma->vm_end < end)
615 			return -EFAULT;
616 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
617 			return -EFAULT;
618 	}
619 
620 	qp->prev = vma;
621 
622 	if (flags & MPOL_MF_LAZY) {
623 		/* Similar to task_numa_work, skip inaccessible VMAs */
624 		if (!is_vm_hugetlb_page(vma) &&
625 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
626 			!(vma->vm_flags & VM_MIXEDMAP))
627 			change_prot_numa(vma, start, endvma);
628 		return 1;
629 	}
630 
631 	/* queue pages from current vma */
632 	if (flags & MPOL_MF_VALID)
633 		return 0;
634 	return 1;
635 }
636 
637 /*
638  * Walk through page tables and collect pages to be migrated.
639  *
640  * If pages found in a given range are on a set of nodes (determined by
641  * @nodes and @flags,) it's isolated and queued to the pagelist which is
642  * passed via @private.)
643  */
644 static int
645 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
646 		nodemask_t *nodes, unsigned long flags,
647 		struct list_head *pagelist)
648 {
649 	struct queue_pages qp = {
650 		.pagelist = pagelist,
651 		.flags = flags,
652 		.nmask = nodes,
653 		.prev = NULL,
654 	};
655 	struct mm_walk queue_pages_walk = {
656 		.hugetlb_entry = queue_pages_hugetlb,
657 		.pmd_entry = queue_pages_pte_range,
658 		.test_walk = queue_pages_test_walk,
659 		.mm = mm,
660 		.private = &qp,
661 	};
662 
663 	return walk_page_range(start, end, &queue_pages_walk);
664 }
665 
666 /*
667  * Apply policy to a single VMA
668  * This must be called with the mmap_sem held for writing.
669  */
670 static int vma_replace_policy(struct vm_area_struct *vma,
671 						struct mempolicy *pol)
672 {
673 	int err;
674 	struct mempolicy *old;
675 	struct mempolicy *new;
676 
677 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
678 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
679 		 vma->vm_ops, vma->vm_file,
680 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
681 
682 	new = mpol_dup(pol);
683 	if (IS_ERR(new))
684 		return PTR_ERR(new);
685 
686 	if (vma->vm_ops && vma->vm_ops->set_policy) {
687 		err = vma->vm_ops->set_policy(vma, new);
688 		if (err)
689 			goto err_out;
690 	}
691 
692 	old = vma->vm_policy;
693 	vma->vm_policy = new; /* protected by mmap_sem */
694 	mpol_put(old);
695 
696 	return 0;
697  err_out:
698 	mpol_put(new);
699 	return err;
700 }
701 
702 /* Step 2: apply policy to a range and do splits. */
703 static int mbind_range(struct mm_struct *mm, unsigned long start,
704 		       unsigned long end, struct mempolicy *new_pol)
705 {
706 	struct vm_area_struct *next;
707 	struct vm_area_struct *prev;
708 	struct vm_area_struct *vma;
709 	int err = 0;
710 	pgoff_t pgoff;
711 	unsigned long vmstart;
712 	unsigned long vmend;
713 
714 	vma = find_vma(mm, start);
715 	if (!vma || vma->vm_start > start)
716 		return -EFAULT;
717 
718 	prev = vma->vm_prev;
719 	if (start > vma->vm_start)
720 		prev = vma;
721 
722 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
723 		next = vma->vm_next;
724 		vmstart = max(start, vma->vm_start);
725 		vmend   = min(end, vma->vm_end);
726 
727 		if (mpol_equal(vma_policy(vma), new_pol))
728 			continue;
729 
730 		pgoff = vma->vm_pgoff +
731 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
732 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
733 				 vma->anon_vma, vma->vm_file, pgoff,
734 				 new_pol, vma->vm_userfaultfd_ctx);
735 		if (prev) {
736 			vma = prev;
737 			next = vma->vm_next;
738 			if (mpol_equal(vma_policy(vma), new_pol))
739 				continue;
740 			/* vma_merge() joined vma && vma->next, case 8 */
741 			goto replace;
742 		}
743 		if (vma->vm_start != vmstart) {
744 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
745 			if (err)
746 				goto out;
747 		}
748 		if (vma->vm_end != vmend) {
749 			err = split_vma(vma->vm_mm, vma, vmend, 0);
750 			if (err)
751 				goto out;
752 		}
753  replace:
754 		err = vma_replace_policy(vma, new_pol);
755 		if (err)
756 			goto out;
757 	}
758 
759  out:
760 	return err;
761 }
762 
763 /* Set the process memory policy */
764 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
765 			     nodemask_t *nodes)
766 {
767 	struct mempolicy *new, *old;
768 	NODEMASK_SCRATCH(scratch);
769 	int ret;
770 
771 	if (!scratch)
772 		return -ENOMEM;
773 
774 	new = mpol_new(mode, flags, nodes);
775 	if (IS_ERR(new)) {
776 		ret = PTR_ERR(new);
777 		goto out;
778 	}
779 
780 	task_lock(current);
781 	ret = mpol_set_nodemask(new, nodes, scratch);
782 	if (ret) {
783 		task_unlock(current);
784 		mpol_put(new);
785 		goto out;
786 	}
787 	old = current->mempolicy;
788 	current->mempolicy = new;
789 	if (new && new->mode == MPOL_INTERLEAVE)
790 		current->il_prev = MAX_NUMNODES-1;
791 	task_unlock(current);
792 	mpol_put(old);
793 	ret = 0;
794 out:
795 	NODEMASK_SCRATCH_FREE(scratch);
796 	return ret;
797 }
798 
799 /*
800  * Return nodemask for policy for get_mempolicy() query
801  *
802  * Called with task's alloc_lock held
803  */
804 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
805 {
806 	nodes_clear(*nodes);
807 	if (p == &default_policy)
808 		return;
809 
810 	switch (p->mode) {
811 	case MPOL_BIND:
812 		/* Fall through */
813 	case MPOL_INTERLEAVE:
814 		*nodes = p->v.nodes;
815 		break;
816 	case MPOL_PREFERRED:
817 		if (!(p->flags & MPOL_F_LOCAL))
818 			node_set(p->v.preferred_node, *nodes);
819 		/* else return empty node mask for local allocation */
820 		break;
821 	default:
822 		BUG();
823 	}
824 }
825 
826 static int lookup_node(struct mm_struct *mm, unsigned long addr)
827 {
828 	struct page *p;
829 	int err;
830 
831 	int locked = 1;
832 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
833 	if (err >= 0) {
834 		err = page_to_nid(p);
835 		put_page(p);
836 	}
837 	if (locked)
838 		up_read(&mm->mmap_sem);
839 	return err;
840 }
841 
842 /* Retrieve NUMA policy */
843 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
844 			     unsigned long addr, unsigned long flags)
845 {
846 	int err;
847 	struct mm_struct *mm = current->mm;
848 	struct vm_area_struct *vma = NULL;
849 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
850 
851 	if (flags &
852 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
853 		return -EINVAL;
854 
855 	if (flags & MPOL_F_MEMS_ALLOWED) {
856 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
857 			return -EINVAL;
858 		*policy = 0;	/* just so it's initialized */
859 		task_lock(current);
860 		*nmask  = cpuset_current_mems_allowed;
861 		task_unlock(current);
862 		return 0;
863 	}
864 
865 	if (flags & MPOL_F_ADDR) {
866 		/*
867 		 * Do NOT fall back to task policy if the
868 		 * vma/shared policy at addr is NULL.  We
869 		 * want to return MPOL_DEFAULT in this case.
870 		 */
871 		down_read(&mm->mmap_sem);
872 		vma = find_vma_intersection(mm, addr, addr+1);
873 		if (!vma) {
874 			up_read(&mm->mmap_sem);
875 			return -EFAULT;
876 		}
877 		if (vma->vm_ops && vma->vm_ops->get_policy)
878 			pol = vma->vm_ops->get_policy(vma, addr);
879 		else
880 			pol = vma->vm_policy;
881 	} else if (addr)
882 		return -EINVAL;
883 
884 	if (!pol)
885 		pol = &default_policy;	/* indicates default behavior */
886 
887 	if (flags & MPOL_F_NODE) {
888 		if (flags & MPOL_F_ADDR) {
889 			/*
890 			 * Take a refcount on the mpol, lookup_node()
891 			 * wil drop the mmap_sem, so after calling
892 			 * lookup_node() only "pol" remains valid, "vma"
893 			 * is stale.
894 			 */
895 			pol_refcount = pol;
896 			vma = NULL;
897 			mpol_get(pol);
898 			err = lookup_node(mm, addr);
899 			if (err < 0)
900 				goto out;
901 			*policy = err;
902 		} else if (pol == current->mempolicy &&
903 				pol->mode == MPOL_INTERLEAVE) {
904 			*policy = next_node_in(current->il_prev, pol->v.nodes);
905 		} else {
906 			err = -EINVAL;
907 			goto out;
908 		}
909 	} else {
910 		*policy = pol == &default_policy ? MPOL_DEFAULT :
911 						pol->mode;
912 		/*
913 		 * Internal mempolicy flags must be masked off before exposing
914 		 * the policy to userspace.
915 		 */
916 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
917 	}
918 
919 	err = 0;
920 	if (nmask) {
921 		if (mpol_store_user_nodemask(pol)) {
922 			*nmask = pol->w.user_nodemask;
923 		} else {
924 			task_lock(current);
925 			get_policy_nodemask(pol, nmask);
926 			task_unlock(current);
927 		}
928 	}
929 
930  out:
931 	mpol_cond_put(pol);
932 	if (vma)
933 		up_read(&mm->mmap_sem);
934 	if (pol_refcount)
935 		mpol_put(pol_refcount);
936 	return err;
937 }
938 
939 #ifdef CONFIG_MIGRATION
940 /*
941  * page migration, thp tail pages can be passed.
942  */
943 static void migrate_page_add(struct page *page, struct list_head *pagelist,
944 				unsigned long flags)
945 {
946 	struct page *head = compound_head(page);
947 	/*
948 	 * Avoid migrating a page that is shared with others.
949 	 */
950 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
951 		if (!isolate_lru_page(head)) {
952 			list_add_tail(&head->lru, pagelist);
953 			mod_node_page_state(page_pgdat(head),
954 				NR_ISOLATED_ANON + page_is_file_cache(head),
955 				hpage_nr_pages(head));
956 		}
957 	}
958 }
959 
960 /* page allocation callback for NUMA node migration */
961 struct page *alloc_new_node_page(struct page *page, unsigned long node)
962 {
963 	if (PageHuge(page))
964 		return alloc_huge_page_node(page_hstate(compound_head(page)),
965 					node);
966 	else if (PageTransHuge(page)) {
967 		struct page *thp;
968 
969 		thp = alloc_pages_node(node,
970 			(GFP_TRANSHUGE | __GFP_THISNODE),
971 			HPAGE_PMD_ORDER);
972 		if (!thp)
973 			return NULL;
974 		prep_transhuge_page(thp);
975 		return thp;
976 	} else
977 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
978 						    __GFP_THISNODE, 0);
979 }
980 
981 /*
982  * Migrate pages from one node to a target node.
983  * Returns error or the number of pages not migrated.
984  */
985 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
986 			   int flags)
987 {
988 	nodemask_t nmask;
989 	LIST_HEAD(pagelist);
990 	int err = 0;
991 
992 	nodes_clear(nmask);
993 	node_set(source, nmask);
994 
995 	/*
996 	 * This does not "check" the range but isolates all pages that
997 	 * need migration.  Between passing in the full user address
998 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
999 	 */
1000 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1001 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1002 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1003 
1004 	if (!list_empty(&pagelist)) {
1005 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1006 					MIGRATE_SYNC, MR_SYSCALL);
1007 		if (err)
1008 			putback_movable_pages(&pagelist);
1009 	}
1010 
1011 	return err;
1012 }
1013 
1014 /*
1015  * Move pages between the two nodesets so as to preserve the physical
1016  * layout as much as possible.
1017  *
1018  * Returns the number of page that could not be moved.
1019  */
1020 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1021 		     const nodemask_t *to, int flags)
1022 {
1023 	int busy = 0;
1024 	int err;
1025 	nodemask_t tmp;
1026 
1027 	err = migrate_prep();
1028 	if (err)
1029 		return err;
1030 
1031 	down_read(&mm->mmap_sem);
1032 
1033 	/*
1034 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1035 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1036 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1037 	 * The pair of nodemasks 'to' and 'from' define the map.
1038 	 *
1039 	 * If no pair of bits is found that way, fallback to picking some
1040 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1041 	 * 'source' and 'dest' bits are the same, this represents a node
1042 	 * that will be migrating to itself, so no pages need move.
1043 	 *
1044 	 * If no bits are left in 'tmp', or if all remaining bits left
1045 	 * in 'tmp' correspond to the same bit in 'to', return false
1046 	 * (nothing left to migrate).
1047 	 *
1048 	 * This lets us pick a pair of nodes to migrate between, such that
1049 	 * if possible the dest node is not already occupied by some other
1050 	 * source node, minimizing the risk of overloading the memory on a
1051 	 * node that would happen if we migrated incoming memory to a node
1052 	 * before migrating outgoing memory source that same node.
1053 	 *
1054 	 * A single scan of tmp is sufficient.  As we go, we remember the
1055 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1056 	 * that not only moved, but what's better, moved to an empty slot
1057 	 * (d is not set in tmp), then we break out then, with that pair.
1058 	 * Otherwise when we finish scanning from_tmp, we at least have the
1059 	 * most recent <s, d> pair that moved.  If we get all the way through
1060 	 * the scan of tmp without finding any node that moved, much less
1061 	 * moved to an empty node, then there is nothing left worth migrating.
1062 	 */
1063 
1064 	tmp = *from;
1065 	while (!nodes_empty(tmp)) {
1066 		int s,d;
1067 		int source = NUMA_NO_NODE;
1068 		int dest = 0;
1069 
1070 		for_each_node_mask(s, tmp) {
1071 
1072 			/*
1073 			 * do_migrate_pages() tries to maintain the relative
1074 			 * node relationship of the pages established between
1075 			 * threads and memory areas.
1076                          *
1077 			 * However if the number of source nodes is not equal to
1078 			 * the number of destination nodes we can not preserve
1079 			 * this node relative relationship.  In that case, skip
1080 			 * copying memory from a node that is in the destination
1081 			 * mask.
1082 			 *
1083 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1084 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1085 			 */
1086 
1087 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1088 						(node_isset(s, *to)))
1089 				continue;
1090 
1091 			d = node_remap(s, *from, *to);
1092 			if (s == d)
1093 				continue;
1094 
1095 			source = s;	/* Node moved. Memorize */
1096 			dest = d;
1097 
1098 			/* dest not in remaining from nodes? */
1099 			if (!node_isset(dest, tmp))
1100 				break;
1101 		}
1102 		if (source == NUMA_NO_NODE)
1103 			break;
1104 
1105 		node_clear(source, tmp);
1106 		err = migrate_to_node(mm, source, dest, flags);
1107 		if (err > 0)
1108 			busy += err;
1109 		if (err < 0)
1110 			break;
1111 	}
1112 	up_read(&mm->mmap_sem);
1113 	if (err < 0)
1114 		return err;
1115 	return busy;
1116 
1117 }
1118 
1119 /*
1120  * Allocate a new page for page migration based on vma policy.
1121  * Start by assuming the page is mapped by the same vma as contains @start.
1122  * Search forward from there, if not.  N.B., this assumes that the
1123  * list of pages handed to migrate_pages()--which is how we get here--
1124  * is in virtual address order.
1125  */
1126 static struct page *new_page(struct page *page, unsigned long start)
1127 {
1128 	struct vm_area_struct *vma;
1129 	unsigned long uninitialized_var(address);
1130 
1131 	vma = find_vma(current->mm, start);
1132 	while (vma) {
1133 		address = page_address_in_vma(page, vma);
1134 		if (address != -EFAULT)
1135 			break;
1136 		vma = vma->vm_next;
1137 	}
1138 
1139 	if (PageHuge(page)) {
1140 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1141 				vma, address);
1142 	} else if (PageTransHuge(page)) {
1143 		struct page *thp;
1144 
1145 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1146 					 HPAGE_PMD_ORDER);
1147 		if (!thp)
1148 			return NULL;
1149 		prep_transhuge_page(thp);
1150 		return thp;
1151 	}
1152 	/*
1153 	 * if !vma, alloc_page_vma() will use task or system default policy
1154 	 */
1155 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1156 			vma, address);
1157 }
1158 #else
1159 
/*
 * Stub used when the preprocessor condition guarding the real
 * implementation is false: nothing is queued for migration.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags)
{
	/* Intentionally empty. */
}
1164 
1165 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1166 		     const nodemask_t *to, int flags)
1167 {
1168 	return -ENOSYS;
1169 }
1170 
1171 static struct page *new_page(struct page *page, unsigned long start)
1172 {
1173 	return NULL;
1174 }
1175 #endif
1176 
1177 static long do_mbind(unsigned long start, unsigned long len,
1178 		     unsigned short mode, unsigned short mode_flags,
1179 		     nodemask_t *nmask, unsigned long flags)
1180 {
1181 	struct mm_struct *mm = current->mm;
1182 	struct mempolicy *new;
1183 	unsigned long end;
1184 	int err;
1185 	LIST_HEAD(pagelist);
1186 
1187 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1188 		return -EINVAL;
1189 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1190 		return -EPERM;
1191 
1192 	if (start & ~PAGE_MASK)
1193 		return -EINVAL;
1194 
1195 	if (mode == MPOL_DEFAULT)
1196 		flags &= ~MPOL_MF_STRICT;
1197 
1198 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1199 	end = start + len;
1200 
1201 	if (end < start)
1202 		return -EINVAL;
1203 	if (end == start)
1204 		return 0;
1205 
1206 	new = mpol_new(mode, mode_flags, nmask);
1207 	if (IS_ERR(new))
1208 		return PTR_ERR(new);
1209 
1210 	if (flags & MPOL_MF_LAZY)
1211 		new->flags |= MPOL_F_MOF;
1212 
1213 	/*
1214 	 * If we are using the default policy then operation
1215 	 * on discontinuous address spaces is okay after all
1216 	 */
1217 	if (!new)
1218 		flags |= MPOL_MF_DISCONTIG_OK;
1219 
1220 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1221 		 start, start + len, mode, mode_flags,
1222 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1223 
1224 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1225 
1226 		err = migrate_prep();
1227 		if (err)
1228 			goto mpol_out;
1229 	}
1230 	{
1231 		NODEMASK_SCRATCH(scratch);
1232 		if (scratch) {
1233 			down_write(&mm->mmap_sem);
1234 			task_lock(current);
1235 			err = mpol_set_nodemask(new, nmask, scratch);
1236 			task_unlock(current);
1237 			if (err)
1238 				up_write(&mm->mmap_sem);
1239 		} else
1240 			err = -ENOMEM;
1241 		NODEMASK_SCRATCH_FREE(scratch);
1242 	}
1243 	if (err)
1244 		goto mpol_out;
1245 
1246 	err = queue_pages_range(mm, start, end, nmask,
1247 			  flags | MPOL_MF_INVERT, &pagelist);
1248 	if (!err)
1249 		err = mbind_range(mm, start, end, new);
1250 
1251 	if (!err) {
1252 		int nr_failed = 0;
1253 
1254 		if (!list_empty(&pagelist)) {
1255 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1256 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1257 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1258 			if (nr_failed)
1259 				putback_movable_pages(&pagelist);
1260 		}
1261 
1262 		if (nr_failed && (flags & MPOL_MF_STRICT))
1263 			err = -EIO;
1264 	} else
1265 		putback_movable_pages(&pagelist);
1266 
1267 	up_write(&mm->mmap_sem);
1268  mpol_out:
1269 	mpol_put(new);
1270 	return err;
1271 }
1272 
1273 /*
1274  * User space interface with variable sized bitmaps for nodelists.
1275  */
1276 
1277 /* Copy a node mask from user space. */
1278 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1279 		     unsigned long maxnode)
1280 {
1281 	unsigned long k;
1282 	unsigned long t;
1283 	unsigned long nlongs;
1284 	unsigned long endmask;
1285 
1286 	--maxnode;
1287 	nodes_clear(*nodes);
1288 	if (maxnode == 0 || !nmask)
1289 		return 0;
1290 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1291 		return -EINVAL;
1292 
1293 	nlongs = BITS_TO_LONGS(maxnode);
1294 	if ((maxnode % BITS_PER_LONG) == 0)
1295 		endmask = ~0UL;
1296 	else
1297 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1298 
1299 	/*
1300 	 * When the user specified more nodes than supported just check
1301 	 * if the non supported part is all zero.
1302 	 *
1303 	 * If maxnode have more longs than MAX_NUMNODES, check
1304 	 * the bits in that area first. And then go through to
1305 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1306 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1307 	 */
1308 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1309 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1310 			if (get_user(t, nmask + k))
1311 				return -EFAULT;
1312 			if (k == nlongs - 1) {
1313 				if (t & endmask)
1314 					return -EINVAL;
1315 			} else if (t)
1316 				return -EINVAL;
1317 		}
1318 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1319 		endmask = ~0UL;
1320 	}
1321 
1322 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1323 		unsigned long valid_mask = endmask;
1324 
1325 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1326 		if (get_user(t, nmask + nlongs - 1))
1327 			return -EFAULT;
1328 		if (t & valid_mask)
1329 			return -EINVAL;
1330 	}
1331 
1332 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1333 		return -EFAULT;
1334 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1335 	return 0;
1336 }
1337 
1338 /* Copy a kernel node mask to user space */
1339 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1340 			      nodemask_t *nodes)
1341 {
1342 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1343 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1344 
1345 	if (copy > nbytes) {
1346 		if (copy > PAGE_SIZE)
1347 			return -EINVAL;
1348 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1349 			return -EFAULT;
1350 		copy = nbytes;
1351 	}
1352 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1353 }
1354 
1355 static long kernel_mbind(unsigned long start, unsigned long len,
1356 			 unsigned long mode, const unsigned long __user *nmask,
1357 			 unsigned long maxnode, unsigned int flags)
1358 {
1359 	nodemask_t nodes;
1360 	int err;
1361 	unsigned short mode_flags;
1362 
1363 	mode_flags = mode & MPOL_MODE_FLAGS;
1364 	mode &= ~MPOL_MODE_FLAGS;
1365 	if (mode >= MPOL_MAX)
1366 		return -EINVAL;
1367 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1368 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1369 		return -EINVAL;
1370 	err = get_nodes(&nodes, nmask, maxnode);
1371 	if (err)
1372 		return err;
1373 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1374 }
1375 
1376 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1377 		unsigned long, mode, const unsigned long __user *, nmask,
1378 		unsigned long, maxnode, unsigned int, flags)
1379 {
1380 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1381 }
1382 
1383 /* Set the process memory policy */
1384 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1385 				 unsigned long maxnode)
1386 {
1387 	int err;
1388 	nodemask_t nodes;
1389 	unsigned short flags;
1390 
1391 	flags = mode & MPOL_MODE_FLAGS;
1392 	mode &= ~MPOL_MODE_FLAGS;
1393 	if ((unsigned int)mode >= MPOL_MAX)
1394 		return -EINVAL;
1395 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1396 		return -EINVAL;
1397 	err = get_nodes(&nodes, nmask, maxnode);
1398 	if (err)
1399 		return err;
1400 	return do_set_mempolicy(mode, flags, &nodes);
1401 }
1402 
1403 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1404 		unsigned long, maxnode)
1405 {
1406 	return kernel_set_mempolicy(mode, nmask, maxnode);
1407 }
1408 
1409 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1410 				const unsigned long __user *old_nodes,
1411 				const unsigned long __user *new_nodes)
1412 {
1413 	struct mm_struct *mm = NULL;
1414 	struct task_struct *task;
1415 	nodemask_t task_nodes;
1416 	int err;
1417 	nodemask_t *old;
1418 	nodemask_t *new;
1419 	NODEMASK_SCRATCH(scratch);
1420 
1421 	if (!scratch)
1422 		return -ENOMEM;
1423 
1424 	old = &scratch->mask1;
1425 	new = &scratch->mask2;
1426 
1427 	err = get_nodes(old, old_nodes, maxnode);
1428 	if (err)
1429 		goto out;
1430 
1431 	err = get_nodes(new, new_nodes, maxnode);
1432 	if (err)
1433 		goto out;
1434 
1435 	/* Find the mm_struct */
1436 	rcu_read_lock();
1437 	task = pid ? find_task_by_vpid(pid) : current;
1438 	if (!task) {
1439 		rcu_read_unlock();
1440 		err = -ESRCH;
1441 		goto out;
1442 	}
1443 	get_task_struct(task);
1444 
1445 	err = -EINVAL;
1446 
1447 	/*
1448 	 * Check if this process has the right to modify the specified process.
1449 	 * Use the regular "ptrace_may_access()" checks.
1450 	 */
1451 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1452 		rcu_read_unlock();
1453 		err = -EPERM;
1454 		goto out_put;
1455 	}
1456 	rcu_read_unlock();
1457 
1458 	task_nodes = cpuset_mems_allowed(task);
1459 	/* Is the user allowed to access the target nodes? */
1460 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1461 		err = -EPERM;
1462 		goto out_put;
1463 	}
1464 
1465 	task_nodes = cpuset_mems_allowed(current);
1466 	nodes_and(*new, *new, task_nodes);
1467 	if (nodes_empty(*new))
1468 		goto out_put;
1469 
1470 	nodes_and(*new, *new, node_states[N_MEMORY]);
1471 	if (nodes_empty(*new))
1472 		goto out_put;
1473 
1474 	err = security_task_movememory(task);
1475 	if (err)
1476 		goto out_put;
1477 
1478 	mm = get_task_mm(task);
1479 	put_task_struct(task);
1480 
1481 	if (!mm) {
1482 		err = -EINVAL;
1483 		goto out;
1484 	}
1485 
1486 	err = do_migrate_pages(mm, old, new,
1487 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1488 
1489 	mmput(mm);
1490 out:
1491 	NODEMASK_SCRATCH_FREE(scratch);
1492 
1493 	return err;
1494 
1495 out_put:
1496 	put_task_struct(task);
1497 	goto out;
1498 
1499 }
1500 
1501 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1502 		const unsigned long __user *, old_nodes,
1503 		const unsigned long __user *, new_nodes)
1504 {
1505 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1506 }
1507 
1508 
1509 /* Retrieve NUMA policy */
1510 static int kernel_get_mempolicy(int __user *policy,
1511 				unsigned long __user *nmask,
1512 				unsigned long maxnode,
1513 				unsigned long addr,
1514 				unsigned long flags)
1515 {
1516 	int err;
1517 	int uninitialized_var(pval);
1518 	nodemask_t nodes;
1519 
1520 	if (nmask != NULL && maxnode < nr_node_ids)
1521 		return -EINVAL;
1522 
1523 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1524 
1525 	if (err)
1526 		return err;
1527 
1528 	if (policy && put_user(pval, policy))
1529 		return -EFAULT;
1530 
1531 	if (nmask)
1532 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1533 
1534 	return err;
1535 }
1536 
1537 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1538 		unsigned long __user *, nmask, unsigned long, maxnode,
1539 		unsigned long, addr, unsigned long, flags)
1540 {
1541 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1542 }
1543 
1544 #ifdef CONFIG_COMPAT
1545 
1546 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1547 		       compat_ulong_t __user *, nmask,
1548 		       compat_ulong_t, maxnode,
1549 		       compat_ulong_t, addr, compat_ulong_t, flags)
1550 {
1551 	long err;
1552 	unsigned long __user *nm = NULL;
1553 	unsigned long nr_bits, alloc_size;
1554 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1555 
1556 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1557 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1558 
1559 	if (nmask)
1560 		nm = compat_alloc_user_space(alloc_size);
1561 
1562 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1563 
1564 	if (!err && nmask) {
1565 		unsigned long copy_size;
1566 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1567 		err = copy_from_user(bm, nm, copy_size);
1568 		/* ensure entire bitmap is zeroed */
1569 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1570 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1571 	}
1572 
1573 	return err;
1574 }
1575 
1576 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1577 		       compat_ulong_t, maxnode)
1578 {
1579 	unsigned long __user *nm = NULL;
1580 	unsigned long nr_bits, alloc_size;
1581 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1582 
1583 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1584 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1585 
1586 	if (nmask) {
1587 		if (compat_get_bitmap(bm, nmask, nr_bits))
1588 			return -EFAULT;
1589 		nm = compat_alloc_user_space(alloc_size);
1590 		if (copy_to_user(nm, bm, alloc_size))
1591 			return -EFAULT;
1592 	}
1593 
1594 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1595 }
1596 
1597 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1598 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1599 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1600 {
1601 	unsigned long __user *nm = NULL;
1602 	unsigned long nr_bits, alloc_size;
1603 	nodemask_t bm;
1604 
1605 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1606 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1607 
1608 	if (nmask) {
1609 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1610 			return -EFAULT;
1611 		nm = compat_alloc_user_space(alloc_size);
1612 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1613 			return -EFAULT;
1614 	}
1615 
1616 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1617 }
1618 
1619 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1620 		       compat_ulong_t, maxnode,
1621 		       const compat_ulong_t __user *, old_nodes,
1622 		       const compat_ulong_t __user *, new_nodes)
1623 {
1624 	unsigned long __user *old = NULL;
1625 	unsigned long __user *new = NULL;
1626 	nodemask_t tmp_mask;
1627 	unsigned long nr_bits;
1628 	unsigned long size;
1629 
1630 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1631 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1632 	if (old_nodes) {
1633 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1634 			return -EFAULT;
1635 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1636 		if (new_nodes)
1637 			new = old + size / sizeof(unsigned long);
1638 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1639 			return -EFAULT;
1640 	}
1641 	if (new_nodes) {
1642 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1643 			return -EFAULT;
1644 		if (new == NULL)
1645 			new = compat_alloc_user_space(size);
1646 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1647 			return -EFAULT;
1648 	}
1649 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1650 }
1651 
1652 #endif /* CONFIG_COMPAT */
1653 
1654 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1655 						unsigned long addr)
1656 {
1657 	struct mempolicy *pol = NULL;
1658 
1659 	if (vma) {
1660 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1661 			pol = vma->vm_ops->get_policy(vma, addr);
1662 		} else if (vma->vm_policy) {
1663 			pol = vma->vm_policy;
1664 
1665 			/*
1666 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1667 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1668 			 * count on these policies which will be dropped by
1669 			 * mpol_cond_put() later
1670 			 */
1671 			if (mpol_needs_cond_ref(pol))
1672 				mpol_get(pol);
1673 		}
1674 	}
1675 
1676 	return pol;
1677 }
1678 
1679 /*
1680  * get_vma_policy(@vma, @addr)
1681  * @vma: virtual memory area whose policy is sought
1682  * @addr: address in @vma for shared policy lookup
1683  *
1684  * Returns effective policy for a VMA at specified address.
1685  * Falls back to current->mempolicy or system default policy, as necessary.
1686  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1687  * count--added by the get_policy() vm_op, as appropriate--to protect against
1688  * freeing by another task.  It is the caller's responsibility to free the
1689  * extra reference for shared policies.
1690  */
1691 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1692 						unsigned long addr)
1693 {
1694 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1695 
1696 	if (!pol)
1697 		pol = get_task_policy(current);
1698 
1699 	return pol;
1700 }
1701 
1702 bool vma_policy_mof(struct vm_area_struct *vma)
1703 {
1704 	struct mempolicy *pol;
1705 
1706 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1707 		bool ret = false;
1708 
1709 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1710 		if (pol && (pol->flags & MPOL_F_MOF))
1711 			ret = true;
1712 		mpol_cond_put(pol);
1713 
1714 		return ret;
1715 	}
1716 
1717 	pol = vma->vm_policy;
1718 	if (!pol)
1719 		pol = get_task_policy(current);
1720 
1721 	return pol->flags & MPOL_F_MOF;
1722 }
1723 
1724 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1725 {
1726 	enum zone_type dynamic_policy_zone = policy_zone;
1727 
1728 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1729 
1730 	/*
1731 	 * if policy->v.nodes has movable memory only,
1732 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1733 	 *
1734 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1735 	 * so if the following test faile, it implies
1736 	 * policy->v.nodes has movable memory only.
1737 	 */
1738 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1739 		dynamic_policy_zone = ZONE_MOVABLE;
1740 
1741 	return zone >= dynamic_policy_zone;
1742 }
1743 
1744 /*
1745  * Return a nodemask representing a mempolicy for filtering nodes for
1746  * page allocation
1747  */
1748 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1749 {
1750 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1751 	if (unlikely(policy->mode == MPOL_BIND) &&
1752 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1753 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1754 		return &policy->v.nodes;
1755 
1756 	return NULL;
1757 }
1758 
1759 /* Return the node id preferred by the given mempolicy, or the given id */
1760 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1761 								int nd)
1762 {
1763 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1764 		nd = policy->v.preferred_node;
1765 	else {
1766 		/*
1767 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1768 		 * because we might easily break the expectation to stay on the
1769 		 * requested node and not break the policy.
1770 		 */
1771 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1772 	}
1773 
1774 	return nd;
1775 }
1776 
1777 /* Do dynamic interleaving for a process */
1778 static unsigned interleave_nodes(struct mempolicy *policy)
1779 {
1780 	unsigned next;
1781 	struct task_struct *me = current;
1782 
1783 	next = next_node_in(me->il_prev, policy->v.nodes);
1784 	if (next < MAX_NUMNODES)
1785 		me->il_prev = next;
1786 	return next;
1787 }
1788 
1789 /*
1790  * Depending on the memory policy provide a node from which to allocate the
1791  * next slab entry.
1792  */
1793 unsigned int mempolicy_slab_node(void)
1794 {
1795 	struct mempolicy *policy;
1796 	int node = numa_mem_id();
1797 
1798 	if (in_interrupt())
1799 		return node;
1800 
1801 	policy = current->mempolicy;
1802 	if (!policy || policy->flags & MPOL_F_LOCAL)
1803 		return node;
1804 
1805 	switch (policy->mode) {
1806 	case MPOL_PREFERRED:
1807 		/*
1808 		 * handled MPOL_F_LOCAL above
1809 		 */
1810 		return policy->v.preferred_node;
1811 
1812 	case MPOL_INTERLEAVE:
1813 		return interleave_nodes(policy);
1814 
1815 	case MPOL_BIND: {
1816 		struct zoneref *z;
1817 
1818 		/*
1819 		 * Follow bind policy behavior and start allocation at the
1820 		 * first node.
1821 		 */
1822 		struct zonelist *zonelist;
1823 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1824 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1825 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1826 							&policy->v.nodes);
1827 		return z->zone ? zone_to_nid(z->zone) : node;
1828 	}
1829 
1830 	default:
1831 		BUG();
1832 	}
1833 }
1834 
1835 /*
1836  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1837  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1838  * number of present nodes.
1839  */
1840 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1841 {
1842 	unsigned nnodes = nodes_weight(pol->v.nodes);
1843 	unsigned target;
1844 	int i;
1845 	int nid;
1846 
1847 	if (!nnodes)
1848 		return numa_node_id();
1849 	target = (unsigned int)n % nnodes;
1850 	nid = first_node(pol->v.nodes);
1851 	for (i = 0; i < target; i++)
1852 		nid = next_node(nid, pol->v.nodes);
1853 	return nid;
1854 }
1855 
1856 /* Determine a node number for interleave */
1857 static inline unsigned interleave_nid(struct mempolicy *pol,
1858 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1859 {
1860 	if (vma) {
1861 		unsigned long off;
1862 
1863 		/*
1864 		 * for small pages, there is no difference between
1865 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1866 		 * for huge pages, since vm_pgoff is in units of small
1867 		 * pages, we need to shift off the always 0 bits to get
1868 		 * a useful offset.
1869 		 */
1870 		BUG_ON(shift < PAGE_SHIFT);
1871 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1872 		off += (addr - vma->vm_start) >> shift;
1873 		return offset_il_node(pol, off);
1874 	} else
1875 		return interleave_nodes(pol);
1876 }
1877 
1878 #ifdef CONFIG_HUGETLBFS
1879 /*
1880  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1881  * @vma: virtual memory area whose policy is sought
1882  * @addr: address in @vma for shared policy lookup and interleave policy
1883  * @gfp_flags: for requested zone
1884  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1885  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1886  *
1887  * Returns a nid suitable for a huge page allocation and a pointer
1888  * to the struct mempolicy for conditional unref after allocation.
1889  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1890  * @nodemask for filtering the zonelist.
1891  *
1892  * Must be protected by read_mems_allowed_begin()
1893  */
1894 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1895 				struct mempolicy **mpol, nodemask_t **nodemask)
1896 {
1897 	int nid;
1898 
1899 	*mpol = get_vma_policy(vma, addr);
1900 	*nodemask = NULL;	/* assume !MPOL_BIND */
1901 
1902 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1903 		nid = interleave_nid(*mpol, vma, addr,
1904 					huge_page_shift(hstate_vma(vma)));
1905 	} else {
1906 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1907 		if ((*mpol)->mode == MPOL_BIND)
1908 			*nodemask = &(*mpol)->v.nodes;
1909 	}
1910 	return nid;
1911 }
1912 
1913 /*
1914  * init_nodemask_of_mempolicy
1915  *
1916  * If the current task's mempolicy is "default" [NULL], return 'false'
1917  * to indicate default policy.  Otherwise, extract the policy nodemask
1918  * for 'bind' or 'interleave' policy into the argument nodemask, or
1919  * initialize the argument nodemask to contain the single node for
1920  * 'preferred' or 'local' policy and return 'true' to indicate presence
1921  * of non-default mempolicy.
1922  *
1923  * We don't bother with reference counting the mempolicy [mpol_get/put]
1924  * because the current task is examining it's own mempolicy and a task's
1925  * mempolicy is only ever changed by the task itself.
1926  *
1927  * N.B., it is the caller's responsibility to free a returned nodemask.
1928  */
1929 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1930 {
1931 	struct mempolicy *mempolicy;
1932 	int nid;
1933 
1934 	if (!(mask && current->mempolicy))
1935 		return false;
1936 
1937 	task_lock(current);
1938 	mempolicy = current->mempolicy;
1939 	switch (mempolicy->mode) {
1940 	case MPOL_PREFERRED:
1941 		if (mempolicy->flags & MPOL_F_LOCAL)
1942 			nid = numa_node_id();
1943 		else
1944 			nid = mempolicy->v.preferred_node;
1945 		init_nodemask_of_node(mask, nid);
1946 		break;
1947 
1948 	case MPOL_BIND:
1949 		/* Fall through */
1950 	case MPOL_INTERLEAVE:
1951 		*mask =  mempolicy->v.nodes;
1952 		break;
1953 
1954 	default:
1955 		BUG();
1956 	}
1957 	task_unlock(current);
1958 
1959 	return true;
1960 }
1961 #endif
1962 
1963 /*
1964  * mempolicy_nodemask_intersects
1965  *
1966  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1967  * policy.  Otherwise, check for intersection between mask and the policy
1968  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1969  * policy, always return true since it may allocate elsewhere on fallback.
1970  *
1971  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1972  */
1973 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1974 					const nodemask_t *mask)
1975 {
1976 	struct mempolicy *mempolicy;
1977 	bool ret = true;
1978 
1979 	if (!mask)
1980 		return ret;
1981 	task_lock(tsk);
1982 	mempolicy = tsk->mempolicy;
1983 	if (!mempolicy)
1984 		goto out;
1985 
1986 	switch (mempolicy->mode) {
1987 	case MPOL_PREFERRED:
1988 		/*
1989 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1990 		 * allocate from, they may fallback to other nodes when oom.
1991 		 * Thus, it's possible for tsk to have allocated memory from
1992 		 * nodes in mask.
1993 		 */
1994 		break;
1995 	case MPOL_BIND:
1996 	case MPOL_INTERLEAVE:
1997 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1998 		break;
1999 	default:
2000 		BUG();
2001 	}
2002 out:
2003 	task_unlock(tsk);
2004 	return ret;
2005 }
2006 
2007 /* Allocate a page in interleaved policy.
2008    Own path because it needs to do special accounting. */
2009 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2010 					unsigned nid)
2011 {
2012 	struct page *page;
2013 
2014 	page = __alloc_pages(gfp, order, nid);
2015 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2016 	if (!static_branch_likely(&vm_numa_stat_key))
2017 		return page;
2018 	if (page && page_to_nid(page) == nid) {
2019 		preempt_disable();
2020 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2021 		preempt_enable();
2022 	}
2023 	return page;
2024 }
2025 
2026 /**
2027  * 	alloc_pages_vma	- Allocate a page for a VMA.
2028  *
2029  * 	@gfp:
2030  *      %GFP_USER    user allocation.
2031  *      %GFP_KERNEL  kernel allocations,
2032  *      %GFP_HIGHMEM highmem/user allocations,
2033  *      %GFP_FS      allocation should not call back into a file system.
2034  *      %GFP_ATOMIC  don't sleep.
2035  *
2036  *	@order:Order of the GFP allocation.
2037  * 	@vma:  Pointer to VMA or NULL if not available.
2038  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2039  *	@node: Which node to prefer for allocation (modulo policy).
2040  *	@hugepage: for hugepages try only the preferred node if possible
2041  *
2042  * 	This function allocates a page from the kernel page pool and applies
2043  *	a NUMA policy associated with the VMA or the current process.
2044  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2045  *	mm_struct of the VMA to prevent it from going away. Should be used for
2046  *	all allocations for pages that will be mapped into user space. Returns
2047  *	NULL when no page can be allocated.
2048  */
2049 struct page *
2050 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2051 		unsigned long addr, int node, bool hugepage)
2052 {
2053 	struct mempolicy *pol;
2054 	struct page *page;
2055 	int preferred_nid;
2056 	nodemask_t *nmask;
2057 
2058 	pol = get_vma_policy(vma, addr);
2059 
2060 	if (pol->mode == MPOL_INTERLEAVE) {
2061 		unsigned nid;
2062 
2063 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2064 		mpol_cond_put(pol);
2065 		page = alloc_page_interleave(gfp, order, nid);
2066 		goto out;
2067 	}
2068 
2069 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2070 		int hpage_node = node;
2071 
2072 		/*
2073 		 * For hugepage allocation and non-interleave policy which
2074 		 * allows the current node (or other explicitly preferred
2075 		 * node) we only try to allocate from the current/preferred
2076 		 * node and don't fall back to other nodes, as the cost of
2077 		 * remote accesses would likely offset THP benefits.
2078 		 *
2079 		 * If the policy is interleave, or does not allow the current
2080 		 * node in its nodemask, we allocate the standard way.
2081 		 */
2082 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2083 			hpage_node = pol->v.preferred_node;
2084 
2085 		nmask = policy_nodemask(gfp, pol);
2086 		if (!nmask || node_isset(hpage_node, *nmask)) {
2087 			mpol_cond_put(pol);
2088 			page = __alloc_pages_node(hpage_node,
2089 						gfp | __GFP_THISNODE, order);
2090 			goto out;
2091 		}
2092 	}
2093 
2094 	nmask = policy_nodemask(gfp, pol);
2095 	preferred_nid = policy_node(gfp, pol, node);
2096 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2097 	mpol_cond_put(pol);
2098 out:
2099 	return page;
2100 }
2101 EXPORT_SYMBOL(alloc_pages_vma);
2102 
2103 /**
2104  * 	alloc_pages_current - Allocate pages.
2105  *
2106  *	@gfp:
2107  *		%GFP_USER   user allocation,
2108  *      	%GFP_KERNEL kernel allocation,
2109  *      	%GFP_HIGHMEM highmem allocation,
2110  *      	%GFP_FS     don't call back into a file system.
2111  *      	%GFP_ATOMIC don't sleep.
2112  *	@order: Power of two of allocation size in pages. 0 is a single page.
2113  *
2114  *	Allocate a page from the kernel page pool.  When not in
2115  *	interrupt context and apply the current process NUMA policy.
2116  *	Returns NULL when no page can be allocated.
2117  */
2118 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2119 {
2120 	struct mempolicy *pol = &default_policy;
2121 	struct page *page;
2122 
2123 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2124 		pol = get_task_policy(current);
2125 
2126 	/*
2127 	 * No reference counting needed for current->mempolicy
2128 	 * nor system default_policy
2129 	 */
2130 	if (pol->mode == MPOL_INTERLEAVE)
2131 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2132 	else
2133 		page = __alloc_pages_nodemask(gfp, order,
2134 				policy_node(gfp, pol, numa_node_id()),
2135 				policy_nodemask(gfp, pol));
2136 
2137 	return page;
2138 }
2139 EXPORT_SYMBOL(alloc_pages_current);
2140 
2141 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2142 {
2143 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2144 
2145 	if (IS_ERR(pol))
2146 		return PTR_ERR(pol);
2147 	dst->vm_policy = pol;
2148 	return 0;
2149 }
2150 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2161 
2162 /* Slow path of a mempolicy duplicate */
2163 struct mempolicy *__mpol_dup(struct mempolicy *old)
2164 {
2165 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2166 
2167 	if (!new)
2168 		return ERR_PTR(-ENOMEM);
2169 
2170 	/* task's mempolicy is protected by alloc_lock */
2171 	if (old == current->mempolicy) {
2172 		task_lock(current);
2173 		*new = *old;
2174 		task_unlock(current);
2175 	} else
2176 		*new = *old;
2177 
2178 	if (current_cpuset_is_being_rebound()) {
2179 		nodemask_t mems = cpuset_mems_allowed(current);
2180 		mpol_rebind_policy(new, &mems);
2181 	}
2182 	atomic_set(&new->refcnt, 1);
2183 	return new;
2184 }
2185 
2186 /* Slow path of a mempolicy comparison */
2187 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2188 {
2189 	if (!a || !b)
2190 		return false;
2191 	if (a->mode != b->mode)
2192 		return false;
2193 	if (a->flags != b->flags)
2194 		return false;
2195 	if (mpol_store_user_nodemask(a))
2196 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2197 			return false;
2198 
2199 	switch (a->mode) {
2200 	case MPOL_BIND:
2201 		/* Fall through */
2202 	case MPOL_INTERLEAVE:
2203 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2204 	case MPOL_PREFERRED:
2205 		/* a's ->flags is the same as b's */
2206 		if (a->flags & MPOL_F_LOCAL)
2207 			return true;
2208 		return a->v.preferred_node == b->v.preferred_node;
2209 	default:
2210 		BUG();
2211 		return false;
2212 	}
2213 }
2214 
2215 /*
2216  * Shared memory backing store policy support.
2217  *
2218  * Remember policies even when nobody has shared memory mapped.
2219  * The policies are kept in Red-Black tree linked from the inode.
2220  * They are protected by the sp->lock rwlock, which should be held
2221  * for any accesses to the tree.
2222  */
2223 
2224 /*
2225  * lookup first element intersecting start-end.  Caller holds sp->lock for
2226  * reading or for writing
2227  */
2228 static struct sp_node *
2229 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2230 {
2231 	struct rb_node *n = sp->root.rb_node;
2232 
2233 	while (n) {
2234 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2235 
2236 		if (start >= p->end)
2237 			n = n->rb_right;
2238 		else if (end <= p->start)
2239 			n = n->rb_left;
2240 		else
2241 			break;
2242 	}
2243 	if (!n)
2244 		return NULL;
2245 	for (;;) {
2246 		struct sp_node *w = NULL;
2247 		struct rb_node *prev = rb_prev(n);
2248 		if (!prev)
2249 			break;
2250 		w = rb_entry(prev, struct sp_node, nd);
2251 		if (w->end <= start)
2252 			break;
2253 		n = prev;
2254 	}
2255 	return rb_entry(n, struct sp_node, nd);
2256 }
2257 
2258 /*
2259  * Insert a new shared policy into the list.  Caller holds sp->lock for
2260  * writing.
2261  */
2262 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2263 {
2264 	struct rb_node **p = &sp->root.rb_node;
2265 	struct rb_node *parent = NULL;
2266 	struct sp_node *nd;
2267 
2268 	while (*p) {
2269 		parent = *p;
2270 		nd = rb_entry(parent, struct sp_node, nd);
2271 		if (new->start < nd->start)
2272 			p = &(*p)->rb_left;
2273 		else if (new->end > nd->end)
2274 			p = &(*p)->rb_right;
2275 		else
2276 			BUG();
2277 	}
2278 	rb_link_node(&new->nd, parent, p);
2279 	rb_insert_color(&new->nd, &sp->root);
2280 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2281 		 new->policy ? new->policy->mode : 0);
2282 }
2283 
2284 /* Find shared policy intersecting idx */
2285 struct mempolicy *
2286 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2287 {
2288 	struct mempolicy *pol = NULL;
2289 	struct sp_node *sn;
2290 
2291 	if (!sp->root.rb_node)
2292 		return NULL;
2293 	read_lock(&sp->lock);
2294 	sn = sp_lookup(sp, idx, idx+1);
2295 	if (sn) {
2296 		mpol_get(sn->policy);
2297 		pol = sn->policy;
2298 	}
2299 	read_unlock(&sp->lock);
2300 	return pol;
2301 }
2302 
2303 static void sp_free(struct sp_node *n)
2304 {
2305 	mpol_put(n->policy);
2306 	kmem_cache_free(sn_cache, n);
2307 }
2308 
2309 /**
2310  * mpol_misplaced - check whether current page node is valid in policy
2311  *
2312  * @page: page to be checked
2313  * @vma: vm area where page mapped
2314  * @addr: virtual address where page mapped
2315  *
2316  * Lookup current policy node id for vma,addr and "compare to" page's
2317  * node id.
2318  *
2319  * Returns:
2320  *	-1	- not misplaced, page is in the right node
2321  *	node	- node id where the page should be
2322  *
2323  * Policy determination "mimics" alloc_page_vma().
2324  * Called from fault path where we know the vma and faulting address.
2325  */
2326 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2327 {
2328 	struct mempolicy *pol;
2329 	struct zoneref *z;
2330 	int curnid = page_to_nid(page);
2331 	unsigned long pgoff;
2332 	int thiscpu = raw_smp_processor_id();
2333 	int thisnid = cpu_to_node(thiscpu);
2334 	int polnid = NUMA_NO_NODE;
2335 	int ret = -1;
2336 
2337 	pol = get_vma_policy(vma, addr);
2338 	if (!(pol->flags & MPOL_F_MOF))
2339 		goto out;
2340 
2341 	switch (pol->mode) {
2342 	case MPOL_INTERLEAVE:
2343 		pgoff = vma->vm_pgoff;
2344 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2345 		polnid = offset_il_node(pol, pgoff);
2346 		break;
2347 
2348 	case MPOL_PREFERRED:
2349 		if (pol->flags & MPOL_F_LOCAL)
2350 			polnid = numa_node_id();
2351 		else
2352 			polnid = pol->v.preferred_node;
2353 		break;
2354 
2355 	case MPOL_BIND:
2356 
2357 		/*
2358 		 * allows binding to multiple nodes.
2359 		 * use current page if in policy nodemask,
2360 		 * else select nearest allowed node, if any.
2361 		 * If no allowed nodes, use current [!misplaced].
2362 		 */
2363 		if (node_isset(curnid, pol->v.nodes))
2364 			goto out;
2365 		z = first_zones_zonelist(
2366 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2367 				gfp_zone(GFP_HIGHUSER),
2368 				&pol->v.nodes);
2369 		polnid = zone_to_nid(z->zone);
2370 		break;
2371 
2372 	default:
2373 		BUG();
2374 	}
2375 
2376 	/* Migrate the page towards the node whose CPU is referencing it */
2377 	if (pol->flags & MPOL_F_MORON) {
2378 		polnid = thisnid;
2379 
2380 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2381 			goto out;
2382 	}
2383 
2384 	if (curnid != polnid)
2385 		ret = polnid;
2386 out:
2387 	mpol_cond_put(pol);
2388 
2389 	return ret;
2390 }
2391 
2392 /*
2393  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2394  * dropped after task->mempolicy is set to NULL so that any allocation done as
2395  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2396  * policy.
2397  */
2398 void mpol_put_task_policy(struct task_struct *task)
2399 {
2400 	struct mempolicy *pol;
2401 
2402 	task_lock(task);
2403 	pol = task->mempolicy;
2404 	task->mempolicy = NULL;
2405 	task_unlock(task);
2406 	mpol_put(pol);
2407 }
2408 
2409 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2410 {
2411 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2412 	rb_erase(&n->nd, &sp->root);
2413 	sp_free(n);
2414 }
2415 
2416 static void sp_node_init(struct sp_node *node, unsigned long start,
2417 			unsigned long end, struct mempolicy *pol)
2418 {
2419 	node->start = start;
2420 	node->end = end;
2421 	node->policy = pol;
2422 }
2423 
2424 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2425 				struct mempolicy *pol)
2426 {
2427 	struct sp_node *n;
2428 	struct mempolicy *newpol;
2429 
2430 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2431 	if (!n)
2432 		return NULL;
2433 
2434 	newpol = mpol_dup(pol);
2435 	if (IS_ERR(newpol)) {
2436 		kmem_cache_free(sn_cache, n);
2437 		return NULL;
2438 	}
2439 	newpol->flags |= MPOL_F_SHARED;
2440 	sp_node_init(n, start, end, newpol);
2441 
2442 	return n;
2443 }
2444 
2445 /* Replace a policy range. */
2446 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2447 				 unsigned long end, struct sp_node *new)
2448 {
2449 	struct sp_node *n;
2450 	struct sp_node *n_new = NULL;
2451 	struct mempolicy *mpol_new = NULL;
2452 	int ret = 0;
2453 
2454 restart:
2455 	write_lock(&sp->lock);
2456 	n = sp_lookup(sp, start, end);
2457 	/* Take care of old policies in the same range. */
2458 	while (n && n->start < end) {
2459 		struct rb_node *next = rb_next(&n->nd);
2460 		if (n->start >= start) {
2461 			if (n->end <= end)
2462 				sp_delete(sp, n);
2463 			else
2464 				n->start = end;
2465 		} else {
2466 			/* Old policy spanning whole new range. */
2467 			if (n->end > end) {
2468 				if (!n_new)
2469 					goto alloc_new;
2470 
2471 				*mpol_new = *n->policy;
2472 				atomic_set(&mpol_new->refcnt, 1);
2473 				sp_node_init(n_new, end, n->end, mpol_new);
2474 				n->end = start;
2475 				sp_insert(sp, n_new);
2476 				n_new = NULL;
2477 				mpol_new = NULL;
2478 				break;
2479 			} else
2480 				n->end = start;
2481 		}
2482 		if (!next)
2483 			break;
2484 		n = rb_entry(next, struct sp_node, nd);
2485 	}
2486 	if (new)
2487 		sp_insert(sp, new);
2488 	write_unlock(&sp->lock);
2489 	ret = 0;
2490 
2491 err_out:
2492 	if (mpol_new)
2493 		mpol_put(mpol_new);
2494 	if (n_new)
2495 		kmem_cache_free(sn_cache, n_new);
2496 
2497 	return ret;
2498 
2499 alloc_new:
2500 	write_unlock(&sp->lock);
2501 	ret = -ENOMEM;
2502 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2503 	if (!n_new)
2504 		goto err_out;
2505 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2506 	if (!mpol_new)
2507 		goto err_out;
2508 	goto restart;
2509 }
2510 
2511 /**
2512  * mpol_shared_policy_init - initialize shared policy for inode
2513  * @sp: pointer to inode shared policy
2514  * @mpol:  struct mempolicy to install
2515  *
2516  * Install non-NULL @mpol in inode's shared policy rb-tree.
2517  * On entry, the current task has a reference on a non-NULL @mpol.
2518  * This must be released on exit.
2519  * This is called at get_inode() calls and we can use GFP_KERNEL.
2520  */
2521 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2522 {
2523 	int ret;
2524 
2525 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2526 	rwlock_init(&sp->lock);
2527 
2528 	if (mpol) {
2529 		struct vm_area_struct pvma;
2530 		struct mempolicy *new;
2531 		NODEMASK_SCRATCH(scratch);
2532 
2533 		if (!scratch)
2534 			goto put_mpol;
2535 		/* contextualize the tmpfs mount point mempolicy */
2536 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2537 		if (IS_ERR(new))
2538 			goto free_scratch; /* no valid nodemask intersection */
2539 
2540 		task_lock(current);
2541 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2542 		task_unlock(current);
2543 		if (ret)
2544 			goto put_new;
2545 
2546 		/* Create pseudo-vma that contains just the policy */
2547 		vma_init(&pvma, NULL);
2548 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2549 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2550 
2551 put_new:
2552 		mpol_put(new);			/* drop initial ref */
2553 free_scratch:
2554 		NODEMASK_SCRATCH_FREE(scratch);
2555 put_mpol:
2556 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2557 	}
2558 }
2559 
2560 int mpol_set_shared_policy(struct shared_policy *info,
2561 			struct vm_area_struct *vma, struct mempolicy *npol)
2562 {
2563 	int err;
2564 	struct sp_node *new = NULL;
2565 	unsigned long sz = vma_pages(vma);
2566 
2567 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2568 		 vma->vm_pgoff,
2569 		 sz, npol ? npol->mode : -1,
2570 		 npol ? npol->flags : -1,
2571 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2572 
2573 	if (npol) {
2574 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2575 		if (!new)
2576 			return -ENOMEM;
2577 	}
2578 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2579 	if (err && new)
2580 		sp_free(new);
2581 	return err;
2582 }
2583 
2584 /* Free a backing policy store on inode delete. */
2585 void mpol_free_shared_policy(struct shared_policy *p)
2586 {
2587 	struct sp_node *n;
2588 	struct rb_node *next;
2589 
2590 	if (!p->root.rb_node)
2591 		return;
2592 	write_lock(&p->lock);
2593 	next = rb_first(&p->root);
2594 	while (next) {
2595 		n = rb_entry(next, struct sp_node, nd);
2596 		next = rb_next(&n->nd);
2597 		sp_delete(p, n);
2598 	}
2599 	write_unlock(&p->lock);
2600 }
2601 
2602 #ifdef CONFIG_NUMA_BALANCING
2603 static int __initdata numabalancing_override;
2604 
2605 static void __init check_numabalancing_enable(void)
2606 {
2607 	bool numabalancing_default = false;
2608 
2609 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2610 		numabalancing_default = true;
2611 
2612 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2613 	if (numabalancing_override)
2614 		set_numabalancing_state(numabalancing_override == 1);
2615 
2616 	if (num_online_nodes() > 1 && !numabalancing_override) {
2617 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2618 			numabalancing_default ? "Enabling" : "Disabling");
2619 		set_numabalancing_state(numabalancing_default);
2620 	}
2621 }
2622 
2623 static int __init setup_numabalancing(char *str)
2624 {
2625 	int ret = 0;
2626 	if (!str)
2627 		goto out;
2628 
2629 	if (!strcmp(str, "enable")) {
2630 		numabalancing_override = 1;
2631 		ret = 1;
2632 	} else if (!strcmp(str, "disable")) {
2633 		numabalancing_override = -1;
2634 		ret = 1;
2635 	}
2636 out:
2637 	if (!ret)
2638 		pr_warn("Unable to parse numa_balancing=\n");
2639 
2640 	return ret;
2641 }
2642 __setup("numa_balancing=", setup_numabalancing);
2643 #else
2644 static inline void __init check_numabalancing_enable(void)
2645 {
2646 }
2647 #endif /* CONFIG_NUMA_BALANCING */
2648 
2649 /* assumes fs == KERNEL_DS */
2650 void __init numa_policy_init(void)
2651 {
2652 	nodemask_t interleave_nodes;
2653 	unsigned long largest = 0;
2654 	int nid, prefer = 0;
2655 
2656 	policy_cache = kmem_cache_create("numa_policy",
2657 					 sizeof(struct mempolicy),
2658 					 0, SLAB_PANIC, NULL);
2659 
2660 	sn_cache = kmem_cache_create("shared_policy_node",
2661 				     sizeof(struct sp_node),
2662 				     0, SLAB_PANIC, NULL);
2663 
2664 	for_each_node(nid) {
2665 		preferred_node_policy[nid] = (struct mempolicy) {
2666 			.refcnt = ATOMIC_INIT(1),
2667 			.mode = MPOL_PREFERRED,
2668 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2669 			.v = { .preferred_node = nid, },
2670 		};
2671 	}
2672 
2673 	/*
2674 	 * Set interleaving policy for system init. Interleaving is only
2675 	 * enabled across suitably sized nodes (default is >= 16MB), or
2676 	 * fall back to the largest node if they're all smaller.
2677 	 */
2678 	nodes_clear(interleave_nodes);
2679 	for_each_node_state(nid, N_MEMORY) {
2680 		unsigned long total_pages = node_present_pages(nid);
2681 
2682 		/* Preserve the largest node */
2683 		if (largest < total_pages) {
2684 			largest = total_pages;
2685 			prefer = nid;
2686 		}
2687 
2688 		/* Interleave this node? */
2689 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2690 			node_set(nid, interleave_nodes);
2691 	}
2692 
2693 	/* All too small, use the largest */
2694 	if (unlikely(nodes_empty(interleave_nodes)))
2695 		node_set(prefer, interleave_nodes);
2696 
2697 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2698 		pr_err("%s: interleaving failed\n", __func__);
2699 
2700 	check_numabalancing_enable();
2701 }
2702 
2703 /* Reset policy of current process to default */
2704 void numa_default_policy(void)
2705 {
2706 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2707 }
2708 
2709 /*
2710  * Parse and format mempolicy from/to strings
2711  */
2712 
2713 /*
2714  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2715  */
2716 static const char * const policy_modes[] =
2717 {
2718 	[MPOL_DEFAULT]    = "default",
2719 	[MPOL_PREFERRED]  = "prefer",
2720 	[MPOL_BIND]       = "bind",
2721 	[MPOL_INTERLEAVE] = "interleave",
2722 	[MPOL_LOCAL]      = "local",
2723 };
2724 
2725 
2726 #ifdef CONFIG_TMPFS
2727 /**
2728  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2729  * @str:  string containing mempolicy to parse
2730  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2731  *
2732  * Format of input:
2733  *	<mode>[=<flags>][:<nodelist>]
2734  *
2735  * On success, returns 0, else 1
2736  */
2737 int mpol_parse_str(char *str, struct mempolicy **mpol)
2738 {
2739 	struct mempolicy *new = NULL;
2740 	unsigned short mode_flags;
2741 	nodemask_t nodes;
2742 	char *nodelist = strchr(str, ':');
2743 	char *flags = strchr(str, '=');
2744 	int err = 1, mode;
2745 
2746 	if (nodelist) {
2747 		/* NUL-terminate mode or flags string */
2748 		*nodelist++ = '\0';
2749 		if (nodelist_parse(nodelist, nodes))
2750 			goto out;
2751 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2752 			goto out;
2753 	} else
2754 		nodes_clear(nodes);
2755 
2756 	if (flags)
2757 		*flags++ = '\0';	/* terminate mode string */
2758 
2759 	mode = match_string(policy_modes, MPOL_MAX, str);
2760 	if (mode < 0)
2761 		goto out;
2762 
2763 	switch (mode) {
2764 	case MPOL_PREFERRED:
2765 		/*
2766 		 * Insist on a nodelist of one node only
2767 		 */
2768 		if (nodelist) {
2769 			char *rest = nodelist;
2770 			while (isdigit(*rest))
2771 				rest++;
2772 			if (*rest)
2773 				goto out;
2774 		}
2775 		break;
2776 	case MPOL_INTERLEAVE:
2777 		/*
2778 		 * Default to online nodes with memory if no nodelist
2779 		 */
2780 		if (!nodelist)
2781 			nodes = node_states[N_MEMORY];
2782 		break;
2783 	case MPOL_LOCAL:
2784 		/*
2785 		 * Don't allow a nodelist;  mpol_new() checks flags
2786 		 */
2787 		if (nodelist)
2788 			goto out;
2789 		mode = MPOL_PREFERRED;
2790 		break;
2791 	case MPOL_DEFAULT:
2792 		/*
2793 		 * Insist on a empty nodelist
2794 		 */
2795 		if (!nodelist)
2796 			err = 0;
2797 		goto out;
2798 	case MPOL_BIND:
2799 		/*
2800 		 * Insist on a nodelist
2801 		 */
2802 		if (!nodelist)
2803 			goto out;
2804 	}
2805 
2806 	mode_flags = 0;
2807 	if (flags) {
2808 		/*
2809 		 * Currently, we only support two mutually exclusive
2810 		 * mode flags.
2811 		 */
2812 		if (!strcmp(flags, "static"))
2813 			mode_flags |= MPOL_F_STATIC_NODES;
2814 		else if (!strcmp(flags, "relative"))
2815 			mode_flags |= MPOL_F_RELATIVE_NODES;
2816 		else
2817 			goto out;
2818 	}
2819 
2820 	new = mpol_new(mode, mode_flags, &nodes);
2821 	if (IS_ERR(new))
2822 		goto out;
2823 
2824 	/*
2825 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2826 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2827 	 */
2828 	if (mode != MPOL_PREFERRED)
2829 		new->v.nodes = nodes;
2830 	else if (nodelist)
2831 		new->v.preferred_node = first_node(nodes);
2832 	else
2833 		new->flags |= MPOL_F_LOCAL;
2834 
2835 	/*
2836 	 * Save nodes for contextualization: this will be used to "clone"
2837 	 * the mempolicy in a specific context [cpuset] at a later time.
2838 	 */
2839 	new->w.user_nodemask = nodes;
2840 
2841 	err = 0;
2842 
2843 out:
2844 	/* Restore string for error message */
2845 	if (nodelist)
2846 		*--nodelist = ':';
2847 	if (flags)
2848 		*--flags = '=';
2849 	if (!err)
2850 		*mpol = new;
2851 	return err;
2852 }
2853 #endif /* CONFIG_TMPFS */
2854 
2855 /**
2856  * mpol_to_str - format a mempolicy structure for printing
2857  * @buffer:  to contain formatted mempolicy string
2858  * @maxlen:  length of @buffer
2859  * @pol:  pointer to mempolicy to be formatted
2860  *
2861  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2862  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2863  * longest flag, "relative", and to display at least a few node ids.
2864  */
2865 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2866 {
2867 	char *p = buffer;
2868 	nodemask_t nodes = NODE_MASK_NONE;
2869 	unsigned short mode = MPOL_DEFAULT;
2870 	unsigned short flags = 0;
2871 
2872 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2873 		mode = pol->mode;
2874 		flags = pol->flags;
2875 	}
2876 
2877 	switch (mode) {
2878 	case MPOL_DEFAULT:
2879 		break;
2880 	case MPOL_PREFERRED:
2881 		if (flags & MPOL_F_LOCAL)
2882 			mode = MPOL_LOCAL;
2883 		else
2884 			node_set(pol->v.preferred_node, nodes);
2885 		break;
2886 	case MPOL_BIND:
2887 	case MPOL_INTERLEAVE:
2888 		nodes = pol->v.nodes;
2889 		break;
2890 	default:
2891 		WARN_ON_ONCE(1);
2892 		snprintf(p, maxlen, "unknown");
2893 		return;
2894 	}
2895 
2896 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2897 
2898 	if (flags & MPOL_MODE_FLAGS) {
2899 		p += snprintf(p, buffer + maxlen - p, "=");
2900 
2901 		/*
2902 		 * Currently, the only defined flags are mutually exclusive
2903 		 */
2904 		if (flags & MPOL_F_STATIC_NODES)
2905 			p += snprintf(p, buffer + maxlen - p, "static");
2906 		else if (flags & MPOL_F_RELATIVE_NODES)
2907 			p += snprintf(p, buffer + maxlen - p, "relative");
2908 	}
2909 
2910 	if (!nodes_empty(nodes))
2911 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2912 			       nodemask_pr_args(&nodes));
2913 }
2914