xref: /openbmc/linux/mm/mempolicy.c (revision 3a0d89d3)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same applies to GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
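/*
 * For orientation, a minimal user-space sketch of how these policies are
 * requested.  This is an illustrative example only (it assumes libnuma's
 * <numaif.h> wrappers and a machine with at least two memory nodes) and is
 * never compiled as part of this file:
 */
#if 0	/* illustrative user-space example -- not kernel code */
#include <numaif.h>	/* set_mempolicy(), MPOL_* -- link with -lnuma */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* interleave future allocations of this process over nodes 0 and 1 */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	char *buf;
	size_t i, len = 64UL << 20;

	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
		perror("set_mempolicy");
		return EXIT_FAILURE;
	}

	/* pages are placed at first touch, spread page by page over 0 and 1 */
	buf = malloc(len);
	if (!buf)
		return EXIT_FAILURE;
	for (i = 0; i < len; i += 4096)
		buf[i] = 0;
	free(buf);
	return 0;
}
#endif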
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 #include <linux/random.h>
98 
99 #include "internal.h"
100 
101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 	struct mempolicy *pol = p->mempolicy;
126 
127 	if (!pol) {
128 		int node = numa_node_id();
129 
130 		if (node != NUMA_NO_NODE) {
131 			pol = &preferred_node_policy[node];
132 			/*
133 			 * preferred_node_policy is not initialised early in
134 			 * boot
135 			 */
136 			if (!pol->mode)
137 				pol = NULL;
138 		}
139 	}
140 
141 	return pol;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If the read-side task has no lock to protect task->mempolicy, the
148 	 * write-side task will rebind task->mempolicy in two steps. The first
149 	 * step sets all the newly allowed nodes, and the second step clears
150 	 * all the disallowed nodes. This way we avoid ever ending up with no
151 	 * node to allocate a page from.
152 	 * If we have a lock to protect task->mempolicy on the read side, we
153 	 * rebind directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE - do the rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
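/*
 * To make the two-step rebind described above concrete (a worked example
 * with arbitrarily chosen masks): suppose an MPOL_INTERLEAVE policy
 * currently uses nodes {0,1} and its cpuset is moved to nodes {2,3}.
 *
 *	MPOL_REBIND_STEP1:	v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2:	v.nodes = {2,3}
 *
 * A lockless reader that samples v.nodes between the two steps sees a
 * superset of the allowed nodes, never an empty mask, so it can always
 * find some node to allocate from.
 */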
163 
164 /* Check that the nodemask contains at least one populated zone */
165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 {
167 	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 }
169 
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 {
172 	return pol->flags & MPOL_MODE_FLAGS;
173 }
174 
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176 				   const nodemask_t *rel)
177 {
178 	nodemask_t tmp;
179 	nodes_fold(tmp, *orig, nodes_weight(*rel));
180 	nodes_onto(*ret, tmp, *rel);
181 }
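/*
 * Worked example for the MPOL_F_RELATIVE_NODES remap above (arbitrary
 * masks): with a user-supplied relative mask of {0,2} and a cpuset that
 * allows nodes {4,5,6,7} (weight 4), nodes_fold() leaves {0,2} unchanged
 * and nodes_onto() maps bit N onto the Nth set bit of the cpuset mask,
 * giving {4,6}.  A relative mask of {1,5} first folds to {1} (since
 * 5 mod 4 == 1) and then maps to {5}.
 */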
182 
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 {
185 	if (nodes_empty(*nodes))
186 		return -EINVAL;
187 	pol->v.nodes = *nodes;
188 	return 0;
189 }
190 
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 {
193 	if (!nodes)
194 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
195 	else if (nodes_empty(*nodes))
196 		return -EINVAL;			/*  no allowed nodes */
197 	else
198 		pol->v.preferred_node = first_node(*nodes);
199 	return 0;
200 }
201 
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 {
204 	if (!is_valid_nodemask(nodes))
205 		return -EINVAL;
206 	pol->v.nodes = *nodes;
207 	return 0;
208 }
209 
210 /*
211  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212  * any, for the new policy.  mpol_new() has already validated the nodes
213  * parameter with respect to the policy mode and flags.  But, we need to
214  * handle an empty nodemask with MPOL_PREFERRED here.
215  *
216  * Must be called holding task's alloc_lock to protect task's mems_allowed
217  * and mempolicy.  May also be called holding the mmap_semaphore for write.
218  */
219 static int mpol_set_nodemask(struct mempolicy *pol,
220 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 {
222 	int ret;
223 
224 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225 	if (pol == NULL)
226 		return 0;
227 	/* Check N_MEMORY */
228 	nodes_and(nsc->mask1,
229 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 
231 	VM_BUG_ON(!nodes);
232 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233 		nodes = NULL;	/* explicit local allocation */
234 	else {
235 		if (pol->flags & MPOL_F_RELATIVE_NODES)
236 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237 		else
238 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 
240 		if (mpol_store_user_nodemask(pol))
241 			pol->w.user_nodemask = *nodes;
242 		else
243 			pol->w.cpuset_mems_allowed =
244 						cpuset_current_mems_allowed;
245 	}
246 
247 	if (nodes)
248 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249 	else
250 		ret = mpol_ops[pol->mode].create(pol, NULL);
251 	return ret;
252 }
253 
254 /*
255  * This function just creates a new policy, does some sanity checking and
256  * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
257  */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 				  nodemask_t *nodes)
260 {
261 	struct mempolicy *policy;
262 
263 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 
266 	if (mode == MPOL_DEFAULT) {
267 		if (nodes && !nodes_empty(*nodes))
268 			return ERR_PTR(-EINVAL);
269 		return NULL;
270 	}
271 	VM_BUG_ON(!nodes);
272 
273 	/*
274 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 	 * All other modes require a valid pointer to a non-empty nodemask.
277 	 */
278 	if (mode == MPOL_PREFERRED) {
279 		if (nodes_empty(*nodes)) {
280 			if (((flags & MPOL_F_STATIC_NODES) ||
281 			     (flags & MPOL_F_RELATIVE_NODES)))
282 				return ERR_PTR(-EINVAL);
283 		}
284 	} else if (mode == MPOL_LOCAL) {
285 		if (!nodes_empty(*nodes))
286 			return ERR_PTR(-EINVAL);
287 		mode = MPOL_PREFERRED;
288 	} else if (nodes_empty(*nodes))
289 		return ERR_PTR(-EINVAL);
290 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291 	if (!policy)
292 		return ERR_PTR(-ENOMEM);
293 	atomic_set(&policy->refcnt, 1);
294 	policy->mode = mode;
295 	policy->flags = flags;
296 
297 	return policy;
298 }
299 
300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p)
302 {
303 	if (!atomic_dec_and_test(&p->refcnt))
304 		return;
305 	kmem_cache_free(policy_cache, p);
306 }
307 
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309 				enum mpol_rebind_step step)
310 {
311 }
312 
313 /*
314  * step:
315  * 	MPOL_REBIND_ONCE  - do the rebind work at once
316  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
317  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
318  */
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320 				 enum mpol_rebind_step step)
321 {
322 	nodemask_t tmp;
323 
324 	if (pol->flags & MPOL_F_STATIC_NODES)
325 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 	else {
329 		/*
330 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331 		 * result
332 		 */
333 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334 			nodes_remap(tmp, pol->v.nodes,
335 					pol->w.cpuset_mems_allowed, *nodes);
336 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337 		} else if (step == MPOL_REBIND_STEP2) {
338 			tmp = pol->w.cpuset_mems_allowed;
339 			pol->w.cpuset_mems_allowed = *nodes;
340 		} else
341 			BUG();
342 	}
343 
344 	if (nodes_empty(tmp))
345 		tmp = *nodes;
346 
347 	if (step == MPOL_REBIND_STEP1)
348 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350 		pol->v.nodes = tmp;
351 	else
352 		BUG();
353 
354 	if (!node_isset(current->il_next, tmp)) {
355 		current->il_next = next_node(current->il_next, tmp);
356 		if (current->il_next >= MAX_NUMNODES)
357 			current->il_next = first_node(tmp);
358 		if (current->il_next >= MAX_NUMNODES)
359 			current->il_next = numa_node_id();
360 	}
361 }
362 
363 static void mpol_rebind_preferred(struct mempolicy *pol,
364 				  const nodemask_t *nodes,
365 				  enum mpol_rebind_step step)
366 {
367 	nodemask_t tmp;
368 
369 	if (pol->flags & MPOL_F_STATIC_NODES) {
370 		int node = first_node(pol->w.user_nodemask);
371 
372 		if (node_isset(node, *nodes)) {
373 			pol->v.preferred_node = node;
374 			pol->flags &= ~MPOL_F_LOCAL;
375 		} else
376 			pol->flags |= MPOL_F_LOCAL;
377 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379 		pol->v.preferred_node = first_node(tmp);
380 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
381 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
382 						   pol->w.cpuset_mems_allowed,
383 						   *nodes);
384 		pol->w.cpuset_mems_allowed = *nodes;
385 	}
386 }
387 
388 /*
389  * mpol_rebind_policy - Migrate a policy to a different set of nodes
390  *
391  * If the read-side task has no lock to protect task->mempolicy, the
392  * write-side task will rebind task->mempolicy in two steps. The first
393  * step sets all the newly allowed nodes, and the second step clears
394  * all the disallowed nodes. This way we avoid ever ending up with no
395  * node to allocate a page from.
396  * If we have a lock to protect task->mempolicy on the read side, we
397  * rebind directly.
398  *
399  * step:
400  * 	MPOL_REBIND_ONCE  - do the rebind work at once
401  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
402  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
403  */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405 				enum mpol_rebind_step step)
406 {
407 	if (!pol)
408 		return;
409 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414 		return;
415 
416 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417 		BUG();
418 
419 	if (step == MPOL_REBIND_STEP1)
420 		pol->flags |= MPOL_F_REBINDING;
421 	else if (step == MPOL_REBIND_STEP2)
422 		pol->flags &= ~MPOL_F_REBINDING;
423 	else if (step >= MPOL_REBIND_NSTEP)
424 		BUG();
425 
426 	mpol_ops[pol->mode].rebind(pol, newmask, step);
427 }
428 
429 /*
430  * Wrapper for mpol_rebind_policy() that just requires task
431  * pointer, and updates task mempolicy.
432  *
433  * Called with task's alloc_lock held.
434  */
435 
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437 			enum mpol_rebind_step step)
438 {
439 	mpol_rebind_policy(tsk->mempolicy, new, step);
440 }
441 
442 /*
443  * Rebind each vma in mm to new nodemask.
444  *
445  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
446  */
447 
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 {
450 	struct vm_area_struct *vma;
451 
452 	down_write(&mm->mmap_sem);
453 	for (vma = mm->mmap; vma; vma = vma->vm_next)
454 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455 	up_write(&mm->mmap_sem);
456 }
457 
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459 	[MPOL_DEFAULT] = {
460 		.rebind = mpol_rebind_default,
461 	},
462 	[MPOL_INTERLEAVE] = {
463 		.create = mpol_new_interleave,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 	[MPOL_PREFERRED] = {
467 		.create = mpol_new_preferred,
468 		.rebind = mpol_rebind_preferred,
469 	},
470 	[MPOL_BIND] = {
471 		.create = mpol_new_bind,
472 		.rebind = mpol_rebind_nodemask,
473 	},
474 };
475 
476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477 				unsigned long flags);
478 
479 /*
480  * Scan through the pages, checking whether each one meets the given
481  * conditions, and move it to the pagelist if it does.
482  */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484 		unsigned long addr, unsigned long end,
485 		const nodemask_t *nodes, unsigned long flags,
486 		void *private)
487 {
488 	pte_t *orig_pte;
489 	pte_t *pte;
490 	spinlock_t *ptl;
491 
492 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493 	do {
494 		struct page *page;
495 		int nid;
496 
497 		if (!pte_present(*pte))
498 			continue;
499 		page = vm_normal_page(vma, addr, *pte);
500 		if (!page)
501 			continue;
502 		/*
503 		 * vm_normal_page() filters out zero pages, but there might
504 		 * still be PageReserved pages to skip, perhaps in a VDSO.
505 		 */
506 		if (PageReserved(page))
507 			continue;
508 		nid = page_to_nid(page);
509 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510 			continue;
511 
512 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513 			migrate_page_add(page, private, flags);
514 		else
515 			break;
516 	} while (pte++, addr += PAGE_SIZE, addr != end);
517 	pte_unmap_unlock(orig_pte, ptl);
518 	return addr != end;
519 }
520 
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 				    void *private)
524 {
525 #ifdef CONFIG_HUGETLB_PAGE
526 	int nid;
527 	struct page *page;
528 	spinlock_t *ptl;
529 
530 	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
531 	page = pte_page(huge_ptep_get((pte_t *)pmd));
532 	nid = page_to_nid(page);
533 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
534 		goto unlock;
535 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
536 	if (flags & (MPOL_MF_MOVE_ALL) ||
537 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
538 		isolate_huge_page(page, private);
539 unlock:
540 	spin_unlock(ptl);
541 #else
542 	BUG();
543 #endif
544 }
545 
546 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
547 		unsigned long addr, unsigned long end,
548 		const nodemask_t *nodes, unsigned long flags,
549 		void *private)
550 {
551 	pmd_t *pmd;
552 	unsigned long next;
553 
554 	pmd = pmd_offset(pud, addr);
555 	do {
556 		next = pmd_addr_end(addr, end);
557 		if (!pmd_present(*pmd))
558 			continue;
559 		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
560 			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
561 						flags, private);
562 			continue;
563 		}
564 		split_huge_page_pmd(vma, addr, pmd);
565 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
566 			continue;
567 		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
568 				    flags, private))
569 			return -EIO;
570 	} while (pmd++, addr = next, addr != end);
571 	return 0;
572 }
573 
574 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
575 		unsigned long addr, unsigned long end,
576 		const nodemask_t *nodes, unsigned long flags,
577 		void *private)
578 {
579 	pud_t *pud;
580 	unsigned long next;
581 
582 	pud = pud_offset(pgd, addr);
583 	do {
584 		next = pud_addr_end(addr, end);
585 		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
586 			continue;
587 		if (pud_none_or_clear_bad(pud))
588 			continue;
589 		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
590 				    flags, private))
591 			return -EIO;
592 	} while (pud++, addr = next, addr != end);
593 	return 0;
594 }
595 
596 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
597 		unsigned long addr, unsigned long end,
598 		const nodemask_t *nodes, unsigned long flags,
599 		void *private)
600 {
601 	pgd_t *pgd;
602 	unsigned long next;
603 
604 	pgd = pgd_offset(vma->vm_mm, addr);
605 	do {
606 		next = pgd_addr_end(addr, end);
607 		if (pgd_none_or_clear_bad(pgd))
608 			continue;
609 		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
610 				    flags, private))
611 			return -EIO;
612 	} while (pgd++, addr = next, addr != end);
613 	return 0;
614 }
615 
616 #ifdef CONFIG_NUMA_BALANCING
617 /*
618  * This is used to mark a range of virtual addresses to be inaccessible.
619  * These are later cleared by a NUMA hinting fault. Depending on these
620  * faults, pages may be migrated for better NUMA placement.
621  *
622  * This is assuming that NUMA faults are handled using PROT_NONE. If
623  * an architecture makes a different choice, it will need further
624  * changes to the core.
625  */
626 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 			unsigned long addr, unsigned long end)
628 {
629 	int nr_updated;
630 
631 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632 	if (nr_updated)
633 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 
635 	return nr_updated;
636 }
637 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 			unsigned long addr, unsigned long end)
640 {
641 	return 0;
642 }
643 #endif /* CONFIG_NUMA_BALANCING */
644 
645 /*
646  * Walk through page tables and collect pages to be migrated.
647  *
648  * If pages found in a given range are on a set of nodes (determined by
649  * @nodes and @flags), they are isolated and queued on the pagelist
650  * passed via @private.
651  */
652 static struct vm_area_struct *
653 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654 		const nodemask_t *nodes, unsigned long flags, void *private)
655 {
656 	int err;
657 	struct vm_area_struct *first, *vma, *prev;
658 
659 
660 	first = find_vma(mm, start);
661 	if (!first)
662 		return ERR_PTR(-EFAULT);
663 	prev = NULL;
664 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665 		unsigned long endvma = vma->vm_end;
666 
667 		if (endvma > end)
668 			endvma = end;
669 		if (vma->vm_start > start)
670 			start = vma->vm_start;
671 
672 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673 			if (!vma->vm_next && vma->vm_end < end)
674 				return ERR_PTR(-EFAULT);
675 			if (prev && prev->vm_end < vma->vm_start)
676 				return ERR_PTR(-EFAULT);
677 		}
678 
679 		if (flags & MPOL_MF_LAZY) {
680 			change_prot_numa(vma, start, endvma);
681 			goto next;
682 		}
683 
684 		if ((flags & MPOL_MF_STRICT) ||
685 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 		      vma_migratable(vma))) {
687 
688 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 						flags, private);
690 			if (err) {
691 				first = ERR_PTR(err);
692 				break;
693 			}
694 		}
695 next:
696 		prev = vma;
697 	}
698 	return first;
699 }
700 
701 /*
702  * Apply policy to a single VMA
703  * This must be called with the mmap_sem held for writing.
704  */
705 static int vma_replace_policy(struct vm_area_struct *vma,
706 						struct mempolicy *pol)
707 {
708 	int err;
709 	struct mempolicy *old;
710 	struct mempolicy *new;
711 
712 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
714 		 vma->vm_ops, vma->vm_file,
715 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716 
717 	new = mpol_dup(pol);
718 	if (IS_ERR(new))
719 		return PTR_ERR(new);
720 
721 	if (vma->vm_ops && vma->vm_ops->set_policy) {
722 		err = vma->vm_ops->set_policy(vma, new);
723 		if (err)
724 			goto err_out;
725 	}
726 
727 	old = vma->vm_policy;
728 	vma->vm_policy = new; /* protected by mmap_sem */
729 	mpol_put(old);
730 
731 	return 0;
732  err_out:
733 	mpol_put(new);
734 	return err;
735 }
736 
737 /* Step 2: apply policy to a range and do splits. */
738 static int mbind_range(struct mm_struct *mm, unsigned long start,
739 		       unsigned long end, struct mempolicy *new_pol)
740 {
741 	struct vm_area_struct *next;
742 	struct vm_area_struct *prev;
743 	struct vm_area_struct *vma;
744 	int err = 0;
745 	pgoff_t pgoff;
746 	unsigned long vmstart;
747 	unsigned long vmend;
748 
749 	vma = find_vma(mm, start);
750 	if (!vma || vma->vm_start > start)
751 		return -EFAULT;
752 
753 	prev = vma->vm_prev;
754 	if (start > vma->vm_start)
755 		prev = vma;
756 
757 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758 		next = vma->vm_next;
759 		vmstart = max(start, vma->vm_start);
760 		vmend   = min(end, vma->vm_end);
761 
762 		if (mpol_equal(vma_policy(vma), new_pol))
763 			continue;
764 
765 		pgoff = vma->vm_pgoff +
766 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
767 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768 				  vma->anon_vma, vma->vm_file, pgoff,
769 				  new_pol);
770 		if (prev) {
771 			vma = prev;
772 			next = vma->vm_next;
773 			if (mpol_equal(vma_policy(vma), new_pol))
774 				continue;
775 			/* vma_merge() joined vma && vma->next, case 8 */
776 			goto replace;
777 		}
778 		if (vma->vm_start != vmstart) {
779 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
780 			if (err)
781 				goto out;
782 		}
783 		if (vma->vm_end != vmend) {
784 			err = split_vma(vma->vm_mm, vma, vmend, 0);
785 			if (err)
786 				goto out;
787 		}
788  replace:
789 		err = vma_replace_policy(vma, new_pol);
790 		if (err)
791 			goto out;
792 	}
793 
794  out:
795 	return err;
796 }
797 
798 /* Set the process memory policy */
799 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
800 			     nodemask_t *nodes)
801 {
802 	struct mempolicy *new, *old;
803 	struct mm_struct *mm = current->mm;
804 	NODEMASK_SCRATCH(scratch);
805 	int ret;
806 
807 	if (!scratch)
808 		return -ENOMEM;
809 
810 	new = mpol_new(mode, flags, nodes);
811 	if (IS_ERR(new)) {
812 		ret = PTR_ERR(new);
813 		goto out;
814 	}
815 	/*
816 	 * prevent changing our mempolicy while show_numa_maps()
817 	 * is using it.
818 	 * Note:  do_set_mempolicy() can be called at init time
819 	 * with no 'mm'.
820 	 */
821 	if (mm)
822 		down_write(&mm->mmap_sem);
823 	task_lock(current);
824 	ret = mpol_set_nodemask(new, nodes, scratch);
825 	if (ret) {
826 		task_unlock(current);
827 		if (mm)
828 			up_write(&mm->mmap_sem);
829 		mpol_put(new);
830 		goto out;
831 	}
832 	old = current->mempolicy;
833 	current->mempolicy = new;
834 	if (new && new->mode == MPOL_INTERLEAVE &&
835 	    nodes_weight(new->v.nodes))
836 		current->il_next = first_node(new->v.nodes);
837 	task_unlock(current);
838 	if (mm)
839 		up_write(&mm->mmap_sem);
840 
841 	mpol_put(old);
842 	ret = 0;
843 out:
844 	NODEMASK_SCRATCH_FREE(scratch);
845 	return ret;
846 }
847 
848 /*
849  * Return nodemask for policy for get_mempolicy() query
850  *
851  * Called with task's alloc_lock held
852  */
853 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
854 {
855 	nodes_clear(*nodes);
856 	if (p == &default_policy)
857 		return;
858 
859 	switch (p->mode) {
860 	case MPOL_BIND:
861 		/* Fall through */
862 	case MPOL_INTERLEAVE:
863 		*nodes = p->v.nodes;
864 		break;
865 	case MPOL_PREFERRED:
866 		if (!(p->flags & MPOL_F_LOCAL))
867 			node_set(p->v.preferred_node, *nodes);
868 		/* else return empty node mask for local allocation */
869 		break;
870 	default:
871 		BUG();
872 	}
873 }
874 
875 static int lookup_node(struct mm_struct *mm, unsigned long addr)
876 {
877 	struct page *p;
878 	int err;
879 
880 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
881 	if (err >= 0) {
882 		err = page_to_nid(p);
883 		put_page(p);
884 	}
885 	return err;
886 }
887 
888 /* Retrieve NUMA policy */
889 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
890 			     unsigned long addr, unsigned long flags)
891 {
892 	int err;
893 	struct mm_struct *mm = current->mm;
894 	struct vm_area_struct *vma = NULL;
895 	struct mempolicy *pol = current->mempolicy;
896 
897 	if (flags &
898 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
899 		return -EINVAL;
900 
901 	if (flags & MPOL_F_MEMS_ALLOWED) {
902 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
903 			return -EINVAL;
904 		*policy = 0;	/* just so it's initialized */
905 		task_lock(current);
906 		*nmask  = cpuset_current_mems_allowed;
907 		task_unlock(current);
908 		return 0;
909 	}
910 
911 	if (flags & MPOL_F_ADDR) {
912 		/*
913 		 * Do NOT fall back to task policy if the
914 		 * vma/shared policy at addr is NULL.  We
915 		 * want to return MPOL_DEFAULT in this case.
916 		 */
917 		down_read(&mm->mmap_sem);
918 		vma = find_vma_intersection(mm, addr, addr+1);
919 		if (!vma) {
920 			up_read(&mm->mmap_sem);
921 			return -EFAULT;
922 		}
923 		if (vma->vm_ops && vma->vm_ops->get_policy)
924 			pol = vma->vm_ops->get_policy(vma, addr);
925 		else
926 			pol = vma->vm_policy;
927 	} else if (addr)
928 		return -EINVAL;
929 
930 	if (!pol)
931 		pol = &default_policy;	/* indicates default behavior */
932 
933 	if (flags & MPOL_F_NODE) {
934 		if (flags & MPOL_F_ADDR) {
935 			err = lookup_node(mm, addr);
936 			if (err < 0)
937 				goto out;
938 			*policy = err;
939 		} else if (pol == current->mempolicy &&
940 				pol->mode == MPOL_INTERLEAVE) {
941 			*policy = current->il_next;
942 		} else {
943 			err = -EINVAL;
944 			goto out;
945 		}
946 	} else {
947 		*policy = pol == &default_policy ? MPOL_DEFAULT :
948 						pol->mode;
949 		/*
950 		 * Internal mempolicy flags must be masked off before exposing
951 		 * the policy to userspace.
952 		 */
953 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
954 	}
955 
956 	if (vma) {
957 		up_read(&current->mm->mmap_sem);
958 		vma = NULL;
959 	}
960 
961 	err = 0;
962 	if (nmask) {
963 		if (mpol_store_user_nodemask(pol)) {
964 			*nmask = pol->w.user_nodemask;
965 		} else {
966 			task_lock(current);
967 			get_policy_nodemask(pol, nmask);
968 			task_unlock(current);
969 		}
970 	}
971 
972  out:
973 	mpol_cond_put(pol);
974 	if (vma)
975 		up_read(&current->mm->mmap_sem);
976 	return err;
977 }
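/*
 * The MPOL_F_NODE | MPOL_F_ADDR combination handled above is what user
 * space uses to ask "which node backs this address?".  A hedged sketch of
 * such a query (assumes libnuma's <numaif.h>; illustrative only, never
 * compiled here):
 */
#if 0	/* illustrative user-space example -- not kernel code */
#include <numaif.h>	/* get_mempolicy(), MPOL_F_* -- link with -lnuma */
#include <stdio.h>
#include <stdlib.h>

static int node_of(void *addr)
{
	int node = -1;

	/* with MPOL_F_NODE | MPOL_F_ADDR the first out-arg is a node id */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
		perror("get_mempolicy");
		return -1;
	}
	return node;
}

int main(void)
{
	char *p = malloc(4096);

	if (!p)
		return EXIT_FAILURE;
	p[0] = 1;	/* fault the page in so it has a home node */
	printf("page is on node %d\n", node_of(p));
	free(p);
	return 0;
}
#endif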
978 
979 #ifdef CONFIG_MIGRATION
980 /*
981  * page migration
982  */
983 static void migrate_page_add(struct page *page, struct list_head *pagelist,
984 				unsigned long flags)
985 {
986 	/*
987 	 * Avoid migrating a page that is shared with others.
988 	 */
989 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
990 		if (!isolate_lru_page(page)) {
991 			list_add_tail(&page->lru, pagelist);
992 			inc_zone_page_state(page, NR_ISOLATED_ANON +
993 					    page_is_file_cache(page));
994 		}
995 	}
996 }
997 
998 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
999 {
1000 	if (PageHuge(page))
1001 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1002 					node);
1003 	else
1004 		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1005 }
1006 
1007 /*
1008  * Migrate pages from one node to a target node.
1009  * Returns error or the number of pages not migrated.
1010  */
1011 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1012 			   int flags)
1013 {
1014 	nodemask_t nmask;
1015 	LIST_HEAD(pagelist);
1016 	int err = 0;
1017 
1018 	nodes_clear(nmask);
1019 	node_set(source, nmask);
1020 
1021 	/*
1022 	 * This does not "check" the range but isolates all pages that
1023 	 * need migration.  Between passing in the full user address
1024 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1025 	 */
1026 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1027 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1028 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029 
1030 	if (!list_empty(&pagelist)) {
1031 		err = migrate_pages(&pagelist, new_node_page, dest,
1032 					MIGRATE_SYNC, MR_SYSCALL);
1033 		if (err)
1034 			putback_movable_pages(&pagelist);
1035 	}
1036 
1037 	return err;
1038 }
1039 
1040 /*
1041  * Move pages between the two nodesets so as to preserve the physical
1042  * layout as much as possible.
1043  *
1044  * Returns the number of pages that could not be moved.
1045  */
1046 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1047 		     const nodemask_t *to, int flags)
1048 {
1049 	int busy = 0;
1050 	int err;
1051 	nodemask_t tmp;
1052 
1053 	err = migrate_prep();
1054 	if (err)
1055 		return err;
1056 
1057 	down_read(&mm->mmap_sem);
1058 
1059 	err = migrate_vmas(mm, from, to, flags);
1060 	if (err)
1061 		goto out;
1062 
1063 	/*
1064 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1065 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1066 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1067 	 * The pair of nodemasks 'to' and 'from' define the map.
1068 	 *
1069 	 * If no pair of bits is found that way, fall back to picking some
1070 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1071 	 * 'source' and 'dest' bits are the same, this represents a node
1072 	 * that will be migrating to itself, so no pages need move.
1073 	 *
1074 	 * If no bits are left in 'tmp', or if all remaining bits left
1075 	 * in 'tmp' correspond to the same bit in 'to', return false
1076 	 * (nothing left to migrate).
1077 	 *
1078 	 * This lets us pick a pair of nodes to migrate between, such that
1079 	 * if possible the dest node is not already occupied by some other
1080 	 * source node, minimizing the risk of overloading the memory on a
1081 	 * node that would happen if we migrated incoming memory to a node
1082 	 * before migrating outgoing memory sourced from that same node.
1083 	 *
1084 	 * A single scan of tmp is sufficient.  As we go, we remember the
1085 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1086 	 * that not only moved, but what's better, moved to an empty slot
1087 	 * (d is not set in tmp), then we break out then, with that pair.
1088 	 * Otherwise, when we finish scanning tmp, we at least have the
1089 	 * most recent <s, d> pair that moved.  If we get all the way through
1090 	 * the scan of tmp without finding any node that moved, much less
1091 	 * moved to an empty node, then there is nothing left worth migrating.
1092 	 */
1093 
1094 	tmp = *from;
1095 	while (!nodes_empty(tmp)) {
1096 		int s,d;
1097 		int source = NUMA_NO_NODE;
1098 		int dest = 0;
1099 
1100 		for_each_node_mask(s, tmp) {
1101 
1102 			/*
1103 			 * do_migrate_pages() tries to maintain the relative
1104 			 * node relationship of the pages established between
1105 			 * threads and memory areas.
1106 			 *
1107 			 * However, if the number of source nodes is not equal to
1108 			 * the number of destination nodes, we cannot preserve
1109 			 * this node-relative relationship.  In that case, skip
1110 			 * copying memory from a node that is in the destination
1111 			 * mask.
1112 			 *
1113 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1114 			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
1115 			 */
1116 
1117 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1118 						(node_isset(s, *to)))
1119 				continue;
1120 
1121 			d = node_remap(s, *from, *to);
1122 			if (s == d)
1123 				continue;
1124 
1125 			source = s;	/* Node moved. Memorize */
1126 			dest = d;
1127 
1128 			/* dest not in remaining from nodes? */
1129 			if (!node_isset(dest, tmp))
1130 				break;
1131 		}
1132 		if (source == NUMA_NO_NODE)
1133 			break;
1134 
1135 		node_clear(source, tmp);
1136 		err = migrate_to_node(mm, source, dest, flags);
1137 		if (err > 0)
1138 			busy += err;
1139 		if (err < 0)
1140 			break;
1141 	}
1142 out:
1143 	up_read(&mm->mmap_sem);
1144 	if (err < 0)
1145 		return err;
1146 	return busy;
1147 
1148 }
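/*
 * do_migrate_pages() is the backend of the migrate_pages(2) syscall.  A
 * hedged user-space sketch that moves a process's pages from node 0 to
 * node 1 (assumes libnuma's <numaif.h>; illustrative only, never compiled
 * here):
 */
#if 0	/* illustrative user-space example -- not kernel code */
#include <numaif.h>	/* migrate_pages() -- link with -lnuma */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* destination: node 1 */
	int pid = argc > 1 ? atoi(argv[1]) : 0;	/* 0 means current process */
	long left;

	/* on success, returns the number of pages that could not be moved */
	left = migrate_pages(pid, 8 * sizeof(unsigned long),
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}
#endif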
1149 
1150 /*
1151  * Allocate a new page for page migration based on vma policy.
1152  * Start by assuming the page is mapped by the vma pointed to by @private.
1153  * Search forward from there if not.  N.B., this assumes that the
1154  * list of pages handed to migrate_pages()--which is how we get here--
1155  * is in virtual address order.
1156  */
1157 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1158 {
1159 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1160 	unsigned long uninitialized_var(address);
1161 
1162 	while (vma) {
1163 		address = page_address_in_vma(page, vma);
1164 		if (address != -EFAULT)
1165 			break;
1166 		vma = vma->vm_next;
1167 	}
1168 
1169 	if (PageHuge(page)) {
1170 		BUG_ON(!vma);
1171 		return alloc_huge_page_noerr(vma, address, 1);
1172 	}
1173 	/*
1174 	 * if !vma, alloc_page_vma() will use task or system default policy
1175 	 */
1176 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1177 }
1178 #else
1179 
1180 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1181 				unsigned long flags)
1182 {
1183 }
1184 
1185 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1186 		     const nodemask_t *to, int flags)
1187 {
1188 	return -ENOSYS;
1189 }
1190 
1191 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1192 {
1193 	return NULL;
1194 }
1195 #endif
1196 
1197 static long do_mbind(unsigned long start, unsigned long len,
1198 		     unsigned short mode, unsigned short mode_flags,
1199 		     nodemask_t *nmask, unsigned long flags)
1200 {
1201 	struct vm_area_struct *vma;
1202 	struct mm_struct *mm = current->mm;
1203 	struct mempolicy *new;
1204 	unsigned long end;
1205 	int err;
1206 	LIST_HEAD(pagelist);
1207 
1208 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1209 		return -EINVAL;
1210 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1211 		return -EPERM;
1212 
1213 	if (start & ~PAGE_MASK)
1214 		return -EINVAL;
1215 
1216 	if (mode == MPOL_DEFAULT)
1217 		flags &= ~MPOL_MF_STRICT;
1218 
1219 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1220 	end = start + len;
1221 
1222 	if (end < start)
1223 		return -EINVAL;
1224 	if (end == start)
1225 		return 0;
1226 
1227 	new = mpol_new(mode, mode_flags, nmask);
1228 	if (IS_ERR(new))
1229 		return PTR_ERR(new);
1230 
1231 	if (flags & MPOL_MF_LAZY)
1232 		new->flags |= MPOL_F_MOF;
1233 
1234 	/*
1235 	 * If we are using the default policy then operation
1236 	 * on discontinuous address spaces is okay after all
1237 	 */
1238 	if (!new)
1239 		flags |= MPOL_MF_DISCONTIG_OK;
1240 
1241 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1242 		 start, start + len, mode, mode_flags,
1243 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1244 
1245 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1246 
1247 		err = migrate_prep();
1248 		if (err)
1249 			goto mpol_out;
1250 	}
1251 	{
1252 		NODEMASK_SCRATCH(scratch);
1253 		if (scratch) {
1254 			down_write(&mm->mmap_sem);
1255 			task_lock(current);
1256 			err = mpol_set_nodemask(new, nmask, scratch);
1257 			task_unlock(current);
1258 			if (err)
1259 				up_write(&mm->mmap_sem);
1260 		} else
1261 			err = -ENOMEM;
1262 		NODEMASK_SCRATCH_FREE(scratch);
1263 	}
1264 	if (err)
1265 		goto mpol_out;
1266 
1267 	vma = queue_pages_range(mm, start, end, nmask,
1268 			  flags | MPOL_MF_INVERT, &pagelist);
1269 
1270 	err = PTR_ERR(vma);	/* maybe ... */
1271 	if (!IS_ERR(vma))
1272 		err = mbind_range(mm, start, end, new);
1273 
1274 	if (!err) {
1275 		int nr_failed = 0;
1276 
1277 		if (!list_empty(&pagelist)) {
1278 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1279 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1280 					(unsigned long)vma,
1281 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1282 			if (nr_failed)
1283 				putback_movable_pages(&pagelist);
1284 		}
1285 
1286 		if (nr_failed && (flags & MPOL_MF_STRICT))
1287 			err = -EIO;
1288 	} else
1289 		putback_movable_pages(&pagelist);
1290 
1291 	up_write(&mm->mmap_sem);
1292  mpol_out:
1293 	mpol_put(new);
1294 	return err;
1295 }
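/*
 * do_mbind() backs the mbind(2) syscall.  A hedged user-space sketch that
 * binds one mapping to node 0 and asks for pages this process has already
 * touched to be migrated there (assumes libnuma's <numaif.h>; illustrative
 * only, never compiled here):
 */
#if 0	/* illustrative user-space example -- not kernel code */
#include <numaif.h>	/* mbind(), MPOL_* -- link with -lnuma */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	size_t len = 16UL << 20;
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);	/* fault the pages in somewhere first */

	/* MPOL_MF_MOVE migrates the already-present pages to node 0 */
	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	munmap(buf, len);
	return 0;
}
#endif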
1296 
1297 /*
1298  * User space interface with variable sized bitmaps for nodelists.
1299  */
1300 
1301 /* Copy a node mask from user space. */
1302 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1303 		     unsigned long maxnode)
1304 {
1305 	unsigned long k;
1306 	unsigned long nlongs;
1307 	unsigned long endmask;
1308 
1309 	--maxnode;
1310 	nodes_clear(*nodes);
1311 	if (maxnode == 0 || !nmask)
1312 		return 0;
1313 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1314 		return -EINVAL;
1315 
1316 	nlongs = BITS_TO_LONGS(maxnode);
1317 	if ((maxnode % BITS_PER_LONG) == 0)
1318 		endmask = ~0UL;
1319 	else
1320 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1321 
1322 	/* When the user specifies more nodes than supported, just check
1323 	   that the unsupported part is all zero. */
1324 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1325 		if (nlongs > PAGE_SIZE/sizeof(long))
1326 			return -EINVAL;
1327 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1328 			unsigned long t;
1329 			if (get_user(t, nmask + k))
1330 				return -EFAULT;
1331 			if (k == nlongs - 1) {
1332 				if (t & endmask)
1333 					return -EINVAL;
1334 			} else if (t)
1335 				return -EINVAL;
1336 		}
1337 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1338 		endmask = ~0UL;
1339 	}
1340 
1341 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1342 		return -EFAULT;
1343 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1344 	return 0;
1345 }
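/*
 * Worked example for the masking above: a caller passing maxnode == 17
 * describes bits 0..15.  After the --maxnode, nlongs == 1 and
 * endmask == (1UL << 16) - 1, so any stray bits at position 16 or above
 * in the word copied from user space are cleared before the nodemask is
 * used.
 */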
1346 
1347 /* Copy a kernel node mask to user space */
1348 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1349 			      nodemask_t *nodes)
1350 {
1351 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1352 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1353 
1354 	if (copy > nbytes) {
1355 		if (copy > PAGE_SIZE)
1356 			return -EINVAL;
1357 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1358 			return -EFAULT;
1359 		copy = nbytes;
1360 	}
1361 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1362 }
1363 
1364 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1365 		unsigned long, mode, unsigned long __user *, nmask,
1366 		unsigned long, maxnode, unsigned, flags)
1367 {
1368 	nodemask_t nodes;
1369 	int err;
1370 	unsigned short mode_flags;
1371 
1372 	mode_flags = mode & MPOL_MODE_FLAGS;
1373 	mode &= ~MPOL_MODE_FLAGS;
1374 	if (mode >= MPOL_MAX)
1375 		return -EINVAL;
1376 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1377 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1378 		return -EINVAL;
1379 	err = get_nodes(&nodes, nmask, maxnode);
1380 	if (err)
1381 		return err;
1382 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1383 }
1384 
1385 /* Set the process memory policy */
1386 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1387 		unsigned long, maxnode)
1388 {
1389 	int err;
1390 	nodemask_t nodes;
1391 	unsigned short flags;
1392 
1393 	flags = mode & MPOL_MODE_FLAGS;
1394 	mode &= ~MPOL_MODE_FLAGS;
1395 	if ((unsigned int)mode >= MPOL_MAX)
1396 		return -EINVAL;
1397 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1398 		return -EINVAL;
1399 	err = get_nodes(&nodes, nmask, maxnode);
1400 	if (err)
1401 		return err;
1402 	return do_set_mempolicy(mode, flags, &nodes);
1403 }
1404 
1405 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1406 		const unsigned long __user *, old_nodes,
1407 		const unsigned long __user *, new_nodes)
1408 {
1409 	const struct cred *cred = current_cred(), *tcred;
1410 	struct mm_struct *mm = NULL;
1411 	struct task_struct *task;
1412 	nodemask_t task_nodes;
1413 	int err;
1414 	nodemask_t *old;
1415 	nodemask_t *new;
1416 	NODEMASK_SCRATCH(scratch);
1417 
1418 	if (!scratch)
1419 		return -ENOMEM;
1420 
1421 	old = &scratch->mask1;
1422 	new = &scratch->mask2;
1423 
1424 	err = get_nodes(old, old_nodes, maxnode);
1425 	if (err)
1426 		goto out;
1427 
1428 	err = get_nodes(new, new_nodes, maxnode);
1429 	if (err)
1430 		goto out;
1431 
1432 	/* Find the mm_struct */
1433 	rcu_read_lock();
1434 	task = pid ? find_task_by_vpid(pid) : current;
1435 	if (!task) {
1436 		rcu_read_unlock();
1437 		err = -ESRCH;
1438 		goto out;
1439 	}
1440 	get_task_struct(task);
1441 
1442 	err = -EINVAL;
1443 
1444 	/*
1445 	 * Check if this process has the right to modify the specified
1446 	 * process. The right exists if the process has administrative
1447 	 * capabilities, superuser privileges or the same
1448 	 * userid as the target process.
1449 	 */
1450 	tcred = __task_cred(task);
1451 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1452 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1453 	    !capable(CAP_SYS_NICE)) {
1454 		rcu_read_unlock();
1455 		err = -EPERM;
1456 		goto out_put;
1457 	}
1458 	rcu_read_unlock();
1459 
1460 	task_nodes = cpuset_mems_allowed(task);
1461 	/* Is the user allowed to access the target nodes? */
1462 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1463 		err = -EPERM;
1464 		goto out_put;
1465 	}
1466 
1467 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1468 		err = -EINVAL;
1469 		goto out_put;
1470 	}
1471 
1472 	err = security_task_movememory(task);
1473 	if (err)
1474 		goto out_put;
1475 
1476 	mm = get_task_mm(task);
1477 	put_task_struct(task);
1478 
1479 	if (!mm) {
1480 		err = -EINVAL;
1481 		goto out;
1482 	}
1483 
1484 	err = do_migrate_pages(mm, old, new,
1485 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1486 
1487 	mmput(mm);
1488 out:
1489 	NODEMASK_SCRATCH_FREE(scratch);
1490 
1491 	return err;
1492 
1493 out_put:
1494 	put_task_struct(task);
1495 	goto out;
1496 
1497 }
1498 
1499 
1500 /* Retrieve NUMA policy */
1501 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1502 		unsigned long __user *, nmask, unsigned long, maxnode,
1503 		unsigned long, addr, unsigned long, flags)
1504 {
1505 	int err;
1506 	int uninitialized_var(pval);
1507 	nodemask_t nodes;
1508 
1509 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1510 		return -EINVAL;
1511 
1512 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1513 
1514 	if (err)
1515 		return err;
1516 
1517 	if (policy && put_user(pval, policy))
1518 		return -EFAULT;
1519 
1520 	if (nmask)
1521 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1522 
1523 	return err;
1524 }
1525 
1526 #ifdef CONFIG_COMPAT
1527 
1528 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1529 		       compat_ulong_t __user *, nmask,
1530 		       compat_ulong_t, maxnode,
1531 		       compat_ulong_t, addr, compat_ulong_t, flags)
1532 {
1533 	long err;
1534 	unsigned long __user *nm = NULL;
1535 	unsigned long nr_bits, alloc_size;
1536 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1537 
1538 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1539 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1540 
1541 	if (nmask)
1542 		nm = compat_alloc_user_space(alloc_size);
1543 
1544 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1545 
1546 	if (!err && nmask) {
1547 		unsigned long copy_size;
1548 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1549 		err = copy_from_user(bm, nm, copy_size);
1550 		/* ensure entire bitmap is zeroed */
1551 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1552 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1553 	}
1554 
1555 	return err;
1556 }
1557 
1558 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1559 		       compat_ulong_t, maxnode)
1560 {
1561 	long err = 0;
1562 	unsigned long __user *nm = NULL;
1563 	unsigned long nr_bits, alloc_size;
1564 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1565 
1566 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1567 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1568 
1569 	if (nmask) {
1570 		err = compat_get_bitmap(bm, nmask, nr_bits);
1571 		nm = compat_alloc_user_space(alloc_size);
1572 		err |= copy_to_user(nm, bm, alloc_size);
1573 	}
1574 
1575 	if (err)
1576 		return -EFAULT;
1577 
1578 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1579 }
1580 
1581 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1582 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1583 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1584 {
1585 	long err = 0;
1586 	unsigned long __user *nm = NULL;
1587 	unsigned long nr_bits, alloc_size;
1588 	nodemask_t bm;
1589 
1590 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1591 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1592 
1593 	if (nmask) {
1594 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1595 		nm = compat_alloc_user_space(alloc_size);
1596 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1597 	}
1598 
1599 	if (err)
1600 		return -EFAULT;
1601 
1602 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1603 }
1604 
1605 #endif
1606 
1607 /*
1608  * get_vma_policy(@task, @vma, @addr)
1609  * @task - task for fallback if vma policy == default
1610  * @vma   - virtual memory area whose policy is sought
1611  * @addr  - address in @vma for shared policy lookup
1612  *
1613  * Returns effective policy for a VMA at specified address.
1614  * Falls back to @task or system default policy, as necessary.
1615  * The current or another task's task mempolicy and non-shared vma policies
1616  * must be protected by task_lock(task) by the caller.
1617  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1618  * count--added by the get_policy() vm_op, as appropriate--to protect against
1619  * freeing by another task.  It is the caller's responsibility to free the
1620  * extra reference for shared policies.
1621  */
1622 struct mempolicy *get_vma_policy(struct task_struct *task,
1623 		struct vm_area_struct *vma, unsigned long addr)
1624 {
1625 	struct mempolicy *pol = get_task_policy(task);
1626 
1627 	if (vma) {
1628 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1629 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1630 									addr);
1631 			if (vpol)
1632 				pol = vpol;
1633 		} else if (vma->vm_policy) {
1634 			pol = vma->vm_policy;
1635 
1636 			/*
1637 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1638 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1639 			 * count on these policies which will be dropped by
1640 			 * mpol_cond_put() later
1641 			 */
1642 			if (mpol_needs_cond_ref(pol))
1643 				mpol_get(pol);
1644 		}
1645 	}
1646 	if (!pol)
1647 		pol = &default_policy;
1648 	return pol;
1649 }
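/*
 * Typical calling pattern (a hedged sketch with hypothetical variables,
 * matching the reference-count contract documented above); mpol_cond_put()
 * drops the reference only for MPOL_F_SHARED policies:
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... use pol to choose a zonelist / nodemask for the allocation ...
 *	mpol_cond_put(pol);
 */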
1650 
1651 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1652 {
1653 	struct mempolicy *pol = get_task_policy(task);
1654 	if (vma) {
1655 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1656 			bool ret = false;
1657 
1658 			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1659 			if (pol && (pol->flags & MPOL_F_MOF))
1660 				ret = true;
1661 			mpol_cond_put(pol);
1662 
1663 			return ret;
1664 		} else if (vma->vm_policy) {
1665 			pol = vma->vm_policy;
1666 		}
1667 	}
1668 
1669 	if (!pol)
1670 		return default_policy.flags & MPOL_F_MOF;
1671 
1672 	return pol->flags & MPOL_F_MOF;
1673 }
1674 
1675 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1676 {
1677 	enum zone_type dynamic_policy_zone = policy_zone;
1678 
1679 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1680 
1681 	/*
1682 	 * If policy->v.nodes has movable memory only,
1683 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1684 	 *
1685 	 * policy->v.nodes intersects with node_states[N_MEMORY],
1686 	 * so if the following test fails, it implies that
1687 	 * policy->v.nodes has movable memory only.
1688 	 */
1689 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1690 		dynamic_policy_zone = ZONE_MOVABLE;
1691 
1692 	return zone >= dynamic_policy_zone;
1693 }
1694 
1695 /*
1696  * Return a nodemask representing a mempolicy for filtering nodes for
1697  * page allocation
1698  */
1699 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1700 {
1701 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1702 	if (unlikely(policy->mode == MPOL_BIND) &&
1703 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1704 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1705 		return &policy->v.nodes;
1706 
1707 	return NULL;
1708 }
1709 
1710 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1711 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1712 	int nd)
1713 {
1714 	switch (policy->mode) {
1715 	case MPOL_PREFERRED:
1716 		if (!(policy->flags & MPOL_F_LOCAL))
1717 			nd = policy->v.preferred_node;
1718 		break;
1719 	case MPOL_BIND:
1720 		/*
1721 		 * Normally, MPOL_BIND allocations are node-local within the
1722 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1723 		 * current node isn't part of the mask, we use the zonelist for
1724 		 * the first node in the mask instead.
1725 		 */
1726 		if (unlikely(gfp & __GFP_THISNODE) &&
1727 				unlikely(!node_isset(nd, policy->v.nodes)))
1728 			nd = first_node(policy->v.nodes);
1729 		break;
1730 	default:
1731 		BUG();
1732 	}
1733 	return node_zonelist(nd, gfp);
1734 }
1735 
1736 /* Do dynamic interleaving for a process */
1737 static unsigned interleave_nodes(struct mempolicy *policy)
1738 {
1739 	unsigned nid, next;
1740 	struct task_struct *me = current;
1741 
1742 	nid = me->il_next;
1743 	next = next_node(nid, policy->v.nodes);
1744 	if (next >= MAX_NUMNODES)
1745 		next = first_node(policy->v.nodes);
1746 	if (next < MAX_NUMNODES)
1747 		me->il_next = next;
1748 	return nid;
1749 }
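/*
 * Worked example (arbitrary mask): with policy->v.nodes == {0,2,5} and
 * current->il_next == 2, interleave_nodes() returns 2 and advances
 * il_next to 5; the following call returns 5 and wraps il_next back to 0.
 */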
1750 
1751 /*
1752  * Depending on the memory policy, provide a node from which to allocate the
1753  * next slab entry.
1754  */
1755 unsigned int mempolicy_slab_node(void)
1756 {
1757 	struct mempolicy *policy;
1758 	int node = numa_mem_id();
1759 
1760 	if (in_interrupt())
1761 		return node;
1762 
1763 	policy = current->mempolicy;
1764 	if (!policy || policy->flags & MPOL_F_LOCAL)
1765 		return node;
1766 
1767 	switch (policy->mode) {
1768 	case MPOL_PREFERRED:
1769 		/*
1770 		 * handled MPOL_F_LOCAL above
1771 		 */
1772 		return policy->v.preferred_node;
1773 
1774 	case MPOL_INTERLEAVE:
1775 		return interleave_nodes(policy);
1776 
1777 	case MPOL_BIND: {
1778 		/*
1779 		 * Follow bind policy behavior and start allocation at the
1780 		 * first node.
1781 		 */
1782 		struct zonelist *zonelist;
1783 		struct zone *zone;
1784 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1785 		zonelist = &NODE_DATA(node)->node_zonelists[0];
1786 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1787 							&policy->v.nodes,
1788 							&zone);
1789 		return zone ? zone->node : node;
1790 	}
1791 
1792 	default:
1793 		BUG();
1794 	}
1795 }
1796 
1797 /* Do static interleaving for a VMA with known offset. */
1798 static unsigned offset_il_node(struct mempolicy *pol,
1799 		struct vm_area_struct *vma, unsigned long off)
1800 {
1801 	unsigned nnodes = nodes_weight(pol->v.nodes);
1802 	unsigned target;
1803 	int c;
1804 	int nid = NUMA_NO_NODE;
1805 
1806 	if (!nnodes)
1807 		return numa_node_id();
1808 	target = (unsigned int)off % nnodes;
1809 	c = 0;
1810 	do {
1811 		nid = next_node(nid, pol->v.nodes);
1812 		c++;
1813 	} while (c <= target);
1814 	return nid;
1815 }
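/*
 * Worked example for offset_il_node() (arbitrary values): with an
 * interleave mask of {0,2,4,6} (weight 4) and off == 10, target is
 * 10 % 4 == 2 and the walk returns the third set node, i.e. node 4.
 * Because the result depends only on the offset, a given page of a
 * mapping always lands on the same node, no matter when it is faulted in.
 */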
1816 
1817 /* Determine a node number for interleave */
1818 static inline unsigned interleave_nid(struct mempolicy *pol,
1819 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1820 {
1821 	if (vma) {
1822 		unsigned long off;
1823 
1824 		/*
1825 		 * for small pages, there is no difference between
1826 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1827 		 * for huge pages, since vm_pgoff is in units of small
1828 		 * pages, we need to shift off the always 0 bits to get
1829 		 * a useful offset.
1830 		 */
1831 		BUG_ON(shift < PAGE_SHIFT);
1832 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1833 		off += (addr - vma->vm_start) >> shift;
1834 		return offset_il_node(pol, vma, off);
1835 	} else
1836 		return interleave_nodes(pol);
1837 }
1838 
1839 /*
1840  * Return the bit number of a random bit set in the nodemask.
1841  * (returns NUMA_NO_NODE if nodemask is empty)
1842  */
1843 int node_random(const nodemask_t *maskp)
1844 {
1845 	int w, bit = NUMA_NO_NODE;
1846 
1847 	w = nodes_weight(*maskp);
1848 	if (w)
1849 		bit = bitmap_ord_to_pos(maskp->bits,
1850 			get_random_int() % w, MAX_NUMNODES);
1851 	return bit;
1852 }
1853 
1854 #ifdef CONFIG_HUGETLBFS
1855 /*
1856  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1857  * @vma = virtual memory area whose policy is sought
1858  * @addr = address in @vma for shared policy lookup and interleave policy
1859  * @gfp_flags = for requested zone
1860  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1861  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1862  *
1863  * Returns a zonelist suitable for a huge page allocation and a pointer
1864  * to the struct mempolicy for conditional unref after allocation.
1865  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1866  * @nodemask for filtering the zonelist.
1867  *
1868  * Must be protected by read_mems_allowed_begin()
1869  */
1870 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1871 				gfp_t gfp_flags, struct mempolicy **mpol,
1872 				nodemask_t **nodemask)
1873 {
1874 	struct zonelist *zl;
1875 
1876 	*mpol = get_vma_policy(current, vma, addr);
1877 	*nodemask = NULL;	/* assume !MPOL_BIND */
1878 
1879 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1880 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1881 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1882 	} else {
1883 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1884 		if ((*mpol)->mode == MPOL_BIND)
1885 			*nodemask = &(*mpol)->v.nodes;
1886 	}
1887 	return zl;
1888 }
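/*
 * Usage sketch, for illustration only (gfp stands in for whatever mask the
 * huge page allocator passes; mm/hugetlb.c follows roughly this shape):
 * look up the policy under the cpuset seqcount, allocate from the returned
 * zonelist filtered by *nodemask, then drop the conditional reference.
 *
 *	cookie = read_mems_allowed_begin();
 *	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
 *	... allocate a huge page from zl, honouring nodemask if non-NULL ...
 *	mpol_cond_put(mpol);
 */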
1889 
1890 /*
1891  * init_nodemask_of_mempolicy
1892  *
1893  * If the current task's mempolicy is "default" [NULL], return 'false'
1894  * to indicate default policy.  Otherwise, extract the policy nodemask
1895  * for 'bind' or 'interleave' policy into the argument nodemask, or
1896  * initialize the argument nodemask to contain the single node for
1897  * 'preferred' or 'local' policy and return 'true' to indicate presence
1898  * of non-default mempolicy.
1899  *
1900  * We don't bother with reference counting the mempolicy [mpol_get/put]
1901  * because the current task is examining its own mempolicy and a task's
1902  * mempolicy is only ever changed by the task itself.
1903  *
1904  * N.B., it is the caller's responsibility to free a returned nodemask.
1905  */
1906 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1907 {
1908 	struct mempolicy *mempolicy;
1909 	int nid;
1910 
1911 	if (!(mask && current->mempolicy))
1912 		return false;
1913 
1914 	task_lock(current);
1915 	mempolicy = current->mempolicy;
1916 	switch (mempolicy->mode) {
1917 	case MPOL_PREFERRED:
1918 		if (mempolicy->flags & MPOL_F_LOCAL)
1919 			nid = numa_node_id();
1920 		else
1921 			nid = mempolicy->v.preferred_node;
1922 		init_nodemask_of_node(mask, nid);
1923 		break;
1924 
1925 	case MPOL_BIND:
1926 		/* Fall through */
1927 	case MPOL_INTERLEAVE:
1928 		*mask = mempolicy->v.nodes;
1929 		break;
1930 
1931 	default:
1932 		BUG();
1933 	}
1934 	task_unlock(current);
1935 
1936 	return true;
1937 }
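/*
 * Usage sketch, for illustration only (the hugetlb sysctl handlers use a
 * similar pattern): derive the set of allowed nodes from the caller's
 * mempolicy, falling back to all memory nodes when there is none.
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 */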
1938 #endif
1939 
1940 /*
1941  * mempolicy_nodemask_intersects
1942  *
1943  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1944  * policy.  Otherwise, check for intersection between mask and the policy
1945  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1946  * policy, always return true since it may allocate elsewhere on fallback.
1947  *
1948  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1949  */
1950 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1951 					const nodemask_t *mask)
1952 {
1953 	struct mempolicy *mempolicy;
1954 	bool ret = true;
1955 
1956 	if (!mask)
1957 		return ret;
1958 	task_lock(tsk);
1959 	mempolicy = tsk->mempolicy;
1960 	if (!mempolicy)
1961 		goto out;
1962 
1963 	switch (mempolicy->mode) {
1964 	case MPOL_PREFERRED:
1965 		/*
1966 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferences for where
1967 		 * to allocate from; the task may fall back to other nodes when OOM.
1968 		 * Thus, it's possible for tsk to have allocated memory from
1969 		 * nodes in mask.
1970 		 */
1971 		break;
1972 	case MPOL_BIND:
1973 	case MPOL_INTERLEAVE:
1974 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1975 		break;
1976 	default:
1977 		BUG();
1978 	}
1979 out:
1980 	task_unlock(tsk);
1981 	return ret;
1982 }
1983 
1984 /* Allocate a page in interleaved policy.
1985    Own path because it needs to do special accounting. */
1986 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1987 					unsigned nid)
1988 {
1989 	struct zonelist *zl;
1990 	struct page *page;
1991 
1992 	zl = node_zonelist(nid, gfp);
1993 	page = __alloc_pages(gfp, order, zl);
1994 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1995 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1996 	return page;
1997 }
1998 
1999 /**
2000  * 	alloc_pages_vma	- Allocate a page for a VMA.
2001  *
2002  * 	@gfp:
2003  *      %GFP_USER    user allocation,
2004  *      %GFP_KERNEL  kernel allocation,
2005  *      %GFP_HIGHMEM highmem/user allocation,
2006  *      %GFP_FS      allocation should not call back into a file system,
2007  *      %GFP_ATOMIC  don't sleep.
2008  *
2009  *	@order: Order of the GFP allocation.
2010  * 	@vma:  Pointer to VMA or NULL if not available.
2011  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2012  *
2013  * 	This function allocates a page from the kernel page pool and applies
2014  *	a NUMA policy associated with the VMA or the current process.
2015  *	When VMA is not NULL the caller must hold down_read on the mmap_sem of
2016  *	the mm_struct of the VMA to prevent it from going away. Should be used
2017  *	for all allocations of pages that will be mapped into user space.
2018  * 	Returns NULL when no page can be allocated.
2019  *
2020  *	Should be called with the mmap_sem of the vma's mm_struct held.
2021  */
2022 struct page *
2023 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2024 		unsigned long addr, int node)
2025 {
2026 	struct mempolicy *pol;
2027 	struct page *page;
2028 	unsigned int cpuset_mems_cookie;
2029 
2030 retry_cpuset:
2031 	pol = get_vma_policy(current, vma, addr);
2032 	cpuset_mems_cookie = read_mems_allowed_begin();
2033 
2034 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2035 		unsigned nid;
2036 
2037 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2038 		mpol_cond_put(pol);
2039 		page = alloc_page_interleave(gfp, order, nid);
2040 		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2041 			goto retry_cpuset;
2042 
2043 		return page;
2044 	}
2045 	page = __alloc_pages_nodemask(gfp, order,
2046 				      policy_zonelist(gfp, pol, node),
2047 				      policy_nodemask(gfp, pol));
2048 	if (unlikely(mpol_needs_cond_ref(pol)))
2049 		__mpol_put(pol);
2050 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2051 		goto retry_cpuset;
2052 	return page;
2053 }
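/*
 * Usage sketch, for illustration only: anonymous fault paths typically go
 * through the alloc_page_vma() wrapper in gfp.h with mmap_sem held for read,
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * which expands to alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 * numa_node_id()).
 */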
2054 
2055 /**
2056  * 	alloc_pages_current - Allocate pages.
2057  *
2058  *	@gfp:
2059  *		%GFP_USER   user allocation,
2060  *      	%GFP_KERNEL kernel allocation,
2061  *      	%GFP_HIGHMEM highmem allocation,
2062  *      	%GFP_FS     don't call back into a file system.
2063  *      	%GFP_ATOMIC don't sleep.
2064  *	@order: Power of two of allocation size in pages. 0 is a single page.
2065  *
2066  *	Allocate a page from the kernel page pool and, when not in
2067  *	interrupt context, apply the current process' NUMA policy.
2068  *	Returns NULL when no page can be allocated.
2069  *
2070  *	Don't call cpuset_update_task_memory_state() unless
2071  *	1) it's ok to take cpuset_sem (can WAIT), and
2072  *	2) allocating for current task (not interrupt).
2073  */
2074 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2075 {
2076 	struct mempolicy *pol = get_task_policy(current);
2077 	struct page *page;
2078 	unsigned int cpuset_mems_cookie;
2079 
2080 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2081 		pol = &default_policy;
2082 
2083 retry_cpuset:
2084 	cpuset_mems_cookie = read_mems_allowed_begin();
2085 
2086 	/*
2087 	 * No reference counting needed for current->mempolicy
2088 	 * nor system default_policy
2089 	 */
2090 	if (pol->mode == MPOL_INTERLEAVE)
2091 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2092 	else
2093 		page = __alloc_pages_nodemask(gfp, order,
2094 				policy_zonelist(gfp, pol, numa_node_id()),
2095 				policy_nodemask(gfp, pol));
2096 
2097 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2098 		goto retry_cpuset;
2099 
2100 	return page;
2101 }
2102 EXPORT_SYMBOL(alloc_pages_current);
2103 
2104 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2105 {
2106 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2107 
2108 	if (IS_ERR(pol))
2109 		return PTR_ERR(pol);
2110 	dst->vm_policy = pol;
2111 	return 0;
2112 }
2113 
2114 /*
2115  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2116  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2117  * with the mems_allowed returned by cpuset_mems_allowed().  This
2118  * keeps mempolicies cpuset relative after its cpuset moves.  See
2119  * further kernel/cpuset.c update_nodemask().
2120  *
2121  * current's mempolicy may be rebound by another task (the task that changes
2122  * the cpuset's mems), so we needn't do rebind work for the current task.
2123  */
2124 
2125 /* Slow path of a mempolicy duplicate */
2126 struct mempolicy *__mpol_dup(struct mempolicy *old)
2127 {
2128 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2129 
2130 	if (!new)
2131 		return ERR_PTR(-ENOMEM);
2132 
2133 	/* task's mempolicy is protected by alloc_lock */
2134 	if (old == current->mempolicy) {
2135 		task_lock(current);
2136 		*new = *old;
2137 		task_unlock(current);
2138 	} else
2139 		*new = *old;
2140 
2141 	rcu_read_lock();
2142 	if (current_cpuset_is_being_rebound()) {
2143 		nodemask_t mems = cpuset_mems_allowed(current);
2144 		if (new->flags & MPOL_F_REBINDING)
2145 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2146 		else
2147 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2148 	}
2149 	rcu_read_unlock();
2150 	atomic_set(&new->refcnt, 1);
2151 	return new;
2152 }
2153 
2154 /* Slow path of a mempolicy comparison */
2155 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2156 {
2157 	if (!a || !b)
2158 		return false;
2159 	if (a->mode != b->mode)
2160 		return false;
2161 	if (a->flags != b->flags)
2162 		return false;
2163 	if (mpol_store_user_nodemask(a))
2164 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2165 			return false;
2166 
2167 	switch (a->mode) {
2168 	case MPOL_BIND:
2169 		/* Fall through */
2170 	case MPOL_INTERLEAVE:
2171 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2172 	case MPOL_PREFERRED:
2173 		return a->v.preferred_node == b->v.preferred_node;
2174 	default:
2175 		BUG();
2176 		return false;
2177 	}
2178 }
2179 
2180 /*
2181  * Shared memory backing store policy support.
2182  *
2183  * Remember policies even when nobody has shared memory mapped.
2184  * The policies are kept in Red-Black tree linked from the inode.
2185  * They are protected by the sp->lock spinlock, which should be held
2186  * for any accesses to the tree.
2187  */
2188 
2189 /* lookup first element intersecting start-end */
2190 /* Caller holds sp->lock */
2191 static struct sp_node *
2192 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2193 {
2194 	struct rb_node *n = sp->root.rb_node;
2195 
2196 	while (n) {
2197 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2198 
2199 		if (start >= p->end)
2200 			n = n->rb_right;
2201 		else if (end <= p->start)
2202 			n = n->rb_left;
2203 		else
2204 			break;
2205 	}
2206 	if (!n)
2207 		return NULL;
2208 	for (;;) {
2209 		struct sp_node *w = NULL;
2210 		struct rb_node *prev = rb_prev(n);
2211 		if (!prev)
2212 			break;
2213 		w = rb_entry(prev, struct sp_node, nd);
2214 		if (w->end <= start)
2215 			break;
2216 		n = prev;
2217 	}
2218 	return rb_entry(n, struct sp_node, nd);
2219 }
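/*
 * Worked example, for illustration only: with stored ranges [2,4) and
 * [6,10), sp_lookup(sp, 3, 8) may first hit [6,10) during the rb-tree
 * descent; the rb_prev() loop then rewinds to the lowest node that still
 * overlaps [start, end), here [2,4), so callers can walk every
 * intersecting range forwards with rb_next().
 */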
2220 
2221 /* Insert a new shared policy into the list. */
2222 /* Caller holds sp->lock */
2223 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2224 {
2225 	struct rb_node **p = &sp->root.rb_node;
2226 	struct rb_node *parent = NULL;
2227 	struct sp_node *nd;
2228 
2229 	while (*p) {
2230 		parent = *p;
2231 		nd = rb_entry(parent, struct sp_node, nd);
2232 		if (new->start < nd->start)
2233 			p = &(*p)->rb_left;
2234 		else if (new->end > nd->end)
2235 			p = &(*p)->rb_right;
2236 		else
2237 			BUG();
2238 	}
2239 	rb_link_node(&new->nd, parent, p);
2240 	rb_insert_color(&new->nd, &sp->root);
2241 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2242 		 new->policy ? new->policy->mode : 0);
2243 }
2244 
2245 /* Find shared policy intersecting idx */
2246 struct mempolicy *
2247 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2248 {
2249 	struct mempolicy *pol = NULL;
2250 	struct sp_node *sn;
2251 
2252 	if (!sp->root.rb_node)
2253 		return NULL;
2254 	spin_lock(&sp->lock);
2255 	sn = sp_lookup(sp, idx, idx+1);
2256 	if (sn) {
2257 		mpol_get(sn->policy);
2258 		pol = sn->policy;
2259 	}
2260 	spin_unlock(&sp->lock);
2261 	return pol;
2262 }
2263 
2264 static void sp_free(struct sp_node *n)
2265 {
2266 	mpol_put(n->policy);
2267 	kmem_cache_free(sn_cache, n);
2268 }
2269 
2270 /**
2271  * mpol_misplaced - check whether current page node is valid in policy
2272  *
2273  * @page   - page to be checked
2274  * @vma    - vm area where page mapped
2275  * @addr   - virtual address where page mapped
2276  *
2277  * Look up the current policy node id for vma,addr and compare it to the
2278  * page's node id.
2279  *
2280  * Returns:
2281  *	-1	- not misplaced, page is in the right node
2282  *	node	- node id where the page should be
2283  *
2284  * Policy determination "mimics" alloc_page_vma().
2285  * Called from fault path where we know the vma and faulting address.
2286  */
2287 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2288 {
2289 	struct mempolicy *pol;
2290 	struct zone *zone;
2291 	int curnid = page_to_nid(page);
2292 	unsigned long pgoff;
2293 	int thiscpu = raw_smp_processor_id();
2294 	int thisnid = cpu_to_node(thiscpu);
2295 	int polnid = -1;
2296 	int ret = -1;
2297 
2298 	BUG_ON(!vma);
2299 
2300 	pol = get_vma_policy(current, vma, addr);
2301 	if (!(pol->flags & MPOL_F_MOF))
2302 		goto out;
2303 
2304 	switch (pol->mode) {
2305 	case MPOL_INTERLEAVE:
2306 		BUG_ON(addr >= vma->vm_end);
2307 		BUG_ON(addr < vma->vm_start);
2308 
2309 		pgoff = vma->vm_pgoff;
2310 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2311 		polnid = offset_il_node(pol, vma, pgoff);
2312 		break;
2313 
2314 	case MPOL_PREFERRED:
2315 		if (pol->flags & MPOL_F_LOCAL)
2316 			polnid = numa_node_id();
2317 		else
2318 			polnid = pol->v.preferred_node;
2319 		break;
2320 
2321 	case MPOL_BIND:
2322 		/*
2323 		 * MPOL_BIND allows binding to multiple nodes.
2324 		 * Use the current page's node if it is in the policy nodemask,
2325 		 * else select the nearest allowed node, if any.
2326 		 * If there are no allowed nodes, use the current node [!misplaced].
2327 		 */
2328 		if (node_isset(curnid, pol->v.nodes))
2329 			goto out;
2330 		(void)first_zones_zonelist(
2331 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2332 				gfp_zone(GFP_HIGHUSER),
2333 				&pol->v.nodes, &zone);
2334 		polnid = zone->node;
2335 		break;
2336 
2337 	default:
2338 		BUG();
2339 	}
2340 
2341 	/* Migrate the page towards the node whose CPU is referencing it */
2342 	if (pol->flags & MPOL_F_MORON) {
2343 		polnid = thisnid;
2344 
2345 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2346 			goto out;
2347 	}
2348 
2349 	if (curnid != polnid)
2350 		ret = polnid;
2351 out:
2352 	mpol_cond_put(pol);
2353 
2354 	return ret;
2355 }
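/*
 * Caller sketch, for illustration only (the NUMA hinting fault paths follow
 * roughly this shape): -1 means "leave the page where it is", anything else
 * is the node to migrate towards.
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		... try to migrate the page to target_nid ...
 */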
2356 
2357 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2358 {
2359 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2360 	rb_erase(&n->nd, &sp->root);
2361 	sp_free(n);
2362 }
2363 
2364 static void sp_node_init(struct sp_node *node, unsigned long start,
2365 			unsigned long end, struct mempolicy *pol)
2366 {
2367 	node->start = start;
2368 	node->end = end;
2369 	node->policy = pol;
2370 }
2371 
2372 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2373 				struct mempolicy *pol)
2374 {
2375 	struct sp_node *n;
2376 	struct mempolicy *newpol;
2377 
2378 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2379 	if (!n)
2380 		return NULL;
2381 
2382 	newpol = mpol_dup(pol);
2383 	if (IS_ERR(newpol)) {
2384 		kmem_cache_free(sn_cache, n);
2385 		return NULL;
2386 	}
2387 	newpol->flags |= MPOL_F_SHARED;
2388 	sp_node_init(n, start, end, newpol);
2389 
2390 	return n;
2391 }
2392 
2393 /* Replace a policy range. */
2394 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2395 				 unsigned long end, struct sp_node *new)
2396 {
2397 	struct sp_node *n;
2398 	struct sp_node *n_new = NULL;
2399 	struct mempolicy *mpol_new = NULL;
2400 	int ret = 0;
2401 
2402 restart:
2403 	spin_lock(&sp->lock);
2404 	n = sp_lookup(sp, start, end);
2405 	/* Take care of old policies in the same range. */
2406 	while (n && n->start < end) {
2407 		struct rb_node *next = rb_next(&n->nd);
2408 		if (n->start >= start) {
2409 			if (n->end <= end)
2410 				sp_delete(sp, n);
2411 			else
2412 				n->start = end;
2413 		} else {
2414 			/* Old policy spanning whole new range. */
2415 			if (n->end > end) {
2416 				if (!n_new)
2417 					goto alloc_new;
2418 
2419 				*mpol_new = *n->policy;
2420 				atomic_set(&mpol_new->refcnt, 1);
2421 				sp_node_init(n_new, end, n->end, mpol_new);
2422 				n->end = start;
2423 				sp_insert(sp, n_new);
2424 				n_new = NULL;
2425 				mpol_new = NULL;
2426 				break;
2427 			} else
2428 				n->end = start;
2429 		}
2430 		if (!next)
2431 			break;
2432 		n = rb_entry(next, struct sp_node, nd);
2433 	}
2434 	if (new)
2435 		sp_insert(sp, new);
2436 	spin_unlock(&sp->lock);
2437 	ret = 0;
2438 
2439 err_out:
2440 	if (mpol_new)
2441 		mpol_put(mpol_new);
2442 	if (n_new)
2443 		kmem_cache_free(sn_cache, n_new);
2444 
2445 	return ret;
2446 
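/*
 * sp->lock is a spinlock, so the GFP_KERNEL allocations below may not be
 * made while holding it.  Drop the lock, allocate both the replacement
 * node and mempolicy, then restart the lookup from scratch because the
 * tree may have changed in the meantime.
 */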
2447 alloc_new:
2448 	spin_unlock(&sp->lock);
2449 	ret = -ENOMEM;
2450 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2451 	if (!n_new)
2452 		goto err_out;
2453 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2454 	if (!mpol_new)
2455 		goto err_out;
2456 	goto restart;
2457 }
2458 
2459 /**
2460  * mpol_shared_policy_init - initialize shared policy for inode
2461  * @sp: pointer to inode shared policy
2462  * @mpol:  struct mempolicy to install
2463  *
2464  * Install non-NULL @mpol in inode's shared policy rb-tree.
2465  * On entry, the current task has a reference on a non-NULL @mpol.
2466  * This must be released on exit.
2467  * This is called during get_inode() calls, so GFP_KERNEL allocations are safe.
2468  */
2469 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2470 {
2471 	int ret;
2472 
2473 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2474 	spin_lock_init(&sp->lock);
2475 
2476 	if (mpol) {
2477 		struct vm_area_struct pvma;
2478 		struct mempolicy *new;
2479 		NODEMASK_SCRATCH(scratch);
2480 
2481 		if (!scratch)
2482 			goto put_mpol;
2483 		/* contextualize the tmpfs mount point mempolicy */
2484 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2485 		if (IS_ERR(new))
2486 			goto free_scratch; /* no valid nodemask intersection */
2487 
2488 		task_lock(current);
2489 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2490 		task_unlock(current);
2491 		if (ret)
2492 			goto put_new;
2493 
2494 		/* Create pseudo-vma that contains just the policy */
2495 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2496 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2497 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2498 
2499 put_new:
2500 		mpol_put(new);			/* drop initial ref */
2501 free_scratch:
2502 		NODEMASK_SCRATCH_FREE(scratch);
2503 put_mpol:
2504 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2505 	}
2506 }
2507 
2508 int mpol_set_shared_policy(struct shared_policy *info,
2509 			struct vm_area_struct *vma, struct mempolicy *npol)
2510 {
2511 	int err;
2512 	struct sp_node *new = NULL;
2513 	unsigned long sz = vma_pages(vma);
2514 
2515 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2516 		 vma->vm_pgoff,
2517 		 sz, npol ? npol->mode : -1,
2518 		 npol ? npol->flags : -1,
2519 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2520 
2521 	if (npol) {
2522 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2523 		if (!new)
2524 			return -ENOMEM;
2525 	}
2526 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2527 	if (err && new)
2528 		sp_free(new);
2529 	return err;
2530 }
2531 
2532 /* Free a backing policy store on inode delete. */
2533 void mpol_free_shared_policy(struct shared_policy *p)
2534 {
2535 	struct sp_node *n;
2536 	struct rb_node *next;
2537 
2538 	if (!p->root.rb_node)
2539 		return;
2540 	spin_lock(&p->lock);
2541 	next = rb_first(&p->root);
2542 	while (next) {
2543 		n = rb_entry(next, struct sp_node, nd);
2544 		next = rb_next(&n->nd);
2545 		sp_delete(p, n);
2546 	}
2547 	spin_unlock(&p->lock);
2548 }
2549 
2550 #ifdef CONFIG_NUMA_BALANCING
2551 static int __initdata numabalancing_override;
2552 
2553 static void __init check_numabalancing_enable(void)
2554 {
2555 	bool numabalancing_default = false;
2556 
2557 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2558 		numabalancing_default = true;
2559 
2560 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2561 	if (numabalancing_override)
2562 		set_numabalancing_state(numabalancing_override == 1);
2563 
2564 	if (nr_node_ids > 1 && !numabalancing_override) {
2565 		pr_info("%s automatic NUMA balancing. "
2566 			"Configure with numa_balancing= or the "
2567 			"kernel.numa_balancing sysctl\n",
2568 			numabalancing_default ? "Enabling" : "Disabling");
2569 		set_numabalancing_state(numabalancing_default);
2570 	}
2571 }
2572 
2573 static int __init setup_numabalancing(char *str)
2574 {
2575 	int ret = 0;
2576 	if (!str)
2577 		goto out;
2578 
2579 	if (!strcmp(str, "enable")) {
2580 		numabalancing_override = 1;
2581 		ret = 1;
2582 	} else if (!strcmp(str, "disable")) {
2583 		numabalancing_override = -1;
2584 		ret = 1;
2585 	}
2586 out:
2587 	if (!ret)
2588 		pr_warn("Unable to parse numa_balancing=\n");
2589 
2590 	return ret;
2591 }
2592 __setup("numa_balancing=", setup_numabalancing);
2593 #else
2594 static inline void __init check_numabalancing_enable(void)
2595 {
2596 }
2597 #endif /* CONFIG_NUMA_BALANCING */
2598 
2599 /* assumes fs == KERNEL_DS */
2600 void __init numa_policy_init(void)
2601 {
2602 	nodemask_t interleave_nodes;
2603 	unsigned long largest = 0;
2604 	int nid, prefer = 0;
2605 
2606 	policy_cache = kmem_cache_create("numa_policy",
2607 					 sizeof(struct mempolicy),
2608 					 0, SLAB_PANIC, NULL);
2609 
2610 	sn_cache = kmem_cache_create("shared_policy_node",
2611 				     sizeof(struct sp_node),
2612 				     0, SLAB_PANIC, NULL);
2613 
2614 	for_each_node(nid) {
2615 		preferred_node_policy[nid] = (struct mempolicy) {
2616 			.refcnt = ATOMIC_INIT(1),
2617 			.mode = MPOL_PREFERRED,
2618 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2619 			.v = { .preferred_node = nid, },
2620 		};
2621 	}
2622 
2623 	/*
2624 	 * Set interleaving policy for system init. Interleaving is only
2625 	 * enabled across suitably sized nodes (default is >= 16MB), or
2626 	 * fall back to the largest node if they're all smaller.
2627 	 */
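	/*
	 * Illustrative numbers, assuming 4KB base pages: the 16MB cut-off
	 * below corresponds to node_present_pages(nid) >= 4096, since
	 * (4096 << PAGE_SHIFT) == (16 << 20).
	 */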
2628 	nodes_clear(interleave_nodes);
2629 	for_each_node_state(nid, N_MEMORY) {
2630 		unsigned long total_pages = node_present_pages(nid);
2631 
2632 		/* Preserve the largest node */
2633 		if (largest < total_pages) {
2634 			largest = total_pages;
2635 			prefer = nid;
2636 		}
2637 
2638 		/* Interleave this node? */
2639 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2640 			node_set(nid, interleave_nodes);
2641 	}
2642 
2643 	/* All too small, use the largest */
2644 	if (unlikely(nodes_empty(interleave_nodes)))
2645 		node_set(prefer, interleave_nodes);
2646 
2647 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2648 		pr_err("numa_policy_init: interleaving failed\n");
2649 
2650 	check_numabalancing_enable();
2651 }
2652 
2653 /* Reset policy of current process to default */
2654 void numa_default_policy(void)
2655 {
2656 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2657 }
2658 
2659 /*
2660  * Parse and format mempolicy from/to strings
2661  */
2662 
2663 /*
2664  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2665  */
2666 static const char * const policy_modes[] =
2667 {
2668 	[MPOL_DEFAULT]    = "default",
2669 	[MPOL_PREFERRED]  = "prefer",
2670 	[MPOL_BIND]       = "bind",
2671 	[MPOL_INTERLEAVE] = "interleave",
2672 	[MPOL_LOCAL]      = "local",
2673 };
2674 
2675 
2676 #ifdef CONFIG_TMPFS
2677 /**
2678  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2679  * @str:  string containing mempolicy to parse
2680  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2681  *
2682  * Format of input:
2683  *	<mode>[=<flags>][:<nodelist>]
2684  *
2685  * On success, returns 0, else 1
2686  */
2687 int mpol_parse_str(char *str, struct mempolicy **mpol)
2688 {
2689 	struct mempolicy *new = NULL;
2690 	unsigned short mode;
2691 	unsigned short mode_flags;
2692 	nodemask_t nodes;
2693 	char *nodelist = strchr(str, ':');
2694 	char *flags = strchr(str, '=');
2695 	int err = 1;
2696 
2697 	if (nodelist) {
2698 		/* NUL-terminate mode or flags string */
2699 		*nodelist++ = '\0';
2700 		if (nodelist_parse(nodelist, nodes))
2701 			goto out;
2702 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2703 			goto out;
2704 	} else
2705 		nodes_clear(nodes);
2706 
2707 	if (flags)
2708 		*flags++ = '\0';	/* terminate mode string */
2709 
2710 	for (mode = 0; mode < MPOL_MAX; mode++) {
2711 		if (!strcmp(str, policy_modes[mode])) {
2712 			break;
2713 		}
2714 	}
2715 	if (mode >= MPOL_MAX)
2716 		goto out;
2717 
2718 	switch (mode) {
2719 	case MPOL_PREFERRED:
2720 		/*
2721 		 * Insist on a nodelist of one node only
2722 		 */
2723 		if (nodelist) {
2724 			char *rest = nodelist;
2725 			while (isdigit(*rest))
2726 				rest++;
2727 			if (*rest)
2728 				goto out;
2729 		}
2730 		break;
2731 	case MPOL_INTERLEAVE:
2732 		/*
2733 		 * Default to online nodes with memory if no nodelist
2734 		 */
2735 		if (!nodelist)
2736 			nodes = node_states[N_MEMORY];
2737 		break;
2738 	case MPOL_LOCAL:
2739 		/*
2740 		 * Don't allow a nodelist;  mpol_new() checks flags
2741 		 */
2742 		if (nodelist)
2743 			goto out;
2744 		mode = MPOL_PREFERRED;
2745 		break;
2746 	case MPOL_DEFAULT:
2747 		/*
2748 		 * Insist on an empty nodelist
2749 		 */
2750 		if (!nodelist)
2751 			err = 0;
2752 		goto out;
2753 	case MPOL_BIND:
2754 		/*
2755 		 * Insist on a nodelist
2756 		 */
2757 		if (!nodelist)
2758 			goto out;
2759 	}
2760 
2761 	mode_flags = 0;
2762 	if (flags) {
2763 		/*
2764 		 * Currently, we only support two mutually exclusive
2765 		 * mode flags.
2766 		 */
2767 		if (!strcmp(flags, "static"))
2768 			mode_flags |= MPOL_F_STATIC_NODES;
2769 		else if (!strcmp(flags, "relative"))
2770 			mode_flags |= MPOL_F_RELATIVE_NODES;
2771 		else
2772 			goto out;
2773 	}
2774 
2775 	new = mpol_new(mode, mode_flags, &nodes);
2776 	if (IS_ERR(new))
2777 		goto out;
2778 
2779 	/*
2780 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2781 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2782 	 */
2783 	if (mode != MPOL_PREFERRED)
2784 		new->v.nodes = nodes;
2785 	else if (nodelist)
2786 		new->v.preferred_node = first_node(nodes);
2787 	else
2788 		new->flags |= MPOL_F_LOCAL;
2789 
2790 	/*
2791 	 * Save nodes for contextualization: this will be used to "clone"
2792 	 * the mempolicy in a specific context [cpuset] at a later time.
2793 	 */
2794 	new->w.user_nodemask = nodes;
2795 
2796 	err = 0;
2797 
2798 out:
2799 	/* Restore string for error message */
2800 	if (nodelist)
2801 		*--nodelist = ':';
2802 	if (flags)
2803 		*--flags = '=';
2804 	if (!err)
2805 		*mpol = new;
2806 	return err;
2807 }
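/*
 * Example option strings accepted above, for illustration only (node
 * numbers assume those nodes are online with memory):
 *
 *	"bind:0-3"		MPOL_BIND over nodes 0-3
 *	"interleave=static:0,2"	MPOL_INTERLEAVE with MPOL_F_STATIC_NODES
 *	"prefer:1"		MPOL_PREFERRED with preferred_node == 1
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL set
 */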
2808 #endif /* CONFIG_TMPFS */
2809 
2810 /**
2811  * mpol_to_str - format a mempolicy structure for printing
2812  * @buffer:  to contain formatted mempolicy string
2813  * @maxlen:  length of @buffer
2814  * @pol:  pointer to mempolicy to be formatted
2815  *
2816  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2817  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2818  * longest flag, "relative", and to display at least a few node ids.
2819  */
2820 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2821 {
2822 	char *p = buffer;
2823 	nodemask_t nodes = NODE_MASK_NONE;
2824 	unsigned short mode = MPOL_DEFAULT;
2825 	unsigned short flags = 0;
2826 
2827 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2828 		mode = pol->mode;
2829 		flags = pol->flags;
2830 	}
2831 
2832 	switch (mode) {
2833 	case MPOL_DEFAULT:
2834 		break;
2835 	case MPOL_PREFERRED:
2836 		if (flags & MPOL_F_LOCAL)
2837 			mode = MPOL_LOCAL;
2838 		else
2839 			node_set(pol->v.preferred_node, nodes);
2840 		break;
2841 	case MPOL_BIND:
2842 	case MPOL_INTERLEAVE:
2843 		nodes = pol->v.nodes;
2844 		break;
2845 	default:
2846 		WARN_ON_ONCE(1);
2847 		snprintf(p, maxlen, "unknown");
2848 		return;
2849 	}
2850 
2851 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2852 
2853 	if (flags & MPOL_MODE_FLAGS) {
2854 		p += snprintf(p, buffer + maxlen - p, "=");
2855 
2856 		/*
2857 		 * Currently, the only defined flags are mutually exclusive
2858 		 */
2859 		if (flags & MPOL_F_STATIC_NODES)
2860 			p += snprintf(p, buffer + maxlen - p, "static");
2861 		else if (flags & MPOL_F_RELATIVE_NODES)
2862 			p += snprintf(p, buffer + maxlen - p, "relative");
2863 	}
2864 
2865 	if (!nodes_empty(nodes)) {
2866 		p += snprintf(p, buffer + maxlen - p, ":");
2867 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2868 	}
2869 }
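/*
 * Example output, for illustration only, mirroring the parse format above:
 * an interleave policy with MPOL_F_RELATIVE_NODES over nodes 0-3 is
 * rendered as "interleave=relative:0-3"; the default policy is rendered
 * simply as "default".
 */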
2870