xref: /openbmc/linux/mm/mempolicy.c (revision e23feb16)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to the given memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
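
/*
 * Example (a minimal userspace sketch, assuming a two-node system and an
 * existing mapping at addr/len): the policies above are normally selected
 * through the set_mempolicy(2) and mbind(2) system calls:
 *
 *	unsigned long nodes = 0x3;			(nodes 0 and 1)
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1);
 *		interleave this task's future allocations over nodes 0-1
 *
 *	mbind(addr, len, MPOL_BIND, &nodes, 8 * sizeof(nodes) + 1, MPOL_MF_MOVE);
 *		restrict one mapping to those nodes and migrate its pages
 *
 * The VMA policy installed by mbind() then takes priority over the task
 * policy for faults in that range, as described above.
 */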
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 #include <linux/random.h>
98 
99 #include "internal.h"
100 
101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 	struct mempolicy *pol = p->mempolicy;
126 
127 	if (!pol) {
128 		int node = numa_node_id();
129 
130 		if (node != NUMA_NO_NODE) {
131 			pol = &preferred_node_policy[node];
132 			/*
133 			 * preferred_node_policy is not initialised early in
134 			 * boot
135 			 */
136 			if (!pol->mode)
137 				pol = NULL;
138 		}
139 	}
140 
141 	return pol;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If the read-side task has no lock to protect task->mempolicy, the
148 	 * write-side task will rebind task->mempolicy in two steps. The first
149 	 * step sets all the newly allowed nodes, and the second step clears
150 	 * all the disallowed nodes. This way we avoid a window in which no
151 	 * node is available to allocate a page from.
152 	 * If the read side holds a lock protecting task->mempolicy, we
153 	 * rebind directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE - do rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
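
/*
 * A worked example of the two-step rebind described above (illustrative
 * numbers only): suppose an MPOL_INTERLEAVE policy currently uses nodes
 * {0,1} and the task's cpuset is moved to {2,3}.
 *
 *	MPOL_REBIND_STEP1: pol->v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2: pol->v.nodes = {2,3}
 *
 * Between the two steps every allocation still finds at least one usable
 * node, which is why the rebind is split when the read side is unlocked.
 */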
163 
164 /* Check that the nodemask contains at least one populated zone */
165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 {
167 	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 }
169 
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 {
172 	return pol->flags & MPOL_MODE_FLAGS;
173 }
174 
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176 				   const nodemask_t *rel)
177 {
178 	nodemask_t tmp;
179 	nodes_fold(tmp, *orig, nodes_weight(*rel));
180 	nodes_onto(*ret, tmp, *rel);
181 }
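
/*
 * Worked example (illustrative values): with orig = {0,1} and
 * rel = {4,5,6}, the result is {4,5} -- bit n of the user's relative
 * mask selects the n-th set node of the currently allowed mask, which
 * is the MPOL_F_RELATIVE_NODES behaviour described in set_mempolicy(2).
 */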
182 
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 {
185 	if (nodes_empty(*nodes))
186 		return -EINVAL;
187 	pol->v.nodes = *nodes;
188 	return 0;
189 }
190 
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 {
193 	if (!nodes)
194 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
195 	else if (nodes_empty(*nodes))
196 		return -EINVAL;			/*  no allowed nodes */
197 	else
198 		pol->v.preferred_node = first_node(*nodes);
199 	return 0;
200 }
201 
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 {
204 	if (!is_valid_nodemask(nodes))
205 		return -EINVAL;
206 	pol->v.nodes = *nodes;
207 	return 0;
208 }
209 
210 /*
211  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212  * any, for the new policy.  mpol_new() has already validated the nodes
213  * parameter with respect to the policy mode and flags.  But, we need to
214  * handle an empty nodemask with MPOL_PREFERRED here.
215  *
216  * Must be called holding task's alloc_lock to protect task's mems_allowed
217  * and mempolicy.  May also be called holding the mmap_semaphore for write.
218  */
219 static int mpol_set_nodemask(struct mempolicy *pol,
220 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 {
222 	int ret;
223 
224 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225 	if (pol == NULL)
226 		return 0;
227 	/* Check N_MEMORY */
228 	nodes_and(nsc->mask1,
229 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 
231 	VM_BUG_ON(!nodes);
232 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233 		nodes = NULL;	/* explicit local allocation */
234 	else {
235 		if (pol->flags & MPOL_F_RELATIVE_NODES)
236 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237 		else
238 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 
240 		if (mpol_store_user_nodemask(pol))
241 			pol->w.user_nodemask = *nodes;
242 		else
243 			pol->w.cpuset_mems_allowed =
244 						cpuset_current_mems_allowed;
245 	}
246 
247 	if (nodes)
248 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249 	else
250 		ret = mpol_ops[pol->mode].create(pol, NULL);
251 	return ret;
252 }
253 
254 /*
255  * This function just creates a new policy, does some checks and simple
256  * initialization. You must invoke mpol_set_nodemask() to set nodes.
257  */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 				  nodemask_t *nodes)
260 {
261 	struct mempolicy *policy;
262 
263 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 
266 	if (mode == MPOL_DEFAULT) {
267 		if (nodes && !nodes_empty(*nodes))
268 			return ERR_PTR(-EINVAL);
269 		return NULL;
270 	}
271 	VM_BUG_ON(!nodes);
272 
273 	/*
274 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 	 * All other modes require a valid pointer to a non-empty nodemask.
277 	 */
278 	if (mode == MPOL_PREFERRED) {
279 		if (nodes_empty(*nodes)) {
280 			if (((flags & MPOL_F_STATIC_NODES) ||
281 			     (flags & MPOL_F_RELATIVE_NODES)))
282 				return ERR_PTR(-EINVAL);
283 		}
284 	} else if (mode == MPOL_LOCAL) {
285 		if (!nodes_empty(*nodes))
286 			return ERR_PTR(-EINVAL);
287 		mode = MPOL_PREFERRED;
288 	} else if (nodes_empty(*nodes))
289 		return ERR_PTR(-EINVAL);
290 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291 	if (!policy)
292 		return ERR_PTR(-ENOMEM);
293 	atomic_set(&policy->refcnt, 1);
294 	policy->mode = mode;
295 	policy->flags = flags;
296 
297 	return policy;
298 }
299 
300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p)
302 {
303 	if (!atomic_dec_and_test(&p->refcnt))
304 		return;
305 	kmem_cache_free(policy_cache, p);
306 }
307 
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309 				enum mpol_rebind_step step)
310 {
311 }
312 
313 /*
314  * step:
315  * 	MPOL_REBIND_ONCE  - do rebind work at once
316  * 	MPOL_REBIND_STEP1 - set all the newly nodes
317  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
318  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320 				 enum mpol_rebind_step step)
321 {
322 	nodemask_t tmp;
323 
324 	if (pol->flags & MPOL_F_STATIC_NODES)
325 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 	else {
329 		/*
330 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331 		 * result
332 		 */
333 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334 			nodes_remap(tmp, pol->v.nodes,
335 					pol->w.cpuset_mems_allowed, *nodes);
336 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337 		} else if (step == MPOL_REBIND_STEP2) {
338 			tmp = pol->w.cpuset_mems_allowed;
339 			pol->w.cpuset_mems_allowed = *nodes;
340 		} else
341 			BUG();
342 	}
343 
344 	if (nodes_empty(tmp))
345 		tmp = *nodes;
346 
347 	if (step == MPOL_REBIND_STEP1)
348 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350 		pol->v.nodes = tmp;
351 	else
352 		BUG();
353 
354 	if (!node_isset(current->il_next, tmp)) {
355 		current->il_next = next_node(current->il_next, tmp);
356 		if (current->il_next >= MAX_NUMNODES)
357 			current->il_next = first_node(tmp);
358 		if (current->il_next >= MAX_NUMNODES)
359 			current->il_next = numa_node_id();
360 	}
361 }
362 
363 static void mpol_rebind_preferred(struct mempolicy *pol,
364 				  const nodemask_t *nodes,
365 				  enum mpol_rebind_step step)
366 {
367 	nodemask_t tmp;
368 
369 	if (pol->flags & MPOL_F_STATIC_NODES) {
370 		int node = first_node(pol->w.user_nodemask);
371 
372 		if (node_isset(node, *nodes)) {
373 			pol->v.preferred_node = node;
374 			pol->flags &= ~MPOL_F_LOCAL;
375 		} else
376 			pol->flags |= MPOL_F_LOCAL;
377 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379 		pol->v.preferred_node = first_node(tmp);
380 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
381 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
382 						   pol->w.cpuset_mems_allowed,
383 						   *nodes);
384 		pol->w.cpuset_mems_allowed = *nodes;
385 	}
386 }
387 
388 /*
389  * mpol_rebind_policy - Migrate a policy to a different set of nodes
390  *
391  * If the read-side task has no lock to protect task->mempolicy, the
392  * write-side task will rebind task->mempolicy in two steps. The first
393  * step sets all the newly allowed nodes, and the second step clears
394  * all the disallowed nodes. This way we avoid a window in which no
395  * node is available to allocate a page from.
396  * If the read side holds a lock protecting task->mempolicy, we
397  * rebind directly.
398  *
399  * step:
400  * 	MPOL_REBIND_ONCE  - do rebind work at once
401  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
402  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
403  */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405 				enum mpol_rebind_step step)
406 {
407 	if (!pol)
408 		return;
409 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414 		return;
415 
416 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417 		BUG();
418 
419 	if (step == MPOL_REBIND_STEP1)
420 		pol->flags |= MPOL_F_REBINDING;
421 	else if (step == MPOL_REBIND_STEP2)
422 		pol->flags &= ~MPOL_F_REBINDING;
423 	else if (step >= MPOL_REBIND_NSTEP)
424 		BUG();
425 
426 	mpol_ops[pol->mode].rebind(pol, newmask, step);
427 }
428 
429 /*
430  * Wrapper for mpol_rebind_policy() that just requires the task
431  * pointer, and updates the task's mempolicy.
432  *
433  * Called with task's alloc_lock held.
434  */
435 
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437 			enum mpol_rebind_step step)
438 {
439 	mpol_rebind_policy(tsk->mempolicy, new, step);
440 }
441 
442 /*
443  * Rebind each vma in mm to the new nodemask.
444  *
445  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
446  */
447 
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 {
450 	struct vm_area_struct *vma;
451 
452 	down_write(&mm->mmap_sem);
453 	for (vma = mm->mmap; vma; vma = vma->vm_next)
454 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455 	up_write(&mm->mmap_sem);
456 }
457 
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459 	[MPOL_DEFAULT] = {
460 		.rebind = mpol_rebind_default,
461 	},
462 	[MPOL_INTERLEAVE] = {
463 		.create = mpol_new_interleave,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 	[MPOL_PREFERRED] = {
467 		.create = mpol_new_preferred,
468 		.rebind = mpol_rebind_preferred,
469 	},
470 	[MPOL_BIND] = {
471 		.create = mpol_new_bind,
472 		.rebind = mpol_rebind_nodemask,
473 	},
474 };
475 
476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477 				unsigned long flags);
478 
479 /*
480  * Scan through the pages, checking whether they meet the given conditions,
481  * and move them to the pagelist if they do.
482  */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484 		unsigned long addr, unsigned long end,
485 		const nodemask_t *nodes, unsigned long flags,
486 		void *private)
487 {
488 	pte_t *orig_pte;
489 	pte_t *pte;
490 	spinlock_t *ptl;
491 
492 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493 	do {
494 		struct page *page;
495 		int nid;
496 
497 		if (!pte_present(*pte))
498 			continue;
499 		page = vm_normal_page(vma, addr, *pte);
500 		if (!page)
501 			continue;
502 		/*
503 		 * vm_normal_page() filters out zero pages, but there might
504 		 * still be PageReserved pages to skip, perhaps in a VDSO.
505 		 */
506 		if (PageReserved(page))
507 			continue;
508 		nid = page_to_nid(page);
509 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510 			continue;
511 
512 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513 			migrate_page_add(page, private, flags);
514 		else
515 			break;
516 	} while (pte++, addr += PAGE_SIZE, addr != end);
517 	pte_unmap_unlock(orig_pte, ptl);
518 	return addr != end;
519 }
520 
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 				    void *private)
524 {
525 #ifdef CONFIG_HUGETLB_PAGE
526 	int nid;
527 	struct page *page;
528 
529 	spin_lock(&vma->vm_mm->page_table_lock);
530 	page = pte_page(huge_ptep_get((pte_t *)pmd));
531 	nid = page_to_nid(page);
532 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533 		goto unlock;
534 	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
535 	if (flags & (MPOL_MF_MOVE_ALL) ||
536 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 		isolate_huge_page(page, private);
538 unlock:
539 	spin_unlock(&vma->vm_mm->page_table_lock);
540 #else
541 	BUG();
542 #endif
543 }
544 
545 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
546 		unsigned long addr, unsigned long end,
547 		const nodemask_t *nodes, unsigned long flags,
548 		void *private)
549 {
550 	pmd_t *pmd;
551 	unsigned long next;
552 
553 	pmd = pmd_offset(pud, addr);
554 	do {
555 		next = pmd_addr_end(addr, end);
556 		if (!pmd_present(*pmd))
557 			continue;
558 		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 						flags, private);
561 			continue;
562 		}
563 		split_huge_page_pmd(vma, addr, pmd);
564 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
565 			continue;
566 		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
567 				    flags, private))
568 			return -EIO;
569 	} while (pmd++, addr = next, addr != end);
570 	return 0;
571 }
572 
573 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
574 		unsigned long addr, unsigned long end,
575 		const nodemask_t *nodes, unsigned long flags,
576 		void *private)
577 {
578 	pud_t *pud;
579 	unsigned long next;
580 
581 	pud = pud_offset(pgd, addr);
582 	do {
583 		next = pud_addr_end(addr, end);
584 		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 			continue;
586 		if (pud_none_or_clear_bad(pud))
587 			continue;
588 		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
589 				    flags, private))
590 			return -EIO;
591 	} while (pud++, addr = next, addr != end);
592 	return 0;
593 }
594 
595 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
596 		unsigned long addr, unsigned long end,
597 		const nodemask_t *nodes, unsigned long flags,
598 		void *private)
599 {
600 	pgd_t *pgd;
601 	unsigned long next;
602 
603 	pgd = pgd_offset(vma->vm_mm, addr);
604 	do {
605 		next = pgd_addr_end(addr, end);
606 		if (pgd_none_or_clear_bad(pgd))
607 			continue;
608 		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
609 				    flags, private))
610 			return -EIO;
611 	} while (pgd++, addr = next, addr != end);
612 	return 0;
613 }
614 
615 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
616 /*
617  * This is used to mark a range of virtual addresses as inaccessible.
618  * The markings are later cleared by a NUMA hinting fault. Depending on these
619  * faults, pages may be migrated for better NUMA placement.
620  *
621  * This assumes that NUMA faults are handled using PROT_NONE. If
622  * an architecture makes a different choice, it will need further
623  * changes to the core.
624  */
625 unsigned long change_prot_numa(struct vm_area_struct *vma,
626 			unsigned long addr, unsigned long end)
627 {
628 	int nr_updated;
629 	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
630 
631 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632 	if (nr_updated)
633 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 
635 	return nr_updated;
636 }
637 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 			unsigned long addr, unsigned long end)
640 {
641 	return 0;
642 }
643 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
644 
645 /*
646  * Walk through page tables and collect pages to be migrated.
647  *
648  * If pages found in a given range are on a set of nodes (determined by
649  * @nodes and @flags), they are isolated and queued to the pagelist
650  * passed via @private.
651  */
652 static struct vm_area_struct *
653 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654 		const nodemask_t *nodes, unsigned long flags, void *private)
655 {
656 	int err;
657 	struct vm_area_struct *first, *vma, *prev;
658 
659 
660 	first = find_vma(mm, start);
661 	if (!first)
662 		return ERR_PTR(-EFAULT);
663 	prev = NULL;
664 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665 		unsigned long endvma = vma->vm_end;
666 
667 		if (endvma > end)
668 			endvma = end;
669 		if (vma->vm_start > start)
670 			start = vma->vm_start;
671 
672 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673 			if (!vma->vm_next && vma->vm_end < end)
674 				return ERR_PTR(-EFAULT);
675 			if (prev && prev->vm_end < vma->vm_start)
676 				return ERR_PTR(-EFAULT);
677 		}
678 
679 		if (flags & MPOL_MF_LAZY) {
680 			change_prot_numa(vma, start, endvma);
681 			goto next;
682 		}
683 
684 		if ((flags & MPOL_MF_STRICT) ||
685 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 		      vma_migratable(vma))) {
687 
688 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 						flags, private);
690 			if (err) {
691 				first = ERR_PTR(err);
692 				break;
693 			}
694 		}
695 next:
696 		prev = vma;
697 	}
698 	return first;
699 }
700 
701 /*
702  * Apply policy to a single VMA.
703  * This must be called with the mmap_sem held for writing.
704  */
705 static int vma_replace_policy(struct vm_area_struct *vma,
706 						struct mempolicy *pol)
707 {
708 	int err;
709 	struct mempolicy *old;
710 	struct mempolicy *new;
711 
712 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
714 		 vma->vm_ops, vma->vm_file,
715 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716 
717 	new = mpol_dup(pol);
718 	if (IS_ERR(new))
719 		return PTR_ERR(new);
720 
721 	if (vma->vm_ops && vma->vm_ops->set_policy) {
722 		err = vma->vm_ops->set_policy(vma, new);
723 		if (err)
724 			goto err_out;
725 	}
726 
727 	old = vma->vm_policy;
728 	vma->vm_policy = new; /* protected by mmap_sem */
729 	mpol_put(old);
730 
731 	return 0;
732  err_out:
733 	mpol_put(new);
734 	return err;
735 }
736 
737 /* Step 2: apply policy to a range and do splits. */
738 static int mbind_range(struct mm_struct *mm, unsigned long start,
739 		       unsigned long end, struct mempolicy *new_pol)
740 {
741 	struct vm_area_struct *next;
742 	struct vm_area_struct *prev;
743 	struct vm_area_struct *vma;
744 	int err = 0;
745 	pgoff_t pgoff;
746 	unsigned long vmstart;
747 	unsigned long vmend;
748 
749 	vma = find_vma(mm, start);
750 	if (!vma || vma->vm_start > start)
751 		return -EFAULT;
752 
753 	prev = vma->vm_prev;
754 	if (start > vma->vm_start)
755 		prev = vma;
756 
757 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758 		next = vma->vm_next;
759 		vmstart = max(start, vma->vm_start);
760 		vmend   = min(end, vma->vm_end);
761 
762 		if (mpol_equal(vma_policy(vma), new_pol))
763 			continue;
764 
765 		pgoff = vma->vm_pgoff +
766 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
767 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768 				  vma->anon_vma, vma->vm_file, pgoff,
769 				  new_pol);
770 		if (prev) {
771 			vma = prev;
772 			next = vma->vm_next;
773 			if (mpol_equal(vma_policy(vma), new_pol))
774 				continue;
775 			/* vma_merge() joined vma && vma->next, case 8 */
776 			goto replace;
777 		}
778 		if (vma->vm_start != vmstart) {
779 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
780 			if (err)
781 				goto out;
782 		}
783 		if (vma->vm_end != vmend) {
784 			err = split_vma(vma->vm_mm, vma, vmend, 0);
785 			if (err)
786 				goto out;
787 		}
788  replace:
789 		err = vma_replace_policy(vma, new_pol);
790 		if (err)
791 			goto out;
792 	}
793 
794  out:
795 	return err;
796 }
797 
798 /*
799  * Update task->flags PF_MEMPOLICY bit: set iff non-default
800  * mempolicy.  Allows more rapid checking of this (combined perhaps
801  * with other PF_* flag bits) on memory allocation hot code paths.
802  *
803  * If called from outside this file, the task 'p' should -only- be
804  * a newly forked child not yet visible on the task list, because
805  * manipulating the task flags of a visible task is not safe.
806  *
807  * The above limitation is why this routine has the funny name
808  * mpol_fix_fork_child_flag().
809  *
810  * It is also safe to call this with a task pointer of current,
811  * which the static wrapper mpol_set_task_struct_flag() does,
812  * for use within this file.
813  */
814 
815 void mpol_fix_fork_child_flag(struct task_struct *p)
816 {
817 	if (p->mempolicy)
818 		p->flags |= PF_MEMPOLICY;
819 	else
820 		p->flags &= ~PF_MEMPOLICY;
821 }
822 
823 static void mpol_set_task_struct_flag(void)
824 {
825 	mpol_fix_fork_child_flag(current);
826 }
827 
828 /* Set the process memory policy */
829 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
830 			     nodemask_t *nodes)
831 {
832 	struct mempolicy *new, *old;
833 	struct mm_struct *mm = current->mm;
834 	NODEMASK_SCRATCH(scratch);
835 	int ret;
836 
837 	if (!scratch)
838 		return -ENOMEM;
839 
840 	new = mpol_new(mode, flags, nodes);
841 	if (IS_ERR(new)) {
842 		ret = PTR_ERR(new);
843 		goto out;
844 	}
845 	/*
846 	 * prevent changing our mempolicy while show_numa_maps()
847 	 * is using it.
848 	 * Note:  do_set_mempolicy() can be called at init time
849 	 * with no 'mm'.
850 	 */
851 	if (mm)
852 		down_write(&mm->mmap_sem);
853 	task_lock(current);
854 	ret = mpol_set_nodemask(new, nodes, scratch);
855 	if (ret) {
856 		task_unlock(current);
857 		if (mm)
858 			up_write(&mm->mmap_sem);
859 		mpol_put(new);
860 		goto out;
861 	}
862 	old = current->mempolicy;
863 	current->mempolicy = new;
864 	mpol_set_task_struct_flag();
865 	if (new && new->mode == MPOL_INTERLEAVE &&
866 	    nodes_weight(new->v.nodes))
867 		current->il_next = first_node(new->v.nodes);
868 	task_unlock(current);
869 	if (mm)
870 		up_write(&mm->mmap_sem);
871 
872 	mpol_put(old);
873 	ret = 0;
874 out:
875 	NODEMASK_SCRATCH_FREE(scratch);
876 	return ret;
877 }
878 
879 /*
880  * Return the nodemask of the given policy for a get_mempolicy() query
881  *
882  * Called with task's alloc_lock held
883  */
884 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
885 {
886 	nodes_clear(*nodes);
887 	if (p == &default_policy)
888 		return;
889 
890 	switch (p->mode) {
891 	case MPOL_BIND:
892 		/* Fall through */
893 	case MPOL_INTERLEAVE:
894 		*nodes = p->v.nodes;
895 		break;
896 	case MPOL_PREFERRED:
897 		if (!(p->flags & MPOL_F_LOCAL))
898 			node_set(p->v.preferred_node, *nodes);
899 		/* else return empty node mask for local allocation */
900 		break;
901 	default:
902 		BUG();
903 	}
904 }
905 
906 static int lookup_node(struct mm_struct *mm, unsigned long addr)
907 {
908 	struct page *p;
909 	int err;
910 
911 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
912 	if (err >= 0) {
913 		err = page_to_nid(p);
914 		put_page(p);
915 	}
916 	return err;
917 }
918 
919 /* Retrieve NUMA policy */
920 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
921 			     unsigned long addr, unsigned long flags)
922 {
923 	int err;
924 	struct mm_struct *mm = current->mm;
925 	struct vm_area_struct *vma = NULL;
926 	struct mempolicy *pol = current->mempolicy;
927 
928 	if (flags &
929 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
930 		return -EINVAL;
931 
932 	if (flags & MPOL_F_MEMS_ALLOWED) {
933 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
934 			return -EINVAL;
935 		*policy = 0;	/* just so it's initialized */
936 		task_lock(current);
937 		*nmask  = cpuset_current_mems_allowed;
938 		task_unlock(current);
939 		return 0;
940 	}
941 
942 	if (flags & MPOL_F_ADDR) {
943 		/*
944 		 * Do NOT fall back to task policy if the
945 		 * vma/shared policy at addr is NULL.  We
946 		 * want to return MPOL_DEFAULT in this case.
947 		 */
948 		down_read(&mm->mmap_sem);
949 		vma = find_vma_intersection(mm, addr, addr+1);
950 		if (!vma) {
951 			up_read(&mm->mmap_sem);
952 			return -EFAULT;
953 		}
954 		if (vma->vm_ops && vma->vm_ops->get_policy)
955 			pol = vma->vm_ops->get_policy(vma, addr);
956 		else
957 			pol = vma->vm_policy;
958 	} else if (addr)
959 		return -EINVAL;
960 
961 	if (!pol)
962 		pol = &default_policy;	/* indicates default behavior */
963 
964 	if (flags & MPOL_F_NODE) {
965 		if (flags & MPOL_F_ADDR) {
966 			err = lookup_node(mm, addr);
967 			if (err < 0)
968 				goto out;
969 			*policy = err;
970 		} else if (pol == current->mempolicy &&
971 				pol->mode == MPOL_INTERLEAVE) {
972 			*policy = current->il_next;
973 		} else {
974 			err = -EINVAL;
975 			goto out;
976 		}
977 	} else {
978 		*policy = pol == &default_policy ? MPOL_DEFAULT :
979 						pol->mode;
980 		/*
981 		 * Internal mempolicy flags must be masked off before exposing
982 		 * the policy to userspace.
983 		 */
984 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
985 	}
986 
987 	if (vma) {
988 		up_read(&current->mm->mmap_sem);
989 		vma = NULL;
990 	}
991 
992 	err = 0;
993 	if (nmask) {
994 		if (mpol_store_user_nodemask(pol)) {
995 			*nmask = pol->w.user_nodemask;
996 		} else {
997 			task_lock(current);
998 			get_policy_nodemask(pol, nmask);
999 			task_unlock(current);
1000 		}
1001 	}
1002 
1003  out:
1004 	mpol_cond_put(pol);
1005 	if (vma)
1006 		up_read(&current->mm->mmap_sem);
1007 	return err;
1008 }
1009 
1010 #ifdef CONFIG_MIGRATION
1011 /*
1012  * page migration
1013  */
1014 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1015 				unsigned long flags)
1016 {
1017 	/*
1018 	 * Avoid migrating a page that is shared with others.
1019 	 */
1020 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
1021 		if (!isolate_lru_page(page)) {
1022 			list_add_tail(&page->lru, pagelist);
1023 			inc_zone_page_state(page, NR_ISOLATED_ANON +
1024 					    page_is_file_cache(page));
1025 		}
1026 	}
1027 }
1028 
1029 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1030 {
1031 	if (PageHuge(page))
1032 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1033 					node);
1034 	else
1035 		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1036 }
1037 
1038 /*
1039  * Migrate pages from one node to a target node.
1040  * Returns error or the number of pages not migrated.
1041  */
1042 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1043 			   int flags)
1044 {
1045 	nodemask_t nmask;
1046 	LIST_HEAD(pagelist);
1047 	int err = 0;
1048 
1049 	nodes_clear(nmask);
1050 	node_set(source, nmask);
1051 
1052 	/*
1053 	 * This does not "check" the range but isolates all pages that
1054 	 * need migration.  Between passing in the full user address
1055  * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1056 	 */
1057 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1058 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1059 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1060 
1061 	if (!list_empty(&pagelist)) {
1062 		err = migrate_pages(&pagelist, new_node_page, dest,
1063 					MIGRATE_SYNC, MR_SYSCALL);
1064 		if (err)
1065 			putback_movable_pages(&pagelist);
1066 	}
1067 
1068 	return err;
1069 }
1070 
1071 /*
1072  * Move pages between the two nodesets so as to preserve the physical
1073  * layout as much as possible.
1074  *
1075  * Returns the number of pages that could not be moved.
1076  */
1077 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1078 		     const nodemask_t *to, int flags)
1079 {
1080 	int busy = 0;
1081 	int err;
1082 	nodemask_t tmp;
1083 
1084 	err = migrate_prep();
1085 	if (err)
1086 		return err;
1087 
1088 	down_read(&mm->mmap_sem);
1089 
1090 	err = migrate_vmas(mm, from, to, flags);
1091 	if (err)
1092 		goto out;
1093 
1094 	/*
1095 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1096 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1097 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1098 	 * The pair of nodemasks 'to' and 'from' define the map.
1099 	 *
1100  * If no pair of bits is found that way, fall back to picking some
1101 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1102 	 * 'source' and 'dest' bits are the same, this represents a node
1103 	 * that will be migrating to itself, so no pages need move.
1104 	 *
1105 	 * If no bits are left in 'tmp', or if all remaining bits left
1106 	 * in 'tmp' correspond to the same bit in 'to', return false
1107 	 * (nothing left to migrate).
1108 	 *
1109 	 * This lets us pick a pair of nodes to migrate between, such that
1110 	 * if possible the dest node is not already occupied by some other
1111  * source node, minimizing the risk of overloading the memory on a
1112  * node, which would happen if we migrated incoming memory to a node
1113  * before migrating outgoing memory from that same node.
1114 	 *
1115 	 * A single scan of tmp is sufficient.  As we go, we remember the
1116 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1117 	 * that not only moved, but what's better, moved to an empty slot
1118 	 * (d is not set in tmp), then we break out then, with that pair.
1119  * Otherwise when we finish scanning tmp, we at least have the
1120 	 * most recent <s, d> pair that moved.  If we get all the way through
1121 	 * the scan of tmp without finding any node that moved, much less
1122 	 * moved to an empty node, then there is nothing left worth migrating.
1123 	 */
1124 
1125 	tmp = *from;
1126 	while (!nodes_empty(tmp)) {
1127 		int s,d;
1128 		int source = -1;
1129 		int dest = 0;
1130 
1131 		for_each_node_mask(s, tmp) {
1132 
1133 			/*
1134 			 * do_migrate_pages() tries to maintain the relative
1135 			 * node relationship of the pages established between
1136 			 * threads and memory areas.
1137 			 *
1138 			 * However, if the number of source nodes is not equal to
1139 			 * the number of destination nodes we cannot preserve
1140 			 * this node relative relationship.  In that case, skip
1141 			 * copying memory from a node that is in the destination
1142 			 * mask.
1143 			 *
1144 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1145 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1146 			 */
1147 
1148 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1149 						(node_isset(s, *to)))
1150 				continue;
1151 
1152 			d = node_remap(s, *from, *to);
1153 			if (s == d)
1154 				continue;
1155 
1156 			source = s;	/* Node moved. Memorize */
1157 			dest = d;
1158 
1159 			/* dest not in remaining from nodes? */
1160 			if (!node_isset(dest, tmp))
1161 				break;
1162 		}
1163 		if (source == -1)
1164 			break;
1165 
1166 		node_clear(source, tmp);
1167 		err = migrate_to_node(mm, source, dest, flags);
1168 		if (err > 0)
1169 			busy += err;
1170 		if (err < 0)
1171 			break;
1172 	}
1173 out:
1174 	up_read(&mm->mmap_sem);
1175 	if (err < 0)
1176 		return err;
1177 	return busy;
1178 
1179 }
1180 
1181 /*
1182  * Allocate a new page for page migration based on vma policy.
1183  * Start by assuming that the page is mapped by the vma pointed to by @private.
1184  * Search forward from there, if not.  N.B., this assumes that the
1185  * list of pages handed to migrate_pages()--which is how we get here--
1186  * is in virtual address order.
1187  */
1188 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1189 {
1190 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1191 	unsigned long uninitialized_var(address);
1192 
1193 	while (vma) {
1194 		address = page_address_in_vma(page, vma);
1195 		if (address != -EFAULT)
1196 			break;
1197 		vma = vma->vm_next;
1198 	}
1199 	/*
1200 	 * queue_pages_range() confirms that @page belongs to some vma,
1201 	 * so vma shouldn't be NULL.
1202 	 */
1203 	BUG_ON(!vma);
1204 
1205 	if (PageHuge(page))
1206 		return alloc_huge_page_noerr(vma, address, 1);
1207 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1208 }
1209 #else
1210 
1211 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1212 				unsigned long flags)
1213 {
1214 }
1215 
1216 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1217 		     const nodemask_t *to, int flags)
1218 {
1219 	return -ENOSYS;
1220 }
1221 
1222 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1223 {
1224 	return NULL;
1225 }
1226 #endif
1227 
1228 static long do_mbind(unsigned long start, unsigned long len,
1229 		     unsigned short mode, unsigned short mode_flags,
1230 		     nodemask_t *nmask, unsigned long flags)
1231 {
1232 	struct vm_area_struct *vma;
1233 	struct mm_struct *mm = current->mm;
1234 	struct mempolicy *new;
1235 	unsigned long end;
1236 	int err;
1237 	LIST_HEAD(pagelist);
1238 
1239 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1240 		return -EINVAL;
1241 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1242 		return -EPERM;
1243 
1244 	if (start & ~PAGE_MASK)
1245 		return -EINVAL;
1246 
1247 	if (mode == MPOL_DEFAULT)
1248 		flags &= ~MPOL_MF_STRICT;
1249 
1250 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1251 	end = start + len;
1252 
1253 	if (end < start)
1254 		return -EINVAL;
1255 	if (end == start)
1256 		return 0;
1257 
1258 	new = mpol_new(mode, mode_flags, nmask);
1259 	if (IS_ERR(new))
1260 		return PTR_ERR(new);
1261 
1262 	if (flags & MPOL_MF_LAZY)
1263 		new->flags |= MPOL_F_MOF;
1264 
1265 	/*
1266 	 * If we are using the default policy then operating
1267 	 * on discontinuous address spaces is okay after all.
1268 	 */
1269 	if (!new)
1270 		flags |= MPOL_MF_DISCONTIG_OK;
1271 
1272 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1273 		 start, start + len, mode, mode_flags,
1274 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1275 
1276 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1277 
1278 		err = migrate_prep();
1279 		if (err)
1280 			goto mpol_out;
1281 	}
1282 	{
1283 		NODEMASK_SCRATCH(scratch);
1284 		if (scratch) {
1285 			down_write(&mm->mmap_sem);
1286 			task_lock(current);
1287 			err = mpol_set_nodemask(new, nmask, scratch);
1288 			task_unlock(current);
1289 			if (err)
1290 				up_write(&mm->mmap_sem);
1291 		} else
1292 			err = -ENOMEM;
1293 		NODEMASK_SCRATCH_FREE(scratch);
1294 	}
1295 	if (err)
1296 		goto mpol_out;
1297 
1298 	vma = queue_pages_range(mm, start, end, nmask,
1299 			  flags | MPOL_MF_INVERT, &pagelist);
1300 
1301 	err = PTR_ERR(vma);	/* maybe ... */
1302 	if (!IS_ERR(vma))
1303 		err = mbind_range(mm, start, end, new);
1304 
1305 	if (!err) {
1306 		int nr_failed = 0;
1307 
1308 		if (!list_empty(&pagelist)) {
1309 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1310 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1311 					(unsigned long)vma,
1312 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1313 			if (nr_failed)
1314 				putback_movable_pages(&pagelist);
1315 		}
1316 
1317 		if (nr_failed && (flags & MPOL_MF_STRICT))
1318 			err = -EIO;
1319 	} else
1320 		putback_lru_pages(&pagelist);
1321 
1322 	up_write(&mm->mmap_sem);
1323  mpol_out:
1324 	mpol_put(new);
1325 	return err;
1326 }
1327 
1328 /*
1329  * User space interface with variable sized bitmaps for nodelists.
1330  */
1331 
1332 /* Copy a node mask from user space. */
1333 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1334 		     unsigned long maxnode)
1335 {
1336 	unsigned long k;
1337 	unsigned long nlongs;
1338 	unsigned long endmask;
1339 
1340 	--maxnode;
1341 	nodes_clear(*nodes);
1342 	if (maxnode == 0 || !nmask)
1343 		return 0;
1344 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1345 		return -EINVAL;
1346 
1347 	nlongs = BITS_TO_LONGS(maxnode);
1348 	if ((maxnode % BITS_PER_LONG) == 0)
1349 		endmask = ~0UL;
1350 	else
1351 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1352 
1353 	/* When the user specifies more nodes than supported, just check
1354 	   that the unsupported part is all zero. */
1355 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1356 		if (nlongs > PAGE_SIZE/sizeof(long))
1357 			return -EINVAL;
1358 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1359 			unsigned long t;
1360 			if (get_user(t, nmask + k))
1361 				return -EFAULT;
1362 			if (k == nlongs - 1) {
1363 				if (t & endmask)
1364 					return -EINVAL;
1365 			} else if (t)
1366 				return -EINVAL;
1367 		}
1368 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1369 		endmask = ~0UL;
1370 	}
1371 
1372 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1373 		return -EFAULT;
1374 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1375 	return 0;
1376 }
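
/*
 * Worked example for the arithmetic above (assuming 64-bit longs): for
 * maxnode = 65 the decrement leaves 64 significant bits, so
 * nlongs = BITS_TO_LONGS(64) = 1 and endmask = ~0UL.  For maxnode = 17,
 * 16 bits remain, nlongs is still 1 and endmask = (1UL << 16) - 1, so
 * everything above bit 15 is masked off in the copied nodemask.
 */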
1377 
1378 /* Copy a kernel node mask to user space */
1379 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1380 			      nodemask_t *nodes)
1381 {
1382 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1383 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1384 
1385 	if (copy > nbytes) {
1386 		if (copy > PAGE_SIZE)
1387 			return -EINVAL;
1388 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1389 			return -EFAULT;
1390 		copy = nbytes;
1391 	}
1392 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1393 }
1394 
1395 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1396 		unsigned long, mode, unsigned long __user *, nmask,
1397 		unsigned long, maxnode, unsigned, flags)
1398 {
1399 	nodemask_t nodes;
1400 	int err;
1401 	unsigned short mode_flags;
1402 
1403 	mode_flags = mode & MPOL_MODE_FLAGS;
1404 	mode &= ~MPOL_MODE_FLAGS;
1405 	if (mode >= MPOL_MAX)
1406 		return -EINVAL;
1407 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1408 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1409 		return -EINVAL;
1410 	err = get_nodes(&nodes, nmask, maxnode);
1411 	if (err)
1412 		return err;
1413 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1414 }
1415 
1416 /* Set the process memory policy */
1417 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1418 		unsigned long, maxnode)
1419 {
1420 	int err;
1421 	nodemask_t nodes;
1422 	unsigned short flags;
1423 
1424 	flags = mode & MPOL_MODE_FLAGS;
1425 	mode &= ~MPOL_MODE_FLAGS;
1426 	if ((unsigned int)mode >= MPOL_MAX)
1427 		return -EINVAL;
1428 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1429 		return -EINVAL;
1430 	err = get_nodes(&nodes, nmask, maxnode);
1431 	if (err)
1432 		return err;
1433 	return do_set_mempolicy(mode, flags, &nodes);
1434 }
1435 
1436 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1437 		const unsigned long __user *, old_nodes,
1438 		const unsigned long __user *, new_nodes)
1439 {
1440 	const struct cred *cred = current_cred(), *tcred;
1441 	struct mm_struct *mm = NULL;
1442 	struct task_struct *task;
1443 	nodemask_t task_nodes;
1444 	int err;
1445 	nodemask_t *old;
1446 	nodemask_t *new;
1447 	NODEMASK_SCRATCH(scratch);
1448 
1449 	if (!scratch)
1450 		return -ENOMEM;
1451 
1452 	old = &scratch->mask1;
1453 	new = &scratch->mask2;
1454 
1455 	err = get_nodes(old, old_nodes, maxnode);
1456 	if (err)
1457 		goto out;
1458 
1459 	err = get_nodes(new, new_nodes, maxnode);
1460 	if (err)
1461 		goto out;
1462 
1463 	/* Find the mm_struct */
1464 	rcu_read_lock();
1465 	task = pid ? find_task_by_vpid(pid) : current;
1466 	if (!task) {
1467 		rcu_read_unlock();
1468 		err = -ESRCH;
1469 		goto out;
1470 	}
1471 	get_task_struct(task);
1472 
1473 	err = -EINVAL;
1474 
1475 	/*
1476 	 * Check if this process has the right to modify the specified
1477 	 * process. The right exists if the process has administrative
1478 	 * capabilities, superuser privileges or the same
1479 	 * userid as the target process.
1480 	 */
1481 	tcred = __task_cred(task);
1482 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1483 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1484 	    !capable(CAP_SYS_NICE)) {
1485 		rcu_read_unlock();
1486 		err = -EPERM;
1487 		goto out_put;
1488 	}
1489 	rcu_read_unlock();
1490 
1491 	task_nodes = cpuset_mems_allowed(task);
1492 	/* Is the user allowed to access the target nodes? */
1493 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1494 		err = -EPERM;
1495 		goto out_put;
1496 	}
1497 
1498 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1499 		err = -EINVAL;
1500 		goto out_put;
1501 	}
1502 
1503 	err = security_task_movememory(task);
1504 	if (err)
1505 		goto out_put;
1506 
1507 	mm = get_task_mm(task);
1508 	put_task_struct(task);
1509 
1510 	if (!mm) {
1511 		err = -EINVAL;
1512 		goto out;
1513 	}
1514 
1515 	err = do_migrate_pages(mm, old, new,
1516 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1517 
1518 	mmput(mm);
1519 out:
1520 	NODEMASK_SCRATCH_FREE(scratch);
1521 
1522 	return err;
1523 
1524 out_put:
1525 	put_task_struct(task);
1526 	goto out;
1527 
1528 }
1529 
1530 
1531 /* Retrieve NUMA policy */
1532 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1533 		unsigned long __user *, nmask, unsigned long, maxnode,
1534 		unsigned long, addr, unsigned long, flags)
1535 {
1536 	int err;
1537 	int uninitialized_var(pval);
1538 	nodemask_t nodes;
1539 
1540 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1541 		return -EINVAL;
1542 
1543 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1544 
1545 	if (err)
1546 		return err;
1547 
1548 	if (policy && put_user(pval, policy))
1549 		return -EFAULT;
1550 
1551 	if (nmask)
1552 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1553 
1554 	return err;
1555 }
1556 
1557 #ifdef CONFIG_COMPAT
1558 
1559 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1560 				     compat_ulong_t __user *nmask,
1561 				     compat_ulong_t maxnode,
1562 				     compat_ulong_t addr, compat_ulong_t flags)
1563 {
1564 	long err;
1565 	unsigned long __user *nm = NULL;
1566 	unsigned long nr_bits, alloc_size;
1567 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1568 
1569 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1570 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1571 
1572 	if (nmask)
1573 		nm = compat_alloc_user_space(alloc_size);
1574 
1575 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1576 
1577 	if (!err && nmask) {
1578 		unsigned long copy_size;
1579 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1580 		err = copy_from_user(bm, nm, copy_size);
1581 		/* ensure entire bitmap is zeroed */
1582 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1583 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1584 	}
1585 
1586 	return err;
1587 }
1588 
1589 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1590 				     compat_ulong_t maxnode)
1591 {
1592 	long err = 0;
1593 	unsigned long __user *nm = NULL;
1594 	unsigned long nr_bits, alloc_size;
1595 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1596 
1597 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1598 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1599 
1600 	if (nmask) {
1601 		err = compat_get_bitmap(bm, nmask, nr_bits);
1602 		nm = compat_alloc_user_space(alloc_size);
1603 		err |= copy_to_user(nm, bm, alloc_size);
1604 	}
1605 
1606 	if (err)
1607 		return -EFAULT;
1608 
1609 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1610 }
1611 
1612 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1613 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1614 			     compat_ulong_t maxnode, compat_ulong_t flags)
1615 {
1616 	long err = 0;
1617 	unsigned long __user *nm = NULL;
1618 	unsigned long nr_bits, alloc_size;
1619 	nodemask_t bm;
1620 
1621 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1622 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1623 
1624 	if (nmask) {
1625 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1626 		nm = compat_alloc_user_space(alloc_size);
1627 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1628 	}
1629 
1630 	if (err)
1631 		return -EFAULT;
1632 
1633 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1634 }
1635 
1636 #endif
1637 
1638 /*
1639  * get_vma_policy(@task, @vma, @addr)
1640  * @task - task for fallback if vma policy == default
1641  * @vma   - virtual memory area whose policy is sought
1642  * @addr  - address in @vma for shared policy lookup
1643  *
1644  * Returns effective policy for a VMA at specified address.
1645  * Falls back to @task or system default policy, as necessary.
1646  * Current or other task's task mempolicy and non-shared vma policies must be
1647  * protected by task_lock(task) by the caller.
1648  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1649  * count--added by the get_policy() vm_op, as appropriate--to protect against
1650  * freeing by another task.  It is the caller's responsibility to free the
1651  * extra reference for shared policies.
1652  */
1653 struct mempolicy *get_vma_policy(struct task_struct *task,
1654 		struct vm_area_struct *vma, unsigned long addr)
1655 {
1656 	struct mempolicy *pol = get_task_policy(task);
1657 
1658 	if (vma) {
1659 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1660 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1661 									addr);
1662 			if (vpol)
1663 				pol = vpol;
1664 		} else if (vma->vm_policy) {
1665 			pol = vma->vm_policy;
1666 
1667 			/*
1668 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1669 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1670 			 * count on these policies which will be dropped by
1671 			 * mpol_cond_put() later
1672 			 */
1673 			if (mpol_needs_cond_ref(pol))
1674 				mpol_get(pol);
1675 		}
1676 	}
1677 	if (!pol)
1678 		pol = &default_policy;
1679 	return pol;
1680 }
1681 
1682 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683 {
1684 	enum zone_type dynamic_policy_zone = policy_zone;
1685 
1686 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1687 
1688 	/*
1689 	 * If policy->v.nodes has movable memory only,
1690 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1691 	 *
1692 	 * policy->v.nodes intersects with node_states[N_MEMORY],
1693 	 * so if the following test fails, it implies that
1694 	 * policy->v.nodes has movable memory only.
1695 	 */
1696 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1697 		dynamic_policy_zone = ZONE_MOVABLE;
1698 
1699 	return zone >= dynamic_policy_zone;
1700 }
1701 
1702 /*
1703  * Return a nodemask representing a mempolicy for filtering nodes for
1704  * page allocation
1705  */
1706 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1707 {
1708 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1709 	if (unlikely(policy->mode == MPOL_BIND) &&
1710 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1711 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1712 		return &policy->v.nodes;
1713 
1714 	return NULL;
1715 }
1716 
1717 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1718 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1719 	int nd)
1720 {
1721 	switch (policy->mode) {
1722 	case MPOL_PREFERRED:
1723 		if (!(policy->flags & MPOL_F_LOCAL))
1724 			nd = policy->v.preferred_node;
1725 		break;
1726 	case MPOL_BIND:
1727 		/*
1728 		 * Normally, MPOL_BIND allocations are node-local within the
1729 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1730 		 * current node isn't part of the mask, we use the zonelist for
1731 		 * the first node in the mask instead.
1732 		 */
1733 		if (unlikely(gfp & __GFP_THISNODE) &&
1734 				unlikely(!node_isset(nd, policy->v.nodes)))
1735 			nd = first_node(policy->v.nodes);
1736 		break;
1737 	default:
1738 		BUG();
1739 	}
1740 	return node_zonelist(nd, gfp);
1741 }
1742 
1743 /* Do dynamic interleaving for a process */
1744 static unsigned interleave_nodes(struct mempolicy *policy)
1745 {
1746 	unsigned nid, next;
1747 	struct task_struct *me = current;
1748 
1749 	nid = me->il_next;
1750 	next = next_node(nid, policy->v.nodes);
1751 	if (next >= MAX_NUMNODES)
1752 		next = first_node(policy->v.nodes);
1753 	if (next < MAX_NUMNODES)
1754 		me->il_next = next;
1755 	return nid;
1756 }
1757 
1758 /*
1759  * Depending on the memory policy provide a node from which to allocate the
1760  * next slab entry.
1761  * @policy must be protected from freeing by the caller.  If @policy is
1762  * the current task's mempolicy, this protection is implicit, as only the
1763  * task can change its policy.  The system default policy requires no
1764  * such protection.
1765  */
1766 unsigned slab_node(void)
1767 {
1768 	struct mempolicy *policy;
1769 
1770 	if (in_interrupt())
1771 		return numa_node_id();
1772 
1773 	policy = current->mempolicy;
1774 	if (!policy || policy->flags & MPOL_F_LOCAL)
1775 		return numa_node_id();
1776 
1777 	switch (policy->mode) {
1778 	case MPOL_PREFERRED:
1779 		/*
1780 		 * handled MPOL_F_LOCAL above
1781 		 */
1782 		return policy->v.preferred_node;
1783 
1784 	case MPOL_INTERLEAVE:
1785 		return interleave_nodes(policy);
1786 
1787 	case MPOL_BIND: {
1788 		/*
1789 		 * Follow bind policy behavior and start allocation at the
1790 		 * first node.
1791 		 */
1792 		struct zonelist *zonelist;
1793 		struct zone *zone;
1794 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1795 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1796 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1797 							&policy->v.nodes,
1798 							&zone);
1799 		return zone ? zone->node : numa_node_id();
1800 	}
1801 
1802 	default:
1803 		BUG();
1804 	}
1805 }
1806 
1807 /* Do static interleaving for a VMA with known offset. */
1808 static unsigned offset_il_node(struct mempolicy *pol,
1809 		struct vm_area_struct *vma, unsigned long off)
1810 {
1811 	unsigned nnodes = nodes_weight(pol->v.nodes);
1812 	unsigned target;
1813 	int c;
1814 	int nid = -1;
1815 
1816 	if (!nnodes)
1817 		return numa_node_id();
1818 	target = (unsigned int)off % nnodes;
1819 	c = 0;
1820 	do {
1821 		nid = next_node(nid, pol->v.nodes);
1822 		c++;
1823 	} while (c <= target);
1824 	return nid;
1825 }
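
/*
 * Illustrative example (made-up values): with pol->v.nodes = {1,3,5}
 * (nnodes = 3) and off = 10, target = 10 % 3 = 1 and the loop stops
 * after the second set node, so node 3 is returned.
 */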
1826 
1827 /* Determine a node number for interleave */
1828 static inline unsigned interleave_nid(struct mempolicy *pol,
1829 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1830 {
1831 	if (vma) {
1832 		unsigned long off;
1833 
1834 		/*
1835 		 * for small pages, there is no difference between
1836 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1837 		 * for huge pages, since vm_pgoff is in units of small
1838 		 * pages, we need to shift off the always 0 bits to get
1839 		 * a useful offset.
1840 		 */
1841 		BUG_ON(shift < PAGE_SHIFT);
1842 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1843 		off += (addr - vma->vm_start) >> shift;
1844 		return offset_il_node(pol, vma, off);
1845 	} else
1846 		return interleave_nodes(pol);
1847 }
1848 
1849 /*
1850  * Return the bit number of a random bit set in the nodemask.
1851  * (returns -1 if nodemask is empty)
1852  */
1853 int node_random(const nodemask_t *maskp)
1854 {
1855 	int w, bit = -1;
1856 
1857 	w = nodes_weight(*maskp);
1858 	if (w)
1859 		bit = bitmap_ord_to_pos(maskp->bits,
1860 			get_random_int() % w, MAX_NUMNODES);
1861 	return bit;
1862 }
1863 
1864 #ifdef CONFIG_HUGETLBFS
1865 /*
1866  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1867  * @vma: virtual memory area whose policy is sought
1868  * @addr: address in @vma for shared policy lookup and interleave policy
1869  * @gfp_flags: gfp flags for the requested zone
1870  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1871  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1872  *
1873  * Returns a zonelist suitable for a huge page allocation and a pointer
1874  * to the struct mempolicy for conditional unref after allocation.
1875  * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1876  * @nodemask for filtering the zonelist.
1877  *
1878  * Must be protected by get_mems_allowed()
1879  */
1880 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1881 				gfp_t gfp_flags, struct mempolicy **mpol,
1882 				nodemask_t **nodemask)
1883 {
1884 	struct zonelist *zl;
1885 
1886 	*mpol = get_vma_policy(current, vma, addr);
1887 	*nodemask = NULL;	/* assume !MPOL_BIND */
1888 
1889 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1890 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1891 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1892 	} else {
1893 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1894 		if ((*mpol)->mode == MPOL_BIND)
1895 			*nodemask = &(*mpol)->v.nodes;
1896 	}
1897 	return zl;
1898 }
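
/*
 * Sketch of a typical huge_zonelist() caller (editor's illustration,
 * loosely modelled on the hugetlb fault path; identifiers other than
 * huge_zonelist() and mpol_cond_put() are placeholders, not part of the
 * kernel build):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... walk zl, filtering with *nodemask when it is non-NULL ...
 *	mpol_cond_put(mpol);	(conditional unref after allocation)
 */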
1899 
1900 /*
1901  * init_nodemask_of_mempolicy
1902  *
1903  * If the current task's mempolicy is "default" [NULL], return 'false'
1904  * to indicate default policy.  Otherwise, extract the policy nodemask
1905  * for 'bind' or 'interleave' policy into the argument nodemask, or
1906  * initialize the argument nodemask to contain the single node for
1907  * 'preferred' or 'local' policy and return 'true' to indicate presence
1908  * of non-default mempolicy.
1909  *
1910  * We don't bother with reference counting the mempolicy [mpol_get/put]
1911  * because the current task is examining its own mempolicy and a task's
1912  * mempolicy is only ever changed by the task itself.
1913  *
1914  * N.B., it is the caller's responsibility to free a returned nodemask.
1915  */
1916 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1917 {
1918 	struct mempolicy *mempolicy;
1919 	int nid;
1920 
1921 	if (!(mask && current->mempolicy))
1922 		return false;
1923 
1924 	task_lock(current);
1925 	mempolicy = current->mempolicy;
1926 	switch (mempolicy->mode) {
1927 	case MPOL_PREFERRED:
1928 		if (mempolicy->flags & MPOL_F_LOCAL)
1929 			nid = numa_node_id();
1930 		else
1931 			nid = mempolicy->v.preferred_node;
1932 		init_nodemask_of_node(mask, nid);
1933 		break;
1934 
1935 	case MPOL_BIND:
1936 		/* Fall through */
1937 	case MPOL_INTERLEAVE:
1938 		*mask =  mempolicy->v.nodes;
1939 		break;
1940 
1941 	default:
1942 		BUG();
1943 	}
1944 	task_unlock(current);
1945 
1946 	return true;
1947 }
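
/*
 * Illustrative results for init_nodemask_of_mempolicy() above (editor's
 * sketch, not part of the kernel build): for an MPOL_INTERLEAVE policy
 * over nodes 0-3, *mask becomes {0,1,2,3}; for MPOL_PREFERRED with
 * MPOL_F_LOCAL, *mask holds only the node the task is currently
 * running on.
 */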
1948 #endif
1949 
1950 /*
1951  * mempolicy_nodemask_intersects
1952  *
1953  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1954  * policy.  Otherwise, check for intersection between mask and the policy
1955  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1956  * policy, always return true since it may allocate elsewhere on fallback.
1957  *
1958  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1959  */
1960 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1961 					const nodemask_t *mask)
1962 {
1963 	struct mempolicy *mempolicy;
1964 	bool ret = true;
1965 
1966 	if (!mask)
1967 		return ret;
1968 	task_lock(tsk);
1969 	mempolicy = tsk->mempolicy;
1970 	if (!mempolicy)
1971 		goto out;
1972 
1973 	switch (mempolicy->mode) {
1974 	case MPOL_PREFERRED:
1975 		/*
1976 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1977 		 * to allocate from; the task may fall back to other nodes on OOM.
1978 		 * Thus, it's possible for tsk to have allocated memory from
1979 		 * nodes in mask.
1980 		 */
1981 		break;
1982 	case MPOL_BIND:
1983 	case MPOL_INTERLEAVE:
1984 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1985 		break;
1986 	default:
1987 		BUG();
1988 	}
1989 out:
1990 	task_unlock(tsk);
1991 	return ret;
1992 }
1993 
1994 /* Allocate a page in interleaved policy.
1995    Own path because it needs to do special accounting. */
1996 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1997 					unsigned nid)
1998 {
1999 	struct zonelist *zl;
2000 	struct page *page;
2001 
2002 	zl = node_zonelist(nid, gfp);
2003 	page = __alloc_pages(gfp, order, zl);
2004 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
2005 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
2006 	return page;
2007 }
2008 
2009 /**
2010  * 	alloc_pages_vma	- Allocate a page for a VMA.
2011  *
2012  * 	@gfp:
2013  *      %GFP_USER    user allocation.
2014  *      %GFP_KERNEL  kernel allocations,
2015  *      %GFP_HIGHMEM highmem/user allocations,
2016  *      %GFP_FS      allocation should not call back into a file system.
2017  *      %GFP_ATOMIC  don't sleep.
2018  *
2019  *	@order: Order of the GFP allocation.
2020  * 	@vma:  Pointer to VMA or NULL if not available.
2021  *	@addr: Virtual address of the allocation. Must be inside the VMA.
2022  *
2023  * 	This function allocates a page from the kernel page pool and applies
2024  *	a NUMA policy associated with the VMA or the current process.
2025  *	When @vma is not NULL the caller must hold down_read() on the mmap_sem
2026  *	of the VMA's mm_struct to prevent it from going away. Should be used
2027  *	for all allocations of pages that will be mapped into
2028  * 	user space. Returns NULL when no page can be allocated.
2029  *
2030  *	Should be called with the mmap_sem of the vma held.
2031  */
2032 struct page *
2033 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2034 		unsigned long addr, int node)
2035 {
2036 	struct mempolicy *pol;
2037 	struct page *page;
2038 	unsigned int cpuset_mems_cookie;
2039 
2040 retry_cpuset:
2041 	pol = get_vma_policy(current, vma, addr);
2042 	cpuset_mems_cookie = get_mems_allowed();
2043 
2044 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2045 		unsigned nid;
2046 
2047 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2048 		mpol_cond_put(pol);
2049 		page = alloc_page_interleave(gfp, order, nid);
2050 		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2051 			goto retry_cpuset;
2052 
2053 		return page;
2054 	}
2055 	page = __alloc_pages_nodemask(gfp, order,
2056 				      policy_zonelist(gfp, pol, node),
2057 				      policy_nodemask(gfp, pol));
2058 	if (unlikely(mpol_needs_cond_ref(pol)))
2059 		__mpol_put(pol);
2060 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2061 		goto retry_cpuset;
2062 	return page;
2063 }
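
/*
 * Illustrative usage of alloc_pages_vma() (editor's sketch, not part of
 * the kernel build): with CONFIG_NUMA, fault handlers normally go through
 * the alloc_page_vma() wrapper in <linux/gfp.h>, roughly
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * which ends up here with order 0 and the local node id as @node.
 */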
2064 
2065 /**
2066  * 	alloc_pages_current - Allocate pages.
2067  *
2068  *	@gfp:
2069  *		%GFP_USER   user allocation,
2070  *      	%GFP_KERNEL kernel allocation,
2071  *      	%GFP_HIGHMEM highmem allocation,
2072  *      	%GFP_FS     don't call back into a file system.
2073  *      	%GFP_ATOMIC don't sleep.
2074  *	@order: Power of two of allocation size in pages. 0 is a single page.
2075  *
2076  *	Allocate a page from the kernel page pool.  When not in
2077  *	interrupt context, apply the current process' NUMA policy.
2078  *	Returns NULL when no page can be allocated.
2079  *
2080  *	Don't call cpuset_update_task_memory_state() unless
2081  *	1) it's ok to take cpuset_sem (can WAIT), and
2082  *	2) allocating for current task (not interrupt).
2083  */
2084 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2085 {
2086 	struct mempolicy *pol = get_task_policy(current);
2087 	struct page *page;
2088 	unsigned int cpuset_mems_cookie;
2089 
2090 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2091 		pol = &default_policy;
2092 
2093 retry_cpuset:
2094 	cpuset_mems_cookie = get_mems_allowed();
2095 
2096 	/*
2097 	 * No reference counting needed for current->mempolicy
2098 	 * nor system default_policy
2099 	 */
2100 	if (pol->mode == MPOL_INTERLEAVE)
2101 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2102 	else
2103 		page = __alloc_pages_nodemask(gfp, order,
2104 				policy_zonelist(gfp, pol, numa_node_id()),
2105 				policy_nodemask(gfp, pol));
2106 
2107 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2108 		goto retry_cpuset;
2109 
2110 	return page;
2111 }
2112 EXPORT_SYMBOL(alloc_pages_current);
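
/*
 * Illustrative usage of alloc_pages_current() (editor's sketch, not part
 * of the kernel build): with CONFIG_NUMA the alloc_pages(gfp, order)
 * wrapper in <linux/gfp.h> resolves to this function, so e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * allocates one page according to the calling task's mempolicy (or
 * default_policy when called from interrupt context).
 */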
2113 
2114 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115 {
2116 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2117 
2118 	if (IS_ERR(pol))
2119 		return PTR_ERR(pol);
2120 	dst->vm_policy = pol;
2121 	return 0;
2122 }
2123 
2124 /*
2125  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2126  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2127  * with the mems_allowed returned by cpuset_mems_allowed().  This
2128  * keeps mempolicies cpuset-relative after their cpuset moves.  See
2129  * further kernel/cpuset.c update_nodemask().
2130  *
2131  * current's mempolicy may be rebound by another task (the task that changes
2132  * the cpuset's mems), so we need not do the rebind work for the current task.
2133  */
2134 
2135 /* Slow path of a mempolicy duplicate */
2136 struct mempolicy *__mpol_dup(struct mempolicy *old)
2137 {
2138 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2139 
2140 	if (!new)
2141 		return ERR_PTR(-ENOMEM);
2142 
2143 	/* task's mempolicy is protected by alloc_lock */
2144 	if (old == current->mempolicy) {
2145 		task_lock(current);
2146 		*new = *old;
2147 		task_unlock(current);
2148 	} else
2149 		*new = *old;
2150 
2151 	rcu_read_lock();
2152 	if (current_cpuset_is_being_rebound()) {
2153 		nodemask_t mems = cpuset_mems_allowed(current);
2154 		if (new->flags & MPOL_F_REBINDING)
2155 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2156 		else
2157 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2158 	}
2159 	rcu_read_unlock();
2160 	atomic_set(&new->refcnt, 1);
2161 	return new;
2162 }
2163 
2164 /* Slow path of a mempolicy comparison */
2165 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2166 {
2167 	if (!a || !b)
2168 		return false;
2169 	if (a->mode != b->mode)
2170 		return false;
2171 	if (a->flags != b->flags)
2172 		return false;
2173 	if (mpol_store_user_nodemask(a))
2174 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2175 			return false;
2176 
2177 	switch (a->mode) {
2178 	case MPOL_BIND:
2179 		/* Fall through */
2180 	case MPOL_INTERLEAVE:
2181 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2182 	case MPOL_PREFERRED:
2183 		return a->v.preferred_node == b->v.preferred_node;
2184 	default:
2185 		BUG();
2186 		return false;
2187 	}
2188 }
2189 
2190 /*
2191  * Shared memory backing store policy support.
2192  *
2193  * Remember policies even when nobody has shared memory mapped.
2194  * The policies are kept in Red-Black tree linked from the inode.
2195  * They are protected by the sp->lock spinlock, which should be held
2196  * for any accesses to the tree.
2197  */
2198 
2199 /* lookup first element intersecting start-end */
2200 /* Caller holds sp->lock */
2201 static struct sp_node *
2202 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2203 {
2204 	struct rb_node *n = sp->root.rb_node;
2205 
2206 	while (n) {
2207 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2208 
2209 		if (start >= p->end)
2210 			n = n->rb_right;
2211 		else if (end <= p->start)
2212 			n = n->rb_left;
2213 		else
2214 			break;
2215 	}
2216 	if (!n)
2217 		return NULL;
2218 	for (;;) {
2219 		struct sp_node *w = NULL;
2220 		struct rb_node *prev = rb_prev(n);
2221 		if (!prev)
2222 			break;
2223 		w = rb_entry(prev, struct sp_node, nd);
2224 		if (w->end <= start)
2225 			break;
2226 		n = prev;
2227 	}
2228 	return rb_entry(n, struct sp_node, nd);
2229 }
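
/*
 * Illustrative example for sp_lookup() above (editor's sketch, not part
 * of the kernel build): with ranges [2,4) and [6,9) in the tree, a lookup
 * for [3,8) may first hit the node for [6,9); the rb_prev() walk then
 * backs up to return the lowest intersecting range, [2,4).
 */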
2230 
2231 /* Insert a new shared policy into the list. */
2232 /* Caller holds sp->lock */
2233 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2234 {
2235 	struct rb_node **p = &sp->root.rb_node;
2236 	struct rb_node *parent = NULL;
2237 	struct sp_node *nd;
2238 
2239 	while (*p) {
2240 		parent = *p;
2241 		nd = rb_entry(parent, struct sp_node, nd);
2242 		if (new->start < nd->start)
2243 			p = &(*p)->rb_left;
2244 		else if (new->end > nd->end)
2245 			p = &(*p)->rb_right;
2246 		else
2247 			BUG();
2248 	}
2249 	rb_link_node(&new->nd, parent, p);
2250 	rb_insert_color(&new->nd, &sp->root);
2251 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2252 		 new->policy ? new->policy->mode : 0);
2253 }
2254 
2255 /* Find shared policy intersecting idx */
2256 struct mempolicy *
2257 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2258 {
2259 	struct mempolicy *pol = NULL;
2260 	struct sp_node *sn;
2261 
2262 	if (!sp->root.rb_node)
2263 		return NULL;
2264 	spin_lock(&sp->lock);
2265 	sn = sp_lookup(sp, idx, idx+1);
2266 	if (sn) {
2267 		mpol_get(sn->policy);
2268 		pol = sn->policy;
2269 	}
2270 	spin_unlock(&sp->lock);
2271 	return pol;
2272 }
2273 
2274 static void sp_free(struct sp_node *n)
2275 {
2276 	mpol_put(n->policy);
2277 	kmem_cache_free(sn_cache, n);
2278 }
2279 
2280 /**
2281  * mpol_misplaced - check whether current page node is valid in policy
2282  *
2283  * @page: page to be checked
2284  * @vma: vm area where the page is mapped
2285  * @addr: virtual address where the page is mapped
2286  *
2287  * Look up the current policy node id for vma,addr and compare it to the
2288  * page's node id.
2289  *
2290  * Returns:
2291  *	-1	- not misplaced, page is in the right node
2292  *	node	- node id where the page should be
2293  *
2294  * Policy determination "mimics" alloc_page_vma().
2295  * Called from fault path where we know the vma and faulting address.
2296  */
2297 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2298 {
2299 	struct mempolicy *pol;
2300 	struct zone *zone;
2301 	int curnid = page_to_nid(page);
2302 	unsigned long pgoff;
2303 	int polnid = -1;
2304 	int ret = -1;
2305 
2306 	BUG_ON(!vma);
2307 
2308 	pol = get_vma_policy(current, vma, addr);
2309 	if (!(pol->flags & MPOL_F_MOF))
2310 		goto out;
2311 
2312 	switch (pol->mode) {
2313 	case MPOL_INTERLEAVE:
2314 		BUG_ON(addr >= vma->vm_end);
2315 		BUG_ON(addr < vma->vm_start);
2316 
2317 		pgoff = vma->vm_pgoff;
2318 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2319 		polnid = offset_il_node(pol, vma, pgoff);
2320 		break;
2321 
2322 	case MPOL_PREFERRED:
2323 		if (pol->flags & MPOL_F_LOCAL)
2324 			polnid = numa_node_id();
2325 		else
2326 			polnid = pol->v.preferred_node;
2327 		break;
2328 
2329 	case MPOL_BIND:
2330 		/*
2331 		 * MPOL_BIND allows binding to multiple nodes.
2332 		 * Use the current node if it is in the policy nodemask,
2333 		 * else select the nearest allowed node, if any.
2334 		 * If there are no allowed nodes, use the current node [!misplaced].
2335 		 */
2336 		if (node_isset(curnid, pol->v.nodes))
2337 			goto out;
2338 		(void)first_zones_zonelist(
2339 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2340 				gfp_zone(GFP_HIGHUSER),
2341 				&pol->v.nodes, &zone);
2342 		polnid = zone->node;
2343 		break;
2344 
2345 	default:
2346 		BUG();
2347 	}
2348 
2349 	/* Migrate the page towards the node whose CPU is referencing it */
2350 	if (pol->flags & MPOL_F_MORON) {
2351 		int last_nid;
2352 
2353 		polnid = numa_node_id();
2354 
2355 		/*
2356 		 * Multi-stage node selection is used in conjunction
2357 		 * with a periodic migration fault to build a temporal
2358 		 * task<->page relation. By using a two-stage filter we
2359 		 * remove short/unlikely relations.
2360 		 *
2361 		 * Using P(p) ~ n_p / n_t as per frequentist
2362 		 * probability, we can equate a task's usage of a
2363 		 * particular page (n_p) per total usage of this
2364 		 * page (n_t) (in a given time-span) to a probability.
2365 		 *
2366 		 * Our periodic faults will sample this probability and
2367 		 * getting the same result twice in a row, given these
2368 		 * samples are fully independent, is then given by
2369 		 * P(p)^2, provided our sample period is sufficiently
2370 		 * short compared to the usage pattern.
2371 		 *
2372 		 * This quadratic squishes small probabilities, making
2373 		 * it less likely that we act on an unlikely task<->page
2374 		 * relation (see the worked numbers after this function).
2375 		 */
2376 		last_nid = page_nid_xchg_last(page, polnid);
2377 		if (last_nid != polnid)
2378 			goto out;
2379 	}
2380 
2381 	if (curnid != polnid)
2382 		ret = polnid;
2383 out:
2384 	mpol_cond_put(pol);
2385 
2386 	return ret;
2387 }
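
/*
 * Worked numbers for the two-stage filter in mpol_misplaced() above
 * (editor's sketch, not part of the kernel build): if a task accounts for
 * only 10% of a page's accesses, P(p) = 0.1, so the chance of that task
 * being sampled as the page's user on two consecutive NUMA hinting faults
 * is P(p)^2 = 0.01; short-lived task<->page relations therefore rarely
 * trigger a migration.
 */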
2388 
2389 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2390 {
2391 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2392 	rb_erase(&n->nd, &sp->root);
2393 	sp_free(n);
2394 }
2395 
2396 static void sp_node_init(struct sp_node *node, unsigned long start,
2397 			unsigned long end, struct mempolicy *pol)
2398 {
2399 	node->start = start;
2400 	node->end = end;
2401 	node->policy = pol;
2402 }
2403 
2404 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2405 				struct mempolicy *pol)
2406 {
2407 	struct sp_node *n;
2408 	struct mempolicy *newpol;
2409 
2410 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2411 	if (!n)
2412 		return NULL;
2413 
2414 	newpol = mpol_dup(pol);
2415 	if (IS_ERR(newpol)) {
2416 		kmem_cache_free(sn_cache, n);
2417 		return NULL;
2418 	}
2419 	newpol->flags |= MPOL_F_SHARED;
2420 	sp_node_init(n, start, end, newpol);
2421 
2422 	return n;
2423 }
2424 
2425 /* Replace a policy range. */
2426 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2427 				 unsigned long end, struct sp_node *new)
2428 {
2429 	struct sp_node *n;
2430 	struct sp_node *n_new = NULL;
2431 	struct mempolicy *mpol_new = NULL;
2432 	int ret = 0;
2433 
2434 restart:
2435 	spin_lock(&sp->lock);
2436 	n = sp_lookup(sp, start, end);
2437 	/* Take care of old policies in the same range. */
2438 	while (n && n->start < end) {
2439 		struct rb_node *next = rb_next(&n->nd);
2440 		if (n->start >= start) {
2441 			if (n->end <= end)
2442 				sp_delete(sp, n);
2443 			else
2444 				n->start = end;
2445 		} else {
2446 			/* Old policy spanning whole new range. */
2447 			if (n->end > end) {
2448 				if (!n_new)
2449 					goto alloc_new;
2450 
2451 				*mpol_new = *n->policy;
2452 				atomic_set(&mpol_new->refcnt, 1);
2453 				sp_node_init(n_new, end, n->end, mpol_new);
2454 				n->end = start;
2455 				sp_insert(sp, n_new);
2456 				n_new = NULL;
2457 				mpol_new = NULL;
2458 				break;
2459 			} else
2460 				n->end = start;
2461 		}
2462 		if (!next)
2463 			break;
2464 		n = rb_entry(next, struct sp_node, nd);
2465 	}
2466 	if (new)
2467 		sp_insert(sp, new);
2468 	spin_unlock(&sp->lock);
2469 	ret = 0;
2470 
2471 err_out:
2472 	if (mpol_new)
2473 		mpol_put(mpol_new);
2474 	if (n_new)
2475 		kmem_cache_free(sn_cache, n_new);
2476 
2477 	return ret;
2478 
2479 alloc_new:
2480 	spin_unlock(&sp->lock);
2481 	ret = -ENOMEM;
2482 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2483 	if (!n_new)
2484 		goto err_out;
2485 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2486 	if (!mpol_new)
2487 		goto err_out;
2488 	goto restart;
2489 }
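
/*
 * Illustrative example for shared_policy_replace() above (editor's sketch,
 * not part of the kernel build): if an existing node covers [0,10) and the
 * replacement range is [3,6), the old node is trimmed to [0,3), a copy of
 * its policy is inserted for [6,10) using the preallocated n_new/mpol_new
 * pair, and the new node for [3,6) is inserted in between.
 */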
2490 
2491 /**
2492  * mpol_shared_policy_init - initialize shared policy for inode
2493  * @sp: pointer to inode shared policy
2494  * @mpol:  struct mempolicy to install
2495  *
2496  * Install non-NULL @mpol in inode's shared policy rb-tree.
2497  * On entry, the current task has a reference on a non-NULL @mpol.
2498  * This must be released on exit.
2499  * This is called at get_inode() time, so GFP_KERNEL allocations may be used.
2500  */
2501 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2502 {
2503 	int ret;
2504 
2505 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2506 	spin_lock_init(&sp->lock);
2507 
2508 	if (mpol) {
2509 		struct vm_area_struct pvma;
2510 		struct mempolicy *new;
2511 		NODEMASK_SCRATCH(scratch);
2512 
2513 		if (!scratch)
2514 			goto put_mpol;
2515 		/* contextualize the tmpfs mount point mempolicy */
2516 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2517 		if (IS_ERR(new))
2518 			goto free_scratch; /* no valid nodemask intersection */
2519 
2520 		task_lock(current);
2521 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2522 		task_unlock(current);
2523 		if (ret)
2524 			goto put_new;
2525 
2526 		/* Create pseudo-vma that contains just the policy */
2527 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2528 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2529 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2530 
2531 put_new:
2532 		mpol_put(new);			/* drop initial ref */
2533 free_scratch:
2534 		NODEMASK_SCRATCH_FREE(scratch);
2535 put_mpol:
2536 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2537 	}
2538 }
2539 
2540 int mpol_set_shared_policy(struct shared_policy *info,
2541 			struct vm_area_struct *vma, struct mempolicy *npol)
2542 {
2543 	int err;
2544 	struct sp_node *new = NULL;
2545 	unsigned long sz = vma_pages(vma);
2546 
2547 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2548 		 vma->vm_pgoff,
2549 		 sz, npol ? npol->mode : -1,
2550 		 npol ? npol->flags : -1,
2551 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2552 
2553 	if (npol) {
2554 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2555 		if (!new)
2556 			return -ENOMEM;
2557 	}
2558 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2559 	if (err && new)
2560 		sp_free(new);
2561 	return err;
2562 }
2563 
2564 /* Free a backing policy store on inode delete. */
2565 void mpol_free_shared_policy(struct shared_policy *p)
2566 {
2567 	struct sp_node *n;
2568 	struct rb_node *next;
2569 
2570 	if (!p->root.rb_node)
2571 		return;
2572 	spin_lock(&p->lock);
2573 	next = rb_first(&p->root);
2574 	while (next) {
2575 		n = rb_entry(next, struct sp_node, nd);
2576 		next = rb_next(&n->nd);
2577 		sp_delete(p, n);
2578 	}
2579 	spin_unlock(&p->lock);
2580 }
2581 
2582 #ifdef CONFIG_NUMA_BALANCING
2583 static bool __initdata numabalancing_override;
2584 
2585 static void __init check_numabalancing_enable(void)
2586 {
2587 	bool numabalancing_default = false;
2588 
2589 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2590 		numabalancing_default = true;
2591 
2592 	if (nr_node_ids > 1 && !numabalancing_override) {
2593 		printk(KERN_INFO "Enabling automatic NUMA balancing. "
2594 			"Configure with numa_balancing= or sysctl\n");
2595 		set_numabalancing_state(numabalancing_default);
2596 	}
2597 }
2598 
2599 static int __init setup_numabalancing(char *str)
2600 {
2601 	int ret = 0;
2602 	if (!str)
2603 		goto out;
2604 	numabalancing_override = true;
2605 
2606 	if (!strcmp(str, "enable")) {
2607 		set_numabalancing_state(true);
2608 		ret = 1;
2609 	} else if (!strcmp(str, "disable")) {
2610 		set_numabalancing_state(false);
2611 		ret = 1;
2612 	}
2613 out:
2614 	if (!ret)
2615 		printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2616 
2617 	return ret;
2618 }
2619 __setup("numa_balancing=", setup_numabalancing);
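
/*
 * Illustrative usage of the numa_balancing= parameter (editor's sketch,
 * not part of the kernel build): booting with "numa_balancing=disable"
 * forces balancing off and suppresses the automatic enabling in
 * check_numabalancing_enable(); any other value logs the parse warning
 * above.
 */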
2620 #else
2621 static inline void __init check_numabalancing_enable(void)
2622 {
2623 }
2624 #endif /* CONFIG_NUMA_BALANCING */
2625 
2626 /* assumes fs == KERNEL_DS */
2627 void __init numa_policy_init(void)
2628 {
2629 	nodemask_t interleave_nodes;
2630 	unsigned long largest = 0;
2631 	int nid, prefer = 0;
2632 
2633 	policy_cache = kmem_cache_create("numa_policy",
2634 					 sizeof(struct mempolicy),
2635 					 0, SLAB_PANIC, NULL);
2636 
2637 	sn_cache = kmem_cache_create("shared_policy_node",
2638 				     sizeof(struct sp_node),
2639 				     0, SLAB_PANIC, NULL);
2640 
2641 	for_each_node(nid) {
2642 		preferred_node_policy[nid] = (struct mempolicy) {
2643 			.refcnt = ATOMIC_INIT(1),
2644 			.mode = MPOL_PREFERRED,
2645 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2646 			.v = { .preferred_node = nid, },
2647 		};
2648 	}
2649 
2650 	/*
2651 	 * Set interleaving policy for system init. Interleaving is only
2652 	 * enabled across suitably sized nodes (default is >= 16MB), or
2653 	 * fall back to the largest node if they're all smaller.
2654 	 */
2655 	nodes_clear(interleave_nodes);
2656 	for_each_node_state(nid, N_MEMORY) {
2657 		unsigned long total_pages = node_present_pages(nid);
2658 
2659 		/* Preserve the largest node */
2660 		if (largest < total_pages) {
2661 			largest = total_pages;
2662 			prefer = nid;
2663 		}
2664 
2665 		/* Interleave this node? */
2666 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2667 			node_set(nid, interleave_nodes);
2668 	}
2669 
2670 	/* All too small, use the largest */
2671 	if (unlikely(nodes_empty(interleave_nodes)))
2672 		node_set(prefer, interleave_nodes);
2673 
2674 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2675 		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2676 
2677 	check_numabalancing_enable();
2678 }
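
/*
 * Illustrative example for numa_policy_init() above (editor's sketch, not
 * part of the kernel build): on a two-node box with 4GB on node 0 and 8MB
 * on node 1, only node 0 clears the 16MB threshold, so init's interleave
 * set is {0}; if every node were below 16MB, the largest one would be
 * used instead.
 */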
2679 
2680 /* Reset policy of current process to default */
2681 void numa_default_policy(void)
2682 {
2683 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2684 }
2685 
2686 /*
2687  * Parse and format mempolicy from/to strings
2688  */
2689 
2690 /*
2691  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2692  */
2693 static const char * const policy_modes[] =
2694 {
2695 	[MPOL_DEFAULT]    = "default",
2696 	[MPOL_PREFERRED]  = "prefer",
2697 	[MPOL_BIND]       = "bind",
2698 	[MPOL_INTERLEAVE] = "interleave",
2699 	[MPOL_LOCAL]      = "local",
2700 };
2701 
2702 
2703 #ifdef CONFIG_TMPFS
2704 /**
2705  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2706  * @str:  string containing mempolicy to parse
2707  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2708  *
2709  * Format of input:
2710  *	<mode>[=<flags>][:<nodelist>]
2711  *
2712  * On success, returns 0, else 1
2713  */
2714 int mpol_parse_str(char *str, struct mempolicy **mpol)
2715 {
2716 	struct mempolicy *new = NULL;
2717 	unsigned short mode;
2718 	unsigned short mode_flags;
2719 	nodemask_t nodes;
2720 	char *nodelist = strchr(str, ':');
2721 	char *flags = strchr(str, '=');
2722 	int err = 1;
2723 
2724 	if (nodelist) {
2725 		/* NUL-terminate mode or flags string */
2726 		*nodelist++ = '\0';
2727 		if (nodelist_parse(nodelist, nodes))
2728 			goto out;
2729 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2730 			goto out;
2731 	} else
2732 		nodes_clear(nodes);
2733 
2734 	if (flags)
2735 		*flags++ = '\0';	/* terminate mode string */
2736 
2737 	for (mode = 0; mode < MPOL_MAX; mode++) {
2738 		if (!strcmp(str, policy_modes[mode])) {
2739 			break;
2740 		}
2741 	}
2742 	if (mode >= MPOL_MAX)
2743 		goto out;
2744 
2745 	switch (mode) {
2746 	case MPOL_PREFERRED:
2747 		/*
2748 		 * Insist on a nodelist of one node only
2749 		 */
2750 		if (nodelist) {
2751 			char *rest = nodelist;
2752 			while (isdigit(*rest))
2753 				rest++;
2754 			if (*rest)
2755 				goto out;
2756 		}
2757 		break;
2758 	case MPOL_INTERLEAVE:
2759 		/*
2760 		 * Default to online nodes with memory if no nodelist
2761 		 */
2762 		if (!nodelist)
2763 			nodes = node_states[N_MEMORY];
2764 		break;
2765 	case MPOL_LOCAL:
2766 		/*
2767 		 * Don't allow a nodelist;  mpol_new() checks flags
2768 		 */
2769 		if (nodelist)
2770 			goto out;
2771 		mode = MPOL_PREFERRED;
2772 		break;
2773 	case MPOL_DEFAULT:
2774 		/*
2775 		 * Insist on an empty nodelist
2776 		 */
2777 		if (!nodelist)
2778 			err = 0;
2779 		goto out;
2780 	case MPOL_BIND:
2781 		/*
2782 		 * Insist on a nodelist
2783 		 */
2784 		if (!nodelist)
2785 			goto out;
2786 	}
2787 
2788 	mode_flags = 0;
2789 	if (flags) {
2790 		/*
2791 		 * Currently, we only support two mutually exclusive
2792 		 * mode flags.
2793 		 */
2794 		if (!strcmp(flags, "static"))
2795 			mode_flags |= MPOL_F_STATIC_NODES;
2796 		else if (!strcmp(flags, "relative"))
2797 			mode_flags |= MPOL_F_RELATIVE_NODES;
2798 		else
2799 			goto out;
2800 	}
2801 
2802 	new = mpol_new(mode, mode_flags, &nodes);
2803 	if (IS_ERR(new))
2804 		goto out;
2805 
2806 	/*
2807 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2808 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2809 	 */
2810 	if (mode != MPOL_PREFERRED)
2811 		new->v.nodes = nodes;
2812 	else if (nodelist)
2813 		new->v.preferred_node = first_node(nodes);
2814 	else
2815 		new->flags |= MPOL_F_LOCAL;
2816 
2817 	/*
2818 	 * Save nodes for contextualization: this will be used to "clone"
2819 	 * the mempolicy in a specific context [cpuset] at a later time.
2820 	 */
2821 	new->w.user_nodemask = nodes;
2822 
2823 	err = 0;
2824 
2825 out:
2826 	/* Restore string for error message */
2827 	if (nodelist)
2828 		*--nodelist = ':';
2829 	if (flags)
2830 		*--flags = '=';
2831 	if (!err)
2832 		*mpol = new;
2833 	return err;
2834 }
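
/*
 * Illustrative mount-option strings for mpol_parse_str() above (editor's
 * sketch, not part of the kernel build; assumes the listed nodes are
 * online with memory):
 *
 *	"interleave:0-3"   -> MPOL_INTERLEAVE over nodes 0-3
 *	"prefer=static:2"  -> MPOL_PREFERRED with MPOL_F_STATIC_NODES, node 2
 *	"local"            -> MPOL_PREFERRED with MPOL_F_LOCAL
 *	"bind"             -> rejected: MPOL_BIND insists on a nodelist
 */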
2835 #endif /* CONFIG_TMPFS */
2836 
2837 /**
2838  * mpol_to_str - format a mempolicy structure for printing
2839  * @buffer:  to contain formatted mempolicy string
2840  * @maxlen:  length of @buffer
2841  * @pol:  pointer to mempolicy to be formatted
2842  *
2843  * Convert a mempolicy into a string.
2844  * Returns the number of characters in buffer (if positive)
2845  * or an error (negative)
2846  */
2847 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2848 {
2849 	char *p = buffer;
2850 	int l;
2851 	nodemask_t nodes;
2852 	unsigned short mode;
2853 	unsigned short flags = pol ? pol->flags : 0;
2854 
2855 	/*
2856 	 * Sanity check:  room for longest mode, flag and some nodes
2857 	 */
2858 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2859 
2860 	if (!pol || pol == &default_policy)
2861 		mode = MPOL_DEFAULT;
2862 	else
2863 		mode = pol->mode;
2864 
2865 	switch (mode) {
2866 	case MPOL_DEFAULT:
2867 		nodes_clear(nodes);
2868 		break;
2869 
2870 	case MPOL_PREFERRED:
2871 		nodes_clear(nodes);
2872 		if (flags & MPOL_F_LOCAL)
2873 			mode = MPOL_LOCAL;
2874 		else
2875 			node_set(pol->v.preferred_node, nodes);
2876 		break;
2877 
2878 	case MPOL_BIND:
2879 		/* Fall through */
2880 	case MPOL_INTERLEAVE:
2881 		nodes = pol->v.nodes;
2882 		break;
2883 
2884 	default:
2885 		return -EINVAL;
2886 	}
2887 
2888 	l = strlen(policy_modes[mode]);
2889 	if (buffer + maxlen < p + l + 1)
2890 		return -ENOSPC;
2891 
2892 	strcpy(p, policy_modes[mode]);
2893 	p += l;
2894 
2895 	if (flags & MPOL_MODE_FLAGS) {
2896 		if (buffer + maxlen < p + 2)
2897 			return -ENOSPC;
2898 		*p++ = '=';
2899 
2900 		/*
2901 		 * Currently, the only defined flags are mutually exclusive
2902 		 */
2903 		if (flags & MPOL_F_STATIC_NODES)
2904 			p += snprintf(p, buffer + maxlen - p, "static");
2905 		else if (flags & MPOL_F_RELATIVE_NODES)
2906 			p += snprintf(p, buffer + maxlen - p, "relative");
2907 	}
2908 
2909 	if (!nodes_empty(nodes)) {
2910 		if (buffer + maxlen < p + 2)
2911 			return -ENOSPC;
2912 		*p++ = ':';
2913 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2914 	}
2915 	return p - buffer;
2916 }
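
/*
 * Illustrative outputs of mpol_to_str() above (editor's sketch, not part
 * of the kernel build): an interleave policy over nodes 0-3 formats as
 * "interleave:0-3", a preferred policy with MPOL_F_LOCAL formats as
 * "local", and the system default policy formats as "default".
 */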
2917