xref: /openbmc/linux/mm/mempolicy.c (revision 483eb062)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind truly
26  *                restricted the allocation to the specified memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *                in a NUMA-aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process's context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied to
41  * memory allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
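
/*
 * Example: a minimal user-space sketch of the policies described above, using
 * the set_mempolicy(2)/mbind(2) wrappers from <numaif.h> (libnuma).  Node
 * numbers, sizes and the maxnode value are illustrative only.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	size_t len = 1 << 20;
 *
 *	// interleave all future anonymous allocations of this task
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// bind one mapping to the same nodes; MPOL_MF_STRICT makes mbind()
 *	// fail if existing pages in the range already sit elsewhere
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(buf, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), MPOL_MF_STRICT);
 *
 *	// back to the default local-node policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */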
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 #include <linux/random.h>
98 
99 #include "internal.h"
100 
101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 	struct mempolicy *pol = p->mempolicy;
126 
127 	if (!pol) {
128 		int node = numa_node_id();
129 
130 		if (node != NUMA_NO_NODE) {
131 			pol = &preferred_node_policy[node];
132 			/*
133 			 * preferred_node_policy is not initialised early in
134 			 * boot
135 			 */
136 			if (!pol->mode)
137 				pol = NULL;
138 		}
139 	}
140 
141 	return pol;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If the read-side task has no lock protecting task->mempolicy, the
148 	 * write-side task rebinds task->mempolicy in two steps. The first step
149 	 * sets all the newly allowed nodes, and the second step clears all the
150 	 * disallowed nodes. This way we avoid a window in which no node is
151 	 * available to allocate a page from.
152 	 * If the read side holds a lock protecting task->mempolicy, we rebind
153 	 * directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
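
/*
 * Sketch of how a writer is expected to drive the two-step rebind described
 * above (modelled loosely on the cpuset nodemask update path; locking and
 * bookkeeping details omitted):
 *
 *	// no read-side lock: publish the union of old and new nodes first so
 *	// readers always see at least one allowed node, then trim
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
 *	... publish the new tsk->mems_allowed ...
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);
 *
 *	// read side is protected (e.g. by task_lock()): one shot is enough
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_ONCE);
 */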
163 
164 /* Check that the nodemask contains at least one node with memory */
165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 {
167 	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 }
169 
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 {
172 	return pol->flags & MPOL_MODE_FLAGS;
173 }
174 
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176 				   const nodemask_t *rel)
177 {
178 	nodemask_t tmp;
179 	nodes_fold(tmp, *orig, nodes_weight(*rel));
180 	nodes_onto(*ret, tmp, *rel);
181 }
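
/*
 * Worked example for the helper above (hypothetical node numbers): with
 * *orig = {0,2,5} and *rel = {4,5,6} (weight 3), nodes_fold() maps each bit
 * modulo 3, giving tmp = {0,2}; nodes_onto() then maps bit n of tmp to the
 * n-th set bit of *rel, so *ret = {4,6}.
 */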
182 
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 {
185 	if (nodes_empty(*nodes))
186 		return -EINVAL;
187 	pol->v.nodes = *nodes;
188 	return 0;
189 }
190 
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 {
193 	if (!nodes)
194 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
195 	else if (nodes_empty(*nodes))
196 		return -EINVAL;			/*  no allowed nodes */
197 	else
198 		pol->v.preferred_node = first_node(*nodes);
199 	return 0;
200 }
201 
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 {
204 	if (!is_valid_nodemask(nodes))
205 		return -EINVAL;
206 	pol->v.nodes = *nodes;
207 	return 0;
208 }
209 
210 /*
211  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212  * any, for the new policy.  mpol_new() has already validated the nodes
213  * parameter with respect to the policy mode and flags.  But, we need to
214  * handle an empty nodemask with MPOL_PREFERRED here.
215  *
216  * Must be called holding task's alloc_lock to protect task's mems_allowed
217  * and mempolicy.  May also be called holding mmap_sem for write.
218  */
219 static int mpol_set_nodemask(struct mempolicy *pol,
220 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 {
222 	int ret;
223 
224 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225 	if (pol == NULL)
226 		return 0;
227 	/* Check N_MEMORY */
228 	nodes_and(nsc->mask1,
229 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 
231 	VM_BUG_ON(!nodes);
232 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233 		nodes = NULL;	/* explicit local allocation */
234 	else {
235 		if (pol->flags & MPOL_F_RELATIVE_NODES)
236 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237 		else
238 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 
240 		if (mpol_store_user_nodemask(pol))
241 			pol->w.user_nodemask = *nodes;
242 		else
243 			pol->w.cpuset_mems_allowed =
244 						cpuset_current_mems_allowed;
245 	}
246 
247 	if (nodes)
248 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249 	else
250 		ret = mpol_ops[pol->mode].create(pol, NULL);
251 	return ret;
252 }
253 
254 /*
255  * This function just creates a new policy, does some checks and simple
256  * initialization. You must invoke mpol_set_nodemask() to set nodes.
257  */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 				  nodemask_t *nodes)
260 {
261 	struct mempolicy *policy;
262 
263 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 
266 	if (mode == MPOL_DEFAULT) {
267 		if (nodes && !nodes_empty(*nodes))
268 			return ERR_PTR(-EINVAL);
269 		return NULL;
270 	}
271 	VM_BUG_ON(!nodes);
272 
273 	/*
274 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 	 * All other modes require a valid pointer to a non-empty nodemask.
277 	 */
278 	if (mode == MPOL_PREFERRED) {
279 		if (nodes_empty(*nodes)) {
280 			if (((flags & MPOL_F_STATIC_NODES) ||
281 			     (flags & MPOL_F_RELATIVE_NODES)))
282 				return ERR_PTR(-EINVAL);
283 		}
284 	} else if (mode == MPOL_LOCAL) {
285 		if (!nodes_empty(*nodes))
286 			return ERR_PTR(-EINVAL);
287 		mode = MPOL_PREFERRED;
288 	} else if (nodes_empty(*nodes))
289 		return ERR_PTR(-EINVAL);
290 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291 	if (!policy)
292 		return ERR_PTR(-ENOMEM);
293 	atomic_set(&policy->refcnt, 1);
294 	policy->mode = mode;
295 	policy->flags = flags;
296 
297 	return policy;
298 }
299 
300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p)
302 {
303 	if (!atomic_dec_and_test(&p->refcnt))
304 		return;
305 	kmem_cache_free(policy_cache, p);
306 }
307 
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309 				enum mpol_rebind_step step)
310 {
311 }
312 
313 /*
314  * step:
315  * 	MPOL_REBIND_ONCE  - do rebind work at once
316  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
317  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
318  */
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320 				 enum mpol_rebind_step step)
321 {
322 	nodemask_t tmp;
323 
324 	if (pol->flags & MPOL_F_STATIC_NODES)
325 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 	else {
329 		/*
330 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331 		 * result
332 		 */
333 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334 			nodes_remap(tmp, pol->v.nodes,
335 					pol->w.cpuset_mems_allowed, *nodes);
336 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337 		} else if (step == MPOL_REBIND_STEP2) {
338 			tmp = pol->w.cpuset_mems_allowed;
339 			pol->w.cpuset_mems_allowed = *nodes;
340 		} else
341 			BUG();
342 	}
343 
344 	if (nodes_empty(tmp))
345 		tmp = *nodes;
346 
347 	if (step == MPOL_REBIND_STEP1)
348 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350 		pol->v.nodes = tmp;
351 	else
352 		BUG();
353 
354 	if (!node_isset(current->il_next, tmp)) {
355 		current->il_next = next_node(current->il_next, tmp);
356 		if (current->il_next >= MAX_NUMNODES)
357 			current->il_next = first_node(tmp);
358 		if (current->il_next >= MAX_NUMNODES)
359 			current->il_next = numa_node_id();
360 	}
361 }
362 
363 static void mpol_rebind_preferred(struct mempolicy *pol,
364 				  const nodemask_t *nodes,
365 				  enum mpol_rebind_step step)
366 {
367 	nodemask_t tmp;
368 
369 	if (pol->flags & MPOL_F_STATIC_NODES) {
370 		int node = first_node(pol->w.user_nodemask);
371 
372 		if (node_isset(node, *nodes)) {
373 			pol->v.preferred_node = node;
374 			pol->flags &= ~MPOL_F_LOCAL;
375 		} else
376 			pol->flags |= MPOL_F_LOCAL;
377 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379 		pol->v.preferred_node = first_node(tmp);
380 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
381 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
382 						   pol->w.cpuset_mems_allowed,
383 						   *nodes);
384 		pol->w.cpuset_mems_allowed = *nodes;
385 	}
386 }
387 
388 /*
389  * mpol_rebind_policy - Migrate a policy to a different set of nodes
390  *
391  * If the read-side task has no lock protecting task->mempolicy, the
392  * write-side task rebinds task->mempolicy in two steps. The first step
393  * sets all the newly allowed nodes, and the second step clears all the
394  * disallowed nodes. This way we avoid a window in which no node is
395  * available to allocate a page from.
396  * If the read side holds a lock protecting task->mempolicy, we rebind
397  * directly.
398  *
399  * step:
400  * 	MPOL_REBIND_ONCE  - do the rebind work at once
401  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
402  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
403  */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405 				enum mpol_rebind_step step)
406 {
407 	if (!pol)
408 		return;
409 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414 		return;
415 
416 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417 		BUG();
418 
419 	if (step == MPOL_REBIND_STEP1)
420 		pol->flags |= MPOL_F_REBINDING;
421 	else if (step == MPOL_REBIND_STEP2)
422 		pol->flags &= ~MPOL_F_REBINDING;
423 	else if (step >= MPOL_REBIND_NSTEP)
424 		BUG();
425 
426 	mpol_ops[pol->mode].rebind(pol, newmask, step);
427 }
428 
429 /*
430  * Wrapper for mpol_rebind_policy() that just requires task
431  * pointer, and updates task mempolicy.
432  *
433  * Called with task's alloc_lock held.
434  */
435 
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437 			enum mpol_rebind_step step)
438 {
439 	mpol_rebind_policy(tsk->mempolicy, new, step);
440 }
441 
442 /*
443  * Rebind each vma in mm to new nodemask.
444  *
445  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
446  */
447 
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 {
450 	struct vm_area_struct *vma;
451 
452 	down_write(&mm->mmap_sem);
453 	for (vma = mm->mmap; vma; vma = vma->vm_next)
454 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455 	up_write(&mm->mmap_sem);
456 }
457 
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459 	[MPOL_DEFAULT] = {
460 		.rebind = mpol_rebind_default,
461 	},
462 	[MPOL_INTERLEAVE] = {
463 		.create = mpol_new_interleave,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 	[MPOL_PREFERRED] = {
467 		.create = mpol_new_preferred,
468 		.rebind = mpol_rebind_preferred,
469 	},
470 	[MPOL_BIND] = {
471 		.create = mpol_new_bind,
472 		.rebind = mpol_rebind_nodemask,
473 	},
474 };
475 
476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477 				unsigned long flags);
478 
479 /*
480  * Scan through the pages, checking whether each one meets the given
481  * conditions, and move it to the pagelist if it does.
482  */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484 		unsigned long addr, unsigned long end,
485 		const nodemask_t *nodes, unsigned long flags,
486 		void *private)
487 {
488 	pte_t *orig_pte;
489 	pte_t *pte;
490 	spinlock_t *ptl;
491 
492 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493 	do {
494 		struct page *page;
495 		int nid;
496 
497 		if (!pte_present(*pte))
498 			continue;
499 		page = vm_normal_page(vma, addr, *pte);
500 		if (!page)
501 			continue;
502 		/*
503 		 * vm_normal_page() filters out zero pages, but there might
504 		 * still be PageReserved pages to skip, perhaps in a VDSO.
505 		 */
506 		if (PageReserved(page))
507 			continue;
508 		nid = page_to_nid(page);
509 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510 			continue;
511 
512 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513 			migrate_page_add(page, private, flags);
514 		else
515 			break;
516 	} while (pte++, addr += PAGE_SIZE, addr != end);
517 	pte_unmap_unlock(orig_pte, ptl);
518 	return addr != end;
519 }
520 
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 				    void *private)
524 {
525 #ifdef CONFIG_HUGETLB_PAGE
526 	int nid;
527 	struct page *page;
528 	spinlock_t *ptl;
529 
530 	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
531 	page = pte_page(huge_ptep_get((pte_t *)pmd));
532 	nid = page_to_nid(page);
533 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
534 		goto unlock;
535 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
536 	if (flags & (MPOL_MF_MOVE_ALL) ||
537 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
538 		isolate_huge_page(page, private);
539 unlock:
540 	spin_unlock(ptl);
541 #else
542 	BUG();
543 #endif
544 }
545 
546 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
547 		unsigned long addr, unsigned long end,
548 		const nodemask_t *nodes, unsigned long flags,
549 		void *private)
550 {
551 	pmd_t *pmd;
552 	unsigned long next;
553 
554 	pmd = pmd_offset(pud, addr);
555 	do {
556 		next = pmd_addr_end(addr, end);
557 		if (!pmd_present(*pmd))
558 			continue;
559 		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
560 			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
561 						flags, private);
562 			continue;
563 		}
564 		split_huge_page_pmd(vma, addr, pmd);
565 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
566 			continue;
567 		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
568 				    flags, private))
569 			return -EIO;
570 	} while (pmd++, addr = next, addr != end);
571 	return 0;
572 }
573 
574 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
575 		unsigned long addr, unsigned long end,
576 		const nodemask_t *nodes, unsigned long flags,
577 		void *private)
578 {
579 	pud_t *pud;
580 	unsigned long next;
581 
582 	pud = pud_offset(pgd, addr);
583 	do {
584 		next = pud_addr_end(addr, end);
585 		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
586 			continue;
587 		if (pud_none_or_clear_bad(pud))
588 			continue;
589 		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
590 				    flags, private))
591 			return -EIO;
592 	} while (pud++, addr = next, addr != end);
593 	return 0;
594 }
595 
596 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
597 		unsigned long addr, unsigned long end,
598 		const nodemask_t *nodes, unsigned long flags,
599 		void *private)
600 {
601 	pgd_t *pgd;
602 	unsigned long next;
603 
604 	pgd = pgd_offset(vma->vm_mm, addr);
605 	do {
606 		next = pgd_addr_end(addr, end);
607 		if (pgd_none_or_clear_bad(pgd))
608 			continue;
609 		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
610 				    flags, private))
611 			return -EIO;
612 	} while (pgd++, addr = next, addr != end);
613 	return 0;
614 }
615 
616 #ifdef CONFIG_NUMA_BALANCING
617 /*
618  * This is used to mark a range of virtual addresses to be inaccessible.
619  * These are later cleared by a NUMA hinting fault. Depending on these
620  * faults, pages may be migrated for better NUMA placement.
621  *
622  * This is assuming that NUMA faults are handled using PROT_NONE. If
623  * an architecture makes a different choice, it will need further
624  * changes to the core.
625  */
626 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 			unsigned long addr, unsigned long end)
628 {
629 	int nr_updated;
630 
631 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632 	if (nr_updated)
633 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 
635 	return nr_updated;
636 }
637 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 			unsigned long addr, unsigned long end)
640 {
641 	return 0;
642 }
643 #endif /* CONFIG_NUMA_BALANCING */
644 
645 /*
646  * Walk through page tables and collect pages to be migrated.
647  *
648  * If pages found in a given range are on a set of nodes (determined by
649  * @nodes and @flags), they are isolated and queued to the pagelist
650  * passed via @private.
651  */
652 static struct vm_area_struct *
653 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654 		const nodemask_t *nodes, unsigned long flags, void *private)
655 {
656 	int err;
657 	struct vm_area_struct *first, *vma, *prev;
658 
659 
660 	first = find_vma(mm, start);
661 	if (!first)
662 		return ERR_PTR(-EFAULT);
663 	prev = NULL;
664 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665 		unsigned long endvma = vma->vm_end;
666 
667 		if (endvma > end)
668 			endvma = end;
669 		if (vma->vm_start > start)
670 			start = vma->vm_start;
671 
672 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673 			if (!vma->vm_next && vma->vm_end < end)
674 				return ERR_PTR(-EFAULT);
675 			if (prev && prev->vm_end < vma->vm_start)
676 				return ERR_PTR(-EFAULT);
677 		}
678 
679 		if (flags & MPOL_MF_LAZY) {
680 			change_prot_numa(vma, start, endvma);
681 			goto next;
682 		}
683 
684 		if ((flags & MPOL_MF_STRICT) ||
685 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 		      vma_migratable(vma))) {
687 
688 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 						flags, private);
690 			if (err) {
691 				first = ERR_PTR(err);
692 				break;
693 			}
694 		}
695 next:
696 		prev = vma;
697 	}
698 	return first;
699 }
700 
701 /*
702  * Apply policy to a single VMA
703  * This must be called with the mmap_sem held for writing.
704  */
705 static int vma_replace_policy(struct vm_area_struct *vma,
706 						struct mempolicy *pol)
707 {
708 	int err;
709 	struct mempolicy *old;
710 	struct mempolicy *new;
711 
712 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
714 		 vma->vm_ops, vma->vm_file,
715 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716 
717 	new = mpol_dup(pol);
718 	if (IS_ERR(new))
719 		return PTR_ERR(new);
720 
721 	if (vma->vm_ops && vma->vm_ops->set_policy) {
722 		err = vma->vm_ops->set_policy(vma, new);
723 		if (err)
724 			goto err_out;
725 	}
726 
727 	old = vma->vm_policy;
728 	vma->vm_policy = new; /* protected by mmap_sem */
729 	mpol_put(old);
730 
731 	return 0;
732  err_out:
733 	mpol_put(new);
734 	return err;
735 }
736 
737 /* Step 2: apply policy to a range and do splits. */
738 static int mbind_range(struct mm_struct *mm, unsigned long start,
739 		       unsigned long end, struct mempolicy *new_pol)
740 {
741 	struct vm_area_struct *next;
742 	struct vm_area_struct *prev;
743 	struct vm_area_struct *vma;
744 	int err = 0;
745 	pgoff_t pgoff;
746 	unsigned long vmstart;
747 	unsigned long vmend;
748 
749 	vma = find_vma(mm, start);
750 	if (!vma || vma->vm_start > start)
751 		return -EFAULT;
752 
753 	prev = vma->vm_prev;
754 	if (start > vma->vm_start)
755 		prev = vma;
756 
757 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758 		next = vma->vm_next;
759 		vmstart = max(start, vma->vm_start);
760 		vmend   = min(end, vma->vm_end);
761 
762 		if (mpol_equal(vma_policy(vma), new_pol))
763 			continue;
764 
765 		pgoff = vma->vm_pgoff +
766 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
767 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768 				  vma->anon_vma, vma->vm_file, pgoff,
769 				  new_pol);
770 		if (prev) {
771 			vma = prev;
772 			next = vma->vm_next;
773 			if (mpol_equal(vma_policy(vma), new_pol))
774 				continue;
775 			/* vma_merge() joined vma && vma->next, case 8 */
776 			goto replace;
777 		}
778 		if (vma->vm_start != vmstart) {
779 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
780 			if (err)
781 				goto out;
782 		}
783 		if (vma->vm_end != vmend) {
784 			err = split_vma(vma->vm_mm, vma, vmend, 0);
785 			if (err)
786 				goto out;
787 		}
788  replace:
789 		err = vma_replace_policy(vma, new_pol);
790 		if (err)
791 			goto out;
792 	}
793 
794  out:
795 	return err;
796 }
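
/*
 * Example of the splitting done above (hypothetical addresses): applying
 * new_pol to [B, C) when a single VMA spans [A, D) with A < B < C < D leaves
 * three VMAs, [A, B), [B, C) and [C, D), and only the middle one has its
 * policy replaced via vma_replace_policy().  When a neighbouring VMA ends up
 * compatible, vma_merge() is given the chance to coalesce them again.
 */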
797 
798 /*
799  * Update task->flags PF_MEMPOLICY bit: set iff non-default
800  * mempolicy.  Allows more rapid checking of this (combined perhaps
801  * with other PF_* flag bits) on memory allocation hot code paths.
802  *
803  * If called from outside this file, the task 'p' should -only- be
804  * a newly forked child not yet visible on the task list, because
805  * manipulating the task flags of a visible task is not safe.
806  *
807  * The above limitation is why this routine has the funny name
808  * mpol_fix_fork_child_flag().
809  *
810  * It is also safe to call this with a task pointer of current,
811  * which the static wrapper mpol_set_task_struct_flag() does,
812  * for use within this file.
813  */
814 
815 void mpol_fix_fork_child_flag(struct task_struct *p)
816 {
817 	if (p->mempolicy)
818 		p->flags |= PF_MEMPOLICY;
819 	else
820 		p->flags &= ~PF_MEMPOLICY;
821 }
822 
823 static void mpol_set_task_struct_flag(void)
824 {
825 	mpol_fix_fork_child_flag(current);
826 }
827 
828 /* Set the process memory policy */
829 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
830 			     nodemask_t *nodes)
831 {
832 	struct mempolicy *new, *old;
833 	struct mm_struct *mm = current->mm;
834 	NODEMASK_SCRATCH(scratch);
835 	int ret;
836 
837 	if (!scratch)
838 		return -ENOMEM;
839 
840 	new = mpol_new(mode, flags, nodes);
841 	if (IS_ERR(new)) {
842 		ret = PTR_ERR(new);
843 		goto out;
844 	}
845 	/*
846 	 * prevent changing our mempolicy while show_numa_maps()
847 	 * is using it.
848 	 * Note:  do_set_mempolicy() can be called at init time
849 	 * with no 'mm'.
850 	 */
851 	if (mm)
852 		down_write(&mm->mmap_sem);
853 	task_lock(current);
854 	ret = mpol_set_nodemask(new, nodes, scratch);
855 	if (ret) {
856 		task_unlock(current);
857 		if (mm)
858 			up_write(&mm->mmap_sem);
859 		mpol_put(new);
860 		goto out;
861 	}
862 	old = current->mempolicy;
863 	current->mempolicy = new;
864 	mpol_set_task_struct_flag();
865 	if (new && new->mode == MPOL_INTERLEAVE &&
866 	    nodes_weight(new->v.nodes))
867 		current->il_next = first_node(new->v.nodes);
868 	task_unlock(current);
869 	if (mm)
870 		up_write(&mm->mmap_sem);
871 
872 	mpol_put(old);
873 	ret = 0;
874 out:
875 	NODEMASK_SCRATCH_FREE(scratch);
876 	return ret;
877 }
878 
879 /*
880  * Return nodemask for policy for get_mempolicy() query
881  *
882  * Called with task's alloc_lock held
883  */
884 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
885 {
886 	nodes_clear(*nodes);
887 	if (p == &default_policy)
888 		return;
889 
890 	switch (p->mode) {
891 	case MPOL_BIND:
892 		/* Fall through */
893 	case MPOL_INTERLEAVE:
894 		*nodes = p->v.nodes;
895 		break;
896 	case MPOL_PREFERRED:
897 		if (!(p->flags & MPOL_F_LOCAL))
898 			node_set(p->v.preferred_node, *nodes);
899 		/* else return empty node mask for local allocation */
900 		break;
901 	default:
902 		BUG();
903 	}
904 }
905 
906 static int lookup_node(struct mm_struct *mm, unsigned long addr)
907 {
908 	struct page *p;
909 	int err;
910 
911 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
912 	if (err >= 0) {
913 		err = page_to_nid(p);
914 		put_page(p);
915 	}
916 	return err;
917 }
918 
919 /* Retrieve NUMA policy */
920 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
921 			     unsigned long addr, unsigned long flags)
922 {
923 	int err;
924 	struct mm_struct *mm = current->mm;
925 	struct vm_area_struct *vma = NULL;
926 	struct mempolicy *pol = current->mempolicy;
927 
928 	if (flags &
929 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
930 		return -EINVAL;
931 
932 	if (flags & MPOL_F_MEMS_ALLOWED) {
933 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
934 			return -EINVAL;
935 		*policy = 0;	/* just so it's initialized */
936 		task_lock(current);
937 		*nmask  = cpuset_current_mems_allowed;
938 		task_unlock(current);
939 		return 0;
940 	}
941 
942 	if (flags & MPOL_F_ADDR) {
943 		/*
944 		 * Do NOT fall back to task policy if the
945 		 * vma/shared policy at addr is NULL.  We
946 		 * want to return MPOL_DEFAULT in this case.
947 		 */
948 		down_read(&mm->mmap_sem);
949 		vma = find_vma_intersection(mm, addr, addr+1);
950 		if (!vma) {
951 			up_read(&mm->mmap_sem);
952 			return -EFAULT;
953 		}
954 		if (vma->vm_ops && vma->vm_ops->get_policy)
955 			pol = vma->vm_ops->get_policy(vma, addr);
956 		else
957 			pol = vma->vm_policy;
958 	} else if (addr)
959 		return -EINVAL;
960 
961 	if (!pol)
962 		pol = &default_policy;	/* indicates default behavior */
963 
964 	if (flags & MPOL_F_NODE) {
965 		if (flags & MPOL_F_ADDR) {
966 			err = lookup_node(mm, addr);
967 			if (err < 0)
968 				goto out;
969 			*policy = err;
970 		} else if (pol == current->mempolicy &&
971 				pol->mode == MPOL_INTERLEAVE) {
972 			*policy = current->il_next;
973 		} else {
974 			err = -EINVAL;
975 			goto out;
976 		}
977 	} else {
978 		*policy = pol == &default_policy ? MPOL_DEFAULT :
979 						pol->mode;
980 		/*
981 		 * Internal mempolicy flags must be masked off before exposing
982 		 * the policy to userspace.
983 		 */
984 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
985 	}
986 
987 	if (vma) {
988 		up_read(&current->mm->mmap_sem);
989 		vma = NULL;
990 	}
991 
992 	err = 0;
993 	if (nmask) {
994 		if (mpol_store_user_nodemask(pol)) {
995 			*nmask = pol->w.user_nodemask;
996 		} else {
997 			task_lock(current);
998 			get_policy_nodemask(pol, nmask);
999 			task_unlock(current);
1000 		}
1001 	}
1002 
1003  out:
1004 	mpol_cond_put(pol);
1005 	if (vma)
1006 		up_read(&current->mm->mmap_sem);
1007 	return err;
1008 }
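
/*
 * User-space sketch of the query modes handled above (wrappers from
 * <numaif.h>; variables are illustrative, and maxnode must cover the running
 * kernel's MAX_NUMNODES whenever a mask is requested -- a single word is
 * assumed to suffice here):
 *
 *	int mode, node;
 *	unsigned long mask;
 *	void *addr = buf;
 *
 *	// policy of the calling task
 *	get_mempolicy(&mode, &mask, 8 * sizeof(mask), NULL, 0);
 *
 *	// policy of the VMA containing addr
 *	get_mempolicy(&mode, &mask, 8 * sizeof(mask), addr, MPOL_F_ADDR);
 *
 *	// node that currently backs addr (faults the page in if needed)
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_ADDR | MPOL_F_NODE);
 */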
1009 
1010 #ifdef CONFIG_MIGRATION
1011 /*
1012  * page migration
1013  */
1014 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1015 				unsigned long flags)
1016 {
1017 	/*
1018 	 * Avoid migrating a page that is shared with others.
1019 	 */
1020 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
1021 		if (!isolate_lru_page(page)) {
1022 			list_add_tail(&page->lru, pagelist);
1023 			inc_zone_page_state(page, NR_ISOLATED_ANON +
1024 					    page_is_file_cache(page));
1025 		}
1026 	}
1027 }
1028 
1029 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1030 {
1031 	if (PageHuge(page))
1032 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1033 					node);
1034 	else
1035 		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1036 }
1037 
1038 /*
1039  * Migrate pages from one node to a target node.
1040  * Returns error or the number of pages not migrated.
1041  */
1042 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1043 			   int flags)
1044 {
1045 	nodemask_t nmask;
1046 	LIST_HEAD(pagelist);
1047 	int err = 0;
1048 
1049 	nodes_clear(nmask);
1050 	node_set(source, nmask);
1051 
1052 	/*
1053 	 * This does not "check" the range but isolates all pages that
1054 	 * need migration.  Between passing in the full user address
1055 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1056 	 */
1057 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1058 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1059 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1060 
1061 	if (!list_empty(&pagelist)) {
1062 		err = migrate_pages(&pagelist, new_node_page, dest,
1063 					MIGRATE_SYNC, MR_SYSCALL);
1064 		if (err)
1065 			putback_movable_pages(&pagelist);
1066 	}
1067 
1068 	return err;
1069 }
1070 
1071 /*
1072  * Move pages between the two nodesets so as to preserve the physical
1073  * layout as much as possible.
1074  *
1075  * Returns the number of pages that could not be moved.
1076  */
1077 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1078 		     const nodemask_t *to, int flags)
1079 {
1080 	int busy = 0;
1081 	int err;
1082 	nodemask_t tmp;
1083 
1084 	err = migrate_prep();
1085 	if (err)
1086 		return err;
1087 
1088 	down_read(&mm->mmap_sem);
1089 
1090 	err = migrate_vmas(mm, from, to, flags);
1091 	if (err)
1092 		goto out;
1093 
1094 	/*
1095 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1096 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1097 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1098 	 * The pair of nodemasks 'to' and 'from' define the map.
1099 	 *
1100 	 * If no pair of bits is found that way, fall back to picking some
1101 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1102 	 * 'source' and 'dest' bits are the same, this represents a node
1103 	 * that will be migrating to itself, so no pages need move.
1104 	 *
1105 	 * If no bits are left in 'tmp', or if all remaining bits left
1106 	 * in 'tmp' correspond to the same bit in 'to', return false
1107 	 * (nothing left to migrate).
1108 	 *
1109 	 * This lets us pick a pair of nodes to migrate between, such that
1110 	 * if possible the dest node is not already occupied by some other
1111 	 * source node, minimizing the risk of overloading the memory on a
1112 	 * node, which would happen if we migrated incoming memory to a node
1113 	 * before migrating the outgoing memory sourced from that same node.
1114 	 *
1115 	 * A single scan of tmp is sufficient.  As we go, we remember the
1116 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1117 	 * that not only moved, but what's better, moved to an empty slot
1118 	 * (d is not set in tmp), then we break out with that pair.
1119 	 * Otherwise, when we finish scanning tmp, we at least have the
1120 	 * most recent <s, d> pair that moved.  If we get all the way through
1121 	 * the scan of tmp without finding any node that moved, much less
1122 	 * moved to an empty node, then there is nothing left worth migrating.
1123 	 */
1124 
1125 	tmp = *from;
1126 	while (!nodes_empty(tmp)) {
1127 		int s,d;
1128 		int source = NUMA_NO_NODE;
1129 		int dest = 0;
1130 
1131 		for_each_node_mask(s, tmp) {
1132 
1133 			/*
1134 			 * do_migrate_pages() tries to maintain the relative
1135 			 * node relationship of the pages established between
1136 			 * threads and memory areas.
1137 			 *
1138 			 * However, if the number of source nodes is not equal to
1139 			 * the number of destination nodes we can not preserve
1140 			 * this node relative relationship.  In that case, skip
1141 			 * copying memory from a node that is in the destination
1142 			 * mask.
1143 			 *
1144 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1145 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1146 			 */
1147 
1148 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1149 						(node_isset(s, *to)))
1150 				continue;
1151 
1152 			d = node_remap(s, *from, *to);
1153 			if (s == d)
1154 				continue;
1155 
1156 			source = s;	/* Node moved. Memorize */
1157 			dest = d;
1158 
1159 			/* dest not in remaining from nodes? */
1160 			if (!node_isset(dest, tmp))
1161 				break;
1162 		}
1163 		if (source == NUMA_NO_NODE)
1164 			break;
1165 
1166 		node_clear(source, tmp);
1167 		err = migrate_to_node(mm, source, dest, flags);
1168 		if (err > 0)
1169 			busy += err;
1170 		if (err < 0)
1171 			break;
1172 	}
1173 out:
1174 	up_read(&mm->mmap_sem);
1175 	if (err < 0)
1176 		return err;
1177 	return busy;
1178 
1179 }
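
/*
 * Trace of the pair-picking loop above for the [0-7] -> [3,4,5] example from
 * the comment: node_remap() sends 0 and 6 to 3, 1 and 7 to 4, and 2 to 5,
 * while 3, 4 and 5 are skipped as sources because they are already in the
 * destination set.  Successive passes therefore migrate 7->4, 6->3, 2->5,
 * 1->4 and finally 0->3, after which only {3,4,5} remain in tmp and the loop
 * terminates with source left unset.
 */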
1180 
1181 /*
1182  * Allocate a new page for page migration based on vma policy.
1183  * Start by assuming the page is mapped by the vma pointed to by @private.
1184  * Search forward from there, if not.  N.B., this assumes that the
1185  * list of pages handed to migrate_pages()--which is how we get here--
1186  * is in virtual address order.
1187  */
1188 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1189 {
1190 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1191 	unsigned long uninitialized_var(address);
1192 
1193 	while (vma) {
1194 		address = page_address_in_vma(page, vma);
1195 		if (address != -EFAULT)
1196 			break;
1197 		vma = vma->vm_next;
1198 	}
1199 
1200 	if (PageHuge(page)) {
1201 		BUG_ON(!vma);
1202 		return alloc_huge_page_noerr(vma, address, 1);
1203 	}
1204 	/*
1205 	 * if !vma, alloc_page_vma() will use task or system default policy
1206 	 */
1207 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1208 }
1209 #else
1210 
1211 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1212 				unsigned long flags)
1213 {
1214 }
1215 
1216 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1217 		     const nodemask_t *to, int flags)
1218 {
1219 	return -ENOSYS;
1220 }
1221 
1222 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1223 {
1224 	return NULL;
1225 }
1226 #endif
1227 
1228 static long do_mbind(unsigned long start, unsigned long len,
1229 		     unsigned short mode, unsigned short mode_flags,
1230 		     nodemask_t *nmask, unsigned long flags)
1231 {
1232 	struct vm_area_struct *vma;
1233 	struct mm_struct *mm = current->mm;
1234 	struct mempolicy *new;
1235 	unsigned long end;
1236 	int err;
1237 	LIST_HEAD(pagelist);
1238 
1239 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1240 		return -EINVAL;
1241 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1242 		return -EPERM;
1243 
1244 	if (start & ~PAGE_MASK)
1245 		return -EINVAL;
1246 
1247 	if (mode == MPOL_DEFAULT)
1248 		flags &= ~MPOL_MF_STRICT;
1249 
1250 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1251 	end = start + len;
1252 
1253 	if (end < start)
1254 		return -EINVAL;
1255 	if (end == start)
1256 		return 0;
1257 
1258 	new = mpol_new(mode, mode_flags, nmask);
1259 	if (IS_ERR(new))
1260 		return PTR_ERR(new);
1261 
1262 	if (flags & MPOL_MF_LAZY)
1263 		new->flags |= MPOL_F_MOF;
1264 
1265 	/*
1266 	 * If we are using the default policy then operating
1267 	 * on discontinuous address ranges is okay after all.
1268 	 */
1269 	if (!new)
1270 		flags |= MPOL_MF_DISCONTIG_OK;
1271 
1272 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1273 		 start, start + len, mode, mode_flags,
1274 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1275 
1276 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1277 
1278 		err = migrate_prep();
1279 		if (err)
1280 			goto mpol_out;
1281 	}
1282 	{
1283 		NODEMASK_SCRATCH(scratch);
1284 		if (scratch) {
1285 			down_write(&mm->mmap_sem);
1286 			task_lock(current);
1287 			err = mpol_set_nodemask(new, nmask, scratch);
1288 			task_unlock(current);
1289 			if (err)
1290 				up_write(&mm->mmap_sem);
1291 		} else
1292 			err = -ENOMEM;
1293 		NODEMASK_SCRATCH_FREE(scratch);
1294 	}
1295 	if (err)
1296 		goto mpol_out;
1297 
1298 	vma = queue_pages_range(mm, start, end, nmask,
1299 			  flags | MPOL_MF_INVERT, &pagelist);
1300 
1301 	err = PTR_ERR(vma);	/* maybe ... */
1302 	if (!IS_ERR(vma))
1303 		err = mbind_range(mm, start, end, new);
1304 
1305 	if (!err) {
1306 		int nr_failed = 0;
1307 
1308 		if (!list_empty(&pagelist)) {
1309 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1310 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1311 					(unsigned long)vma,
1312 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1313 			if (nr_failed)
1314 				putback_movable_pages(&pagelist);
1315 		}
1316 
1317 		if (nr_failed && (flags & MPOL_MF_STRICT))
1318 			err = -EIO;
1319 	} else
1320 		putback_movable_pages(&pagelist);
1321 
1322 	up_write(&mm->mmap_sem);
1323  mpol_out:
1324 	mpol_put(new);
1325 	return err;
1326 }
1327 
1328 /*
1329  * User space interface with variable sized bitmaps for nodelists.
1330  */
1331 
1332 /* Copy a node mask from user space. */
1333 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1334 		     unsigned long maxnode)
1335 {
1336 	unsigned long k;
1337 	unsigned long nlongs;
1338 	unsigned long endmask;
1339 
1340 	--maxnode;
1341 	nodes_clear(*nodes);
1342 	if (maxnode == 0 || !nmask)
1343 		return 0;
1344 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1345 		return -EINVAL;
1346 
1347 	nlongs = BITS_TO_LONGS(maxnode);
1348 	if ((maxnode % BITS_PER_LONG) == 0)
1349 		endmask = ~0UL;
1350 	else
1351 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1352 
1353 	/* When the user specifies more nodes than supported just check
1354 	   that the unsupported part is all zero. */
1355 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1356 		if (nlongs > PAGE_SIZE/sizeof(long))
1357 			return -EINVAL;
1358 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1359 			unsigned long t;
1360 			if (get_user(t, nmask + k))
1361 				return -EFAULT;
1362 			if (k == nlongs - 1) {
1363 				if (t & endmask)
1364 					return -EINVAL;
1365 			} else if (t)
1366 				return -EINVAL;
1367 		}
1368 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1369 		endmask = ~0UL;
1370 	}
1371 
1372 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1373 		return -EFAULT;
1374 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1375 	return 0;
1376 }
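
/*
 * Worked example for get_nodes() on a 64-bit kernel: a user passing
 * maxnode == 17 ends up with maxnode = 16 after the decrement, so nlongs = 1
 * and endmask = 0xffff, and only nodes 0-15 survive the final masking.
 * Passing maxnode == 65 gives nlongs = 1 and endmask = ~0UL, i.e. all 64 bits
 * of the first word are accepted.
 */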
1377 
1378 /* Copy a kernel node mask to user space */
1379 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1380 			      nodemask_t *nodes)
1381 {
1382 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1383 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1384 
1385 	if (copy > nbytes) {
1386 		if (copy > PAGE_SIZE)
1387 			return -EINVAL;
1388 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1389 			return -EFAULT;
1390 		copy = nbytes;
1391 	}
1392 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1393 }
1394 
1395 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1396 		unsigned long, mode, unsigned long __user *, nmask,
1397 		unsigned long, maxnode, unsigned, flags)
1398 {
1399 	nodemask_t nodes;
1400 	int err;
1401 	unsigned short mode_flags;
1402 
1403 	mode_flags = mode & MPOL_MODE_FLAGS;
1404 	mode &= ~MPOL_MODE_FLAGS;
1405 	if (mode >= MPOL_MAX)
1406 		return -EINVAL;
1407 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1408 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1409 		return -EINVAL;
1410 	err = get_nodes(&nodes, nmask, maxnode);
1411 	if (err)
1412 		return err;
1413 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1414 }
1415 
1416 /* Set the process memory policy */
1417 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1418 		unsigned long, maxnode)
1419 {
1420 	int err;
1421 	nodemask_t nodes;
1422 	unsigned short flags;
1423 
1424 	flags = mode & MPOL_MODE_FLAGS;
1425 	mode &= ~MPOL_MODE_FLAGS;
1426 	if ((unsigned int)mode >= MPOL_MAX)
1427 		return -EINVAL;
1428 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1429 		return -EINVAL;
1430 	err = get_nodes(&nodes, nmask, maxnode);
1431 	if (err)
1432 		return err;
1433 	return do_set_mempolicy(mode, flags, &nodes);
1434 }
1435 
1436 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1437 		const unsigned long __user *, old_nodes,
1438 		const unsigned long __user *, new_nodes)
1439 {
1440 	const struct cred *cred = current_cred(), *tcred;
1441 	struct mm_struct *mm = NULL;
1442 	struct task_struct *task;
1443 	nodemask_t task_nodes;
1444 	int err;
1445 	nodemask_t *old;
1446 	nodemask_t *new;
1447 	NODEMASK_SCRATCH(scratch);
1448 
1449 	if (!scratch)
1450 		return -ENOMEM;
1451 
1452 	old = &scratch->mask1;
1453 	new = &scratch->mask2;
1454 
1455 	err = get_nodes(old, old_nodes, maxnode);
1456 	if (err)
1457 		goto out;
1458 
1459 	err = get_nodes(new, new_nodes, maxnode);
1460 	if (err)
1461 		goto out;
1462 
1463 	/* Find the mm_struct */
1464 	rcu_read_lock();
1465 	task = pid ? find_task_by_vpid(pid) : current;
1466 	if (!task) {
1467 		rcu_read_unlock();
1468 		err = -ESRCH;
1469 		goto out;
1470 	}
1471 	get_task_struct(task);
1472 
1473 	err = -EINVAL;
1474 
1475 	/*
1476 	 * Check if this process has the right to modify the specified
1477 	 * process. The right exists if the process has administrative
1478 	 * capabilities, superuser privileges or the same
1479 	 * userid as the target process.
1480 	 */
1481 	tcred = __task_cred(task);
1482 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1483 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1484 	    !capable(CAP_SYS_NICE)) {
1485 		rcu_read_unlock();
1486 		err = -EPERM;
1487 		goto out_put;
1488 	}
1489 	rcu_read_unlock();
1490 
1491 	task_nodes = cpuset_mems_allowed(task);
1492 	/* Is the user allowed to access the target nodes? */
1493 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1494 		err = -EPERM;
1495 		goto out_put;
1496 	}
1497 
1498 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1499 		err = -EINVAL;
1500 		goto out_put;
1501 	}
1502 
1503 	err = security_task_movememory(task);
1504 	if (err)
1505 		goto out_put;
1506 
1507 	mm = get_task_mm(task);
1508 	put_task_struct(task);
1509 
1510 	if (!mm) {
1511 		err = -EINVAL;
1512 		goto out;
1513 	}
1514 
1515 	err = do_migrate_pages(mm, old, new,
1516 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1517 
1518 	mmput(mm);
1519 out:
1520 	NODEMASK_SCRATCH_FREE(scratch);
1521 
1522 	return err;
1523 
1524 out_put:
1525 	put_task_struct(task);
1526 	goto out;
1527 
1528 }
1529 
1530 
1531 /* Retrieve NUMA policy */
1532 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1533 		unsigned long __user *, nmask, unsigned long, maxnode,
1534 		unsigned long, addr, unsigned long, flags)
1535 {
1536 	int err;
1537 	int uninitialized_var(pval);
1538 	nodemask_t nodes;
1539 
1540 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1541 		return -EINVAL;
1542 
1543 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1544 
1545 	if (err)
1546 		return err;
1547 
1548 	if (policy && put_user(pval, policy))
1549 		return -EFAULT;
1550 
1551 	if (nmask)
1552 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1553 
1554 	return err;
1555 }
1556 
1557 #ifdef CONFIG_COMPAT
1558 
1559 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1560 				     compat_ulong_t __user *nmask,
1561 				     compat_ulong_t maxnode,
1562 				     compat_ulong_t addr, compat_ulong_t flags)
1563 {
1564 	long err;
1565 	unsigned long __user *nm = NULL;
1566 	unsigned long nr_bits, alloc_size;
1567 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1568 
1569 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1570 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1571 
1572 	if (nmask)
1573 		nm = compat_alloc_user_space(alloc_size);
1574 
1575 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1576 
1577 	if (!err && nmask) {
1578 		unsigned long copy_size;
1579 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1580 		err = copy_from_user(bm, nm, copy_size);
1581 		/* ensure entire bitmap is zeroed */
1582 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1583 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1584 	}
1585 
1586 	return err;
1587 }
1588 
1589 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1590 				     compat_ulong_t maxnode)
1591 {
1592 	long err = 0;
1593 	unsigned long __user *nm = NULL;
1594 	unsigned long nr_bits, alloc_size;
1595 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1596 
1597 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1598 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1599 
1600 	if (nmask) {
1601 		err = compat_get_bitmap(bm, nmask, nr_bits);
1602 		nm = compat_alloc_user_space(alloc_size);
1603 		err |= copy_to_user(nm, bm, alloc_size);
1604 	}
1605 
1606 	if (err)
1607 		return -EFAULT;
1608 
1609 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1610 }
1611 
1612 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1613 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1614 			     compat_ulong_t maxnode, compat_ulong_t flags)
1615 {
1616 	long err = 0;
1617 	unsigned long __user *nm = NULL;
1618 	unsigned long nr_bits, alloc_size;
1619 	nodemask_t bm;
1620 
1621 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1622 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1623 
1624 	if (nmask) {
1625 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1626 		nm = compat_alloc_user_space(alloc_size);
1627 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1628 	}
1629 
1630 	if (err)
1631 		return -EFAULT;
1632 
1633 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1634 }
1635 
1636 #endif
1637 
1638 /*
1639  * get_vma_policy(@task, @vma, @addr)
1640  * @task - task for fallback if vma policy == default
1641  * @vma   - virtual memory area whose policy is sought
1642  * @addr  - address in @vma for shared policy lookup
1643  *
1644  * Returns effective policy for a VMA at specified address.
1645  * Falls back to @task or system default policy, as necessary.
1646  * Current or other task's task mempolicy and non-shared vma policies must be
1647  * protected by task_lock(task) by the caller.
1648  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1649  * count--added by the get_policy() vm_op, as appropriate--to protect against
1650  * freeing by another task.  It is the caller's responsibility to free the
1651  * extra reference for shared policies.
1652  */
1653 struct mempolicy *get_vma_policy(struct task_struct *task,
1654 		struct vm_area_struct *vma, unsigned long addr)
1655 {
1656 	struct mempolicy *pol = get_task_policy(task);
1657 
1658 	if (vma) {
1659 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1660 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1661 									addr);
1662 			if (vpol)
1663 				pol = vpol;
1664 		} else if (vma->vm_policy) {
1665 			pol = vma->vm_policy;
1666 
1667 			/*
1668 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1669 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1670 			 * count on these policies which will be dropped by
1671 			 * mpol_cond_put() later
1672 			 */
1673 			if (mpol_needs_cond_ref(pol))
1674 				mpol_get(pol);
1675 		}
1676 	}
1677 	if (!pol)
1678 		pol = &default_policy;
1679 	return pol;
1680 }
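
/*
 * Typical caller pattern for the lookup above (sketch only; error handling
 * and gfp details omitted):
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... pick a node/zonelist according to pol ...
 *	mpol_cond_put(pol);	// drops the ref only for shared policies
 */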
1681 
1682 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1683 {
1684 	struct mempolicy *pol = get_task_policy(task);
1685 	if (vma) {
1686 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1687 			bool ret = false;
1688 
1689 			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1690 			if (pol && (pol->flags & MPOL_F_MOF))
1691 				ret = true;
1692 			mpol_cond_put(pol);
1693 
1694 			return ret;
1695 		} else if (vma->vm_policy) {
1696 			pol = vma->vm_policy;
1697 		}
1698 	}
1699 
1700 	if (!pol)
1701 		return default_policy.flags & MPOL_F_MOF;
1702 
1703 	return pol->flags & MPOL_F_MOF;
1704 }
1705 
1706 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1707 {
1708 	enum zone_type dynamic_policy_zone = policy_zone;
1709 
1710 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1711 
1712 	/*
1713 	 * If policy->v.nodes has movable memory only,
1714 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1715 	 *
1716 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1717 	 * so if the following test fails, it implies
1718 	 * policy->v.nodes has movable memory only.
1719 	 */
1720 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1721 		dynamic_policy_zone = ZONE_MOVABLE;
1722 
1723 	return zone >= dynamic_policy_zone;
1724 }
1725 
1726 /*
1727  * Return a nodemask representing a mempolicy for filtering nodes for
1728  * page allocation
1729  */
1730 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1731 {
1732 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1733 	if (unlikely(policy->mode == MPOL_BIND) &&
1734 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1735 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1736 		return &policy->v.nodes;
1737 
1738 	return NULL;
1739 }
1740 
1741 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1742 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1743 	int nd)
1744 {
1745 	switch (policy->mode) {
1746 	case MPOL_PREFERRED:
1747 		if (!(policy->flags & MPOL_F_LOCAL))
1748 			nd = policy->v.preferred_node;
1749 		break;
1750 	case MPOL_BIND:
1751 		/*
1752 		 * Normally, MPOL_BIND allocations are node-local within the
1753 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1754 		 * current node isn't part of the mask, we use the zonelist for
1755 		 * the first node in the mask instead.
1756 		 */
1757 		if (unlikely(gfp & __GFP_THISNODE) &&
1758 				unlikely(!node_isset(nd, policy->v.nodes)))
1759 			nd = first_node(policy->v.nodes);
1760 		break;
1761 	default:
1762 		BUG();
1763 	}
1764 	return node_zonelist(nd, gfp);
1765 }
1766 
1767 /* Do dynamic interleaving for a process */
1768 static unsigned interleave_nodes(struct mempolicy *policy)
1769 {
1770 	unsigned nid, next;
1771 	struct task_struct *me = current;
1772 
1773 	nid = me->il_next;
1774 	next = next_node(nid, policy->v.nodes);
1775 	if (next >= MAX_NUMNODES)
1776 		next = first_node(policy->v.nodes);
1777 	if (next < MAX_NUMNODES)
1778 		me->il_next = next;
1779 	return nid;
1780 }
1781 
1782 /*
1783  * Depending on the memory policy, provide a node from which to allocate
1784  * the next slab entry.
1785  * @policy must be protected from freeing by the caller.  If @policy is
1786  * the current task's mempolicy, this protection is implicit, as only the
1787  * task can change its policy.  The system default policy requires no
1788  * such protection.
1789  */
1790 unsigned slab_node(void)
1791 {
1792 	struct mempolicy *policy;
1793 
1794 	if (in_interrupt())
1795 		return numa_node_id();
1796 
1797 	policy = current->mempolicy;
1798 	if (!policy || policy->flags & MPOL_F_LOCAL)
1799 		return numa_node_id();
1800 
1801 	switch (policy->mode) {
1802 	case MPOL_PREFERRED:
1803 		/*
1804 		 * handled MPOL_F_LOCAL above
1805 		 */
1806 		return policy->v.preferred_node;
1807 
1808 	case MPOL_INTERLEAVE:
1809 		return interleave_nodes(policy);
1810 
1811 	case MPOL_BIND: {
1812 		/*
1813 		 * Follow bind policy behavior and start allocation at the
1814 		 * first node.
1815 		 */
1816 		struct zonelist *zonelist;
1817 		struct zone *zone;
1818 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1819 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1820 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1821 							&policy->v.nodes,
1822 							&zone);
1823 		return zone ? zone->node : numa_node_id();
1824 	}
1825 
1826 	default:
1827 		BUG();
1828 	}
1829 }
1830 
1831 /* Do static interleaving for a VMA with known offset. */
1832 static unsigned offset_il_node(struct mempolicy *pol,
1833 		struct vm_area_struct *vma, unsigned long off)
1834 {
1835 	unsigned nnodes = nodes_weight(pol->v.nodes);
1836 	unsigned target;
1837 	int c;
1838 	int nid = NUMA_NO_NODE;
1839 
1840 	if (!nnodes)
1841 		return numa_node_id();
1842 	target = (unsigned int)off % nnodes;
1843 	c = 0;
1844 	do {
1845 		nid = next_node(nid, pol->v.nodes);
1846 		c++;
1847 	} while (c <= target);
1848 	return nid;
1849 }
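
/*
 * Worked example for offset_il_node(): with pol->v.nodes = {1,3,5}
 * (nnodes = 3) and off = 7, target = 7 % 3 = 1, so the loop stops at the
 * second set node and returns node 3.
 */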
1850 
1851 /* Determine a node number for interleave */
1852 static inline unsigned interleave_nid(struct mempolicy *pol,
1853 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1854 {
1855 	if (vma) {
1856 		unsigned long off;
1857 
1858 		/*
1859 		 * for small pages, there is no difference between
1860 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1861 		 * for huge pages, since vm_pgoff is in units of small
1862 		 * pages, we need to shift off the always 0 bits to get
1863 		 * a useful offset.
1864 		 */
1865 		BUG_ON(shift < PAGE_SHIFT);
1866 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1867 		off += (addr - vma->vm_start) >> shift;
1868 		return offset_il_node(pol, vma, off);
1869 	} else
1870 		return interleave_nodes(pol);
1871 }
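/*
 * Example of the offset scaling (assuming x86 with 2MB huge pages, i.e.
 * shift = 21 and PAGE_SHIFT = 12): vm_pgoff is kept in small-page units,
 * so off = vm_pgoff >> 9 converts it to huge-page units before adding
 * (addr - vma->vm_start) >> 21, the huge-page index within the mapping.
 */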
1872 
1873 /*
1874  * Return the bit number of a random bit set in the nodemask.
1875  * (returns NUMA_NO_NODE if nodemask is empty)
1876  */
1877 int node_random(const nodemask_t *maskp)
1878 {
1879 	int w, bit = NUMA_NO_NODE;
1880 
1881 	w = nodes_weight(*maskp);
1882 	if (w)
1883 		bit = bitmap_ord_to_pos(maskp->bits,
1884 			get_random_int() % w, MAX_NUMNODES);
1885 	return bit;
1886 }
1887 
1888 #ifdef CONFIG_HUGETLBFS
1889 /*
1890  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1891  * @vma = virtual memory area whose policy is sought
1892  * @addr = address in @vma for shared policy lookup and interleave policy
1893  * @gfp_flags = for requested zone
1894  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1895  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1896  *
1897  * Returns a zonelist suitable for a huge page allocation and a pointer
1898  * to the struct mempolicy for conditional unref after allocation.
1899  * If the effective policy is 'bind', returns a pointer to the mempolicy's
1900  * @nodemask for filtering the zonelist.
1901  *
1902  * Must be protected by get_mems_allowed()
1903  */
1904 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1905 				gfp_t gfp_flags, struct mempolicy **mpol,
1906 				nodemask_t **nodemask)
1907 {
1908 	struct zonelist *zl;
1909 
1910 	*mpol = get_vma_policy(current, vma, addr);
1911 	*nodemask = NULL;	/* assume !MPOL_BIND */
1912 
1913 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1914 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1915 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1916 	} else {
1917 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1918 		if ((*mpol)->mode == MPOL_BIND)
1919 			*nodemask = &(*mpol)->v.nodes;
1920 	}
1921 	return zl;
1922 }
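/*
 * Sketch of the expected caller pattern (not verbatim; modelled on the
 * hugetlb fault path): walk the returned zonelist, filtered by *nodemask
 * when the policy is MPOL_BIND, then drop the conditional reference:
 *
 *	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
 *	for_each_zone_zonelist_nodemask(zone, z, zl, gfp_zone(gfp), nodemask)
 *		...try to dequeue a huge page from this zone...
 *	mpol_cond_put(mpol);
 */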
1923 
1924 /*
1925  * init_nodemask_of_mempolicy
1926  *
1927  * If the current task's mempolicy is "default" [NULL], return 'false'
1928  * to indicate default policy.  Otherwise, extract the policy nodemask
1929  * for 'bind' or 'interleave' policy into the argument nodemask, or
1930  * initialize the argument nodemask to contain the single node for
1931  * 'preferred' or 'local' policy and return 'true' to indicate presence
1932  * of non-default mempolicy.
1933  *
1934  * We don't bother with reference counting the mempolicy [mpol_get/put]
1935  * because the current task is examining its own mempolicy and a task's
1936  * mempolicy is only ever changed by the task itself.
1937  *
1938  * N.B., it is the caller's responsibility to free a returned nodemask.
1939  */
1940 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1941 {
1942 	struct mempolicy *mempolicy;
1943 	int nid;
1944 
1945 	if (!(mask && current->mempolicy))
1946 		return false;
1947 
1948 	task_lock(current);
1949 	mempolicy = current->mempolicy;
1950 	switch (mempolicy->mode) {
1951 	case MPOL_PREFERRED:
1952 		if (mempolicy->flags & MPOL_F_LOCAL)
1953 			nid = numa_node_id();
1954 		else
1955 			nid = mempolicy->v.preferred_node;
1956 		init_nodemask_of_node(mask, nid);
1957 		break;
1958 
1959 	case MPOL_BIND:
1960 		/* Fall through */
1961 	case MPOL_INTERLEAVE:
1962 		*mask = mempolicy->v.nodes;
1963 		break;
1964 
1965 	default:
1966 		BUG();
1967 	}
1968 	task_unlock(current);
1969 
1970 	return true;
1971 }
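/*
 * Illustrative use (a sketch, not verbatim): a caller that sizes per-node
 * allocations, such as hugetlb's nr_hugepages_mempolicy handling, might do:
 *
 *	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL | __GFP_NORETRY);
 *	if (mask && init_nodemask_of_mempolicy(mask))
 *		...restrict the per-node loop to the nodes in *mask...
 *	NODEMASK_FREE(mask);
 */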
1972 #endif
1973 
1974 /*
1975  * mempolicy_nodemask_intersects
1976  *
1977  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1978  * policy.  Otherwise, check for intersection between mask and the policy
1979  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1980  * policy, always return true since it may allocate elsewhere on fallback.
1981  *
1982  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1983  */
1984 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1985 					const nodemask_t *mask)
1986 {
1987 	struct mempolicy *mempolicy;
1988 	bool ret = true;
1989 
1990 	if (!mask)
1991 		return ret;
1992 	task_lock(tsk);
1993 	mempolicy = tsk->mempolicy;
1994 	if (!mempolicy)
1995 		goto out;
1996 
1997 	switch (mempolicy->mode) {
1998 	case MPOL_PREFERRED:
1999 		/*
2000 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
2001 		 * to allocate from; they may fall back to other nodes under OOM.
2002 		 * Thus, it's possible for tsk to have allocated memory from
2003 		 * nodes in mask.
2004 		 */
2005 		break;
2006 	case MPOL_BIND:
2007 	case MPOL_INTERLEAVE:
2008 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2009 		break;
2010 	default:
2011 		BUG();
2012 	}
2013 out:
2014 	task_unlock(tsk);
2015 	return ret;
2016 }
2017 
2018 /* Allocate a page in interleaved policy.
2019    Own path because it needs to do special accounting. */
2020 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2021 					unsigned nid)
2022 {
2023 	struct zonelist *zl;
2024 	struct page *page;
2025 
2026 	zl = node_zonelist(nid, gfp);
2027 	page = __alloc_pages(gfp, order, zl);
2028 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
2029 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
2030 	return page;
2031 }
2032 
2033 /**
2034  * 	alloc_pages_vma	- Allocate a page for a VMA.
2035  *
2036  * 	@gfp:
2037  *      %GFP_USER    user allocation.
2038  *      %GFP_KERNEL  kernel allocations,
2039  *      %GFP_HIGHMEM highmem/user allocations,
2040  *      %GFP_FS      allocation should not call back into a file system.
2041  *      %GFP_ATOMIC  don't sleep.
2042  *
2043  *	@order: Order of the GFP allocation.
2044  * 	@vma:  Pointer to VMA or NULL if not available.
2045  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2046  *
2047  * 	This function allocates a page from the kernel page pool and applies
2048  *	a NUMA policy associated with the VMA or the current process.
2049  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2050  *	mm_struct of the VMA to prevent it from going away. Should be used for
2051  *	all allocations for pages that will be mapped into
2052  * 	user space. Returns NULL when no page can be allocated.
2053  *
2054  *	Should be called with the mmap_sem of the vma held.
2055  */
2056 struct page *
2057 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2058 		unsigned long addr, int node)
2059 {
2060 	struct mempolicy *pol;
2061 	struct page *page;
2062 	unsigned int cpuset_mems_cookie;
2063 
2064 retry_cpuset:
2065 	pol = get_vma_policy(current, vma, addr);
2066 	cpuset_mems_cookie = get_mems_allowed();
2067 
2068 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2069 		unsigned nid;
2070 
2071 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2072 		mpol_cond_put(pol);
2073 		page = alloc_page_interleave(gfp, order, nid);
2074 		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2075 			goto retry_cpuset;
2076 
2077 		return page;
2078 	}
2079 	page = __alloc_pages_nodemask(gfp, order,
2080 				      policy_zonelist(gfp, pol, node),
2081 				      policy_nodemask(gfp, pol));
2082 	if (unlikely(mpol_needs_cond_ref(pol)))
2083 		__mpol_put(pol);
2084 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2085 		goto retry_cpuset;
2086 	return page;
2087 }
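/*
 * Note: most callers reach this through the alloc_page_vma() wrapper,
 * which (roughly, see <linux/gfp.h>) passes order 0 and numa_node_id()
 * as the preferred node, e.g.:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 */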
2088 
2089 /**
2090  * 	alloc_pages_current - Allocate pages.
2091  *
2092  *	@gfp:
2093  *		%GFP_USER   user allocation,
2094  *      	%GFP_KERNEL kernel allocation,
2095  *      	%GFP_HIGHMEM highmem allocation,
2096  *      	%GFP_FS     don't call back into a file system.
2097  *      	%GFP_ATOMIC don't sleep.
2098  *	@order: Power of two of allocation size in pages. 0 is a single page.
2099  *
2100  *	Allocate a page from the kernel page pool.  When not in
2101  *	interrupt context, apply the current process' NUMA policy.
2102  *	Returns NULL when no page can be allocated.
2103  *
2104  *	Don't call cpuset_update_task_memory_state() unless
2105  *	1) it's ok to take cpuset_sem (can WAIT), and
2106  *	2) allocating for current task (not interrupt).
2107  */
2108 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2109 {
2110 	struct mempolicy *pol = get_task_policy(current);
2111 	struct page *page;
2112 	unsigned int cpuset_mems_cookie;
2113 
2114 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2115 		pol = &default_policy;
2116 
2117 retry_cpuset:
2118 	cpuset_mems_cookie = get_mems_allowed();
2119 
2120 	/*
2121 	 * No reference counting needed for current->mempolicy
2122 	 * nor system default_policy
2123 	 */
2124 	if (pol->mode == MPOL_INTERLEAVE)
2125 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2126 	else
2127 		page = __alloc_pages_nodemask(gfp, order,
2128 				policy_zonelist(gfp, pol, numa_node_id()),
2129 				policy_nodemask(gfp, pol));
2130 
2131 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2132 		goto retry_cpuset;
2133 
2134 	return page;
2135 }
2136 EXPORT_SYMBOL(alloc_pages_current);
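/*
 * On NUMA builds the generic alloc_pages()/alloc_page() helpers resolve to
 * alloc_pages_current(), so e.g. alloc_page(GFP_KERNEL) allocates a single
 * page under the calling task's mempolicy (sketch; see <linux/gfp.h>).
 */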
2137 
2138 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2139 {
2140 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2141 
2142 	if (IS_ERR(pol))
2143 		return PTR_ERR(pol);
2144 	dst->vm_policy = pol;
2145 	return 0;
2146 }
2147 
2148 /*
2149  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2150  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2151  * with the mems_allowed returned by cpuset_mems_allowed().  This
2152  * keeps mempolicies cpuset relative after its cpuset moves.  See
2153  * further kernel/cpuset.c update_nodemask().
2154  *
2155  * current's mempolicy may be rebound by another task (the task that changes
2156  * the cpuset's mems), so we needn't do the rebind work for the current task.
2157  */
2158 
2159 /* Slow path of a mempolicy duplicate */
2160 struct mempolicy *__mpol_dup(struct mempolicy *old)
2161 {
2162 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2163 
2164 	if (!new)
2165 		return ERR_PTR(-ENOMEM);
2166 
2167 	/* task's mempolicy is protected by alloc_lock */
2168 	if (old == current->mempolicy) {
2169 		task_lock(current);
2170 		*new = *old;
2171 		task_unlock(current);
2172 	} else
2173 		*new = *old;
2174 
2175 	rcu_read_lock();
2176 	if (current_cpuset_is_being_rebound()) {
2177 		nodemask_t mems = cpuset_mems_allowed(current);
2178 		if (new->flags & MPOL_F_REBINDING)
2179 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2180 		else
2181 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2182 	}
2183 	rcu_read_unlock();
2184 	atomic_set(&new->refcnt, 1);
2185 	return new;
2186 }
2187 
2188 /* Slow path of a mempolicy comparison */
2189 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2190 {
2191 	if (!a || !b)
2192 		return false;
2193 	if (a->mode != b->mode)
2194 		return false;
2195 	if (a->flags != b->flags)
2196 		return false;
2197 	if (mpol_store_user_nodemask(a))
2198 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2199 			return false;
2200 
2201 	switch (a->mode) {
2202 	case MPOL_BIND:
2203 		/* Fall through */
2204 	case MPOL_INTERLEAVE:
2205 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2206 	case MPOL_PREFERRED:
2207 		return a->v.preferred_node == b->v.preferred_node;
2208 	default:
2209 		BUG();
2210 		return false;
2211 	}
2212 }
2213 
2214 /*
2215  * Shared memory backing store policy support.
2216  *
2217  * Remember policies even when nobody has shared memory mapped.
2218  * The policies are kept in Red-Black tree linked from the inode.
2219  * They are protected by the sp->lock spinlock, which should be held
2220  * for any accesses to the tree.
2221  */
2222 
2223 /* lookup first element intersecting start-end */
2224 /* Caller holds sp->lock */
2225 static struct sp_node *
2226 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2227 {
2228 	struct rb_node *n = sp->root.rb_node;
2229 
2230 	while (n) {
2231 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2232 
2233 		if (start >= p->end)
2234 			n = n->rb_right;
2235 		else if (end <= p->start)
2236 			n = n->rb_left;
2237 		else
2238 			break;
2239 	}
2240 	if (!n)
2241 		return NULL;
2242 	for (;;) {
2243 		struct sp_node *w = NULL;
2244 		struct rb_node *prev = rb_prev(n);
2245 		if (!prev)
2246 			break;
2247 		w = rb_entry(prev, struct sp_node, nd);
2248 		if (w->end <= start)
2249 			break;
2250 		n = prev;
2251 	}
2252 	return rb_entry(n, struct sp_node, nd);
2253 }
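/*
 * Example of the backward walk (hypothetical ranges): with stored ranges
 * [2,5) and [5,9), a lookup for [4,8) may first hit [5,9); since the
 * previous node [2,5) still overlaps the request (its end 5 > start 4),
 * the walk steps back and returns [2,5), the lowest intersecting range.
 */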
2254 
2255 /* Insert a new shared policy into the list. */
2256 /* Caller holds sp->lock */
2257 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2258 {
2259 	struct rb_node **p = &sp->root.rb_node;
2260 	struct rb_node *parent = NULL;
2261 	struct sp_node *nd;
2262 
2263 	while (*p) {
2264 		parent = *p;
2265 		nd = rb_entry(parent, struct sp_node, nd);
2266 		if (new->start < nd->start)
2267 			p = &(*p)->rb_left;
2268 		else if (new->end > nd->end)
2269 			p = &(*p)->rb_right;
2270 		else
2271 			BUG();
2272 	}
2273 	rb_link_node(&new->nd, parent, p);
2274 	rb_insert_color(&new->nd, &sp->root);
2275 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2276 		 new->policy ? new->policy->mode : 0);
2277 }
2278 
2279 /* Find shared policy intersecting idx */
2280 struct mempolicy *
2281 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2282 {
2283 	struct mempolicy *pol = NULL;
2284 	struct sp_node *sn;
2285 
2286 	if (!sp->root.rb_node)
2287 		return NULL;
2288 	spin_lock(&sp->lock);
2289 	sn = sp_lookup(sp, idx, idx+1);
2290 	if (sn) {
2291 		mpol_get(sn->policy);
2292 		pol = sn->policy;
2293 	}
2294 	spin_unlock(&sp->lock);
2295 	return pol;
2296 }
2297 
2298 static void sp_free(struct sp_node *n)
2299 {
2300 	mpol_put(n->policy);
2301 	kmem_cache_free(sn_cache, n);
2302 }
2303 
2304 #ifdef CONFIG_NUMA_BALANCING
2305 static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306 {
2307 	/* Never defer a private fault */
2308 	if (cpupid_match_pid(p, last_cpupid))
2309 		return false;
2310 
2311 	if (p->numa_migrate_deferred) {
2312 		p->numa_migrate_deferred--;
2313 		return true;
2314 	}
2315 	return false;
2316 }
2317 
2318 static inline void defer_numa_migrate(struct task_struct *p)
2319 {
2320 	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321 }
2322 #else
2323 static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324 {
2325 	return false;
2326 }
2327 
2328 static inline void defer_numa_migrate(struct task_struct *p)
2329 {
2330 }
2331 #endif /* CONFIG_NUMA_BALANCING */
2332 
2333 /**
2334  * mpol_misplaced - check whether current page node is valid in policy
2335  *
2336  * @page   - page to be checked
2337  * @vma    - vm area where page mapped
2338  * @addr   - virtual address where page mapped
2339  *
2340  * Look up the current policy node id for vma,addr and compare it to the
2341  * page's node id.
2342  *
2343  * Returns:
2344  *	-1	- not misplaced, page is in the right node
2345  *	node	- node id where the page should be
2346  *
2347  * Policy determination "mimics" alloc_page_vma().
2348  * Called from fault path where we know the vma and faulting address.
2349  */
2350 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2351 {
2352 	struct mempolicy *pol;
2353 	struct zone *zone;
2354 	int curnid = page_to_nid(page);
2355 	unsigned long pgoff;
2356 	int thiscpu = raw_smp_processor_id();
2357 	int thisnid = cpu_to_node(thiscpu);
2358 	int polnid = -1;
2359 	int ret = -1;
2360 
2361 	BUG_ON(!vma);
2362 
2363 	pol = get_vma_policy(current, vma, addr);
2364 	if (!(pol->flags & MPOL_F_MOF))
2365 		goto out;
2366 
2367 	switch (pol->mode) {
2368 	case MPOL_INTERLEAVE:
2369 		BUG_ON(addr >= vma->vm_end);
2370 		BUG_ON(addr < vma->vm_start);
2371 
2372 		pgoff = vma->vm_pgoff;
2373 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2374 		polnid = offset_il_node(pol, vma, pgoff);
2375 		break;
2376 
2377 	case MPOL_PREFERRED:
2378 		if (pol->flags & MPOL_F_LOCAL)
2379 			polnid = numa_node_id();
2380 		else
2381 			polnid = pol->v.preferred_node;
2382 		break;
2383 
2384 	case MPOL_BIND:
2385 		/*
2386 		 * allows binding to multiple nodes.
2387 		 * use current page if in policy nodemask,
2388 		 * else select nearest allowed node, if any.
2389 		 * If no allowed nodes, use current [!misplaced].
2390 		 */
2391 		if (node_isset(curnid, pol->v.nodes))
2392 			goto out;
2393 		(void)first_zones_zonelist(
2394 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2395 				gfp_zone(GFP_HIGHUSER),
2396 				&pol->v.nodes, &zone);
2397 		polnid = zone->node;
2398 		break;
2399 
2400 	default:
2401 		BUG();
2402 	}
2403 
2404 	/* Migrate the page towards the node whose CPU is referencing it */
2405 	if (pol->flags & MPOL_F_MORON) {
2406 		int last_cpupid;
2407 		int this_cpupid;
2408 
2409 		polnid = thisnid;
2410 		this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2411 
2412 		/*
2413 		 * Multi-stage node selection is used in conjunction
2414 		 * with a periodic migration fault to build a temporal
2415 		 * task<->page relation. By using a two-stage filter we
2416 		 * remove short/unlikely relations.
2417 		 *
2418 		 * Using P(p) ~ n_p / n_t as per frequentist
2419 		 * probability, we can equate a task's usage of a
2420 		 * particular page (n_p) per total usage of this
2421 		 * page (n_t) (in a given time-span) to a probability.
2422 		 *
2423 		 * Our periodic faults will sample this probability and
2424 		 * getting the same result twice in a row, given these
2425 		 * samples are fully independent, is then given by
2426 		 * P(n)^2, provided our sample period is sufficiently
2427 		 * short compared to the usage pattern.
2428 		 *
2429 		 * This quadratic squishes small probabilities, making
2430 		 * it less likely we act on an unlikely task<->page
2431 		 * relation.
2432 		 */
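		/*
		 * Numeric illustration (assuming independent samples): a task
		 * responsible for ~10% of a page's accesses (P = 0.1) passes
		 * both samples only ~1% of the time, so such weak relations
		 * rarely trigger a migration.
		 */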
2433 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2434 		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435 
2436 			/* See sysctl_numa_balancing_migrate_deferred comment */
2437 			if (!cpupid_match_pid(current, last_cpupid))
2438 				defer_numa_migrate(current);
2439 
2440 			goto out;
2441 		}
2442 
2443 		/*
2444 		 * The quadratic filter above reduces extraneous migration
2445 		 * of shared pages somewhat. This code reduces it even more,
2446 		 * reducing the overhead of page migrations of shared pages.
2447 		 * This makes workloads with shared pages rely more on
2448 		 * "move task near its memory", and less on "move memory
2449 		 * towards its task", which is exactly what we want.
2450 		 */
2451 		if (numa_migrate_deferred(current, last_cpupid))
2452 			goto out;
2453 	}
2454 
2455 	if (curnid != polnid)
2456 		ret = polnid;
2457 out:
2458 	mpol_cond_put(pol);
2459 
2460 	return ret;
2461 }
2462 
2463 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2464 {
2465 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2466 	rb_erase(&n->nd, &sp->root);
2467 	sp_free(n);
2468 }
2469 
2470 static void sp_node_init(struct sp_node *node, unsigned long start,
2471 			unsigned long end, struct mempolicy *pol)
2472 {
2473 	node->start = start;
2474 	node->end = end;
2475 	node->policy = pol;
2476 }
2477 
2478 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2479 				struct mempolicy *pol)
2480 {
2481 	struct sp_node *n;
2482 	struct mempolicy *newpol;
2483 
2484 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2485 	if (!n)
2486 		return NULL;
2487 
2488 	newpol = mpol_dup(pol);
2489 	if (IS_ERR(newpol)) {
2490 		kmem_cache_free(sn_cache, n);
2491 		return NULL;
2492 	}
2493 	newpol->flags |= MPOL_F_SHARED;
2494 	sp_node_init(n, start, end, newpol);
2495 
2496 	return n;
2497 }
2498 
2499 /* Replace a policy range. */
2500 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2501 				 unsigned long end, struct sp_node *new)
2502 {
2503 	struct sp_node *n;
2504 	struct sp_node *n_new = NULL;
2505 	struct mempolicy *mpol_new = NULL;
2506 	int ret = 0;
2507 
2508 restart:
2509 	spin_lock(&sp->lock);
2510 	n = sp_lookup(sp, start, end);
2511 	/* Take care of old policies in the same range. */
2512 	while (n && n->start < end) {
2513 		struct rb_node *next = rb_next(&n->nd);
2514 		if (n->start >= start) {
2515 			if (n->end <= end)
2516 				sp_delete(sp, n);
2517 			else
2518 				n->start = end;
2519 		} else {
2520 			/* Old policy spanning whole new range. */
2521 			if (n->end > end) {
2522 				if (!n_new)
2523 					goto alloc_new;
2524 
2525 				*mpol_new = *n->policy;
2526 				atomic_set(&mpol_new->refcnt, 1);
2527 				sp_node_init(n_new, end, n->end, mpol_new);
2528 				n->end = start;
2529 				sp_insert(sp, n_new);
2530 				n_new = NULL;
2531 				mpol_new = NULL;
2532 				break;
2533 			} else
2534 				n->end = start;
2535 		}
2536 		if (!next)
2537 			break;
2538 		n = rb_entry(next, struct sp_node, nd);
2539 	}
2540 	if (new)
2541 		sp_insert(sp, new);
2542 	spin_unlock(&sp->lock);
2543 	ret = 0;
2544 
2545 err_out:
2546 	if (mpol_new)
2547 		mpol_put(mpol_new);
2548 	if (n_new)
2549 		kmem_cache_free(sn_cache, n_new);
2550 
2551 	return ret;
2552 
2553 alloc_new:
2554 	spin_unlock(&sp->lock);
2555 	ret = -ENOMEM;
2556 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2557 	if (!n_new)
2558 		goto err_out;
2559 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2560 	if (!mpol_new)
2561 		goto err_out;
2562 	goto restart;
2563 }
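/*
 * Example of the range surgery above (hypothetical offsets): if the tree
 * holds one node covering pgoff [2,8) with policy A and we install policy
 * B for [4,6), the old node is trimmed to [2,4), a duplicate of A is
 * inserted for [6,8), and the new node [4,6) carrying B goes in between.
 */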
2564 
2565 /**
2566  * mpol_shared_policy_init - initialize shared policy for inode
2567  * @sp: pointer to inode shared policy
2568  * @mpol:  struct mempolicy to install
2569  *
2570  * Install non-NULL @mpol in inode's shared policy rb-tree.
2571  * On entry, the current task has a reference on a non-NULL @mpol.
2572  * This must be released on exit.
2573  * This is called at get_inode() time, so we can use GFP_KERNEL.
2574  */
2575 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2576 {
2577 	int ret;
2578 
2579 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2580 	spin_lock_init(&sp->lock);
2581 
2582 	if (mpol) {
2583 		struct vm_area_struct pvma;
2584 		struct mempolicy *new;
2585 		NODEMASK_SCRATCH(scratch);
2586 
2587 		if (!scratch)
2588 			goto put_mpol;
2589 		/* contextualize the tmpfs mount point mempolicy */
2590 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2591 		if (IS_ERR(new))
2592 			goto free_scratch; /* no valid nodemask intersection */
2593 
2594 		task_lock(current);
2595 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2596 		task_unlock(current);
2597 		if (ret)
2598 			goto put_new;
2599 
2600 		/* Create pseudo-vma that contains just the policy */
2601 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2602 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2603 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2604 
2605 put_new:
2606 		mpol_put(new);			/* drop initial ref */
2607 free_scratch:
2608 		NODEMASK_SCRATCH_FREE(scratch);
2609 put_mpol:
2610 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2611 	}
2612 }
2613 
2614 int mpol_set_shared_policy(struct shared_policy *info,
2615 			struct vm_area_struct *vma, struct mempolicy *npol)
2616 {
2617 	int err;
2618 	struct sp_node *new = NULL;
2619 	unsigned long sz = vma_pages(vma);
2620 
2621 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2622 		 vma->vm_pgoff,
2623 		 sz, npol ? npol->mode : -1,
2624 		 npol ? npol->flags : -1,
2625 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2626 
2627 	if (npol) {
2628 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2629 		if (!new)
2630 			return -ENOMEM;
2631 	}
2632 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2633 	if (err && new)
2634 		sp_free(new);
2635 	return err;
2636 }
2637 
2638 /* Free a backing policy store on inode delete. */
2639 void mpol_free_shared_policy(struct shared_policy *p)
2640 {
2641 	struct sp_node *n;
2642 	struct rb_node *next;
2643 
2644 	if (!p->root.rb_node)
2645 		return;
2646 	spin_lock(&p->lock);
2647 	next = rb_first(&p->root);
2648 	while (next) {
2649 		n = rb_entry(next, struct sp_node, nd);
2650 		next = rb_next(&n->nd);
2651 		sp_delete(p, n);
2652 	}
2653 	spin_unlock(&p->lock);
2654 }
2655 
2656 #ifdef CONFIG_NUMA_BALANCING
2657 static int __initdata numabalancing_override;
2658 
2659 static void __init check_numabalancing_enable(void)
2660 {
2661 	bool numabalancing_default = false;
2662 
2663 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2664 		numabalancing_default = true;
2665 
2666 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2667 	if (numabalancing_override)
2668 		set_numabalancing_state(numabalancing_override == 1);
2669 
2670 	if (nr_node_ids > 1 && !numabalancing_override) {
2671 		pr_info("%s automatic NUMA balancing. "
2672 			"Configure with numa_balancing= or the "
2673 			"kernel.numa_balancing sysctl\n",
2674 			numabalancing_default ? "Enabling" : "Disabling");
2675 		set_numabalancing_state(numabalancing_default);
2676 	}
2677 }
2678 
2679 static int __init setup_numabalancing(char *str)
2680 {
2681 	int ret = 0;
2682 	if (!str)
2683 		goto out;
2684 
2685 	if (!strcmp(str, "enable")) {
2686 		numabalancing_override = 1;
2687 		ret = 1;
2688 	} else if (!strcmp(str, "disable")) {
2689 		numabalancing_override = -1;
2690 		ret = 1;
2691 	}
2692 out:
2693 	if (!ret)
2694 		pr_warn("Unable to parse numa_balancing=\n");
2695 
2696 	return ret;
2697 }
2698 __setup("numa_balancing=", setup_numabalancing);
2699 #else
2700 static inline void __init check_numabalancing_enable(void)
2701 {
2702 }
2703 #endif /* CONFIG_NUMA_BALANCING */
2704 
2705 /* assumes fs == KERNEL_DS */
2706 void __init numa_policy_init(void)
2707 {
2708 	nodemask_t interleave_nodes;
2709 	unsigned long largest = 0;
2710 	int nid, prefer = 0;
2711 
2712 	policy_cache = kmem_cache_create("numa_policy",
2713 					 sizeof(struct mempolicy),
2714 					 0, SLAB_PANIC, NULL);
2715 
2716 	sn_cache = kmem_cache_create("shared_policy_node",
2717 				     sizeof(struct sp_node),
2718 				     0, SLAB_PANIC, NULL);
2719 
2720 	for_each_node(nid) {
2721 		preferred_node_policy[nid] = (struct mempolicy) {
2722 			.refcnt = ATOMIC_INIT(1),
2723 			.mode = MPOL_PREFERRED,
2724 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2725 			.v = { .preferred_node = nid, },
2726 		};
2727 	}
2728 
2729 	/*
2730 	 * Set interleaving policy for system init. Interleaving is only
2731 	 * enabled across suitably sized nodes (default is >= 16MB), or
2732 	 * fall back to the largest node if they're all smaller.
2733 	 */
2734 	nodes_clear(interleave_nodes);
2735 	for_each_node_state(nid, N_MEMORY) {
2736 		unsigned long total_pages = node_present_pages(nid);
2737 
2738 		/* Preserve the largest node */
2739 		if (largest < total_pages) {
2740 			largest = total_pages;
2741 			prefer = nid;
2742 		}
2743 
2744 		/* Interleave this node? */
2745 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2746 			node_set(nid, interleave_nodes);
2747 	}
2748 
2749 	/* All too small, use the largest */
2750 	if (unlikely(nodes_empty(interleave_nodes)))
2751 		node_set(prefer, interleave_nodes);
2752 
2753 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2754 		printk("numa_policy_init: interleaving failed\n");
2755 
2756 	check_numabalancing_enable();
2757 }
2758 
2759 /* Reset policy of current process to default */
2760 void numa_default_policy(void)
2761 {
2762 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2763 }
2764 
2765 /*
2766  * Parse and format mempolicy from/to strings
2767  */
2768 
2769 /*
2770  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2771  */
2772 static const char * const policy_modes[] =
2773 {
2774 	[MPOL_DEFAULT]    = "default",
2775 	[MPOL_PREFERRED]  = "prefer",
2776 	[MPOL_BIND]       = "bind",
2777 	[MPOL_INTERLEAVE] = "interleave",
2778 	[MPOL_LOCAL]      = "local",
2779 };
2780 
2781 
2782 #ifdef CONFIG_TMPFS
2783 /**
2784  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2785  * @str:  string containing mempolicy to parse
2786  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2787  *
2788  * Format of input:
2789  *	<mode>[=<flags>][:<nodelist>]
2790  *
2791  * On success, returns 0, else 1
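 *
 * Example inputs (illustrative, assuming the named nodes have memory):
 *	"interleave:0-3", "prefer:1", "bind=static:0,2", "local"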
2792  */
2793 int mpol_parse_str(char *str, struct mempolicy **mpol)
2794 {
2795 	struct mempolicy *new = NULL;
2796 	unsigned short mode;
2797 	unsigned short mode_flags;
2798 	nodemask_t nodes;
2799 	char *nodelist = strchr(str, ':');
2800 	char *flags = strchr(str, '=');
2801 	int err = 1;
2802 
2803 	if (nodelist) {
2804 		/* NUL-terminate mode or flags string */
2805 		*nodelist++ = '\0';
2806 		if (nodelist_parse(nodelist, nodes))
2807 			goto out;
2808 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2809 			goto out;
2810 	} else
2811 		nodes_clear(nodes);
2812 
2813 	if (flags)
2814 		*flags++ = '\0';	/* terminate mode string */
2815 
2816 	for (mode = 0; mode < MPOL_MAX; mode++) {
2817 		if (!strcmp(str, policy_modes[mode])) {
2818 			break;
2819 		}
2820 	}
2821 	if (mode >= MPOL_MAX)
2822 		goto out;
2823 
2824 	switch (mode) {
2825 	case MPOL_PREFERRED:
2826 		/*
2827 		 * Insist on a nodelist of one node only
2828 		 */
2829 		if (nodelist) {
2830 			char *rest = nodelist;
2831 			while (isdigit(*rest))
2832 				rest++;
2833 			if (*rest)
2834 				goto out;
2835 		}
2836 		break;
2837 	case MPOL_INTERLEAVE:
2838 		/*
2839 		 * Default to online nodes with memory if no nodelist
2840 		 */
2841 		if (!nodelist)
2842 			nodes = node_states[N_MEMORY];
2843 		break;
2844 	case MPOL_LOCAL:
2845 		/*
2846 		 * Don't allow a nodelist;  mpol_new() checks flags
2847 		 */
2848 		if (nodelist)
2849 			goto out;
2850 		mode = MPOL_PREFERRED;
2851 		break;
2852 	case MPOL_DEFAULT:
2853 		/*
2854 		 * Insist on an empty nodelist
2855 		 */
2856 		if (!nodelist)
2857 			err = 0;
2858 		goto out;
2859 	case MPOL_BIND:
2860 		/*
2861 		 * Insist on a nodelist
2862 		 */
2863 		if (!nodelist)
2864 			goto out;
2865 	}
2866 
2867 	mode_flags = 0;
2868 	if (flags) {
2869 		/*
2870 		 * Currently, we only support two mutually exclusive
2871 		 * mode flags.
2872 		 */
2873 		if (!strcmp(flags, "static"))
2874 			mode_flags |= MPOL_F_STATIC_NODES;
2875 		else if (!strcmp(flags, "relative"))
2876 			mode_flags |= MPOL_F_RELATIVE_NODES;
2877 		else
2878 			goto out;
2879 	}
2880 
2881 	new = mpol_new(mode, mode_flags, &nodes);
2882 	if (IS_ERR(new))
2883 		goto out;
2884 
2885 	/*
2886 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2887 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2888 	 */
2889 	if (mode != MPOL_PREFERRED)
2890 		new->v.nodes = nodes;
2891 	else if (nodelist)
2892 		new->v.preferred_node = first_node(nodes);
2893 	else
2894 		new->flags |= MPOL_F_LOCAL;
2895 
2896 	/*
2897 	 * Save nodes for contextualization: this will be used to "clone"
2898 	 * the mempolicy in a specific context [cpuset] at a later time.
2899 	 */
2900 	new->w.user_nodemask = nodes;
2901 
2902 	err = 0;
2903 
2904 out:
2905 	/* Restore string for error message */
2906 	if (nodelist)
2907 		*--nodelist = ':';
2908 	if (flags)
2909 		*--flags = '=';
2910 	if (!err)
2911 		*mpol = new;
2912 	return err;
2913 }
2914 #endif /* CONFIG_TMPFS */
2915 
2916 /**
2917  * mpol_to_str - format a mempolicy structure for printing
2918  * @buffer:  to contain formatted mempolicy string
2919  * @maxlen:  length of @buffer
2920  * @pol:  pointer to mempolicy to be formatted
2921  *
2922  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2923  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2924  * longest flag, "relative", and to display at least a few node ids.
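 *
 * For example (illustrative), an interleave policy over nodes 0-3 created
 * with the "relative" flag is formatted as "interleave=relative:0-3".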
2925  */
2926 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2927 {
2928 	char *p = buffer;
2929 	nodemask_t nodes = NODE_MASK_NONE;
2930 	unsigned short mode = MPOL_DEFAULT;
2931 	unsigned short flags = 0;
2932 
2933 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2934 		mode = pol->mode;
2935 		flags = pol->flags;
2936 	}
2937 
2938 	switch (mode) {
2939 	case MPOL_DEFAULT:
2940 		break;
2941 	case MPOL_PREFERRED:
2942 		if (flags & MPOL_F_LOCAL)
2943 			mode = MPOL_LOCAL;
2944 		else
2945 			node_set(pol->v.preferred_node, nodes);
2946 		break;
2947 	case MPOL_BIND:
2948 	case MPOL_INTERLEAVE:
2949 		nodes = pol->v.nodes;
2950 		break;
2951 	default:
2952 		WARN_ON_ONCE(1);
2953 		snprintf(p, maxlen, "unknown");
2954 		return;
2955 	}
2956 
2957 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2958 
2959 	if (flags & MPOL_MODE_FLAGS) {
2960 		p += snprintf(p, buffer + maxlen - p, "=");
2961 
2962 		/*
2963 		 * Currently, the only defined flags are mutually exclusive
2964 		 */
2965 		if (flags & MPOL_F_STATIC_NODES)
2966 			p += snprintf(p, buffer + maxlen - p, "static");
2967 		else if (flags & MPOL_F_RELATIVE_NODES)
2968 			p += snprintf(p, buffer + maxlen - p, "relative");
2969 	}
2970 
2971 	if (!nodes_empty(nodes)) {
2972 		p += snprintf(p, buffer + maxlen - p, ":");
2973 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2974 	}
2975 }
2976