xref: /openbmc/linux/mm/mempolicy.c (revision c21b37f6)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  * to the last. It would be better if bind truly restricted
26  * the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
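/*
 * Illustrative userspace sketch (editor's addition, not part of this file):
 * with the syscall wrappers from libnuma's <numaif.h>, a process could ask
 * for interleaving over nodes 0 and 1 roughly like this:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
 *		perror("set_mempolicy");
 *
 * mbind() applies a policy to an address range of an existing mapping in the
 * same way; both entry points are implemented further down in this file.
 */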
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 #include <linux/migrate.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 /* Do sanity checking on a policy */
114 static int mpol_check_policy(int mode, nodemask_t *nodes)
115 {
116 	int empty = nodes_empty(*nodes);
117 
118 	switch (mode) {
119 	case MPOL_DEFAULT:
120 		if (!empty)
121 			return -EINVAL;
122 		break;
123 	case MPOL_BIND:
124 	case MPOL_INTERLEAVE:
125 		/* Preferred will only use the first bit, but allow
126 		   more for now. */
127 		if (empty)
128 			return -EINVAL;
129 		break;
130 	}
131 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
132 }
133 
134 /* Generate a custom zonelist for the BIND policy. */
135 static struct zonelist *bind_zonelist(nodemask_t *nodes)
136 {
137 	struct zonelist *zl;
138 	int num, max, nd;
139 	enum zone_type k;
140 
141 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
142 	max++;			/* space for zlcache_ptr (see mmzone.h) */
143 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
144 	if (!zl)
145 		return ERR_PTR(-ENOMEM);
146 	zl->zlcache_ptr = NULL;
147 	num = 0;
148 	/* First put in the highest zones from all nodes, then all the next
149 	   lower zones etc. Avoid empty zones because the memory allocator
150 	   doesn't like them. If you implement node hot removal you
151 	   have to fix that. */
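	/*
	 * Illustrative result (editor's note, assuming a two-node x86_64-like
	 * box with policy_zone == ZONE_NORMAL and nodes = {0,1}): the list
	 * comes out as N0/Normal, N1/Normal, N0/DMA32, N1/DMA32, N0/DMA,
	 * N1/DMA, with any zone that has no present pages skipped.
	 */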
152 	k = policy_zone;
153 	while (1) {
154 		for_each_node_mask(nd, *nodes) {
155 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
156 			if (z->present_pages > 0)
157 				zl->zones[num++] = z;
158 		}
159 		if (k == 0)
160 			break;
161 		k--;
162 	}
163 	if (num == 0) {
164 		kfree(zl);
165 		return ERR_PTR(-EINVAL);
166 	}
167 	zl->zones[num] = NULL;
168 	return zl;
169 }
170 
171 /* Create a new policy */
172 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
173 {
174 	struct mempolicy *policy;
175 
176 	pr_debug("setting mode %d nodes[0] %lx\n",
177 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
178 
179 	if (mode == MPOL_DEFAULT)
180 		return NULL;
181 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
182 	if (!policy)
183 		return ERR_PTR(-ENOMEM);
184 	atomic_set(&policy->refcnt, 1);
185 	switch (mode) {
186 	case MPOL_INTERLEAVE:
187 		policy->v.nodes = *nodes;
188 		if (nodes_weight(*nodes) == 0) {
189 			kmem_cache_free(policy_cache, policy);
190 			return ERR_PTR(-EINVAL);
191 		}
192 		break;
193 	case MPOL_PREFERRED:
194 		policy->v.preferred_node = first_node(*nodes);
195 		if (policy->v.preferred_node >= MAX_NUMNODES)
196 			policy->v.preferred_node = -1;
197 		break;
198 	case MPOL_BIND:
199 		policy->v.zonelist = bind_zonelist(nodes);
200 		if (IS_ERR(policy->v.zonelist)) {
201 			void *error_code = policy->v.zonelist;
202 			kmem_cache_free(policy_cache, policy);
203 			return error_code;
204 		}
205 		break;
206 	}
207 	policy->policy = mode;
208 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
209 	return policy;
210 }
211 
212 static void gather_stats(struct page *, void *, int pte_dirty);
213 static void migrate_page_add(struct page *page, struct list_head *pagelist,
214 				unsigned long flags);
215 
216 /* Scan through pages checking if pages follow certain conditions. */
217 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
218 		unsigned long addr, unsigned long end,
219 		const nodemask_t *nodes, unsigned long flags,
220 		void *private)
221 {
222 	pte_t *orig_pte;
223 	pte_t *pte;
224 	spinlock_t *ptl;
225 
226 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
227 	do {
228 		struct page *page;
229 		int nid;
230 
231 		if (!pte_present(*pte))
232 			continue;
233 		page = vm_normal_page(vma, addr, *pte);
234 		if (!page)
235 			continue;
236 		/*
237 		 * The check for PageReserved here is important to avoid
238 		 * handling zero pages and other pages that may have been
239 		 * marked special by the system.
240 		 *
241 		 * If PageReserved were not checked here then e.g.
242 		 * the location of the zero page could have an influence
243 		 * on MPOL_MF_STRICT, zero pages would be counted for
244 		 * the per node stats, and there would be useless attempts
245 		 * to put zero pages on the migration list.
246 		 */
247 		if (PageReserved(page))
248 			continue;
249 		nid = page_to_nid(page);
250 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
251 			continue;
252 
253 		if (flags & MPOL_MF_STATS)
254 			gather_stats(page, private, pte_dirty(*pte));
255 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 			migrate_page_add(page, private, flags);
257 		else
258 			break;
259 	} while (pte++, addr += PAGE_SIZE, addr != end);
260 	pte_unmap_unlock(orig_pte, ptl);
261 	return addr != end;
262 }
263 
264 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
265 		unsigned long addr, unsigned long end,
266 		const nodemask_t *nodes, unsigned long flags,
267 		void *private)
268 {
269 	pmd_t *pmd;
270 	unsigned long next;
271 
272 	pmd = pmd_offset(pud, addr);
273 	do {
274 		next = pmd_addr_end(addr, end);
275 		if (pmd_none_or_clear_bad(pmd))
276 			continue;
277 		if (check_pte_range(vma, pmd, addr, next, nodes,
278 				    flags, private))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 		unsigned long addr, unsigned long end,
286 		const nodemask_t *nodes, unsigned long flags,
287 		void *private)
288 {
289 	pud_t *pud;
290 	unsigned long next;
291 
292 	pud = pud_offset(pgd, addr);
293 	do {
294 		next = pud_addr_end(addr, end);
295 		if (pud_none_or_clear_bad(pud))
296 			continue;
297 		if (check_pmd_range(vma, pud, addr, next, nodes,
298 				    flags, private))
299 			return -EIO;
300 	} while (pud++, addr = next, addr != end);
301 	return 0;
302 }
303 
304 static inline int check_pgd_range(struct vm_area_struct *vma,
305 		unsigned long addr, unsigned long end,
306 		const nodemask_t *nodes, unsigned long flags,
307 		void *private)
308 {
309 	pgd_t *pgd;
310 	unsigned long next;
311 
312 	pgd = pgd_offset(vma->vm_mm, addr);
313 	do {
314 		next = pgd_addr_end(addr, end);
315 		if (pgd_none_or_clear_bad(pgd))
316 			continue;
317 		if (check_pud_range(vma, pgd, addr, next, nodes,
318 				    flags, private))
319 			return -EIO;
320 	} while (pgd++, addr = next, addr != end);
321 	return 0;
322 }
323 
324 /*
325  * Check if all pages in a range are on a set of nodes.
326  * If pagelist != NULL then isolate pages from the LRU and
327  * put them on the pagelist.
328  */
329 static struct vm_area_struct *
330 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 		const nodemask_t *nodes, unsigned long flags, void *private)
332 {
333 	int err;
334 	struct vm_area_struct *first, *vma, *prev;
335 
336 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
337 
338 		err = migrate_prep();
339 		if (err)
340 			return ERR_PTR(err);
341 	}
342 
343 	first = find_vma(mm, start);
344 	if (!first)
345 		return ERR_PTR(-EFAULT);
346 	prev = NULL;
347 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
348 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
349 			if (!vma->vm_next && vma->vm_end < end)
350 				return ERR_PTR(-EFAULT);
351 			if (prev && prev->vm_end < vma->vm_start)
352 				return ERR_PTR(-EFAULT);
353 		}
354 		if (!is_vm_hugetlb_page(vma) &&
355 		    ((flags & MPOL_MF_STRICT) ||
356 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
357 				vma_migratable(vma)))) {
358 			unsigned long endvma = vma->vm_end;
359 
360 			if (endvma > end)
361 				endvma = end;
362 			if (vma->vm_start > start)
363 				start = vma->vm_start;
364 			err = check_pgd_range(vma, start, endvma, nodes,
365 						flags, private);
366 			if (err) {
367 				first = ERR_PTR(err);
368 				break;
369 			}
370 		}
371 		prev = vma;
372 	}
373 	return first;
374 }
375 
376 /* Apply policy to a single VMA */
377 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
378 {
379 	int err = 0;
380 	struct mempolicy *old = vma->vm_policy;
381 
382 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
383 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
384 		 vma->vm_ops, vma->vm_file,
385 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
386 
387 	if (vma->vm_ops && vma->vm_ops->set_policy)
388 		err = vma->vm_ops->set_policy(vma, new);
389 	if (!err) {
390 		mpol_get(new);
391 		vma->vm_policy = new;
392 		mpol_free(old);
393 	}
394 	return err;
395 }
396 
397 /* Step 2: apply policy to a range and do splits. */
398 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
399 		       unsigned long end, struct mempolicy *new)
400 {
401 	struct vm_area_struct *next;
402 	int err;
403 
404 	err = 0;
405 	for (; vma && vma->vm_start < end; vma = next) {
406 		next = vma->vm_next;
407 		if (vma->vm_start < start)
408 			err = split_vma(vma->vm_mm, vma, start, 1);
409 		if (!err && vma->vm_end > end)
410 			err = split_vma(vma->vm_mm, vma, end, 0);
411 		if (!err)
412 			err = policy_vma(vma, new);
413 		if (err)
414 			break;
415 	}
416 	return err;
417 }
418 
419 static int contextualize_policy(int mode, nodemask_t *nodes)
420 {
421 	if (!nodes)
422 		return 0;
423 
424 	cpuset_update_task_memory_state();
425 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
426 		return -EINVAL;
427 	return mpol_check_policy(mode, nodes);
428 }
429 
430 
431 /*
432  * Update task->flags PF_MEMPOLICY bit: set iff non-default
433  * mempolicy.  Allows more rapid checking of this (combined perhaps
434  * with other PF_* flag bits) on memory allocation hot code paths.
435  *
436  * If called from outside this file, the task 'p' should -only- be
437  * a newly forked child not yet visible on the task list, because
438  * manipulating the task flags of a visible task is not safe.
439  *
440  * The above limitation is why this routine has the funny name
441  * mpol_fix_fork_child_flag().
442  *
443  * It is also safe to call this with a task pointer of current,
444  * which the static wrapper mpol_set_task_struct_flag() does,
445  * for use within this file.
446  */
447 
448 void mpol_fix_fork_child_flag(struct task_struct *p)
449 {
450 	if (p->mempolicy)
451 		p->flags |= PF_MEMPOLICY;
452 	else
453 		p->flags &= ~PF_MEMPOLICY;
454 }
455 
456 static void mpol_set_task_struct_flag(void)
457 {
458 	mpol_fix_fork_child_flag(current);
459 }
460 
461 /* Set the process memory policy */
462 long do_set_mempolicy(int mode, nodemask_t *nodes)
463 {
464 	struct mempolicy *new;
465 
466 	if (contextualize_policy(mode, nodes))
467 		return -EINVAL;
468 	new = mpol_new(mode, nodes);
469 	if (IS_ERR(new))
470 		return PTR_ERR(new);
471 	mpol_free(current->mempolicy);
472 	current->mempolicy = new;
473 	mpol_set_task_struct_flag();
474 	if (new && new->policy == MPOL_INTERLEAVE)
475 		current->il_next = first_node(new->v.nodes);
476 	return 0;
477 }
478 
479 /* Fill a zone bitmap for a policy */
480 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
481 {
482 	int i;
483 
484 	nodes_clear(*nodes);
485 	switch (p->policy) {
486 	case MPOL_BIND:
487 		for (i = 0; p->v.zonelist->zones[i]; i++)
488 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
489 				*nodes);
490 		break;
491 	case MPOL_DEFAULT:
492 		break;
493 	case MPOL_INTERLEAVE:
494 		*nodes = p->v.nodes;
495 		break;
496 	case MPOL_PREFERRED:
497 		/* or use current node instead of online map? */
498 		if (p->v.preferred_node < 0)
499 			*nodes = node_online_map;
500 		else
501 			node_set(p->v.preferred_node, *nodes);
502 		break;
503 	default:
504 		BUG();
505 	}
506 }
507 
508 static int lookup_node(struct mm_struct *mm, unsigned long addr)
509 {
510 	struct page *p;
511 	int err;
512 
513 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
514 	if (err >= 0) {
515 		err = page_to_nid(p);
516 		put_page(p);
517 	}
518 	return err;
519 }
520 
521 /* Retrieve NUMA policy */
522 long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 			unsigned long addr, unsigned long flags)
524 {
525 	int err;
526 	struct mm_struct *mm = current->mm;
527 	struct vm_area_struct *vma = NULL;
528 	struct mempolicy *pol = current->mempolicy;
529 
530 	cpuset_update_task_memory_state();
531 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
532 		return -EINVAL;
533 	if (flags & MPOL_F_ADDR) {
534 		down_read(&mm->mmap_sem);
535 		vma = find_vma_intersection(mm, addr, addr+1);
536 		if (!vma) {
537 			up_read(&mm->mmap_sem);
538 			return -EFAULT;
539 		}
540 		if (vma->vm_ops && vma->vm_ops->get_policy)
541 			pol = vma->vm_ops->get_policy(vma, addr);
542 		else
543 			pol = vma->vm_policy;
544 	} else if (addr)
545 		return -EINVAL;
546 
547 	if (!pol)
548 		pol = &default_policy;
549 
550 	if (flags & MPOL_F_NODE) {
551 		if (flags & MPOL_F_ADDR) {
552 			err = lookup_node(mm, addr);
553 			if (err < 0)
554 				goto out;
555 			*policy = err;
556 		} else if (pol == current->mempolicy &&
557 				pol->policy == MPOL_INTERLEAVE) {
558 			*policy = current->il_next;
559 		} else {
560 			err = -EINVAL;
561 			goto out;
562 		}
563 	} else
564 		*policy = pol->policy;
565 
566 	if (vma) {
567 		up_read(&current->mm->mmap_sem);
568 		vma = NULL;
569 	}
570 
571 	err = 0;
572 	if (nmask)
573 		get_zonemask(pol, nmask);
574 
575  out:
576 	if (vma)
577 		up_read(&current->mm->mmap_sem);
578 	return err;
579 }
580 
581 #ifdef CONFIG_MIGRATION
582 /*
583  * page migration
584  */
585 static void migrate_page_add(struct page *page, struct list_head *pagelist,
586 				unsigned long flags)
587 {
588 	/*
589 	 * Avoid migrating a page that is shared with others.
590 	 */
591 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
592 		isolate_lru_page(page, pagelist);
593 }
594 
595 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
596 {
597 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
598 }
599 
600 /*
601  * Migrate pages from one node to a target node.
602  * Returns error or the number of pages not migrated.
603  */
604 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
605 {
606 	nodemask_t nmask;
607 	LIST_HEAD(pagelist);
608 	int err = 0;
609 
610 	nodes_clear(nmask);
611 	node_set(source, nmask);
612 
613 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
614 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
615 
616 	if (!list_empty(&pagelist))
617 		err = migrate_pages(&pagelist, new_node_page, dest);
618 
619 	return err;
620 }
621 
622 /*
623  * Move pages between the two nodesets so as to preserve the physical
624  * layout as much as possible.
625  *
626  * Returns the number of pages that could not be moved.
627  */
628 int do_migrate_pages(struct mm_struct *mm,
629 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
630 {
631 	LIST_HEAD(pagelist);
632 	int busy = 0;
633 	int err = 0;
634 	nodemask_t tmp;
635 
636   	down_read(&mm->mmap_sem);
637 
638 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
639 	if (err)
640 		goto out;
641 
642 /*
643  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
644  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
645  * bit in 'tmp', and return that <source, dest> pair for migration.
646  * The pair of nodemasks 'to' and 'from' define the map.
647  *
648  * If no pair of bits is found that way, fallback to picking some
649  * pair of 'source' and 'dest' bits that are not the same.  If the
650  * 'source' and 'dest' bits are the same, this represents a node
651  * that will be migrating to itself, so no pages need move.
652  *
653  * If no bits are left in 'tmp', or if all remaining bits left
654  * in 'tmp' correspond to the same bit in 'to', return false
655  * (nothing left to migrate).
656  *
657  * This lets us pick a pair of nodes to migrate between, such that
658  * if possible the dest node is not already occupied by some other
659  * source node, minimizing the risk of overloading the memory on a
660  * node that would happen if we migrated incoming memory to a node
661  * before migrating outgoing memory sourced from that same node.
662  *
663  * A single scan of tmp is sufficient.  As we go, we remember the
664  * most recent <s, d> pair that moved (s != d).  If we find a pair
665  * that not only moved, but what's better, moved to an empty slot
666  * (d is not set in tmp), then we break out then, with that pair.
667  * Otherwise when we finish scanning 'tmp', we at least have the
668  * most recent <s, d> pair that moved.  If we get all the way through
669  * the scan of tmp without finding any node that moved, much less
670  * moved to an empty node, then there is nothing left worth migrating.
671  */
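/*
 * Worked example (editor's illustration): with from_nodes = {0,1} and
 * to_nodes = {1,2}, node_remap() gives 0->1 and 1->2.  The scan prefers
 * <1,2> because dest 2 is not in the remaining source set, so node 1's
 * pages are moved to node 2 first, and only then are node 0's pages
 * moved into the now-vacated node 1.
 */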
672 
673 	tmp = *from_nodes;
674 	while (!nodes_empty(tmp)) {
675 		int s,d;
676 		int source = -1;
677 		int dest = 0;
678 
679 		for_each_node_mask(s, tmp) {
680 			d = node_remap(s, *from_nodes, *to_nodes);
681 			if (s == d)
682 				continue;
683 
684 			source = s;	/* Node moved. Memorize */
685 			dest = d;
686 
687 			/* dest not in remaining from nodes? */
688 			if (!node_isset(dest, tmp))
689 				break;
690 		}
691 		if (source == -1)
692 			break;
693 
694 		node_clear(source, tmp);
695 		err = migrate_to_node(mm, source, dest, flags);
696 		if (err > 0)
697 			busy += err;
698 		if (err < 0)
699 			break;
700 	}
701 out:
702 	up_read(&mm->mmap_sem);
703 	if (err < 0)
704 		return err;
705 	return busy;
706 
707 }
708 
709 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
710 {
711 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
712 
713 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
714 					page_address_in_vma(page, vma));
715 }
716 #else
717 
718 static void migrate_page_add(struct page *page, struct list_head *pagelist,
719 				unsigned long flags)
720 {
721 }
722 
723 int do_migrate_pages(struct mm_struct *mm,
724 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
725 {
726 	return -ENOSYS;
727 }
728 
729 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
730 {
731 	return NULL;
732 }
733 #endif
734 
735 long do_mbind(unsigned long start, unsigned long len,
736 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
737 {
738 	struct vm_area_struct *vma;
739 	struct mm_struct *mm = current->mm;
740 	struct mempolicy *new;
741 	unsigned long end;
742 	int err;
743 	LIST_HEAD(pagelist);
744 
745 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
746 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
747 	    || mode > MPOL_MAX)
748 		return -EINVAL;
749 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
750 		return -EPERM;
751 
752 	if (start & ~PAGE_MASK)
753 		return -EINVAL;
754 
755 	if (mode == MPOL_DEFAULT)
756 		flags &= ~MPOL_MF_STRICT;
757 
758 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
759 	end = start + len;
760 
761 	if (end < start)
762 		return -EINVAL;
763 	if (end == start)
764 		return 0;
765 
766 	if (mpol_check_policy(mode, nmask))
767 		return -EINVAL;
768 
769 	new = mpol_new(mode, nmask);
770 	if (IS_ERR(new))
771 		return PTR_ERR(new);
772 
773 	/*
774 	 * If we are using the default policy then operations
775 	 * on discontinuous address spaces are okay after all.
776 	 */
777 	if (!new)
778 		flags |= MPOL_MF_DISCONTIG_OK;
779 
780 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
781 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
782 
783 	down_write(&mm->mmap_sem);
784 	vma = check_range(mm, start, end, nmask,
785 			  flags | MPOL_MF_INVERT, &pagelist);
786 
787 	err = PTR_ERR(vma);
788 	if (!IS_ERR(vma)) {
789 		int nr_failed = 0;
790 
791 		err = mbind_range(vma, start, end, new);
792 
793 		if (!list_empty(&pagelist))
794 			nr_failed = migrate_pages(&pagelist, new_vma_page,
795 						(unsigned long)vma);
796 
797 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
798 			err = -EIO;
799 	}
800 
801 	up_write(&mm->mmap_sem);
802 	mpol_free(new);
803 	return err;
804 }
805 
806 /*
807  * User space interface with variable sized bitmaps for nodelists.
808  */
809 
810 /* Copy a node mask from user space. */
811 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
812 		     unsigned long maxnode)
813 {
814 	unsigned long k;
815 	unsigned long nlongs;
816 	unsigned long endmask;
817 
818 	--maxnode;
819 	nodes_clear(*nodes);
820 	if (maxnode == 0 || !nmask)
821 		return 0;
822 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
823 		return -EINVAL;
824 
825 	nlongs = BITS_TO_LONGS(maxnode);
826 	if ((maxnode % BITS_PER_LONG) == 0)
827 		endmask = ~0UL;
828 	else
829 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
830 
831 	/* When the user specified more nodes than supported, just check
832 	   that the unsupported part is all zero. */
833 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
834 		if (nlongs > PAGE_SIZE/sizeof(long))
835 			return -EINVAL;
836 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
837 			unsigned long t;
838 			if (get_user(t, nmask + k))
839 				return -EFAULT;
840 			if (k == nlongs - 1) {
841 				if (t & endmask)
842 					return -EINVAL;
843 			} else if (t)
844 				return -EINVAL;
845 		}
846 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
847 		endmask = ~0UL;
848 	}
849 
850 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
851 		return -EFAULT;
852 	nodes_addr(*nodes)[nlongs-1] &= endmask;
853 	return 0;
854 }
855 
856 /* Copy a kernel node mask to user space */
857 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
858 			      nodemask_t *nodes)
859 {
860 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
861 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
862 
863 	if (copy > nbytes) {
864 		if (copy > PAGE_SIZE)
865 			return -EINVAL;
866 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
867 			return -EFAULT;
868 		copy = nbytes;
869 	}
870 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
871 }
872 
873 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
874 			unsigned long mode,
875 			unsigned long __user *nmask, unsigned long maxnode,
876 			unsigned flags)
877 {
878 	nodemask_t nodes;
879 	int err;
880 
881 	err = get_nodes(&nodes, nmask, maxnode);
882 	if (err)
883 		return err;
884 #ifdef CONFIG_CPUSETS
885 	/* Restrict the nodes to the allowed nodes in the cpuset */
886 	nodes_and(nodes, nodes, current->mems_allowed);
887 #endif
888 	return do_mbind(start, len, mode, &nodes, flags);
889 }
890 
891 /* Set the process memory policy */
892 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
893 		unsigned long maxnode)
894 {
895 	int err;
896 	nodemask_t nodes;
897 
898 	if (mode < 0 || mode > MPOL_MAX)
899 		return -EINVAL;
900 	err = get_nodes(&nodes, nmask, maxnode);
901 	if (err)
902 		return err;
903 	return do_set_mempolicy(mode, &nodes);
904 }
905 
906 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
907 		const unsigned long __user *old_nodes,
908 		const unsigned long __user *new_nodes)
909 {
910 	struct mm_struct *mm;
911 	struct task_struct *task;
912 	nodemask_t old;
913 	nodemask_t new;
914 	nodemask_t task_nodes;
915 	int err;
916 
917 	err = get_nodes(&old, old_nodes, maxnode);
918 	if (err)
919 		return err;
920 
921 	err = get_nodes(&new, new_nodes, maxnode);
922 	if (err)
923 		return err;
924 
925 	/* Find the mm_struct */
926 	read_lock(&tasklist_lock);
927 	task = pid ? find_task_by_pid(pid) : current;
928 	if (!task) {
929 		read_unlock(&tasklist_lock);
930 		return -ESRCH;
931 	}
932 	mm = get_task_mm(task);
933 	read_unlock(&tasklist_lock);
934 
935 	if (!mm)
936 		return -EINVAL;
937 
938 	/*
939 	 * Check if this process has the right to modify the specified
940 	 * process. The right exists if the process has administrative
941 	 * capabilities, superuser privileges or the same
942 	 * userid as the target process.
943 	 */
944 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
945 	    (current->uid != task->suid) && (current->uid != task->uid) &&
946 	    !capable(CAP_SYS_NICE)) {
947 		err = -EPERM;
948 		goto out;
949 	}
950 
951 	task_nodes = cpuset_mems_allowed(task);
952 	/* Is the user allowed to access the target nodes? */
953 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
954 		err = -EPERM;
955 		goto out;
956 	}
957 
958 	err = security_task_movememory(task);
959 	if (err)
960 		goto out;
961 
962 	err = do_migrate_pages(mm, &old, &new,
963 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
964 out:
965 	mmput(mm);
966 	return err;
967 }
968 
969 
970 /* Retrieve NUMA policy */
971 asmlinkage long sys_get_mempolicy(int __user *policy,
972 				unsigned long __user *nmask,
973 				unsigned long maxnode,
974 				unsigned long addr, unsigned long flags)
975 {
976 	int err, pval;
977 	nodemask_t nodes;
978 
979 	if (nmask != NULL && maxnode < MAX_NUMNODES)
980 		return -EINVAL;
981 
982 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
983 
984 	if (err)
985 		return err;
986 
987 	if (policy && put_user(pval, policy))
988 		return -EFAULT;
989 
990 	if (nmask)
991 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
992 
993 	return err;
994 }
995 
996 #ifdef CONFIG_COMPAT
997 
998 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
999 				     compat_ulong_t __user *nmask,
1000 				     compat_ulong_t maxnode,
1001 				     compat_ulong_t addr, compat_ulong_t flags)
1002 {
1003 	long err;
1004 	unsigned long __user *nm = NULL;
1005 	unsigned long nr_bits, alloc_size;
1006 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1007 
1008 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1009 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1010 
1011 	if (nmask)
1012 		nm = compat_alloc_user_space(alloc_size);
1013 
1014 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1015 
1016 	if (!err && nmask) {
1017 		err = copy_from_user(bm, nm, alloc_size);
1018 		/* ensure entire bitmap is zeroed */
1019 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1020 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1021 	}
1022 
1023 	return err;
1024 }
1025 
1026 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1027 				     compat_ulong_t maxnode)
1028 {
1029 	long err = 0;
1030 	unsigned long __user *nm = NULL;
1031 	unsigned long nr_bits, alloc_size;
1032 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1033 
1034 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1035 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1036 
1037 	if (nmask) {
1038 		err = compat_get_bitmap(bm, nmask, nr_bits);
1039 		nm = compat_alloc_user_space(alloc_size);
1040 		err |= copy_to_user(nm, bm, alloc_size);
1041 	}
1042 
1043 	if (err)
1044 		return -EFAULT;
1045 
1046 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1047 }
1048 
1049 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1050 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1051 			     compat_ulong_t maxnode, compat_ulong_t flags)
1052 {
1053 	long err = 0;
1054 	unsigned long __user *nm = NULL;
1055 	unsigned long nr_bits, alloc_size;
1056 	nodemask_t bm;
1057 
1058 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1059 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1060 
1061 	if (nmask) {
1062 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1063 		nm = compat_alloc_user_space(alloc_size);
1064 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1065 	}
1066 
1067 	if (err)
1068 		return -EFAULT;
1069 
1070 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1071 }
1072 
1073 #endif
1074 
1075 /* Return effective policy for a VMA */
1076 static struct mempolicy * get_vma_policy(struct task_struct *task,
1077 		struct vm_area_struct *vma, unsigned long addr)
1078 {
1079 	struct mempolicy *pol = task->mempolicy;
1080 
1081 	if (vma) {
1082 		if (vma->vm_ops && vma->vm_ops->get_policy)
1083 			pol = vma->vm_ops->get_policy(vma, addr);
1084 		else if (vma->vm_policy &&
1085 				vma->vm_policy->policy != MPOL_DEFAULT)
1086 			pol = vma->vm_policy;
1087 	}
1088 	if (!pol)
1089 		pol = &default_policy;
1090 	return pol;
1091 }
1092 
1093 /* Return a zonelist representing a mempolicy */
1094 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1095 {
1096 	int nd;
1097 
1098 	switch (policy->policy) {
1099 	case MPOL_PREFERRED:
1100 		nd = policy->v.preferred_node;
1101 		if (nd < 0)
1102 			nd = numa_node_id();
1103 		break;
1104 	case MPOL_BIND:
1105 		/* Lower zones don't get a policy applied */
1106 		/* Careful: current->mems_allowed might have moved */
1107 		if (gfp_zone(gfp) >= policy_zone)
1108 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1109 				return policy->v.zonelist;
1110 		/*FALL THROUGH*/
1111 	case MPOL_INTERLEAVE: /* should not happen */
1112 	case MPOL_DEFAULT:
1113 		nd = numa_node_id();
1114 		break;
1115 	default:
1116 		nd = 0;
1117 		BUG();
1118 	}
1119 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1120 }
1121 
1122 /* Do dynamic interleaving for a process */
1123 static unsigned interleave_nodes(struct mempolicy *policy)
1124 {
1125 	unsigned nid, next;
1126 	struct task_struct *me = current;
1127 
1128 	nid = me->il_next;
1129 	next = next_node(nid, policy->v.nodes);
1130 	if (next >= MAX_NUMNODES)
1131 		next = first_node(policy->v.nodes);
1132 	me->il_next = next;
1133 	return nid;
1134 }
1135 
1136 /*
1137  * Depending on the memory policy provide a node from which to allocate the
1138  * next slab entry.
1139  */
1140 unsigned slab_node(struct mempolicy *policy)
1141 {
1142 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1143 
1144 	switch (pol) {
1145 	case MPOL_INTERLEAVE:
1146 		return interleave_nodes(policy);
1147 
1148 	case MPOL_BIND:
1149 		/*
1150 		 * Follow bind policy behavior and start allocation at the
1151 		 * first node.
1152 		 */
1153 		return zone_to_nid(policy->v.zonelist->zones[0]);
1154 
1155 	case MPOL_PREFERRED:
1156 		if (policy->v.preferred_node >= 0)
1157 			return policy->v.preferred_node;
1158 		/* Fall through */
1159 
1160 	default:
1161 		return numa_node_id();
1162 	}
1163 }
1164 
1165 /* Do static interleaving for a VMA with known offset. */
1166 static unsigned offset_il_node(struct mempolicy *pol,
1167 		struct vm_area_struct *vma, unsigned long off)
1168 {
1169 	unsigned nnodes = nodes_weight(pol->v.nodes);
1170 	unsigned target = (unsigned)off % nnodes;
1171 	int c;
1172 	int nid = -1;
1173 
1174 	c = 0;
1175 	do {
1176 		nid = next_node(nid, pol->v.nodes);
1177 		c++;
1178 	} while (c <= target);
1179 	return nid;
1180 }
1181 
1182 /* Determine a node number for interleave */
1183 static inline unsigned interleave_nid(struct mempolicy *pol,
1184 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1185 {
1186 	if (vma) {
1187 		unsigned long off;
1188 
1189 		/*
1190 		 * for small pages, there is no difference between
1191 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1192 		 * for huge pages, since vm_pgoff is in units of small
1193 		 * pages, we need to shift off the always 0 bits to get
1194 		 * a useful offset.
1195 		 */
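		/*
		 * Example (editor's note, assuming 2MB huge pages, i.e.
		 * HPAGE_SHIFT == 21 and PAGE_SHIFT == 12): off becomes
		 * vm_pgoff >> 9 plus the huge-page index of addr within
		 * the VMA, so consecutive huge pages map to consecutive
		 * nodes of the interleave set.
		 */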
1196 		BUG_ON(shift < PAGE_SHIFT);
1197 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1198 		off += (addr - vma->vm_start) >> shift;
1199 		return offset_il_node(pol, vma, off);
1200 	} else
1201 		return interleave_nodes(pol);
1202 }
1203 
1204 #ifdef CONFIG_HUGETLBFS
1205 /* Return a zonelist suitable for a huge page allocation. */
1206 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1207 							gfp_t gfp_flags)
1208 {
1209 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1210 
1211 	if (pol->policy == MPOL_INTERLEAVE) {
1212 		unsigned nid;
1213 
1214 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1215 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1216 	}
1217 	return zonelist_policy(GFP_HIGHUSER, pol);
1218 }
1219 #endif
1220 
1221 /* Allocate a page in interleaved policy.
1222    Own path because it needs to do special accounting. */
1223 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1224 					unsigned nid)
1225 {
1226 	struct zonelist *zl;
1227 	struct page *page;
1228 
1229 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1230 	page = __alloc_pages(gfp, order, zl);
1231 	if (page && page_zone(page) == zl->zones[0])
1232 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1233 	return page;
1234 }
1235 
1236 /**
1237  * 	alloc_page_vma	- Allocate a page for a VMA.
1238  *
1239  * 	@gfp:
1240  *      %GFP_USER    user allocation.
1241  *      %GFP_KERNEL  kernel allocations,
1242  *      %GFP_HIGHMEM highmem/user allocations,
1243  *      %GFP_FS      allocation should not call back into a file system.
1244  *      %GFP_ATOMIC  don't sleep.
1245  *
1246  * 	@vma:  Pointer to VMA or NULL if not available.
1247  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1248  *
1249  * 	This function allocates a page from the kernel page pool and applies
1250  *	a NUMA policy associated with the VMA or the current process.
1251  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1252  *	mm_struct of the VMA to prevent it from going away. Should be used for
1253  *	all allocations for pages that will be mapped into
1254  * 	user space. Returns NULL when no page can be allocated.
1255  *
1256  *	Should be called with the mmap_sem of the vma's mm held.
1257  */
1258 struct page *
1259 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1260 {
1261 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1262 
1263 	cpuset_update_task_memory_state();
1264 
1265 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1266 		unsigned nid;
1267 
1268 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1269 		return alloc_page_interleave(gfp, 0, nid);
1270 	}
1271 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1272 }
1273 
1274 /**
1275  * 	alloc_pages_current - Allocate pages.
1276  *
1277  *	@gfp:
1278  *		%GFP_USER   user allocation,
1279  *      	%GFP_KERNEL kernel allocation,
1280  *      	%GFP_HIGHMEM highmem allocation,
1281  *      	%GFP_FS     don't call back into a file system.
1282  *      	%GFP_ATOMIC don't sleep.
1283  *	@order: Power of two of allocation size in pages. 0 is a single page.
1284  *
1285  *	Allocate a page from the kernel page pool.  When not in
1286  *	interrupt context, apply the current process' NUMA policy.
1287  *	Returns NULL when no page can be allocated.
1288  *
1289  *	Don't call cpuset_update_task_memory_state() unless
1290  *	1) it's ok to take cpuset_sem (can WAIT), and
1291  *	2) allocating for current task (not interrupt).
1292  */
1293 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1294 {
1295 	struct mempolicy *pol = current->mempolicy;
1296 
1297 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1298 		cpuset_update_task_memory_state();
1299 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1300 		pol = &default_policy;
1301 	if (pol->policy == MPOL_INTERLEAVE)
1302 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1303 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1304 }
1305 EXPORT_SYMBOL(alloc_pages_current);
1306 
1307 /*
1308  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1309  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1310  * with the mems_allowed returned by cpuset_mems_allowed().  This
1311  * keeps mempolicies cpuset relative after its cpuset moves.  See
1312  * further kernel/cpuset.c update_nodemask().
1313  */
1314 void *cpuset_being_rebound;
1315 
1316 /* Slow path of a mempolicy copy */
1317 struct mempolicy *__mpol_copy(struct mempolicy *old)
1318 {
1319 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1320 
1321 	if (!new)
1322 		return ERR_PTR(-ENOMEM);
1323 	if (current_cpuset_is_being_rebound()) {
1324 		nodemask_t mems = cpuset_mems_allowed(current);
1325 		mpol_rebind_policy(old, &mems);
1326 	}
1327 	*new = *old;
1328 	atomic_set(&new->refcnt, 1);
1329 	if (new->policy == MPOL_BIND) {
1330 		int sz = ksize(old->v.zonelist);
1331 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1332 		if (!new->v.zonelist) {
1333 			kmem_cache_free(policy_cache, new);
1334 			return ERR_PTR(-ENOMEM);
1335 		}
1336 	}
1337 	return new;
1338 }
1339 
1340 /* Slow path of a mempolicy comparison */
1341 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1342 {
1343 	if (!a || !b)
1344 		return 0;
1345 	if (a->policy != b->policy)
1346 		return 0;
1347 	switch (a->policy) {
1348 	case MPOL_DEFAULT:
1349 		return 1;
1350 	case MPOL_INTERLEAVE:
1351 		return nodes_equal(a->v.nodes, b->v.nodes);
1352 	case MPOL_PREFERRED:
1353 		return a->v.preferred_node == b->v.preferred_node;
1354 	case MPOL_BIND: {
1355 		int i;
1356 		for (i = 0; a->v.zonelist->zones[i]; i++)
1357 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1358 				return 0;
1359 		return b->v.zonelist->zones[i] == NULL;
1360 	}
1361 	default:
1362 		BUG();
1363 		return 0;
1364 	}
1365 }
1366 
1367 /* Slow path of a mpol destructor. */
1368 void __mpol_free(struct mempolicy *p)
1369 {
1370 	if (!atomic_dec_and_test(&p->refcnt))
1371 		return;
1372 	if (p->policy == MPOL_BIND)
1373 		kfree(p->v.zonelist);
1374 	p->policy = MPOL_DEFAULT;
1375 	kmem_cache_free(policy_cache, p);
1376 }
1377 
1378 /*
1379  * Shared memory backing store policy support.
1380  *
1381  * Remember policies even when nobody has shared memory mapped.
1382  * The policies are kept in Red-Black tree linked from the inode.
1383  * They are protected by the sp->lock spinlock, which should be held
1384  * for any accesses to the tree.
1385  */
1386 
1387 /* lookup first element intersecting start-end */
1388 /* Caller holds sp->lock */
1389 static struct sp_node *
1390 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1391 {
1392 	struct rb_node *n = sp->root.rb_node;
1393 
1394 	while (n) {
1395 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1396 
1397 		if (start >= p->end)
1398 			n = n->rb_right;
1399 		else if (end <= p->start)
1400 			n = n->rb_left;
1401 		else
1402 			break;
1403 	}
1404 	if (!n)
1405 		return NULL;
1406 	for (;;) {
1407 		struct sp_node *w = NULL;
1408 		struct rb_node *prev = rb_prev(n);
1409 		if (!prev)
1410 			break;
1411 		w = rb_entry(prev, struct sp_node, nd);
1412 		if (w->end <= start)
1413 			break;
1414 		n = prev;
1415 	}
1416 	return rb_entry(n, struct sp_node, nd);
1417 }
1418 
1419 /* Insert a new shared policy into the tree. */
1420 /* Caller holds sp->lock */
1421 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1422 {
1423 	struct rb_node **p = &sp->root.rb_node;
1424 	struct rb_node *parent = NULL;
1425 	struct sp_node *nd;
1426 
1427 	while (*p) {
1428 		parent = *p;
1429 		nd = rb_entry(parent, struct sp_node, nd);
1430 		if (new->start < nd->start)
1431 			p = &(*p)->rb_left;
1432 		else if (new->end > nd->end)
1433 			p = &(*p)->rb_right;
1434 		else
1435 			BUG();
1436 	}
1437 	rb_link_node(&new->nd, parent, p);
1438 	rb_insert_color(&new->nd, &sp->root);
1439 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1440 		 new->policy ? new->policy->policy : 0);
1441 }
1442 
1443 /* Find shared policy intersecting idx */
1444 struct mempolicy *
1445 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1446 {
1447 	struct mempolicy *pol = NULL;
1448 	struct sp_node *sn;
1449 
1450 	if (!sp->root.rb_node)
1451 		return NULL;
1452 	spin_lock(&sp->lock);
1453 	sn = sp_lookup(sp, idx, idx+1);
1454 	if (sn) {
1455 		mpol_get(sn->policy);
1456 		pol = sn->policy;
1457 	}
1458 	spin_unlock(&sp->lock);
1459 	return pol;
1460 }
1461 
1462 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1463 {
1464 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1465 	rb_erase(&n->nd, &sp->root);
1466 	mpol_free(n->policy);
1467 	kmem_cache_free(sn_cache, n);
1468 }
1469 
1470 struct sp_node *
1471 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1472 {
1473 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1474 
1475 	if (!n)
1476 		return NULL;
1477 	n->start = start;
1478 	n->end = end;
1479 	mpol_get(pol);
1480 	n->policy = pol;
1481 	return n;
1482 }
1483 
1484 /* Replace a policy range. */
1485 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1486 				 unsigned long end, struct sp_node *new)
1487 {
1488 	struct sp_node *n, *new2 = NULL;
1489 
1490 restart:
1491 	spin_lock(&sp->lock);
1492 	n = sp_lookup(sp, start, end);
1493 	/* Take care of old policies in the same range. */
1494 	while (n && n->start < end) {
1495 		struct rb_node *next = rb_next(&n->nd);
1496 		if (n->start >= start) {
1497 			if (n->end <= end)
1498 				sp_delete(sp, n);
1499 			else
1500 				n->start = end;
1501 		} else {
1502 			/* Old policy spanning whole new range. */
1503 			if (n->end > end) {
1504 				if (!new2) {
1505 					spin_unlock(&sp->lock);
1506 					new2 = sp_alloc(end, n->end, n->policy);
1507 					if (!new2)
1508 						return -ENOMEM;
1509 					goto restart;
1510 				}
1511 				n->end = start;
1512 				sp_insert(sp, new2);
1513 				new2 = NULL;
1514 				break;
1515 			} else
1516 				n->end = start;
1517 		}
1518 		if (!next)
1519 			break;
1520 		n = rb_entry(next, struct sp_node, nd);
1521 	}
1522 	if (new)
1523 		sp_insert(sp, new);
1524 	spin_unlock(&sp->lock);
1525 	if (new2) {
1526 		mpol_free(new2->policy);
1527 		kmem_cache_free(sn_cache, new2);
1528 	}
1529 	return 0;
1530 }
1531 
1532 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1533 				nodemask_t *policy_nodes)
1534 {
1535 	info->root = RB_ROOT;
1536 	spin_lock_init(&info->lock);
1537 
1538 	if (policy != MPOL_DEFAULT) {
1539 		struct mempolicy *newpol;
1540 
1541 		/* Falls back to MPOL_DEFAULT on any error */
1542 		newpol = mpol_new(policy, policy_nodes);
1543 		if (!IS_ERR(newpol)) {
1544 			/* Create pseudo-vma that contains just the policy */
1545 			struct vm_area_struct pvma;
1546 
1547 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1548 			/* Policy covers entire file */
1549 			pvma.vm_end = TASK_SIZE;
1550 			mpol_set_shared_policy(info, &pvma, newpol);
1551 			mpol_free(newpol);
1552 		}
1553 	}
1554 }
1555 
1556 int mpol_set_shared_policy(struct shared_policy *info,
1557 			struct vm_area_struct *vma, struct mempolicy *npol)
1558 {
1559 	int err;
1560 	struct sp_node *new = NULL;
1561 	unsigned long sz = vma_pages(vma);
1562 
1563 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1564 		 vma->vm_pgoff,
1565 		 sz, npol? npol->policy : -1,
1566 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1567 
1568 	if (npol) {
1569 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1570 		if (!new)
1571 			return -ENOMEM;
1572 	}
1573 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1574 	if (err && new)
1575 		kmem_cache_free(sn_cache, new);
1576 	return err;
1577 }
1578 
1579 /* Free a backing policy store on inode delete. */
1580 void mpol_free_shared_policy(struct shared_policy *p)
1581 {
1582 	struct sp_node *n;
1583 	struct rb_node *next;
1584 
1585 	if (!p->root.rb_node)
1586 		return;
1587 	spin_lock(&p->lock);
1588 	next = rb_first(&p->root);
1589 	while (next) {
1590 		n = rb_entry(next, struct sp_node, nd);
1591 		next = rb_next(&n->nd);
1592 		rb_erase(&n->nd, &p->root);
1593 		mpol_free(n->policy);
1594 		kmem_cache_free(sn_cache, n);
1595 	}
1596 	spin_unlock(&p->lock);
1597 }
1598 
1599 /* assumes fs == KERNEL_DS */
1600 void __init numa_policy_init(void)
1601 {
1602 	nodemask_t interleave_nodes;
1603 	unsigned long largest = 0;
1604 	int nid, prefer = 0;
1605 
1606 	policy_cache = kmem_cache_create("numa_policy",
1607 					 sizeof(struct mempolicy),
1608 					 0, SLAB_PANIC, NULL);
1609 
1610 	sn_cache = kmem_cache_create("shared_policy_node",
1611 				     sizeof(struct sp_node),
1612 				     0, SLAB_PANIC, NULL);
1613 
1614 	/*
1615 	 * Set interleaving policy for system init. Interleaving is only
1616 	 * enabled across suitably sized nodes (default is >= 16MB), or
1617 	 * fall back to the largest node if they're all smaller.
1618 	 */
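	/*
	 * Example (editor's note): on a box with two 4GB nodes and one 8MB
	 * node, only the two large nodes pass the 16MB threshold, so init
	 * runs with MPOL_INTERLEAVE over just those two.
	 */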
1619 	nodes_clear(interleave_nodes);
1620 	for_each_online_node(nid) {
1621 		unsigned long total_pages = node_present_pages(nid);
1622 
1623 		/* Preserve the largest node */
1624 		if (largest < total_pages) {
1625 			largest = total_pages;
1626 			prefer = nid;
1627 		}
1628 
1629 		/* Interleave this node? */
1630 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1631 			node_set(nid, interleave_nodes);
1632 	}
1633 
1634 	/* All too small, use the largest */
1635 	if (unlikely(nodes_empty(interleave_nodes)))
1636 		node_set(prefer, interleave_nodes);
1637 
1638 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1639 		printk("numa_policy_init: interleaving failed\n");
1640 }
1641 
1642 /* Reset policy of current process to default */
1643 void numa_default_policy(void)
1644 {
1645 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1646 }
1647 
1648 /* Migrate a policy to a different set of nodes */
1649 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1650 {
1651 	nodemask_t *mpolmask;
1652 	nodemask_t tmp;
1653 
1654 	if (!pol)
1655 		return;
1656 	mpolmask = &pol->cpuset_mems_allowed;
1657 	if (nodes_equal(*mpolmask, *newmask))
1658 		return;
1659 
1660 	switch (pol->policy) {
1661 	case MPOL_DEFAULT:
1662 		break;
1663 	case MPOL_INTERLEAVE:
1664 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1665 		pol->v.nodes = tmp;
1666 		*mpolmask = *newmask;
1667 		current->il_next = node_remap(current->il_next,
1668 						*mpolmask, *newmask);
1669 		break;
1670 	case MPOL_PREFERRED:
1671 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1672 						*mpolmask, *newmask);
1673 		*mpolmask = *newmask;
1674 		break;
1675 	case MPOL_BIND: {
1676 		nodemask_t nodes;
1677 		struct zone **z;
1678 		struct zonelist *zonelist;
1679 
1680 		nodes_clear(nodes);
1681 		for (z = pol->v.zonelist->zones; *z; z++)
1682 			node_set(zone_to_nid(*z), nodes);
1683 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1684 		nodes = tmp;
1685 
1686 		zonelist = bind_zonelist(&nodes);
1687 
1688 		/* If no mem, bind_zonelist() returns an error and we keep the old zonelist.
1689 		 * If that old zonelist has no remaining mems_allowed nodes,
1690 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1691 		 */
1692 
1693 		if (!IS_ERR(zonelist)) {
1694 			/* Good - got mem - substitute new zonelist */
1695 			kfree(pol->v.zonelist);
1696 			pol->v.zonelist = zonelist;
1697 		}
1698 		*mpolmask = *newmask;
1699 		break;
1700 	}
1701 	default:
1702 		BUG();
1703 		break;
1704 	}
1705 }
1706 
1707 /*
1708  * Wrapper for mpol_rebind_policy() that just requires task
1709  * pointer, and updates task mempolicy.
1710  */
1711 
1712 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1713 {
1714 	mpol_rebind_policy(tsk->mempolicy, new);
1715 }
1716 
1717 /*
1718  * Rebind each vma in mm to new nodemask.
1719  *
1720  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1721  */
1722 
1723 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1724 {
1725 	struct vm_area_struct *vma;
1726 
1727 	down_write(&mm->mmap_sem);
1728 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1729 		mpol_rebind_policy(vma->vm_policy, new);
1730 	up_write(&mm->mmap_sem);
1731 }
1732 
1733 /*
1734  * Display pages allocated per node and memory policy via /proc.
1735  */
1736 
1737 static const char * const policy_types[] =
1738 	{ "default", "prefer", "bind", "interleave" };
1739 
1740 /*
1741  * Convert a mempolicy into a string.
1742  * Returns the number of characters in buffer (if positive)
1743  * or an error (negative)
1744  */
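/*
 * Example output (editor's note): an interleave policy over nodes 0-3 is
 * rendered as "interleave=0-3", while the default policy is just "default".
 */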
1745 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1746 {
1747 	char *p = buffer;
1748 	int l;
1749 	nodemask_t nodes;
1750 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1751 
1752 	switch (mode) {
1753 	case MPOL_DEFAULT:
1754 		nodes_clear(nodes);
1755 		break;
1756 
1757 	case MPOL_PREFERRED:
1758 		nodes_clear(nodes);
1759 		node_set(pol->v.preferred_node, nodes);
1760 		break;
1761 
1762 	case MPOL_BIND:
1763 		get_zonemask(pol, &nodes);
1764 		break;
1765 
1766 	case MPOL_INTERLEAVE:
1767 		nodes = pol->v.nodes;
1768 		break;
1769 
1770 	default:
1771 		BUG();
1772 		return -EFAULT;
1773 	}
1774 
1775 	l = strlen(policy_types[mode]);
1776  	if (buffer + maxlen < p + l + 1)
1777  		return -ENOSPC;
1778 
1779 	strcpy(p, policy_types[mode]);
1780 	p += l;
1781 
1782 	if (!nodes_empty(nodes)) {
1783 		if (buffer + maxlen < p + 2)
1784 			return -ENOSPC;
1785 		*p++ = '=';
1786 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1787 	}
1788 	return p - buffer;
1789 }
1790 
1791 struct numa_maps {
1792 	unsigned long pages;
1793 	unsigned long anon;
1794 	unsigned long active;
1795 	unsigned long writeback;
1796 	unsigned long mapcount_max;
1797 	unsigned long dirty;
1798 	unsigned long swapcache;
1799 	unsigned long node[MAX_NUMNODES];
1800 };
1801 
1802 static void gather_stats(struct page *page, void *private, int pte_dirty)
1803 {
1804 	struct numa_maps *md = private;
1805 	int count = page_mapcount(page);
1806 
1807 	md->pages++;
1808 	if (pte_dirty || PageDirty(page))
1809 		md->dirty++;
1810 
1811 	if (PageSwapCache(page))
1812 		md->swapcache++;
1813 
1814 	if (PageActive(page))
1815 		md->active++;
1816 
1817 	if (PageWriteback(page))
1818 		md->writeback++;
1819 
1820 	if (PageAnon(page))
1821 		md->anon++;
1822 
1823 	if (count > md->mapcount_max)
1824 		md->mapcount_max = count;
1825 
1826 	md->node[page_to_nid(page)]++;
1827 }
1828 
1829 #ifdef CONFIG_HUGETLB_PAGE
1830 static void check_huge_range(struct vm_area_struct *vma,
1831 		unsigned long start, unsigned long end,
1832 		struct numa_maps *md)
1833 {
1834 	unsigned long addr;
1835 	struct page *page;
1836 
1837 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1838 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1839 		pte_t pte;
1840 
1841 		if (!ptep)
1842 			continue;
1843 
1844 		pte = *ptep;
1845 		if (pte_none(pte))
1846 			continue;
1847 
1848 		page = pte_page(pte);
1849 		if (!page)
1850 			continue;
1851 
1852 		gather_stats(page, md, pte_dirty(*ptep));
1853 	}
1854 }
1855 #else
1856 static inline void check_huge_range(struct vm_area_struct *vma,
1857 		unsigned long start, unsigned long end,
1858 		struct numa_maps *md)
1859 {
1860 }
1861 #endif
1862 
1863 int show_numa_map(struct seq_file *m, void *v)
1864 {
1865 	struct proc_maps_private *priv = m->private;
1866 	struct vm_area_struct *vma = v;
1867 	struct numa_maps *md;
1868 	struct file *file = vma->vm_file;
1869 	struct mm_struct *mm = vma->vm_mm;
1870 	int n;
1871 	char buffer[50];
1872 
1873 	if (!mm)
1874 		return 0;
1875 
1876 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1877 	if (!md)
1878 		return 0;
1879 
1880 	mpol_to_str(buffer, sizeof(buffer),
1881 			    get_vma_policy(priv->task, vma, vma->vm_start));
1882 
1883 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1884 
1885 	if (file) {
1886 		seq_printf(m, " file=");
1887 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1888 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1889 		seq_printf(m, " heap");
1890 	} else if (vma->vm_start <= mm->start_stack &&
1891 			vma->vm_end >= mm->start_stack) {
1892 		seq_printf(m, " stack");
1893 	}
1894 
1895 	if (is_vm_hugetlb_page(vma)) {
1896 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1897 		seq_printf(m, " huge");
1898 	} else {
1899 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1900 				&node_online_map, MPOL_MF_STATS, md);
1901 	}
1902 
1903 	if (!md->pages)
1904 		goto out;
1905 
1906 	if (md->anon)
1907 		seq_printf(m," anon=%lu",md->anon);
1908 
1909 	if (md->dirty)
1910 		seq_printf(m," dirty=%lu",md->dirty);
1911 
1912 	if (md->pages != md->anon && md->pages != md->dirty)
1913 		seq_printf(m, " mapped=%lu", md->pages);
1914 
1915 	if (md->mapcount_max > 1)
1916 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1917 
1918 	if (md->swapcache)
1919 		seq_printf(m," swapcache=%lu", md->swapcache);
1920 
1921 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1922 		seq_printf(m," active=%lu", md->active);
1923 
1924 	if (md->writeback)
1925 		seq_printf(m," writeback=%lu", md->writeback);
1926 
1927 	for_each_online_node(n)
1928 		if (md->node[n])
1929 			seq_printf(m, " N%d=%lu", n, md->node[n]);
1930 out:
1931 	seq_putc(m, '\n');
1932 	kfree(md);
1933 
1934 	if (m->count < m->size)
1935 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1936 	return 0;
1937 }
1938 
1939