xref: /openbmc/linux/mm/mempolicy.c (revision e868d61272caa648214046a096e5a6bfc068dc8c)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
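
/*
 * A minimal userspace sketch of how these policies are typically selected,
 * via the mbind(2)/set_mempolicy(2) wrappers in libnuma's <numaif.h>
 * (addr, length and the node numbers below are illustrative only):
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	Interleave all further allocations of this process over nodes 0-1:
 *		set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	Bind an existing mapping to node 0 only, reporting existing pages
 *	that violate the policy:
 *		unsigned long node0 = 1UL << 0;
 *		mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0),
 *		      MPOL_MF_STRICT);
 */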
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/mm.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/mempolicy.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 #include <linux/migrate.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 #define PDprintk(fmt...)
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 enum zone_type policy_zone = 0;
109 
110 struct mempolicy default_policy = {
111 	.refcnt = ATOMIC_INIT(1), /* never free it */
112 	.policy = MPOL_DEFAULT,
113 };
114 
115 /* Do sanity checking on a policy */
116 static int mpol_check_policy(int mode, nodemask_t *nodes)
117 {
118 	int empty = nodes_empty(*nodes);
119 
120 	switch (mode) {
121 	case MPOL_DEFAULT:
122 		if (!empty)
123 			return -EINVAL;
124 		break;
125 	case MPOL_BIND:
126 	case MPOL_INTERLEAVE:
127 		/* Preferred will only use the first bit, but allow
128 		   more for now. */
129 		if (empty)
130 			return -EINVAL;
131 		break;
132 	}
133 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
134 }
135 
136 /* Generate a custom zonelist for the BIND policy. */
137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
138 {
139 	struct zonelist *zl;
140 	int num, max, nd;
141 	enum zone_type k;
142 
143 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
144 	max++;			/* space for zlcache_ptr (see mmzone.h) */
145 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
146 	if (!zl)
147 		return ERR_PTR(-ENOMEM);
148 	zl->zlcache_ptr = NULL;
149 	num = 0;
150 	/* First put in the highest zones from all nodes, then all the next
151 	   lower zones etc. Avoid empty zones because the memory allocator
152 	   doesn't like them. If you implement node hot removal you
153 	   have to fix that. */
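	/*
	 * For example, on a configuration with only ZONE_DMA and ZONE_NORMAL
	 * and nodes {0,1} this yields: node 0 NORMAL, node 1 NORMAL,
	 * node 0 DMA, node 1 DMA (empty zones are skipped), terminated by
	 * the NULL entry stored below.
	 */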
154 	k = policy_zone;
155 	while (1) {
156 		for_each_node_mask(nd, *nodes) {
157 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
158 			if (z->present_pages > 0)
159 				zl->zones[num++] = z;
160 		}
161 		if (k == 0)
162 			break;
163 		k--;
164 	}
165 	if (num == 0) {
166 		kfree(zl);
167 		return ERR_PTR(-EINVAL);
168 	}
169 	zl->zones[num] = NULL;
170 	return zl;
171 }
172 
173 /* Create a new policy */
174 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
175 {
176 	struct mempolicy *policy;
177 
178 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
179 	if (mode == MPOL_DEFAULT)
180 		return NULL;
181 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
182 	if (!policy)
183 		return ERR_PTR(-ENOMEM);
184 	atomic_set(&policy->refcnt, 1);
185 	switch (mode) {
186 	case MPOL_INTERLEAVE:
187 		policy->v.nodes = *nodes;
188 		if (nodes_weight(*nodes) == 0) {
189 			kmem_cache_free(policy_cache, policy);
190 			return ERR_PTR(-EINVAL);
191 		}
192 		break;
193 	case MPOL_PREFERRED:
194 		policy->v.preferred_node = first_node(*nodes);
195 		if (policy->v.preferred_node >= MAX_NUMNODES)
196 			policy->v.preferred_node = -1;
197 		break;
198 	case MPOL_BIND:
199 		policy->v.zonelist = bind_zonelist(nodes);
200 		if (IS_ERR(policy->v.zonelist)) {
201 			void *error_code = policy->v.zonelist;
202 			kmem_cache_free(policy_cache, policy);
203 			return error_code;
204 		}
205 		break;
206 	}
207 	policy->policy = mode;
208 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
209 	return policy;
210 }
211 
212 static void gather_stats(struct page *, void *, int pte_dirty);
213 static void migrate_page_add(struct page *page, struct list_head *pagelist,
214 				unsigned long flags);
215 
216 /* Scan through pages checking if pages follow certain conditions. */
217 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
218 		unsigned long addr, unsigned long end,
219 		const nodemask_t *nodes, unsigned long flags,
220 		void *private)
221 {
222 	pte_t *orig_pte;
223 	pte_t *pte;
224 	spinlock_t *ptl;
225 
226 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
227 	do {
228 		struct page *page;
229 		int nid;
230 
231 		if (!pte_present(*pte))
232 			continue;
233 		page = vm_normal_page(vma, addr, *pte);
234 		if (!page)
235 			continue;
236 		/*
237 		 * The check for PageReserved here is important to avoid
238 		 * handling zero pages and other pages that may have been
239 		 * marked special by the system.
240 		 *
241 		 * If PageReserved were not checked here then, for example,
242 		 * the location of the zero page could influence
243 		 * MPOL_MF_STRICT, zero pages would be counted for
244 		 * the per node stats, and there would be useless attempts
245 		 * to put zero pages on the migration list.
246 		 */
247 		if (PageReserved(page))
248 			continue;
249 		nid = page_to_nid(page);
250 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
251 			continue;
252 
253 		if (flags & MPOL_MF_STATS)
254 			gather_stats(page, private, pte_dirty(*pte));
255 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 			migrate_page_add(page, private, flags);
257 		else
258 			break;
259 	} while (pte++, addr += PAGE_SIZE, addr != end);
260 	pte_unmap_unlock(orig_pte, ptl);
261 	return addr != end;
262 }
263 
264 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
265 		unsigned long addr, unsigned long end,
266 		const nodemask_t *nodes, unsigned long flags,
267 		void *private)
268 {
269 	pmd_t *pmd;
270 	unsigned long next;
271 
272 	pmd = pmd_offset(pud, addr);
273 	do {
274 		next = pmd_addr_end(addr, end);
275 		if (pmd_none_or_clear_bad(pmd))
276 			continue;
277 		if (check_pte_range(vma, pmd, addr, next, nodes,
278 				    flags, private))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 		unsigned long addr, unsigned long end,
286 		const nodemask_t *nodes, unsigned long flags,
287 		void *private)
288 {
289 	pud_t *pud;
290 	unsigned long next;
291 
292 	pud = pud_offset(pgd, addr);
293 	do {
294 		next = pud_addr_end(addr, end);
295 		if (pud_none_or_clear_bad(pud))
296 			continue;
297 		if (check_pmd_range(vma, pud, addr, next, nodes,
298 				    flags, private))
299 			return -EIO;
300 	} while (pud++, addr = next, addr != end);
301 	return 0;
302 }
303 
304 static inline int check_pgd_range(struct vm_area_struct *vma,
305 		unsigned long addr, unsigned long end,
306 		const nodemask_t *nodes, unsigned long flags,
307 		void *private)
308 {
309 	pgd_t *pgd;
310 	unsigned long next;
311 
312 	pgd = pgd_offset(vma->vm_mm, addr);
313 	do {
314 		next = pgd_addr_end(addr, end);
315 		if (pgd_none_or_clear_bad(pgd))
316 			continue;
317 		if (check_pud_range(vma, pgd, addr, next, nodes,
318 				    flags, private))
319 			return -EIO;
320 	} while (pgd++, addr = next, addr != end);
321 	return 0;
322 }
323 
324 /*
325  * Check if all pages in a range are on a set of nodes.
326  * If pagelist != NULL then isolate pages from the LRU and
327  * put them on the pagelist.
328  */
329 static struct vm_area_struct *
330 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 		const nodemask_t *nodes, unsigned long flags, void *private)
332 {
333 	int err;
334 	struct vm_area_struct *first, *vma, *prev;
335 
336 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
337 
338 		err = migrate_prep();
339 		if (err)
340 			return ERR_PTR(err);
341 	}
342 
343 	first = find_vma(mm, start);
344 	if (!first)
345 		return ERR_PTR(-EFAULT);
346 	prev = NULL;
347 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
348 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
349 			if (!vma->vm_next && vma->vm_end < end)
350 				return ERR_PTR(-EFAULT);
351 			if (prev && prev->vm_end < vma->vm_start)
352 				return ERR_PTR(-EFAULT);
353 		}
354 		if (!is_vm_hugetlb_page(vma) &&
355 		    ((flags & MPOL_MF_STRICT) ||
356 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
357 				vma_migratable(vma)))) {
358 			unsigned long endvma = vma->vm_end;
359 
360 			if (endvma > end)
361 				endvma = end;
362 			if (vma->vm_start > start)
363 				start = vma->vm_start;
364 			err = check_pgd_range(vma, start, endvma, nodes,
365 						flags, private);
366 			if (err) {
367 				first = ERR_PTR(err);
368 				break;
369 			}
370 		}
371 		prev = vma;
372 	}
373 	return first;
374 }
375 
376 /* Apply policy to a single VMA */
377 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
378 {
379 	int err = 0;
380 	struct mempolicy *old = vma->vm_policy;
381 
382 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
383 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
384 		 vma->vm_ops, vma->vm_file,
385 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
386 
387 	if (vma->vm_ops && vma->vm_ops->set_policy)
388 		err = vma->vm_ops->set_policy(vma, new);
389 	if (!err) {
390 		mpol_get(new);
391 		vma->vm_policy = new;
392 		mpol_free(old);
393 	}
394 	return err;
395 }
396 
397 /* Step 2: apply policy to a range and do splits. */
398 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
399 		       unsigned long end, struct mempolicy *new)
400 {
401 	struct vm_area_struct *next;
402 	int err;
403 
404 	err = 0;
405 	for (; vma && vma->vm_start < end; vma = next) {
406 		next = vma->vm_next;
407 		if (vma->vm_start < start)
408 			err = split_vma(vma->vm_mm, vma, start, 1);
409 		if (!err && vma->vm_end > end)
410 			err = split_vma(vma->vm_mm, vma, end, 0);
411 		if (!err)
412 			err = policy_vma(vma, new);
413 		if (err)
414 			break;
415 	}
416 	return err;
417 }
418 
419 static int contextualize_policy(int mode, nodemask_t *nodes)
420 {
421 	if (!nodes)
422 		return 0;
423 
424 	cpuset_update_task_memory_state();
425 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
426 		return -EINVAL;
427 	return mpol_check_policy(mode, nodes);
428 }
429 
430 
431 /*
432  * Update task->flags PF_MEMPOLICY bit: set iff non-default
433  * mempolicy.  Allows more rapid checking of this (combined perhaps
434  * with other PF_* flag bits) on memory allocation hot code paths.
435  *
436  * If called from outside this file, the task 'p' should -only- be
437  * a newly forked child not yet visible on the task list, because
438  * manipulating the task flags of a visible task is not safe.
439  *
440  * The above limitation is why this routine has the funny name
441  * mpol_fix_fork_child_flag().
442  *
443  * It is also safe to call this with a task pointer of current,
444  * which the static wrapper mpol_set_task_struct_flag() does,
445  * for use within this file.
446  */
447 
448 void mpol_fix_fork_child_flag(struct task_struct *p)
449 {
450 	if (p->mempolicy)
451 		p->flags |= PF_MEMPOLICY;
452 	else
453 		p->flags &= ~PF_MEMPOLICY;
454 }
455 
456 static void mpol_set_task_struct_flag(void)
457 {
458 	mpol_fix_fork_child_flag(current);
459 }
460 
461 /* Set the process memory policy */
462 long do_set_mempolicy(int mode, nodemask_t *nodes)
463 {
464 	struct mempolicy *new;
465 
466 	if (contextualize_policy(mode, nodes))
467 		return -EINVAL;
468 	new = mpol_new(mode, nodes);
469 	if (IS_ERR(new))
470 		return PTR_ERR(new);
471 	mpol_free(current->mempolicy);
472 	current->mempolicy = new;
473 	mpol_set_task_struct_flag();
474 	if (new && new->policy == MPOL_INTERLEAVE)
475 		current->il_next = first_node(new->v.nodes);
476 	return 0;
477 }
478 
479 /* Fill a zone bitmap for a policy */
480 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
481 {
482 	int i;
483 
484 	nodes_clear(*nodes);
485 	switch (p->policy) {
486 	case MPOL_BIND:
487 		for (i = 0; p->v.zonelist->zones[i]; i++)
488 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
489 				*nodes);
490 		break;
491 	case MPOL_DEFAULT:
492 		break;
493 	case MPOL_INTERLEAVE:
494 		*nodes = p->v.nodes;
495 		break;
496 	case MPOL_PREFERRED:
497 		/* or use current node instead of online map? */
498 		if (p->v.preferred_node < 0)
499 			*nodes = node_online_map;
500 		else
501 			node_set(p->v.preferred_node, *nodes);
502 		break;
503 	default:
504 		BUG();
505 	}
506 }
507 
508 static int lookup_node(struct mm_struct *mm, unsigned long addr)
509 {
510 	struct page *p;
511 	int err;
512 
513 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
514 	if (err >= 0) {
515 		err = page_to_nid(p);
516 		put_page(p);
517 	}
518 	return err;
519 }
520 
521 /* Retrieve NUMA policy */
522 long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 			unsigned long addr, unsigned long flags)
524 {
525 	int err;
526 	struct mm_struct *mm = current->mm;
527 	struct vm_area_struct *vma = NULL;
528 	struct mempolicy *pol = current->mempolicy;
529 
530 	cpuset_update_task_memory_state();
531 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
532 		return -EINVAL;
533 	if (flags & MPOL_F_ADDR) {
534 		down_read(&mm->mmap_sem);
535 		vma = find_vma_intersection(mm, addr, addr+1);
536 		if (!vma) {
537 			up_read(&mm->mmap_sem);
538 			return -EFAULT;
539 		}
540 		if (vma->vm_ops && vma->vm_ops->get_policy)
541 			pol = vma->vm_ops->get_policy(vma, addr);
542 		else
543 			pol = vma->vm_policy;
544 	} else if (addr)
545 		return -EINVAL;
546 
547 	if (!pol)
548 		pol = &default_policy;
549 
550 	if (flags & MPOL_F_NODE) {
551 		if (flags & MPOL_F_ADDR) {
552 			err = lookup_node(mm, addr);
553 			if (err < 0)
554 				goto out;
555 			*policy = err;
556 		} else if (pol == current->mempolicy &&
557 				pol->policy == MPOL_INTERLEAVE) {
558 			*policy = current->il_next;
559 		} else {
560 			err = -EINVAL;
561 			goto out;
562 		}
563 	} else
564 		*policy = pol->policy;
565 
566 	if (vma) {
567 		up_read(&current->mm->mmap_sem);
568 		vma = NULL;
569 	}
570 
571 	err = 0;
572 	if (nmask)
573 		get_zonemask(pol, nmask);
574 
575  out:
576 	if (vma)
577 		up_read(&current->mm->mmap_sem);
578 	return err;
579 }
580 
581 #ifdef CONFIG_MIGRATION
582 /*
583  * page migration
584  */
585 static void migrate_page_add(struct page *page, struct list_head *pagelist,
586 				unsigned long flags)
587 {
588 	/*
589 	 * Avoid migrating a page that is shared with others.
590 	 */
591 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
592 		isolate_lru_page(page, pagelist);
593 }
594 
595 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
596 {
597 	return alloc_pages_node(node, GFP_HIGHUSER, 0);
598 }
599 
600 /*
601  * Migrate pages from one node to a target node.
602  * Returns error or the number of pages not migrated.
603  */
604 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
605 {
606 	nodemask_t nmask;
607 	LIST_HEAD(pagelist);
608 	int err = 0;
609 
610 	nodes_clear(nmask);
611 	node_set(source, nmask);
612 
613 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
614 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
615 
616 	if (!list_empty(&pagelist))
617 		err = migrate_pages(&pagelist, new_node_page, dest);
618 
619 	return err;
620 }
621 
622 /*
623  * Move pages between the two nodesets so as to preserve the physical
624  * layout as much as possible.
625  *
626  * Returns the number of pages that could not be moved.
627  */
628 int do_migrate_pages(struct mm_struct *mm,
629 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
630 {
631 	LIST_HEAD(pagelist);
632 	int busy = 0;
633 	int err = 0;
634 	nodemask_t tmp;
635 
636   	down_read(&mm->mmap_sem);
637 
638 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
639 	if (err)
640 		goto out;
641 
642 /*
643  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
644  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
645  * bit in 'tmp', and return that <source, dest> pair for migration.
646  * The pair of nodemasks 'to' and 'from' define the map.
647  *
648  * If no pair of bits is found that way, fallback to picking some
649  * pair of 'source' and 'dest' bits that are not the same.  If the
650  * 'source' and 'dest' bits are the same, this represents a node
651  * that will be migrating to itself, so no pages need move.
652  *
653  * If no bits are left in 'tmp', or if all remaining bits left
654  * in 'tmp' correspond to the same bit in 'to', return false
655  * (nothing left to migrate).
656  *
657  * This lets us pick a pair of nodes to migrate between, such that
658  * if possible the dest node is not already occupied by some other
659  * source node, minimizing the risk of overloading the memory on a
660  * node that would happen if we migrated incoming memory to a node
661  * before migrating outgoing memory sourced from that same node.
662  *
663  * A single scan of tmp is sufficient.  As we go, we remember the
664  * most recent <s, d> pair that moved (s != d).  If we find a pair
665  * that not only moved, but what's better, moved to an empty slot
666  * (d is not set in tmp), then we break out then, with that pair.
667  * Otherwise when we finish scanning tmp, we at least have the
668  * most recent <s, d> pair that moved.  If we get all the way through
669  * the scan of tmp without finding any node that moved, much less
670  * moved to an empty node, then there is nothing left worth migrating.
671  */
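/*
 * Example: from_nodes = {0,1}, to_nodes = {1,2}.  Scanning tmp = {0,1},
 * s=0 maps to d=1, but node 1 is still a source in tmp, so it is only
 * remembered; s=1 maps to d=2, which is not in tmp, so 1 -> 2 is migrated
 * first and node 1 is cleared.  The next pass over tmp = {0} then migrates
 * 0 -> 1, whose outgoing pages have already left.
 */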
672 
673 	tmp = *from_nodes;
674 	while (!nodes_empty(tmp)) {
675 		int s,d;
676 		int source = -1;
677 		int dest = 0;
678 
679 		for_each_node_mask(s, tmp) {
680 			d = node_remap(s, *from_nodes, *to_nodes);
681 			if (s == d)
682 				continue;
683 
684 			source = s;	/* Node moved. Memorize */
685 			dest = d;
686 
687 			/* dest not in remaining from nodes? */
688 			if (!node_isset(dest, tmp))
689 				break;
690 		}
691 		if (source == -1)
692 			break;
693 
694 		node_clear(source, tmp);
695 		err = migrate_to_node(mm, source, dest, flags);
696 		if (err > 0)
697 			busy += err;
698 		if (err < 0)
699 			break;
700 	}
701 out:
702 	up_read(&mm->mmap_sem);
703 	if (err < 0)
704 		return err;
705 	return busy;
706 
707 }
708 
709 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
710 {
711 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
712 
713 	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
714 }
715 #else
716 
717 static void migrate_page_add(struct page *page, struct list_head *pagelist,
718 				unsigned long flags)
719 {
720 }
721 
722 int do_migrate_pages(struct mm_struct *mm,
723 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
724 {
725 	return -ENOSYS;
726 }
727 
728 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
729 {
730 	return NULL;
731 }
732 #endif
733 
734 long do_mbind(unsigned long start, unsigned long len,
735 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
736 {
737 	struct vm_area_struct *vma;
738 	struct mm_struct *mm = current->mm;
739 	struct mempolicy *new;
740 	unsigned long end;
741 	int err;
742 	LIST_HEAD(pagelist);
743 
744 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
745 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
746 	    || mode > MPOL_MAX)
747 		return -EINVAL;
748 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
749 		return -EPERM;
750 
751 	if (start & ~PAGE_MASK)
752 		return -EINVAL;
753 
754 	if (mode == MPOL_DEFAULT)
755 		flags &= ~MPOL_MF_STRICT;
756 
757 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
758 	end = start + len;
759 
760 	if (end < start)
761 		return -EINVAL;
762 	if (end == start)
763 		return 0;
764 
765 	if (mpol_check_policy(mode, nmask))
766 		return -EINVAL;
767 
768 	new = mpol_new(mode, nmask);
769 	if (IS_ERR(new))
770 		return PTR_ERR(new);
771 
772 	/*
773 	 * If we are using the default policy then operation
774 	 * on discontinuous address spaces is okay after all
775 	 */
776 	if (!new)
777 		flags |= MPOL_MF_DISCONTIG_OK;
778 
779 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
780 			mode, nmask ? nodes_addr(*nmask)[0] : -1);
781 
782 	down_write(&mm->mmap_sem);
783 	vma = check_range(mm, start, end, nmask,
784 			  flags | MPOL_MF_INVERT, &pagelist);
785 
786 	err = PTR_ERR(vma);
787 	if (!IS_ERR(vma)) {
788 		int nr_failed = 0;
789 
790 		err = mbind_range(vma, start, end, new);
791 
792 		if (!list_empty(&pagelist))
793 			nr_failed = migrate_pages(&pagelist, new_vma_page,
794 						(unsigned long)vma);
795 
796 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
797 			err = -EIO;
798 	}
799 
800 	up_write(&mm->mmap_sem);
801 	mpol_free(new);
802 	return err;
803 }
804 
805 /*
806  * User space interface with variable sized bitmaps for nodelists.
807  */
808 
809 /* Copy a node mask from user space. */
810 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
811 		     unsigned long maxnode)
812 {
813 	unsigned long k;
814 	unsigned long nlongs;
815 	unsigned long endmask;
816 
817 	--maxnode;
818 	nodes_clear(*nodes);
819 	if (maxnode == 0 || !nmask)
820 		return 0;
821 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
822 		return -EINVAL;
823 
824 	nlongs = BITS_TO_LONGS(maxnode);
825 	if ((maxnode % BITS_PER_LONG) == 0)
826 		endmask = ~0UL;
827 	else
828 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
829 
830 	/* When the user specified more nodes than supported just check
831 	   if the non supported part is all zero. */
832 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
833 		if (nlongs > PAGE_SIZE/sizeof(long))
834 			return -EINVAL;
835 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
836 			unsigned long t;
837 			if (get_user(t, nmask + k))
838 				return -EFAULT;
839 			if (k == nlongs - 1) {
840 				if (t & endmask)
841 					return -EINVAL;
842 			} else if (t)
843 				return -EINVAL;
844 		}
845 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
846 		endmask = ~0UL;
847 	}
848 
849 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
850 		return -EFAULT;
851 	nodes_addr(*nodes)[nlongs-1] &= endmask;
852 	return 0;
853 }
854 
855 /* Copy a kernel node mask to user space */
856 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
857 			      nodemask_t *nodes)
858 {
859 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
860 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
861 
862 	if (copy > nbytes) {
863 		if (copy > PAGE_SIZE)
864 			return -EINVAL;
865 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
866 			return -EFAULT;
867 		copy = nbytes;
868 	}
869 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
870 }
871 
872 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
873 			unsigned long mode,
874 			unsigned long __user *nmask, unsigned long maxnode,
875 			unsigned flags)
876 {
877 	nodemask_t nodes;
878 	int err;
879 
880 	err = get_nodes(&nodes, nmask, maxnode);
881 	if (err)
882 		return err;
883 #ifdef CONFIG_CPUSETS
884 	/* Restrict the nodes to the allowed nodes in the cpuset */
885 	nodes_and(nodes, nodes, current->mems_allowed);
886 #endif
887 	return do_mbind(start, len, mode, &nodes, flags);
888 }
889 
890 /* Set the process memory policy */
891 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
892 		unsigned long maxnode)
893 {
894 	int err;
895 	nodemask_t nodes;
896 
897 	if (mode < 0 || mode > MPOL_MAX)
898 		return -EINVAL;
899 	err = get_nodes(&nodes, nmask, maxnode);
900 	if (err)
901 		return err;
902 	return do_set_mempolicy(mode, &nodes);
903 }
904 
905 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
906 		const unsigned long __user *old_nodes,
907 		const unsigned long __user *new_nodes)
908 {
909 	struct mm_struct *mm;
910 	struct task_struct *task;
911 	nodemask_t old;
912 	nodemask_t new;
913 	nodemask_t task_nodes;
914 	int err;
915 
916 	err = get_nodes(&old, old_nodes, maxnode);
917 	if (err)
918 		return err;
919 
920 	err = get_nodes(&new, new_nodes, maxnode);
921 	if (err)
922 		return err;
923 
924 	/* Find the mm_struct */
925 	read_lock(&tasklist_lock);
926 	task = pid ? find_task_by_pid(pid) : current;
927 	if (!task) {
928 		read_unlock(&tasklist_lock);
929 		return -ESRCH;
930 	}
931 	mm = get_task_mm(task);
932 	read_unlock(&tasklist_lock);
933 
934 	if (!mm)
935 		return -EINVAL;
936 
937 	/*
938 	 * Check if this process has the right to modify the specified
939 	 * process. The right exists if the process has administrative
940 	 * capabilities, superuser privileges or the same
941 	 * userid as the target process.
942 	 */
943 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
944 	    (current->uid != task->suid) && (current->uid != task->uid) &&
945 	    !capable(CAP_SYS_NICE)) {
946 		err = -EPERM;
947 		goto out;
948 	}
949 
950 	task_nodes = cpuset_mems_allowed(task);
951 	/* Is the user allowed to access the target nodes? */
952 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
953 		err = -EPERM;
954 		goto out;
955 	}
956 
957 	err = security_task_movememory(task);
958 	if (err)
959 		goto out;
960 
961 	err = do_migrate_pages(mm, &old, &new,
962 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
963 out:
964 	mmput(mm);
965 	return err;
966 }
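
/*
 * Userspace sketch using libnuma's <numaif.h> wrapper for this syscall
 * (pid and the node numbers are illustrative):
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 2;
 *	if (migrate_pages(pid, 8 * sizeof(from), &from, &to) < 0)
 *		perror("migrate_pages");
 */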
967 
968 
969 /* Retrieve NUMA policy */
970 asmlinkage long sys_get_mempolicy(int __user *policy,
971 				unsigned long __user *nmask,
972 				unsigned long maxnode,
973 				unsigned long addr, unsigned long flags)
974 {
975 	int err, pval;
976 	nodemask_t nodes;
977 
978 	if (nmask != NULL && maxnode < MAX_NUMNODES)
979 		return -EINVAL;
980 
981 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
982 
983 	if (err)
984 		return err;
985 
986 	if (policy && put_user(pval, policy))
987 		return -EFAULT;
988 
989 	if (nmask)
990 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
991 
992 	return err;
993 }
994 
995 #ifdef CONFIG_COMPAT
996 
997 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
998 				     compat_ulong_t __user *nmask,
999 				     compat_ulong_t maxnode,
1000 				     compat_ulong_t addr, compat_ulong_t flags)
1001 {
1002 	long err;
1003 	unsigned long __user *nm = NULL;
1004 	unsigned long nr_bits, alloc_size;
1005 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1006 
1007 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1008 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1009 
1010 	if (nmask)
1011 		nm = compat_alloc_user_space(alloc_size);
1012 
1013 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1014 
1015 	if (!err && nmask) {
1016 		err = copy_from_user(bm, nm, alloc_size);
1017 		/* ensure entire bitmap is zeroed */
1018 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1019 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1020 	}
1021 
1022 	return err;
1023 }
1024 
1025 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1026 				     compat_ulong_t maxnode)
1027 {
1028 	long err = 0;
1029 	unsigned long __user *nm = NULL;
1030 	unsigned long nr_bits, alloc_size;
1031 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1032 
1033 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1034 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1035 
1036 	if (nmask) {
1037 		err = compat_get_bitmap(bm, nmask, nr_bits);
1038 		nm = compat_alloc_user_space(alloc_size);
1039 		err |= copy_to_user(nm, bm, alloc_size);
1040 	}
1041 
1042 	if (err)
1043 		return -EFAULT;
1044 
1045 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1046 }
1047 
1048 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1049 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1050 			     compat_ulong_t maxnode, compat_ulong_t flags)
1051 {
1052 	long err = 0;
1053 	unsigned long __user *nm = NULL;
1054 	unsigned long nr_bits, alloc_size;
1055 	nodemask_t bm;
1056 
1057 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1058 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1059 
1060 	if (nmask) {
1061 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1062 		nm = compat_alloc_user_space(alloc_size);
1063 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1064 	}
1065 
1066 	if (err)
1067 		return -EFAULT;
1068 
1069 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1070 }
1071 
1072 #endif
1073 
1074 /* Return effective policy for a VMA */
1075 static struct mempolicy * get_vma_policy(struct task_struct *task,
1076 		struct vm_area_struct *vma, unsigned long addr)
1077 {
1078 	struct mempolicy *pol = task->mempolicy;
1079 
1080 	if (vma) {
1081 		if (vma->vm_ops && vma->vm_ops->get_policy)
1082 			pol = vma->vm_ops->get_policy(vma, addr);
1083 		else if (vma->vm_policy &&
1084 				vma->vm_policy->policy != MPOL_DEFAULT)
1085 			pol = vma->vm_policy;
1086 	}
1087 	if (!pol)
1088 		pol = &default_policy;
1089 	return pol;
1090 }
1091 
1092 /* Return a zonelist representing a mempolicy */
1093 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1094 {
1095 	int nd;
1096 
1097 	switch (policy->policy) {
1098 	case MPOL_PREFERRED:
1099 		nd = policy->v.preferred_node;
1100 		if (nd < 0)
1101 			nd = numa_node_id();
1102 		break;
1103 	case MPOL_BIND:
1104 		/* Lower zones don't get a policy applied */
1105 		/* Careful: current->mems_allowed might have moved */
1106 		if (gfp_zone(gfp) >= policy_zone)
1107 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1108 				return policy->v.zonelist;
1109 		/*FALL THROUGH*/
1110 	case MPOL_INTERLEAVE: /* should not happen */
1111 	case MPOL_DEFAULT:
1112 		nd = numa_node_id();
1113 		break;
1114 	default:
1115 		nd = 0;
1116 		BUG();
1117 	}
1118 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1119 }
1120 
1121 /* Do dynamic interleaving for a process */
1122 static unsigned interleave_nodes(struct mempolicy *policy)
1123 {
1124 	unsigned nid, next;
1125 	struct task_struct *me = current;
1126 
1127 	nid = me->il_next;
1128 	next = next_node(nid, policy->v.nodes);
1129 	if (next >= MAX_NUMNODES)
1130 		next = first_node(policy->v.nodes);
1131 	me->il_next = next;
1132 	return nid;
1133 }
1134 
1135 /*
1136  * Depending on the memory policy provide a node from which to allocate the
1137  * next slab entry.
1138  */
1139 unsigned slab_node(struct mempolicy *policy)
1140 {
1141 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1142 
1143 	switch (pol) {
1144 	case MPOL_INTERLEAVE:
1145 		return interleave_nodes(policy);
1146 
1147 	case MPOL_BIND:
1148 		/*
1149 		 * Follow bind policy behavior and start allocation at the
1150 		 * first node.
1151 		 */
1152 		return zone_to_nid(policy->v.zonelist->zones[0]);
1153 
1154 	case MPOL_PREFERRED:
1155 		if (policy->v.preferred_node >= 0)
1156 			return policy->v.preferred_node;
1157 		/* Fall through */
1158 
1159 	default:
1160 		return numa_node_id();
1161 	}
1162 }
1163 
1164 /* Do static interleaving for a VMA with known offset. */
1165 static unsigned offset_il_node(struct mempolicy *pol,
1166 		struct vm_area_struct *vma, unsigned long off)
1167 {
1168 	unsigned nnodes = nodes_weight(pol->v.nodes);
1169 	unsigned target = (unsigned)off % nnodes;
1170 	int c;
1171 	int nid = -1;
1172 
1173 	c = 0;
1174 	do {
1175 		nid = next_node(nid, pol->v.nodes);
1176 		c++;
1177 	} while (c <= target);
1178 	return nid;
1179 }
1180 
1181 /* Determine a node number for interleave */
1182 static inline unsigned interleave_nid(struct mempolicy *pol,
1183 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1184 {
1185 	if (vma) {
1186 		unsigned long off;
1187 
1188 		/*
1189 		 * for small pages, there is no difference between
1190 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1191 		 * for huge pages, since vm_pgoff is in units of small
1192 		 * pages, we need to shift off the always 0 bits to get
1193 		 * a useful offset.
1194 		 */
1195 		BUG_ON(shift < PAGE_SHIFT);
1196 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1197 		off += (addr - vma->vm_start) >> shift;
1198 		return offset_il_node(pol, vma, off);
1199 	} else
1200 		return interleave_nodes(pol);
1201 }
1202 
1203 #ifdef CONFIG_HUGETLBFS
1204 /* Return a zonelist suitable for a huge page allocation. */
1205 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1206 {
1207 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1208 
1209 	if (pol->policy == MPOL_INTERLEAVE) {
1210 		unsigned nid;
1211 
1212 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1213 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1214 	}
1215 	return zonelist_policy(GFP_HIGHUSER, pol);
1216 }
1217 #endif
1218 
1219 /* Allocate a page in interleaved policy.
1220    Own path because it needs to do special accounting. */
1221 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1222 					unsigned nid)
1223 {
1224 	struct zonelist *zl;
1225 	struct page *page;
1226 
1227 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1228 	page = __alloc_pages(gfp, order, zl);
1229 	if (page && page_zone(page) == zl->zones[0])
1230 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1231 	return page;
1232 }
1233 
1234 /**
1235  * 	alloc_page_vma	- Allocate a page for a VMA.
1236  *
1237  * 	@gfp:
1238  *      %GFP_USER    user allocation.
1239  *      %GFP_KERNEL  kernel allocations,
1240  *      %GFP_HIGHMEM highmem/user allocations,
1241  *      %GFP_FS      allocation should not call back into a file system.
1242  *      %GFP_ATOMIC  don't sleep.
1243  *
1244  * 	@vma:  Pointer to VMA or NULL if not available.
1245  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1246  *
1247  * 	This function allocates a page from the kernel page pool and applies
1248  *	a NUMA policy associated with the VMA or the current process.
1249  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1250  *	mm_struct of the VMA to prevent it from going away. Should be used for
1251  *	all allocations for pages that will be mapped into
1252  * 	user space. Returns NULL when no page can be allocated.
1253  *
1254  *	Should be called with the mmap_sem of the vma's mm held.
1255  */
1256 struct page *
1257 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1258 {
1259 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1260 
1261 	cpuset_update_task_memory_state();
1262 
1263 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1264 		unsigned nid;
1265 
1266 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1267 		return alloc_page_interleave(gfp, 0, nid);
1268 	}
1269 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1270 }
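
/*
 * A rough sketch of a typical caller, e.g. an anonymous-page fault path
 * (heavily simplified; the lock is normally taken by the arch fault handler
 * and the surrounding error handling is omitted):
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	...
 *	page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */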
1271 
1272 /**
1273  * 	alloc_pages_current - Allocate pages.
1274  *
1275  *	@gfp:
1276  *		%GFP_USER   user allocation,
1277  *      	%GFP_KERNEL kernel allocation,
1278  *      	%GFP_HIGHMEM highmem allocation,
1279  *      	%GFP_FS     don't call back into a file system.
1280  *      	%GFP_ATOMIC don't sleep.
1281  *	@order: Power of two of allocation size in pages. 0 is a single page.
1282  *
1283  *	Allocate a page from the kernel page pool.  When not in
1284  *	interrupt context, apply the current process' NUMA policy.
1285  *	Returns NULL when no page can be allocated.
1286  *
1287  *	Don't call cpuset_update_task_memory_state() unless
1288  *	1) it's ok to take cpuset_sem (can WAIT), and
1289  *	2) allocating for current task (not interrupt).
1290  */
1291 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1292 {
1293 	struct mempolicy *pol = current->mempolicy;
1294 
1295 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1296 		cpuset_update_task_memory_state();
1297 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1298 		pol = &default_policy;
1299 	if (pol->policy == MPOL_INTERLEAVE)
1300 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1301 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1302 }
1303 EXPORT_SYMBOL(alloc_pages_current);
1304 
1305 /*
1306  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1307  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1308  * with the mems_allowed returned by cpuset_mems_allowed().  This
1309  * keeps mempolicies cpuset-relative after its cpuset moves.  See
1310  * update_nodemask() in kernel/cpuset.c for further details.
1311  */
1312 void *cpuset_being_rebound;
1313 
1314 /* Slow path of a mempolicy copy */
1315 struct mempolicy *__mpol_copy(struct mempolicy *old)
1316 {
1317 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1318 
1319 	if (!new)
1320 		return ERR_PTR(-ENOMEM);
1321 	if (current_cpuset_is_being_rebound()) {
1322 		nodemask_t mems = cpuset_mems_allowed(current);
1323 		mpol_rebind_policy(old, &mems);
1324 	}
1325 	*new = *old;
1326 	atomic_set(&new->refcnt, 1);
1327 	if (new->policy == MPOL_BIND) {
1328 		int sz = ksize(old->v.zonelist);
1329 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1330 		if (!new->v.zonelist) {
1331 			kmem_cache_free(policy_cache, new);
1332 			return ERR_PTR(-ENOMEM);
1333 		}
1334 	}
1335 	return new;
1336 }
1337 
1338 /* Slow path of a mempolicy comparison */
1339 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1340 {
1341 	if (!a || !b)
1342 		return 0;
1343 	if (a->policy != b->policy)
1344 		return 0;
1345 	switch (a->policy) {
1346 	case MPOL_DEFAULT:
1347 		return 1;
1348 	case MPOL_INTERLEAVE:
1349 		return nodes_equal(a->v.nodes, b->v.nodes);
1350 	case MPOL_PREFERRED:
1351 		return a->v.preferred_node == b->v.preferred_node;
1352 	case MPOL_BIND: {
1353 		int i;
1354 		for (i = 0; a->v.zonelist->zones[i]; i++)
1355 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1356 				return 0;
1357 		return b->v.zonelist->zones[i] == NULL;
1358 	}
1359 	default:
1360 		BUG();
1361 		return 0;
1362 	}
1363 }
1364 
1365 /* Slow path of a mpol destructor. */
1366 void __mpol_free(struct mempolicy *p)
1367 {
1368 	if (!atomic_dec_and_test(&p->refcnt))
1369 		return;
1370 	if (p->policy == MPOL_BIND)
1371 		kfree(p->v.zonelist);
1372 	p->policy = MPOL_DEFAULT;
1373 	kmem_cache_free(policy_cache, p);
1374 }
1375 
1376 /*
1377  * Shared memory backing store policy support.
1378  *
1379  * Remember policies even when nobody has shared memory mapped.
1380  * The policies are kept in Red-Black tree linked from the inode.
1381  * They are protected by the sp->lock spinlock, which should be held
1382  * for any accesses to the tree.
1383  */
1384 
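/*
 * A rough sketch of how a filesystem such as tmpfs wires this store into
 * its vm_operations (modelled on mm/shmem.c; SHMEM_I() and the field names
 * belong to that filesystem and are shown for illustration only):
 *
 *	static int shmem_set_policy(struct vm_area_struct *vma,
 *				    struct mempolicy *new)
 *	{
 *		struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 *		return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
 *	}
 *
 *	static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 *						  unsigned long addr)
 *	{
 *		struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 *		unsigned long idx = ((addr - vma->vm_start) >> PAGE_SHIFT)
 *					+ vma->vm_pgoff;
 *		return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
 *	}
 */
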
1385 /* lookup first element intersecting start-end */
1386 /* Caller holds sp->lock */
1387 static struct sp_node *
1388 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1389 {
1390 	struct rb_node *n = sp->root.rb_node;
1391 
1392 	while (n) {
1393 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1394 
1395 		if (start >= p->end)
1396 			n = n->rb_right;
1397 		else if (end <= p->start)
1398 			n = n->rb_left;
1399 		else
1400 			break;
1401 	}
1402 	if (!n)
1403 		return NULL;
1404 	for (;;) {
1405 		struct sp_node *w = NULL;
1406 		struct rb_node *prev = rb_prev(n);
1407 		if (!prev)
1408 			break;
1409 		w = rb_entry(prev, struct sp_node, nd);
1410 		if (w->end <= start)
1411 			break;
1412 		n = prev;
1413 	}
1414 	return rb_entry(n, struct sp_node, nd);
1415 }
1416 
1417 /* Insert a new shared policy into the list. */
1418 /* Caller holds sp->lock */
1419 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1420 {
1421 	struct rb_node **p = &sp->root.rb_node;
1422 	struct rb_node *parent = NULL;
1423 	struct sp_node *nd;
1424 
1425 	while (*p) {
1426 		parent = *p;
1427 		nd = rb_entry(parent, struct sp_node, nd);
1428 		if (new->start < nd->start)
1429 			p = &(*p)->rb_left;
1430 		else if (new->end > nd->end)
1431 			p = &(*p)->rb_right;
1432 		else
1433 			BUG();
1434 	}
1435 	rb_link_node(&new->nd, parent, p);
1436 	rb_insert_color(&new->nd, &sp->root);
1437 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1438 		 new->policy ? new->policy->policy : 0);
1439 }
1440 
1441 /* Find shared policy intersecting idx */
1442 struct mempolicy *
1443 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1444 {
1445 	struct mempolicy *pol = NULL;
1446 	struct sp_node *sn;
1447 
1448 	if (!sp->root.rb_node)
1449 		return NULL;
1450 	spin_lock(&sp->lock);
1451 	sn = sp_lookup(sp, idx, idx+1);
1452 	if (sn) {
1453 		mpol_get(sn->policy);
1454 		pol = sn->policy;
1455 	}
1456 	spin_unlock(&sp->lock);
1457 	return pol;
1458 }
1459 
1460 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1461 {
1462 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1463 	rb_erase(&n->nd, &sp->root);
1464 	mpol_free(n->policy);
1465 	kmem_cache_free(sn_cache, n);
1466 }
1467 
1468 struct sp_node *
1469 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1470 {
1471 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1472 
1473 	if (!n)
1474 		return NULL;
1475 	n->start = start;
1476 	n->end = end;
1477 	mpol_get(pol);
1478 	n->policy = pol;
1479 	return n;
1480 }
1481 
1482 /* Replace a policy range. */
1483 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1484 				 unsigned long end, struct sp_node *new)
1485 {
1486 	struct sp_node *n, *new2 = NULL;
1487 
1488 restart:
1489 	spin_lock(&sp->lock);
1490 	n = sp_lookup(sp, start, end);
1491 	/* Take care of old policies in the same range. */
1492 	while (n && n->start < end) {
1493 		struct rb_node *next = rb_next(&n->nd);
1494 		if (n->start >= start) {
1495 			if (n->end <= end)
1496 				sp_delete(sp, n);
1497 			else
1498 				n->start = end;
1499 		} else {
1500 			/* Old policy spanning whole new range. */
1501 			if (n->end > end) {
1502 				if (!new2) {
1503 					spin_unlock(&sp->lock);
1504 					new2 = sp_alloc(end, n->end, n->policy);
1505 					if (!new2)
1506 						return -ENOMEM;
1507 					goto restart;
1508 				}
1509 				n->end = start;
1510 				sp_insert(sp, new2);
1511 				new2 = NULL;
1512 				break;
1513 			} else
1514 				n->end = start;
1515 		}
1516 		if (!next)
1517 			break;
1518 		n = rb_entry(next, struct sp_node, nd);
1519 	}
1520 	if (new)
1521 		sp_insert(sp, new);
1522 	spin_unlock(&sp->lock);
1523 	if (new2) {
1524 		mpol_free(new2->policy);
1525 		kmem_cache_free(sn_cache, new2);
1526 	}
1527 	return 0;
1528 }
1529 
1530 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1531 				nodemask_t *policy_nodes)
1532 {
1533 	info->root = RB_ROOT;
1534 	spin_lock_init(&info->lock);
1535 
1536 	if (policy != MPOL_DEFAULT) {
1537 		struct mempolicy *newpol;
1538 
1539 		/* Falls back to MPOL_DEFAULT on any error */
1540 		newpol = mpol_new(policy, policy_nodes);
1541 		if (!IS_ERR(newpol)) {
1542 			/* Create pseudo-vma that contains just the policy */
1543 			struct vm_area_struct pvma;
1544 
1545 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1546 			/* Policy covers entire file */
1547 			pvma.vm_end = TASK_SIZE;
1548 			mpol_set_shared_policy(info, &pvma, newpol);
1549 			mpol_free(newpol);
1550 		}
1551 	}
1552 }
1553 
1554 int mpol_set_shared_policy(struct shared_policy *info,
1555 			struct vm_area_struct *vma, struct mempolicy *npol)
1556 {
1557 	int err;
1558 	struct sp_node *new = NULL;
1559 	unsigned long sz = vma_pages(vma);
1560 
1561 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1562 		 vma->vm_pgoff,
1563 		 sz, npol? npol->policy : -1,
1564 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1565 
1566 	if (npol) {
1567 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1568 		if (!new)
1569 			return -ENOMEM;
1570 	}
1571 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1572 	if (err && new)
1573 		kmem_cache_free(sn_cache, new);
1574 	return err;
1575 }
1576 
1577 /* Free a backing policy store on inode delete. */
1578 void mpol_free_shared_policy(struct shared_policy *p)
1579 {
1580 	struct sp_node *n;
1581 	struct rb_node *next;
1582 
1583 	if (!p->root.rb_node)
1584 		return;
1585 	spin_lock(&p->lock);
1586 	next = rb_first(&p->root);
1587 	while (next) {
1588 		n = rb_entry(next, struct sp_node, nd);
1589 		next = rb_next(&n->nd);
1590 		rb_erase(&n->nd, &p->root);
1591 		mpol_free(n->policy);
1592 		kmem_cache_free(sn_cache, n);
1593 	}
1594 	spin_unlock(&p->lock);
1595 }
1596 
1597 /* assumes fs == KERNEL_DS */
1598 void __init numa_policy_init(void)
1599 {
1600 	policy_cache = kmem_cache_create("numa_policy",
1601 					 sizeof(struct mempolicy),
1602 					 0, SLAB_PANIC, NULL, NULL);
1603 
1604 	sn_cache = kmem_cache_create("shared_policy_node",
1605 				     sizeof(struct sp_node),
1606 				     0, SLAB_PANIC, NULL, NULL);
1607 
1608 	/* Set interleaving policy for system init. This way not all
1609 	   the data structures allocated at system boot end up in node zero. */
1610 
1611 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1612 		printk("numa_policy_init: interleaving failed\n");
1613 }
1614 
1615 /* Reset policy of current process to default */
1616 void numa_default_policy(void)
1617 {
1618 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1619 }
1620 
1621 /* Migrate a policy to a different set of nodes */
1622 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1623 {
1624 	nodemask_t *mpolmask;
1625 	nodemask_t tmp;
1626 
1627 	if (!pol)
1628 		return;
1629 	mpolmask = &pol->cpuset_mems_allowed;
1630 	if (nodes_equal(*mpolmask, *newmask))
1631 		return;
1632 
1633 	switch (pol->policy) {
1634 	case MPOL_DEFAULT:
1635 		break;
1636 	case MPOL_INTERLEAVE:
1637 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1638 		pol->v.nodes = tmp;
1639 		*mpolmask = *newmask;
1640 		current->il_next = node_remap(current->il_next,
1641 						*mpolmask, *newmask);
1642 		break;
1643 	case MPOL_PREFERRED:
1644 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1645 						*mpolmask, *newmask);
1646 		*mpolmask = *newmask;
1647 		break;
1648 	case MPOL_BIND: {
1649 		nodemask_t nodes;
1650 		struct zone **z;
1651 		struct zonelist *zonelist;
1652 
1653 		nodes_clear(nodes);
1654 		for (z = pol->v.zonelist->zones; *z; z++)
1655 			node_set(zone_to_nid(*z), nodes);
1656 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1657 		nodes = tmp;
1658 
1659 		zonelist = bind_zonelist(&nodes);
1660 
1661 		/* If no mem, bind_zonelist() fails and we keep the old zonelist.
1662 		 * If that old zonelist has no remaining mems_allowed nodes,
1663 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1664 		 */
1665 
1666 		if (!IS_ERR(zonelist)) {
1667 			/* Good - got mem - substitute new zonelist */
1668 			kfree(pol->v.zonelist);
1669 			pol->v.zonelist = zonelist;
1670 		}
1671 		*mpolmask = *newmask;
1672 		break;
1673 	}
1674 	default:
1675 		BUG();
1676 		break;
1677 	}
1678 }
1679 
1680 /*
1681  * Wrapper for mpol_rebind_policy() that just requires task
1682  * pointer, and updates task mempolicy.
1683  */
1684 
1685 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1686 {
1687 	mpol_rebind_policy(tsk->mempolicy, new);
1688 }
1689 
1690 /*
1691  * Rebind each vma in mm to new nodemask.
1692  *
1693  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1694  */
1695 
1696 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1697 {
1698 	struct vm_area_struct *vma;
1699 
1700 	down_write(&mm->mmap_sem);
1701 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1702 		mpol_rebind_policy(vma->vm_policy, new);
1703 	up_write(&mm->mmap_sem);
1704 }
1705 
1706 /*
1707  * Display pages allocated per node and memory policy via /proc.
1708  */
1709 
1710 static const char * const policy_types[] =
1711 	{ "default", "prefer", "bind", "interleave" };
1712 
1713 /*
1714  * Convert a mempolicy into a string.
1715  * Returns the number of characters in buffer (if positive)
1716  * or an error (negative)
1717  */
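/*
 * For instance, an interleave policy over nodes 0-3 is rendered as
 * "interleave=0-3", a preference for node 2 as "prefer=2", a bind policy
 * over nodes 0 and 2 as "bind=0,2", and the default policy as "default".
 */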
1718 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1719 {
1720 	char *p = buffer;
1721 	int l;
1722 	nodemask_t nodes;
1723 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1724 
1725 	switch (mode) {
1726 	case MPOL_DEFAULT:
1727 		nodes_clear(nodes);
1728 		break;
1729 
1730 	case MPOL_PREFERRED:
1731 		nodes_clear(nodes);
1732 		node_set(pol->v.preferred_node, nodes);
1733 		break;
1734 
1735 	case MPOL_BIND:
1736 		get_zonemask(pol, &nodes);
1737 		break;
1738 
1739 	case MPOL_INTERLEAVE:
1740 		nodes = pol->v.nodes;
1741 		break;
1742 
1743 	default:
1744 		BUG();
1745 		return -EFAULT;
1746 	}
1747 
1748 	l = strlen(policy_types[mode]);
1749  	if (buffer + maxlen < p + l + 1)
1750  		return -ENOSPC;
1751 
1752 	strcpy(p, policy_types[mode]);
1753 	p += l;
1754 
1755 	if (!nodes_empty(nodes)) {
1756 		if (buffer + maxlen < p + 2)
1757 			return -ENOSPC;
1758 		*p++ = '=';
1759 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1760 	}
1761 	return p - buffer;
1762 }
1763 
1764 struct numa_maps {
1765 	unsigned long pages;
1766 	unsigned long anon;
1767 	unsigned long active;
1768 	unsigned long writeback;
1769 	unsigned long mapcount_max;
1770 	unsigned long dirty;
1771 	unsigned long swapcache;
1772 	unsigned long node[MAX_NUMNODES];
1773 };
1774 
1775 static void gather_stats(struct page *page, void *private, int pte_dirty)
1776 {
1777 	struct numa_maps *md = private;
1778 	int count = page_mapcount(page);
1779 
1780 	md->pages++;
1781 	if (pte_dirty || PageDirty(page))
1782 		md->dirty++;
1783 
1784 	if (PageSwapCache(page))
1785 		md->swapcache++;
1786 
1787 	if (PageActive(page))
1788 		md->active++;
1789 
1790 	if (PageWriteback(page))
1791 		md->writeback++;
1792 
1793 	if (PageAnon(page))
1794 		md->anon++;
1795 
1796 	if (count > md->mapcount_max)
1797 		md->mapcount_max = count;
1798 
1799 	md->node[page_to_nid(page)]++;
1800 }
1801 
1802 #ifdef CONFIG_HUGETLB_PAGE
1803 static void check_huge_range(struct vm_area_struct *vma,
1804 		unsigned long start, unsigned long end,
1805 		struct numa_maps *md)
1806 {
1807 	unsigned long addr;
1808 	struct page *page;
1809 
1810 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1811 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1812 		pte_t pte;
1813 
1814 		if (!ptep)
1815 			continue;
1816 
1817 		pte = *ptep;
1818 		if (pte_none(pte))
1819 			continue;
1820 
1821 		page = pte_page(pte);
1822 		if (!page)
1823 			continue;
1824 
1825 		gather_stats(page, md, pte_dirty(*ptep));
1826 	}
1827 }
1828 #else
1829 static inline void check_huge_range(struct vm_area_struct *vma,
1830 		unsigned long start, unsigned long end,
1831 		struct numa_maps *md)
1832 {
1833 }
1834 #endif
1835 
1836 int show_numa_map(struct seq_file *m, void *v)
1837 {
1838 	struct proc_maps_private *priv = m->private;
1839 	struct vm_area_struct *vma = v;
1840 	struct numa_maps *md;
1841 	struct file *file = vma->vm_file;
1842 	struct mm_struct *mm = vma->vm_mm;
1843 	int n;
1844 	char buffer[50];
1845 
1846 	if (!mm)
1847 		return 0;
1848 
1849 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850 	if (!md)
1851 		return 0;
1852 
1853 	mpol_to_str(buffer, sizeof(buffer),
1854 			    get_vma_policy(priv->task, vma, vma->vm_start));
1855 
1856 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857 
1858 	if (file) {
1859 		seq_printf(m, " file=");
1860 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1861 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862 		seq_printf(m, " heap");
1863 	} else if (vma->vm_start <= mm->start_stack &&
1864 			vma->vm_end >= mm->start_stack) {
1865 		seq_printf(m, " stack");
1866 	}
1867 
1868 	if (is_vm_hugetlb_page(vma)) {
1869 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870 		seq_printf(m, " huge");
1871 	} else {
1872 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873 				&node_online_map, MPOL_MF_STATS, md);
1874 	}
1875 
1876 	if (!md->pages)
1877 		goto out;
1878 
1879 	if (md->anon)
1880 		seq_printf(m," anon=%lu",md->anon);
1881 
1882 	if (md->dirty)
1883 		seq_printf(m," dirty=%lu",md->dirty);
1884 
1885 	if (md->pages != md->anon && md->pages != md->dirty)
1886 		seq_printf(m, " mapped=%lu", md->pages);
1887 
1888 	if (md->mapcount_max > 1)
1889 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890 
1891 	if (md->swapcache)
1892 		seq_printf(m," swapcache=%lu", md->swapcache);
1893 
1894 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895 		seq_printf(m," active=%lu", md->active);
1896 
1897 	if (md->writeback)
1898 		seq_printf(m," writeback=%lu", md->writeback);
1899 
1900 	for_each_online_node(n)
1901 		if (md->node[n])
1902 			seq_printf(m, " N%d=%lu", n, md->node[n]);
1903 out:
1904 	seq_putc(m, '\n');
1905 	kfree(md);
1906 
1907 	if (m->count < m->size)
1908 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1909 	return 0;
1910 }
1911 
1912