xref: /openbmc/linux/mm/mempolicy.c (revision c5580a7ecb859c6821dd761c95fa150ec7695ae1)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
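/*
 * Illustrative sketch (not part of the kernel code): how user space would
 * typically request the policies described above through the syscalls
 * declared in <numaif.h>.  The node numbers are made up for the example,
 * and maxnode is simply the bit width of the mask being passed.
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *		interleave all future process allocations over nodes 0,1
 *	mbind(addr, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);
 *		restrict one mapping to nodes 0,1 only
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		revert the process policy to default local allocation
 */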
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 
90 #include <asm/tlbflush.h>
91 #include <asm/uaccess.h>
92 
93 /* Internal flags */
94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
97 
98 /* The number of pages to migrate per call to migrate_pages() */
99 #define MIGRATE_CHUNK_SIZE 256
100 
101 static kmem_cache_t *policy_cache;
102 static kmem_cache_t *sn_cache;
103 
104 #define PDprintk(fmt...)
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 int policy_zone = ZONE_DMA;
109 
110 struct mempolicy default_policy = {
111 	.refcnt = ATOMIC_INIT(1), /* never free it */
112 	.policy = MPOL_DEFAULT,
113 };
114 
115 /* Do sanity checking on a policy */
116 static int mpol_check_policy(int mode, nodemask_t *nodes)
117 {
118 	int empty = nodes_empty(*nodes);
119 
120 	switch (mode) {
121 	case MPOL_DEFAULT:
122 		if (!empty)
123 			return -EINVAL;
124 		break;
125 	case MPOL_BIND:
126 	case MPOL_INTERLEAVE:
127 		/* Preferred will only use the first bit, but allow
128 		   more for now. */
129 		if (empty)
130 			return -EINVAL;
131 		break;
132 	}
133 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
134 }
135 
136 /* Generate a custom zonelist for the BIND policy. */
137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
138 {
139 	struct zonelist *zl;
140 	int num, max, nd, k;
141 
142 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
143 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
144 	if (!zl)
145 		return NULL;
146 	num = 0;
147 	/* First put in the highest zones from all nodes, then all the next
148 	   lower zones etc. Avoid empty zones because the memory allocator
149 	   doesn't like them. If you implement node hot removal you
150 	   have to fix that. */
151 	for (k = policy_zone; k >= 0; k--) {
152 		for_each_node_mask(nd, *nodes) {
153 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 			if (z->present_pages > 0)
155 				zl->zones[num++] = z;
156 		}
157 	}
158 	zl->zones[num] = NULL;
159 	return zl;
160 }
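/*
 * Worked example of the ordering produced above (hypothetical machine with
 * two nodes, policy_zone == ZONE_HIGHMEM, architecture specific zones
 * ignored): the generated zonelist is
 *
 *	HIGHMEM0, HIGHMEM1, NORMAL0, NORMAL1, DMA0, DMA1, NULL
 *
 * minus any zone whose present_pages is zero.
 */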
161 
162 /* Create a new policy */
163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
164 {
165 	struct mempolicy *policy;
166 
167 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
168 	if (mode == MPOL_DEFAULT)
169 		return NULL;
170 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
171 	if (!policy)
172 		return ERR_PTR(-ENOMEM);
173 	atomic_set(&policy->refcnt, 1);
174 	switch (mode) {
175 	case MPOL_INTERLEAVE:
176 		policy->v.nodes = *nodes;
177 		if (nodes_weight(*nodes) == 0) {
178 			kmem_cache_free(policy_cache, policy);
179 			return ERR_PTR(-EINVAL);
180 		}
181 		break;
182 	case MPOL_PREFERRED:
183 		policy->v.preferred_node = first_node(*nodes);
184 		if (policy->v.preferred_node >= MAX_NUMNODES)
185 			policy->v.preferred_node = -1;
186 		break;
187 	case MPOL_BIND:
188 		policy->v.zonelist = bind_zonelist(nodes);
189 		if (policy->v.zonelist == NULL) {
190 			kmem_cache_free(policy_cache, policy);
191 			return ERR_PTR(-ENOMEM);
192 		}
193 		break;
194 	}
195 	policy->policy = mode;
196 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
197 	return policy;
198 }
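/*
 * Summary of what mpol_new() stores (derived from the code above): the
 * union v holds a nodemask for MPOL_INTERLEAVE, a single node number (or
 * -1 meaning "local") for MPOL_PREFERRED, and a custom zonelist for
 * MPOL_BIND.  MPOL_DEFAULT is represented by a NULL mempolicy pointer
 * rather than by an allocated object.
 */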
199 
200 static void gather_stats(struct page *, void *);
201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
202 				unsigned long flags);
203 
204 /* Scan through pages checking if pages follow certain conditions. */
205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
206 		unsigned long addr, unsigned long end,
207 		const nodemask_t *nodes, unsigned long flags,
208 		void *private)
209 {
210 	pte_t *orig_pte;
211 	pte_t *pte;
212 	spinlock_t *ptl;
213 
214 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
215 	do {
216 		struct page *page;
217 		unsigned int nid;
218 
219 		if (!pte_present(*pte))
220 			continue;
221 		page = vm_normal_page(vma, addr, *pte);
222 		if (!page)
223 			continue;
224 		/*
225 		 * The check for PageReserved here is important to avoid
226 		 * handling zero pages and other pages that may have been
227 		 * marked special by the system.
228 		 *
229 		 * If PageReserved were not checked here then e.g.
230 		 * the location of the zero page could have an influence
231 		 * on MPOL_MF_STRICT, zero pages would be counted for
232 		 * the per node stats, and there would be useless attempts
233 		 * to put zero pages on the migration list.
234 		 */
235 		if (PageReserved(page))
236 			continue;
237 		nid = page_to_nid(page);
238 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
239 			continue;
240 
241 		if (flags & MPOL_MF_STATS)
242 			gather_stats(page, private);
243 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
244 			migrate_page_add(page, private, flags);
245 		else
246 			break;
247 	} while (pte++, addr += PAGE_SIZE, addr != end);
248 	pte_unmap_unlock(orig_pte, ptl);
249 	return addr != end;
250 }
251 
252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
253 		unsigned long addr, unsigned long end,
254 		const nodemask_t *nodes, unsigned long flags,
255 		void *private)
256 {
257 	pmd_t *pmd;
258 	unsigned long next;
259 
260 	pmd = pmd_offset(pud, addr);
261 	do {
262 		next = pmd_addr_end(addr, end);
263 		if (pmd_none_or_clear_bad(pmd))
264 			continue;
265 		if (check_pte_range(vma, pmd, addr, next, nodes,
266 				    flags, private))
267 			return -EIO;
268 	} while (pmd++, addr = next, addr != end);
269 	return 0;
270 }
271 
272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
273 		unsigned long addr, unsigned long end,
274 		const nodemask_t *nodes, unsigned long flags,
275 		void *private)
276 {
277 	pud_t *pud;
278 	unsigned long next;
279 
280 	pud = pud_offset(pgd, addr);
281 	do {
282 		next = pud_addr_end(addr, end);
283 		if (pud_none_or_clear_bad(pud))
284 			continue;
285 		if (check_pmd_range(vma, pud, addr, next, nodes,
286 				    flags, private))
287 			return -EIO;
288 	} while (pud++, addr = next, addr != end);
289 	return 0;
290 }
291 
292 static inline int check_pgd_range(struct vm_area_struct *vma,
293 		unsigned long addr, unsigned long end,
294 		const nodemask_t *nodes, unsigned long flags,
295 		void *private)
296 {
297 	pgd_t *pgd;
298 	unsigned long next;
299 
300 	pgd = pgd_offset(vma->vm_mm, addr);
301 	do {
302 		next = pgd_addr_end(addr, end);
303 		if (pgd_none_or_clear_bad(pgd))
304 			continue;
305 		if (check_pud_range(vma, pgd, addr, next, nodes,
306 				    flags, private))
307 			return -EIO;
308 	} while (pgd++, addr = next, addr != end);
309 	return 0;
310 }
311 
312 /* Check if a vma is migratable */
313 static inline int vma_migratable(struct vm_area_struct *vma)
314 {
315 	if (vma->vm_flags & (
316 		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
317 		return 0;
318 	return 1;
319 }
320 
321 /*
322  * Check if all pages in a range are on a set of nodes.
323  * If pagelist != NULL then isolate pages from the LRU and
324  * put them on the pagelist.
325  */
326 static struct vm_area_struct *
327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
328 		const nodemask_t *nodes, unsigned long flags, void *private)
329 {
330 	int err;
331 	struct vm_area_struct *first, *vma, *prev;
332 
333 	/* Clear the LRU lists so pages can be isolated */
334 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
335 		lru_add_drain_all();
336 
337 	first = find_vma(mm, start);
338 	if (!first)
339 		return ERR_PTR(-EFAULT);
340 	prev = NULL;
341 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
342 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
343 			if (!vma->vm_next && vma->vm_end < end)
344 				return ERR_PTR(-EFAULT);
345 			if (prev && prev->vm_end < vma->vm_start)
346 				return ERR_PTR(-EFAULT);
347 		}
348 		if (!is_vm_hugetlb_page(vma) &&
349 		    ((flags & MPOL_MF_STRICT) ||
350 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
351 				vma_migratable(vma)))) {
352 			unsigned long endvma = vma->vm_end;
353 
354 			if (endvma > end)
355 				endvma = end;
356 			if (vma->vm_start > start)
357 				start = vma->vm_start;
358 			err = check_pgd_range(vma, start, endvma, nodes,
359 						flags, private);
360 			if (err) {
361 				first = ERR_PTR(err);
362 				break;
363 			}
364 		}
365 		prev = vma;
366 	}
367 	return first;
368 }
369 
370 /* Apply policy to a single VMA */
371 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
372 {
373 	int err = 0;
374 	struct mempolicy *old = vma->vm_policy;
375 
376 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
377 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
378 		 vma->vm_ops, vma->vm_file,
379 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
380 
381 	if (vma->vm_ops && vma->vm_ops->set_policy)
382 		err = vma->vm_ops->set_policy(vma, new);
383 	if (!err) {
384 		mpol_get(new);
385 		vma->vm_policy = new;
386 		mpol_free(old);
387 	}
388 	return err;
389 }
390 
391 /* Step 2: apply policy to a range and do splits. */
392 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
393 		       unsigned long end, struct mempolicy *new)
394 {
395 	struct vm_area_struct *next;
396 	int err;
397 
398 	err = 0;
399 	for (; vma && vma->vm_start < end; vma = next) {
400 		next = vma->vm_next;
401 		if (vma->vm_start < start)
402 			err = split_vma(vma->vm_mm, vma, start, 1);
403 		if (!err && vma->vm_end > end)
404 			err = split_vma(vma->vm_mm, vma, end, 0);
405 		if (!err)
406 			err = policy_vma(vma, new);
407 		if (err)
408 			break;
409 	}
410 	return err;
411 }
412 
413 static int contextualize_policy(int mode, nodemask_t *nodes)
414 {
415 	if (!nodes)
416 		return 0;
417 
418 	cpuset_update_task_memory_state();
419 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
420 		return -EINVAL;
421 	return mpol_check_policy(mode, nodes);
422 }
423 
424 /* Set the process memory policy */
425 long do_set_mempolicy(int mode, nodemask_t *nodes)
426 {
427 	struct mempolicy *new;
428 
429 	if (contextualize_policy(mode, nodes))
430 		return -EINVAL;
431 	new = mpol_new(mode, nodes);
432 	if (IS_ERR(new))
433 		return PTR_ERR(new);
434 	mpol_free(current->mempolicy);
435 	current->mempolicy = new;
436 	if (new && new->policy == MPOL_INTERLEAVE)
437 		current->il_next = first_node(new->v.nodes);
438 	return 0;
439 }
440 
441 /* Fill a zone bitmap for a policy */
442 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
443 {
444 	int i;
445 
446 	nodes_clear(*nodes);
447 	switch (p->policy) {
448 	case MPOL_BIND:
449 		for (i = 0; p->v.zonelist->zones[i]; i++)
450 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
451 				*nodes);
452 		break;
453 	case MPOL_DEFAULT:
454 		break;
455 	case MPOL_INTERLEAVE:
456 		*nodes = p->v.nodes;
457 		break;
458 	case MPOL_PREFERRED:
459 		/* or use current node instead of online map? */
460 		if (p->v.preferred_node < 0)
461 			*nodes = node_online_map;
462 		else
463 			node_set(p->v.preferred_node, *nodes);
464 		break;
465 	default:
466 		BUG();
467 	}
468 }
469 
470 static int lookup_node(struct mm_struct *mm, unsigned long addr)
471 {
472 	struct page *p;
473 	int err;
474 
475 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
476 	if (err >= 0) {
477 		err = page_to_nid(p);
478 		put_page(p);
479 	}
480 	return err;
481 }
482 
483 /* Retrieve NUMA policy */
484 long do_get_mempolicy(int *policy, nodemask_t *nmask,
485 			unsigned long addr, unsigned long flags)
486 {
487 	int err;
488 	struct mm_struct *mm = current->mm;
489 	struct vm_area_struct *vma = NULL;
490 	struct mempolicy *pol = current->mempolicy;
491 
492 	cpuset_update_task_memory_state();
493 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
494 		return -EINVAL;
495 	if (flags & MPOL_F_ADDR) {
496 		down_read(&mm->mmap_sem);
497 		vma = find_vma_intersection(mm, addr, addr+1);
498 		if (!vma) {
499 			up_read(&mm->mmap_sem);
500 			return -EFAULT;
501 		}
502 		if (vma->vm_ops && vma->vm_ops->get_policy)
503 			pol = vma->vm_ops->get_policy(vma, addr);
504 		else
505 			pol = vma->vm_policy;
506 	} else if (addr)
507 		return -EINVAL;
508 
509 	if (!pol)
510 		pol = &default_policy;
511 
512 	if (flags & MPOL_F_NODE) {
513 		if (flags & MPOL_F_ADDR) {
514 			err = lookup_node(mm, addr);
515 			if (err < 0)
516 				goto out;
517 			*policy = err;
518 		} else if (pol == current->mempolicy &&
519 				pol->policy == MPOL_INTERLEAVE) {
520 			*policy = current->il_next;
521 		} else {
522 			err = -EINVAL;
523 			goto out;
524 		}
525 	} else
526 		*policy = pol->policy;
527 
528 	if (vma) {
529 		up_read(&current->mm->mmap_sem);
530 		vma = NULL;
531 	}
532 
533 	err = 0;
534 	if (nmask)
535 		get_zonemask(pol, nmask);
536 
537  out:
538 	if (vma)
539 		up_read(&current->mm->mmap_sem);
540 	return err;
541 }
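/*
 * Illustrative (non-authoritative) sketch of the flag combinations the
 * get_mempolicy() syscall built on this accepts; addr is assumed to be an
 * address inside the caller's address space:
 *
 *	get_mempolicy(&mode, nmask, maxnode, 0, 0)
 *		returns the calling process' policy and nodemask
 *	get_mempolicy(&mode, nmask, maxnode, addr, MPOL_F_ADDR)
 *		returns the policy of the VMA containing addr
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_ADDR | MPOL_F_NODE)
 *		returns the node the page at addr currently sits on
 */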
542 
543 /*
544  * page migration
545  */
546 
547 static void migrate_page_add(struct page *page, struct list_head *pagelist,
548 				unsigned long flags)
549 {
550 	/*
551 	 * Avoid migrating a page that is shared with others.
552 	 */
553 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
554 		if (isolate_lru_page(page))
555 			list_add(&page->lru, pagelist);
556 	}
557 }
558 
559 /*
560  * Migrate the list 'pagelist' of pages to a certain destination.
561  *
562  * Specify destination with either a non-NULL vma or dest >= 0.
563  * Return the number of pages not migrated or error code
564  */
565 static int migrate_pages_to(struct list_head *pagelist,
566 			struct vm_area_struct *vma, int dest)
567 {
568 	LIST_HEAD(newlist);
569 	LIST_HEAD(moved);
570 	LIST_HEAD(failed);
571 	int err = 0;
572 	int nr_pages;
573 	struct page *page;
574 	struct list_head *p;
575 
576 redo:
577 	nr_pages = 0;
578 	list_for_each(p, pagelist) {
579 		if (vma)
580 			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
581 		else
582 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
583 
584 		if (!page) {
585 			err = -ENOMEM;
586 			goto out;
587 		}
588 		list_add(&page->lru, &newlist);
589 		nr_pages++;
590 		if (nr_pages > MIGRATE_CHUNK_SIZE)
591 			break;
592 	}
593 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
594 
595 	putback_lru_pages(&moved);	/* Call release pages instead ?? */
596 
597 	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
598 		goto redo;
599 out:
600 	/* Return leftover allocated pages */
601 	while (!list_empty(&newlist)) {
602 		page = list_entry(newlist.next, struct page, lru);
603 		list_del(&page->lru);
604 		__free_page(page);
605 	}
606 	list_splice(&failed, pagelist);
607 	if (err < 0)
608 		return err;
609 
610 	/* Calculate number of leftover pages */
611 	nr_pages = 0;
612 	list_for_each(p, pagelist)
613 		nr_pages++;
614 	return nr_pages;
615 }
616 
617 /*
618  * Migrate pages from one node to a target node.
619  * Returns error or the number of pages not migrated.
620  */
621 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
622 {
623 	nodemask_t nmask;
624 	LIST_HEAD(pagelist);
625 	int err = 0;
626 
627 	nodes_clear(nmask);
628 	node_set(source, nmask);
629 
630 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
631 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
632 
633 	if (!list_empty(&pagelist)) {
634 		err = migrate_pages_to(&pagelist, NULL, dest);
635 		if (!list_empty(&pagelist))
636 			putback_lru_pages(&pagelist);
637 	}
638 	return err;
639 }
640 
641 /*
642  * Move pages between the two nodesets so as to preserve the physical
643  * layout as much as possible.
644  *
645  * Returns the number of pages that could not be moved.
646  */
647 int do_migrate_pages(struct mm_struct *mm,
648 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
649 {
650 	LIST_HEAD(pagelist);
651 	int busy = 0;
652 	int err = 0;
653 	nodemask_t tmp;
654 
655 	down_read(&mm->mmap_sem);
656 
657 /*
658  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
659  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
660  * bit in 'tmp', and return that <source, dest> pair for migration.
661  * The pair of nodemasks 'to' and 'from' define the map.
662  *
663  * If no pair of bits is found that way, fallback to picking some
664  * pair of 'source' and 'dest' bits that are not the same.  If the
665  * 'source' and 'dest' bits are the same, this represents a node
666  * that will be migrating to itself, so no pages need move.
667  *
668  * If no bits are left in 'tmp', or if all remaining bits left
669  * in 'tmp' correspond to the same bit in 'to', return false
670  * (nothing left to migrate).
671  *
672  * This lets us pick a pair of nodes to migrate between, such that
673  * if possible the dest node is not already occupied by some other
674  * source node, minimizing the risk of overloading the memory on a
675  * node that would happen if we migrated incoming memory to a node
676  * before migrating outgoing memory source that same node.
677  *
678  * A single scan of tmp is sufficient.  As we go, we remember the
679  * most recent <s, d> pair that moved (s != d).  If we find a pair
680  * that not only moved, but what's better, moved to an empty slot
681  * (d is not set in tmp), then we break out then, with that pair.
682  * Otherwise when we finish scanning tmp, we at least have the
683  * most recent <s, d> pair that moved.  If we get all the way through
684  * the scan of tmp without finding any node that moved, much less
685  * moved to an empty node, then there is nothing left worth migrating.
686  */
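/*
 * Worked example with made-up masks: from_nodes = {0,1}, to_nodes = {2,3}.
 * First pass over tmp = {0,1}: s=0 remaps to d=2, 2 is not in tmp, so we
 * migrate 0 -> 2 and clear 0.  Second pass over tmp = {1}: s=1 remaps to
 * d=3, again not in tmp, so we migrate 1 -> 3 and tmp becomes empty.
 * With overlapping masks such as from = {0,1}, to = {1,2}, the "dest not
 * in tmp" test makes us empty node 1 into node 2 before node 1 is filled
 * from node 0, which is exactly the overload avoidance described above.
 */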
687 
688 	tmp = *from_nodes;
689 	while (!nodes_empty(tmp)) {
690 		int s,d;
691 		int source = -1;
692 		int dest = 0;
693 
694 		for_each_node_mask(s, tmp) {
695 			d = node_remap(s, *from_nodes, *to_nodes);
696 			if (s == d)
697 				continue;
698 
699 			source = s;	/* Node moved. Memorize */
700 			dest = d;
701 
702 			/* dest not in remaining from nodes? */
703 			if (!node_isset(dest, tmp))
704 				break;
705 		}
706 		if (source == -1)
707 			break;
708 
709 		node_clear(source, tmp);
710 		err = migrate_to_node(mm, source, dest, flags);
711 		if (err > 0)
712 			busy += err;
713 		if (err < 0)
714 			break;
715 	}
716 
717 	up_read(&mm->mmap_sem);
718 	if (err < 0)
719 		return err;
720 	return busy;
721 }
722 
723 long do_mbind(unsigned long start, unsigned long len,
724 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
725 {
726 	struct vm_area_struct *vma;
727 	struct mm_struct *mm = current->mm;
728 	struct mempolicy *new;
729 	unsigned long end;
730 	int err;
731 	LIST_HEAD(pagelist);
732 
733 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
734 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
735 	    || mode > MPOL_MAX)
736 		return -EINVAL;
737 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
738 		return -EPERM;
739 
740 	if (start & ~PAGE_MASK)
741 		return -EINVAL;
742 
743 	if (mode == MPOL_DEFAULT)
744 		flags &= ~MPOL_MF_STRICT;
745 
746 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
747 	end = start + len;
748 
749 	if (end < start)
750 		return -EINVAL;
751 	if (end == start)
752 		return 0;
753 
754 	if (mpol_check_policy(mode, nmask))
755 		return -EINVAL;
756 
757 	new = mpol_new(mode, nmask);
758 	if (IS_ERR(new))
759 		return PTR_ERR(new);
760 
761 	/*
762 	 * If we are using the default policy then operation
763 	 * on discontinuous address spaces is okay after all
764 	 */
765 	if (!new)
766 		flags |= MPOL_MF_DISCONTIG_OK;
767 
768 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
769 			mode, nodes_addr(*nmask)[0]);
770 
771 	down_write(&mm->mmap_sem);
772 	vma = check_range(mm, start, end, nmask,
773 			  flags | MPOL_MF_INVERT, &pagelist);
774 
775 	err = PTR_ERR(vma);
776 	if (!IS_ERR(vma)) {
777 		int nr_failed = 0;
778 
779 		err = mbind_range(vma, start, end, new);
780 
781 		if (!list_empty(&pagelist))
782 			nr_failed = migrate_pages_to(&pagelist, vma, -1);
783 
784 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
785 			err = -EIO;
786 	}
787 	if (!list_empty(&pagelist))
788 		putback_lru_pages(&pagelist);
789 
790 	up_write(&mm->mmap_sem);
791 	mpol_free(new);
792 	return err;
793 }
794 
795 /*
796  * User space interface with variable sized bitmaps for nodelists.
797  */
798 
799 /* Copy a node mask from user space. */
800 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
801 		     unsigned long maxnode)
802 {
803 	unsigned long k;
804 	unsigned long nlongs;
805 	unsigned long endmask;
806 
807 	--maxnode;
808 	nodes_clear(*nodes);
809 	if (maxnode == 0 || !nmask)
810 		return 0;
811 	if (maxnode > PAGE_SIZE)
812 		return -EINVAL;
813 
814 	nlongs = BITS_TO_LONGS(maxnode);
815 	if ((maxnode % BITS_PER_LONG) == 0)
816 		endmask = ~0UL;
817 	else
818 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
819 
820 	/* When the user specifies more nodes than supported just check
821 	   that the unsupported part is all zero. */
822 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
823 		if (nlongs > PAGE_SIZE/sizeof(long))
824 			return -EINVAL;
825 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
826 			unsigned long t;
827 			if (get_user(t, nmask + k))
828 				return -EFAULT;
829 			if (k == nlongs - 1) {
830 				if (t & endmask)
831 					return -EINVAL;
832 			} else if (t)
833 				return -EINVAL;
834 		}
835 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
836 		endmask = ~0UL;
837 	}
838 
839 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
840 		return -EFAULT;
841 	nodes_addr(*nodes)[nlongs-1] &= endmask;
842 	return 0;
843 }
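/*
 * Small worked example of the masking above (hypothetical values, 64-bit
 * longs): a caller passing maxnode = 5 ends up with maxnode = 4 after the
 * decrement, so nlongs = 1 and endmask = (1UL << 4) - 1 = 0xf; only node
 * bits 0-3 of the copied word survive the final "&= endmask".  If the
 * caller passes more bits than MAX_NUMNODES, the excess longs are only
 * verified to be zero and are then ignored.
 */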
844 
845 /* Copy a kernel node mask to user space */
846 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
847 			      nodemask_t *nodes)
848 {
849 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
850 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
851 
852 	if (copy > nbytes) {
853 		if (copy > PAGE_SIZE)
854 			return -EINVAL;
855 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
856 			return -EFAULT;
857 		copy = nbytes;
858 	}
859 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
860 }
861 
862 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
863 			unsigned long mode,
864 			unsigned long __user *nmask, unsigned long maxnode,
865 			unsigned flags)
866 {
867 	nodemask_t nodes;
868 	int err;
869 
870 	err = get_nodes(&nodes, nmask, maxnode);
871 	if (err)
872 		return err;
873 	return do_mbind(start, len, mode, &nodes, flags);
874 }
875 
876 /* Set the process memory policy */
877 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
878 		unsigned long maxnode)
879 {
880 	int err;
881 	nodemask_t nodes;
882 
883 	if (mode < 0 || mode > MPOL_MAX)
884 		return -EINVAL;
885 	err = get_nodes(&nodes, nmask, maxnode);
886 	if (err)
887 		return err;
888 	return do_set_mempolicy(mode, &nodes);
889 }
890 
891 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
892 		const unsigned long __user *old_nodes,
893 		const unsigned long __user *new_nodes)
894 {
895 	struct mm_struct *mm;
896 	struct task_struct *task;
897 	nodemask_t old;
898 	nodemask_t new;
899 	nodemask_t task_nodes;
900 	int err;
901 
902 	err = get_nodes(&old, old_nodes, maxnode);
903 	if (err)
904 		return err;
905 
906 	err = get_nodes(&new, new_nodes, maxnode);
907 	if (err)
908 		return err;
909 
910 	/* Find the mm_struct */
911 	read_lock(&tasklist_lock);
912 	task = pid ? find_task_by_pid(pid) : current;
913 	if (!task) {
914 		read_unlock(&tasklist_lock);
915 		return -ESRCH;
916 	}
917 	mm = get_task_mm(task);
918 	read_unlock(&tasklist_lock);
919 
920 	if (!mm)
921 		return -EINVAL;
922 
923 	/*
924 	 * Check if this process has the right to modify the specified
925 	 * process. The right exists if the process has administrative
926  * capabilities, superuser privileges or the same
927 	 * userid as the target process.
928 	 */
929 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
930 	    (current->uid != task->suid) && (current->uid != task->uid) &&
931 	    !capable(CAP_SYS_ADMIN)) {
932 		err = -EPERM;
933 		goto out;
934 	}
935 
936 	task_nodes = cpuset_mems_allowed(task);
937 	/* Is the user allowed to access the target nodes? */
938 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
939 		err = -EPERM;
940 		goto out;
941 	}
942 
943 	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
944 out:
945 	mmput(mm);
946 	return err;
947 }
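/*
 * Illustrative sketch (not authoritative) of the matching user space call,
 * moving every page of process 1234 from node 0 to node 1:
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long left = migrate_pages(1234, sizeof(from) * 8, &from, &to);
 *
 * left is the number of pages that could not be moved, or a negative errno.
 */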
948 
949 
950 /* Retrieve NUMA policy */
951 asmlinkage long sys_get_mempolicy(int __user *policy,
952 				unsigned long __user *nmask,
953 				unsigned long maxnode,
954 				unsigned long addr, unsigned long flags)
955 {
956 	int err, pval;
957 	nodemask_t nodes;
958 
959 	if (nmask != NULL && maxnode < MAX_NUMNODES)
960 		return -EINVAL;
961 
962 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
963 
964 	if (err)
965 		return err;
966 
967 	if (policy && put_user(pval, policy))
968 		return -EFAULT;
969 
970 	if (nmask)
971 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
972 
973 	return err;
974 }
975 
976 #ifdef CONFIG_COMPAT
977 
978 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
979 				     compat_ulong_t __user *nmask,
980 				     compat_ulong_t maxnode,
981 				     compat_ulong_t addr, compat_ulong_t flags)
982 {
983 	long err;
984 	unsigned long __user *nm = NULL;
985 	unsigned long nr_bits, alloc_size;
986 	DECLARE_BITMAP(bm, MAX_NUMNODES);
987 
988 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
989 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
990 
991 	if (nmask)
992 		nm = compat_alloc_user_space(alloc_size);
993 
994 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
995 
996 	if (!err && nmask) {
997 		err = copy_from_user(bm, nm, alloc_size);
998 		/* ensure entire bitmap is zeroed */
999 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1000 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1001 	}
1002 
1003 	return err;
1004 }
1005 
1006 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1007 				     compat_ulong_t maxnode)
1008 {
1009 	long err = 0;
1010 	unsigned long __user *nm = NULL;
1011 	unsigned long nr_bits, alloc_size;
1012 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1013 
1014 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1015 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1016 
1017 	if (nmask) {
1018 		err = compat_get_bitmap(bm, nmask, nr_bits);
1019 		nm = compat_alloc_user_space(alloc_size);
1020 		err |= copy_to_user(nm, bm, alloc_size);
1021 	}
1022 
1023 	if (err)
1024 		return -EFAULT;
1025 
1026 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1027 }
1028 
1029 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1030 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1031 			     compat_ulong_t maxnode, compat_ulong_t flags)
1032 {
1033 	long err = 0;
1034 	unsigned long __user *nm = NULL;
1035 	unsigned long nr_bits, alloc_size;
1036 	nodemask_t bm;
1037 
1038 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1039 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1040 
1041 	if (nmask) {
1042 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1043 		nm = compat_alloc_user_space(alloc_size);
1044 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1045 	}
1046 
1047 	if (err)
1048 		return -EFAULT;
1049 
1050 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1051 }
1052 
1053 #endif
1054 
1055 /* Return effective policy for a VMA */
1056 static struct mempolicy * get_vma_policy(struct task_struct *task,
1057 		struct vm_area_struct *vma, unsigned long addr)
1058 {
1059 	struct mempolicy *pol = task->mempolicy;
1060 
1061 	if (vma) {
1062 		if (vma->vm_ops && vma->vm_ops->get_policy)
1063 			pol = vma->vm_ops->get_policy(vma, addr);
1064 		else if (vma->vm_policy &&
1065 				vma->vm_policy->policy != MPOL_DEFAULT)
1066 			pol = vma->vm_policy;
1067 	}
1068 	if (!pol)
1069 		pol = &default_policy;
1070 	return pol;
1071 }
1072 
1073 /* Return a zonelist representing a mempolicy */
1074 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1075 {
1076 	int nd;
1077 
1078 	switch (policy->policy) {
1079 	case MPOL_PREFERRED:
1080 		nd = policy->v.preferred_node;
1081 		if (nd < 0)
1082 			nd = numa_node_id();
1083 		break;
1084 	case MPOL_BIND:
1085 		/* Lower zones don't get a policy applied */
1086 		/* Careful: current->mems_allowed might have moved */
1087 		if (gfp_zone(gfp) >= policy_zone)
1088 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1089 				return policy->v.zonelist;
1090 		/*FALL THROUGH*/
1091 	case MPOL_INTERLEAVE: /* should not happen */
1092 	case MPOL_DEFAULT:
1093 		nd = numa_node_id();
1094 		break;
1095 	default:
1096 		nd = 0;
1097 		BUG();
1098 	}
1099 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1100 }
1101 
1102 /* Do dynamic interleaving for a process */
1103 static unsigned interleave_nodes(struct mempolicy *policy)
1104 {
1105 	unsigned nid, next;
1106 	struct task_struct *me = current;
1107 
1108 	nid = me->il_next;
1109 	next = next_node(nid, policy->v.nodes);
1110 	if (next >= MAX_NUMNODES)
1111 		next = first_node(policy->v.nodes);
1112 	me->il_next = next;
1113 	return nid;
1114 }
1115 
1116 /*
1117  * Depending on the memory policy provide a node from which to allocate the
1118  * next slab entry.
1119  */
1120 unsigned slab_node(struct mempolicy *policy)
1121 {
1122 	switch (policy->policy) {
1123 	case MPOL_INTERLEAVE:
1124 		return interleave_nodes(policy);
1125 
1126 	case MPOL_BIND:
1127 		/*
1128 		 * Follow bind policy behavior and start allocation at the
1129 		 * first node.
1130 		 */
1131 		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1132 
1133 	case MPOL_PREFERRED:
1134 		if (policy->v.preferred_node >= 0)
1135 			return policy->v.preferred_node;
1136 		/* Fall through */
1137 
1138 	default:
1139 		return numa_node_id();
1140 	}
1141 }
1142 
1143 /* Do static interleaving for a VMA with known offset. */
1144 static unsigned offset_il_node(struct mempolicy *pol,
1145 		struct vm_area_struct *vma, unsigned long off)
1146 {
1147 	unsigned nnodes = nodes_weight(pol->v.nodes);
1148 	unsigned target = (unsigned)off % nnodes;
1149 	int c;
1150 	int nid = -1;
1151 
1152 	c = 0;
1153 	do {
1154 		nid = next_node(nid, pol->v.nodes);
1155 		c++;
1156 	} while (c <= target);
1157 	return nid;
1158 }
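/*
 * Worked example with made-up numbers: for pol->v.nodes = {1,3,5} the
 * weight is 3, so off = 10 gives target = 10 % 3 = 1 and the loop calls
 * next_node() twice, returning node 3.  Because the result depends only on
 * the offset, the same page offset always maps to the same node, which is
 * what makes this interleaving "static".
 */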
1159 
1160 /* Determine a node number for interleave */
1161 static inline unsigned interleave_nid(struct mempolicy *pol,
1162 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1163 {
1164 	if (vma) {
1165 		unsigned long off;
1166 
1167 		off = vma->vm_pgoff;
1168 		off += (addr - vma->vm_start) >> shift;
1169 		return offset_il_node(pol, vma, off);
1170 	} else
1171 		return interleave_nodes(pol);
1172 }
1173 
1174 #ifdef CONFIG_HUGETLBFS
1175 /* Return a zonelist suitable for a huge page allocation. */
1176 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1177 {
1178 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1179 
1180 	if (pol->policy == MPOL_INTERLEAVE) {
1181 		unsigned nid;
1182 
1183 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1184 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1185 	}
1186 	return zonelist_policy(GFP_HIGHUSER, pol);
1187 }
1188 #endif
1189 
1190 /* Allocate a page in interleaved policy.
1191    Own path because it needs to do special accounting. */
1192 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1193 					unsigned nid)
1194 {
1195 	struct zonelist *zl;
1196 	struct page *page;
1197 
1198 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1199 	page = __alloc_pages(gfp, order, zl);
1200 	if (page && page_zone(page) == zl->zones[0]) {
1201 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1202 		put_cpu();
1203 	}
1204 	return page;
1205 }
1206 
1207 /**
1208  * 	alloc_page_vma	- Allocate a page for a VMA.
1209  *
1210  * 	@gfp:
1211  *      %GFP_USER    user allocation.
1212  *      %GFP_KERNEL  kernel allocations,
1213  *      %GFP_HIGHMEM highmem/user allocations,
1214  *      %GFP_FS      allocation should not call back into a file system.
1215  *      %GFP_ATOMIC  don't sleep.
1216  *
1217  * 	@vma:  Pointer to VMA or NULL if not available.
1218  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1219  *
1220  * 	This function allocates a page from the kernel page pool and applies
1221  *	a NUMA policy associated with the VMA or the current process.
1222  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1223  *	mm_struct of the VMA to prevent it from going away. Should be used for
1224  *	all allocations for pages that will be mapped into
1225  * 	user space. Returns NULL when no page can be allocated.
1226  *
1227  *	Should be called with the mmap_sem of the vma held.
1228  */
1229 struct page *
1230 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1231 {
1232 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1233 
1234 	cpuset_update_task_memory_state();
1235 
1236 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1237 		unsigned nid;
1238 
1239 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1240 		return alloc_page_interleave(gfp, 0, nid);
1241 	}
1242 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1243 }
1244 
1245 /**
1246  * 	alloc_pages_current - Allocate pages.
1247  *
1248  *	@gfp:
1249  *		%GFP_USER   user allocation,
1250  *      	%GFP_KERNEL kernel allocation,
1251  *      	%GFP_HIGHMEM highmem allocation,
1252  *      	%GFP_FS     don't call back into a file system.
1253  *      	%GFP_ATOMIC don't sleep.
1254  *	@order: Power of two of allocation size in pages. 0 is a single page.
1255  *
1256  *	Allocate a page from the kernel page pool.  When not in
1257  *	interrupt context, apply the current process' NUMA policy.
1258  *	Returns NULL when no page can be allocated.
1259  *
1260  *	Don't call cpuset_update_task_memory_state() unless
1261  *	1) it's ok to take cpuset_sem (can WAIT), and
1262  *	2) allocating for current task (not interrupt).
1263  */
1264 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1265 {
1266 	struct mempolicy *pol = current->mempolicy;
1267 
1268 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1269 		cpuset_update_task_memory_state();
1270 	if (!pol || in_interrupt())
1271 		pol = &default_policy;
1272 	if (pol->policy == MPOL_INTERLEAVE)
1273 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1274 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1275 }
1276 EXPORT_SYMBOL(alloc_pages_current);
1277 
1278 /*
1279  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1280  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1281  * with the mems_allowed returned by cpuset_mems_allowed().  This
1282  * keeps mempolicies cpuset relative after its cpuset moves.  See
1283  * further kernel/cpuset.c update_nodemask().
1284  */
1285 void *cpuset_being_rebound;
1286 
1287 /* Slow path of a mempolicy copy */
1288 struct mempolicy *__mpol_copy(struct mempolicy *old)
1289 {
1290 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1291 
1292 	if (!new)
1293 		return ERR_PTR(-ENOMEM);
1294 	if (current_cpuset_is_being_rebound()) {
1295 		nodemask_t mems = cpuset_mems_allowed(current);
1296 		mpol_rebind_policy(old, &mems);
1297 	}
1298 	*new = *old;
1299 	atomic_set(&new->refcnt, 1);
1300 	if (new->policy == MPOL_BIND) {
1301 		int sz = ksize(old->v.zonelist);
1302 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1303 		if (!new->v.zonelist) {
1304 			kmem_cache_free(policy_cache, new);
1305 			return ERR_PTR(-ENOMEM);
1306 		}
1307 		memcpy(new->v.zonelist, old->v.zonelist, sz);
1308 	}
1309 	return new;
1310 }
1311 
1312 /* Slow path of a mempolicy comparison */
1313 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1314 {
1315 	if (!a || !b)
1316 		return 0;
1317 	if (a->policy != b->policy)
1318 		return 0;
1319 	switch (a->policy) {
1320 	case MPOL_DEFAULT:
1321 		return 1;
1322 	case MPOL_INTERLEAVE:
1323 		return nodes_equal(a->v.nodes, b->v.nodes);
1324 	case MPOL_PREFERRED:
1325 		return a->v.preferred_node == b->v.preferred_node;
1326 	case MPOL_BIND: {
1327 		int i;
1328 		for (i = 0; a->v.zonelist->zones[i]; i++)
1329 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1330 				return 0;
1331 		return b->v.zonelist->zones[i] == NULL;
1332 	}
1333 	default:
1334 		BUG();
1335 		return 0;
1336 	}
1337 }
1338 
1339 /* Slow path of a mpol destructor. */
1340 void __mpol_free(struct mempolicy *p)
1341 {
1342 	if (!atomic_dec_and_test(&p->refcnt))
1343 		return;
1344 	if (p->policy == MPOL_BIND)
1345 		kfree(p->v.zonelist);
1346 	p->policy = MPOL_DEFAULT;
1347 	kmem_cache_free(policy_cache, p);
1348 }
1349 
1350 /*
1351  * Shared memory backing store policy support.
1352  *
1353  * Remember policies even when nobody has shared memory mapped.
1354  * The policies are kept in Red-Black tree linked from the inode.
1355  * They are protected by the sp->lock spinlock, which should be held
1356  * for any accesses to the tree.
1357  */
1358 
1359 /* lookup first element intersecting start-end */
1360 /* Caller holds sp->lock */
1361 static struct sp_node *
1362 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1363 {
1364 	struct rb_node *n = sp->root.rb_node;
1365 
1366 	while (n) {
1367 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1368 
1369 		if (start >= p->end)
1370 			n = n->rb_right;
1371 		else if (end <= p->start)
1372 			n = n->rb_left;
1373 		else
1374 			break;
1375 	}
1376 	if (!n)
1377 		return NULL;
1378 	for (;;) {
1379 		struct sp_node *w = NULL;
1380 		struct rb_node *prev = rb_prev(n);
1381 		if (!prev)
1382 			break;
1383 		w = rb_entry(prev, struct sp_node, nd);
1384 		if (w->end <= start)
1385 			break;
1386 		n = prev;
1387 	}
1388 	return rb_entry(n, struct sp_node, nd);
1389 }
1390 
1391 /* Insert a new shared policy into the list. */
1392 /* Caller holds sp->lock */
1393 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1394 {
1395 	struct rb_node **p = &sp->root.rb_node;
1396 	struct rb_node *parent = NULL;
1397 	struct sp_node *nd;
1398 
1399 	while (*p) {
1400 		parent = *p;
1401 		nd = rb_entry(parent, struct sp_node, nd);
1402 		if (new->start < nd->start)
1403 			p = &(*p)->rb_left;
1404 		else if (new->end > nd->end)
1405 			p = &(*p)->rb_right;
1406 		else
1407 			BUG();
1408 	}
1409 	rb_link_node(&new->nd, parent, p);
1410 	rb_insert_color(&new->nd, &sp->root);
1411 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1412 		 new->policy ? new->policy->policy : 0);
1413 }
1414 
1415 /* Find shared policy intersecting idx */
1416 struct mempolicy *
1417 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1418 {
1419 	struct mempolicy *pol = NULL;
1420 	struct sp_node *sn;
1421 
1422 	if (!sp->root.rb_node)
1423 		return NULL;
1424 	spin_lock(&sp->lock);
1425 	sn = sp_lookup(sp, idx, idx+1);
1426 	if (sn) {
1427 		mpol_get(sn->policy);
1428 		pol = sn->policy;
1429 	}
1430 	spin_unlock(&sp->lock);
1431 	return pol;
1432 }
1433 
1434 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1435 {
1436 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1437 	rb_erase(&n->nd, &sp->root);
1438 	mpol_free(n->policy);
1439 	kmem_cache_free(sn_cache, n);
1440 }
1441 
1442 struct sp_node *
1443 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1444 {
1445 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1446 
1447 	if (!n)
1448 		return NULL;
1449 	n->start = start;
1450 	n->end = end;
1451 	mpol_get(pol);
1452 	n->policy = pol;
1453 	return n;
1454 }
1455 
1456 /* Replace a policy range. */
1457 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1458 				 unsigned long end, struct sp_node *new)
1459 {
1460 	struct sp_node *n, *new2 = NULL;
1461 
1462 restart:
1463 	spin_lock(&sp->lock);
1464 	n = sp_lookup(sp, start, end);
1465 	/* Take care of old policies in the same range. */
1466 	while (n && n->start < end) {
1467 		struct rb_node *next = rb_next(&n->nd);
1468 		if (n->start >= start) {
1469 			if (n->end <= end)
1470 				sp_delete(sp, n);
1471 			else
1472 				n->start = end;
1473 		} else {
1474 			/* Old policy spanning whole new range. */
1475 			if (n->end > end) {
1476 				if (!new2) {
1477 					spin_unlock(&sp->lock);
1478 					new2 = sp_alloc(end, n->end, n->policy);
1479 					if (!new2)
1480 						return -ENOMEM;
1481 					goto restart;
1482 				}
1483 				n->end = start;
1484 				sp_insert(sp, new2);
1485 				new2 = NULL;
1486 				break;
1487 			} else
1488 				n->end = start;
1489 		}
1490 		if (!next)
1491 			break;
1492 		n = rb_entry(next, struct sp_node, nd);
1493 	}
1494 	if (new)
1495 		sp_insert(sp, new);
1496 	spin_unlock(&sp->lock);
1497 	if (new2) {
1498 		mpol_free(new2->policy);
1499 		kmem_cache_free(sn_cache, new2);
1500 	}
1501 	return 0;
1502 }
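/*
 * Worked example with made-up ranges: if the tree holds a single node
 * covering pages [0, 100) and we replace [40, 60), the old entry is
 * trimmed to [0, 40), a copy of its policy is inserted as new2 for
 * [60, 100), and the new policy is inserted for [40, 60).
 */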
1503 
1504 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1505 				nodemask_t *policy_nodes)
1506 {
1507 	info->root = RB_ROOT;
1508 	spin_lock_init(&info->lock);
1509 
1510 	if (policy != MPOL_DEFAULT) {
1511 		struct mempolicy *newpol;
1512 
1513 		/* Falls back to MPOL_DEFAULT on any error */
1514 		newpol = mpol_new(policy, policy_nodes);
1515 		if (!IS_ERR(newpol)) {
1516 			/* Create pseudo-vma that contains just the policy */
1517 			struct vm_area_struct pvma;
1518 
1519 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1520 			/* Policy covers entire file */
1521 			pvma.vm_end = TASK_SIZE;
1522 			mpol_set_shared_policy(info, &pvma, newpol);
1523 			mpol_free(newpol);
1524 		}
1525 	}
1526 }
1527 
1528 int mpol_set_shared_policy(struct shared_policy *info,
1529 			struct vm_area_struct *vma, struct mempolicy *npol)
1530 {
1531 	int err;
1532 	struct sp_node *new = NULL;
1533 	unsigned long sz = vma_pages(vma);
1534 
1535 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1536 		 vma->vm_pgoff,
1537 		 sz, npol? npol->policy : -1,
1538 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1539 
1540 	if (npol) {
1541 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1542 		if (!new)
1543 			return -ENOMEM;
1544 	}
1545 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1546 	if (err && new)
1547 		kmem_cache_free(sn_cache, new);
1548 	return err;
1549 }
1550 
1551 /* Free a backing policy store on inode delete. */
1552 void mpol_free_shared_policy(struct shared_policy *p)
1553 {
1554 	struct sp_node *n;
1555 	struct rb_node *next;
1556 
1557 	if (!p->root.rb_node)
1558 		return;
1559 	spin_lock(&p->lock);
1560 	next = rb_first(&p->root);
1561 	while (next) {
1562 		n = rb_entry(next, struct sp_node, nd);
1563 		next = rb_next(&n->nd);
1564 		rb_erase(&n->nd, &p->root);
1565 		mpol_free(n->policy);
1566 		kmem_cache_free(sn_cache, n);
1567 	}
1568 	spin_unlock(&p->lock);
1569 }
1570 
1571 /* assumes fs == KERNEL_DS */
1572 void __init numa_policy_init(void)
1573 {
1574 	policy_cache = kmem_cache_create("numa_policy",
1575 					 sizeof(struct mempolicy),
1576 					 0, SLAB_PANIC, NULL, NULL);
1577 
1578 	sn_cache = kmem_cache_create("shared_policy_node",
1579 				     sizeof(struct sp_node),
1580 				     0, SLAB_PANIC, NULL, NULL);
1581 
1582 	/* Set interleaving policy for system init. This way not all
1583 	   the data structures allocated at system boot end up in node zero. */
1584 
1585 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1586 		printk("numa_policy_init: interleaving failed\n");
1587 }
1588 
1589 /* Reset policy of current process to default */
1590 void numa_default_policy(void)
1591 {
1592 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1593 }
1594 
1595 /* Migrate a policy to a different set of nodes */
1596 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1597 {
1598 	nodemask_t *mpolmask;
1599 	nodemask_t tmp;
1600 
1601 	if (!pol)
1602 		return;
1603 	mpolmask = &pol->cpuset_mems_allowed;
1604 	if (nodes_equal(*mpolmask, *newmask))
1605 		return;
1606 
1607 	switch (pol->policy) {
1608 	case MPOL_DEFAULT:
1609 		break;
1610 	case MPOL_INTERLEAVE:
1611 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1612 		pol->v.nodes = tmp;
1613 		*mpolmask = *newmask;
1614 		current->il_next = node_remap(current->il_next,
1615 						*mpolmask, *newmask);
1616 		break;
1617 	case MPOL_PREFERRED:
1618 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1619 						*mpolmask, *newmask);
1620 		*mpolmask = *newmask;
1621 		break;
1622 	case MPOL_BIND: {
1623 		nodemask_t nodes;
1624 		struct zone **z;
1625 		struct zonelist *zonelist;
1626 
1627 		nodes_clear(nodes);
1628 		for (z = pol->v.zonelist->zones; *z; z++)
1629 			node_set((*z)->zone_pgdat->node_id, nodes);
1630 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1631 		nodes = tmp;
1632 
1633 		zonelist = bind_zonelist(&nodes);
1634 
1635 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1636 		 * If that old zonelist has no remaining mems_allowed nodes,
1637 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1638 		 */
1639 
1640 		if (zonelist) {
1641 			/* Good - got mem - substitute new zonelist */
1642 			kfree(pol->v.zonelist);
1643 			pol->v.zonelist = zonelist;
1644 		}
1645 		*mpolmask = *newmask;
1646 		break;
1647 	}
1648 	default:
1649 		BUG();
1650 		break;
1651 	}
1652 }
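/*
 * Worked example with made-up masks: a task interleaving over nodes {0,1}
 * whose cpuset is moved from mems_allowed {0,1} to {2,3} ends up
 * interleaving over {2,3}; nodes_remap() maps the Nth set bit of the old
 * mask onto the Nth set bit of the new one.
 */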
1653 
1654 /*
1655  * Wrapper for mpol_rebind_policy() that just requires task
1656  * pointer, and updates task mempolicy.
1657  */
1658 
1659 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1660 {
1661 	mpol_rebind_policy(tsk->mempolicy, new);
1662 }
1663 
1664 /*
1665  * Rebind each vma in mm to new nodemask.
1666  *
1667  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1668  */
1669 
1670 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1671 {
1672 	struct vm_area_struct *vma;
1673 
1674 	down_write(&mm->mmap_sem);
1675 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1676 		mpol_rebind_policy(vma->vm_policy, new);
1677 	up_write(&mm->mmap_sem);
1678 }
1679 
1680 /*
1681  * Display pages allocated per node and memory policy via /proc.
1682  */
1683 
1684 static const char *policy_types[] = { "default", "prefer", "bind",
1685 				      "interleave" };
1686 
1687 /*
1688  * Convert a mempolicy into a string.
1689  * Returns the number of characters in buffer (if positive)
1690  * or an error (negative)
1691  */
1692 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1693 {
1694 	char *p = buffer;
1695 	int l;
1696 	nodemask_t nodes;
1697 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1698 
1699 	switch (mode) {
1700 	case MPOL_DEFAULT:
1701 		nodes_clear(nodes);
1702 		break;
1703 
1704 	case MPOL_PREFERRED:
1705 		nodes_clear(nodes);
1706 		node_set(pol->v.preferred_node, nodes);
1707 		break;
1708 
1709 	case MPOL_BIND:
1710 		get_zonemask(pol, &nodes);
1711 		break;
1712 
1713 	case MPOL_INTERLEAVE:
1714 		nodes = pol->v.nodes;
1715 		break;
1716 
1717 	default:
1718 		BUG();
1719 		return -EFAULT;
1720 	}
1721 
1722 	l = strlen(policy_types[mode]);
1723  	if (buffer + maxlen < p + l + 1)
1724  		return -ENOSPC;
1725 
1726 	strcpy(p, policy_types[mode]);
1727 	p += l;
1728 
1729 	if (!nodes_empty(nodes)) {
1730 		if (buffer + maxlen < p + 2)
1731 			return -ENOSPC;
1732 		*p++ = '=';
1733 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1734 	}
1735 	return p - buffer;
1736 }
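/*
 * Example output (illustrative): an interleave policy over nodes 0-3 is
 * rendered as "interleave=0-3", a preferred policy for node 2 as
 * "prefer=2", and the default policy as plain "default".
 */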
1737 
1738 struct numa_maps {
1739 	unsigned long pages;
1740 	unsigned long anon;
1741 	unsigned long mapped;
1742 	unsigned long mapcount_max;
1743 	unsigned long node[MAX_NUMNODES];
1744 };
1745 
1746 static void gather_stats(struct page *page, void *private)
1747 {
1748 	struct numa_maps *md = private;
1749 	int count = page_mapcount(page);
1750 
1751 	if (count)
1752 		md->mapped++;
1753 
1754 	if (count > md->mapcount_max)
1755 		md->mapcount_max = count;
1756 
1757 	md->pages++;
1758 
1759 	if (PageAnon(page))
1760 		md->anon++;
1761 
1762 	md->node[page_to_nid(page)]++;
1763 	cond_resched();
1764 }
1765 
1766 int show_numa_map(struct seq_file *m, void *v)
1767 {
1768 	struct task_struct *task = m->private;
1769 	struct vm_area_struct *vma = v;
1770 	struct numa_maps *md;
1771 	int n;
1772 	char buffer[50];
1773 
1774 	if (!vma->vm_mm)
1775 		return 0;
1776 
1777 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1778 	if (!md)
1779 		return 0;
1780 
1781 	check_pgd_range(vma, vma->vm_start, vma->vm_end,
1782 		    &node_online_map, MPOL_MF_STATS, md);
1783 
1784 	if (md->pages) {
1785 		mpol_to_str(buffer, sizeof(buffer),
1786 			    get_vma_policy(task, vma, vma->vm_start));
1787 
1788 		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1789 			   vma->vm_start, buffer, md->pages,
1790 			   md->mapped, md->mapcount_max);
1791 
1792 		if (md->anon)
1793 			seq_printf(m," anon=%lu",md->anon);
1794 
1795 		for_each_online_node(n)
1796 			if (md->node[n])
1797 				seq_printf(m, " N%d=%lu", n, md->node[n]);
1798 
1799 		seq_putc(m, '\n');
1800 	}
1801 	kfree(md);
1802 
1803 	if (m->count < m->size)
1804 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1805 	return 0;
1806 }
1807 
1808