xref: /openbmc/linux/mm/mempolicy.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory
9  * should be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and moving on to the last. It would be better if bind truly
26  *                restricted the allocation to the given memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case, node -1 here means do the allocation
30  *                on the node local to the current CPU. This is normally
31  *                identical to default, but useful to set in a VMA when you
32  *                have a non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
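
/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are typically requested.  This assumes the numaif.h
 * wrappers around the set_mempolicy()/mbind() syscalls as shipped with
 * libnuma; names and headers may differ on other setups.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	(interleave this process' future allocations over nodes 0 and 1)
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1))
 *		perror("set_mempolicy");
 */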
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger the OOM killer much faster and
65    the kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <asm/tlbflush.h>
87 #include <asm/uaccess.h>
88 
89 static kmem_cache_t *policy_cache;
90 static kmem_cache_t *sn_cache;
91 
92 #define PDprintk(fmt...)
93 
94 /* Highest zone. A specific allocation for a zone below that is not
95    policied. */
96 static int policy_zone;
97 
98 struct mempolicy default_policy = {
99 	.refcnt = ATOMIC_INIT(1), /* never free it */
100 	.policy = MPOL_DEFAULT,
101 };
102 
103 /* Do sanity checking on a policy */
104 static int mpol_check_policy(int mode, nodemask_t *nodes)
105 {
106 	int empty = nodes_empty(*nodes);
107 
108 	switch (mode) {
109 	case MPOL_DEFAULT:
110 		if (!empty)
111 			return -EINVAL;
112 		break;
113 	case MPOL_BIND:
114 	case MPOL_INTERLEAVE:
115 		/* Preferred will only use the first bit, but allow
116 		   more for now. */
117 		if (empty)
118 			return -EINVAL;
119 		break;
120 	}
121 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
122 }
123 /* Generate a custom zonelist for the BIND policy. */
124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
125 {
126 	struct zonelist *zl;
127 	int num, max, nd;
128 
129 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131 	if (!zl)
132 		return NULL;
133 	num = 0;
134 	for_each_node_mask(nd, *nodes) {
135 		int k;
136 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 			if (!z->present_pages)
139 				continue;
140 			zl->zones[num++] = z;
141 			if (k > policy_zone)
142 				policy_zone = k;
143 		}
144 	}
145 	zl->zones[num] = NULL;
146 	return zl;
147 }
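
/*
 * Worked example (illustrative): for nodes = {0,2} on a box where every
 * node has ZONE_DMA and ZONE_NORMAL populated, the loop above produces
 * zones[] = { node0/NORMAL, node0/DMA, node2/NORMAL, node2/DMA, NULL },
 * i.e. the zones of one node are tried from highest to lowest before
 * moving on to the next node in the mask.
 */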
148 
149 /* Create a new policy */
150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151 {
152 	struct mempolicy *policy;
153 
154 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 	if (mode == MPOL_DEFAULT)
156 		return NULL;
157 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 	if (!policy)
159 		return ERR_PTR(-ENOMEM);
160 	atomic_set(&policy->refcnt, 1);
161 	switch (mode) {
162 	case MPOL_INTERLEAVE:
163 		policy->v.nodes = *nodes;
164 		break;
165 	case MPOL_PREFERRED:
166 		policy->v.preferred_node = first_node(*nodes);
167 		if (policy->v.preferred_node >= MAX_NUMNODES)
168 			policy->v.preferred_node = -1;
169 		break;
170 	case MPOL_BIND:
171 		policy->v.zonelist = bind_zonelist(nodes);
172 		if (policy->v.zonelist == NULL) {
173 			kmem_cache_free(policy_cache, policy);
174 			return ERR_PTR(-ENOMEM);
175 		}
176 		break;
177 	}
178 	policy->policy = mode;
179 	return policy;
180 }
181 
182 /* Ensure all existing pages follow the policy. */
183 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 		unsigned long addr, unsigned long end, nodemask_t *nodes)
185 {
186 	pte_t *orig_pte;
187 	pte_t *pte;
188 	spinlock_t *ptl;
189 
190 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 	do {
192 		unsigned long pfn;
193 		unsigned int nid;
194 
195 		if (!pte_present(*pte))
196 			continue;
197 		pfn = pte_pfn(*pte);
198 		if (!pfn_valid(pfn)) {
199 			print_bad_pte(vma, *pte, addr);
200 			continue;
201 		}
202 		nid = pfn_to_nid(pfn);
203 		if (!node_isset(nid, *nodes))
204 			break;
205 	} while (pte++, addr += PAGE_SIZE, addr != end);
206 	pte_unmap_unlock(orig_pte, ptl);
207 	return addr != end;
208 }
209 
210 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211 		unsigned long addr, unsigned long end, nodemask_t *nodes)
212 {
213 	pmd_t *pmd;
214 	unsigned long next;
215 
216 	pmd = pmd_offset(pud, addr);
217 	do {
218 		next = pmd_addr_end(addr, end);
219 		if (pmd_none_or_clear_bad(pmd))
220 			continue;
221 		if (check_pte_range(vma, pmd, addr, next, nodes))
222 			return -EIO;
223 	} while (pmd++, addr = next, addr != end);
224 	return 0;
225 }
226 
227 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228 		unsigned long addr, unsigned long end, nodemask_t *nodes)
229 {
230 	pud_t *pud;
231 	unsigned long next;
232 
233 	pud = pud_offset(pgd, addr);
234 	do {
235 		next = pud_addr_end(addr, end);
236 		if (pud_none_or_clear_bad(pud))
237 			continue;
238 		if (check_pmd_range(vma, pud, addr, next, nodes))
239 			return -EIO;
240 	} while (pud++, addr = next, addr != end);
241 	return 0;
242 }
243 
244 static inline int check_pgd_range(struct vm_area_struct *vma,
245 		unsigned long addr, unsigned long end, nodemask_t *nodes)
246 {
247 	pgd_t *pgd;
248 	unsigned long next;
249 
250 	pgd = pgd_offset(vma->vm_mm, addr);
251 	do {
252 		next = pgd_addr_end(addr, end);
253 		if (pgd_none_or_clear_bad(pgd))
254 			continue;
255 		if (check_pud_range(vma, pgd, addr, next, nodes))
256 			return -EIO;
257 	} while (pgd++, addr = next, addr != end);
258 	return 0;
259 }
260 
261 /* Step 1: check the range */
262 static struct vm_area_struct *
263 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264 	    nodemask_t *nodes, unsigned long flags)
265 {
266 	int err;
267 	struct vm_area_struct *first, *vma, *prev;
268 
269 	first = find_vma(mm, start);
270 	if (!first)
271 		return ERR_PTR(-EFAULT);
272 	if (first->vm_flags & VM_RESERVED)
273 		return ERR_PTR(-EACCES);
274 	prev = NULL;
275 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276 		if (!vma->vm_next && vma->vm_end < end)
277 			return ERR_PTR(-EFAULT);
278 		if (prev && prev->vm_end < vma->vm_start)
279 			return ERR_PTR(-EFAULT);
280 		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
281 			unsigned long endvma = vma->vm_end;
282 			if (endvma > end)
283 				endvma = end;
284 			if (vma->vm_start > start)
285 				start = vma->vm_start;
286 			err = check_pgd_range(vma, start, endvma, nodes);
287 			if (err) {
288 				first = ERR_PTR(err);
289 				break;
290 			}
291 		}
292 		prev = vma;
293 	}
294 	return first;
295 }
296 
297 /* Apply policy to a single VMA */
298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
299 {
300 	int err = 0;
301 	struct mempolicy *old = vma->vm_policy;
302 
303 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
304 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
305 		 vma->vm_ops, vma->vm_file,
306 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
307 
308 	if (vma->vm_ops && vma->vm_ops->set_policy)
309 		err = vma->vm_ops->set_policy(vma, new);
310 	if (!err) {
311 		mpol_get(new);
312 		vma->vm_policy = new;
313 		mpol_free(old);
314 	}
315 	return err;
316 }
317 
318 /* Step 2: apply policy to a range and do splits. */
319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
320 		       unsigned long end, struct mempolicy *new)
321 {
322 	struct vm_area_struct *next;
323 	int err;
324 
325 	err = 0;
326 	for (; vma && vma->vm_start < end; vma = next) {
327 		next = vma->vm_next;
328 		if (vma->vm_start < start)
329 			err = split_vma(vma->vm_mm, vma, start, 1);
330 		if (!err && vma->vm_end > end)
331 			err = split_vma(vma->vm_mm, vma, end, 0);
332 		if (!err)
333 			err = policy_vma(vma, new);
334 		if (err)
335 			break;
336 	}
337 	return err;
338 }
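
/*
 * Worked example (illustrative): with a single VMA covering
 * [0x1000, 0x9000) and an mbind() request for [0x3000, 0x6000), the loop
 * above splits the VMA twice, leaving [0x1000, 0x3000), [0x3000, 0x6000)
 * and [0x6000, 0x9000); the new policy is then applied only to the middle
 * piece via policy_vma().
 */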
339 
340 static int contextualize_policy(int mode, nodemask_t *nodes)
341 {
342 	if (!nodes)
343 		return 0;
344 
345 	/* Update current mems_allowed */
346 	cpuset_update_current_mems_allowed();
347 	/* Ignore nodes not set in current->mems_allowed */
348 	cpuset_restrict_to_mems_allowed(nodes->bits);
349 	return mpol_check_policy(mode, nodes);
350 }
351 
352 long do_mbind(unsigned long start, unsigned long len,
353 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
354 {
355 	struct vm_area_struct *vma;
356 	struct mm_struct *mm = current->mm;
357 	struct mempolicy *new;
358 	unsigned long end;
359 	int err;
360 
361 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
362 		return -EINVAL;
363 	if (start & ~PAGE_MASK)
364 		return -EINVAL;
365 	if (mode == MPOL_DEFAULT)
366 		flags &= ~MPOL_MF_STRICT;
367 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
368 	end = start + len;
369 	if (end < start)
370 		return -EINVAL;
371 	if (end == start)
372 		return 0;
373 	if (mpol_check_policy(mode, nmask))
374 		return -EINVAL;
375 	new = mpol_new(mode, nmask);
376 	if (IS_ERR(new))
377 		return PTR_ERR(new);
378 
379 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
380 			mode, nodes_addr(*nmask)[0]);
381 
382 	down_write(&mm->mmap_sem);
383 	vma = check_range(mm, start, end, nmask, flags);
384 	err = PTR_ERR(vma);
385 	if (!IS_ERR(vma))
386 		err = mbind_range(vma, start, end, new);
387 	up_write(&mm->mmap_sem);
388 	mpol_free(new);
389 	return err;
390 }
391 
392 /* Set the process memory policy */
393 long do_set_mempolicy(int mode, nodemask_t *nodes)
394 {
395 	struct mempolicy *new;
396 
397 	if (contextualize_policy(mode, nodes))
398 		return -EINVAL;
399 	new = mpol_new(mode, nodes);
400 	if (IS_ERR(new))
401 		return PTR_ERR(new);
402 	mpol_free(current->mempolicy);
403 	current->mempolicy = new;
404 	if (new && new->policy == MPOL_INTERLEAVE)
405 		current->il_next = first_node(new->v.nodes);
406 	return 0;
407 }
408 
409 /* Fill a node mask with the nodes used by a policy */
410 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
411 {
412 	int i;
413 
414 	nodes_clear(*nodes);
415 	switch (p->policy) {
416 	case MPOL_BIND:
417 		for (i = 0; p->v.zonelist->zones[i]; i++)
418 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
419 				*nodes);
420 		break;
421 	case MPOL_DEFAULT:
422 		break;
423 	case MPOL_INTERLEAVE:
424 		*nodes = p->v.nodes;
425 		break;
426 	case MPOL_PREFERRED:
427 		/* or use current node instead of online map? */
428 		if (p->v.preferred_node < 0)
429 			*nodes = node_online_map;
430 		else
431 			node_set(p->v.preferred_node, *nodes);
432 		break;
433 	default:
434 		BUG();
435 	}
436 }
437 
438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
439 {
440 	struct page *p;
441 	int err;
442 
443 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
444 	if (err >= 0) {
445 		err = page_to_nid(p);
446 		put_page(p);
447 	}
448 	return err;
449 }
450 
451 /* Retrieve NUMA policy */
452 long do_get_mempolicy(int *policy, nodemask_t *nmask,
453 			unsigned long addr, unsigned long flags)
454 {
455 	int err;
456 	struct mm_struct *mm = current->mm;
457 	struct vm_area_struct *vma = NULL;
458 	struct mempolicy *pol = current->mempolicy;
459 
460 	cpuset_update_current_mems_allowed();
461 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
462 		return -EINVAL;
463 	if (flags & MPOL_F_ADDR) {
464 		down_read(&mm->mmap_sem);
465 		vma = find_vma_intersection(mm, addr, addr+1);
466 		if (!vma) {
467 			up_read(&mm->mmap_sem);
468 			return -EFAULT;
469 		}
470 		if (vma->vm_ops && vma->vm_ops->get_policy)
471 			pol = vma->vm_ops->get_policy(vma, addr);
472 		else
473 			pol = vma->vm_policy;
474 	} else if (addr)
475 		return -EINVAL;
476 
477 	if (!pol)
478 		pol = &default_policy;
479 
480 	if (flags & MPOL_F_NODE) {
481 		if (flags & MPOL_F_ADDR) {
482 			err = lookup_node(mm, addr);
483 			if (err < 0)
484 				goto out;
485 			*policy = err;
486 		} else if (pol == current->mempolicy &&
487 				pol->policy == MPOL_INTERLEAVE) {
488 			*policy = current->il_next;
489 		} else {
490 			err = -EINVAL;
491 			goto out;
492 		}
493 	} else
494 		*policy = pol->policy;
495 
496 	if (vma) {
497 		up_read(&current->mm->mmap_sem);
498 		vma = NULL;
499 	}
500 
501 	err = 0;
502 	if (nmask)
503 		get_zonemask(pol, nmask);
504 
505  out:
506 	if (vma)
507 		up_read(&current->mm->mmap_sem);
508 	return err;
509 }
510 
511 /*
512  * User space interface with variable sized bitmaps for nodelists.
513  */
514 
515 /* Copy a node mask from user space. */
516 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
517 		     unsigned long maxnode)
518 {
519 	unsigned long k;
520 	unsigned long nlongs;
521 	unsigned long endmask;
522 
523 	--maxnode;
524 	nodes_clear(*nodes);
525 	if (maxnode == 0 || !nmask)
526 		return 0;
527 
528 	nlongs = BITS_TO_LONGS(maxnode);
529 	if ((maxnode % BITS_PER_LONG) == 0)
530 		endmask = ~0UL;
531 	else
532 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
533 
534 	/* When the user specified more nodes than supported, just check
535 	   that the unsupported part is all zero. */
536 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
537 		if (nlongs > PAGE_SIZE/sizeof(long))
538 			return -EINVAL;
539 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
540 			unsigned long t;
541 			if (get_user(t, nmask + k))
542 				return -EFAULT;
543 			if (k == nlongs - 1) {
544 				if (t & endmask)
545 					return -EINVAL;
546 			} else if (t)
547 				return -EINVAL;
548 		}
549 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
550 		endmask = ~0UL;
551 	}
552 
553 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
554 		return -EFAULT;
555 	nodes_addr(*nodes)[nlongs-1] &= endmask;
556 	return 0;
557 }
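
/*
 * Worked example (illustrative, assuming a 64-bit kernel): a caller passing
 * maxnode = 17 describes a 16-bit mask, so nlongs = 1 and
 * endmask = (1UL << 16) - 1; one long is copied from user space and every
 * bit above bit 15 is cleared.  Bits the kernel cannot represent (above
 * MAX_NUMNODES) are only accepted if they are zero.
 */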
558 
559 /* Copy a kernel node mask to user space */
560 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
561 			      nodemask_t *nodes)
562 {
563 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
564 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
565 
566 	if (copy > nbytes) {
567 		if (copy > PAGE_SIZE)
568 			return -EINVAL;
569 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
570 			return -EFAULT;
571 		copy = nbytes;
572 	}
573 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
574 }
575 
576 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
577 			unsigned long mode,
578 			unsigned long __user *nmask, unsigned long maxnode,
579 			unsigned flags)
580 {
581 	nodemask_t nodes;
582 	int err;
583 
584 	err = get_nodes(&nodes, nmask, maxnode);
585 	if (err)
586 		return err;
587 	return do_mbind(start, len, mode, &nodes, flags);
588 }
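
/*
 * Illustrative userspace sketch (not part of this file; assumes the
 * numaif.h/libnuma mbind() wrapper, usual includes omitted): binding an
 * anonymous mapping to node 1.  With MPOL_MF_STRICT, pages already present
 * on other nodes make check_range() above fail with -EIO.
 *
 *	unsigned long mask = 1UL << 1;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mbind(p, len, MPOL_BIND, &mask, 8 * sizeof(mask) + 1,
 *		  MPOL_MF_STRICT))
 *		perror("mbind");
 */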
589 
590 /* Set the process memory policy */
591 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
592 		unsigned long maxnode)
593 {
594 	int err;
595 	nodemask_t nodes;
596 
597 	if (mode < 0 || mode > MPOL_MAX)
598 		return -EINVAL;
599 	err = get_nodes(&nodes, nmask, maxnode);
600 	if (err)
601 		return err;
602 	return do_set_mempolicy(mode, &nodes);
603 }
604 
605 /* Retrieve NUMA policy */
606 asmlinkage long sys_get_mempolicy(int __user *policy,
607 				unsigned long __user *nmask,
608 				unsigned long maxnode,
609 				unsigned long addr, unsigned long flags)
610 {
611 	int err, pval;
612 	nodemask_t nodes;
613 
614 	if (nmask != NULL && maxnode < MAX_NUMNODES)
615 		return -EINVAL;
616 
617 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
618 
619 	if (err)
620 		return err;
621 
622 	if (policy && put_user(pval, policy))
623 		return -EFAULT;
624 
625 	if (nmask)
626 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
627 
628 	return err;
629 }
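
/*
 * Illustrative userspace sketch (assumes the numaif.h get_mempolicy()
 * wrapper): asking which node currently backs an address, which uses the
 * MPOL_F_NODE | MPOL_F_ADDR path of do_get_mempolicy() above.
 *
 *	int node;
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("%p is on node %d\n", addr, node);
 */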
630 
631 #ifdef CONFIG_COMPAT
632 
633 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
634 				     compat_ulong_t __user *nmask,
635 				     compat_ulong_t maxnode,
636 				     compat_ulong_t addr, compat_ulong_t flags)
637 {
638 	long err;
639 	unsigned long __user *nm = NULL;
640 	unsigned long nr_bits, alloc_size;
641 	DECLARE_BITMAP(bm, MAX_NUMNODES);
642 
643 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
644 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
645 
646 	if (nmask)
647 		nm = compat_alloc_user_space(alloc_size);
648 
649 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
650 
651 	if (!err && nmask) {
652 		err = copy_from_user(bm, nm, alloc_size);
653 		/* ensure entire bitmap is zeroed */
654 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
655 		err |= compat_put_bitmap(nmask, bm, nr_bits);
656 	}
657 
658 	return err;
659 }
660 
661 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
662 				     compat_ulong_t maxnode)
663 {
664 	long err = 0;
665 	unsigned long __user *nm = NULL;
666 	unsigned long nr_bits, alloc_size;
667 	DECLARE_BITMAP(bm, MAX_NUMNODES);
668 
669 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
670 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
671 
672 	if (nmask) {
673 		err = compat_get_bitmap(bm, nmask, nr_bits);
674 		nm = compat_alloc_user_space(alloc_size);
675 		err |= copy_to_user(nm, bm, alloc_size);
676 	}
677 
678 	if (err)
679 		return -EFAULT;
680 
681 	return sys_set_mempolicy(mode, nm, nr_bits+1);
682 }
683 
684 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
685 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
686 			     compat_ulong_t maxnode, compat_ulong_t flags)
687 {
688 	long err = 0;
689 	unsigned long __user *nm = NULL;
690 	unsigned long nr_bits, alloc_size;
691 	nodemask_t bm;
692 
693 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
694 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
695 
696 	if (nmask) {
697 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
698 		nm = compat_alloc_user_space(alloc_size);
699 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
700 	}
701 
702 	if (err)
703 		return -EFAULT;
704 
705 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
706 }
707 
708 #endif
709 
710 /* Return effective policy for a VMA */
711 struct mempolicy *
712 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
713 {
714 	struct mempolicy *pol = task->mempolicy;
715 
716 	if (vma) {
717 		if (vma->vm_ops && vma->vm_ops->get_policy)
718 			pol = vma->vm_ops->get_policy(vma, addr);
719 		else if (vma->vm_policy &&
720 				vma->vm_policy->policy != MPOL_DEFAULT)
721 			pol = vma->vm_policy;
722 	}
723 	if (!pol)
724 		pol = &default_policy;
725 	return pol;
726 }
727 
728 /* Return a zonelist representing a mempolicy */
729 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
730 {
731 	int nd;
732 
733 	switch (policy->policy) {
734 	case MPOL_PREFERRED:
735 		nd = policy->v.preferred_node;
736 		if (nd < 0)
737 			nd = numa_node_id();
738 		break;
739 	case MPOL_BIND:
740 		/* Lower zones don't get a policy applied */
741 		/* Careful: current->mems_allowed might have moved */
742 		if (gfp_zone(gfp) >= policy_zone)
743 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
744 				return policy->v.zonelist;
745 		/*FALL THROUGH*/
746 	case MPOL_INTERLEAVE: /* should not happen */
747 	case MPOL_DEFAULT:
748 		nd = numa_node_id();
749 		break;
750 	default:
751 		nd = 0;
752 		BUG();
753 	}
754 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
755 }
756 
757 /* Do dynamic interleaving for a process */
758 static unsigned interleave_nodes(struct mempolicy *policy)
759 {
760 	unsigned nid, next;
761 	struct task_struct *me = current;
762 
763 	nid = me->il_next;
764 	next = next_node(nid, policy->v.nodes);
765 	if (next >= MAX_NUMNODES)
766 		next = first_node(policy->v.nodes);
767 	me->il_next = next;
768 	return nid;
769 }
770 
771 /* Do static interleaving for a VMA with known offset. */
772 static unsigned offset_il_node(struct mempolicy *pol,
773 		struct vm_area_struct *vma, unsigned long off)
774 {
775 	unsigned nnodes = nodes_weight(pol->v.nodes);
776 	unsigned target = (unsigned)off % nnodes;
777 	int c;
778 	int nid = -1;
779 
780 	c = 0;
781 	do {
782 		nid = next_node(nid, pol->v.nodes);
783 		c++;
784 	} while (c <= target);
785 	return nid;
786 }
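
/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the walk above stops at the second
 * set node and returns node 2.  The same offset therefore always maps to
 * the same node, independent of which CPU faults the page in.
 */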
787 
788 /* Allocate a page with the interleave policy.
789    Has its own path because it needs to do special accounting. */
790 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
791 					unsigned nid)
792 {
793 	struct zonelist *zl;
794 	struct page *page;
795 
796 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
797 	page = __alloc_pages(gfp, order, zl);
798 	if (page && page_zone(page) == zl->zones[0]) {
799 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
800 		put_cpu();
801 	}
802 	return page;
803 }
804 
805 /**
806  * 	alloc_page_vma	- Allocate a page for a VMA.
807  *
808  * 	@gfp:
809  *      %GFP_USER    user allocation.
810  *      %GFP_KERNEL  kernel allocation.
811  *      %GFP_HIGHMEM highmem/user allocation.
812  *      %GFP_FS      allocation should not call back into a file system.
813  *      %GFP_ATOMIC  don't sleep.
814  *
815  * 	@vma:  Pointer to VMA or NULL if not available.
816  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
817  *
818  * 	This function allocates a page from the kernel page pool and applies
819  *	a NUMA policy associated with the VMA or the current process.
820  *	When VMA is not NULL the caller must hold down_read on the mmap_sem
821  *	of the VMA's mm_struct to prevent it from going away. Should be used
822  *	for all allocations of pages that will be mapped into user space.
823  * 	Returns NULL when no page can be allocated.
824  *
825  *	Should be called with the mmap_sem of the vma held.
826  */
827 struct page *
828 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
829 {
830 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
831 
832 	cpuset_update_current_mems_allowed();
833 
834 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 		unsigned nid;
836 		if (vma) {
837 			unsigned long off;
838 			off = vma->vm_pgoff;
839 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 			nid = offset_il_node(pol, vma, off);
841 		} else {
842 			/* fall back to process interleaving */
843 			nid = interleave_nodes(pol);
844 		}
845 		return alloc_page_interleave(gfp, 0, nid);
846 	}
847 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
848 }
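
/*
 * Minimal caller sketch (illustrative; modeled on the generic
 * alloc_zeroed_user_highpage() helper, which individual architectures may
 * override): the fault path already holds mmap_sem for reading, so it can
 * simply do
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (page)
 *		clear_user_highpage(page, address);
 */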
849 
850 /**
851  * 	alloc_pages_current - Allocate pages.
852  *
853  *	@gfp:
854  *		%GFP_USER   user allocation.
855  *		%GFP_KERNEL kernel allocation.
856  *		%GFP_HIGHMEM highmem allocation.
857  *		%GFP_FS     don't call back into a file system.
858  *		%GFP_ATOMIC don't sleep.
859  *	@order: Power of two of allocation size in pages. 0 is a single page.
860  *
861  *	Allocate a page from the kernel page pool and, when not in
862  *	interrupt context, apply the current process' NUMA policy.
863  *	Returns NULL when no page can be allocated.
864  *
865  *	Don't call cpuset_update_current_mems_allowed() unless
866  *	1) it's ok to take cpuset_sem (can WAIT), and
867  *	2) allocating for current task (not interrupt).
868  */
869 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
870 {
871 	struct mempolicy *pol = current->mempolicy;
872 
873 	if ((gfp & __GFP_WAIT) && !in_interrupt())
874 		cpuset_update_current_mems_allowed();
875 	if (!pol || in_interrupt())
876 		pol = &default_policy;
877 	if (pol->policy == MPOL_INTERLEAVE)
878 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
879 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
880 }
881 EXPORT_SYMBOL(alloc_pages_current);
882 
883 /* Slow path of a mempolicy copy */
884 struct mempolicy *__mpol_copy(struct mempolicy *old)
885 {
886 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
887 
888 	if (!new)
889 		return ERR_PTR(-ENOMEM);
890 	*new = *old;
891 	atomic_set(&new->refcnt, 1);
892 	if (new->policy == MPOL_BIND) {
893 		int sz = ksize(old->v.zonelist);
894 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
895 		if (!new->v.zonelist) {
896 			kmem_cache_free(policy_cache, new);
897 			return ERR_PTR(-ENOMEM);
898 		}
899 		memcpy(new->v.zonelist, old->v.zonelist, sz);
900 	}
901 	return new;
902 }
903 
904 /* Slow path of a mempolicy comparison */
905 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
906 {
907 	if (!a || !b)
908 		return 0;
909 	if (a->policy != b->policy)
910 		return 0;
911 	switch (a->policy) {
912 	case MPOL_DEFAULT:
913 		return 1;
914 	case MPOL_INTERLEAVE:
915 		return nodes_equal(a->v.nodes, b->v.nodes);
916 	case MPOL_PREFERRED:
917 		return a->v.preferred_node == b->v.preferred_node;
918 	case MPOL_BIND: {
919 		int i;
920 		for (i = 0; a->v.zonelist->zones[i]; i++)
921 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
922 				return 0;
923 		return b->v.zonelist->zones[i] == NULL;
924 	}
925 	default:
926 		BUG();
927 		return 0;
928 	}
929 }
930 
931 /* Slow path of a mpol destructor. */
932 void __mpol_free(struct mempolicy *p)
933 {
934 	if (!atomic_dec_and_test(&p->refcnt))
935 		return;
936 	if (p->policy == MPOL_BIND)
937 		kfree(p->v.zonelist);
938 	p->policy = MPOL_DEFAULT;
939 	kmem_cache_free(policy_cache, p);
940 }
941 
942 /*
943  * Hugetlb policy. Same as above, just works with node numbers instead of
944  * zonelists.
945  */
946 
947 /* Find first node suitable for an allocation */
948 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949 {
950 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
951 
952 	switch (pol->policy) {
953 	case MPOL_DEFAULT:
954 		return numa_node_id();
955 	case MPOL_BIND:
956 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 	case MPOL_INTERLEAVE:
958 		return interleave_nodes(pol);
959 	case MPOL_PREFERRED:
960 		return pol->v.preferred_node >= 0 ?
961 				pol->v.preferred_node : numa_node_id();
962 	}
963 	BUG();
964 	return 0;
965 }
966 
967 /* Find secondary valid nodes for an allocation */
968 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969 {
970 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
971 
972 	switch (pol->policy) {
973 	case MPOL_PREFERRED:
974 	case MPOL_DEFAULT:
975 	case MPOL_INTERLEAVE:
976 		return 1;
977 	case MPOL_BIND: {
978 		struct zone **z;
979 		for (z = pol->v.zonelist->zones; *z; z++)
980 			if ((*z)->zone_pgdat->node_id == nid)
981 				return 1;
982 		return 0;
983 	}
984 	default:
985 		BUG();
986 		return 0;
987 	}
988 }
989 
990 /*
991  * Shared memory backing store policy support.
992  *
993  * Remember policies even when nobody has shared memory mapped.
994  * The policies are kept in a Red-Black tree linked from the inode.
995  * They are protected by the sp->lock spinlock, which should be held
996  * for any accesses to the tree.
997  */
998 
999 /* lookup first element intersecting start-end */
1000 /* Caller holds sp->lock */
1001 static struct sp_node *
1002 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1003 {
1004 	struct rb_node *n = sp->root.rb_node;
1005 
1006 	while (n) {
1007 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1008 
1009 		if (start >= p->end)
1010 			n = n->rb_right;
1011 		else if (end <= p->start)
1012 			n = n->rb_left;
1013 		else
1014 			break;
1015 	}
1016 	if (!n)
1017 		return NULL;
1018 	for (;;) {
1019 		struct sp_node *w = NULL;
1020 		struct rb_node *prev = rb_prev(n);
1021 		if (!prev)
1022 			break;
1023 		w = rb_entry(prev, struct sp_node, nd);
1024 		if (w->end <= start)
1025 			break;
1026 		n = prev;
1027 	}
1028 	return rb_entry(n, struct sp_node, nd);
1029 }
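
/*
 * Worked example (illustrative): with stored ranges [2,4) and [8,16),
 * sp_lookup(sp, 3, 9) first finds some intersecting node and then walks
 * rb_prev() until no earlier node still overlaps, so it returns the lowest
 * intersecting entry, [2,4).
 */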
1030 
1031 /* Insert a new shared policy into the list. */
1032 /* Caller holds sp->lock */
1033 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1034 {
1035 	struct rb_node **p = &sp->root.rb_node;
1036 	struct rb_node *parent = NULL;
1037 	struct sp_node *nd;
1038 
1039 	while (*p) {
1040 		parent = *p;
1041 		nd = rb_entry(parent, struct sp_node, nd);
1042 		if (new->start < nd->start)
1043 			p = &(*p)->rb_left;
1044 		else if (new->end > nd->end)
1045 			p = &(*p)->rb_right;
1046 		else
1047 			BUG();
1048 	}
1049 	rb_link_node(&new->nd, parent, p);
1050 	rb_insert_color(&new->nd, &sp->root);
1051 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1052 		 new->policy ? new->policy->policy : 0);
1053 }
1054 
1055 /* Find shared policy intersecting idx */
1056 struct mempolicy *
1057 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1058 {
1059 	struct mempolicy *pol = NULL;
1060 	struct sp_node *sn;
1061 
1062 	if (!sp->root.rb_node)
1063 		return NULL;
1064 	spin_lock(&sp->lock);
1065 	sn = sp_lookup(sp, idx, idx+1);
1066 	if (sn) {
1067 		mpol_get(sn->policy);
1068 		pol = sn->policy;
1069 	}
1070 	spin_unlock(&sp->lock);
1071 	return pol;
1072 }
1073 
1074 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1075 {
1076 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1077 	rb_erase(&n->nd, &sp->root);
1078 	mpol_free(n->policy);
1079 	kmem_cache_free(sn_cache, n);
1080 }
1081 
1082 struct sp_node *
1083 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1084 {
1085 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1086 
1087 	if (!n)
1088 		return NULL;
1089 	n->start = start;
1090 	n->end = end;
1091 	mpol_get(pol);
1092 	n->policy = pol;
1093 	return n;
1094 }
1095 
1096 /* Replace a policy range. */
1097 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1098 				 unsigned long end, struct sp_node *new)
1099 {
1100 	struct sp_node *n, *new2 = NULL;
1101 
1102 restart:
1103 	spin_lock(&sp->lock);
1104 	n = sp_lookup(sp, start, end);
1105 	/* Take care of old policies in the same range. */
1106 	while (n && n->start < end) {
1107 		struct rb_node *next = rb_next(&n->nd);
1108 		if (n->start >= start) {
1109 			if (n->end <= end)
1110 				sp_delete(sp, n);
1111 			else
1112 				n->start = end;
1113 		} else {
1114 			/* Old policy spanning whole new range. */
1115 			if (n->end > end) {
1116 				if (!new2) {
1117 					spin_unlock(&sp->lock);
1118 					new2 = sp_alloc(end, n->end, n->policy);
1119 					if (!new2)
1120 						return -ENOMEM;
1121 					goto restart;
1122 				}
1123 				n->end = start;
1124 				sp_insert(sp, new2);
1125 				new2 = NULL;
1126 				break;
1127 			} else
1128 				n->end = start;
1129 		}
1130 		if (!next)
1131 			break;
1132 		n = rb_entry(next, struct sp_node, nd);
1133 	}
1134 	if (new)
1135 		sp_insert(sp, new);
1136 	spin_unlock(&sp->lock);
1137 	if (new2) {
1138 		mpol_free(new2->policy);
1139 		kmem_cache_free(sn_cache, new2);
1140 	}
1141 	return 0;
1142 }
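
/*
 * Worked example (illustrative): if the tree holds one entry [0,16) with
 * policy A and the caller replaces [4,8) with policy B, the old entry is
 * trimmed to [0,4), a new2 node [8,16) carrying A is inserted for the tail,
 * and finally the new [4,8) node with B is inserted.  new2 has to be
 * allocated outside sp->lock, hence the restart.
 */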
1143 
1144 int mpol_set_shared_policy(struct shared_policy *info,
1145 			struct vm_area_struct *vma, struct mempolicy *npol)
1146 {
1147 	int err;
1148 	struct sp_node *new = NULL;
1149 	unsigned long sz = vma_pages(vma);
1150 
1151 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1152 		 vma->vm_pgoff,
1153 		 sz, npol ? npol->policy : -1,
1154 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1155 
1156 	if (npol) {
1157 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1158 		if (!new)
1159 			return -ENOMEM;
1160 	}
1161 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1162 	if (err && new)
1163 		kmem_cache_free(sn_cache, new);
1164 	return err;
1165 }
1166 
1167 /* Free a backing policy store on inode delete. */
1168 void mpol_free_shared_policy(struct shared_policy *p)
1169 {
1170 	struct sp_node *n;
1171 	struct rb_node *next;
1172 
1173 	if (!p->root.rb_node)
1174 		return;
1175 	spin_lock(&p->lock);
1176 	next = rb_first(&p->root);
1177 	while (next) {
1178 		n = rb_entry(next, struct sp_node, nd);
1179 		next = rb_next(&n->nd);
1180 		rb_erase(&n->nd, &p->root);
1181 		mpol_free(n->policy);
1182 		kmem_cache_free(sn_cache, n);
1183 	}
1184 	spin_unlock(&p->lock);
1185 }
1186 
1187 /* assumes fs == KERNEL_DS */
1188 void __init numa_policy_init(void)
1189 {
1190 	policy_cache = kmem_cache_create("numa_policy",
1191 					 sizeof(struct mempolicy),
1192 					 0, SLAB_PANIC, NULL, NULL);
1193 
1194 	sn_cache = kmem_cache_create("shared_policy_node",
1195 				     sizeof(struct sp_node),
1196 				     0, SLAB_PANIC, NULL, NULL);
1197 
1198 	/* Set interleaving policy for system init. This way not all
1199 	   the data structures allocated at system boot end up on node zero. */
1200 
1201 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1202 		printk("numa_policy_init: interleaving failed\n");
1203 }
1204 
1205 /* Reset policy of current process to default */
1206 void numa_default_policy(void)
1207 {
1208 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1209 }
1210 
1211 /* Migrate a policy to a different set of nodes */
1212 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1213 							const nodemask_t *new)
1214 {
1215 	nodemask_t tmp;
1216 
1217 	if (!pol)
1218 		return;
1219 
1220 	switch (pol->policy) {
1221 	case MPOL_DEFAULT:
1222 		break;
1223 	case MPOL_INTERLEAVE:
1224 		nodes_remap(tmp, pol->v.nodes, *old, *new);
1225 		pol->v.nodes = tmp;
1226 		current->il_next = node_remap(current->il_next, *old, *new);
1227 		break;
1228 	case MPOL_PREFERRED:
1229 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230 								*old, *new);
1231 		break;
1232 	case MPOL_BIND: {
1233 		nodemask_t nodes;
1234 		struct zone **z;
1235 		struct zonelist *zonelist;
1236 
1237 		nodes_clear(nodes);
1238 		for (z = pol->v.zonelist->zones; *z; z++)
1239 			node_set((*z)->zone_pgdat->node_id, nodes);
1240 		nodes_remap(tmp, nodes, *old, *new);
1241 		nodes = tmp;
1242 
1243 		zonelist = bind_zonelist(&nodes);
1244 
1245 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1246 		 * If that old zonelist has no remaining mems_allowed nodes,
1247 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1248 		 */
1249 
1250 		if (zonelist) {
1251 			/* Good - got mem - substitute new zonelist */
1252 			kfree(pol->v.zonelist);
1253 			pol->v.zonelist = zonelist;
1254 		}
1255 		break;
1256 	}
1257 	default:
1258 		BUG();
1259 		break;
1260 	}
1261 }
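
/*
 * Worked example (illustrative): if a task is moved from nodes {2,3} to
 * {6,7}, an interleave mask of {2,3} becomes {6,7}, a preferred node of 3
 * becomes 7, and a bind zonelist is rebuilt from the remapped node set;
 * node_remap()/nodes_remap() preserve each node's relative position in the
 * old and new masks.
 */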
1262 
1263 /*
1264  * Someone moved this task to different nodes.  Fixup mempolicies.
1265  *
1266  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1267  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268  */
1269 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1270 {
1271 	rebind_policy(current->mempolicy, old, new);
1272 }
1273