xref: /openbmc/linux/mm/mempolicy.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * Subject to the GNU Public License, version 2.
6  *
7  * NUMA policy allows the user to give hints about the node(s) on which
8  * memory should be allocated.
9  *
10  * Support four policies per VMA and per process:
11  *
12  * The VMA policy has priority over the process policy for a page fault.
13  *
14  * interleave     Allocate memory interleaved over a set of nodes,
15  *                with normal fallback if it fails.
16  *                For VMA based allocations this interleaves based on the
17  *                offset into the backing object or offset into the mapping
18  *                for anonymous memory. For process policy a per-process
19  *                counter is used.
20  * bind           Only allocate memory on a specific set of nodes,
21  *                no fallback.
22  * preferred      Try a specific node first before normal fallback.
23  *                As a special case node -1 here means do the allocation
24  *                on the local CPU. This is normally identical to default,
25  *                but useful to set in a VMA when you have a non default
26  *                process policy.
27  * default        Allocate on the local node first, or when on a VMA
28  *                use the process policy. This is what Linux always did
29  *		  in a NUMA aware kernel and still does by, ahem, default.
30  *
31  * The process policy is applied for most non-interrupt memory allocations
32  * in that process' context. Interrupts ignore the policies and always
33  * try to allocate on the local CPU. The VMA policy is only applied for memory
34  * allocations for a VMA in the VM.
35  *
36  * Currently there are a few corner cases in swapping where the policy
37  * is not applied, but the majority should be handled. When process policy
38  * is used it is not remembered over swap outs/swap ins.
39  *
40  * Only the highest zone in the zone hierarchy gets policied. Allocations
41  * requesting a lower zone just use default policy. This implies that
42  * on systems with highmem, kernel lowmem allocations don't get policied.
43  * Same with GFP_DMA allocations.
44  *
45  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46  * all users and remembered even when nobody has memory mapped.
47  */
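/*
 * Illustration (user-space sketch, not part of the kernel build): the
 * policies above are installed through the set_mempolicy()/mbind()
 * system calls defined further down in this file.  Assuming the syscall
 * number is exposed via <sys/syscall.h> on the target architecture and
 * using the MPOL_* values from <linux/mempolicy.h>, interleaving all
 * further allocations of the calling process over nodes 0 and 1 looks
 * roughly like this (maxnode follows the "number of bits + 1" convention
 * that the compat wrappers below also use):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &mask,
 *		8 * sizeof(mask) + 1);
 *
 * In practice libnuma wraps these calls; the raw form is shown only to
 * make the entry points below easier to follow.
 */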
48 
49 /* Notebook:
50    fix mmap readahead to honour policy and enable policy for any page cache
51    object
52    statistics for bigpages
53    global policy for page cache? currently it uses process policy. Requires
54    first item above.
55    handle mremap for shared memory (currently ignored for the policy)
56    grows down?
57    make bind policy root only? It can trigger oom much faster and the
58    kernel is not always graceful about that.
59    could replace all the switch()es with a mempolicy_ops structure.
60 */
61 
62 #include <linux/mempolicy.h>
63 #include <linux/mm.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
81 
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
84 
85 #define PDprintk(fmt...)
86 
87 /* Highest zone. A specific allocation for a zone below that is not
88    policied. */
89 static int policy_zone;
90 
91 static struct mempolicy default_policy = {
92 	.refcnt = ATOMIC_INIT(1), /* never free it */
93 	.policy = MPOL_DEFAULT,
94 };
95 
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
98 {
99 	DECLARE_BITMAP(online2, MAX_NUMNODES);
100 
101 	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 	if (bitmap_empty(online2, MAX_NUMNODES))
103 		set_bit(0, online2);
104 	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 		return -EINVAL;
106 	return 0;
107 }
108 
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
111 {
112 	int empty = bitmap_empty(nodes, MAX_NUMNODES);
113 
114 	switch (mode) {
115 	case MPOL_DEFAULT:
116 		if (!empty)
117 			return -EINVAL;
118 		break;
119 	case MPOL_BIND:
120 	case MPOL_INTERLEAVE:
121 		/* Preferred will only use the first bit, but allow
122 		   more for now. */
123 		if (empty)
124 			return -EINVAL;
125 		break;
126 	}
127 	return nodes_online(nodes);
128 }
129 
130 /* Copy a node mask from user space. */
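/*
 * Worked example (assuming a 64-bit kernel): a caller passing a single
 * unsigned long and maxnode = 65 ends up, after the --maxnode below,
 * with maxnode = 64, nlongs = 1 and endmask = ~0UL, so that one long is
 * copied with no bits masked off.  Passing maxnode = 64 instead would
 * give endmask = (1UL << 63) - 1 and silently drop node 63.
 */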
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 		     unsigned long maxnode, int mode)
133 {
134 	unsigned long k;
135 	unsigned long nlongs;
136 	unsigned long endmask;
137 
138 	--maxnode;
139 	bitmap_zero(nodes, MAX_NUMNODES);
140 	if (maxnode == 0 || !nmask)
141 		return 0;
142 
143 	nlongs = BITS_TO_LONGS(maxnode);
144 	if ((maxnode % BITS_PER_LONG) == 0)
145 		endmask = ~0UL;
146 	else
147 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148 
149 	/* When the user specified more nodes than supported, just check
150 	   that the unsupported part is all zero. */
151 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 		if (nlongs > PAGE_SIZE/sizeof(long))
153 			return -EINVAL;
154 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 			unsigned long t;
156 			if (get_user(t,  nmask + k))
157 				return -EFAULT;
158 			if (k == nlongs - 1) {
159 				if (t & endmask)
160 					return -EINVAL;
161 			} else if (t)
162 				return -EINVAL;
163 		}
164 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 		endmask = ~0UL;
166 	}
167 
168 	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 		return -EFAULT;
170 	nodes[nlongs-1] &= endmask;
171 	/* Update current mems_allowed */
172 	cpuset_update_current_mems_allowed();
173 	/* Ignore nodes not set in current->mems_allowed */
174 	cpuset_restrict_to_mems_allowed(nodes);
175 	return mpol_check_policy(mode, nodes);
176 }
177 
178 /* Generate a custom zonelist for the BIND policy. */
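/*
 * Sketch of the result (assuming both nodes have only ZONE_DMA and
 * ZONE_NORMAL populated): for a node mask of {0, 2} the list built below
 * is
 *
 *	node 0 NORMAL, node 0 DMA, node 2 NORMAL, node 2 DMA, NULL
 *
 * i.e. nodes in ascending order, zones within a node from highest to
 * lowest, empty zones skipped, and the list NULL terminated.
 */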
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
180 {
181 	struct zonelist *zl;
182 	int num, max, nd;
183 
184 	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 	if (!zl)
187 		return NULL;
188 	num = 0;
189 	for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 	     nd < MAX_NUMNODES;
191 	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 		int k;
193 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 			if (!z->present_pages)
196 				continue;
197 			zl->zones[num++] = z;
198 			if (k > policy_zone)
199 				policy_zone = k;
200 		}
201 	}
202 	BUG_ON(num >= max);
203 	zl->zones[num] = NULL;
204 	return zl;
205 }
206 
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209 {
210 	struct mempolicy *policy;
211 
212 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 	if (mode == MPOL_DEFAULT)
214 		return NULL;
215 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 	if (!policy)
217 		return ERR_PTR(-ENOMEM);
218 	atomic_set(&policy->refcnt, 1);
219 	switch (mode) {
220 	case MPOL_INTERLEAVE:
221 		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 		break;
223 	case MPOL_PREFERRED:
224 		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 		if (policy->v.preferred_node >= MAX_NUMNODES)
226 			policy->v.preferred_node = -1;
227 		break;
228 	case MPOL_BIND:
229 		policy->v.zonelist = bind_zonelist(nodes);
230 		if (policy->v.zonelist == NULL) {
231 			kmem_cache_free(policy_cache, policy);
232 			return ERR_PTR(-ENOMEM);
233 		}
234 		break;
235 	}
236 	policy->policy = mode;
237 	return policy;
238 }
239 
240 /* Ensure all existing pages follow the policy. */
241 static int
242 verify_pages(struct mm_struct *mm,
243 	     unsigned long addr, unsigned long end, unsigned long *nodes)
244 {
245 	while (addr < end) {
246 		struct page *p;
247 		pte_t *pte;
248 		pmd_t *pmd;
249 		pud_t *pud;
250 		pgd_t *pgd;
251 		pgd = pgd_offset(mm, addr);
252 		if (pgd_none(*pgd)) {
253 			unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
254 			if (next > addr)
255 				break;
256 			addr = next;
257 			continue;
258 		}
259 		pud = pud_offset(pgd, addr);
260 		if (pud_none(*pud)) {
261 			addr = (addr + PUD_SIZE) & PUD_MASK;
262 			continue;
263 		}
264 		pmd = pmd_offset(pud, addr);
265 		if (pmd_none(*pmd)) {
266 			addr = (addr + PMD_SIZE) & PMD_MASK;
267 			continue;
268 		}
269 		p = NULL;
270 		pte = pte_offset_map(pmd, addr);
271 		if (pte_present(*pte))
272 			p = pte_page(*pte);
273 		pte_unmap(pte);
274 		if (p) {
275 			unsigned nid = page_to_nid(p);
276 			if (!test_bit(nid, nodes))
277 				return -EIO;
278 		}
279 		addr += PAGE_SIZE;
280 	}
281 	return 0;
282 }
283 
284 /* Step 1: check the range */
285 static struct vm_area_struct *
286 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
287 	    unsigned long *nodes, unsigned long flags)
288 {
289 	int err;
290 	struct vm_area_struct *first, *vma, *prev;
291 
292 	first = find_vma(mm, start);
293 	if (!first)
294 		return ERR_PTR(-EFAULT);
295 	prev = NULL;
296 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
297 		if (!vma->vm_next && vma->vm_end < end)
298 			return ERR_PTR(-EFAULT);
299 		if (prev && prev->vm_end < vma->vm_start)
300 			return ERR_PTR(-EFAULT);
301 		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 			err = verify_pages(vma->vm_mm,
303 					   vma->vm_start, vma->vm_end, nodes);
304 			if (err) {
305 				first = ERR_PTR(err);
306 				break;
307 			}
308 		}
309 		prev = vma;
310 	}
311 	return first;
312 }
313 
314 /* Apply policy to a single VMA */
315 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
316 {
317 	int err = 0;
318 	struct mempolicy *old = vma->vm_policy;
319 
320 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
321 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
322 		 vma->vm_ops, vma->vm_file,
323 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
324 
325 	if (vma->vm_ops && vma->vm_ops->set_policy)
326 		err = vma->vm_ops->set_policy(vma, new);
327 	if (!err) {
328 		mpol_get(new);
329 		vma->vm_policy = new;
330 		mpol_free(old);
331 	}
332 	return err;
333 }
334 
335 /* Step 2: apply policy to a range and do splits. */
336 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
337 		       unsigned long end, struct mempolicy *new)
338 {
339 	struct vm_area_struct *next;
340 	int err;
341 
342 	err = 0;
343 	for (; vma && vma->vm_start < end; vma = next) {
344 		next = vma->vm_next;
345 		if (vma->vm_start < start)
346 			err = split_vma(vma->vm_mm, vma, start, 1);
347 		if (!err && vma->vm_end > end)
348 			err = split_vma(vma->vm_mm, vma, end, 0);
349 		if (!err)
350 			err = policy_vma(vma, new);
351 		if (err)
352 			break;
353 	}
354 	return err;
355 }
356 
357 /* Change policy for a memory range */
358 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
359 			  unsigned long mode,
360 			  unsigned long __user *nmask, unsigned long maxnode,
361 			  unsigned flags)
362 {
363 	struct vm_area_struct *vma;
364 	struct mm_struct *mm = current->mm;
365 	struct mempolicy *new;
366 	unsigned long end;
367 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
368 	int err;
369 
370 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
371 		return -EINVAL;
372 	if (start & ~PAGE_MASK)
373 		return -EINVAL;
374 	if (mode == MPOL_DEFAULT)
375 		flags &= ~MPOL_MF_STRICT;
376 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
377 	end = start + len;
378 	if (end < start)
379 		return -EINVAL;
380 	if (end == start)
381 		return 0;
382 
383 	err = get_nodes(nodes, nmask, maxnode, mode);
384 	if (err)
385 		return err;
386 
387 	new = mpol_new(mode, nodes);
388 	if (IS_ERR(new))
389 		return PTR_ERR(new);
390 
391 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
392 			mode,nodes[0]);
393 
394 	down_write(&mm->mmap_sem);
395 	vma = check_range(mm, start, end, nodes, flags);
396 	err = PTR_ERR(vma);
397 	if (!IS_ERR(vma))
398 		err = mbind_range(vma, start, end, new);
399 	up_write(&mm->mmap_sem);
400 	mpol_free(new);
401 	return err;
402 }
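/*
 * Illustration (user-space sketch, not part of the kernel build):
 * binding a fresh anonymous mapping to node 0 only, with MPOL_MF_STRICT
 * so the call fails with -EIO if any already instantiated page sits on
 * another node.  The raw syscall is normally hidden behind libnuma:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	syscall(__NR_mbind, (unsigned long)p, len, MPOL_BIND,
 *		&node0, 8 * sizeof(node0) + 1, MPOL_MF_STRICT);
 */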
403 
404 /* Set the process memory policy */
405 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
406 				   unsigned long maxnode)
407 {
408 	int err;
409 	struct mempolicy *new;
410 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
411 
412 	if (mode > MPOL_MAX)
413 		return -EINVAL;
414 	err = get_nodes(nodes, nmask, maxnode, mode);
415 	if (err)
416 		return err;
417 	new = mpol_new(mode, nodes);
418 	if (IS_ERR(new))
419 		return PTR_ERR(new);
420 	mpol_free(current->mempolicy);
421 	current->mempolicy = new;
422 	if (new && new->policy == MPOL_INTERLEAVE)
423 		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
424 	return 0;
425 }
426 
427 /* Fill a node mask with the nodes used by a policy */
428 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
429 {
430 	int i;
431 
432 	bitmap_zero(nodes, MAX_NUMNODES);
433 	switch (p->policy) {
434 	case MPOL_BIND:
435 		for (i = 0; p->v.zonelist->zones[i]; i++)
436 			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
437 		break;
438 	case MPOL_DEFAULT:
439 		break;
440 	case MPOL_INTERLEAVE:
441 		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
442 		break;
443 	case MPOL_PREFERRED:
444 		/* or use current node instead of online map? */
445 		if (p->v.preferred_node < 0)
446 			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
447 		else
448 			__set_bit(p->v.preferred_node, nodes);
449 		break;
450 	default:
451 		BUG();
452 	}
453 }
454 
455 static int lookup_node(struct mm_struct *mm, unsigned long addr)
456 {
457 	struct page *p;
458 	int err;
459 
460 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
461 	if (err >= 0) {
462 		err = page_to_nid(p);
463 		put_page(p);
464 	}
465 	return err;
466 }
467 
468 /* Copy a kernel node mask to user space */
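/*
 * Worked example: with MAX_NUMNODES = 64 the kernel mask is 8 bytes.  A
 * caller whose maxnode is 1025 asks for ALIGN(1024, 64) / 8 = 128 bytes,
 * so the 120 user-space bytes beyond the kernel mask are zeroed with
 * clear_user() and only the real 8 bytes are copied out.
 */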
469 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
470 			      void *nodes, unsigned nbytes)
471 {
472 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
473 
474 	if (copy > nbytes) {
475 		if (copy > PAGE_SIZE)
476 			return -EINVAL;
477 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
478 			return -EFAULT;
479 		copy = nbytes;
480 	}
481 	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
482 }
483 
484 /* Retrieve NUMA policy */
485 asmlinkage long sys_get_mempolicy(int __user *policy,
486 				  unsigned long __user *nmask,
487 				  unsigned long maxnode,
488 				  unsigned long addr, unsigned long flags)
489 {
490 	int err, pval;
491 	struct mm_struct *mm = current->mm;
492 	struct vm_area_struct *vma = NULL;
493 	struct mempolicy *pol = current->mempolicy;
494 
495 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
496 		return -EINVAL;
497 	if (nmask != NULL && maxnode < MAX_NUMNODES)
498 		return -EINVAL;
499 	if (flags & MPOL_F_ADDR) {
500 		down_read(&mm->mmap_sem);
501 		vma = find_vma_intersection(mm, addr, addr+1);
502 		if (!vma) {
503 			up_read(&mm->mmap_sem);
504 			return -EFAULT;
505 		}
506 		if (vma->vm_ops && vma->vm_ops->get_policy)
507 			pol = vma->vm_ops->get_policy(vma, addr);
508 		else
509 			pol = vma->vm_policy;
510 	} else if (addr)
511 		return -EINVAL;
512 
513 	if (!pol)
514 		pol = &default_policy;
515 
516 	if (flags & MPOL_F_NODE) {
517 		if (flags & MPOL_F_ADDR) {
518 			err = lookup_node(mm, addr);
519 			if (err < 0)
520 				goto out;
521 			pval = err;
522 		} else if (pol == current->mempolicy &&
523 				pol->policy == MPOL_INTERLEAVE) {
524 			pval = current->il_next;
525 		} else {
526 			err = -EINVAL;
527 			goto out;
528 		}
529 	} else
530 		pval = pol->policy;
531 
532 	if (vma) {
533 		up_read(&current->mm->mmap_sem);
534 		vma = NULL;
535 	}
536 
537 	if (policy && put_user(pval, policy))
538 		return -EFAULT;
539 
540 	err = 0;
541 	if (nmask) {
542 		DECLARE_BITMAP(nodes, MAX_NUMNODES);
543 		get_zonemask(pol, nodes);
544 		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
545 	}
546 
547  out:
548 	if (vma)
549 		up_read(&current->mm->mmap_sem);
550 	return err;
551 }
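/*
 * Illustration (user-space sketch, not part of the kernel build): with
 * MPOL_F_NODE | MPOL_F_ADDR the call reports not the policy but the node
 * that currently backs the page at the given address, faulting it in via
 * lookup_node() above if necessary:
 *
 *	int node;
 *	syscall(__NR_get_mempolicy, &node, NULL, 0,
 *		(unsigned long)addr, MPOL_F_NODE | MPOL_F_ADDR);
 */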
552 
553 #ifdef CONFIG_COMPAT
554 
555 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
556 				     compat_ulong_t __user *nmask,
557 				     compat_ulong_t maxnode,
558 				     compat_ulong_t addr, compat_ulong_t flags)
559 {
560 	long err;
561 	unsigned long __user *nm = NULL;
562 	unsigned long nr_bits, alloc_size;
563 	DECLARE_BITMAP(bm, MAX_NUMNODES);
564 
565 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
566 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
567 
568 	if (nmask)
569 		nm = compat_alloc_user_space(alloc_size);
570 
571 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
572 
573 	if (!err && nmask) {
574 		err = copy_from_user(bm, nm, alloc_size);
575 		/* ensure entire bitmap is zeroed */
576 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
577 		err |= compat_put_bitmap(nmask, bm, nr_bits);
578 	}
579 
580 	return err;
581 }
582 
583 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
584 				     compat_ulong_t maxnode)
585 {
586 	long err = 0;
587 	unsigned long __user *nm = NULL;
588 	unsigned long nr_bits, alloc_size;
589 	DECLARE_BITMAP(bm, MAX_NUMNODES);
590 
591 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
593 
594 	if (nmask) {
595 		err = compat_get_bitmap(bm, nmask, nr_bits);
596 		nm = compat_alloc_user_space(alloc_size);
597 		err |= copy_to_user(nm, bm, alloc_size);
598 	}
599 
600 	if (err)
601 		return -EFAULT;
602 
603 	return sys_set_mempolicy(mode, nm, nr_bits+1);
604 }
605 
606 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
607 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
608 			     compat_ulong_t maxnode, compat_ulong_t flags)
609 {
610 	long err = 0;
611 	unsigned long __user *nm = NULL;
612 	unsigned long nr_bits, alloc_size;
613 	DECLARE_BITMAP(bm, MAX_NUMNODES);
614 
615 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
616 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
617 
618 	if (nmask) {
619 		err = compat_get_bitmap(bm, nmask, nr_bits);
620 		nm = compat_alloc_user_space(alloc_size);
621 		err |= copy_to_user(nm, bm, alloc_size);
622 	}
623 
624 	if (err)
625 		return -EFAULT;
626 
627 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
628 }
629 
630 #endif
631 
632 /* Return effective policy for a VMA */
633 static struct mempolicy *
634 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
635 {
636 	struct mempolicy *pol = current->mempolicy;
637 
638 	if (vma) {
639 		if (vma->vm_ops && vma->vm_ops->get_policy)
640 			pol = vma->vm_ops->get_policy(vma, addr);
641 		else if (vma->vm_policy &&
642 				vma->vm_policy->policy != MPOL_DEFAULT)
643 			pol = vma->vm_policy;
644 	}
645 	if (!pol)
646 		pol = &default_policy;
647 	return pol;
648 }
649 
650 /* Return a zonelist representing a mempolicy */
651 static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
652 {
653 	int nd;
654 
655 	switch (policy->policy) {
656 	case MPOL_PREFERRED:
657 		nd = policy->v.preferred_node;
658 		if (nd < 0)
659 			nd = numa_node_id();
660 		break;
661 	case MPOL_BIND:
662 		/* Lower zones don't get a policy applied */
663 		/* Careful: current->mems_allowed might have moved */
664 		if (gfp >= policy_zone)
665 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
666 				return policy->v.zonelist;
667 		/*FALL THROUGH*/
668 	case MPOL_INTERLEAVE: /* should not happen */
669 	case MPOL_DEFAULT:
670 		nd = numa_node_id();
671 		break;
672 	default:
673 		nd = 0;
674 		BUG();
675 	}
676 	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
677 }
678 
679 /* Do dynamic interleaving for a process */
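/*
 * Example: with an interleave mask of {0, 3, 5} and il_next == 3, this
 * returns 3 and advances il_next to 5; the next call returns 5 and wraps
 * il_next back to node 0.
 */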
680 static unsigned interleave_nodes(struct mempolicy *policy)
681 {
682 	unsigned nid, next;
683 	struct task_struct *me = current;
684 
685 	nid = me->il_next;
686 	BUG_ON(nid >= MAX_NUMNODES);
687 	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
688 	if (next >= MAX_NUMNODES)
689 		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
690 	me->il_next = next;
691 	return nid;
692 }
693 
694 /* Do static interleaving for a VMA with known offset. */
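/*
 * Example: with an interleave mask of {0, 3, 5} and off == 7, nnodes is
 * 3 and target is 7 % 3 == 1, so the loop below stops on the second set
 * bit and the page is placed on node 3.  The choice depends only on the
 * offset, so a given page of a mapping always lands on the same node.
 */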
695 static unsigned offset_il_node(struct mempolicy *pol,
696 		struct vm_area_struct *vma, unsigned long off)
697 {
698 	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
699 	unsigned target = (unsigned)off % nnodes;
700 	int c;
701 	int nid = -1;
702 
703 	c = 0;
704 	do {
705 		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
706 		c++;
707 	} while (c <= target);
708 	BUG_ON(nid >= MAX_NUMNODES);
709 	BUG_ON(!test_bit(nid, pol->v.nodes));
710 	return nid;
711 }
712 
713 /* Allocate a page in interleaved policy.
714    Own path because it needs to do special accounting. */
715 static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
716 {
717 	struct zonelist *zl;
718 	struct page *page;
719 
720 	BUG_ON(!node_online(nid));
721 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 	page = __alloc_pages(gfp, order, zl);
723 	if (page && page_zone(page) == zl->zones[0]) {
724 		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
725 		put_cpu();
726 	}
727 	return page;
728 }
729 
730 /**
731  * 	alloc_page_vma	- Allocate a page for a VMA.
732  *
733  * 	@gfp:
734  *      %GFP_USER    user allocation.
735  *      %GFP_KERNEL  kernel allocations,
736  *      %GFP_HIGHMEM highmem/user allocations,
737  *      %GFP_FS      allocation should not call back into a file system.
738  *      %GFP_ATOMIC  don't sleep.
739  *
740  * 	@vma:  Pointer to VMA or NULL if not available.
741  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
742  *
743  * 	This function allocates a page from the kernel page pool and applies
744  *	a NUMA policy associated with the VMA or the current process.
745  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
746  *	mm_struct of the VMA to prevent it from going away. Should be used for
747  *	all allocations for pages that will be mapped into
748  * 	user space. Returns NULL when no page can be allocated.
749  *
750  *	Should be called with the mmap_sem of the vma held.
751  */
752 struct page *
753 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
754 {
755 	struct mempolicy *pol = get_vma_policy(vma, addr);
756 
757 	cpuset_update_current_mems_allowed();
758 
759 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
760 		unsigned nid;
761 		if (vma) {
762 			unsigned long off;
763 			BUG_ON(addr >= vma->vm_end);
764 			BUG_ON(addr < vma->vm_start);
765 			off = vma->vm_pgoff;
766 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
767 			nid = offset_il_node(pol, vma, off);
768 		} else {
769 			/* fall back to process interleaving */
770 			nid = interleave_nodes(pol);
771 		}
772 		return alloc_page_interleave(gfp, 0, nid);
773 	}
774 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
775 }
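/*
 * Sketch of a typical call site (illustrative, not quoted from
 * mm/memory.c): a fault handler that already holds the mmap_sem for
 * read, so the VMA and its policy cannot go away:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		goto oom;
 */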
776 
777 /**
778  * 	alloc_pages_current - Allocate pages.
779  *
780  *	@gfp:
781  *		%GFP_USER   user allocation,
782  *      	%GFP_KERNEL kernel allocation,
783  *      	%GFP_HIGHMEM highmem allocation,
784  *      	%GFP_FS     don't call back into a file system.
785  *      	%GFP_ATOMIC don't sleep.
786  *	@order: Power of two of allocation size in pages. 0 is a single page.
787  *
788  *	Allocate a page from the kernel page pool.  When not in
789  *	interrupt context, apply the current process' NUMA policy.
790  *	Returns NULL when no page can be allocated.
791  *
792  *	Don't call cpuset_update_current_mems_allowed() unless
793  *	1) it's ok to take cpuset_sem (can WAIT), and
794  *	2) allocating for current task (not interrupt).
795  */
796 struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
797 {
798 	struct mempolicy *pol = current->mempolicy;
799 
800 	if ((gfp & __GFP_WAIT) && !in_interrupt())
801 		cpuset_update_current_mems_allowed();
802 	if (!pol || in_interrupt())
803 		pol = &default_policy;
804 	if (pol->policy == MPOL_INTERLEAVE)
805 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
806 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
807 }
808 EXPORT_SYMBOL(alloc_pages_current);
809 
810 /* Slow path of a mempolicy copy */
811 struct mempolicy *__mpol_copy(struct mempolicy *old)
812 {
813 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
814 
815 	if (!new)
816 		return ERR_PTR(-ENOMEM);
817 	*new = *old;
818 	atomic_set(&new->refcnt, 1);
819 	if (new->policy == MPOL_BIND) {
820 		int sz = ksize(old->v.zonelist);
821 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
822 		if (!new->v.zonelist) {
823 			kmem_cache_free(policy_cache, new);
824 			return ERR_PTR(-ENOMEM);
825 		}
826 		memcpy(new->v.zonelist, old->v.zonelist, sz);
827 	}
828 	return new;
829 }
830 
831 /* Slow path of a mempolicy comparison */
832 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
833 {
834 	if (!a || !b)
835 		return 0;
836 	if (a->policy != b->policy)
837 		return 0;
838 	switch (a->policy) {
839 	case MPOL_DEFAULT:
840 		return 1;
841 	case MPOL_INTERLEAVE:
842 		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
843 	case MPOL_PREFERRED:
844 		return a->v.preferred_node == b->v.preferred_node;
845 	case MPOL_BIND: {
846 		int i;
847 		for (i = 0; a->v.zonelist->zones[i]; i++)
848 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
849 				return 0;
850 		return b->v.zonelist->zones[i] == NULL;
851 	}
852 	default:
853 		BUG();
854 		return 0;
855 	}
856 }
857 
858 /* Slow path of a mpol destructor. */
859 void __mpol_free(struct mempolicy *p)
860 {
861 	if (!atomic_dec_and_test(&p->refcnt))
862 		return;
863 	if (p->policy == MPOL_BIND)
864 		kfree(p->v.zonelist);
865 	p->policy = MPOL_DEFAULT;
866 	kmem_cache_free(policy_cache, p);
867 }
868 
869 /*
870  * Hugetlb policy. Same as above, just works with node numbers instead of
871  * zonelists.
872  */
873 
874 /* Find first node suitable for an allocation */
875 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
876 {
877 	struct mempolicy *pol = get_vma_policy(vma, addr);
878 
879 	switch (pol->policy) {
880 	case MPOL_DEFAULT:
881 		return numa_node_id();
882 	case MPOL_BIND:
883 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
884 	case MPOL_INTERLEAVE:
885 		return interleave_nodes(pol);
886 	case MPOL_PREFERRED:
887 		return pol->v.preferred_node >= 0 ?
888 				pol->v.preferred_node : numa_node_id();
889 	}
890 	BUG();
891 	return 0;
892 }
893 
894 /* Check whether a node is a valid secondary node for an allocation */
895 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
896 {
897 	struct mempolicy *pol = get_vma_policy(vma, addr);
898 
899 	switch (pol->policy) {
900 	case MPOL_PREFERRED:
901 	case MPOL_DEFAULT:
902 	case MPOL_INTERLEAVE:
903 		return 1;
904 	case MPOL_BIND: {
905 		struct zone **z;
906 		for (z = pol->v.zonelist->zones; *z; z++)
907 			if ((*z)->zone_pgdat->node_id == nid)
908 				return 1;
909 		return 0;
910 	}
911 	default:
912 		BUG();
913 		return 0;
914 	}
915 }
916 
917 /*
918  * Shared memory backing store policy support.
919  *
920  * Remember policies even when nobody has shared memory mapped.
921  * The policies are kept in Red-Black tree linked from the inode.
922  * They are protected by the sp->lock spinlock, which should be held
923  * for any accesses to the tree.
924  */
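/*
 * Example of the interval semantics used below: if the tree holds the
 * ranges [2,6) and [8,12), then sp_lookup(sp, 5, 9) first finds an
 * intersecting node and then walks back with rb_prev() to return the
 * lowest one, here [2,6); callers such as shared_policy_replace() then
 * iterate forward with rb_next() over every overlapping range.
 */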
925 
926 /* lookup first element intersecting start-end */
927 /* Caller holds sp->lock */
928 static struct sp_node *
929 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
930 {
931 	struct rb_node *n = sp->root.rb_node;
932 
933 	while (n) {
934 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
935 
936 		if (start >= p->end)
937 			n = n->rb_right;
938 		else if (end <= p->start)
939 			n = n->rb_left;
940 		else
941 			break;
942 	}
943 	if (!n)
944 		return NULL;
945 	for (;;) {
946 		struct sp_node *w = NULL;
947 		struct rb_node *prev = rb_prev(n);
948 		if (!prev)
949 			break;
950 		w = rb_entry(prev, struct sp_node, nd);
951 		if (w->end <= start)
952 			break;
953 		n = prev;
954 	}
955 	return rb_entry(n, struct sp_node, nd);
956 }
957 
958 /* Insert a new shared policy into the list. */
959 /* Caller holds sp->lock */
960 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
961 {
962 	struct rb_node **p = &sp->root.rb_node;
963 	struct rb_node *parent = NULL;
964 	struct sp_node *nd;
965 
966 	while (*p) {
967 		parent = *p;
968 		nd = rb_entry(parent, struct sp_node, nd);
969 		if (new->start < nd->start)
970 			p = &(*p)->rb_left;
971 		else if (new->end > nd->end)
972 			p = &(*p)->rb_right;
973 		else
974 			BUG();
975 	}
976 	rb_link_node(&new->nd, parent, p);
977 	rb_insert_color(&new->nd, &sp->root);
978 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
979 		 new->policy ? new->policy->policy : 0);
980 }
981 
982 /* Find shared policy intersecting idx */
983 struct mempolicy *
984 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
985 {
986 	struct mempolicy *pol = NULL;
987 	struct sp_node *sn;
988 
989 	if (!sp->root.rb_node)
990 		return NULL;
991 	spin_lock(&sp->lock);
992 	sn = sp_lookup(sp, idx, idx+1);
993 	if (sn) {
994 		mpol_get(sn->policy);
995 		pol = sn->policy;
996 	}
997 	spin_unlock(&sp->lock);
998 	return pol;
999 }
1000 
1001 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1002 {
1003 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1004 	rb_erase(&n->nd, &sp->root);
1005 	mpol_free(n->policy);
1006 	kmem_cache_free(sn_cache, n);
1007 }
1008 
1009 struct sp_node *
1010 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1011 {
1012 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1013 
1014 	if (!n)
1015 		return NULL;
1016 	n->start = start;
1017 	n->end = end;
1018 	mpol_get(pol);
1019 	n->policy = pol;
1020 	return n;
1021 }
1022 
1023 /* Replace a policy range. */
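/*
 * Worked example: suppose one node covers [0,10) with policy A and a new
 * node for [4,6) with policy B is installed.  The old node spans the
 * whole new range, so sp->lock is dropped to allocate a second node
 * (new2) for [6,10) carrying A, the lookup is restarted, the old node is
 * trimmed to [0,4), new2 is inserted, and finally the [4,6) node with B
 * is inserted, leaving [0,4) A, [4,6) B, [6,10) A.
 */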
1024 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1025 				 unsigned long end, struct sp_node *new)
1026 {
1027 	struct sp_node *n, *new2 = NULL;
1028 
1029 restart:
1030 	spin_lock(&sp->lock);
1031 	n = sp_lookup(sp, start, end);
1032 	/* Take care of old policies in the same range. */
1033 	while (n && n->start < end) {
1034 		struct rb_node *next = rb_next(&n->nd);
1035 		if (n->start >= start) {
1036 			if (n->end <= end)
1037 				sp_delete(sp, n);
1038 			else
1039 				n->start = end;
1040 		} else {
1041 			/* Old policy spanning whole new range. */
1042 			if (n->end > end) {
1043 				if (!new2) {
1044 					spin_unlock(&sp->lock);
1045 					new2 = sp_alloc(end, n->end, n->policy);
1046 					if (!new2)
1047 						return -ENOMEM;
1048 					goto restart;
1049 				}
1050 				n->end = start;
1051 				sp_insert(sp, new2);
1052 				new2 = NULL;
1053 				break;
1054 			} else
1055 				n->end = start;
1056 		}
1057 		if (!next)
1058 			break;
1059 		n = rb_entry(next, struct sp_node, nd);
1060 	}
1061 	if (new)
1062 		sp_insert(sp, new);
1063 	spin_unlock(&sp->lock);
1064 	if (new2) {
1065 		mpol_free(new2->policy);
1066 		kmem_cache_free(sn_cache, new2);
1067 	}
1068 	return 0;
1069 }
1070 
1071 int mpol_set_shared_policy(struct shared_policy *info,
1072 			struct vm_area_struct *vma, struct mempolicy *npol)
1073 {
1074 	int err;
1075 	struct sp_node *new = NULL;
1076 	unsigned long sz = vma_pages(vma);
1077 
1078 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1079 		 vma->vm_pgoff,
1080 		 sz, npol? npol->policy : -1,
1081 		npol ? npol->v.nodes[0] : -1);
1082 
1083 	if (npol) {
1084 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1085 		if (!new)
1086 			return -ENOMEM;
1087 	}
1088 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1089 	if (err && new)
1090 		kmem_cache_free(sn_cache, new);
1091 	return err;
1092 }
1093 
1094 /* Free a backing policy store on inode delete. */
1095 void mpol_free_shared_policy(struct shared_policy *p)
1096 {
1097 	struct sp_node *n;
1098 	struct rb_node *next;
1099 
1100 	if (!p->root.rb_node)
1101 		return;
1102 	spin_lock(&p->lock);
1103 	next = rb_first(&p->root);
1104 	while (next) {
1105 		n = rb_entry(next, struct sp_node, nd);
1106 		next = rb_next(&n->nd);
1107 		mpol_free(n->policy);
1108 		kmem_cache_free(sn_cache, n);
1109 	}
1110 	spin_unlock(&p->lock);
1111 	p->root = RB_ROOT;
1112 }
1113 
1114 /* assumes fs == KERNEL_DS */
1115 void __init numa_policy_init(void)
1116 {
1117 	policy_cache = kmem_cache_create("numa_policy",
1118 					 sizeof(struct mempolicy),
1119 					 0, SLAB_PANIC, NULL, NULL);
1120 
1121 	sn_cache = kmem_cache_create("shared_policy_node",
1122 				     sizeof(struct sp_node),
1123 				     0, SLAB_PANIC, NULL, NULL);
1124 
1125 	/* Set interleaving policy for system init. This way not all
1126 	   the data structures allocated at system boot end up in node zero. */
1127 
1128 	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1129 							MAX_NUMNODES) < 0)
1130 		printk("numa_policy_init: interleaving failed\n");
1131 }
1132 
1133 /* Reset policy of current process to default.
1134  * Assumes fs == KERNEL_DS */
1135 void numa_default_policy(void)
1136 {
1137 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1138 }
1139