xref: /openbmc/linux/mm/mempolicy.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
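
/*
 * Illustrative userspace sketch (not part of this file; assumes the
 * set_mempolicy(2) wrapper and MPOL_* constants from libnuma's <numaif.h>
 * and 64-bit unsigned longs).  Build a mask covering nodes 0 and 1 and
 * request interleaving:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8))
 *		perror("set_mempolicy");
 *
 * Most pages the task subsequently faults in are then spread round-robin
 * across nodes 0 and 1, subject to the fallback rules described above.
 */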
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/nodemask.h>
76 #include <linux/cpuset.h>
77 #include <linux/gfp.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/module.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 static void mpol_rebind_policy(struct mempolicy *pol,
114                                const nodemask_t *newmask);
115 
116 /* Do sanity checking on a policy */
117 static int mpol_check_policy(int mode, nodemask_t *nodes)
118 {
119 	int empty = nodes_empty(*nodes);
120 
121 	switch (mode) {
122 	case MPOL_DEFAULT:
123 		if (!empty)
124 			return -EINVAL;
125 		break;
126 	case MPOL_BIND:
127 	case MPOL_INTERLEAVE:
128 		/* Preferred will only use the first bit, but allow
129 		   more for now. */
130 		if (empty)
131 			return -EINVAL;
132 		break;
133 	}
134  	return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
135 }
136 
137 /* Generate a custom zonelist for the BIND policy. */
138 static struct zonelist *bind_zonelist(nodemask_t *nodes)
139 {
140 	struct zonelist *zl;
141 	int num, max, nd;
142 	enum zone_type k;
143 
144 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
145 	max++;			/* space for zlcache_ptr (see mmzone.h) */
146 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
147 	if (!zl)
148 		return ERR_PTR(-ENOMEM);
149 	zl->zlcache_ptr = NULL;
150 	num = 0;
151 	/* First put in the highest zones from all nodes, then all the next
152 	   lower zones etc. Avoid empty zones because the memory allocator
153 	   doesn't like them. If you implement node hot removal you
154 	   have to fix that. */
155 	k = MAX_NR_ZONES - 1;
156 	while (1) {
157 		for_each_node_mask(nd, *nodes) {
158 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
159 			if (z->present_pages > 0)
160 				zl->zones[num++] = z;
161 		}
162 		if (k == 0)
163 			break;
164 		k--;
165 	}
166 	if (num == 0) {
167 		kfree(zl);
168 		return ERR_PTR(-EINVAL);
169 	}
170 	zl->zones[num] = NULL;
171 	return zl;
172 }
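
/*
 * Worked example (illustrative): for nodes = {0,1} on a configuration
 * whose populated zones are DMA, NORMAL and HIGHMEM, the generated
 * zonelist order is
 *
 *	node0/HIGHMEM, node1/HIGHMEM, node0/NORMAL, node1/NORMAL,
 *	node0/DMA, node1/DMA
 *
 * with any zone whose present_pages is zero skipped and a NULL
 * terminator appended.
 */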
173 
174 /* Create a new policy */
175 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 {
177 	struct mempolicy *policy;
178 
179 	pr_debug("setting mode %d nodes[0] %lx\n",
180 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
181 
182 	if (mode == MPOL_DEFAULT)
183 		return NULL;
184 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
185 	if (!policy)
186 		return ERR_PTR(-ENOMEM);
187 	atomic_set(&policy->refcnt, 1);
188 	switch (mode) {
189 	case MPOL_INTERLEAVE:
190 		policy->v.nodes = *nodes;
191 		nodes_and(policy->v.nodes, policy->v.nodes,
192 					node_states[N_HIGH_MEMORY]);
193 		if (nodes_weight(policy->v.nodes) == 0) {
194 			kmem_cache_free(policy_cache, policy);
195 			return ERR_PTR(-EINVAL);
196 		}
197 		break;
198 	case MPOL_PREFERRED:
199 		policy->v.preferred_node = first_node(*nodes);
200 		if (policy->v.preferred_node >= MAX_NUMNODES)
201 			policy->v.preferred_node = -1;
202 		break;
203 	case MPOL_BIND:
204 		policy->v.zonelist = bind_zonelist(nodes);
205 		if (IS_ERR(policy->v.zonelist)) {
206 			void *error_code = policy->v.zonelist;
207 			kmem_cache_free(policy_cache, policy);
208 			return error_code;
209 		}
210 		break;
211 	}
212 	policy->policy = mode;
213 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
214 	return policy;
215 }
216 
217 static void gather_stats(struct page *, void *, int pte_dirty);
218 static void migrate_page_add(struct page *page, struct list_head *pagelist,
219 				unsigned long flags);
220 
221 /* Scan through pages checking if pages follow certain conditions. */
222 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
223 		unsigned long addr, unsigned long end,
224 		const nodemask_t *nodes, unsigned long flags,
225 		void *private)
226 {
227 	pte_t *orig_pte;
228 	pte_t *pte;
229 	spinlock_t *ptl;
230 
231 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
232 	do {
233 		struct page *page;
234 		int nid;
235 
236 		if (!pte_present(*pte))
237 			continue;
238 		page = vm_normal_page(vma, addr, *pte);
239 		if (!page)
240 			continue;
241 		/*
242 		 * The check for PageReserved here is important to avoid
243 		 * handling zero pages and other pages that may have been
244 		 * marked special by the system.
245 		 *
246 		 * If PageReserved were not checked here then, for example,
247 		 * the location of the zero page could influence
248 		 * MPOL_MF_STRICT, zero pages would be counted in
249 		 * the per-node stats, and there would be useless attempts
250 		 * to put zero pages on the migration list.
251 		 */
252 		if (PageReserved(page))
253 			continue;
254 		nid = page_to_nid(page);
255 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
256 			continue;
257 
258 		if (flags & MPOL_MF_STATS)
259 			gather_stats(page, private, pte_dirty(*pte));
260 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
261 			migrate_page_add(page, private, flags);
262 		else
263 			break;
264 	} while (pte++, addr += PAGE_SIZE, addr != end);
265 	pte_unmap_unlock(orig_pte, ptl);
266 	return addr != end;
267 }
268 
269 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
270 		unsigned long addr, unsigned long end,
271 		const nodemask_t *nodes, unsigned long flags,
272 		void *private)
273 {
274 	pmd_t *pmd;
275 	unsigned long next;
276 
277 	pmd = pmd_offset(pud, addr);
278 	do {
279 		next = pmd_addr_end(addr, end);
280 		if (pmd_none_or_clear_bad(pmd))
281 			continue;
282 		if (check_pte_range(vma, pmd, addr, next, nodes,
283 				    flags, private))
284 			return -EIO;
285 	} while (pmd++, addr = next, addr != end);
286 	return 0;
287 }
288 
289 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
290 		unsigned long addr, unsigned long end,
291 		const nodemask_t *nodes, unsigned long flags,
292 		void *private)
293 {
294 	pud_t *pud;
295 	unsigned long next;
296 
297 	pud = pud_offset(pgd, addr);
298 	do {
299 		next = pud_addr_end(addr, end);
300 		if (pud_none_or_clear_bad(pud))
301 			continue;
302 		if (check_pmd_range(vma, pud, addr, next, nodes,
303 				    flags, private))
304 			return -EIO;
305 	} while (pud++, addr = next, addr != end);
306 	return 0;
307 }
308 
309 static inline int check_pgd_range(struct vm_area_struct *vma,
310 		unsigned long addr, unsigned long end,
311 		const nodemask_t *nodes, unsigned long flags,
312 		void *private)
313 {
314 	pgd_t *pgd;
315 	unsigned long next;
316 
317 	pgd = pgd_offset(vma->vm_mm, addr);
318 	do {
319 		next = pgd_addr_end(addr, end);
320 		if (pgd_none_or_clear_bad(pgd))
321 			continue;
322 		if (check_pud_range(vma, pgd, addr, next, nodes,
323 				    flags, private))
324 			return -EIO;
325 	} while (pgd++, addr = next, addr != end);
326 	return 0;
327 }
328 
329 /*
330  * Check if all pages in a range are on a set of nodes.
331  * If pagelist != NULL then isolate pages from the LRU and
332  * put them on the pagelist.
333  */
334 static struct vm_area_struct *
335 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
336 		const nodemask_t *nodes, unsigned long flags, void *private)
337 {
338 	int err;
339 	struct vm_area_struct *first, *vma, *prev;
340 
341 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
342 
343 		err = migrate_prep();
344 		if (err)
345 			return ERR_PTR(err);
346 	}
347 
348 	first = find_vma(mm, start);
349 	if (!first)
350 		return ERR_PTR(-EFAULT);
351 	prev = NULL;
352 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
353 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
354 			if (!vma->vm_next && vma->vm_end < end)
355 				return ERR_PTR(-EFAULT);
356 			if (prev && prev->vm_end < vma->vm_start)
357 				return ERR_PTR(-EFAULT);
358 		}
359 		if (!is_vm_hugetlb_page(vma) &&
360 		    ((flags & MPOL_MF_STRICT) ||
361 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
362 				vma_migratable(vma)))) {
363 			unsigned long endvma = vma->vm_end;
364 
365 			if (endvma > end)
366 				endvma = end;
367 			if (vma->vm_start > start)
368 				start = vma->vm_start;
369 			err = check_pgd_range(vma, start, endvma, nodes,
370 						flags, private);
371 			if (err) {
372 				first = ERR_PTR(err);
373 				break;
374 			}
375 		}
376 		prev = vma;
377 	}
378 	return first;
379 }
380 
381 /* Apply policy to a single VMA */
382 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
383 {
384 	int err = 0;
385 	struct mempolicy *old = vma->vm_policy;
386 
387 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
388 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
389 		 vma->vm_ops, vma->vm_file,
390 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
391 
392 	if (vma->vm_ops && vma->vm_ops->set_policy)
393 		err = vma->vm_ops->set_policy(vma, new);
394 	if (!err) {
395 		mpol_get(new);
396 		vma->vm_policy = new;
397 		mpol_free(old);
398 	}
399 	return err;
400 }
401 
402 /* Step 2: apply policy to a range and do splits. */
403 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
404 		       unsigned long end, struct mempolicy *new)
405 {
406 	struct vm_area_struct *next;
407 	int err;
408 
409 	err = 0;
410 	for (; vma && vma->vm_start < end; vma = next) {
411 		next = vma->vm_next;
412 		if (vma->vm_start < start)
413 			err = split_vma(vma->vm_mm, vma, start, 1);
414 		if (!err && vma->vm_end > end)
415 			err = split_vma(vma->vm_mm, vma, end, 0);
416 		if (!err)
417 			err = policy_vma(vma, new);
418 		if (err)
419 			break;
420 	}
421 	return err;
422 }
423 
424 static int contextualize_policy(int mode, nodemask_t *nodes)
425 {
426 	if (!nodes)
427 		return 0;
428 
429 	cpuset_update_task_memory_state();
430 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
431 		return -EINVAL;
432 	return mpol_check_policy(mode, nodes);
433 }
434 
435 
436 /*
437  * Update task->flags PF_MEMPOLICY bit: set iff non-default
438  * mempolicy.  Allows more rapid checking of this (combined perhaps
439  * with other PF_* flag bits) on memory allocation hot code paths.
440  *
441  * If called from outside this file, the task 'p' should -only- be
442  * a newly forked child not yet visible on the task list, because
443  * manipulating the task flags of a visible task is not safe.
444  *
445  * The above limitation is why this routine has the funny name
446  * mpol_fix_fork_child_flag().
447  *
448  * It is also safe to call this with a task pointer of current,
449  * which the static wrapper mpol_set_task_struct_flag() does,
450  * for use within this file.
451  */
452 
453 void mpol_fix_fork_child_flag(struct task_struct *p)
454 {
455 	if (p->mempolicy)
456 		p->flags |= PF_MEMPOLICY;
457 	else
458 		p->flags &= ~PF_MEMPOLICY;
459 }
460 
461 static void mpol_set_task_struct_flag(void)
462 {
463 	mpol_fix_fork_child_flag(current);
464 }
465 
466 /* Set the process memory policy */
467 static long do_set_mempolicy(int mode, nodemask_t *nodes)
468 {
469 	struct mempolicy *new;
470 
471 	if (contextualize_policy(mode, nodes))
472 		return -EINVAL;
473 	new = mpol_new(mode, nodes);
474 	if (IS_ERR(new))
475 		return PTR_ERR(new);
476 	mpol_free(current->mempolicy);
477 	current->mempolicy = new;
478 	mpol_set_task_struct_flag();
479 	if (new && new->policy == MPOL_INTERLEAVE)
480 		current->il_next = first_node(new->v.nodes);
481 	return 0;
482 }
483 
484 /* Fill a node mask for a policy */
485 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
486 {
487 	int i;
488 
489 	nodes_clear(*nodes);
490 	switch (p->policy) {
491 	case MPOL_BIND:
492 		for (i = 0; p->v.zonelist->zones[i]; i++)
493 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
494 				*nodes);
495 		break;
496 	case MPOL_DEFAULT:
497 		break;
498 	case MPOL_INTERLEAVE:
499 		*nodes = p->v.nodes;
500 		break;
501 	case MPOL_PREFERRED:
502 		/* or use current node instead of memory_map? */
503 		if (p->v.preferred_node < 0)
504 			*nodes = node_states[N_HIGH_MEMORY];
505 		else
506 			node_set(p->v.preferred_node, *nodes);
507 		break;
508 	default:
509 		BUG();
510 	}
511 }
512 
513 static int lookup_node(struct mm_struct *mm, unsigned long addr)
514 {
515 	struct page *p;
516 	int err;
517 
518 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
519 	if (err >= 0) {
520 		err = page_to_nid(p);
521 		put_page(p);
522 	}
523 	return err;
524 }
525 
526 /* Retrieve NUMA policy */
527 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
528 			     unsigned long addr, unsigned long flags)
529 {
530 	int err;
531 	struct mm_struct *mm = current->mm;
532 	struct vm_area_struct *vma = NULL;
533 	struct mempolicy *pol = current->mempolicy;
534 
535 	cpuset_update_task_memory_state();
536 	if (flags &
537 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
538 		return -EINVAL;
539 
540 	if (flags & MPOL_F_MEMS_ALLOWED) {
541 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
542 			return -EINVAL;
543 		*policy = 0;	/* just so it's initialized */
544 		*nmask  = cpuset_current_mems_allowed;
545 		return 0;
546 	}
547 
548 	if (flags & MPOL_F_ADDR) {
549 		down_read(&mm->mmap_sem);
550 		vma = find_vma_intersection(mm, addr, addr+1);
551 		if (!vma) {
552 			up_read(&mm->mmap_sem);
553 			return -EFAULT;
554 		}
555 		if (vma->vm_ops && vma->vm_ops->get_policy)
556 			pol = vma->vm_ops->get_policy(vma, addr);
557 		else
558 			pol = vma->vm_policy;
559 	} else if (addr)
560 		return -EINVAL;
561 
562 	if (!pol)
563 		pol = &default_policy;
564 
565 	if (flags & MPOL_F_NODE) {
566 		if (flags & MPOL_F_ADDR) {
567 			err = lookup_node(mm, addr);
568 			if (err < 0)
569 				goto out;
570 			*policy = err;
571 		} else if (pol == current->mempolicy &&
572 				pol->policy == MPOL_INTERLEAVE) {
573 			*policy = current->il_next;
574 		} else {
575 			err = -EINVAL;
576 			goto out;
577 		}
578 	} else
579 		*policy = pol->policy;
580 
581 	if (vma) {
582 		up_read(&current->mm->mmap_sem);
583 		vma = NULL;
584 	}
585 
586 	err = 0;
587 	if (nmask)
588 		get_zonemask(pol, nmask);
589 
590  out:
591 	if (vma)
592 		up_read(&current->mm->mmap_sem);
593 	return err;
594 }
595 
596 #ifdef CONFIG_MIGRATION
597 /*
598  * page migration
599  */
600 static void migrate_page_add(struct page *page, struct list_head *pagelist,
601 				unsigned long flags)
602 {
603 	/*
604 	 * Avoid migrating a page that is shared with others.
605 	 */
606 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
607 		isolate_lru_page(page, pagelist);
608 }
609 
610 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
611 {
612 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
613 }
614 
615 /*
616  * Migrate pages from one node to a target node.
617  * Returns error or the number of pages not migrated.
618  */
619 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
620 			   int flags)
621 {
622 	nodemask_t nmask;
623 	LIST_HEAD(pagelist);
624 	int err = 0;
625 
626 	nodes_clear(nmask);
627 	node_set(source, nmask);
628 
629 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
630 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
631 
632 	if (!list_empty(&pagelist))
633 		err = migrate_pages(&pagelist, new_node_page, dest);
634 
635 	return err;
636 }
637 
638 /*
639  * Move pages between the two nodesets so as to preserve the physical
640  * layout as much as possible.
641  *
642  * Returns the number of pages that could not be moved.
643  */
644 int do_migrate_pages(struct mm_struct *mm,
645 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
646 {
647 	LIST_HEAD(pagelist);
648 	int busy = 0;
649 	int err = 0;
650 	nodemask_t tmp;
651 
652   	down_read(&mm->mmap_sem);
653 
654 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
655 	if (err)
656 		goto out;
657 
658 /*
659  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
660  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
661  * bit in 'tmp', and return that <source, dest> pair for migration.
662  * The pair of nodemasks 'to' and 'from' define the map.
663  *
664  * If no pair of bits is found that way, fallback to picking some
665  * pair of 'source' and 'dest' bits that are not the same.  If the
666  * 'source' and 'dest' bits are the same, this represents a node
667  * that will be migrating to itself, so no pages need move.
668  *
669  * If no bits are left in 'tmp', or if all remaining bits left
670  * in 'tmp' correspond to the same bit in 'to', return false
671  * (nothing left to migrate).
672  *
673  * This lets us pick a pair of nodes to migrate between, such that
674  * if possible the dest node is not already occupied by some other
675  * source node, minimizing the risk of overloading the memory on a
676  * node that would happen if we migrated incoming memory to a node
677  * before migrating outgoing memory from that same node.
678  *
679  * A single scan of tmp is sufficient.  As we go, we remember the
680  * most recent <s, d> pair that moved (s != d).  If we find a pair
681  * that not only moved, but what's better, moved to an empty slot
682  * (d is not set in tmp), then we break out then, with that pair.
683  * Otherwise when we finish scanning tmp, we at least have the
684  * most recent <s, d> pair that moved.  If we get all the way through
685  * the scan of tmp without finding any node that moved, much less
686  * moved to an empty node, then there is nothing left worth migrating.
687  */
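/*
 * Worked trace (illustrative): from_nodes = {0,1}, to_nodes = {1,2}.
 * Scanning tmp = {0,1}: s=0 remaps to d=1, but node 1 is still a pending
 * source, so keep looking; s=1 remaps to d=2 and node 2 is not in tmp,
 * so <1,2> is migrated first.  The second pass over tmp = {0} migrates
 * <0,1>.  Node 1 is thus drained before anything is moved onto it.
 */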
688 
689 	tmp = *from_nodes;
690 	while (!nodes_empty(tmp)) {
691 		int s,d;
692 		int source = -1;
693 		int dest = 0;
694 
695 		for_each_node_mask(s, tmp) {
696 			d = node_remap(s, *from_nodes, *to_nodes);
697 			if (s == d)
698 				continue;
699 
700 			source = s;	/* Node moved. Memorize */
701 			dest = d;
702 
703 			/* dest not in remaining from nodes? */
704 			if (!node_isset(dest, tmp))
705 				break;
706 		}
707 		if (source == -1)
708 			break;
709 
710 		node_clear(source, tmp);
711 		err = migrate_to_node(mm, source, dest, flags);
712 		if (err > 0)
713 			busy += err;
714 		if (err < 0)
715 			break;
716 	}
717 out:
718 	up_read(&mm->mmap_sem);
719 	if (err < 0)
720 		return err;
721 	return busy;
722 
723 }
724 
725 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
726 {
727 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
728 
729 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
730 					page_address_in_vma(page, vma));
731 }
732 #else
733 
734 static void migrate_page_add(struct page *page, struct list_head *pagelist,
735 				unsigned long flags)
736 {
737 }
738 
739 int do_migrate_pages(struct mm_struct *mm,
740 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
741 {
742 	return -ENOSYS;
743 }
744 
745 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
746 {
747 	return NULL;
748 }
749 #endif
750 
751 static long do_mbind(unsigned long start, unsigned long len,
752 		     unsigned long mode, nodemask_t *nmask,
753 		     unsigned long flags)
754 {
755 	struct vm_area_struct *vma;
756 	struct mm_struct *mm = current->mm;
757 	struct mempolicy *new;
758 	unsigned long end;
759 	int err;
760 	LIST_HEAD(pagelist);
761 
762 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
763 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
764 	    || mode > MPOL_MAX)
765 		return -EINVAL;
766 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
767 		return -EPERM;
768 
769 	if (start & ~PAGE_MASK)
770 		return -EINVAL;
771 
772 	if (mode == MPOL_DEFAULT)
773 		flags &= ~MPOL_MF_STRICT;
774 
775 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
776 	end = start + len;
777 
778 	if (end < start)
779 		return -EINVAL;
780 	if (end == start)
781 		return 0;
782 
783 	if (mpol_check_policy(mode, nmask))
784 		return -EINVAL;
785 
786 	new = mpol_new(mode, nmask);
787 	if (IS_ERR(new))
788 		return PTR_ERR(new);
789 
790 	/*
791 	 * If we are using the default policy then operation
792 	 * on discontinuous address spaces is okay after all
793 	 */
794 	if (!new)
795 		flags |= MPOL_MF_DISCONTIG_OK;
796 
797 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
798 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
799 
800 	down_write(&mm->mmap_sem);
801 	vma = check_range(mm, start, end, nmask,
802 			  flags | MPOL_MF_INVERT, &pagelist);
803 
804 	err = PTR_ERR(vma);
805 	if (!IS_ERR(vma)) {
806 		int nr_failed = 0;
807 
808 		err = mbind_range(vma, start, end, new);
809 
810 		if (!list_empty(&pagelist))
811 			nr_failed = migrate_pages(&pagelist, new_vma_page,
812 						(unsigned long)vma);
813 
814 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
815 			err = -EIO;
816 	}
817 
818 	up_write(&mm->mmap_sem);
819 	mpol_free(new);
820 	return err;
821 }
822 
823 /*
824  * User space interface with variable sized bitmaps for nodelists.
825  */
826 
827 /* Copy a node mask from user space. */
828 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
829 		     unsigned long maxnode)
830 {
831 	unsigned long k;
832 	unsigned long nlongs;
833 	unsigned long endmask;
834 
835 	--maxnode;
836 	nodes_clear(*nodes);
837 	if (maxnode == 0 || !nmask)
838 		return 0;
839 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
840 		return -EINVAL;
841 
842 	nlongs = BITS_TO_LONGS(maxnode);
843 	if ((maxnode % BITS_PER_LONG) == 0)
844 		endmask = ~0UL;
845 	else
846 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
847 
848 	/* When the user specifies more nodes than supported, just check
849 	   that the unsupported part is all zero. */
850 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
851 		if (nlongs > PAGE_SIZE/sizeof(long))
852 			return -EINVAL;
853 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
854 			unsigned long t;
855 			if (get_user(t, nmask + k))
856 				return -EFAULT;
857 			if (k == nlongs - 1) {
858 				if (t & endmask)
859 					return -EINVAL;
860 			} else if (t)
861 				return -EINVAL;
862 		}
863 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
864 		endmask = ~0UL;
865 	}
866 
867 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
868 		return -EFAULT;
869 	nodes_addr(*nodes)[nlongs-1] &= endmask;
870 	return 0;
871 }
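
/*
 * Worked example (illustrative, assuming 64-bit longs): a caller passing
 * maxnode == 65 leaves 64 bits after the decrement, so nlongs == 1 and
 * endmask == ~0UL -- one full long is copied.  With maxnode == 17 only
 * 16 bits remain, endmask == (1UL << 16) - 1, and bits 16..63 of the
 * copied word are cleared.
 */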
872 
873 /* Copy a kernel node mask to user space */
874 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
875 			      nodemask_t *nodes)
876 {
877 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
878 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
879 
880 	if (copy > nbytes) {
881 		if (copy > PAGE_SIZE)
882 			return -EINVAL;
883 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
884 			return -EFAULT;
885 		copy = nbytes;
886 	}
887 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
888 }
889 
890 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
891 			unsigned long mode,
892 			unsigned long __user *nmask, unsigned long maxnode,
893 			unsigned flags)
894 {
895 	nodemask_t nodes;
896 	int err;
897 
898 	err = get_nodes(&nodes, nmask, maxnode);
899 	if (err)
900 		return err;
901 #ifdef CONFIG_CPUSETS
902 	/* Restrict the nodes to the allowed nodes in the cpuset */
903 	nodes_and(nodes, nodes, current->mems_allowed);
904 #endif
905 	return do_mbind(start, len, mode, &nodes, flags);
906 }
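
/*
 * Illustrative userspace sketch (assumes the mbind(2) wrapper and MPOL_*
 * flags from libnuma's <numaif.h>; error handling trimmed):
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 *
 * Pages subsequently faulted into [buf, buf + len) are then restricted
 * to node 0, per the MPOL_BIND semantics described at the top of this
 * file.
 */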
907 
908 /* Set the process memory policy */
909 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
910 		unsigned long maxnode)
911 {
912 	int err;
913 	nodemask_t nodes;
914 
915 	if (mode < 0 || mode > MPOL_MAX)
916 		return -EINVAL;
917 	err = get_nodes(&nodes, nmask, maxnode);
918 	if (err)
919 		return err;
920 	return do_set_mempolicy(mode, &nodes);
921 }
922 
923 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
924 		const unsigned long __user *old_nodes,
925 		const unsigned long __user *new_nodes)
926 {
927 	struct mm_struct *mm;
928 	struct task_struct *task;
929 	nodemask_t old;
930 	nodemask_t new;
931 	nodemask_t task_nodes;
932 	int err;
933 
934 	err = get_nodes(&old, old_nodes, maxnode);
935 	if (err)
936 		return err;
937 
938 	err = get_nodes(&new, new_nodes, maxnode);
939 	if (err)
940 		return err;
941 
942 	/* Find the mm_struct */
943 	read_lock(&tasklist_lock);
944 	task = pid ? find_task_by_vpid(pid) : current;
945 	if (!task) {
946 		read_unlock(&tasklist_lock);
947 		return -ESRCH;
948 	}
949 	mm = get_task_mm(task);
950 	read_unlock(&tasklist_lock);
951 
952 	if (!mm)
953 		return -EINVAL;
954 
955 	/*
956 	 * Check if this process has the right to modify the specified
957 	 * process. The right exists if the process has administrative
958 	 * capabilities, superuser privileges or the same
959 	 * userid as the target process.
960 	 */
961 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
962 	    (current->uid != task->suid) && (current->uid != task->uid) &&
963 	    !capable(CAP_SYS_NICE)) {
964 		err = -EPERM;
965 		goto out;
966 	}
967 
968 	task_nodes = cpuset_mems_allowed(task);
969 	/* Is the user allowed to access the target nodes? */
970 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
971 		err = -EPERM;
972 		goto out;
973 	}
974 
975 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
976 		err = -EINVAL;
977 		goto out;
978 	}
979 
980 	err = security_task_movememory(task);
981 	if (err)
982 		goto out;
983 
984 	err = do_migrate_pages(mm, &old, &new,
985 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
986 out:
987 	mmput(mm);
988 	return err;
989 }
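
/*
 * Illustrative userspace sketch (assumes the migrate_pages(2) wrapper
 * from libnuma's <numaif.h>):
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 2;
 *	long left = migrate_pages(pid, sizeof(from) * 8, &from, &to);
 *
 * Per the syscall above, a positive return is the number of pages that
 * could not be moved and a negative return is an error.
 */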
990 
991 
992 /* Retrieve NUMA policy */
993 asmlinkage long sys_get_mempolicy(int __user *policy,
994 				unsigned long __user *nmask,
995 				unsigned long maxnode,
996 				unsigned long addr, unsigned long flags)
997 {
998 	int err;
999 	int uninitialized_var(pval);
1000 	nodemask_t nodes;
1001 
1002 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1003 		return -EINVAL;
1004 
1005 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1006 
1007 	if (err)
1008 		return err;
1009 
1010 	if (policy && put_user(pval, policy))
1011 		return -EFAULT;
1012 
1013 	if (nmask)
1014 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1015 
1016 	return err;
1017 }
1018 
1019 #ifdef CONFIG_COMPAT
1020 
1021 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1022 				     compat_ulong_t __user *nmask,
1023 				     compat_ulong_t maxnode,
1024 				     compat_ulong_t addr, compat_ulong_t flags)
1025 {
1026 	long err;
1027 	unsigned long __user *nm = NULL;
1028 	unsigned long nr_bits, alloc_size;
1029 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1030 
1031 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1032 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1033 
1034 	if (nmask)
1035 		nm = compat_alloc_user_space(alloc_size);
1036 
1037 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1038 
1039 	if (!err && nmask) {
1040 		err = copy_from_user(bm, nm, alloc_size);
1041 		/* ensure entire bitmap is zeroed */
1042 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1043 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1044 	}
1045 
1046 	return err;
1047 }
1048 
1049 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1050 				     compat_ulong_t maxnode)
1051 {
1052 	long err = 0;
1053 	unsigned long __user *nm = NULL;
1054 	unsigned long nr_bits, alloc_size;
1055 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1056 
1057 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1058 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1059 
1060 	if (nmask) {
1061 		err = compat_get_bitmap(bm, nmask, nr_bits);
1062 		nm = compat_alloc_user_space(alloc_size);
1063 		err |= copy_to_user(nm, bm, alloc_size);
1064 	}
1065 
1066 	if (err)
1067 		return -EFAULT;
1068 
1069 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1070 }
1071 
1072 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1073 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1074 			     compat_ulong_t maxnode, compat_ulong_t flags)
1075 {
1076 	long err = 0;
1077 	unsigned long __user *nm = NULL;
1078 	unsigned long nr_bits, alloc_size;
1079 	nodemask_t bm;
1080 
1081 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1082 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1083 
1084 	if (nmask) {
1085 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1086 		nm = compat_alloc_user_space(alloc_size);
1087 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1088 	}
1089 
1090 	if (err)
1091 		return -EFAULT;
1092 
1093 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1094 }
1095 
1096 #endif
1097 
1098 /*
1099  * get_vma_policy(@task, @vma, @addr)
1100  * @task - task for fallback if vma policy == default
1101  * @vma   - virtual memory area whose policy is sought
1102  * @addr  - address in @vma for shared policy lookup
1103  *
1104  * Returns effective policy for a VMA at specified address.
1105  * Falls back to @task or system default policy, as necessary.
1106  * Returned policy has extra reference count if shared, vma,
1107  * or some other task's policy [show_numa_maps() can pass
1108  * @task != current].  It is the caller's responsibility to
1109  * free the reference in these cases.
1110  */
1111 static struct mempolicy * get_vma_policy(struct task_struct *task,
1112 		struct vm_area_struct *vma, unsigned long addr)
1113 {
1114 	struct mempolicy *pol = task->mempolicy;
1115 	int shared_pol = 0;
1116 
1117 	if (vma) {
1118 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1119 			pol = vma->vm_ops->get_policy(vma, addr);
1120 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1121 		} else if (vma->vm_policy &&
1122 				vma->vm_policy->policy != MPOL_DEFAULT)
1123 			pol = vma->vm_policy;
1124 	}
1125 	if (!pol)
1126 		pol = &default_policy;
1127 	else if (!shared_pol && pol != current->mempolicy)
1128 		mpol_get(pol);	/* vma or other task's policy */
1129 	return pol;
1130 }
1131 
1132 /* Return a zonelist representing a mempolicy */
1133 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1134 {
1135 	int nd;
1136 
1137 	switch (policy->policy) {
1138 	case MPOL_PREFERRED:
1139 		nd = policy->v.preferred_node;
1140 		if (nd < 0)
1141 			nd = numa_node_id();
1142 		break;
1143 	case MPOL_BIND:
1144 		/* Lower zones don't get a policy applied */
1145 		/* Careful: current->mems_allowed might have moved */
1146 		if (gfp_zone(gfp) >= policy_zone)
1147 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1148 				return policy->v.zonelist;
1149 		/*FALL THROUGH*/
1150 	case MPOL_INTERLEAVE: /* should not happen */
1151 	case MPOL_DEFAULT:
1152 		nd = numa_node_id();
1153 		break;
1154 	default:
1155 		nd = 0;
1156 		BUG();
1157 	}
1158 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1159 }
1160 
1161 /* Do dynamic interleaving for a process */
1162 static unsigned interleave_nodes(struct mempolicy *policy)
1163 {
1164 	unsigned nid, next;
1165 	struct task_struct *me = current;
1166 
1167 	nid = me->il_next;
1168 	next = next_node(nid, policy->v.nodes);
1169 	if (next >= MAX_NUMNODES)
1170 		next = first_node(policy->v.nodes);
1171 	me->il_next = next;
1172 	return nid;
1173 }
1174 
1175 /*
1176  * Depending on the memory policy provide a node from which to allocate the
1177  * next slab entry.
1178  */
1179 unsigned slab_node(struct mempolicy *policy)
1180 {
1181 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1182 
1183 	switch (pol) {
1184 	case MPOL_INTERLEAVE:
1185 		return interleave_nodes(policy);
1186 
1187 	case MPOL_BIND:
1188 		/*
1189 		 * Follow bind policy behavior and start allocation at the
1190 		 * first node.
1191 		 */
1192 		return zone_to_nid(policy->v.zonelist->zones[0]);
1193 
1194 	case MPOL_PREFERRED:
1195 		if (policy->v.preferred_node >= 0)
1196 			return policy->v.preferred_node;
1197 		/* Fall through */
1198 
1199 	default:
1200 		return numa_node_id();
1201 	}
1202 }
1203 
1204 /* Do static interleaving for a VMA with known offset. */
1205 static unsigned offset_il_node(struct mempolicy *pol,
1206 		struct vm_area_struct *vma, unsigned long off)
1207 {
1208 	unsigned nnodes = nodes_weight(pol->v.nodes);
1209 	unsigned target = (unsigned)off % nnodes;
1210 	int c;
1211 	int nid = -1;
1212 
1213 	c = 0;
1214 	do {
1215 		nid = next_node(nid, pol->v.nodes);
1216 		c++;
1217 	} while (c <= target);
1218 	return nid;
1219 }
1220 
1221 /* Determine a node number for interleave */
1222 static inline unsigned interleave_nid(struct mempolicy *pol,
1223 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1224 {
1225 	if (vma) {
1226 		unsigned long off;
1227 
1228 		/*
1229 		 * for small pages, there is no difference between
1230 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1231 		 * for huge pages, since vm_pgoff is in units of small
1232 		 * pages, we need to shift off the always 0 bits to get
1233 		 * a useful offset.
1234 		 */
1235 		BUG_ON(shift < PAGE_SHIFT);
1236 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1237 		off += (addr - vma->vm_start) >> shift;
1238 		return offset_il_node(pol, vma, off);
1239 	} else
1240 		return interleave_nodes(pol);
1241 }
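
/*
 * Worked example (illustrative): a hugetlb VMA with vm_pgoff == 0 on a
 * system where HPAGE_SHIFT == 21 and PAGE_SHIFT == 12.  For an address
 * 4MB past vm_start, off == (4MB >> 21) == 2, so offset_il_node() picks
 * the third node of the interleave set (wrapping modulo the set size).
 */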
1242 
1243 #ifdef CONFIG_HUGETLBFS
1244 /*
1245  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1246  * @vma = virtual memory area whose policy is sought
1247  * @addr = address in @vma for shared policy lookup and interleave policy
1248  * @gfp_flags = for requested zone
1249  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1250  *
1251  * Returns a zonelist suitable for a huge page allocation.
1252  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1253  * If it is also a policy for which get_vma_policy() returns an extra
1254  * reference, we must hold that reference until after allocation.
1255  * In that case, return policy via @mpol so hugetlb allocation can drop
1256  * the reference.  For non-'BIND referenced policies, we can/do drop the
1257  * reference here, so the caller doesn't need to know about the special case
1258  * for default and current task policy.
1259  */
1260 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1261 				gfp_t gfp_flags, struct mempolicy **mpol)
1262 {
1263 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1264 	struct zonelist *zl;
1265 
1266 	*mpol = NULL;		/* probably no unref needed */
1267 	if (pol->policy == MPOL_INTERLEAVE) {
1268 		unsigned nid;
1269 
1270 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1271 		__mpol_free(pol);		/* finished with pol */
1272 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1273 	}
1274 
1275 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1276 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1277 		if (pol->policy != MPOL_BIND)
1278 			__mpol_free(pol);	/* finished with pol */
1279 		else
1280 			*mpol = pol;	/* unref needed after allocation */
1281 	}
1282 	return zl;
1283 }
1284 #endif
1285 
1286 /* Allocate a page in interleaved policy.
1287    Own path because it needs to do special accounting. */
1288 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1289 					unsigned nid)
1290 {
1291 	struct zonelist *zl;
1292 	struct page *page;
1293 
1294 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1295 	page = __alloc_pages(gfp, order, zl);
1296 	if (page && page_zone(page) == zl->zones[0])
1297 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1298 	return page;
1299 }
1300 
1301 /**
1302  * 	alloc_page_vma	- Allocate a page for a VMA.
1303  *
1304  * 	@gfp:
1305  *      %GFP_USER    user allocation.
1306  *      %GFP_KERNEL  kernel allocations,
1307  *      %GFP_HIGHMEM highmem/user allocations,
1308  *      %GFP_FS      allocation should not call back into a file system.
1309  *      %GFP_ATOMIC  don't sleep.
1310  *
1311  * 	@vma:  Pointer to VMA or NULL if not available.
1312  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1313  *
1314  * 	This function allocates a page from the kernel page pool and applies
1315  *	a NUMA policy associated with the VMA or the current process.
1316  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1317  *	mm_struct of the VMA to prevent it from going away. Should be used for
1318  *	all allocations for pages that will be mapped into
1319  * 	user space. Returns NULL when no page can be allocated.
1320  *
1321  *	Should be called with the mm_sem of the vma hold.
1322  *	Should be called with the mmap_sem of the vma held.
1323 struct page *
1324 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1325 {
1326 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1327 	struct zonelist *zl;
1328 
1329 	cpuset_update_task_memory_state();
1330 
1331 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1332 		unsigned nid;
1333 
1334 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1335 		return alloc_page_interleave(gfp, 0, nid);
1336 	}
1337 	zl = zonelist_policy(gfp, pol);
1338 	if (pol != &default_policy && pol != current->mempolicy) {
1339 		/*
1340 		 * slow path: ref counted policy -- shared or vma
1341 		 */
1342 		struct page *page =  __alloc_pages(gfp, 0, zl);
1343 		__mpol_free(pol);
1344 		return page;
1345 	}
1346 	/*
1347 	 * fast path:  default or task policy
1348 	 */
1349 	return __alloc_pages(gfp, 0, zl);
1350 }
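
/*
 * Illustrative caller sketch (assumed, simplified): an anonymous fault
 * path, with mmap_sem already held for read, might do
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * so that the new page honours the VMA's mempolicy, or the task policy
 * as a fallback.
 */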
1351 
1352 /**
1353  * 	alloc_pages_current - Allocate pages.
1354  *
1355  *	@gfp:
1356  *		%GFP_USER   user allocation,
1357  *      	%GFP_KERNEL kernel allocation,
1358  *      	%GFP_HIGHMEM highmem allocation,
1359  *      	%GFP_FS     don't call back into a file system.
1360  *      	%GFP_ATOMIC don't sleep.
1361  *	@order: Power of two of allocation size in pages. 0 is a single page.
1362  *
1363  *	Allocate a page from the kernel page pool and, when not in
1364  *	interrupt context, apply the current process' NUMA policy.
1365  *	Returns NULL when no page can be allocated.
1366  *
1367  *	Don't call cpuset_update_task_memory_state() unless
1368  *	1) it's ok to take cpuset_sem (can WAIT), and
1369  *	2) allocating for current task (not interrupt).
1370  */
1371 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1372 {
1373 	struct mempolicy *pol = current->mempolicy;
1374 
1375 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1376 		cpuset_update_task_memory_state();
1377 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1378 		pol = &default_policy;
1379 	if (pol->policy == MPOL_INTERLEAVE)
1380 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1381 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1382 }
1383 EXPORT_SYMBOL(alloc_pages_current);
1384 
1385 /*
1386  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1387  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1388  * with the mems_allowed returned by cpuset_mems_allowed().  This
1389  * keeps mempolicies cpuset relative after its cpuset moves.  See
1390  * further kernel/cpuset.c update_nodemask().
1391  */
1392 
1393 /* Slow path of a mempolicy copy */
1394 struct mempolicy *__mpol_copy(struct mempolicy *old)
1395 {
1396 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1397 
1398 	if (!new)
1399 		return ERR_PTR(-ENOMEM);
1400 	if (current_cpuset_is_being_rebound()) {
1401 		nodemask_t mems = cpuset_mems_allowed(current);
1402 		mpol_rebind_policy(old, &mems);
1403 	}
1404 	*new = *old;
1405 	atomic_set(&new->refcnt, 1);
1406 	if (new->policy == MPOL_BIND) {
1407 		int sz = ksize(old->v.zonelist);
1408 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1409 		if (!new->v.zonelist) {
1410 			kmem_cache_free(policy_cache, new);
1411 			return ERR_PTR(-ENOMEM);
1412 		}
1413 	}
1414 	return new;
1415 }
1416 
1417 /* Slow path of a mempolicy comparison */
1418 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1419 {
1420 	if (!a || !b)
1421 		return 0;
1422 	if (a->policy != b->policy)
1423 		return 0;
1424 	switch (a->policy) {
1425 	case MPOL_DEFAULT:
1426 		return 1;
1427 	case MPOL_INTERLEAVE:
1428 		return nodes_equal(a->v.nodes, b->v.nodes);
1429 	case MPOL_PREFERRED:
1430 		return a->v.preferred_node == b->v.preferred_node;
1431 	case MPOL_BIND: {
1432 		int i;
1433 		for (i = 0; a->v.zonelist->zones[i]; i++)
1434 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1435 				return 0;
1436 		return b->v.zonelist->zones[i] == NULL;
1437 	}
1438 	default:
1439 		BUG();
1440 		return 0;
1441 	}
1442 }
1443 
1444 /* Slow path of a mpol destructor. */
1445 void __mpol_free(struct mempolicy *p)
1446 {
1447 	if (!atomic_dec_and_test(&p->refcnt))
1448 		return;
1449 	if (p->policy == MPOL_BIND)
1450 		kfree(p->v.zonelist);
1451 	p->policy = MPOL_DEFAULT;
1452 	kmem_cache_free(policy_cache, p);
1453 }
1454 
1455 /*
1456  * Shared memory backing store policy support.
1457  *
1458  * Remember policies even when nobody has shared memory mapped.
1459  * The policies are kept in Red-Black tree linked from the inode.
1460  * They are protected by the sp->lock spinlock, which should be held
1461  * for any accesses to the tree.
1462  */
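
/*
 * Example (illustrative): after mpol_set_shared_policy() has installed an
 * interleave policy for file pages 0-15 and a bind policy for pages 16-31,
 * mpol_shared_policy_lookup(sp, 20) returns the bind policy with an extra
 * reference taken; the caller is expected to drop it with mpol_free().
 */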
1463 
1464 /* lookup first element intersecting start-end */
1465 /* Caller holds sp->lock */
1466 static struct sp_node *
1467 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1468 {
1469 	struct rb_node *n = sp->root.rb_node;
1470 
1471 	while (n) {
1472 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1473 
1474 		if (start >= p->end)
1475 			n = n->rb_right;
1476 		else if (end <= p->start)
1477 			n = n->rb_left;
1478 		else
1479 			break;
1480 	}
1481 	if (!n)
1482 		return NULL;
1483 	for (;;) {
1484 		struct sp_node *w = NULL;
1485 		struct rb_node *prev = rb_prev(n);
1486 		if (!prev)
1487 			break;
1488 		w = rb_entry(prev, struct sp_node, nd);
1489 		if (w->end <= start)
1490 			break;
1491 		n = prev;
1492 	}
1493 	return rb_entry(n, struct sp_node, nd);
1494 }
1495 
1496 /* Insert a new shared policy into the list. */
1497 /* Caller holds sp->lock */
1498 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1499 {
1500 	struct rb_node **p = &sp->root.rb_node;
1501 	struct rb_node *parent = NULL;
1502 	struct sp_node *nd;
1503 
1504 	while (*p) {
1505 		parent = *p;
1506 		nd = rb_entry(parent, struct sp_node, nd);
1507 		if (new->start < nd->start)
1508 			p = &(*p)->rb_left;
1509 		else if (new->end > nd->end)
1510 			p = &(*p)->rb_right;
1511 		else
1512 			BUG();
1513 	}
1514 	rb_link_node(&new->nd, parent, p);
1515 	rb_insert_color(&new->nd, &sp->root);
1516 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1517 		 new->policy ? new->policy->policy : 0);
1518 }
1519 
1520 /* Find shared policy intersecting idx */
1521 struct mempolicy *
1522 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1523 {
1524 	struct mempolicy *pol = NULL;
1525 	struct sp_node *sn;
1526 
1527 	if (!sp->root.rb_node)
1528 		return NULL;
1529 	spin_lock(&sp->lock);
1530 	sn = sp_lookup(sp, idx, idx+1);
1531 	if (sn) {
1532 		mpol_get(sn->policy);
1533 		pol = sn->policy;
1534 	}
1535 	spin_unlock(&sp->lock);
1536 	return pol;
1537 }
1538 
1539 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1540 {
1541 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1542 	rb_erase(&n->nd, &sp->root);
1543 	mpol_free(n->policy);
1544 	kmem_cache_free(sn_cache, n);
1545 }
1546 
1547 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1548 				struct mempolicy *pol)
1549 {
1550 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1551 
1552 	if (!n)
1553 		return NULL;
1554 	n->start = start;
1555 	n->end = end;
1556 	mpol_get(pol);
1557 	n->policy = pol;
1558 	return n;
1559 }
1560 
1561 /* Replace a policy range. */
1562 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1563 				 unsigned long end, struct sp_node *new)
1564 {
1565 	struct sp_node *n, *new2 = NULL;
1566 
1567 restart:
1568 	spin_lock(&sp->lock);
1569 	n = sp_lookup(sp, start, end);
1570 	/* Take care of old policies in the same range. */
1571 	while (n && n->start < end) {
1572 		struct rb_node *next = rb_next(&n->nd);
1573 		if (n->start >= start) {
1574 			if (n->end <= end)
1575 				sp_delete(sp, n);
1576 			else
1577 				n->start = end;
1578 		} else {
1579 			/* Old policy spanning whole new range. */
1580 			if (n->end > end) {
1581 				if (!new2) {
1582 					spin_unlock(&sp->lock);
1583 					new2 = sp_alloc(end, n->end, n->policy);
1584 					if (!new2)
1585 						return -ENOMEM;
1586 					goto restart;
1587 				}
1588 				n->end = start;
1589 				sp_insert(sp, new2);
1590 				new2 = NULL;
1591 				break;
1592 			} else
1593 				n->end = start;
1594 		}
1595 		if (!next)
1596 			break;
1597 		n = rb_entry(next, struct sp_node, nd);
1598 	}
1599 	if (new)
1600 		sp_insert(sp, new);
1601 	spin_unlock(&sp->lock);
1602 	if (new2) {
1603 		mpol_free(new2->policy);
1604 		kmem_cache_free(sn_cache, new2);
1605 	}
1606 	return 0;
1607 }
1608 
1609 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1610 				nodemask_t *policy_nodes)
1611 {
1612 	info->root = RB_ROOT;
1613 	spin_lock_init(&info->lock);
1614 
1615 	if (policy != MPOL_DEFAULT) {
1616 		struct mempolicy *newpol;
1617 
1618 		/* Falls back to MPOL_DEFAULT on any error */
1619 		newpol = mpol_new(policy, policy_nodes);
1620 		if (!IS_ERR(newpol)) {
1621 			/* Create pseudo-vma that contains just the policy */
1622 			struct vm_area_struct pvma;
1623 
1624 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1625 			/* Policy covers entire file */
1626 			pvma.vm_end = TASK_SIZE;
1627 			mpol_set_shared_policy(info, &pvma, newpol);
1628 			mpol_free(newpol);
1629 		}
1630 	}
1631 }
1632 
1633 int mpol_set_shared_policy(struct shared_policy *info,
1634 			struct vm_area_struct *vma, struct mempolicy *npol)
1635 {
1636 	int err;
1637 	struct sp_node *new = NULL;
1638 	unsigned long sz = vma_pages(vma);
1639 
1640 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1641 		 vma->vm_pgoff,
1642 		 sz, npol? npol->policy : -1,
1643 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1644 
1645 	if (npol) {
1646 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1647 		if (!new)
1648 			return -ENOMEM;
1649 	}
1650 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1651 	if (err && new)
1652 		kmem_cache_free(sn_cache, new);
1653 	return err;
1654 }
1655 
1656 /* Free a backing policy store on inode delete. */
1657 void mpol_free_shared_policy(struct shared_policy *p)
1658 {
1659 	struct sp_node *n;
1660 	struct rb_node *next;
1661 
1662 	if (!p->root.rb_node)
1663 		return;
1664 	spin_lock(&p->lock);
1665 	next = rb_first(&p->root);
1666 	while (next) {
1667 		n = rb_entry(next, struct sp_node, nd);
1668 		next = rb_next(&n->nd);
1669 		rb_erase(&n->nd, &p->root);
1670 		mpol_free(n->policy);
1671 		kmem_cache_free(sn_cache, n);
1672 	}
1673 	spin_unlock(&p->lock);
1674 }
1675 
1676 /* assumes fs == KERNEL_DS */
1677 void __init numa_policy_init(void)
1678 {
1679 	nodemask_t interleave_nodes;
1680 	unsigned long largest = 0;
1681 	int nid, prefer = 0;
1682 
1683 	policy_cache = kmem_cache_create("numa_policy",
1684 					 sizeof(struct mempolicy),
1685 					 0, SLAB_PANIC, NULL);
1686 
1687 	sn_cache = kmem_cache_create("shared_policy_node",
1688 				     sizeof(struct sp_node),
1689 				     0, SLAB_PANIC, NULL);
1690 
1691 	/*
1692 	 * Set interleaving policy for system init. Interleaving is only
1693 	 * enabled across suitably sized nodes (default is >= 16MB), or
1694 	 * fall back to the largest node if they're all smaller.
1695 	 */
1696 	nodes_clear(interleave_nodes);
1697 	for_each_node_state(nid, N_HIGH_MEMORY) {
1698 		unsigned long total_pages = node_present_pages(nid);
1699 
1700 		/* Preserve the largest node */
1701 		if (largest < total_pages) {
1702 			largest = total_pages;
1703 			prefer = nid;
1704 		}
1705 
1706 		/* Interleave this node? */
1707 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1708 			node_set(nid, interleave_nodes);
1709 	}
1710 
1711 	/* All too small, use the largest */
1712 	if (unlikely(nodes_empty(interleave_nodes)))
1713 		node_set(prefer, interleave_nodes);
1714 
1715 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1716 		printk("numa_policy_init: interleaving failed\n");
1717 }
1718 
1719 /* Reset policy of current process to default */
1720 void numa_default_policy(void)
1721 {
1722 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1723 }
1724 
1725 /* Migrate a policy to a different set of nodes */
1726 static void mpol_rebind_policy(struct mempolicy *pol,
1727 			       const nodemask_t *newmask)
1728 {
1729 	nodemask_t *mpolmask;
1730 	nodemask_t tmp;
1731 
1732 	if (!pol)
1733 		return;
1734 	mpolmask = &pol->cpuset_mems_allowed;
1735 	if (nodes_equal(*mpolmask, *newmask))
1736 		return;
1737 
1738 	switch (pol->policy) {
1739 	case MPOL_DEFAULT:
1740 		break;
1741 	case MPOL_INTERLEAVE:
1742 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1743 		pol->v.nodes = tmp;
1744 		*mpolmask = *newmask;
1745 		current->il_next = node_remap(current->il_next,
1746 						*mpolmask, *newmask);
1747 		break;
1748 	case MPOL_PREFERRED:
1749 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1750 						*mpolmask, *newmask);
1751 		*mpolmask = *newmask;
1752 		break;
1753 	case MPOL_BIND: {
1754 		nodemask_t nodes;
1755 		struct zone **z;
1756 		struct zonelist *zonelist;
1757 
1758 		nodes_clear(nodes);
1759 		for (z = pol->v.zonelist->zones; *z; z++)
1760 			node_set(zone_to_nid(*z), nodes);
1761 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1762 		nodes = tmp;
1763 
1764 		zonelist = bind_zonelist(&nodes);
1765 
1766 		/* If no mem, bind_zonelist() fails and we keep the old zonelist.
1767 		 * If that old zonelist has no remaining mems_allowed nodes,
1768 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1769 		 */
1770 
1771 		if (!IS_ERR(zonelist)) {
1772 			/* Good - got mem - substitute new zonelist */
1773 			kfree(pol->v.zonelist);
1774 			pol->v.zonelist = zonelist;
1775 		}
1776 		*mpolmask = *newmask;
1777 		break;
1778 	}
1779 	default:
1780 		BUG();
1781 		break;
1782 	}
1783 }
1784 
1785 /*
1786  * Wrapper for mpol_rebind_policy() that just requires task
1787  * pointer, and updates task mempolicy.
1788  */
1789 
1790 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1791 {
1792 	mpol_rebind_policy(tsk->mempolicy, new);
1793 }
1794 
1795 /*
1796  * Rebind each vma in mm to new nodemask.
1797  *
1798  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1799  */
1800 
1801 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1802 {
1803 	struct vm_area_struct *vma;
1804 
1805 	down_write(&mm->mmap_sem);
1806 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1807 		mpol_rebind_policy(vma->vm_policy, new);
1808 	up_write(&mm->mmap_sem);
1809 }
1810 
1811 /*
1812  * Display pages allocated per node and memory policy via /proc.
1813  */
1814 
1815 static const char * const policy_types[] =
1816 	{ "default", "prefer", "bind", "interleave" };
1817 
1818 /*
1819  * Convert a mempolicy into a string.
1820  * Returns the number of characters in buffer (if positive)
1821  * or an error (negative)
1822  */
1823 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1824 {
1825 	char *p = buffer;
1826 	int l;
1827 	nodemask_t nodes;
1828 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1829 
1830 	switch (mode) {
1831 	case MPOL_DEFAULT:
1832 		nodes_clear(nodes);
1833 		break;
1834 
1835 	case MPOL_PREFERRED:
1836 		nodes_clear(nodes);
1837 		node_set(pol->v.preferred_node, nodes);
1838 		break;
1839 
1840 	case MPOL_BIND:
1841 		get_zonemask(pol, &nodes);
1842 		break;
1843 
1844 	case MPOL_INTERLEAVE:
1845 		nodes = pol->v.nodes;
1846 		break;
1847 
1848 	default:
1849 		BUG();
1850 		return -EFAULT;
1851 	}
1852 
1853 	l = strlen(policy_types[mode]);
1854  	if (buffer + maxlen < p + l + 1)
1855  		return -ENOSPC;
1856 
1857 	strcpy(p, policy_types[mode]);
1858 	p += l;
1859 
1860 	if (!nodes_empty(nodes)) {
1861 		if (buffer + maxlen < p + 2)
1862 			return -ENOSPC;
1863 		*p++ = '=';
1864 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1865 	}
1866 	return p - buffer;
1867 }
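
/*
 * Example output (illustrative): an interleave policy over nodes 0-3 is
 * rendered as "interleave=0-3", a preferred policy for node 2 as
 * "prefer=2", and the default policy as just "default" (no node list).
 */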
1868 
1869 struct numa_maps {
1870 	unsigned long pages;
1871 	unsigned long anon;
1872 	unsigned long active;
1873 	unsigned long writeback;
1874 	unsigned long mapcount_max;
1875 	unsigned long dirty;
1876 	unsigned long swapcache;
1877 	unsigned long node[MAX_NUMNODES];
1878 };
1879 
1880 static void gather_stats(struct page *page, void *private, int pte_dirty)
1881 {
1882 	struct numa_maps *md = private;
1883 	int count = page_mapcount(page);
1884 
1885 	md->pages++;
1886 	if (pte_dirty || PageDirty(page))
1887 		md->dirty++;
1888 
1889 	if (PageSwapCache(page))
1890 		md->swapcache++;
1891 
1892 	if (PageActive(page))
1893 		md->active++;
1894 
1895 	if (PageWriteback(page))
1896 		md->writeback++;
1897 
1898 	if (PageAnon(page))
1899 		md->anon++;
1900 
1901 	if (count > md->mapcount_max)
1902 		md->mapcount_max = count;
1903 
1904 	md->node[page_to_nid(page)]++;
1905 }
1906 
1907 #ifdef CONFIG_HUGETLB_PAGE
1908 static void check_huge_range(struct vm_area_struct *vma,
1909 		unsigned long start, unsigned long end,
1910 		struct numa_maps *md)
1911 {
1912 	unsigned long addr;
1913 	struct page *page;
1914 
1915 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1916 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1917 		pte_t pte;
1918 
1919 		if (!ptep)
1920 			continue;
1921 
1922 		pte = *ptep;
1923 		if (pte_none(pte))
1924 			continue;
1925 
1926 		page = pte_page(pte);
1927 		if (!page)
1928 			continue;
1929 
1930 		gather_stats(page, md, pte_dirty(*ptep));
1931 	}
1932 }
1933 #else
1934 static inline void check_huge_range(struct vm_area_struct *vma,
1935 		unsigned long start, unsigned long end,
1936 		struct numa_maps *md)
1937 {
1938 }
1939 #endif
1940 
1941 int show_numa_map(struct seq_file *m, void *v)
1942 {
1943 	struct proc_maps_private *priv = m->private;
1944 	struct vm_area_struct *vma = v;
1945 	struct numa_maps *md;
1946 	struct file *file = vma->vm_file;
1947 	struct mm_struct *mm = vma->vm_mm;
1948 	struct mempolicy *pol;
1949 	int n;
1950 	char buffer[50];
1951 
1952 	if (!mm)
1953 		return 0;
1954 
1955 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1956 	if (!md)
1957 		return 0;
1958 
1959 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1960 	mpol_to_str(buffer, sizeof(buffer), pol);
1961 	/*
1962 	 * unref shared or other task's mempolicy
1963 	 */
1964 	if (pol != &default_policy && pol != current->mempolicy)
1965 		__mpol_free(pol);
1966 
1967 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1968 
1969 	if (file) {
1970 		seq_printf(m, " file=");
1971 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1972 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1973 		seq_printf(m, " heap");
1974 	} else if (vma->vm_start <= mm->start_stack &&
1975 			vma->vm_end >= mm->start_stack) {
1976 		seq_printf(m, " stack");
1977 	}
1978 
1979 	if (is_vm_hugetlb_page(vma)) {
1980 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1981 		seq_printf(m, " huge");
1982 	} else {
1983 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1984 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
1985 	}
1986 
1987 	if (!md->pages)
1988 		goto out;
1989 
1990 	if (md->anon)
1991 		seq_printf(m," anon=%lu",md->anon);
1992 
1993 	if (md->dirty)
1994 		seq_printf(m," dirty=%lu",md->dirty);
1995 
1996 	if (md->pages != md->anon && md->pages != md->dirty)
1997 		seq_printf(m, " mapped=%lu", md->pages);
1998 
1999 	if (md->mapcount_max > 1)
2000 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2001 
2002 	if (md->swapcache)
2003 		seq_printf(m," swapcache=%lu", md->swapcache);
2004 
2005 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2006 		seq_printf(m," active=%lu", md->active);
2007 
2008 	if (md->writeback)
2009 		seq_printf(m," writeback=%lu", md->writeback);
2010 
2011 	for_each_node_state(n, N_HIGH_MEMORY)
2012 		if (md->node[n])
2013 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2014 out:
2015 	seq_putc(m, '\n');
2016 	kfree(md);
2017 
2018 	if (m->count < m->size)
2019 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2020 	return 0;
2021 }
2022