xref: /openbmc/linux/mm/mempolicy.c (revision f42b3800)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 19  *                offset into the backing object or offset into the mapping
 20  *                for anonymous memory. For process policy a per-process
 21  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
 26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
 49  * on systems with highmem kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
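
/*
 * Example (illustrative sketch, not part of this file): how user space
 * exercises the policies described above through the mbind() and
 * set_mempolicy() system calls, assuming libnuma's <numaif.h> wrappers
 * and a machine where nodes 0 and 1 are present:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	void example(void)
 *	{
 *		unsigned long interleave = 0x3;	// nodes 0 and 1
 *		unsigned long prefer = 0x1;	// node 0
 *		size_t len = 1 << 20;
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		// VMA policy: interleave this mapping over nodes 0-1
 *		mbind(p, len, MPOL_INTERLEAVE, &interleave,
 *		      sizeof(interleave) * 8, 0);
 *
 *		// Process policy: prefer node 0 for future allocations
 *		set_mempolicy(MPOL_PREFERRED, &prefer, sizeof(prefer) * 8);
 *	}
 */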
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
 65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/nodemask.h>
76 #include <linux/cpuset.h>
77 #include <linux/gfp.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/module.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
 97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
 104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 static void mpol_rebind_policy(struct mempolicy *pol,
114                                const nodemask_t *newmask);
115 
116 /* Do sanity checking on a policy */
117 static int mpol_check_policy(int mode, nodemask_t *nodes)
118 {
119 	int was_empty, is_empty;
120 
121 	if (!nodes)
122 		return 0;
123 
124 	/*
 125 	 * "Contextualize" the incoming nodemask for cpusets:
 126 	 * Remember whether the incoming nodemask was empty.  If not,
127 	 * restrict the nodes to the allowed nodes in the cpuset.
128 	 * This is guaranteed to be a subset of nodes with memory.
129 	 */
130 	cpuset_update_task_memory_state();
131 	is_empty = was_empty = nodes_empty(*nodes);
132 	if (!was_empty) {
133 		nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134 		is_empty = nodes_empty(*nodes);	/* after "contextualization" */
135 	}
136 
137 	switch (mode) {
138 	case MPOL_DEFAULT:
139 		/*
140 		 * require caller to specify an empty nodemask
141 		 * before "contextualization"
142 		 */
143 		if (!was_empty)
144 			return -EINVAL;
145 		break;
146 	case MPOL_BIND:
147 	case MPOL_INTERLEAVE:
148 		/*
149 		 * require at least 1 valid node after "contextualization"
150 		 */
151 		if (is_empty)
152 			return -EINVAL;
153 		break;
154 	case MPOL_PREFERRED:
155 		/*
156 		 * Did caller specify invalid nodes?
157 		 * Don't silently accept this as "local allocation".
158 		 */
159 		if (!was_empty && is_empty)
160 			return -EINVAL;
161 		break;
162 	}
163 	return 0;
164 }
165 
166 /* Generate a custom zonelist for the BIND policy. */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
168 {
169 	struct zonelist *zl;
170 	int num, max, nd;
171 	enum zone_type k;
172 
173 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
174 	max++;			/* space for zlcache_ptr (see mmzone.h) */
175 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
176 	if (!zl)
177 		return ERR_PTR(-ENOMEM);
178 	zl->zlcache_ptr = NULL;
179 	num = 0;
180 	/* First put in the highest zones from all nodes, then all the next
181 	   lower zones etc. Avoid empty zones because the memory allocator
182 	   doesn't like them. If you implement node hot removal you
183 	   have to fix that. */
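	/* Illustrative ordering (assuming a two-node box with HIGHMEM):
	   the result is HIGHMEM(0), HIGHMEM(1), NORMAL(0), NORMAL(1),
	   DMA(0), DMA(1), with any empty zones skipped. */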
184 	k = MAX_NR_ZONES - 1;
185 	while (1) {
186 		for_each_node_mask(nd, *nodes) {
187 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
188 			if (z->present_pages > 0)
189 				zl->zones[num++] = z;
190 		}
191 		if (k == 0)
192 			break;
193 		k--;
194 	}
195 	if (num == 0) {
196 		kfree(zl);
197 		return ERR_PTR(-EINVAL);
198 	}
199 	zl->zones[num] = NULL;
200 	return zl;
201 }
202 
203 /* Create a new policy */
204 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
205 {
206 	struct mempolicy *policy;
207 
208 	pr_debug("setting mode %d nodes[0] %lx\n",
209 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
210 
211 	if (mode == MPOL_DEFAULT)
212 		return NULL;
213 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
214 	if (!policy)
215 		return ERR_PTR(-ENOMEM);
216 	atomic_set(&policy->refcnt, 1);
217 	switch (mode) {
218 	case MPOL_INTERLEAVE:
219 		policy->v.nodes = *nodes;
220 		if (nodes_weight(policy->v.nodes) == 0) {
221 			kmem_cache_free(policy_cache, policy);
222 			return ERR_PTR(-EINVAL);
223 		}
224 		break;
225 	case MPOL_PREFERRED:
226 		policy->v.preferred_node = first_node(*nodes);
227 		if (policy->v.preferred_node >= MAX_NUMNODES)
228 			policy->v.preferred_node = -1;
229 		break;
230 	case MPOL_BIND:
231 		policy->v.zonelist = bind_zonelist(nodes);
232 		if (IS_ERR(policy->v.zonelist)) {
233 			void *error_code = policy->v.zonelist;
234 			kmem_cache_free(policy_cache, policy);
235 			return error_code;
236 		}
237 		break;
238 	}
239 	policy->policy = mode;
240 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
241 	return policy;
242 }
243 
244 static void gather_stats(struct page *, void *, int pte_dirty);
245 static void migrate_page_add(struct page *page, struct list_head *pagelist,
246 				unsigned long flags);
247 
 248 /* Scan through pages, checking whether they satisfy the given conditions. */
249 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
250 		unsigned long addr, unsigned long end,
251 		const nodemask_t *nodes, unsigned long flags,
252 		void *private)
253 {
254 	pte_t *orig_pte;
255 	pte_t *pte;
256 	spinlock_t *ptl;
257 
258 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
259 	do {
260 		struct page *page;
261 		int nid;
262 
263 		if (!pte_present(*pte))
264 			continue;
265 		page = vm_normal_page(vma, addr, *pte);
266 		if (!page)
267 			continue;
268 		/*
269 		 * The check for PageReserved here is important to avoid
270 		 * handling zero pages and other pages that may have been
271 		 * marked special by the system.
272 		 *
 273 		 * If PageReserved were not checked here then, for example,
 274 		 * the location of the zero page could influence
 275 		 * MPOL_MF_STRICT, zero pages would be counted in
 276 		 * the per-node stats, and there would be useless attempts
 277 		 * to put zero pages on the migration list.
278 		 */
279 		if (PageReserved(page))
280 			continue;
281 		nid = page_to_nid(page);
282 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
283 			continue;
284 
285 		if (flags & MPOL_MF_STATS)
286 			gather_stats(page, private, pte_dirty(*pte));
287 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
288 			migrate_page_add(page, private, flags);
289 		else
290 			break;
291 	} while (pte++, addr += PAGE_SIZE, addr != end);
292 	pte_unmap_unlock(orig_pte, ptl);
293 	return addr != end;
294 }
295 
296 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
297 		unsigned long addr, unsigned long end,
298 		const nodemask_t *nodes, unsigned long flags,
299 		void *private)
300 {
301 	pmd_t *pmd;
302 	unsigned long next;
303 
304 	pmd = pmd_offset(pud, addr);
305 	do {
306 		next = pmd_addr_end(addr, end);
307 		if (pmd_none_or_clear_bad(pmd))
308 			continue;
309 		if (check_pte_range(vma, pmd, addr, next, nodes,
310 				    flags, private))
311 			return -EIO;
312 	} while (pmd++, addr = next, addr != end);
313 	return 0;
314 }
315 
316 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
317 		unsigned long addr, unsigned long end,
318 		const nodemask_t *nodes, unsigned long flags,
319 		void *private)
320 {
321 	pud_t *pud;
322 	unsigned long next;
323 
324 	pud = pud_offset(pgd, addr);
325 	do {
326 		next = pud_addr_end(addr, end);
327 		if (pud_none_or_clear_bad(pud))
328 			continue;
329 		if (check_pmd_range(vma, pud, addr, next, nodes,
330 				    flags, private))
331 			return -EIO;
332 	} while (pud++, addr = next, addr != end);
333 	return 0;
334 }
335 
336 static inline int check_pgd_range(struct vm_area_struct *vma,
337 		unsigned long addr, unsigned long end,
338 		const nodemask_t *nodes, unsigned long flags,
339 		void *private)
340 {
341 	pgd_t *pgd;
342 	unsigned long next;
343 
344 	pgd = pgd_offset(vma->vm_mm, addr);
345 	do {
346 		next = pgd_addr_end(addr, end);
347 		if (pgd_none_or_clear_bad(pgd))
348 			continue;
349 		if (check_pud_range(vma, pgd, addr, next, nodes,
350 				    flags, private))
351 			return -EIO;
352 	} while (pgd++, addr = next, addr != end);
353 	return 0;
354 }
355 
356 /*
357  * Check if all pages in a range are on a set of nodes.
358  * If pagelist != NULL then isolate pages from the LRU and
359  * put them on the pagelist.
360  */
361 static struct vm_area_struct *
362 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
363 		const nodemask_t *nodes, unsigned long flags, void *private)
364 {
365 	int err;
366 	struct vm_area_struct *first, *vma, *prev;
367 
368 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
369 
370 		err = migrate_prep();
371 		if (err)
372 			return ERR_PTR(err);
373 	}
374 
375 	first = find_vma(mm, start);
376 	if (!first)
377 		return ERR_PTR(-EFAULT);
378 	prev = NULL;
379 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
380 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
381 			if (!vma->vm_next && vma->vm_end < end)
382 				return ERR_PTR(-EFAULT);
383 			if (prev && prev->vm_end < vma->vm_start)
384 				return ERR_PTR(-EFAULT);
385 		}
386 		if (!is_vm_hugetlb_page(vma) &&
387 		    ((flags & MPOL_MF_STRICT) ||
388 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
389 				vma_migratable(vma)))) {
390 			unsigned long endvma = vma->vm_end;
391 
392 			if (endvma > end)
393 				endvma = end;
394 			if (vma->vm_start > start)
395 				start = vma->vm_start;
396 			err = check_pgd_range(vma, start, endvma, nodes,
397 						flags, private);
398 			if (err) {
399 				first = ERR_PTR(err);
400 				break;
401 			}
402 		}
403 		prev = vma;
404 	}
405 	return first;
406 }
407 
408 /* Apply policy to a single VMA */
409 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
410 {
411 	int err = 0;
412 	struct mempolicy *old = vma->vm_policy;
413 
414 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
415 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
416 		 vma->vm_ops, vma->vm_file,
417 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
418 
419 	if (vma->vm_ops && vma->vm_ops->set_policy)
420 		err = vma->vm_ops->set_policy(vma, new);
421 	if (!err) {
422 		mpol_get(new);
423 		vma->vm_policy = new;
424 		mpol_free(old);
425 	}
426 	return err;
427 }
428 
429 /* Step 2: apply policy to a range and do splits. */
430 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
431 		       unsigned long end, struct mempolicy *new)
432 {
433 	struct vm_area_struct *next;
434 	int err;
435 
436 	err = 0;
437 	for (; vma && vma->vm_start < end; vma = next) {
438 		next = vma->vm_next;
439 		if (vma->vm_start < start)
440 			err = split_vma(vma->vm_mm, vma, start, 1);
441 		if (!err && vma->vm_end > end)
442 			err = split_vma(vma->vm_mm, vma, end, 0);
443 		if (!err)
444 			err = policy_vma(vma, new);
445 		if (err)
446 			break;
447 	}
448 	return err;
449 }
450 
451 /*
452  * Update task->flags PF_MEMPOLICY bit: set iff non-default
453  * mempolicy.  Allows more rapid checking of this (combined perhaps
454  * with other PF_* flag bits) on memory allocation hot code paths.
455  *
456  * If called from outside this file, the task 'p' should -only- be
457  * a newly forked child not yet visible on the task list, because
458  * manipulating the task flags of a visible task is not safe.
459  *
460  * The above limitation is why this routine has the funny name
461  * mpol_fix_fork_child_flag().
462  *
463  * It is also safe to call this with a task pointer of current,
464  * which the static wrapper mpol_set_task_struct_flag() does,
465  * for use within this file.
466  */
467 
468 void mpol_fix_fork_child_flag(struct task_struct *p)
469 {
470 	if (p->mempolicy)
471 		p->flags |= PF_MEMPOLICY;
472 	else
473 		p->flags &= ~PF_MEMPOLICY;
474 }
475 
476 static void mpol_set_task_struct_flag(void)
477 {
478 	mpol_fix_fork_child_flag(current);
479 }
480 
481 /* Set the process memory policy */
482 static long do_set_mempolicy(int mode, nodemask_t *nodes)
483 {
484 	struct mempolicy *new;
485 
486 	if (mpol_check_policy(mode, nodes))
487 		return -EINVAL;
488 	new = mpol_new(mode, nodes);
489 	if (IS_ERR(new))
490 		return PTR_ERR(new);
491 	mpol_free(current->mempolicy);
492 	current->mempolicy = new;
493 	mpol_set_task_struct_flag();
494 	if (new && new->policy == MPOL_INTERLEAVE)
495 		current->il_next = first_node(new->v.nodes);
496 	return 0;
497 }
498 
499 /* Fill a zone bitmap for a policy */
500 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
501 {
502 	int i;
503 
504 	nodes_clear(*nodes);
505 	switch (p->policy) {
506 	case MPOL_BIND:
507 		for (i = 0; p->v.zonelist->zones[i]; i++)
508 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
509 				*nodes);
510 		break;
511 	case MPOL_DEFAULT:
512 		break;
513 	case MPOL_INTERLEAVE:
514 		*nodes = p->v.nodes;
515 		break;
516 	case MPOL_PREFERRED:
517 		/* or use current node instead of memory_map? */
518 		if (p->v.preferred_node < 0)
519 			*nodes = node_states[N_HIGH_MEMORY];
520 		else
521 			node_set(p->v.preferred_node, *nodes);
522 		break;
523 	default:
524 		BUG();
525 	}
526 }
527 
528 static int lookup_node(struct mm_struct *mm, unsigned long addr)
529 {
530 	struct page *p;
531 	int err;
532 
533 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
534 	if (err >= 0) {
535 		err = page_to_nid(p);
536 		put_page(p);
537 	}
538 	return err;
539 }
540 
541 /* Retrieve NUMA policy */
542 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
543 			     unsigned long addr, unsigned long flags)
544 {
545 	int err;
546 	struct mm_struct *mm = current->mm;
547 	struct vm_area_struct *vma = NULL;
548 	struct mempolicy *pol = current->mempolicy;
549 
550 	cpuset_update_task_memory_state();
551 	if (flags &
552 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
553 		return -EINVAL;
554 
555 	if (flags & MPOL_F_MEMS_ALLOWED) {
556 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
557 			return -EINVAL;
558 		*policy = 0;	/* just so it's initialized */
559 		*nmask  = cpuset_current_mems_allowed;
560 		return 0;
561 	}
562 
563 	if (flags & MPOL_F_ADDR) {
564 		down_read(&mm->mmap_sem);
565 		vma = find_vma_intersection(mm, addr, addr+1);
566 		if (!vma) {
567 			up_read(&mm->mmap_sem);
568 			return -EFAULT;
569 		}
570 		if (vma->vm_ops && vma->vm_ops->get_policy)
571 			pol = vma->vm_ops->get_policy(vma, addr);
572 		else
573 			pol = vma->vm_policy;
574 	} else if (addr)
575 		return -EINVAL;
576 
577 	if (!pol)
578 		pol = &default_policy;
579 
580 	if (flags & MPOL_F_NODE) {
581 		if (flags & MPOL_F_ADDR) {
582 			err = lookup_node(mm, addr);
583 			if (err < 0)
584 				goto out;
585 			*policy = err;
586 		} else if (pol == current->mempolicy &&
587 				pol->policy == MPOL_INTERLEAVE) {
588 			*policy = current->il_next;
589 		} else {
590 			err = -EINVAL;
591 			goto out;
592 		}
593 	} else
594 		*policy = pol->policy;
595 
596 	if (vma) {
597 		up_read(&current->mm->mmap_sem);
598 		vma = NULL;
599 	}
600 
601 	err = 0;
602 	if (nmask)
603 		get_zonemask(pol, nmask);
604 
605  out:
606 	if (vma)
607 		up_read(&current->mm->mmap_sem);
608 	return err;
609 }
610 
611 #ifdef CONFIG_MIGRATION
612 /*
613  * page migration
614  */
615 static void migrate_page_add(struct page *page, struct list_head *pagelist,
616 				unsigned long flags)
617 {
618 	/*
619 	 * Avoid migrating a page that is shared with others.
620 	 */
621 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
622 		isolate_lru_page(page, pagelist);
623 }
624 
625 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
626 {
627 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
628 }
629 
630 /*
631  * Migrate pages from one node to a target node.
632  * Returns error or the number of pages not migrated.
633  */
634 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
635 			   int flags)
636 {
637 	nodemask_t nmask;
638 	LIST_HEAD(pagelist);
639 	int err = 0;
640 
641 	nodes_clear(nmask);
642 	node_set(source, nmask);
643 
644 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
645 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
646 
647 	if (!list_empty(&pagelist))
648 		err = migrate_pages(&pagelist, new_node_page, dest);
649 
650 	return err;
651 }
652 
653 /*
654  * Move pages between the two nodesets so as to preserve the physical
655  * layout as much as possible.
656  *
 657  * Returns the number of pages that could not be moved.
658  */
659 int do_migrate_pages(struct mm_struct *mm,
660 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
661 {
662 	LIST_HEAD(pagelist);
663 	int busy = 0;
664 	int err = 0;
665 	nodemask_t tmp;
666 
667   	down_read(&mm->mmap_sem);
668 
669 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
670 	if (err)
671 		goto out;
672 
673 /*
674  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
675  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
676  * bit in 'tmp', and return that <source, dest> pair for migration.
677  * The pair of nodemasks 'to' and 'from' define the map.
678  *
679  * If no pair of bits is found that way, fallback to picking some
680  * pair of 'source' and 'dest' bits that are not the same.  If the
681  * 'source' and 'dest' bits are the same, this represents a node
682  * that will be migrating to itself, so no pages need move.
683  *
684  * If no bits are left in 'tmp', or if all remaining bits left
685  * in 'tmp' correspond to the same bit in 'to', return false
686  * (nothing left to migrate).
687  *
688  * This lets us pick a pair of nodes to migrate between, such that
689  * if possible the dest node is not already occupied by some other
 690  * source node, minimizing the risk of overloading the memory on a
 691  * node, which would happen if we migrated incoming memory to a node
 692  * before migrating the outgoing memory off that same node.
693  *
694  * A single scan of tmp is sufficient.  As we go, we remember the
695  * most recent <s, d> pair that moved (s != d).  If we find a pair
 696  * that not only moved, but, better still, moved to an empty slot
 697  * (d is not set in tmp), then we break out with that pair.
 698  * Otherwise, when we finish scanning tmp, we at least have the
699  * most recent <s, d> pair that moved.  If we get all the way through
700  * the scan of tmp without finding any node that moved, much less
701  * moved to an empty node, then there is nothing left worth migrating.
702  */
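/*
 * Worked example (illustrative): from_nodes = {0,1}, to_nodes = {1,2}.
 * The first pass over tmp remembers 0 -> 1 but keeps scanning and finds
 * 1 -> 2 with node 2 not in tmp, so node 1 is drained into node 2 first;
 * the next pass then moves node 0's pages onto the now-empty node 1.
 * Doing it in the other order would have piled node 0's pages onto
 * node 1 before node 1 itself had been emptied.
 */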
703 
704 	tmp = *from_nodes;
705 	while (!nodes_empty(tmp)) {
706 		int s,d;
707 		int source = -1;
708 		int dest = 0;
709 
710 		for_each_node_mask(s, tmp) {
711 			d = node_remap(s, *from_nodes, *to_nodes);
712 			if (s == d)
713 				continue;
714 
715 			source = s;	/* Node moved. Memorize */
716 			dest = d;
717 
718 			/* dest not in remaining from nodes? */
719 			if (!node_isset(dest, tmp))
720 				break;
721 		}
722 		if (source == -1)
723 			break;
724 
725 		node_clear(source, tmp);
726 		err = migrate_to_node(mm, source, dest, flags);
727 		if (err > 0)
728 			busy += err;
729 		if (err < 0)
730 			break;
731 	}
732 out:
733 	up_read(&mm->mmap_sem);
734 	if (err < 0)
735 		return err;
736 	return busy;
737 
738 }
739 
740 /*
741  * Allocate a new page for page migration based on vma policy.
 742  * Start by assuming the page is mapped by the vma pointed to by @private;
 743  * if not, search forward from there.  N.B., this assumes that the
744  * list of pages handed to migrate_pages()--which is how we get here--
745  * is in virtual address order.
746  */
747 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
748 {
749 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
750 	unsigned long uninitialized_var(address);
751 
752 	while (vma) {
753 		address = page_address_in_vma(page, vma);
754 		if (address != -EFAULT)
755 			break;
756 		vma = vma->vm_next;
757 	}
758 
759 	/*
760 	 * if !vma, alloc_page_vma() will use task or system default policy
761 	 */
762 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
763 }
764 #else
765 
766 static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 				unsigned long flags)
768 {
769 }
770 
771 int do_migrate_pages(struct mm_struct *mm,
772 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
773 {
774 	return -ENOSYS;
775 }
776 
777 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
778 {
779 	return NULL;
780 }
781 #endif
782 
783 static long do_mbind(unsigned long start, unsigned long len,
784 		     unsigned long mode, nodemask_t *nmask,
785 		     unsigned long flags)
786 {
787 	struct vm_area_struct *vma;
788 	struct mm_struct *mm = current->mm;
789 	struct mempolicy *new;
790 	unsigned long end;
791 	int err;
792 	LIST_HEAD(pagelist);
793 
794 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
795 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
796 	    || mode > MPOL_MAX)
797 		return -EINVAL;
798 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
799 		return -EPERM;
800 
801 	if (start & ~PAGE_MASK)
802 		return -EINVAL;
803 
804 	if (mode == MPOL_DEFAULT)
805 		flags &= ~MPOL_MF_STRICT;
806 
807 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
808 	end = start + len;
809 
810 	if (end < start)
811 		return -EINVAL;
812 	if (end == start)
813 		return 0;
814 
815 	if (mpol_check_policy(mode, nmask))
816 		return -EINVAL;
817 
818 	new = mpol_new(mode, nmask);
819 	if (IS_ERR(new))
820 		return PTR_ERR(new);
821 
822 	/*
 823 	 * If we are using the default policy then operating
 824 	 * on discontinuous address ranges is okay after all
825 	 */
826 	if (!new)
827 		flags |= MPOL_MF_DISCONTIG_OK;
828 
829 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
830 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
831 
832 	down_write(&mm->mmap_sem);
833 	vma = check_range(mm, start, end, nmask,
834 			  flags | MPOL_MF_INVERT, &pagelist);
835 
836 	err = PTR_ERR(vma);
837 	if (!IS_ERR(vma)) {
838 		int nr_failed = 0;
839 
840 		err = mbind_range(vma, start, end, new);
841 
842 		if (!list_empty(&pagelist))
843 			nr_failed = migrate_pages(&pagelist, new_vma_page,
844 						(unsigned long)vma);
845 
846 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
847 			err = -EIO;
848 	}
849 
850 	up_write(&mm->mmap_sem);
851 	mpol_free(new);
852 	return err;
853 }
854 
855 /*
856  * User space interface with variable sized bitmaps for nodelists.
857  */
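/*
 * For example (illustrative, assuming MAX_NUMNODES == 64): a caller may
 * pass maxnode == 1024 with a 128-byte bitmap; get_nodes() accepts that
 * as long as the unsupported high part of the mask is all zero, and only
 * the first BITS_TO_LONGS(MAX_NUMNODES) longs are actually copied.
 */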
858 
859 /* Copy a node mask from user space. */
860 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
861 		     unsigned long maxnode)
862 {
863 	unsigned long k;
864 	unsigned long nlongs;
865 	unsigned long endmask;
866 
867 	--maxnode;
868 	nodes_clear(*nodes);
869 	if (maxnode == 0 || !nmask)
870 		return 0;
871 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
872 		return -EINVAL;
873 
874 	nlongs = BITS_TO_LONGS(maxnode);
875 	if ((maxnode % BITS_PER_LONG) == 0)
876 		endmask = ~0UL;
877 	else
878 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
879 
 880 	/* When the user specifies more nodes than supported, just check
 881 	   that the unsupported part is all zero. */
882 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
883 		if (nlongs > PAGE_SIZE/sizeof(long))
884 			return -EINVAL;
885 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
886 			unsigned long t;
887 			if (get_user(t, nmask + k))
888 				return -EFAULT;
889 			if (k == nlongs - 1) {
890 				if (t & endmask)
891 					return -EINVAL;
892 			} else if (t)
893 				return -EINVAL;
894 		}
895 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
896 		endmask = ~0UL;
897 	}
898 
899 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
900 		return -EFAULT;
901 	nodes_addr(*nodes)[nlongs-1] &= endmask;
902 	return 0;
903 }
904 
905 /* Copy a kernel node mask to user space */
906 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
907 			      nodemask_t *nodes)
908 {
909 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
910 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
911 
912 	if (copy > nbytes) {
913 		if (copy > PAGE_SIZE)
914 			return -EINVAL;
915 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
916 			return -EFAULT;
917 		copy = nbytes;
918 	}
919 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
920 }
921 
922 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
923 			unsigned long mode,
924 			unsigned long __user *nmask, unsigned long maxnode,
925 			unsigned flags)
926 {
927 	nodemask_t nodes;
928 	int err;
929 
930 	err = get_nodes(&nodes, nmask, maxnode);
931 	if (err)
932 		return err;
933 	return do_mbind(start, len, mode, &nodes, flags);
934 }
935 
936 /* Set the process memory policy */
937 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
938 		unsigned long maxnode)
939 {
940 	int err;
941 	nodemask_t nodes;
942 
943 	if (mode < 0 || mode > MPOL_MAX)
944 		return -EINVAL;
945 	err = get_nodes(&nodes, nmask, maxnode);
946 	if (err)
947 		return err;
948 	return do_set_mempolicy(mode, &nodes);
949 }
950 
951 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
952 		const unsigned long __user *old_nodes,
953 		const unsigned long __user *new_nodes)
954 {
955 	struct mm_struct *mm;
956 	struct task_struct *task;
957 	nodemask_t old;
958 	nodemask_t new;
959 	nodemask_t task_nodes;
960 	int err;
961 
962 	err = get_nodes(&old, old_nodes, maxnode);
963 	if (err)
964 		return err;
965 
966 	err = get_nodes(&new, new_nodes, maxnode);
967 	if (err)
968 		return err;
969 
970 	/* Find the mm_struct */
971 	read_lock(&tasklist_lock);
972 	task = pid ? find_task_by_vpid(pid) : current;
973 	if (!task) {
974 		read_unlock(&tasklist_lock);
975 		return -ESRCH;
976 	}
977 	mm = get_task_mm(task);
978 	read_unlock(&tasklist_lock);
979 
980 	if (!mm)
981 		return -EINVAL;
982 
983 	/*
984 	 * Check if this process has the right to modify the specified
985 	 * process. The right exists if the process has administrative
986 	 * capabilities, superuser privileges or the same
987 	 * userid as the target process.
988 	 */
989 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
990 	    (current->uid != task->suid) && (current->uid != task->uid) &&
991 	    !capable(CAP_SYS_NICE)) {
992 		err = -EPERM;
993 		goto out;
994 	}
995 
996 	task_nodes = cpuset_mems_allowed(task);
997 	/* Is the user allowed to access the target nodes? */
998 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
999 		err = -EPERM;
1000 		goto out;
1001 	}
1002 
1003 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1004 		err = -EINVAL;
1005 		goto out;
1006 	}
1007 
1008 	err = security_task_movememory(task);
1009 	if (err)
1010 		goto out;
1011 
1012 	err = do_migrate_pages(mm, &old, &new,
1013 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1014 out:
1015 	mmput(mm);
1016 	return err;
1017 }
1018 
1019 
1020 /* Retrieve NUMA policy */
1021 asmlinkage long sys_get_mempolicy(int __user *policy,
1022 				unsigned long __user *nmask,
1023 				unsigned long maxnode,
1024 				unsigned long addr, unsigned long flags)
1025 {
1026 	int err;
1027 	int uninitialized_var(pval);
1028 	nodemask_t nodes;
1029 
1030 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1031 		return -EINVAL;
1032 
1033 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1034 
1035 	if (err)
1036 		return err;
1037 
1038 	if (policy && put_user(pval, policy))
1039 		return -EFAULT;
1040 
1041 	if (nmask)
1042 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1043 
1044 	return err;
1045 }
1046 
1047 #ifdef CONFIG_COMPAT
1048 
1049 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1050 				     compat_ulong_t __user *nmask,
1051 				     compat_ulong_t maxnode,
1052 				     compat_ulong_t addr, compat_ulong_t flags)
1053 {
1054 	long err;
1055 	unsigned long __user *nm = NULL;
1056 	unsigned long nr_bits, alloc_size;
1057 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1058 
1059 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1060 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1061 
1062 	if (nmask)
1063 		nm = compat_alloc_user_space(alloc_size);
1064 
1065 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1066 
1067 	if (!err && nmask) {
1068 		err = copy_from_user(bm, nm, alloc_size);
1069 		/* ensure entire bitmap is zeroed */
1070 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1071 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1072 	}
1073 
1074 	return err;
1075 }
1076 
1077 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1078 				     compat_ulong_t maxnode)
1079 {
1080 	long err = 0;
1081 	unsigned long __user *nm = NULL;
1082 	unsigned long nr_bits, alloc_size;
1083 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1084 
1085 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1086 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1087 
1088 	if (nmask) {
1089 		err = compat_get_bitmap(bm, nmask, nr_bits);
1090 		nm = compat_alloc_user_space(alloc_size);
1091 		err |= copy_to_user(nm, bm, alloc_size);
1092 	}
1093 
1094 	if (err)
1095 		return -EFAULT;
1096 
1097 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1098 }
1099 
1100 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1101 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1102 			     compat_ulong_t maxnode, compat_ulong_t flags)
1103 {
1104 	long err = 0;
1105 	unsigned long __user *nm = NULL;
1106 	unsigned long nr_bits, alloc_size;
1107 	nodemask_t bm;
1108 
1109 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1110 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1111 
1112 	if (nmask) {
1113 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1114 		nm = compat_alloc_user_space(alloc_size);
1115 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1116 	}
1117 
1118 	if (err)
1119 		return -EFAULT;
1120 
1121 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1122 }
1123 
1124 #endif
1125 
1126 /*
1127  * get_vma_policy(@task, @vma, @addr)
1128  * @task - task for fallback if vma policy == default
1129  * @vma   - virtual memory area whose policy is sought
1130  * @addr  - address in @vma for shared policy lookup
1131  *
1132  * Returns effective policy for a VMA at specified address.
1133  * Falls back to @task or system default policy, as necessary.
1134  * Returned policy has extra reference count if shared, vma,
1135  * or some other task's policy [show_numa_maps() can pass
1136  * @task != current].  It is the caller's responsibility to
1137  * free the reference in these cases.
1138  */
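 * The unref pattern used by callers in this file is:
 *
 *	if (pol != &default_policy && pol != current->mempolicy)
 *		__mpol_free(pol);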
1139 static struct mempolicy * get_vma_policy(struct task_struct *task,
1140 		struct vm_area_struct *vma, unsigned long addr)
1141 {
1142 	struct mempolicy *pol = task->mempolicy;
1143 	int shared_pol = 0;
1144 
1145 	if (vma) {
1146 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1147 			pol = vma->vm_ops->get_policy(vma, addr);
1148 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1149 		} else if (vma->vm_policy &&
1150 				vma->vm_policy->policy != MPOL_DEFAULT)
1151 			pol = vma->vm_policy;
1152 	}
1153 	if (!pol)
1154 		pol = &default_policy;
1155 	else if (!shared_pol && pol != current->mempolicy)
1156 		mpol_get(pol);	/* vma or other task's policy */
1157 	return pol;
1158 }
1159 
1160 /* Return a zonelist representing a mempolicy */
1161 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1162 {
1163 	int nd;
1164 
1165 	switch (policy->policy) {
1166 	case MPOL_PREFERRED:
1167 		nd = policy->v.preferred_node;
1168 		if (nd < 0)
1169 			nd = numa_node_id();
1170 		break;
1171 	case MPOL_BIND:
1172 		/* Lower zones don't get a policy applied */
1173 		/* Careful: current->mems_allowed might have moved */
1174 		if (gfp_zone(gfp) >= policy_zone)
1175 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1176 				return policy->v.zonelist;
1177 		/*FALL THROUGH*/
1178 	case MPOL_INTERLEAVE: /* should not happen */
1179 	case MPOL_DEFAULT:
1180 		nd = numa_node_id();
1181 		break;
1182 	default:
1183 		nd = 0;
1184 		BUG();
1185 	}
1186 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1187 }
1188 
1189 /* Do dynamic interleaving for a process */
1190 static unsigned interleave_nodes(struct mempolicy *policy)
1191 {
1192 	unsigned nid, next;
1193 	struct task_struct *me = current;
1194 
1195 	nid = me->il_next;
1196 	next = next_node(nid, policy->v.nodes);
1197 	if (next >= MAX_NUMNODES)
1198 		next = first_node(policy->v.nodes);
1199 	me->il_next = next;
1200 	return nid;
1201 }
1202 
1203 /*
1204  * Depending on the memory policy provide a node from which to allocate the
1205  * next slab entry.
1206  */
1207 unsigned slab_node(struct mempolicy *policy)
1208 {
1209 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1210 
1211 	switch (pol) {
1212 	case MPOL_INTERLEAVE:
1213 		return interleave_nodes(policy);
1214 
1215 	case MPOL_BIND:
1216 		/*
1217 		 * Follow bind policy behavior and start allocation at the
1218 		 * first node.
1219 		 */
1220 		return zone_to_nid(policy->v.zonelist->zones[0]);
1221 
1222 	case MPOL_PREFERRED:
1223 		if (policy->v.preferred_node >= 0)
1224 			return policy->v.preferred_node;
1225 		/* Fall through */
1226 
1227 	default:
1228 		return numa_node_id();
1229 	}
1230 }
1231 
1232 /* Do static interleaving for a VMA with known offset. */
1233 static unsigned offset_il_node(struct mempolicy *pol,
1234 		struct vm_area_struct *vma, unsigned long off)
1235 {
1236 	unsigned nnodes = nodes_weight(pol->v.nodes);
1237 	unsigned target = (unsigned)off % nnodes;
1238 	int c;
1239 	int nid = -1;
1240 
1241 	c = 0;
1242 	do {
1243 		nid = next_node(nid, pol->v.nodes);
1244 		c++;
1245 	} while (c <= target);
1246 	return nid;
1247 }
1248 
1249 /* Determine a node number for interleave */
1250 static inline unsigned interleave_nid(struct mempolicy *pol,
1251 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1252 {
1253 	if (vma) {
1254 		unsigned long off;
1255 
1256 		/*
1257 		 * for small pages, there is no difference between
1258 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1259 		 * for huge pages, since vm_pgoff is in units of small
1260 		 * pages, we need to shift off the always 0 bits to get
1261 		 * a useful offset.
1262 		 */
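		/*
		 * Illustrative example (assuming PAGE_SHIFT == 12 and 2MB huge
		 * pages, so shift == 21): a mapping with vm_pgoff == 512 gives
		 * off = 512 >> 9 = 1 at addr == vm_start, i.e. the second huge
		 * page of the object selects the second node of the interleave
		 * mask (when the mask holds at least two nodes).
		 */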
1263 		BUG_ON(shift < PAGE_SHIFT);
1264 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1265 		off += (addr - vma->vm_start) >> shift;
1266 		return offset_il_node(pol, vma, off);
1267 	} else
1268 		return interleave_nodes(pol);
1269 }
1270 
1271 #ifdef CONFIG_HUGETLBFS
1272 /*
1273  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1274  * @vma = virtual memory area whose policy is sought
1275  * @addr = address in @vma for shared policy lookup and interleave policy
1276  * @gfp_flags = for requested zone
1277  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1278  *
1279  * Returns a zonelist suitable for a huge page allocation.
1280  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1281  * If it is also a policy for which get_vma_policy() returns an extra
1282  * reference, we must hold that reference until after allocation.
1283  * In that case, return policy via @mpol so hugetlb allocation can drop
1284  * the reference.  For non-'BIND referenced policies, we can/do drop the
1285  * reference here, so the caller doesn't need to know about the special case
1286  * for default and current task policy.
1287  */
1288 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1289 				gfp_t gfp_flags, struct mempolicy **mpol)
1290 {
1291 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1292 	struct zonelist *zl;
1293 
1294 	*mpol = NULL;		/* probably no unref needed */
1295 	if (pol->policy == MPOL_INTERLEAVE) {
1296 		unsigned nid;
1297 
1298 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1299 		if (unlikely(pol != &default_policy &&
1300 				pol != current->mempolicy))
1301 			__mpol_free(pol);	/* finished with pol */
1302 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1303 	}
1304 
1305 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1306 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1307 		if (pol->policy != MPOL_BIND)
1308 			__mpol_free(pol);	/* finished with pol */
1309 		else
1310 			*mpol = pol;	/* unref needed after allocation */
1311 	}
1312 	return zl;
1313 }
1314 #endif
1315 
1316 /* Allocate a page in interleaved policy.
1317    Own path because it needs to do special accounting. */
1318 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1319 					unsigned nid)
1320 {
1321 	struct zonelist *zl;
1322 	struct page *page;
1323 
1324 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1325 	page = __alloc_pages(gfp, order, zl);
1326 	if (page && page_zone(page) == zl->zones[0])
1327 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1328 	return page;
1329 }
1330 
1331 /**
1332  * 	alloc_page_vma	- Allocate a page for a VMA.
1333  *
1334  * 	@gfp:
1335  *      %GFP_USER    user allocation.
1336  *      %GFP_KERNEL  kernel allocations,
1337  *      %GFP_HIGHMEM highmem/user allocations,
1338  *      %GFP_FS      allocation should not call back into a file system.
1339  *      %GFP_ATOMIC  don't sleep.
1340  *
1341  * 	@vma:  Pointer to VMA or NULL if not available.
1342  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1343  *
1344  * 	This function allocates a page from the kernel page pool and applies
1345  *	a NUMA policy associated with the VMA or the current process.
1346  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1347  *	mm_struct of the VMA to prevent it from going away. Should be used for
1348  *	all allocations for pages that will be mapped into
1349  * 	user space. Returns NULL when no page can be allocated.
1350  *
 1351  *	Should be called with the mmap_sem of the vma's mm held.
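 *
 *	An illustrative (hypothetical) fault-path caller:
 *
 *		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *		if (!page)
 *			return VM_FAULT_OOM;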
1352  */
1353 struct page *
1354 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1355 {
1356 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1357 	struct zonelist *zl;
1358 
1359 	cpuset_update_task_memory_state();
1360 
1361 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1362 		unsigned nid;
1363 
1364 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1365 		if (unlikely(pol != &default_policy &&
1366 				pol != current->mempolicy))
1367 			__mpol_free(pol);	/* finished with pol */
1368 		return alloc_page_interleave(gfp, 0, nid);
1369 	}
1370 	zl = zonelist_policy(gfp, pol);
1371 	if (pol != &default_policy && pol != current->mempolicy) {
1372 		/*
1373 		 * slow path: ref counted policy -- shared or vma
1374 		 */
1375 		struct page *page =  __alloc_pages(gfp, 0, zl);
1376 		__mpol_free(pol);
1377 		return page;
1378 	}
1379 	/*
1380 	 * fast path:  default or task policy
1381 	 */
1382 	return __alloc_pages(gfp, 0, zl);
1383 }
1384 
1385 /**
1386  * 	alloc_pages_current - Allocate pages.
1387  *
1388  *	@gfp:
1389  *		%GFP_USER   user allocation,
1390  *      	%GFP_KERNEL kernel allocation,
1391  *      	%GFP_HIGHMEM highmem allocation,
1392  *      	%GFP_FS     don't call back into a file system.
1393  *      	%GFP_ATOMIC don't sleep.
1394  *	@order: Power of two of allocation size in pages. 0 is a single page.
1395  *
 1396  *	Allocate a page from the kernel page pool and, when not in
 1397  *	interrupt context, apply the current process' NUMA policy.
1398  *	Returns NULL when no page can be allocated.
1399  *
1400  *	Don't call cpuset_update_task_memory_state() unless
1401  *	1) it's ok to take cpuset_sem (can WAIT), and
1402  *	2) allocating for current task (not interrupt).
1403  */
1404 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1405 {
1406 	struct mempolicy *pol = current->mempolicy;
1407 
1408 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1409 		cpuset_update_task_memory_state();
1410 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1411 		pol = &default_policy;
1412 	if (pol->policy == MPOL_INTERLEAVE)
1413 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1414 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1415 }
1416 EXPORT_SYMBOL(alloc_pages_current);
1417 
1418 /*
1419  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 1420  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1421  * with the mems_allowed returned by cpuset_mems_allowed().  This
1422  * keeps mempolicies cpuset relative after its cpuset moves.  See
1423  * further kernel/cpuset.c update_nodemask().
1424  */
1425 
1426 /* Slow path of a mempolicy copy */
1427 struct mempolicy *__mpol_copy(struct mempolicy *old)
1428 {
1429 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1430 
1431 	if (!new)
1432 		return ERR_PTR(-ENOMEM);
1433 	if (current_cpuset_is_being_rebound()) {
1434 		nodemask_t mems = cpuset_mems_allowed(current);
1435 		mpol_rebind_policy(old, &mems);
1436 	}
1437 	*new = *old;
1438 	atomic_set(&new->refcnt, 1);
1439 	if (new->policy == MPOL_BIND) {
1440 		int sz = ksize(old->v.zonelist);
1441 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1442 		if (!new->v.zonelist) {
1443 			kmem_cache_free(policy_cache, new);
1444 			return ERR_PTR(-ENOMEM);
1445 		}
1446 	}
1447 	return new;
1448 }
1449 
1450 /* Slow path of a mempolicy comparison */
1451 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1452 {
1453 	if (!a || !b)
1454 		return 0;
1455 	if (a->policy != b->policy)
1456 		return 0;
1457 	switch (a->policy) {
1458 	case MPOL_DEFAULT:
1459 		return 1;
1460 	case MPOL_INTERLEAVE:
1461 		return nodes_equal(a->v.nodes, b->v.nodes);
1462 	case MPOL_PREFERRED:
1463 		return a->v.preferred_node == b->v.preferred_node;
1464 	case MPOL_BIND: {
1465 		int i;
1466 		for (i = 0; a->v.zonelist->zones[i]; i++)
1467 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1468 				return 0;
1469 		return b->v.zonelist->zones[i] == NULL;
1470 	}
1471 	default:
1472 		BUG();
1473 		return 0;
1474 	}
1475 }
1476 
1477 /* Slow path of a mpol destructor. */
1478 void __mpol_free(struct mempolicy *p)
1479 {
1480 	if (!atomic_dec_and_test(&p->refcnt))
1481 		return;
1482 	if (p->policy == MPOL_BIND)
1483 		kfree(p->v.zonelist);
1484 	p->policy = MPOL_DEFAULT;
1485 	kmem_cache_free(policy_cache, p);
1486 }
1487 
1488 /*
1489  * Shared memory backing store policy support.
1490  *
1491  * Remember policies even when nobody has shared memory mapped.
 1492  * The policies are kept in a red-black tree linked from the inode.
1493  * They are protected by the sp->lock spinlock, which should be held
1494  * for any accesses to the tree.
1495  */
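/*
 * Typical use by a filesystem (illustrative summary; the actual call
 * sites live in the filesystem, e.g. tmpfs): mpol_shared_policy_init()
 * when the inode is set up, mpol_set_shared_policy() from the vma's
 * ->set_policy operation, mpol_shared_policy_lookup() when allocating
 * a page at a given index, and mpol_free_shared_policy() on inode
 * deletion.
 */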
1496 
1497 /* lookup first element intersecting start-end */
1498 /* Caller holds sp->lock */
1499 static struct sp_node *
1500 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1501 {
1502 	struct rb_node *n = sp->root.rb_node;
1503 
1504 	while (n) {
1505 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1506 
1507 		if (start >= p->end)
1508 			n = n->rb_right;
1509 		else if (end <= p->start)
1510 			n = n->rb_left;
1511 		else
1512 			break;
1513 	}
1514 	if (!n)
1515 		return NULL;
1516 	for (;;) {
1517 		struct sp_node *w = NULL;
1518 		struct rb_node *prev = rb_prev(n);
1519 		if (!prev)
1520 			break;
1521 		w = rb_entry(prev, struct sp_node, nd);
1522 		if (w->end <= start)
1523 			break;
1524 		n = prev;
1525 	}
1526 	return rb_entry(n, struct sp_node, nd);
1527 }
1528 
1529 /* Insert a new shared policy into the list. */
1530 /* Caller holds sp->lock */
1531 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1532 {
1533 	struct rb_node **p = &sp->root.rb_node;
1534 	struct rb_node *parent = NULL;
1535 	struct sp_node *nd;
1536 
1537 	while (*p) {
1538 		parent = *p;
1539 		nd = rb_entry(parent, struct sp_node, nd);
1540 		if (new->start < nd->start)
1541 			p = &(*p)->rb_left;
1542 		else if (new->end > nd->end)
1543 			p = &(*p)->rb_right;
1544 		else
1545 			BUG();
1546 	}
1547 	rb_link_node(&new->nd, parent, p);
1548 	rb_insert_color(&new->nd, &sp->root);
1549 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1550 		 new->policy ? new->policy->policy : 0);
1551 }
1552 
1553 /* Find shared policy intersecting idx */
1554 struct mempolicy *
1555 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1556 {
1557 	struct mempolicy *pol = NULL;
1558 	struct sp_node *sn;
1559 
1560 	if (!sp->root.rb_node)
1561 		return NULL;
1562 	spin_lock(&sp->lock);
1563 	sn = sp_lookup(sp, idx, idx+1);
1564 	if (sn) {
1565 		mpol_get(sn->policy);
1566 		pol = sn->policy;
1567 	}
1568 	spin_unlock(&sp->lock);
1569 	return pol;
1570 }
1571 
1572 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1573 {
 1574 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1575 	rb_erase(&n->nd, &sp->root);
1576 	mpol_free(n->policy);
1577 	kmem_cache_free(sn_cache, n);
1578 }
1579 
1580 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1581 				struct mempolicy *pol)
1582 {
1583 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1584 
1585 	if (!n)
1586 		return NULL;
1587 	n->start = start;
1588 	n->end = end;
1589 	mpol_get(pol);
1590 	n->policy = pol;
1591 	return n;
1592 }
1593 
1594 /* Replace a policy range. */
1595 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1596 				 unsigned long end, struct sp_node *new)
1597 {
1598 	struct sp_node *n, *new2 = NULL;
1599 
1600 restart:
1601 	spin_lock(&sp->lock);
1602 	n = sp_lookup(sp, start, end);
1603 	/* Take care of old policies in the same range. */
1604 	while (n && n->start < end) {
1605 		struct rb_node *next = rb_next(&n->nd);
1606 		if (n->start >= start) {
1607 			if (n->end <= end)
1608 				sp_delete(sp, n);
1609 			else
1610 				n->start = end;
1611 		} else {
1612 			/* Old policy spanning whole new range. */
1613 			if (n->end > end) {
1614 				if (!new2) {
1615 					spin_unlock(&sp->lock);
1616 					new2 = sp_alloc(end, n->end, n->policy);
1617 					if (!new2)
1618 						return -ENOMEM;
1619 					goto restart;
1620 				}
1621 				n->end = start;
1622 				sp_insert(sp, new2);
1623 				new2 = NULL;
1624 				break;
1625 			} else
1626 				n->end = start;
1627 		}
1628 		if (!next)
1629 			break;
1630 		n = rb_entry(next, struct sp_node, nd);
1631 	}
1632 	if (new)
1633 		sp_insert(sp, new);
1634 	spin_unlock(&sp->lock);
1635 	if (new2) {
1636 		mpol_free(new2->policy);
1637 		kmem_cache_free(sn_cache, new2);
1638 	}
1639 	return 0;
1640 }
1641 
1642 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1643 				nodemask_t *policy_nodes)
1644 {
1645 	info->root = RB_ROOT;
1646 	spin_lock_init(&info->lock);
1647 
1648 	if (policy != MPOL_DEFAULT) {
1649 		struct mempolicy *newpol;
1650 
1651 		/* Falls back to MPOL_DEFAULT on any error */
1652 		newpol = mpol_new(policy, policy_nodes);
1653 		if (!IS_ERR(newpol)) {
1654 			/* Create pseudo-vma that contains just the policy */
1655 			struct vm_area_struct pvma;
1656 
1657 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1658 			/* Policy covers entire file */
1659 			pvma.vm_end = TASK_SIZE;
1660 			mpol_set_shared_policy(info, &pvma, newpol);
1661 			mpol_free(newpol);
1662 		}
1663 	}
1664 }
1665 
1666 int mpol_set_shared_policy(struct shared_policy *info,
1667 			struct vm_area_struct *vma, struct mempolicy *npol)
1668 {
1669 	int err;
1670 	struct sp_node *new = NULL;
1671 	unsigned long sz = vma_pages(vma);
1672 
1673 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1674 		 vma->vm_pgoff,
1675 		 sz, npol? npol->policy : -1,
1676 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1677 
1678 	if (npol) {
1679 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1680 		if (!new)
1681 			return -ENOMEM;
1682 	}
1683 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1684 	if (err && new)
1685 		kmem_cache_free(sn_cache, new);
1686 	return err;
1687 }
1688 
1689 /* Free a backing policy store on inode delete. */
1690 void mpol_free_shared_policy(struct shared_policy *p)
1691 {
1692 	struct sp_node *n;
1693 	struct rb_node *next;
1694 
1695 	if (!p->root.rb_node)
1696 		return;
1697 	spin_lock(&p->lock);
1698 	next = rb_first(&p->root);
1699 	while (next) {
1700 		n = rb_entry(next, struct sp_node, nd);
1701 		next = rb_next(&n->nd);
1702 		rb_erase(&n->nd, &p->root);
1703 		mpol_free(n->policy);
1704 		kmem_cache_free(sn_cache, n);
1705 	}
1706 	spin_unlock(&p->lock);
1707 }
1708 
1709 /* assumes fs == KERNEL_DS */
1710 void __init numa_policy_init(void)
1711 {
1712 	nodemask_t interleave_nodes;
1713 	unsigned long largest = 0;
1714 	int nid, prefer = 0;
1715 
1716 	policy_cache = kmem_cache_create("numa_policy",
1717 					 sizeof(struct mempolicy),
1718 					 0, SLAB_PANIC, NULL);
1719 
1720 	sn_cache = kmem_cache_create("shared_policy_node",
1721 				     sizeof(struct sp_node),
1722 				     0, SLAB_PANIC, NULL);
1723 
1724 	/*
1725 	 * Set interleaving policy for system init. Interleaving is only
1726 	 * enabled across suitably sized nodes (default is >= 16MB), or
1727 	 * fall back to the largest node if they're all smaller.
1728 	 */
1729 	nodes_clear(interleave_nodes);
1730 	for_each_node_state(nid, N_HIGH_MEMORY) {
1731 		unsigned long total_pages = node_present_pages(nid);
1732 
1733 		/* Preserve the largest node */
1734 		if (largest < total_pages) {
1735 			largest = total_pages;
1736 			prefer = nid;
1737 		}
1738 
1739 		/* Interleave this node? */
1740 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1741 			node_set(nid, interleave_nodes);
1742 	}
1743 
1744 	/* All too small, use the largest */
1745 	if (unlikely(nodes_empty(interleave_nodes)))
1746 		node_set(prefer, interleave_nodes);
1747 
1748 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1749 		printk("numa_policy_init: interleaving failed\n");
1750 }
1751 
1752 /* Reset policy of current process to default */
1753 void numa_default_policy(void)
1754 {
1755 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1756 }
1757 
1758 /* Migrate a policy to a different set of nodes */
1759 static void mpol_rebind_policy(struct mempolicy *pol,
1760 			       const nodemask_t *newmask)
1761 {
1762 	nodemask_t *mpolmask;
1763 	nodemask_t tmp;
1764 
1765 	if (!pol)
1766 		return;
1767 	mpolmask = &pol->cpuset_mems_allowed;
1768 	if (nodes_equal(*mpolmask, *newmask))
1769 		return;
1770 
1771 	switch (pol->policy) {
1772 	case MPOL_DEFAULT:
1773 		break;
1774 	case MPOL_INTERLEAVE:
1775 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1776 		pol->v.nodes = tmp;
1777 		*mpolmask = *newmask;
1778 		current->il_next = node_remap(current->il_next,
1779 						*mpolmask, *newmask);
1780 		break;
1781 	case MPOL_PREFERRED:
1782 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1783 						*mpolmask, *newmask);
1784 		*mpolmask = *newmask;
1785 		break;
1786 	case MPOL_BIND: {
1787 		nodemask_t nodes;
1788 		struct zone **z;
1789 		struct zonelist *zonelist;
1790 
1791 		nodes_clear(nodes);
1792 		for (z = pol->v.zonelist->zones; *z; z++)
1793 			node_set(zone_to_nid(*z), nodes);
1794 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1795 		nodes = tmp;
1796 
1797 		zonelist = bind_zonelist(&nodes);
1798 
 1799 		/* If no mem, bind_zonelist() fails and we keep the old zonelist.
1800 		 * If that old zonelist has no remaining mems_allowed nodes,
1801 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1802 		 */
1803 
1804 		if (!IS_ERR(zonelist)) {
1805 			/* Good - got mem - substitute new zonelist */
1806 			kfree(pol->v.zonelist);
1807 			pol->v.zonelist = zonelist;
1808 		}
1809 		*mpolmask = *newmask;
1810 		break;
1811 	}
1812 	default:
1813 		BUG();
1814 		break;
1815 	}
1816 }
1817 
1818 /*
1819  * Wrapper for mpol_rebind_policy() that just requires task
1820  * pointer, and updates task mempolicy.
1821  */
1822 
1823 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1824 {
1825 	mpol_rebind_policy(tsk->mempolicy, new);
1826 }
1827 
1828 /*
1829  * Rebind each vma in mm to new nodemask.
1830  *
1831  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1832  */
1833 
1834 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1835 {
1836 	struct vm_area_struct *vma;
1837 
1838 	down_write(&mm->mmap_sem);
1839 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1840 		mpol_rebind_policy(vma->vm_policy, new);
1841 	up_write(&mm->mmap_sem);
1842 }
1843 
1844 /*
1845  * Display pages allocated per node and memory policy via /proc.
1846  */
1847 
1848 static const char * const policy_types[] =
1849 	{ "default", "prefer", "bind", "interleave" };
1850 
1851 /*
1852  * Convert a mempolicy into a string.
1853  * Returns the number of characters in buffer (if positive)
1854  * or an error (negative)
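 * Example output (illustrative): "default", "prefer=1", "bind=0,2",
 * "interleave=0-3".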
1855  */
1856 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1857 {
1858 	char *p = buffer;
1859 	int l;
1860 	nodemask_t nodes;
1861 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1862 
1863 	switch (mode) {
1864 	case MPOL_DEFAULT:
1865 		nodes_clear(nodes);
1866 		break;
1867 
1868 	case MPOL_PREFERRED:
1869 		nodes_clear(nodes);
1870 		node_set(pol->v.preferred_node, nodes);
1871 		break;
1872 
1873 	case MPOL_BIND:
1874 		get_zonemask(pol, &nodes);
1875 		break;
1876 
1877 	case MPOL_INTERLEAVE:
1878 		nodes = pol->v.nodes;
1879 		break;
1880 
1881 	default:
1882 		BUG();
1883 		return -EFAULT;
1884 	}
1885 
1886 	l = strlen(policy_types[mode]);
1887  	if (buffer + maxlen < p + l + 1)
1888  		return -ENOSPC;
1889 
1890 	strcpy(p, policy_types[mode]);
1891 	p += l;
1892 
1893 	if (!nodes_empty(nodes)) {
1894 		if (buffer + maxlen < p + 2)
1895 			return -ENOSPC;
1896 		*p++ = '=';
1897 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1898 	}
1899 	return p - buffer;
1900 }
1901 
1902 struct numa_maps {
1903 	unsigned long pages;
1904 	unsigned long anon;
1905 	unsigned long active;
1906 	unsigned long writeback;
1907 	unsigned long mapcount_max;
1908 	unsigned long dirty;
1909 	unsigned long swapcache;
1910 	unsigned long node[MAX_NUMNODES];
1911 };
1912 
1913 static void gather_stats(struct page *page, void *private, int pte_dirty)
1914 {
1915 	struct numa_maps *md = private;
1916 	int count = page_mapcount(page);
1917 
1918 	md->pages++;
1919 	if (pte_dirty || PageDirty(page))
1920 		md->dirty++;
1921 
1922 	if (PageSwapCache(page))
1923 		md->swapcache++;
1924 
1925 	if (PageActive(page))
1926 		md->active++;
1927 
1928 	if (PageWriteback(page))
1929 		md->writeback++;
1930 
1931 	if (PageAnon(page))
1932 		md->anon++;
1933 
1934 	if (count > md->mapcount_max)
1935 		md->mapcount_max = count;
1936 
1937 	md->node[page_to_nid(page)]++;
1938 }
1939 
1940 #ifdef CONFIG_HUGETLB_PAGE
1941 static void check_huge_range(struct vm_area_struct *vma,
1942 		unsigned long start, unsigned long end,
1943 		struct numa_maps *md)
1944 {
1945 	unsigned long addr;
1946 	struct page *page;
1947 
1948 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1949 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1950 		pte_t pte;
1951 
1952 		if (!ptep)
1953 			continue;
1954 
1955 		pte = *ptep;
1956 		if (pte_none(pte))
1957 			continue;
1958 
1959 		page = pte_page(pte);
1960 		if (!page)
1961 			continue;
1962 
1963 		gather_stats(page, md, pte_dirty(*ptep));
1964 	}
1965 }
1966 #else
1967 static inline void check_huge_range(struct vm_area_struct *vma,
1968 		unsigned long start, unsigned long end,
1969 		struct numa_maps *md)
1970 {
1971 }
1972 #endif
1973 
1974 int show_numa_map(struct seq_file *m, void *v)
1975 {
1976 	struct proc_maps_private *priv = m->private;
1977 	struct vm_area_struct *vma = v;
1978 	struct numa_maps *md;
1979 	struct file *file = vma->vm_file;
1980 	struct mm_struct *mm = vma->vm_mm;
1981 	struct mempolicy *pol;
1982 	int n;
1983 	char buffer[50];
1984 
1985 	if (!mm)
1986 		return 0;
1987 
1988 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1989 	if (!md)
1990 		return 0;
1991 
1992 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1993 	mpol_to_str(buffer, sizeof(buffer), pol);
1994 	/*
1995 	 * unref shared or other task's mempolicy
1996 	 */
1997 	if (pol != &default_policy && pol != current->mempolicy)
1998 		__mpol_free(pol);
1999 
2000 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2001 
2002 	if (file) {
2003 		seq_printf(m, " file=");
2004 		seq_path(m, &file->f_path, "\n\t= ");
2005 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2006 		seq_printf(m, " heap");
2007 	} else if (vma->vm_start <= mm->start_stack &&
2008 			vma->vm_end >= mm->start_stack) {
2009 		seq_printf(m, " stack");
2010 	}
2011 
2012 	if (is_vm_hugetlb_page(vma)) {
2013 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2014 		seq_printf(m, " huge");
2015 	} else {
2016 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2017 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2018 	}
2019 
2020 	if (!md->pages)
2021 		goto out;
2022 
2023 	if (md->anon)
2024 		seq_printf(m," anon=%lu",md->anon);
2025 
2026 	if (md->dirty)
2027 		seq_printf(m," dirty=%lu",md->dirty);
2028 
2029 	if (md->pages != md->anon && md->pages != md->dirty)
2030 		seq_printf(m, " mapped=%lu", md->pages);
2031 
2032 	if (md->mapcount_max > 1)
2033 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2034 
2035 	if (md->swapcache)
2036 		seq_printf(m," swapcache=%lu", md->swapcache);
2037 
2038 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2039 		seq_printf(m," active=%lu", md->active);
2040 
2041 	if (md->writeback)
2042 		seq_printf(m," writeback=%lu", md->writeback);
2043 
2044 	for_each_node_state(n, N_HIGH_MEMORY)
2045 		if (md->node[n])
2046 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2047 out:
2048 	seq_putc(m, '\n');
2049 	kfree(md);
2050 
2051 	if (m->count < m->size)
2052 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2053 	return 0;
2054 }
2055