xref: /openbmc/linux/mm/mempolicy.c (revision 4800cd83)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about the node(s) from which
9  * memory should be allocated.
10  *
11  * Four policies are supported, per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the node local to the current CPU. This is normally
31  *                identical to default, but useful to set in a VMA when you
32  *                have a non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
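/*
 * Illustration (not taken from this file): from user space these policies
 * are requested through the syscalls implemented further down.  A minimal
 * sketch, assuming the <numaif.h> wrappers from libnuma:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave this process' future allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	// back to the default (local) policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *
 * See sys_mbind() and sys_get_mempolicy() below for the per-range and
 * query interfaces.
 */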
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/module.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 
94 #include <asm/tlbflush.h>
95 #include <asm/uaccess.h>
96 
97 #include "internal.h"
98 
99 /* Internal flags */
100 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
101 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
102 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
103 
104 static struct kmem_cache *policy_cache;
105 static struct kmem_cache *sn_cache;
106 
107 /* Highest zone. A specific allocation for a zone below that is not
108    policied. */
109 enum zone_type policy_zone = 0;
110 
111 /*
112  * run-time system-wide default policy => local allocation
113  */
114 struct mempolicy default_policy = {
115 	.refcnt = ATOMIC_INIT(1), /* never free it */
116 	.mode = MPOL_PREFERRED,
117 	.flags = MPOL_F_LOCAL,
118 };
119 
120 static const struct mempolicy_operations {
121 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 	/*
123 	 * If the read-side task has no lock protecting task->mempolicy, the
124 	 * write-side task will rebind task->mempolicy in two steps. The first
125 	 * step sets all the newly allowed nodes and the second step clears all
126 	 * the disallowed nodes. This way there is never a moment with no node
127 	 * to allocate a page from.
128 	 * If the read side does hold a lock protecting task->mempolicy, we
129 	 * rebind directly.
130 	 *
131 	 * step:
132 	 * 	MPOL_REBIND_ONCE - do rebind work at once
133 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
134 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
135 	 */
136 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 			enum mpol_rebind_step step);
138 } mpol_ops[MPOL_MAX];
139 
140 /* Check that the nodemask contains at least one populated zone */
141 static int is_valid_nodemask(const nodemask_t *nodemask)
142 {
143 	int nd, k;
144 
145 	for_each_node_mask(nd, *nodemask) {
146 		struct zone *z;
147 
148 		for (k = 0; k <= policy_zone; k++) {
149 			z = &NODE_DATA(nd)->node_zones[k];
150 			if (z->present_pages > 0)
151 				return 1;
152 		}
153 	}
154 
155 	return 0;
156 }
157 
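/*
 * True when the policy was created with MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES: the user-supplied nodemask is then remembered in
 * w.user_nodemask and rebinds are computed against it rather than against
 * the previously allowed cpuset nodes.
 */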
158 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
159 {
160 	return pol->flags & MPOL_MODE_FLAGS;
161 }
162 
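/*
 * Map a relative nodemask onto the currently allowed nodes: @orig is folded
 * to the weight of @rel, then bit n of the result is placed onto the n-th
 * node set in @rel.  E.g. (illustrative) orig = {0,1} and rel = {4,6,7}
 * gives ret = {4,6}.
 */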
163 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
164 				   const nodemask_t *rel)
165 {
166 	nodemask_t tmp;
167 	nodes_fold(tmp, *orig, nodes_weight(*rel));
168 	nodes_onto(*ret, tmp, *rel);
169 }
170 
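/*
 * mpol_ops->create() callbacks: validate the (already cpuset-contextualized)
 * nodemask passed in by mpol_set_nodemask() and store it in the new policy.
 */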
171 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
172 {
173 	if (nodes_empty(*nodes))
174 		return -EINVAL;
175 	pol->v.nodes = *nodes;
176 	return 0;
177 }
178 
179 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (!nodes)
182 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
183 	else if (nodes_empty(*nodes))
184 		return -EINVAL;			/*  no allowed nodes */
185 	else
186 		pol->v.preferred_node = first_node(*nodes);
187 	return 0;
188 }
189 
190 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 {
192 	if (!is_valid_nodemask(nodes))
193 		return -EINVAL;
194 	pol->v.nodes = *nodes;
195 	return 0;
196 }
197 
198 /*
199  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
200  * any, for the new policy.  mpol_new() has already validated the nodes
201  * parameter with respect to the policy mode and flags.  But, we need to
202  * handle an empty nodemask with MPOL_PREFERRED here.
203  *
204  * Must be called holding task's alloc_lock to protect task's mems_allowed
205  * and mempolicy.  May also be called holding the mmap_sem for write.
206  */
207 static int mpol_set_nodemask(struct mempolicy *pol,
208 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
209 {
210 	int ret;
211 
212 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 	if (pol == NULL)
214 		return 0;
215 	/* Check N_HIGH_MEMORY */
216 	nodes_and(nsc->mask1,
217 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
218 
219 	VM_BUG_ON(!nodes);
220 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
221 		nodes = NULL;	/* explicit local allocation */
222 	else {
223 		if (pol->flags & MPOL_F_RELATIVE_NODES)
224 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
225 		else
226 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
227 
228 		if (mpol_store_user_nodemask(pol))
229 			pol->w.user_nodemask = *nodes;
230 		else
231 			pol->w.cpuset_mems_allowed =
232 						cpuset_current_mems_allowed;
233 	}
234 
235 	if (nodes)
236 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
237 	else
238 		ret = mpol_ops[pol->mode].create(pol, NULL);
239 	return ret;
240 }
241 
242 /*
243  * This function just creates a new policy, does some checks and simple
244  * initialization. You must invoke mpol_set_nodemask() to set the nodes.
245  */
246 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
247 				  nodemask_t *nodes)
248 {
249 	struct mempolicy *policy;
250 
251 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
252 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
253 
254 	if (mode == MPOL_DEFAULT) {
255 		if (nodes && !nodes_empty(*nodes))
256 			return ERR_PTR(-EINVAL);
257 		return NULL;	/* simply delete any existing policy */
258 	}
259 	VM_BUG_ON(!nodes);
260 
261 	/*
262 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
263 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
264 	 * All other modes require a valid pointer to a non-empty nodemask.
265 	 */
266 	if (mode == MPOL_PREFERRED) {
267 		if (nodes_empty(*nodes)) {
268 			if (((flags & MPOL_F_STATIC_NODES) ||
269 			     (flags & MPOL_F_RELATIVE_NODES)))
270 				return ERR_PTR(-EINVAL);
271 		}
272 	} else if (nodes_empty(*nodes))
273 		return ERR_PTR(-EINVAL);
274 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
275 	if (!policy)
276 		return ERR_PTR(-ENOMEM);
277 	atomic_set(&policy->refcnt, 1);
278 	policy->mode = mode;
279 	policy->flags = flags;
280 
281 	return policy;
282 }
283 
284 /* Slow path of a mpol destructor. */
285 void __mpol_put(struct mempolicy *p)
286 {
287 	if (!atomic_dec_and_test(&p->refcnt))
288 		return;
289 	kmem_cache_free(policy_cache, p);
290 }
291 
292 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 				enum mpol_rebind_step step)
294 {
295 }
296 
297 /*
298  * step:
299  * 	MPOL_REBIND_ONCE  - do rebind work at once
300  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
301  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
302  */
303 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 				 enum mpol_rebind_step step)
305 {
306 	nodemask_t tmp;
307 
308 	if (pol->flags & MPOL_F_STATIC_NODES)
309 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
310 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
311 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
312 	else {
313 		/*
314 		 * if step == MPOL_REBIND_STEP1, we use ->w.cpuset_mems_allowed
315 		 * to cache the result
316 		 */
317 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 			nodes_remap(tmp, pol->v.nodes,
319 					pol->w.cpuset_mems_allowed, *nodes);
320 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 		} else if (step == MPOL_REBIND_STEP2) {
322 			tmp = pol->w.cpuset_mems_allowed;
323 			pol->w.cpuset_mems_allowed = *nodes;
324 		} else
325 			BUG();
326 	}
327 
328 	if (nodes_empty(tmp))
329 		tmp = *nodes;
330 
331 	if (step == MPOL_REBIND_STEP1)
332 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 		pol->v.nodes = tmp;
335 	else
336 		BUG();
337 
338 	if (!node_isset(current->il_next, tmp)) {
339 		current->il_next = next_node(current->il_next, tmp);
340 		if (current->il_next >= MAX_NUMNODES)
341 			current->il_next = first_node(tmp);
342 		if (current->il_next >= MAX_NUMNODES)
343 			current->il_next = numa_node_id();
344 	}
345 }
346 
347 static void mpol_rebind_preferred(struct mempolicy *pol,
348 				  const nodemask_t *nodes,
349 				  enum mpol_rebind_step step)
350 {
351 	nodemask_t tmp;
352 
353 	if (pol->flags & MPOL_F_STATIC_NODES) {
354 		int node = first_node(pol->w.user_nodemask);
355 
356 		if (node_isset(node, *nodes)) {
357 			pol->v.preferred_node = node;
358 			pol->flags &= ~MPOL_F_LOCAL;
359 		} else
360 			pol->flags |= MPOL_F_LOCAL;
361 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363 		pol->v.preferred_node = first_node(tmp);
364 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
365 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 						   pol->w.cpuset_mems_allowed,
367 						   *nodes);
368 		pol->w.cpuset_mems_allowed = *nodes;
369 	}
370 }
371 
372 /*
373  * mpol_rebind_policy - Migrate a policy to a different set of nodes
374  *
375  * If the read-side task has no lock protecting task->mempolicy, the
376  * write-side task will rebind task->mempolicy in two steps. The first
377  * step sets all the newly allowed nodes and the second step clears all
378  * the disallowed nodes. This way there is never a moment with no node
379  * to allocate a page from.
380  * If the read side does hold a lock protecting task->mempolicy, we
381  * rebind directly.
382  *
383  * step:
384  * 	MPOL_REBIND_ONCE  - do rebind work at once
385  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
386  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
387  */
388 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 				enum mpol_rebind_step step)
390 {
391 	if (!pol)
392 		return;
393 	if (!mpol_store_user_nodemask(pol) && step == 0 &&
394 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 		return;
396 
397 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 		return;
399 
400 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 		BUG();
402 
403 	if (step == MPOL_REBIND_STEP1)
404 		pol->flags |= MPOL_F_REBINDING;
405 	else if (step == MPOL_REBIND_STEP2)
406 		pol->flags &= ~MPOL_F_REBINDING;
407 	else if (step >= MPOL_REBIND_NSTEP)
408 		BUG();
409 
410 	mpol_ops[pol->mode].rebind(pol, newmask, step);
411 }
412 
413 /*
414  * Wrapper for mpol_rebind_policy() that just requires the task
415  * pointer, and updates the task's mempolicy.
416  *
417  * Called with task's alloc_lock held.
418  */
419 
420 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 			enum mpol_rebind_step step)
422 {
423 	mpol_rebind_policy(tsk->mempolicy, new, step);
424 }
425 
426 /*
427  * Rebind each vma in mm to new nodemask.
428  *
429  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
430  */
431 
432 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
433 {
434 	struct vm_area_struct *vma;
435 
436 	down_write(&mm->mmap_sem);
437 	for (vma = mm->mmap; vma; vma = vma->vm_next)
438 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
439 	up_write(&mm->mmap_sem);
440 }
441 
442 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
443 	[MPOL_DEFAULT] = {
444 		.rebind = mpol_rebind_default,
445 	},
446 	[MPOL_INTERLEAVE] = {
447 		.create = mpol_new_interleave,
448 		.rebind = mpol_rebind_nodemask,
449 	},
450 	[MPOL_PREFERRED] = {
451 		.create = mpol_new_preferred,
452 		.rebind = mpol_rebind_preferred,
453 	},
454 	[MPOL_BIND] = {
455 		.create = mpol_new_bind,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 };
459 
460 static void gather_stats(struct page *, void *, int pte_dirty);
461 static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 				unsigned long flags);
463 
464 /* Scan through the pages, checking whether they satisfy the given conditions. */
465 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
466 		unsigned long addr, unsigned long end,
467 		const nodemask_t *nodes, unsigned long flags,
468 		void *private)
469 {
470 	pte_t *orig_pte;
471 	pte_t *pte;
472 	spinlock_t *ptl;
473 
474 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
475 	do {
476 		struct page *page;
477 		int nid;
478 
479 		if (!pte_present(*pte))
480 			continue;
481 		page = vm_normal_page(vma, addr, *pte);
482 		if (!page)
483 			continue;
484 		/*
485 		 * vm_normal_page() filters out zero pages, but there might
486 		 * still be PageReserved pages to skip, perhaps in a VDSO.
487 		 * And we cannot move PageKsm pages sensibly or safely yet.
488 		 */
489 		if (PageReserved(page) || PageKsm(page))
490 			continue;
491 		nid = page_to_nid(page);
492 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 			continue;
494 
495 		if (flags & MPOL_MF_STATS)
496 			gather_stats(page, private, pte_dirty(*pte));
497 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 			migrate_page_add(page, private, flags);
499 		else
500 			break;
501 	} while (pte++, addr += PAGE_SIZE, addr != end);
502 	pte_unmap_unlock(orig_pte, ptl);
503 	return addr != end;
504 }
505 
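/*
 * The helpers below walk the remaining page table levels; transparent huge
 * pages are split first so that check_pte_range() only ever sees base pages.
 */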
506 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
507 		unsigned long addr, unsigned long end,
508 		const nodemask_t *nodes, unsigned long flags,
509 		void *private)
510 {
511 	pmd_t *pmd;
512 	unsigned long next;
513 
514 	pmd = pmd_offset(pud, addr);
515 	do {
516 		next = pmd_addr_end(addr, end);
517 		split_huge_page_pmd(vma->vm_mm, pmd);
518 		if (pmd_none_or_clear_bad(pmd))
519 			continue;
520 		if (check_pte_range(vma, pmd, addr, next, nodes,
521 				    flags, private))
522 			return -EIO;
523 	} while (pmd++, addr = next, addr != end);
524 	return 0;
525 }
526 
527 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
528 		unsigned long addr, unsigned long end,
529 		const nodemask_t *nodes, unsigned long flags,
530 		void *private)
531 {
532 	pud_t *pud;
533 	unsigned long next;
534 
535 	pud = pud_offset(pgd, addr);
536 	do {
537 		next = pud_addr_end(addr, end);
538 		if (pud_none_or_clear_bad(pud))
539 			continue;
540 		if (check_pmd_range(vma, pud, addr, next, nodes,
541 				    flags, private))
542 			return -EIO;
543 	} while (pud++, addr = next, addr != end);
544 	return 0;
545 }
546 
547 static inline int check_pgd_range(struct vm_area_struct *vma,
548 		unsigned long addr, unsigned long end,
549 		const nodemask_t *nodes, unsigned long flags,
550 		void *private)
551 {
552 	pgd_t *pgd;
553 	unsigned long next;
554 
555 	pgd = pgd_offset(vma->vm_mm, addr);
556 	do {
557 		next = pgd_addr_end(addr, end);
558 		if (pgd_none_or_clear_bad(pgd))
559 			continue;
560 		if (check_pud_range(vma, pgd, addr, next, nodes,
561 				    flags, private))
562 			return -EIO;
563 	} while (pgd++, addr = next, addr != end);
564 	return 0;
565 }
566 
567 /*
568  * Check if all pages in a range are on a set of nodes.
569  * If pagelist != NULL then isolate pages from the LRU and
570  * put them on the pagelist.
571  */
572 static struct vm_area_struct *
573 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
574 		const nodemask_t *nodes, unsigned long flags, void *private)
575 {
576 	int err;
577 	struct vm_area_struct *first, *vma, *prev;
578 
579 
580 	first = find_vma(mm, start);
581 	if (!first)
582 		return ERR_PTR(-EFAULT);
583 	prev = NULL;
584 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
585 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
586 			if (!vma->vm_next && vma->vm_end < end)
587 				return ERR_PTR(-EFAULT);
588 			if (prev && prev->vm_end < vma->vm_start)
589 				return ERR_PTR(-EFAULT);
590 		}
591 		if (!is_vm_hugetlb_page(vma) &&
592 		    ((flags & MPOL_MF_STRICT) ||
593 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
594 				vma_migratable(vma)))) {
595 			unsigned long endvma = vma->vm_end;
596 
597 			if (endvma > end)
598 				endvma = end;
599 			if (vma->vm_start > start)
600 				start = vma->vm_start;
601 			err = check_pgd_range(vma, start, endvma, nodes,
602 						flags, private);
603 			if (err) {
604 				first = ERR_PTR(err);
605 				break;
606 			}
607 		}
608 		prev = vma;
609 	}
610 	return first;
611 }
612 
613 /* Apply policy to a single VMA */
614 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
615 {
616 	int err = 0;
617 	struct mempolicy *old = vma->vm_policy;
618 
619 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
620 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
621 		 vma->vm_ops, vma->vm_file,
622 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
623 
624 	if (vma->vm_ops && vma->vm_ops->set_policy)
625 		err = vma->vm_ops->set_policy(vma, new);
626 	if (!err) {
627 		mpol_get(new);
628 		vma->vm_policy = new;
629 		mpol_put(old);
630 	}
631 	return err;
632 }
633 
634 /* Step 2: apply policy to a range and do splits. */
635 static int mbind_range(struct mm_struct *mm, unsigned long start,
636 		       unsigned long end, struct mempolicy *new_pol)
637 {
638 	struct vm_area_struct *next;
639 	struct vm_area_struct *prev;
640 	struct vm_area_struct *vma;
641 	int err = 0;
642 	pgoff_t pgoff;
643 	unsigned long vmstart;
644 	unsigned long vmend;
645 
646 	vma = find_vma_prev(mm, start, &prev);
647 	if (!vma || vma->vm_start > start)
648 		return -EFAULT;
649 
650 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
651 		next = vma->vm_next;
652 		vmstart = max(start, vma->vm_start);
653 		vmend   = min(end, vma->vm_end);
654 
655 		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
656 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
657 				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
658 		if (prev) {
659 			vma = prev;
660 			next = vma->vm_next;
661 			continue;
662 		}
663 		if (vma->vm_start != vmstart) {
664 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
665 			if (err)
666 				goto out;
667 		}
668 		if (vma->vm_end != vmend) {
669 			err = split_vma(vma->vm_mm, vma, vmend, 0);
670 			if (err)
671 				goto out;
672 		}
673 		err = policy_vma(vma, new_pol);
674 		if (err)
675 			goto out;
676 	}
677 
678  out:
679 	return err;
680 }
681 
682 /*
683  * Update task->flags PF_MEMPOLICY bit: set iff non-default
684  * mempolicy.  Allows more rapid checking of this (combined perhaps
685  * with other PF_* flag bits) on memory allocation hot code paths.
686  *
687  * If called from outside this file, the task 'p' should -only- be
688  * a newly forked child not yet visible on the task list, because
689  * manipulating the task flags of a visible task is not safe.
690  *
691  * The above limitation is why this routine has the funny name
692  * mpol_fix_fork_child_flag().
693  *
694  * It is also safe to call this with a task pointer of current,
695  * which the static wrapper mpol_set_task_struct_flag() does,
696  * for use within this file.
697  */
698 
699 void mpol_fix_fork_child_flag(struct task_struct *p)
700 {
701 	if (p->mempolicy)
702 		p->flags |= PF_MEMPOLICY;
703 	else
704 		p->flags &= ~PF_MEMPOLICY;
705 }
706 
707 static void mpol_set_task_struct_flag(void)
708 {
709 	mpol_fix_fork_child_flag(current);
710 }
711 
712 /* Set the process memory policy */
713 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
714 			     nodemask_t *nodes)
715 {
716 	struct mempolicy *new, *old;
717 	struct mm_struct *mm = current->mm;
718 	NODEMASK_SCRATCH(scratch);
719 	int ret;
720 
721 	if (!scratch)
722 		return -ENOMEM;
723 
724 	new = mpol_new(mode, flags, nodes);
725 	if (IS_ERR(new)) {
726 		ret = PTR_ERR(new);
727 		goto out;
728 	}
729 	/*
730 	 * prevent changing our mempolicy while show_numa_maps()
731 	 * is using it.
732 	 * Note:  do_set_mempolicy() can be called at init time
733 	 * with no 'mm'.
734 	 */
735 	if (mm)
736 		down_write(&mm->mmap_sem);
737 	task_lock(current);
738 	ret = mpol_set_nodemask(new, nodes, scratch);
739 	if (ret) {
740 		task_unlock(current);
741 		if (mm)
742 			up_write(&mm->mmap_sem);
743 		mpol_put(new);
744 		goto out;
745 	}
746 	old = current->mempolicy;
747 	current->mempolicy = new;
748 	mpol_set_task_struct_flag();
749 	if (new && new->mode == MPOL_INTERLEAVE &&
750 	    nodes_weight(new->v.nodes))
751 		current->il_next = first_node(new->v.nodes);
752 	task_unlock(current);
753 	if (mm)
754 		up_write(&mm->mmap_sem);
755 
756 	mpol_put(old);
757 	ret = 0;
758 out:
759 	NODEMASK_SCRATCH_FREE(scratch);
760 	return ret;
761 }
762 
763 /*
764  * Return the nodemask of a policy, for a get_mempolicy() query
765  *
766  * Called with task's alloc_lock held
767  */
768 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
769 {
770 	nodes_clear(*nodes);
771 	if (p == &default_policy)
772 		return;
773 
774 	switch (p->mode) {
775 	case MPOL_BIND:
776 		/* Fall through */
777 	case MPOL_INTERLEAVE:
778 		*nodes = p->v.nodes;
779 		break;
780 	case MPOL_PREFERRED:
781 		if (!(p->flags & MPOL_F_LOCAL))
782 			node_set(p->v.preferred_node, *nodes);
783 		/* else return empty node mask for local allocation */
784 		break;
785 	default:
786 		BUG();
787 	}
788 }
789 
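/*
 * Return the node id of the page backing @addr, faulting the page in via
 * get_user_pages() if necessary; returns a negative errno on failure.
 */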
790 static int lookup_node(struct mm_struct *mm, unsigned long addr)
791 {
792 	struct page *p;
793 	int err;
794 
795 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
796 	if (err >= 0) {
797 		err = page_to_nid(p);
798 		put_page(p);
799 	}
800 	return err;
801 }
802 
803 /* Retrieve NUMA policy */
804 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
805 			     unsigned long addr, unsigned long flags)
806 {
807 	int err;
808 	struct mm_struct *mm = current->mm;
809 	struct vm_area_struct *vma = NULL;
810 	struct mempolicy *pol = current->mempolicy;
811 
812 	if (flags &
813 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
814 		return -EINVAL;
815 
816 	if (flags & MPOL_F_MEMS_ALLOWED) {
817 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
818 			return -EINVAL;
819 		*policy = 0;	/* just so it's initialized */
820 		task_lock(current);
821 		*nmask  = cpuset_current_mems_allowed;
822 		task_unlock(current);
823 		return 0;
824 	}
825 
826 	if (flags & MPOL_F_ADDR) {
827 		/*
828 		 * Do NOT fall back to task policy if the
829 		 * vma/shared policy at addr is NULL.  We
830 		 * want to return MPOL_DEFAULT in this case.
831 		 */
832 		down_read(&mm->mmap_sem);
833 		vma = find_vma_intersection(mm, addr, addr+1);
834 		if (!vma) {
835 			up_read(&mm->mmap_sem);
836 			return -EFAULT;
837 		}
838 		if (vma->vm_ops && vma->vm_ops->get_policy)
839 			pol = vma->vm_ops->get_policy(vma, addr);
840 		else
841 			pol = vma->vm_policy;
842 	} else if (addr)
843 		return -EINVAL;
844 
845 	if (!pol)
846 		pol = &default_policy;	/* indicates default behavior */
847 
848 	if (flags & MPOL_F_NODE) {
849 		if (flags & MPOL_F_ADDR) {
850 			err = lookup_node(mm, addr);
851 			if (err < 0)
852 				goto out;
853 			*policy = err;
854 		} else if (pol == current->mempolicy &&
855 				pol->mode == MPOL_INTERLEAVE) {
856 			*policy = current->il_next;
857 		} else {
858 			err = -EINVAL;
859 			goto out;
860 		}
861 	} else {
862 		*policy = pol == &default_policy ? MPOL_DEFAULT :
863 						pol->mode;
864 		/*
865 		 * Internal mempolicy flags must be masked off before exposing
866 		 * the policy to userspace.
867 		 */
868 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
869 	}
870 
871 	if (vma) {
872 		up_read(&current->mm->mmap_sem);
873 		vma = NULL;
874 	}
875 
876 	err = 0;
877 	if (nmask) {
878 		if (mpol_store_user_nodemask(pol)) {
879 			*nmask = pol->w.user_nodemask;
880 		} else {
881 			task_lock(current);
882 			get_policy_nodemask(pol, nmask);
883 			task_unlock(current);
884 		}
885 	}
886 
887  out:
888 	mpol_cond_put(pol);
889 	if (vma)
890 		up_read(&current->mm->mmap_sem);
891 	return err;
892 }
893 
894 #ifdef CONFIG_MIGRATION
895 /*
896  * page migration
897  */
898 static void migrate_page_add(struct page *page, struct list_head *pagelist,
899 				unsigned long flags)
900 {
901 	/*
902 	 * Avoid migrating a page that is shared with others.
903 	 */
904 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
905 		if (!isolate_lru_page(page)) {
906 			list_add_tail(&page->lru, pagelist);
907 			inc_zone_page_state(page, NR_ISOLATED_ANON +
908 					    page_is_file_cache(page));
909 		}
910 	}
911 }
912 
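/* Allocation callback for migrate_pages(): place the copy on the target node */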
913 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
914 {
915 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
916 }
917 
918 /*
919  * Migrate pages from one node to a target node.
920  * Returns error or the number of pages not migrated.
921  */
922 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
923 			   int flags)
924 {
925 	nodemask_t nmask;
926 	LIST_HEAD(pagelist);
927 	int err = 0;
928 	struct vm_area_struct *vma;
929 
930 	nodes_clear(nmask);
931 	node_set(source, nmask);
932 
933 	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
934 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
935 	if (IS_ERR(vma))
936 		return PTR_ERR(vma);
937 
938 	if (!list_empty(&pagelist)) {
939 		err = migrate_pages(&pagelist, new_node_page, dest,
940 								false, true);
941 		if (err)
942 			putback_lru_pages(&pagelist);
943 	}
944 
945 	return err;
946 }
947 
948 /*
949  * Move pages between the two nodesets so as to preserve the physical
950  * layout as much as possible.
951  *
952  * Returns the number of pages that could not be moved.
953  */
954 int do_migrate_pages(struct mm_struct *mm,
955 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
956 {
957 	int busy = 0;
958 	int err;
959 	nodemask_t tmp;
960 
961 	err = migrate_prep();
962 	if (err)
963 		return err;
964 
965 	down_read(&mm->mmap_sem);
966 
967 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
968 	if (err)
969 		goto out;
970 
971 	/*
972 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
973 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
974 	 * bit in 'tmp', and return that <source, dest> pair for migration.
975 	 * The pair of nodemasks 'to' and 'from' define the map.
976 	 *
977 	 * If no pair of bits is found that way, fall back to picking some
978 	 * pair of 'source' and 'dest' bits that are not the same.  If the
979 	 * 'source' and 'dest' bits are the same, this represents a node
980 	 * that will be migrating to itself, so no pages need move.
981 	 *
982 	 * If no bits are left in 'tmp', or if all remaining bits left
983 	 * in 'tmp' correspond to the same bit in 'to', return false
984 	 * (nothing left to migrate).
985 	 *
986 	 * This lets us pick a pair of nodes to migrate between, such that
987 	 * if possible the dest node is not already occupied by some other
988 	 * source node, minimizing the risk of overloading the memory on a
989 	 * node that would happen if we migrated incoming memory to a node
990 	 * before migrating outgoing memory sourced from that same node.
991 	 *
992 	 * A single scan of tmp is sufficient.  As we go, we remember the
993 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
994 	 * that not only moved, but what's better, moved to an empty slot
995 	 * (d is not set in tmp), then we break out immediately with that pair.
996 	 * Otherwise, when we finish scanning tmp, we at least have the
997 	 * most recent <s, d> pair that moved.  If we get all the way through
998 	 * the scan of tmp without finding any node that moved, much less
999 	 * moved to an empty node, then there is nothing left worth migrating.
1000 	 */
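	/*
	 * Worked example (illustrative): from = {0,1,2}, to = {1,2,3}, so
	 * node_remap() gives 0->1, 1->2, 2->3.  The first scan stops at
	 * <2,3> because node 3 is not a remaining source, so 2->3 goes
	 * first, then 1->2, then 0->1: each source is drained before pages
	 * from another node are migrated onto it.
	 */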
1001 
1002 	tmp = *from_nodes;
1003 	while (!nodes_empty(tmp)) {
1004 		int s,d;
1005 		int source = -1;
1006 		int dest = 0;
1007 
1008 		for_each_node_mask(s, tmp) {
1009 			d = node_remap(s, *from_nodes, *to_nodes);
1010 			if (s == d)
1011 				continue;
1012 
1013 			source = s;	/* Node moved. Memorize */
1014 			dest = d;
1015 
1016 			/* dest not in remaining from nodes? */
1017 			if (!node_isset(dest, tmp))
1018 				break;
1019 		}
1020 		if (source == -1)
1021 			break;
1022 
1023 		node_clear(source, tmp);
1024 		err = migrate_to_node(mm, source, dest, flags);
1025 		if (err > 0)
1026 			busy += err;
1027 		if (err < 0)
1028 			break;
1029 	}
1030 out:
1031 	up_read(&mm->mmap_sem);
1032 	if (err < 0)
1033 		return err;
1034 	return busy;
1035 
1036 }
1037 
1038 /*
1039  * Allocate a new page for page migration based on vma policy.
1040  * Start by assuming that the page is mapped by the vma pointed to by
1041  * @private.  Search forward from there, if not.  N.B., this assumes that the
1042  * list of pages handed to migrate_pages()--which is how we get here--
1043  * is in virtual address order.
1044  */
1045 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1046 {
1047 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1048 	unsigned long uninitialized_var(address);
1049 
1050 	while (vma) {
1051 		address = page_address_in_vma(page, vma);
1052 		if (address != -EFAULT)
1053 			break;
1054 		vma = vma->vm_next;
1055 	}
1056 
1057 	/*
1058 	 * if !vma, alloc_page_vma() will use task or system default policy
1059 	 */
1060 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1061 }
1062 #else
1063 
1064 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1065 				unsigned long flags)
1066 {
1067 }
1068 
1069 int do_migrate_pages(struct mm_struct *mm,
1070 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1071 {
1072 	return -ENOSYS;
1073 }
1074 
1075 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1076 {
1077 	return NULL;
1078 }
1079 #endif
1080 
1081 static long do_mbind(unsigned long start, unsigned long len,
1082 		     unsigned short mode, unsigned short mode_flags,
1083 		     nodemask_t *nmask, unsigned long flags)
1084 {
1085 	struct vm_area_struct *vma;
1086 	struct mm_struct *mm = current->mm;
1087 	struct mempolicy *new;
1088 	unsigned long end;
1089 	int err;
1090 	LIST_HEAD(pagelist);
1091 
1092 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1093 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1094 		return -EINVAL;
1095 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1096 		return -EPERM;
1097 
1098 	if (start & ~PAGE_MASK)
1099 		return -EINVAL;
1100 
1101 	if (mode == MPOL_DEFAULT)
1102 		flags &= ~MPOL_MF_STRICT;
1103 
1104 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1105 	end = start + len;
1106 
1107 	if (end < start)
1108 		return -EINVAL;
1109 	if (end == start)
1110 		return 0;
1111 
1112 	new = mpol_new(mode, mode_flags, nmask);
1113 	if (IS_ERR(new))
1114 		return PTR_ERR(new);
1115 
1116 	/*
1117 	 * If we are using the default policy then operating
1118 	 * on discontinuous address ranges is okay after all
1119 	 */
1120 	if (!new)
1121 		flags |= MPOL_MF_DISCONTIG_OK;
1122 
1123 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1124 		 start, start + len, mode, mode_flags,
1125 		 nmask ? nodes_addr(*nmask)[0] : -1);
1126 
1127 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1128 
1129 		err = migrate_prep();
1130 		if (err)
1131 			goto mpol_out;
1132 	}
1133 	{
1134 		NODEMASK_SCRATCH(scratch);
1135 		if (scratch) {
1136 			down_write(&mm->mmap_sem);
1137 			task_lock(current);
1138 			err = mpol_set_nodemask(new, nmask, scratch);
1139 			task_unlock(current);
1140 			if (err)
1141 				up_write(&mm->mmap_sem);
1142 		} else
1143 			err = -ENOMEM;
1144 		NODEMASK_SCRATCH_FREE(scratch);
1145 	}
1146 	if (err)
1147 		goto mpol_out;
1148 
1149 	vma = check_range(mm, start, end, nmask,
1150 			  flags | MPOL_MF_INVERT, &pagelist);
1151 
1152 	err = PTR_ERR(vma);
1153 	if (!IS_ERR(vma)) {
1154 		int nr_failed = 0;
1155 
1156 		err = mbind_range(mm, start, end, new);
1157 
1158 		if (!list_empty(&pagelist)) {
1159 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1160 						(unsigned long)vma,
1161 						false, true);
1162 			if (nr_failed)
1163 				putback_lru_pages(&pagelist);
1164 		}
1165 
1166 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1167 			err = -EIO;
1168 	} else
1169 		putback_lru_pages(&pagelist);
1170 
1171 	up_write(&mm->mmap_sem);
1172  mpol_out:
1173 	mpol_put(new);
1174 	return err;
1175 }
1176 
1177 /*
1178  * User space interface with variable sized bitmaps for nodelists.
1179  */
1180 
1181 /* Copy a node mask from user space. */
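/*
 * Example of the endmask arithmetic below (illustrative, 64-bit longs):
 * a caller passing maxnode = 17 describes bits 0..15, so after --maxnode
 * we have maxnode = 16, nlongs = 1 and endmask = (1UL << 16) - 1; any
 * bits at or above bit 16 in the copied word are masked off.
 */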
1182 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1183 		     unsigned long maxnode)
1184 {
1185 	unsigned long k;
1186 	unsigned long nlongs;
1187 	unsigned long endmask;
1188 
1189 	--maxnode;
1190 	nodes_clear(*nodes);
1191 	if (maxnode == 0 || !nmask)
1192 		return 0;
1193 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1194 		return -EINVAL;
1195 
1196 	nlongs = BITS_TO_LONGS(maxnode);
1197 	if ((maxnode % BITS_PER_LONG) == 0)
1198 		endmask = ~0UL;
1199 	else
1200 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1201 
1202 	/* When the user specified more nodes than supported just check
1203 	   if the unsupported part is all zero. */
1204 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1205 		if (nlongs > PAGE_SIZE/sizeof(long))
1206 			return -EINVAL;
1207 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1208 			unsigned long t;
1209 			if (get_user(t, nmask + k))
1210 				return -EFAULT;
1211 			if (k == nlongs - 1) {
1212 				if (t & endmask)
1213 					return -EINVAL;
1214 			} else if (t)
1215 				return -EINVAL;
1216 		}
1217 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1218 		endmask = ~0UL;
1219 	}
1220 
1221 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1222 		return -EFAULT;
1223 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1224 	return 0;
1225 }
1226 
1227 /* Copy a kernel node mask to user space */
1228 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1229 			      nodemask_t *nodes)
1230 {
1231 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1232 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1233 
1234 	if (copy > nbytes) {
1235 		if (copy > PAGE_SIZE)
1236 			return -EINVAL;
1237 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1238 			return -EFAULT;
1239 		copy = nbytes;
1240 	}
1241 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1242 }
1243 
1244 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1245 		unsigned long, mode, unsigned long __user *, nmask,
1246 		unsigned long, maxnode, unsigned, flags)
1247 {
1248 	nodemask_t nodes;
1249 	int err;
1250 	unsigned short mode_flags;
1251 
1252 	mode_flags = mode & MPOL_MODE_FLAGS;
1253 	mode &= ~MPOL_MODE_FLAGS;
1254 	if (mode >= MPOL_MAX)
1255 		return -EINVAL;
1256 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1257 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1258 		return -EINVAL;
1259 	err = get_nodes(&nodes, nmask, maxnode);
1260 	if (err)
1261 		return err;
1262 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1263 }
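/*
 * Illustration (a minimal userspace sketch, assuming the <numaif.h>
 * wrapper from libnuma): bind an existing mapping at addr/len to node 0
 * and migrate any of its pages that already live elsewhere, failing with
 * EIO if some pages cannot be moved:
 *
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0),
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 */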
1264 
1265 /* Set the process memory policy */
1266 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1267 		unsigned long, maxnode)
1268 {
1269 	int err;
1270 	nodemask_t nodes;
1271 	unsigned short flags;
1272 
1273 	flags = mode & MPOL_MODE_FLAGS;
1274 	mode &= ~MPOL_MODE_FLAGS;
1275 	if ((unsigned int)mode >= MPOL_MAX)
1276 		return -EINVAL;
1277 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1278 		return -EINVAL;
1279 	err = get_nodes(&nodes, nmask, maxnode);
1280 	if (err)
1281 		return err;
1282 	return do_set_mempolicy(mode, flags, &nodes);
1283 }
1284 
1285 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1286 		const unsigned long __user *, old_nodes,
1287 		const unsigned long __user *, new_nodes)
1288 {
1289 	const struct cred *cred = current_cred(), *tcred;
1290 	struct mm_struct *mm = NULL;
1291 	struct task_struct *task;
1292 	nodemask_t task_nodes;
1293 	int err;
1294 	nodemask_t *old;
1295 	nodemask_t *new;
1296 	NODEMASK_SCRATCH(scratch);
1297 
1298 	if (!scratch)
1299 		return -ENOMEM;
1300 
1301 	old = &scratch->mask1;
1302 	new = &scratch->mask2;
1303 
1304 	err = get_nodes(old, old_nodes, maxnode);
1305 	if (err)
1306 		goto out;
1307 
1308 	err = get_nodes(new, new_nodes, maxnode);
1309 	if (err)
1310 		goto out;
1311 
1312 	/* Find the mm_struct */
1313 	rcu_read_lock();
1314 	task = pid ? find_task_by_vpid(pid) : current;
1315 	if (!task) {
1316 		rcu_read_unlock();
1317 		err = -ESRCH;
1318 		goto out;
1319 	}
1320 	mm = get_task_mm(task);
1321 	rcu_read_unlock();
1322 
1323 	err = -EINVAL;
1324 	if (!mm)
1325 		goto out;
1326 
1327 	/*
1328 	 * Check if this process has the right to modify the specified
1329 	 * process. The right exists if the process has administrative
1330 	 * capabilities, superuser privileges or the same
1331 	 * userid as the target process.
1332 	 */
1333 	rcu_read_lock();
1334 	tcred = __task_cred(task);
1335 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1336 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1337 	    !capable(CAP_SYS_NICE)) {
1338 		rcu_read_unlock();
1339 		err = -EPERM;
1340 		goto out;
1341 	}
1342 	rcu_read_unlock();
1343 
1344 	task_nodes = cpuset_mems_allowed(task);
1345 	/* Is the user allowed to access the target nodes? */
1346 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1347 		err = -EPERM;
1348 		goto out;
1349 	}
1350 
1351 	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1352 		err = -EINVAL;
1353 		goto out;
1354 	}
1355 
1356 	err = security_task_movememory(task);
1357 	if (err)
1358 		goto out;
1359 
1360 	err = do_migrate_pages(mm, old, new,
1361 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1362 out:
1363 	if (mm)
1364 		mmput(mm);
1365 	NODEMASK_SCRATCH_FREE(scratch);
1366 
1367 	return err;
1368 }
1369 
1370 
1371 /* Retrieve NUMA policy */
1372 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1373 		unsigned long __user *, nmask, unsigned long, maxnode,
1374 		unsigned long, addr, unsigned long, flags)
1375 {
1376 	int err;
1377 	int uninitialized_var(pval);
1378 	nodemask_t nodes;
1379 
1380 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1381 		return -EINVAL;
1382 
1383 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1384 
1385 	if (err)
1386 		return err;
1387 
1388 	if (policy && put_user(pval, policy))
1389 		return -EFAULT;
1390 
1391 	if (nmask)
1392 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1393 
1394 	return err;
1395 }
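/*
 * Illustration (userspace sketch): ask which node currently backs a given
 * address; the page is faulted in if it is not yet present:
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */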
1396 
1397 #ifdef CONFIG_COMPAT
1398 
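/*
 * 32-bit compat wrappers: convert the compat-sized bitmaps to native
 * unsigned long bitmaps in a temporary buffer obtained from
 * compat_alloc_user_space() and forward to the native syscalls.
 */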
1399 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1400 				     compat_ulong_t __user *nmask,
1401 				     compat_ulong_t maxnode,
1402 				     compat_ulong_t addr, compat_ulong_t flags)
1403 {
1404 	long err;
1405 	unsigned long __user *nm = NULL;
1406 	unsigned long nr_bits, alloc_size;
1407 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1408 
1409 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1410 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1411 
1412 	if (nmask)
1413 		nm = compat_alloc_user_space(alloc_size);
1414 
1415 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1416 
1417 	if (!err && nmask) {
1418 		err = copy_from_user(bm, nm, alloc_size);
1419 		/* ensure entire bitmap is zeroed */
1420 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1421 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1422 	}
1423 
1424 	return err;
1425 }
1426 
1427 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1428 				     compat_ulong_t maxnode)
1429 {
1430 	long err = 0;
1431 	unsigned long __user *nm = NULL;
1432 	unsigned long nr_bits, alloc_size;
1433 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1434 
1435 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1436 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1437 
1438 	if (nmask) {
1439 		err = compat_get_bitmap(bm, nmask, nr_bits);
1440 		nm = compat_alloc_user_space(alloc_size);
1441 		err |= copy_to_user(nm, bm, alloc_size);
1442 	}
1443 
1444 	if (err)
1445 		return -EFAULT;
1446 
1447 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1448 }
1449 
1450 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1451 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1452 			     compat_ulong_t maxnode, compat_ulong_t flags)
1453 {
1454 	long err = 0;
1455 	unsigned long __user *nm = NULL;
1456 	unsigned long nr_bits, alloc_size;
1457 	nodemask_t bm;
1458 
1459 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1460 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1461 
1462 	if (nmask) {
1463 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1464 		nm = compat_alloc_user_space(alloc_size);
1465 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1466 	}
1467 
1468 	if (err)
1469 		return -EFAULT;
1470 
1471 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1472 }
1473 
1474 #endif
1475 
1476 /*
1477  * get_vma_policy(@task, @vma, @addr)
1478  * @task - task for fallback if vma policy == default
1479  * @vma   - virtual memory area whose policy is sought
1480  * @addr  - address in @vma for shared policy lookup
1481  *
1482  * Returns effective policy for a VMA at specified address.
1483  * Falls back to @task or system default policy, as necessary.
1484  * Current or other task's task mempolicy and non-shared vma policies
1485  * are protected by the task's mmap_sem, which must be held for read by
1486  * the caller.
1487  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1488  * count--added by the get_policy() vm_op, as appropriate--to protect against
1489  * freeing by another task.  It is the caller's responsibility to free the
1490  * extra reference for shared policies.
1491  */
1492 static struct mempolicy *get_vma_policy(struct task_struct *task,
1493 		struct vm_area_struct *vma, unsigned long addr)
1494 {
1495 	struct mempolicy *pol = task->mempolicy;
1496 
1497 	if (vma) {
1498 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1499 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1500 									addr);
1501 			if (vpol)
1502 				pol = vpol;
1503 		} else if (vma->vm_policy)
1504 			pol = vma->vm_policy;
1505 	}
1506 	if (!pol)
1507 		pol = &default_policy;
1508 	return pol;
1509 }
1510 
1511 /*
1512  * Return a nodemask representing a mempolicy for filtering nodes for
1513  * page allocation
1514  */
1515 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1516 {
1517 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1518 	if (unlikely(policy->mode == MPOL_BIND) &&
1519 			gfp_zone(gfp) >= policy_zone &&
1520 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1521 		return &policy->v.nodes;
1522 
1523 	return NULL;
1524 }
1525 
1526 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1527 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528 	int nd)
1529 {
1530 	switch (policy->mode) {
1531 	case MPOL_PREFERRED:
1532 		if (!(policy->flags & MPOL_F_LOCAL))
1533 			nd = policy->v.preferred_node;
1534 		break;
1535 	case MPOL_BIND:
1536 		/*
1537 		 * Normally, MPOL_BIND allocations are node-local within the
1538 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1539 		 * current node isn't part of the mask, we use the zonelist for
1540 		 * the first node in the mask instead.
1541 		 */
1542 		if (unlikely(gfp & __GFP_THISNODE) &&
1543 				unlikely(!node_isset(nd, policy->v.nodes)))
1544 			nd = first_node(policy->v.nodes);
1545 		break;
1546 	default:
1547 		BUG();
1548 	}
1549 	return node_zonelist(nd, gfp);
1550 }
1551 
1552 /* Do dynamic interleaving for a process */
1553 static unsigned interleave_nodes(struct mempolicy *policy)
1554 {
1555 	unsigned nid, next;
1556 	struct task_struct *me = current;
1557 
1558 	nid = me->il_next;
1559 	next = next_node(nid, policy->v.nodes);
1560 	if (next >= MAX_NUMNODES)
1561 		next = first_node(policy->v.nodes);
1562 	if (next < MAX_NUMNODES)
1563 		me->il_next = next;
1564 	return nid;
1565 }
1566 
1567 /*
1568  * Depending on the memory policy provide a node from which to allocate the
1569  * next slab entry.
1570  * @policy must be protected from freeing by the caller.  If @policy is
1571  * the current task's mempolicy, this protection is implicit, as only the
1572  * task can change its policy.  The system default policy requires no
1573  * such protection.
1574  */
1575 unsigned slab_node(struct mempolicy *policy)
1576 {
1577 	if (!policy || policy->flags & MPOL_F_LOCAL)
1578 		return numa_node_id();
1579 
1580 	switch (policy->mode) {
1581 	case MPOL_PREFERRED:
1582 		/*
1583 		 * handled MPOL_F_LOCAL above
1584 		 */
1585 		return policy->v.preferred_node;
1586 
1587 	case MPOL_INTERLEAVE:
1588 		return interleave_nodes(policy);
1589 
1590 	case MPOL_BIND: {
1591 		/*
1592 		 * Follow bind policy behavior and start allocation at the
1593 		 * first node.
1594 		 */
1595 		struct zonelist *zonelist;
1596 		struct zone *zone;
1597 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1598 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1599 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1600 							&policy->v.nodes,
1601 							&zone);
1602 		return zone ? zone->node : numa_node_id();
1603 	}
1604 
1605 	default:
1606 		BUG();
1607 	}
1608 }
1609 
1610 /* Do static interleaving for a VMA with known offset. */
1611 static unsigned offset_il_node(struct mempolicy *pol,
1612 		struct vm_area_struct *vma, unsigned long off)
1613 {
1614 	unsigned nnodes = nodes_weight(pol->v.nodes);
1615 	unsigned target;
1616 	int c;
1617 	int nid = -1;
1618 
1619 	if (!nnodes)
1620 		return numa_node_id();
1621 	target = (unsigned int)off % nnodes;
1622 	c = 0;
1623 	do {
1624 		nid = next_node(nid, pol->v.nodes);
1625 		c++;
1626 	} while (c <= target);
1627 	return nid;
1628 }
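/*
 * E.g. (illustrative): with pol->v.nodes = {0,2,3} and off = 7, nnodes is 3
 * and target = 7 % 3 = 1, so the loop above stops on the second set node
 * and returns node 2.
 */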
1629 
1630 /* Determine a node number for interleave */
1631 static inline unsigned interleave_nid(struct mempolicy *pol,
1632 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1633 {
1634 	if (vma) {
1635 		unsigned long off;
1636 
1637 		/*
1638 		 * for small pages, there is no difference between
1639 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1640 		 * for huge pages, since vm_pgoff is in units of small
1641 		 * pages, we need to shift off the always 0 bits to get
1642 		 * a useful offset.
1643 		 */
1644 		BUG_ON(shift < PAGE_SHIFT);
1645 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1646 		off += (addr - vma->vm_start) >> shift;
1647 		return offset_il_node(pol, vma, off);
1648 	} else
1649 		return interleave_nodes(pol);
1650 }
1651 
1652 #ifdef CONFIG_HUGETLBFS
1653 /*
1654  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1655  * @vma = virtual memory area whose policy is sought
1656  * @addr = address in @vma for shared policy lookup and interleave policy
1657  * @gfp_flags = for requested zone
1658  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1659  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1660  *
1661  * Returns a zonelist suitable for a huge page allocation and a pointer
1662  * to the struct mempolicy for conditional unref after allocation.
1663  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1664  * @nodemask for filtering the zonelist.
1665  *
1666  * Must be protected by get_mems_allowed()
1667  */
1668 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1669 				gfp_t gfp_flags, struct mempolicy **mpol,
1670 				nodemask_t **nodemask)
1671 {
1672 	struct zonelist *zl;
1673 
1674 	*mpol = get_vma_policy(current, vma, addr);
1675 	*nodemask = NULL;	/* assume !MPOL_BIND */
1676 
1677 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1678 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1679 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1680 	} else {
1681 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1682 		if ((*mpol)->mode == MPOL_BIND)
1683 			*nodemask = &(*mpol)->v.nodes;
1684 	}
1685 	return zl;
1686 }
1687 
1688 /*
1689  * init_nodemask_of_mempolicy
1690  *
1691  * If the current task's mempolicy is "default" [NULL], return 'false'
1692  * to indicate default policy.  Otherwise, extract the policy nodemask
1693  * for 'bind' or 'interleave' policy into the argument nodemask, or
1694  * initialize the argument nodemask to contain the single node for
1695  * 'preferred' or 'local' policy and return 'true' to indicate presence
1696  * of non-default mempolicy.
1697  *
1698  * We don't bother with reference counting the mempolicy [mpol_get/put]
1699  * because the current task is examining its own mempolicy and a task's
1700  * mempolicy is only ever changed by the task itself.
1701  *
1702  * N.B., it is the caller's responsibility to free a returned nodemask.
1703  */
1704 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1705 {
1706 	struct mempolicy *mempolicy;
1707 	int nid;
1708 
1709 	if (!(mask && current->mempolicy))
1710 		return false;
1711 
1712 	task_lock(current);
1713 	mempolicy = current->mempolicy;
1714 	switch (mempolicy->mode) {
1715 	case MPOL_PREFERRED:
1716 		if (mempolicy->flags & MPOL_F_LOCAL)
1717 			nid = numa_node_id();
1718 		else
1719 			nid = mempolicy->v.preferred_node;
1720 		init_nodemask_of_node(mask, nid);
1721 		break;
1722 
1723 	case MPOL_BIND:
1724 		/* Fall through */
1725 	case MPOL_INTERLEAVE:
1726 		*mask =  mempolicy->v.nodes;
1727 		break;
1728 
1729 	default:
1730 		BUG();
1731 	}
1732 	task_unlock(current);
1733 
1734 	return true;
1735 }
1736 #endif
1737 
1738 /*
1739  * mempolicy_nodemask_intersects
1740  *
1741  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1742  * policy.  Otherwise, check for intersection between mask and the policy
1743  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1744  * policy, always return true since it may allocate elsewhere on fallback.
1745  *
1746  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1747  */
1748 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1749 					const nodemask_t *mask)
1750 {
1751 	struct mempolicy *mempolicy;
1752 	bool ret = true;
1753 
1754 	if (!mask)
1755 		return ret;
1756 	task_lock(tsk);
1757 	mempolicy = tsk->mempolicy;
1758 	if (!mempolicy)
1759 		goto out;
1760 
1761 	switch (mempolicy->mode) {
1762 	case MPOL_PREFERRED:
1763 		/*
1764 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1765 		 * allocate from; they may fall back to other nodes when oom.
1766 		 * Thus, it's possible for tsk to have allocated memory from
1767 		 * nodes in mask.
1768 		 */
1769 		break;
1770 	case MPOL_BIND:
1771 	case MPOL_INTERLEAVE:
1772 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1773 		break;
1774 	default:
1775 		BUG();
1776 	}
1777 out:
1778 	task_unlock(tsk);
1779 	return ret;
1780 }
1781 
1782 /* Allocate a page in interleaved policy.
1783    Own path because it needs to do special accounting. */
1784 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1785 					unsigned nid)
1786 {
1787 	struct zonelist *zl;
1788 	struct page *page;
1789 
1790 	zl = node_zonelist(nid, gfp);
1791 	page = __alloc_pages(gfp, order, zl);
1792 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1793 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1794 	return page;
1795 }
1796 
1797 /**
1798  * 	alloc_pages_vma	- Allocate a page for a VMA.
1799  *
1800  * 	@gfp:
1801  *      %GFP_USER    user allocation.
1802  *      %GFP_KERNEL  kernel allocations,
1803  *      %GFP_HIGHMEM highmem/user allocations,
1804  *      %GFP_FS      allocation should not call back into a file system.
1805  *      %GFP_ATOMIC  don't sleep.
1806  *
1807  *	@order: Order of the GFP allocation.
1808  * 	@vma:  Pointer to VMA or NULL if not available.
1809  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1810  *
1811  * 	This function allocates a page from the kernel page pool and applies
1812  *	a NUMA policy associated with the VMA or the current process.
1813  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1814  *	mm_struct of the VMA to prevent it from going away. Should be used for
1815  *	all allocations for pages that will be mapped into
1816  * 	user space. Returns NULL when no page can be allocated.
1817  *
1818  *	Should be called with the mmap_sem of the vma held.
1819  */
1820 struct page *
1821 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1822 		unsigned long addr, int node)
1823 {
1824 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1825 	struct zonelist *zl;
1826 	struct page *page;
1827 
1828 	get_mems_allowed();
1829 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1830 		unsigned nid;
1831 
1832 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1833 		mpol_cond_put(pol);
1834 		page = alloc_page_interleave(gfp, order, nid);
1835 		put_mems_allowed();
1836 		return page;
1837 	}
1838 	zl = policy_zonelist(gfp, pol, node);
1839 	if (unlikely(mpol_needs_cond_ref(pol))) {
1840 		/*
1841 		 * slow path: ref counted shared policy
1842 		 */
1843 		struct page *page = __alloc_pages_nodemask(gfp, order,
1844 						zl, policy_nodemask(gfp, pol));
1845 		__mpol_put(pol);
1846 		put_mems_allowed();
1847 		return page;
1848 	}
1849 	/*
1850 	 * fast path:  default or task policy
1851 	 */
1852 	page = __alloc_pages_nodemask(gfp, order, zl,
1853 				      policy_nodemask(gfp, pol));
1854 	put_mems_allowed();
1855 	return page;
1856 }
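/*
 * Illustrative call, modelled on an anonymous page fault (a sketch, not a
 * caller defined here): allocate one movable highmem user page at @address,
 * obeying vma's policy and preferring the faulting CPU's node:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id());
 *	if (!page)
 *		return VM_FAULT_OOM;
 */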
1857 
1858 /**
1859  * 	alloc_pages_current - Allocate pages.
1860  *
1861  *	@gfp:
1862  *		%GFP_USER   user allocation,
1863  *      	%GFP_KERNEL kernel allocation,
1864  *      	%GFP_HIGHMEM highmem allocation,
1865  *      	%GFP_FS     don't call back into a file system.
1866  *      	%GFP_ATOMIC don't sleep.
1867  *	@order: Order of the allocation: allocates 2^@order contiguous pages. 0 is a single page.
1868  *
1869  *	Allocate a page from the kernel page pool.  When not in
1870  *	interrupt context, apply the current process' NUMA policy.
1871  *	Returns NULL when no page can be allocated.
1872  *
1873  *	Don't call cpuset_update_task_memory_state() unless
1874  *	1) it's ok to take cpuset_sem (can WAIT), and
1875  *	2) allocating for current task (not interrupt).
1876  */
1877 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1878 {
1879 	struct mempolicy *pol = current->mempolicy;
1880 	struct page *page;
1881 
1882 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1883 		pol = &default_policy;
1884 
1885 	get_mems_allowed();
1886 	/*
1887 	 * No reference counting needed for current->mempolicy
1888 	 * nor system default_policy
1889 	 */
1890 	if (pol->mode == MPOL_INTERLEAVE)
1891 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1892 	else
1893 		page = __alloc_pages_nodemask(gfp, order,
1894 				policy_zonelist(gfp, pol, numa_node_id()),
1895 				policy_nodemask(gfp, pol));
1896 	put_mems_allowed();
1897 	return page;
1898 }
1899 EXPORT_SYMBOL(alloc_pages_current);
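/*
 * On CONFIG_NUMA kernels the generic alloc_pages()/alloc_page() helpers
 * resolve to alloc_pages_current(), so, for example,
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);
 *
 * allocates four contiguous pages honouring current->mempolicy, unless it
 * is called from interrupt context, in which case the default policy is
 * used.
 */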
1900 
1901 /*
1902  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1903  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1904  * with the mems_allowed returned by cpuset_mems_allowed().  This
1905  * keeps mempolicies cpuset-relative after their cpuset moves.  See
1906  * further kernel/cpuset.c update_nodemask().
1907  *
1908  * current's mempolicy may be rebound by another task (the task that changes
1909  * the cpuset's mems), so we need not do the rebind work for current here.
1910  */
1911 
1912 /* Slow path of a mempolicy duplicate */
1913 struct mempolicy *__mpol_dup(struct mempolicy *old)
1914 {
1915 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1916 
1917 	if (!new)
1918 		return ERR_PTR(-ENOMEM);
1919 
1920 	/* task's mempolicy is protected by alloc_lock */
1921 	if (old == current->mempolicy) {
1922 		task_lock(current);
1923 		*new = *old;
1924 		task_unlock(current);
1925 	} else
1926 		*new = *old;
1927 
1928 	rcu_read_lock();
1929 	if (current_cpuset_is_being_rebound()) {
1930 		nodemask_t mems = cpuset_mems_allowed(current);
1931 		if (new->flags & MPOL_F_REBINDING)
1932 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1933 		else
1934 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1935 	}
1936 	rcu_read_unlock();
1937 	atomic_set(&new->refcnt, 1);
1938 	return new;
1939 }
1940 
1941 /*
1942  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1943  * eliminate the MPOL_F_* flags that require conditional ref and
1944  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1945  * after return.  Use the returned value.
1946  *
1947  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1948  * policy lookup, even if the policy needs/has extra ref on lookup.
1949  * shmem_readahead needs this.
1950  */
1951 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1952 						struct mempolicy *frompol)
1953 {
1954 	if (!mpol_needs_cond_ref(frompol))
1955 		return frompol;
1956 
1957 	*tompol = *frompol;
1958 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1959 	__mpol_put(frompol);
1960 	return tompol;
1961 }
1962 
1963 /* Slow path of a mempolicy comparison */
1964 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1965 {
1966 	if (!a || !b)
1967 		return 0;
1968 	if (a->mode != b->mode)
1969 		return 0;
1970 	if (a->flags != b->flags)
1971 		return 0;
1972 	if (mpol_store_user_nodemask(a))
1973 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1974 			return 0;
1975 
1976 	switch (a->mode) {
1977 	case MPOL_BIND:
1978 		/* Fall through */
1979 	case MPOL_INTERLEAVE:
1980 		return nodes_equal(a->v.nodes, b->v.nodes);
1981 	case MPOL_PREFERRED:
1982 		return a->v.preferred_node == b->v.preferred_node &&
1983 			a->flags == b->flags;
1984 	default:
1985 		BUG();
1986 		return 0;
1987 	}
1988 }
1989 
1990 /*
1991  * Shared memory backing store policy support.
1992  *
1993  * Remember policies even when nobody has shared memory mapped.
1994  * The policies are kept in Red-Black tree linked from the inode.
1995  * They are protected by the sp->lock spinlock, which should be held
1996  * for any accesses to the tree.
1997  */
1998 
1999 /* lookup first element intersecting start-end */
2000 /* Caller holds sp->lock */
2001 static struct sp_node *
2002 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2003 {
2004 	struct rb_node *n = sp->root.rb_node;
2005 
2006 	while (n) {
2007 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2008 
2009 		if (start >= p->end)
2010 			n = n->rb_right;
2011 		else if (end <= p->start)
2012 			n = n->rb_left;
2013 		else
2014 			break;
2015 	}
2016 	if (!n)
2017 		return NULL;
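	/*
	 * n overlaps [start, end); walk back to the lowest-starting node
	 * that still overlaps so the caller gets the first intersecting
	 * range and can then iterate forward with rb_next().
	 */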
2018 	for (;;) {
2019 		struct sp_node *w = NULL;
2020 		struct rb_node *prev = rb_prev(n);
2021 		if (!prev)
2022 			break;
2023 		w = rb_entry(prev, struct sp_node, nd);
2024 		if (w->end <= start)
2025 			break;
2026 		n = prev;
2027 	}
2028 	return rb_entry(n, struct sp_node, nd);
2029 }
2030 
2031 /* Insert a new shared policy into the list. */
2032 /* Caller holds sp->lock */
2033 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2034 {
2035 	struct rb_node **p = &sp->root.rb_node;
2036 	struct rb_node *parent = NULL;
2037 	struct sp_node *nd;
2038 
2039 	while (*p) {
2040 		parent = *p;
2041 		nd = rb_entry(parent, struct sp_node, nd);
2042 		if (new->start < nd->start)
2043 			p = &(*p)->rb_left;
2044 		else if (new->end > nd->end)
2045 			p = &(*p)->rb_right;
2046 		else
2047 			BUG();
2048 	}
2049 	rb_link_node(&new->nd, parent, p);
2050 	rb_insert_color(&new->nd, &sp->root);
2051 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2052 		 new->policy ? new->policy->mode : 0);
2053 }
2054 
2055 /* Find shared policy intersecting idx */
2056 struct mempolicy *
2057 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2058 {
2059 	struct mempolicy *pol = NULL;
2060 	struct sp_node *sn;
2061 
2062 	if (!sp->root.rb_node)
2063 		return NULL;
2064 	spin_lock(&sp->lock);
2065 	sn = sp_lookup(sp, idx, idx+1);
2066 	if (sn) {
2067 		mpol_get(sn->policy);
2068 		pol = sn->policy;
2069 	}
2070 	spin_unlock(&sp->lock);
2071 	return pol;
2072 }
2073 
2074 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2075 {
2076 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2077 	rb_erase(&n->nd, &sp->root);
2078 	mpol_put(n->policy);
2079 	kmem_cache_free(sn_cache, n);
2080 }
2081 
2082 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2083 				struct mempolicy *pol)
2084 {
2085 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2086 
2087 	if (!n)
2088 		return NULL;
2089 	n->start = start;
2090 	n->end = end;
2091 	mpol_get(pol);
2092 	pol->flags |= MPOL_F_SHARED;	/* for unref */
2093 	n->policy = pol;
2094 	return n;
2095 }
2096 
2097 /* Replace a policy range. */
2098 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2099 				 unsigned long end, struct sp_node *new)
2100 {
2101 	struct sp_node *n, *new2 = NULL;
2102 
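	/*
	 * If an existing range completely spans [start, end) it must be
	 * split in two, which requires a second sp_node.  That node cannot
	 * be allocated while holding sp->lock, so drop the lock, allocate
	 * new2 and restart the scan.
	 */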
2103 restart:
2104 	spin_lock(&sp->lock);
2105 	n = sp_lookup(sp, start, end);
2106 	/* Take care of old policies in the same range. */
2107 	while (n && n->start < end) {
2108 		struct rb_node *next = rb_next(&n->nd);
2109 		if (n->start >= start) {
2110 			if (n->end <= end)
2111 				sp_delete(sp, n);
2112 			else
2113 				n->start = end;
2114 		} else {
2115 			/* Old policy spanning whole new range. */
2116 			if (n->end > end) {
2117 				if (!new2) {
2118 					spin_unlock(&sp->lock);
2119 					new2 = sp_alloc(end, n->end, n->policy);
2120 					if (!new2)
2121 						return -ENOMEM;
2122 					goto restart;
2123 				}
2124 				n->end = start;
2125 				sp_insert(sp, new2);
2126 				new2 = NULL;
2127 				break;
2128 			} else
2129 				n->end = start;
2130 		}
2131 		if (!next)
2132 			break;
2133 		n = rb_entry(next, struct sp_node, nd);
2134 	}
2135 	if (new)
2136 		sp_insert(sp, new);
2137 	spin_unlock(&sp->lock);
2138 	if (new2) {
2139 		mpol_put(new2->policy);
2140 		kmem_cache_free(sn_cache, new2);
2141 	}
2142 	return 0;
2143 }
2144 
2145 /**
2146  * mpol_shared_policy_init - initialize shared policy for inode
2147  * @sp: pointer to inode shared policy
2148  * @mpol:  struct mempolicy to install
2149  *
2150  * Install non-NULL @mpol in inode's shared policy rb-tree.
2151  * On entry, the current task has a reference on a non-NULL @mpol.
2152  * This must be released on exit.
2153  * This is called from get_inode() context, so GFP_KERNEL allocations are safe.
2154  */
2155 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2156 {
2157 	int ret;
2158 
2159 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2160 	spin_lock_init(&sp->lock);
2161 
2162 	if (mpol) {
2163 		struct vm_area_struct pvma;
2164 		struct mempolicy *new;
2165 		NODEMASK_SCRATCH(scratch);
2166 
2167 		if (!scratch)
2168 			goto put_mpol;
2169 		/* contextualize the tmpfs mount point mempolicy */
2170 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2171 		if (IS_ERR(new))
2172 			goto free_scratch; /* no valid nodemask intersection */
2173 
2174 		task_lock(current);
2175 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2176 		task_unlock(current);
2177 		if (ret)
2178 			goto put_new;
2179 
2180 		/* Create pseudo-vma that contains just the policy */
2181 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2182 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2183 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2184 
2185 put_new:
2186 		mpol_put(new);			/* drop initial ref */
2187 free_scratch:
2188 		NODEMASK_SCRATCH_FREE(scratch);
2189 put_mpol:
2190 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2191 	}
2192 }
2193 
2194 int mpol_set_shared_policy(struct shared_policy *info,
2195 			struct vm_area_struct *vma, struct mempolicy *npol)
2196 {
2197 	int err;
2198 	struct sp_node *new = NULL;
2199 	unsigned long sz = vma_pages(vma);
2200 
2201 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2202 		 vma->vm_pgoff,
2203 		 sz, npol ? npol->mode : -1,
2204 		 npol ? npol->flags : -1,
2205 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2206 
2207 	if (npol) {
2208 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2209 		if (!new)
2210 			return -ENOMEM;
2211 	}
2212 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2213 	if (err && new)
2214 		kmem_cache_free(sn_cache, new);
2215 	return err;
2216 }
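/*
 * Illustrative caller, modelled on shmem's set_policy vm_operation (a
 * sketch; the real definition lives in mm/shmem.c, not here):
 *
 *	static int shmem_set_policy(struct vm_area_struct *vma,
 *				    struct mempolicy *mpol)
 *	{
 *		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 *
 *		return mpol_set_shared_policy(&SHMEM_I(inode)->policy,
 *					      vma, mpol);
 *	}
 */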
2217 
2218 /* Free a backing policy store on inode delete. */
2219 void mpol_free_shared_policy(struct shared_policy *p)
2220 {
2221 	struct sp_node *n;
2222 	struct rb_node *next;
2223 
2224 	if (!p->root.rb_node)
2225 		return;
2226 	spin_lock(&p->lock);
2227 	next = rb_first(&p->root);
2228 	while (next) {
2229 		n = rb_entry(next, struct sp_node, nd);
2230 		next = rb_next(&n->nd);
2231 		rb_erase(&n->nd, &p->root);
2232 		mpol_put(n->policy);
2233 		kmem_cache_free(sn_cache, n);
2234 	}
2235 	spin_unlock(&p->lock);
2236 }
2237 
2238 /* assumes fs == KERNEL_DS */
2239 void __init numa_policy_init(void)
2240 {
2241 	nodemask_t interleave_nodes;
2242 	unsigned long largest = 0;
2243 	int nid, prefer = 0;
2244 
2245 	policy_cache = kmem_cache_create("numa_policy",
2246 					 sizeof(struct mempolicy),
2247 					 0, SLAB_PANIC, NULL);
2248 
2249 	sn_cache = kmem_cache_create("shared_policy_node",
2250 				     sizeof(struct sp_node),
2251 				     0, SLAB_PANIC, NULL);
2252 
2253 	/*
2254 	 * Set interleaving policy for system init. Interleaving is only
2255 	 * enabled across suitably sized nodes (default is >= 16MB), or
2256 	 * fall back to the largest node if they're all smaller.
2257 	 */
2258 	nodes_clear(interleave_nodes);
2259 	for_each_node_state(nid, N_HIGH_MEMORY) {
2260 		unsigned long total_pages = node_present_pages(nid);
2261 
2262 		/* Preserve the largest node */
2263 		if (largest < total_pages) {
2264 			largest = total_pages;
2265 			prefer = nid;
2266 		}
2267 
2268 		/* Interleave this node? */
2269 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2270 			node_set(nid, interleave_nodes);
2271 	}
2272 
2273 	/* All too small, use the largest */
2274 	if (unlikely(nodes_empty(interleave_nodes)))
2275 		node_set(prefer, interleave_nodes);
2276 
2277 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2278 		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2279 }
2280 
2281 /* Reset policy of current process to default */
2282 void numa_default_policy(void)
2283 {
2284 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2285 }
2286 
2287 /*
2288  * Parse and format mempolicy from/to strings
2289  */
2290 
2291 /*
2292  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2293  * Used only for mpol_parse_str() and mpol_to_str()
2294  */
2295 #define MPOL_LOCAL MPOL_MAX
2296 static const char * const policy_modes[] =
2297 {
2298 	[MPOL_DEFAULT]    = "default",
2299 	[MPOL_PREFERRED]  = "prefer",
2300 	[MPOL_BIND]       = "bind",
2301 	[MPOL_INTERLEAVE] = "interleave",
2302 	[MPOL_LOCAL]      = "local"
2303 };
2304 
2305 
2306 #ifdef CONFIG_TMPFS
2307 /**
2308  * mpol_parse_str - parse string to mempolicy
2309  * @str:  string containing mempolicy to parse
2310  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2311  * @no_context:  flag whether to "contextualize" the mempolicy
2312  *
2313  * Format of input:
2314  *	<mode>[=<flags>][:<nodelist>]
2315  *
2316  * if @no_context is true, save the input nodemask in w.user_nodemask in
2317  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2318  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2319  * mount option.  Note that if 'static' or 'relative' mode flags were
2320  * specified, the input nodemask will already have been saved.  Saving
2321  * it again is redundant, but safe.
2322  *
2323  * On success, returns 0, else 1
2324  */
2325 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2326 {
2327 	struct mempolicy *new = NULL;
2328 	unsigned short mode;
2329 	unsigned short uninitialized_var(mode_flags);
2330 	nodemask_t nodes;
2331 	char *nodelist = strchr(str, ':');
2332 	char *flags = strchr(str, '=');
2333 	int err = 1;
2334 
2335 	if (nodelist) {
2336 		/* NUL-terminate mode or flags string */
2337 		*nodelist++ = '\0';
2338 		if (nodelist_parse(nodelist, nodes))
2339 			goto out;
2340 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2341 			goto out;
2342 	} else
2343 		nodes_clear(nodes);
2344 
2345 	if (flags)
2346 		*flags++ = '\0';	/* terminate mode string */
2347 
2348 	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2349 		if (!strcmp(str, policy_modes[mode])) {
2350 			break;
2351 		}
2352 	}
2353 	if (mode > MPOL_LOCAL)
2354 		goto out;
2355 
2356 	switch (mode) {
2357 	case MPOL_PREFERRED:
2358 		/*
2359 		 * Insist on a nodelist of one node only
2360 		 */
2361 		if (nodelist) {
2362 			char *rest = nodelist;
2363 			while (isdigit(*rest))
2364 				rest++;
2365 			if (*rest)
2366 				goto out;
2367 		}
2368 		break;
2369 	case MPOL_INTERLEAVE:
2370 		/*
2371 		 * Default to online nodes with memory if no nodelist
2372 		 */
2373 		if (!nodelist)
2374 			nodes = node_states[N_HIGH_MEMORY];
2375 		break;
2376 	case MPOL_LOCAL:
2377 		/*
2378 		 * Don't allow a nodelist;  mpol_new() checks flags
2379 		 */
2380 		if (nodelist)
2381 			goto out;
2382 		mode = MPOL_PREFERRED;
2383 		break;
2384 	case MPOL_DEFAULT:
2385 		/*
2386 		 * Insist on an empty nodelist
2387 		 */
2388 		if (!nodelist)
2389 			err = 0;
2390 		goto out;
2391 	case MPOL_BIND:
2392 		/*
2393 		 * Insist on a nodelist
2394 		 */
2395 		if (!nodelist)
2396 			goto out;
2397 	}
2398 
2399 	mode_flags = 0;
2400 	if (flags) {
2401 		/*
2402 		 * Currently, we only support two mutually exclusive
2403 		 * mode flags.
2404 		 */
2405 		if (!strcmp(flags, "static"))
2406 			mode_flags |= MPOL_F_STATIC_NODES;
2407 		else if (!strcmp(flags, "relative"))
2408 			mode_flags |= MPOL_F_RELATIVE_NODES;
2409 		else
2410 			goto out;
2411 	}
2412 
2413 	new = mpol_new(mode, mode_flags, &nodes);
2414 	if (IS_ERR(new))
2415 		goto out;
2416 
2417 	if (no_context) {
2418 		/* save for contextualization */
2419 		new->w.user_nodemask = nodes;
2420 	} else {
2421 		int ret;
2422 		NODEMASK_SCRATCH(scratch);
2423 		if (scratch) {
2424 			task_lock(current);
2425 			ret = mpol_set_nodemask(new, &nodes, scratch);
2426 			task_unlock(current);
2427 		} else
2428 			ret = -ENOMEM;
2429 		NODEMASK_SCRATCH_FREE(scratch);
2430 		if (ret) {
2431 			mpol_put(new);
2432 			goto out;
2433 		}
2434 	}
2435 	err = 0;
2436 
2437 out:
2438 	/* Restore string for error message */
2439 	if (nodelist)
2440 		*--nodelist = ':';
2441 	if (flags)
2442 		*--flags = '=';
2443 	if (!err)
2444 		*mpol = new;
2445 	return err;
2446 }
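/*
 * Example strings accepted above (illustrative tmpfs mount options):
 *
 *	"interleave:0-3"	interleave over nodes 0-3
 *	"prefer:1"		prefer node 1, with normal fallback
 *	"bind=static:0,2"	bind to nodes 0 and 2, static nodemask
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *	"default"		default policy; no nodelist allowed
 */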
2447 #endif /* CONFIG_TMPFS */
2448 
2449 /**
2450  * mpol_to_str - format a mempolicy structure for printing
2451  * @buffer:  to contain formatted mempolicy string
2452  * @maxlen:  length of @buffer
2453  * @pol:  pointer to mempolicy to be formatted
2454  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2455  *
2456  * Convert a mempolicy into a string.
2457  * Returns the number of characters in buffer (if positive)
2458  * or an error (negative)
2459  */
2460 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2461 {
2462 	char *p = buffer;
2463 	int l;
2464 	nodemask_t nodes;
2465 	unsigned short mode;
2466 	unsigned short flags = pol ? pol->flags : 0;
2467 
2468 	/*
2469 	 * Sanity check:  room for longest mode, flag and some nodes
2470 	 */
2471 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2472 
2473 	if (!pol || pol == &default_policy)
2474 		mode = MPOL_DEFAULT;
2475 	else
2476 		mode = pol->mode;
2477 
2478 	switch (mode) {
2479 	case MPOL_DEFAULT:
2480 		nodes_clear(nodes);
2481 		break;
2482 
2483 	case MPOL_PREFERRED:
2484 		nodes_clear(nodes);
2485 		if (flags & MPOL_F_LOCAL)
2486 			mode = MPOL_LOCAL;	/* pseudo-policy */
2487 		else
2488 			node_set(pol->v.preferred_node, nodes);
2489 		break;
2490 
2491 	case MPOL_BIND:
2492 		/* Fall through */
2493 	case MPOL_INTERLEAVE:
2494 		if (no_context)
2495 			nodes = pol->w.user_nodemask;
2496 		else
2497 			nodes = pol->v.nodes;
2498 		break;
2499 
2500 	default:
2501 		BUG();
2502 	}
2503 
2504 	l = strlen(policy_modes[mode]);
2505 	if (buffer + maxlen < p + l + 1)
2506 		return -ENOSPC;
2507 
2508 	strcpy(p, policy_modes[mode]);
2509 	p += l;
2510 
2511 	if (flags & MPOL_MODE_FLAGS) {
2512 		if (buffer + maxlen < p + 2)
2513 			return -ENOSPC;
2514 		*p++ = '=';
2515 
2516 		/*
2517 		 * Currently, the only defined flags are mutually exclusive
2518 		 */
2519 		if (flags & MPOL_F_STATIC_NODES)
2520 			p += snprintf(p, buffer + maxlen - p, "static");
2521 		else if (flags & MPOL_F_RELATIVE_NODES)
2522 			p += snprintf(p, buffer + maxlen - p, "relative");
2523 	}
2524 
2525 	if (!nodes_empty(nodes)) {
2526 		if (buffer + maxlen < p + 2)
2527 			return -ENOSPC;
2528 		*p++ = ':';
2529 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2530 	}
2531 	return p - buffer;
2532 }
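/*
 * Example output (illustrative):
 *
 *	"default"
 *	"prefer:2"
 *	"local"
 *	"bind:1,3"
 *	"interleave=relative:0-3"
 */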
2533 
2534 struct numa_maps {
2535 	unsigned long pages;
2536 	unsigned long anon;
2537 	unsigned long active;
2538 	unsigned long writeback;
2539 	unsigned long mapcount_max;
2540 	unsigned long dirty;
2541 	unsigned long swapcache;
2542 	unsigned long node[MAX_NUMNODES];
2543 };
2544 
2545 static void gather_stats(struct page *page, void *private, int pte_dirty)
2546 {
2547 	struct numa_maps *md = private;
2548 	int count = page_mapcount(page);
2549 
2550 	md->pages++;
2551 	if (pte_dirty || PageDirty(page))
2552 		md->dirty++;
2553 
2554 	if (PageSwapCache(page))
2555 		md->swapcache++;
2556 
2557 	if (PageActive(page) || PageUnevictable(page))
2558 		md->active++;
2559 
2560 	if (PageWriteback(page))
2561 		md->writeback++;
2562 
2563 	if (PageAnon(page))
2564 		md->anon++;
2565 
2566 	if (count > md->mapcount_max)
2567 		md->mapcount_max = count;
2568 
2569 	md->node[page_to_nid(page)]++;
2570 }
2571 
2572 #ifdef CONFIG_HUGETLB_PAGE
2573 static void check_huge_range(struct vm_area_struct *vma,
2574 		unsigned long start, unsigned long end,
2575 		struct numa_maps *md)
2576 {
2577 	unsigned long addr;
2578 	struct page *page;
2579 	struct hstate *h = hstate_vma(vma);
2580 	unsigned long sz = huge_page_size(h);
2581 
2582 	for (addr = start; addr < end; addr += sz) {
2583 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2584 						addr & huge_page_mask(h));
2585 		pte_t pte;
2586 
2587 		if (!ptep)
2588 			continue;
2589 
2590 		pte = *ptep;
2591 		if (pte_none(pte))
2592 			continue;
2593 
2594 		page = pte_page(pte);
2595 		if (!page)
2596 			continue;
2597 
2598 		gather_stats(page, md, pte_dirty(*ptep));
2599 	}
2600 }
2601 #else
2602 static inline void check_huge_range(struct vm_area_struct *vma,
2603 		unsigned long start, unsigned long end,
2604 		struct numa_maps *md)
2605 {
2606 }
2607 #endif
2608 
2609 /*
2610  * Display pages allocated per node and memory policy via /proc.
2611  */
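/*
 * An output line looks like (illustrative):
 *
 *	7f3c80000000 interleave:0-3 anon=512 dirty=512 N0=128 N1=128 N2=128 N3=128
 */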
2612 int show_numa_map(struct seq_file *m, void *v)
2613 {
2614 	struct proc_maps_private *priv = m->private;
2615 	struct vm_area_struct *vma = v;
2616 	struct numa_maps *md;
2617 	struct file *file = vma->vm_file;
2618 	struct mm_struct *mm = vma->vm_mm;
2619 	struct mempolicy *pol;
2620 	int n;
2621 	char buffer[50];
2622 
2623 	if (!mm)
2624 		return 0;
2625 
2626 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2627 	if (!md)
2628 		return 0;
2629 
2630 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2631 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2632 	mpol_cond_put(pol);
2633 
2634 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2635 
2636 	if (file) {
2637 		seq_printf(m, " file=");
2638 		seq_path(m, &file->f_path, "\n\t= ");
2639 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2640 		seq_printf(m, " heap");
2641 	} else if (vma->vm_start <= mm->start_stack &&
2642 			vma->vm_end >= mm->start_stack) {
2643 		seq_printf(m, " stack");
2644 	}
2645 
2646 	if (is_vm_hugetlb_page(vma)) {
2647 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2648 		seq_printf(m, " huge");
2649 	} else {
2650 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2651 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2652 	}
2653 
2654 	if (!md->pages)
2655 		goto out;
2656 
2657 	if (md->anon)
2658 		seq_printf(m, " anon=%lu", md->anon);
2659 
2660 	if (md->dirty)
2661 		seq_printf(m, " dirty=%lu", md->dirty);
2662 
2663 	if (md->pages != md->anon && md->pages != md->dirty)
2664 		seq_printf(m, " mapped=%lu", md->pages);
2665 
2666 	if (md->mapcount_max > 1)
2667 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2668 
2669 	if (md->swapcache)
2670 		seq_printf(m, " swapcache=%lu", md->swapcache);
2671 
2672 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2673 		seq_printf(m, " active=%lu", md->active);
2674 
2675 	if (md->writeback)
2676 		seq_printf(m, " writeback=%lu", md->writeback);
2677 
2678 	for_each_node_state(n, N_HIGH_MEMORY)
2679 		if (md->node[n])
2680 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2681 out:
2682 	seq_putc(m, '\n');
2683 	kfree(md);
2684 
2685 	if (m->count < m->size)
2686 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2687 	return 0;
2688 }
2689