xref: /openbmc/linux/mm/mempolicy.c (revision 565d76cb)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
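/*
 * Illustrative sketch: how user space typically requests these policies
 * through the syscalls implemented below, using the numaif.h wrappers.
 * The node numbers are made up and the maxnode/bitmask conventions are
 * simplified; see set_mempolicy(2) and mbind(2) for the exact semantics.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	(nodes 0 and 1)
 *
 *	interleave this task's future allocations over nodes 0 and 1:
 *		set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	bind one anonymous mapping to node 0 only:
 *		void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		unsigned long node0 = 1UL << 0;
 *		mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */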
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/module.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 
94 #include <asm/tlbflush.h>
95 #include <asm/uaccess.h>
96 
97 #include "internal.h"
98 
99 /* Internal flags */
100 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
101 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
102 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
103 
104 static struct kmem_cache *policy_cache;
105 static struct kmem_cache *sn_cache;
106 
107 /* Highest zone. A specific allocation for a zone below that is not
108    policied. */
109 enum zone_type policy_zone = 0;
110 
111 /*
112  * run-time system-wide default policy => local allocation
113  */
114 struct mempolicy default_policy = {
115 	.refcnt = ATOMIC_INIT(1), /* never free it */
116 	.mode = MPOL_PREFERRED,
117 	.flags = MPOL_F_LOCAL,
118 };
119 
120 static const struct mempolicy_operations {
121 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 	/*
123 	 * If the read-side task has no lock to protect task->mempolicy, the
124 	 * write-side task will rebind task->mempolicy in two steps. The first
125 	 * step sets all the new nodes, and the second step clears all the
126 	 * disallowed nodes. In this way, we avoid ever being left with no node
127 	 * to allocate a page from.
128 	 * If the read side has a lock to protect task->mempolicy, we do the
129 	 * rebind directly.
130 	 *
131 	 * step:
132 	 * 	MPOL_REBIND_ONCE - do rebind work at once
133 	 * 	MPOL_REBIND_STEP1 - set all the new nodes
134 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
135 	 */
136 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 			enum mpol_rebind_step step);
138 } mpol_ops[MPOL_MAX];
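/*
 * Worked illustration of the two-step rebind described above: suppose a
 * policy's nodes are {0,1} and the cpuset's mems_allowed changes to {2,3}.
 * A lockless reader may observe the policy mid-update, so:
 *
 *	MPOL_REBIND_STEP1: nodes become {0,1,2,3}  (new nodes added)
 *	MPOL_REBIND_STEP2: nodes become {2,3}      (disallowed nodes cleared)
 *
 * At no point is the nodemask empty, so a concurrent allocation always
 * finds some node to allocate from.
 */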
139 
140 /* Check that the nodemask contains at least one populated zone */
141 static int is_valid_nodemask(const nodemask_t *nodemask)
142 {
143 	int nd, k;
144 
145 	for_each_node_mask(nd, *nodemask) {
146 		struct zone *z;
147 
148 		for (k = 0; k <= policy_zone; k++) {
149 			z = &NODE_DATA(nd)->node_zones[k];
150 			if (z->present_pages > 0)
151 				return 1;
152 		}
153 	}
154 
155 	return 0;
156 }
157 
158 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
159 {
160 	return pol->flags & MPOL_MODE_FLAGS;
161 }
162 
163 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
164 				   const nodemask_t *rel)
165 {
166 	nodemask_t tmp;
167 	nodes_fold(tmp, *orig, nodes_weight(*rel));
168 	nodes_onto(*ret, tmp, *rel);
169 }
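/*
 * Worked illustration, assuming the usual nodes_fold()/nodes_onto()
 * semantics: for MPOL_F_RELATIVE_NODES the user's mask is interpreted as
 * positions within the currently allowed set.  With *orig = {0,2} and
 * *rel = {4,5,6} (weight 3), nodes_fold() wraps orig modulo 3 (still
 * {0,2}) and nodes_onto() maps position 0 to the 0th allowed node and
 * position 2 to the 2nd, so *ret = {4,6}.
 */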
170 
171 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
172 {
173 	if (nodes_empty(*nodes))
174 		return -EINVAL;
175 	pol->v.nodes = *nodes;
176 	return 0;
177 }
178 
179 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (!nodes)
182 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
183 	else if (nodes_empty(*nodes))
184 		return -EINVAL;			/*  no allowed nodes */
185 	else
186 		pol->v.preferred_node = first_node(*nodes);
187 	return 0;
188 }
189 
190 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 {
192 	if (!is_valid_nodemask(nodes))
193 		return -EINVAL;
194 	pol->v.nodes = *nodes;
195 	return 0;
196 }
197 
198 /*
199  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
200  * any, for the new policy.  mpol_new() has already validated the nodes
201  * parameter with respect to the policy mode and flags.  But, we need to
202  * handle an empty nodemask with MPOL_PREFERRED here.
203  *
204  * Must be called holding task's alloc_lock to protect task's mems_allowed
205  * and mempolicy.  May also be called holding the mmap_sem for write.
206  */
207 static int mpol_set_nodemask(struct mempolicy *pol,
208 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
209 {
210 	int ret;
211 
212 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 	if (pol == NULL)
214 		return 0;
215 	/* Check N_HIGH_MEMORY */
216 	nodes_and(nsc->mask1,
217 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
218 
219 	VM_BUG_ON(!nodes);
220 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
221 		nodes = NULL;	/* explicit local allocation */
222 	else {
223 		if (pol->flags & MPOL_F_RELATIVE_NODES)
224 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
225 		else
226 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
227 
228 		if (mpol_store_user_nodemask(pol))
229 			pol->w.user_nodemask = *nodes;
230 		else
231 			pol->w.cpuset_mems_allowed =
232 						cpuset_current_mems_allowed;
233 	}
234 
235 	if (nodes)
236 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
237 	else
238 		ret = mpol_ops[pol->mode].create(pol, NULL);
239 	return ret;
240 }
241 
242 /*
243  * This function just creates a new policy, does some checks and simple
244  * initialization. You must invoke mpol_set_nodemask() to set nodes.
245  */
246 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
247 				  nodemask_t *nodes)
248 {
249 	struct mempolicy *policy;
250 
251 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
252 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
253 
254 	if (mode == MPOL_DEFAULT) {
255 		if (nodes && !nodes_empty(*nodes))
256 			return ERR_PTR(-EINVAL);
257 		return NULL;	/* simply delete any existing policy */
258 	}
259 	VM_BUG_ON(!nodes);
260 
261 	/*
262 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
263 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
264 	 * All other modes require a valid pointer to a non-empty nodemask.
265 	 */
266 	if (mode == MPOL_PREFERRED) {
267 		if (nodes_empty(*nodes)) {
268 			if (((flags & MPOL_F_STATIC_NODES) ||
269 			     (flags & MPOL_F_RELATIVE_NODES)))
270 				return ERR_PTR(-EINVAL);
271 		}
272 	} else if (nodes_empty(*nodes))
273 		return ERR_PTR(-EINVAL);
274 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
275 	if (!policy)
276 		return ERR_PTR(-ENOMEM);
277 	atomic_set(&policy->refcnt, 1);
278 	policy->mode = mode;
279 	policy->flags = flags;
280 
281 	return policy;
282 }
283 
284 /* Slow path of a mpol destructor. */
285 void __mpol_put(struct mempolicy *p)
286 {
287 	if (!atomic_dec_and_test(&p->refcnt))
288 		return;
289 	kmem_cache_free(policy_cache, p);
290 }
291 
292 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 				enum mpol_rebind_step step)
294 {
295 }
296 
297 /*
298  * step:
299  * 	MPOL_REBIND_ONCE  - do rebind work at once
300  * 	MPOL_REBIND_STEP1 - set all the new nodes
301  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
302  */
303 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 				 enum mpol_rebind_step step)
305 {
306 	nodemask_t tmp;
307 
308 	if (pol->flags & MPOL_F_STATIC_NODES)
309 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
310 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
311 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
312 	else {
313 		/*
314 		 * if step == MPOL_REBIND_STEP1, we use ->w.cpuset_mems_allowed
315 		 * to cache the result
316 		 */
317 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 			nodes_remap(tmp, pol->v.nodes,
319 					pol->w.cpuset_mems_allowed, *nodes);
320 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 		} else if (step == MPOL_REBIND_STEP2) {
322 			tmp = pol->w.cpuset_mems_allowed;
323 			pol->w.cpuset_mems_allowed = *nodes;
324 		} else
325 			BUG();
326 	}
327 
328 	if (nodes_empty(tmp))
329 		tmp = *nodes;
330 
331 	if (step == MPOL_REBIND_STEP1)
332 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 		pol->v.nodes = tmp;
335 	else
336 		BUG();
337 
338 	if (!node_isset(current->il_next, tmp)) {
339 		current->il_next = next_node(current->il_next, tmp);
340 		if (current->il_next >= MAX_NUMNODES)
341 			current->il_next = first_node(tmp);
342 		if (current->il_next >= MAX_NUMNODES)
343 			current->il_next = numa_node_id();
344 	}
345 }
346 
347 static void mpol_rebind_preferred(struct mempolicy *pol,
348 				  const nodemask_t *nodes,
349 				  enum mpol_rebind_step step)
350 {
351 	nodemask_t tmp;
352 
353 	if (pol->flags & MPOL_F_STATIC_NODES) {
354 		int node = first_node(pol->w.user_nodemask);
355 
356 		if (node_isset(node, *nodes)) {
357 			pol->v.preferred_node = node;
358 			pol->flags &= ~MPOL_F_LOCAL;
359 		} else
360 			pol->flags |= MPOL_F_LOCAL;
361 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363 		pol->v.preferred_node = first_node(tmp);
364 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
365 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 						   pol->w.cpuset_mems_allowed,
367 						   *nodes);
368 		pol->w.cpuset_mems_allowed = *nodes;
369 	}
370 }
371 
372 /*
373  * mpol_rebind_policy - Migrate a policy to a different set of nodes
374  *
375  * If the read-side task has no lock to protect task->mempolicy, the
376  * write-side task will rebind task->mempolicy in two steps. The first
377  * step sets all the new nodes, and the second step clears all the
378  * disallowed nodes. In this way, we avoid ever being left with no node
379  * to allocate a page from.
380  * If the read side has a lock to protect task->mempolicy, we do the
381  * rebind directly.
382  *
383  * step:
384  * 	MPOL_REBIND_ONCE  - do rebind work at once
385  * 	MPOL_REBIND_STEP1 - set all the new nodes
386  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
387  */
388 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 				enum mpol_rebind_step step)
390 {
391 	if (!pol)
392 		return;
393 	if (!mpol_store_user_nodemask(pol) && step == 0 &&
394 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 		return;
396 
397 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 		return;
399 
400 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 		BUG();
402 
403 	if (step == MPOL_REBIND_STEP1)
404 		pol->flags |= MPOL_F_REBINDING;
405 	else if (step == MPOL_REBIND_STEP2)
406 		pol->flags &= ~MPOL_F_REBINDING;
407 	else if (step >= MPOL_REBIND_NSTEP)
408 		BUG();
409 
410 	mpol_ops[pol->mode].rebind(pol, newmask, step);
411 }
412 
413 /*
414  * Wrapper for mpol_rebind_policy() that just requires a task
415  * pointer, and updates the task's mempolicy.
416  *
417  * Called with task's alloc_lock held.
418  */
419 
420 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 			enum mpol_rebind_step step)
422 {
423 	mpol_rebind_policy(tsk->mempolicy, new, step);
424 }
425 
426 /*
427  * Rebind each vma in mm to new nodemask.
428  *
429  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
430  */
431 
432 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
433 {
434 	struct vm_area_struct *vma;
435 
436 	down_write(&mm->mmap_sem);
437 	for (vma = mm->mmap; vma; vma = vma->vm_next)
438 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
439 	up_write(&mm->mmap_sem);
440 }
441 
442 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
443 	[MPOL_DEFAULT] = {
444 		.rebind = mpol_rebind_default,
445 	},
446 	[MPOL_INTERLEAVE] = {
447 		.create = mpol_new_interleave,
448 		.rebind = mpol_rebind_nodemask,
449 	},
450 	[MPOL_PREFERRED] = {
451 		.create = mpol_new_preferred,
452 		.rebind = mpol_rebind_preferred,
453 	},
454 	[MPOL_BIND] = {
455 		.create = mpol_new_bind,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 };
459 
460 static void gather_stats(struct page *, void *, int pte_dirty);
461 static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 				unsigned long flags);
463 
464 /* Scan through pages, checking whether they meet the given conditions. */
465 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
466 		unsigned long addr, unsigned long end,
467 		const nodemask_t *nodes, unsigned long flags,
468 		void *private)
469 {
470 	pte_t *orig_pte;
471 	pte_t *pte;
472 	spinlock_t *ptl;
473 
474 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
475 	do {
476 		struct page *page;
477 		int nid;
478 
479 		if (!pte_present(*pte))
480 			continue;
481 		page = vm_normal_page(vma, addr, *pte);
482 		if (!page)
483 			continue;
484 		/*
485 		 * vm_normal_page() filters out zero pages, but there might
486 		 * still be PageReserved pages to skip, perhaps in a VDSO.
487 		 * And we cannot move PageKsm pages sensibly or safely yet.
488 		 */
489 		if (PageReserved(page) || PageKsm(page))
490 			continue;
491 		nid = page_to_nid(page);
492 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 			continue;
494 
495 		if (flags & MPOL_MF_STATS)
496 			gather_stats(page, private, pte_dirty(*pte));
497 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 			migrate_page_add(page, private, flags);
499 		else
500 			break;
501 	} while (pte++, addr += PAGE_SIZE, addr != end);
502 	pte_unmap_unlock(orig_pte, ptl);
503 	return addr != end;
504 }
505 
506 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
507 		unsigned long addr, unsigned long end,
508 		const nodemask_t *nodes, unsigned long flags,
509 		void *private)
510 {
511 	pmd_t *pmd;
512 	unsigned long next;
513 
514 	pmd = pmd_offset(pud, addr);
515 	do {
516 		next = pmd_addr_end(addr, end);
517 		split_huge_page_pmd(vma->vm_mm, pmd);
518 		if (pmd_none_or_clear_bad(pmd))
519 			continue;
520 		if (check_pte_range(vma, pmd, addr, next, nodes,
521 				    flags, private))
522 			return -EIO;
523 	} while (pmd++, addr = next, addr != end);
524 	return 0;
525 }
526 
527 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
528 		unsigned long addr, unsigned long end,
529 		const nodemask_t *nodes, unsigned long flags,
530 		void *private)
531 {
532 	pud_t *pud;
533 	unsigned long next;
534 
535 	pud = pud_offset(pgd, addr);
536 	do {
537 		next = pud_addr_end(addr, end);
538 		if (pud_none_or_clear_bad(pud))
539 			continue;
540 		if (check_pmd_range(vma, pud, addr, next, nodes,
541 				    flags, private))
542 			return -EIO;
543 	} while (pud++, addr = next, addr != end);
544 	return 0;
545 }
546 
547 static inline int check_pgd_range(struct vm_area_struct *vma,
548 		unsigned long addr, unsigned long end,
549 		const nodemask_t *nodes, unsigned long flags,
550 		void *private)
551 {
552 	pgd_t *pgd;
553 	unsigned long next;
554 
555 	pgd = pgd_offset(vma->vm_mm, addr);
556 	do {
557 		next = pgd_addr_end(addr, end);
558 		if (pgd_none_or_clear_bad(pgd))
559 			continue;
560 		if (check_pud_range(vma, pgd, addr, next, nodes,
561 				    flags, private))
562 			return -EIO;
563 	} while (pgd++, addr = next, addr != end);
564 	return 0;
565 }
566 
567 /*
568  * Check if all pages in a range are on a set of nodes.
569  * If pagelist != NULL then isolate pages from the LRU and
570  * put them on the pagelist.
571  */
572 static struct vm_area_struct *
573 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
574 		const nodemask_t *nodes, unsigned long flags, void *private)
575 {
576 	int err;
577 	struct vm_area_struct *first, *vma, *prev;
578 
579 
580 	first = find_vma(mm, start);
581 	if (!first)
582 		return ERR_PTR(-EFAULT);
583 	prev = NULL;
584 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
585 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
586 			if (!vma->vm_next && vma->vm_end < end)
587 				return ERR_PTR(-EFAULT);
588 			if (prev && prev->vm_end < vma->vm_start)
589 				return ERR_PTR(-EFAULT);
590 		}
591 		if (!is_vm_hugetlb_page(vma) &&
592 		    ((flags & MPOL_MF_STRICT) ||
593 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
594 				vma_migratable(vma)))) {
595 			unsigned long endvma = vma->vm_end;
596 
597 			if (endvma > end)
598 				endvma = end;
599 			if (vma->vm_start > start)
600 				start = vma->vm_start;
601 			err = check_pgd_range(vma, start, endvma, nodes,
602 						flags, private);
603 			if (err) {
604 				first = ERR_PTR(err);
605 				break;
606 			}
607 		}
608 		prev = vma;
609 	}
610 	return first;
611 }
612 
613 /* Apply policy to a single VMA */
614 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
615 {
616 	int err = 0;
617 	struct mempolicy *old = vma->vm_policy;
618 
619 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
620 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
621 		 vma->vm_ops, vma->vm_file,
622 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
623 
624 	if (vma->vm_ops && vma->vm_ops->set_policy)
625 		err = vma->vm_ops->set_policy(vma, new);
626 	if (!err) {
627 		mpol_get(new);
628 		vma->vm_policy = new;
629 		mpol_put(old);
630 	}
631 	return err;
632 }
633 
634 /* Step 2: apply policy to a range and do splits. */
635 static int mbind_range(struct mm_struct *mm, unsigned long start,
636 		       unsigned long end, struct mempolicy *new_pol)
637 {
638 	struct vm_area_struct *next;
639 	struct vm_area_struct *prev;
640 	struct vm_area_struct *vma;
641 	int err = 0;
642 	pgoff_t pgoff;
643 	unsigned long vmstart;
644 	unsigned long vmend;
645 
646 	vma = find_vma_prev(mm, start, &prev);
647 	if (!vma || vma->vm_start > start)
648 		return -EFAULT;
649 
650 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
651 		next = vma->vm_next;
652 		vmstart = max(start, vma->vm_start);
653 		vmend   = min(end, vma->vm_end);
654 
655 		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
656 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
657 				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
658 		if (prev) {
659 			vma = prev;
660 			next = vma->vm_next;
661 			continue;
662 		}
663 		if (vma->vm_start != vmstart) {
664 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
665 			if (err)
666 				goto out;
667 		}
668 		if (vma->vm_end != vmend) {
669 			err = split_vma(vma->vm_mm, vma, vmend, 0);
670 			if (err)
671 				goto out;
672 		}
673 		err = policy_vma(vma, new_pol);
674 		if (err)
675 			goto out;
676 	}
677 
678  out:
679 	return err;
680 }
681 
682 /*
683  * Update task->flags PF_MEMPOLICY bit: set iff non-default
684  * mempolicy.  Allows more rapid checking of this (combined perhaps
685  * with other PF_* flag bits) on memory allocation hot code paths.
686  *
687  * If called from outside this file, the task 'p' should -only- be
688  * a newly forked child not yet visible on the task list, because
689  * manipulating the task flags of a visible task is not safe.
690  *
691  * The above limitation is why this routine has the funny name
692  * mpol_fix_fork_child_flag().
693  *
694  * It is also safe to call this with a task pointer of current,
695  * which the static wrapper mpol_set_task_struct_flag() does,
696  * for use within this file.
697  */
698 
699 void mpol_fix_fork_child_flag(struct task_struct *p)
700 {
701 	if (p->mempolicy)
702 		p->flags |= PF_MEMPOLICY;
703 	else
704 		p->flags &= ~PF_MEMPOLICY;
705 }
706 
707 static void mpol_set_task_struct_flag(void)
708 {
709 	mpol_fix_fork_child_flag(current);
710 }
711 
712 /* Set the process memory policy */
713 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
714 			     nodemask_t *nodes)
715 {
716 	struct mempolicy *new, *old;
717 	struct mm_struct *mm = current->mm;
718 	NODEMASK_SCRATCH(scratch);
719 	int ret;
720 
721 	if (!scratch)
722 		return -ENOMEM;
723 
724 	new = mpol_new(mode, flags, nodes);
725 	if (IS_ERR(new)) {
726 		ret = PTR_ERR(new);
727 		goto out;
728 	}
729 	/*
730 	 * prevent changing our mempolicy while show_numa_maps()
731 	 * is using it.
732 	 * Note:  do_set_mempolicy() can be called at init time
733 	 * with no 'mm'.
734 	 */
735 	if (mm)
736 		down_write(&mm->mmap_sem);
737 	task_lock(current);
738 	ret = mpol_set_nodemask(new, nodes, scratch);
739 	if (ret) {
740 		task_unlock(current);
741 		if (mm)
742 			up_write(&mm->mmap_sem);
743 		mpol_put(new);
744 		goto out;
745 	}
746 	old = current->mempolicy;
747 	current->mempolicy = new;
748 	mpol_set_task_struct_flag();
749 	if (new && new->mode == MPOL_INTERLEAVE &&
750 	    nodes_weight(new->v.nodes))
751 		current->il_next = first_node(new->v.nodes);
752 	task_unlock(current);
753 	if (mm)
754 		up_write(&mm->mmap_sem);
755 
756 	mpol_put(old);
757 	ret = 0;
758 out:
759 	NODEMASK_SCRATCH_FREE(scratch);
760 	return ret;
761 }
762 
763 /*
764  * Return nodemask for policy for get_mempolicy() query
765  *
766  * Called with task's alloc_lock held
767  */
768 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
769 {
770 	nodes_clear(*nodes);
771 	if (p == &default_policy)
772 		return;
773 
774 	switch (p->mode) {
775 	case MPOL_BIND:
776 		/* Fall through */
777 	case MPOL_INTERLEAVE:
778 		*nodes = p->v.nodes;
779 		break;
780 	case MPOL_PREFERRED:
781 		if (!(p->flags & MPOL_F_LOCAL))
782 			node_set(p->v.preferred_node, *nodes);
783 		/* else return empty node mask for local allocation */
784 		break;
785 	default:
786 		BUG();
787 	}
788 }
789 
790 static int lookup_node(struct mm_struct *mm, unsigned long addr)
791 {
792 	struct page *p;
793 	int err;
794 
795 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
796 	if (err >= 0) {
797 		err = page_to_nid(p);
798 		put_page(p);
799 	}
800 	return err;
801 }
802 
803 /* Retrieve NUMA policy */
804 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
805 			     unsigned long addr, unsigned long flags)
806 {
807 	int err;
808 	struct mm_struct *mm = current->mm;
809 	struct vm_area_struct *vma = NULL;
810 	struct mempolicy *pol = current->mempolicy;
811 
812 	if (flags &
813 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
814 		return -EINVAL;
815 
816 	if (flags & MPOL_F_MEMS_ALLOWED) {
817 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
818 			return -EINVAL;
819 		*policy = 0;	/* just so it's initialized */
820 		task_lock(current);
821 		*nmask  = cpuset_current_mems_allowed;
822 		task_unlock(current);
823 		return 0;
824 	}
825 
826 	if (flags & MPOL_F_ADDR) {
827 		/*
828 		 * Do NOT fall back to task policy if the
829 		 * vma/shared policy at addr is NULL.  We
830 		 * want to return MPOL_DEFAULT in this case.
831 		 */
832 		down_read(&mm->mmap_sem);
833 		vma = find_vma_intersection(mm, addr, addr+1);
834 		if (!vma) {
835 			up_read(&mm->mmap_sem);
836 			return -EFAULT;
837 		}
838 		if (vma->vm_ops && vma->vm_ops->get_policy)
839 			pol = vma->vm_ops->get_policy(vma, addr);
840 		else
841 			pol = vma->vm_policy;
842 	} else if (addr)
843 		return -EINVAL;
844 
845 	if (!pol)
846 		pol = &default_policy;	/* indicates default behavior */
847 
848 	if (flags & MPOL_F_NODE) {
849 		if (flags & MPOL_F_ADDR) {
850 			err = lookup_node(mm, addr);
851 			if (err < 0)
852 				goto out;
853 			*policy = err;
854 		} else if (pol == current->mempolicy &&
855 				pol->mode == MPOL_INTERLEAVE) {
856 			*policy = current->il_next;
857 		} else {
858 			err = -EINVAL;
859 			goto out;
860 		}
861 	} else {
862 		*policy = pol == &default_policy ? MPOL_DEFAULT :
863 						pol->mode;
864 		/*
865 		 * Internal mempolicy flags must be masked off before exposing
866 		 * the policy to userspace.
867 		 */
868 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
869 	}
870 
871 	if (vma) {
872 		up_read(&current->mm->mmap_sem);
873 		vma = NULL;
874 	}
875 
876 	err = 0;
877 	if (nmask) {
878 		if (mpol_store_user_nodemask(pol)) {
879 			*nmask = pol->w.user_nodemask;
880 		} else {
881 			task_lock(current);
882 			get_policy_nodemask(pol, nmask);
883 			task_unlock(current);
884 		}
885 	}
886 
887  out:
888 	mpol_cond_put(pol);
889 	if (vma)
890 		up_read(&current->mm->mmap_sem);
891 	return err;
892 }
893 
894 #ifdef CONFIG_MIGRATION
895 /*
896  * page migration
897  */
898 static void migrate_page_add(struct page *page, struct list_head *pagelist,
899 				unsigned long flags)
900 {
901 	/*
902 	 * Avoid migrating a page that is shared with others.
903 	 */
904 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
905 		if (!isolate_lru_page(page)) {
906 			list_add_tail(&page->lru, pagelist);
907 			inc_zone_page_state(page, NR_ISOLATED_ANON +
908 					    page_is_file_cache(page));
909 		}
910 	}
911 }
912 
913 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
914 {
915 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
916 }
917 
918 /*
919  * Migrate pages from one node to a target node.
920  * Returns error or the number of pages not migrated.
921  */
922 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
923 			   int flags)
924 {
925 	nodemask_t nmask;
926 	LIST_HEAD(pagelist);
927 	int err = 0;
928 	struct vm_area_struct *vma;
929 
930 	nodes_clear(nmask);
931 	node_set(source, nmask);
932 
933 	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
934 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
935 	if (IS_ERR(vma))
936 		return PTR_ERR(vma);
937 
938 	if (!list_empty(&pagelist)) {
939 		err = migrate_pages(&pagelist, new_node_page, dest,
940 								false, true);
941 		if (err)
942 			putback_lru_pages(&pagelist);
943 	}
944 
945 	return err;
946 }
947 
948 /*
949  * Move pages between the two nodesets so as to preserve the physical
950  * layout as much as possible.
951  *
952  * Returns the number of pages that could not be moved.
953  */
954 int do_migrate_pages(struct mm_struct *mm,
955 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
956 {
957 	int busy = 0;
958 	int err;
959 	nodemask_t tmp;
960 
961 	err = migrate_prep();
962 	if (err)
963 		return err;
964 
965 	down_read(&mm->mmap_sem);
966 
967 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
968 	if (err)
969 		goto out;
970 
971 	/*
972 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
973 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
974 	 * bit in 'tmp', and return that <source, dest> pair for migration.
975 	 * The pair of nodemasks 'to' and 'from' define the map.
976 	 *
977 	 * If no pair of bits is found that way, fall back to picking some
978 	 * pair of 'source' and 'dest' bits that are not the same.  If the
979 	 * 'source' and 'dest' bits are the same, this represents a node
980 	 * that will be migrating to itself, so no pages need move.
981 	 *
982 	 * If no bits are left in 'tmp', or if all remaining bits left
983 	 * in 'tmp' correspond to the same bit in 'to', return false
984 	 * (nothing left to migrate).
985 	 *
986 	 * This lets us pick a pair of nodes to migrate between, such that
987 	 * if possible the dest node is not already occupied by some other
988 	 * source node, minimizing the risk of overloading the memory on a
989 	 * node that would happen if we migrated incoming memory to a node
990 	 * before migrating outgoing memory off that same node.
991 	 *
992 	 * A single scan of tmp is sufficient.  As we go, we remember the
993 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
994 	 * that not only moved, but what's better, moved to an empty slot
995 	 * (d is not set in tmp), then we break out then, with that pair.
996 	 * Otherwise when we finish scanning tmp, we at least have the
997 	 * most recent <s, d> pair that moved.  If we get all the way through
998 	 * the scan of tmp without finding any node that moved, much less
999 	 * moved to an empty node, then there is nothing left worth migrating.
1000 	 */
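	/*
	 * Worked illustration: from_nodes = {0,1}, to_nodes = {1,2}, so
	 * node_remap() gives 0 -> 1 and 1 -> 2.  First pass over
	 * tmp = {0,1}: s=0 would move to d=1, but node 1 is still a
	 * pending source, so keep scanning; s=1 moves to d=2, which is
	 * not in tmp, so migrate 1 -> 2 first.  Second pass over
	 * tmp = {0}: migrate 0 -> 1.  Node 1 is drained before anything
	 * is moved onto it, which is the overload avoidance described
	 * above.
	 */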
1001 
1002 	tmp = *from_nodes;
1003 	while (!nodes_empty(tmp)) {
1004 		int s,d;
1005 		int source = -1;
1006 		int dest = 0;
1007 
1008 		for_each_node_mask(s, tmp) {
1009 			d = node_remap(s, *from_nodes, *to_nodes);
1010 			if (s == d)
1011 				continue;
1012 
1013 			source = s;	/* Node moved. Memorize */
1014 			dest = d;
1015 
1016 			/* dest not in remaining from nodes? */
1017 			if (!node_isset(dest, tmp))
1018 				break;
1019 		}
1020 		if (source == -1)
1021 			break;
1022 
1023 		node_clear(source, tmp);
1024 		err = migrate_to_node(mm, source, dest, flags);
1025 		if (err > 0)
1026 			busy += err;
1027 		if (err < 0)
1028 			break;
1029 	}
1030 out:
1031 	up_read(&mm->mmap_sem);
1032 	if (err < 0)
1033 		return err;
1034 	return busy;
1035 
1036 }
1037 
1038 /*
1039  * Allocate a new page for page migration based on vma policy.
1040  * Start assuming that page is mapped by vma pointed to by @private.
1041  * Search forward from there, if not.  N.B., this assumes that the
1042  * list of pages handed to migrate_pages()--which is how we get here--
1043  * is in virtual address order.
1044  */
1045 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1046 {
1047 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1048 	unsigned long uninitialized_var(address);
1049 
1050 	while (vma) {
1051 		address = page_address_in_vma(page, vma);
1052 		if (address != -EFAULT)
1053 			break;
1054 		vma = vma->vm_next;
1055 	}
1056 
1057 	/*
1058 	 * if !vma, alloc_page_vma() will use task or system default policy
1059 	 */
1060 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1061 }
1062 #else
1063 
1064 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1065 				unsigned long flags)
1066 {
1067 }
1068 
1069 int do_migrate_pages(struct mm_struct *mm,
1070 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1071 {
1072 	return -ENOSYS;
1073 }
1074 
1075 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1076 {
1077 	return NULL;
1078 }
1079 #endif
1080 
1081 static long do_mbind(unsigned long start, unsigned long len,
1082 		     unsigned short mode, unsigned short mode_flags,
1083 		     nodemask_t *nmask, unsigned long flags)
1084 {
1085 	struct vm_area_struct *vma;
1086 	struct mm_struct *mm = current->mm;
1087 	struct mempolicy *new;
1088 	unsigned long end;
1089 	int err;
1090 	LIST_HEAD(pagelist);
1091 
1092 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1093 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1094 		return -EINVAL;
1095 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1096 		return -EPERM;
1097 
1098 	if (start & ~PAGE_MASK)
1099 		return -EINVAL;
1100 
1101 	if (mode == MPOL_DEFAULT)
1102 		flags &= ~MPOL_MF_STRICT;
1103 
1104 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1105 	end = start + len;
1106 
1107 	if (end < start)
1108 		return -EINVAL;
1109 	if (end == start)
1110 		return 0;
1111 
1112 	new = mpol_new(mode, mode_flags, nmask);
1113 	if (IS_ERR(new))
1114 		return PTR_ERR(new);
1115 
1116 	/*
1117 	 * If we are using the default policy then operation
1118 	 * on discontinuous address spaces is okay after all
1119 	 */
1120 	if (!new)
1121 		flags |= MPOL_MF_DISCONTIG_OK;
1122 
1123 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1124 		 start, start + len, mode, mode_flags,
1125 		 nmask ? nodes_addr(*nmask)[0] : -1);
1126 
1127 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1128 
1129 		err = migrate_prep();
1130 		if (err)
1131 			goto mpol_out;
1132 	}
1133 	{
1134 		NODEMASK_SCRATCH(scratch);
1135 		if (scratch) {
1136 			down_write(&mm->mmap_sem);
1137 			task_lock(current);
1138 			err = mpol_set_nodemask(new, nmask, scratch);
1139 			task_unlock(current);
1140 			if (err)
1141 				up_write(&mm->mmap_sem);
1142 		} else
1143 			err = -ENOMEM;
1144 		NODEMASK_SCRATCH_FREE(scratch);
1145 	}
1146 	if (err)
1147 		goto mpol_out;
1148 
1149 	vma = check_range(mm, start, end, nmask,
1150 			  flags | MPOL_MF_INVERT, &pagelist);
1151 
1152 	err = PTR_ERR(vma);
1153 	if (!IS_ERR(vma)) {
1154 		int nr_failed = 0;
1155 
1156 		err = mbind_range(mm, start, end, new);
1157 
1158 		if (!list_empty(&pagelist)) {
1159 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1160 						(unsigned long)vma,
1161 						false, true);
1162 			if (nr_failed)
1163 				putback_lru_pages(&pagelist);
1164 		}
1165 
1166 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1167 			err = -EIO;
1168 	} else
1169 		putback_lru_pages(&pagelist);
1170 
1171 	up_write(&mm->mmap_sem);
1172  mpol_out:
1173 	mpol_put(new);
1174 	return err;
1175 }
1176 
1177 /*
1178  * User space interface with variable-sized bitmaps for nodelists.
1179  */
1180 
1181 /* Copy a node mask from user space. */
1182 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1183 		     unsigned long maxnode)
1184 {
1185 	unsigned long k;
1186 	unsigned long nlongs;
1187 	unsigned long endmask;
1188 
1189 	--maxnode;
1190 	nodes_clear(*nodes);
1191 	if (maxnode == 0 || !nmask)
1192 		return 0;
1193 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1194 		return -EINVAL;
1195 
1196 	nlongs = BITS_TO_LONGS(maxnode);
1197 	if ((maxnode % BITS_PER_LONG) == 0)
1198 		endmask = ~0UL;
1199 	else
1200 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1201 
1202 	/* When the user specified more nodes than supported, just check
1203 	   that the unsupported part is all zero. */
1204 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1205 		if (nlongs > PAGE_SIZE/sizeof(long))
1206 			return -EINVAL;
1207 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1208 			unsigned long t;
1209 			if (get_user(t, nmask + k))
1210 				return -EFAULT;
1211 			if (k == nlongs - 1) {
1212 				if (t & endmask)
1213 					return -EINVAL;
1214 			} else if (t)
1215 				return -EINVAL;
1216 		}
1217 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1218 		endmask = ~0UL;
1219 	}
1220 
1221 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1222 		return -EFAULT;
1223 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1224 	return 0;
1225 }
1226 
1227 /* Copy a kernel node mask to user space */
1228 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1229 			      nodemask_t *nodes)
1230 {
1231 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1232 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1233 
1234 	if (copy > nbytes) {
1235 		if (copy > PAGE_SIZE)
1236 			return -EINVAL;
1237 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1238 			return -EFAULT;
1239 		copy = nbytes;
1240 	}
1241 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1242 }
1243 
1244 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1245 		unsigned long, mode, unsigned long __user *, nmask,
1246 		unsigned long, maxnode, unsigned, flags)
1247 {
1248 	nodemask_t nodes;
1249 	int err;
1250 	unsigned short mode_flags;
1251 
1252 	mode_flags = mode & MPOL_MODE_FLAGS;
1253 	mode &= ~MPOL_MODE_FLAGS;
1254 	if (mode >= MPOL_MAX)
1255 		return -EINVAL;
1256 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1257 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1258 		return -EINVAL;
1259 	err = get_nodes(&nodes, nmask, maxnode);
1260 	if (err)
1261 		return err;
1262 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1263 }
1264 
1265 /* Set the process memory policy */
1266 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1267 		unsigned long, maxnode)
1268 {
1269 	int err;
1270 	nodemask_t nodes;
1271 	unsigned short flags;
1272 
1273 	flags = mode & MPOL_MODE_FLAGS;
1274 	mode &= ~MPOL_MODE_FLAGS;
1275 	if ((unsigned int)mode >= MPOL_MAX)
1276 		return -EINVAL;
1277 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1278 		return -EINVAL;
1279 	err = get_nodes(&nodes, nmask, maxnode);
1280 	if (err)
1281 		return err;
1282 	return do_set_mempolicy(mode, flags, &nodes);
1283 }
1284 
1285 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1286 		const unsigned long __user *, old_nodes,
1287 		const unsigned long __user *, new_nodes)
1288 {
1289 	const struct cred *cred = current_cred(), *tcred;
1290 	struct mm_struct *mm = NULL;
1291 	struct task_struct *task;
1292 	nodemask_t task_nodes;
1293 	int err;
1294 	nodemask_t *old;
1295 	nodemask_t *new;
1296 	NODEMASK_SCRATCH(scratch);
1297 
1298 	if (!scratch)
1299 		return -ENOMEM;
1300 
1301 	old = &scratch->mask1;
1302 	new = &scratch->mask2;
1303 
1304 	err = get_nodes(old, old_nodes, maxnode);
1305 	if (err)
1306 		goto out;
1307 
1308 	err = get_nodes(new, new_nodes, maxnode);
1309 	if (err)
1310 		goto out;
1311 
1312 	/* Find the mm_struct */
1313 	rcu_read_lock();
1314 	task = pid ? find_task_by_vpid(pid) : current;
1315 	if (!task) {
1316 		rcu_read_unlock();
1317 		err = -ESRCH;
1318 		goto out;
1319 	}
1320 	mm = get_task_mm(task);
1321 	rcu_read_unlock();
1322 
1323 	err = -EINVAL;
1324 	if (!mm)
1325 		goto out;
1326 
1327 	/*
1328 	 * Check if this process has the right to modify the specified
1329 	 * process. The right exists if the process has administrative
1330 	 * capabilities, superuser privileges or the same
1331 	 * userid as the target process.
1332 	 */
1333 	rcu_read_lock();
1334 	tcred = __task_cred(task);
1335 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1336 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1337 	    !capable(CAP_SYS_NICE)) {
1338 		rcu_read_unlock();
1339 		err = -EPERM;
1340 		goto out;
1341 	}
1342 	rcu_read_unlock();
1343 
1344 	task_nodes = cpuset_mems_allowed(task);
1345 	/* Is the user allowed to access the target nodes? */
1346 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1347 		err = -EPERM;
1348 		goto out;
1349 	}
1350 
1351 	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1352 		err = -EINVAL;
1353 		goto out;
1354 	}
1355 
1356 	err = security_task_movememory(task);
1357 	if (err)
1358 		goto out;
1359 
1360 	err = do_migrate_pages(mm, old, new,
1361 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1362 out:
1363 	if (mm)
1364 		mmput(mm);
1365 	NODEMASK_SCRATCH_FREE(scratch);
1366 
1367 	return err;
1368 }
1369 
1370 
1371 /* Retrieve NUMA policy */
1372 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1373 		unsigned long __user *, nmask, unsigned long, maxnode,
1374 		unsigned long, addr, unsigned long, flags)
1375 {
1376 	int err;
1377 	int uninitialized_var(pval);
1378 	nodemask_t nodes;
1379 
1380 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1381 		return -EINVAL;
1382 
1383 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1384 
1385 	if (err)
1386 		return err;
1387 
1388 	if (policy && put_user(pval, policy))
1389 		return -EFAULT;
1390 
1391 	if (nmask)
1392 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1393 
1394 	return err;
1395 }
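/*
 * Illustrative sketch: querying which node currently backs an address from
 * user space via the get_mempolicy() wrapper in numaif.h.  The page must be
 * touched first so that it is actually allocated; error handling omitted.
 *
 *	#include <numaif.h>
 *
 *	int node = -1;
 *	*(volatile char *)addr = 0;	(fault the page in)
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *	(node now holds the NUMA node id of the page backing addr)
 */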
1396 
1397 #ifdef CONFIG_COMPAT
1398 
1399 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1400 				     compat_ulong_t __user *nmask,
1401 				     compat_ulong_t maxnode,
1402 				     compat_ulong_t addr, compat_ulong_t flags)
1403 {
1404 	long err;
1405 	unsigned long __user *nm = NULL;
1406 	unsigned long nr_bits, alloc_size;
1407 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1408 
1409 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1410 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1411 
1412 	if (nmask)
1413 		nm = compat_alloc_user_space(alloc_size);
1414 
1415 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1416 
1417 	if (!err && nmask) {
1418 		err = copy_from_user(bm, nm, alloc_size);
1419 		/* ensure entire bitmap is zeroed */
1420 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1421 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1422 	}
1423 
1424 	return err;
1425 }
1426 
1427 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1428 				     compat_ulong_t maxnode)
1429 {
1430 	long err = 0;
1431 	unsigned long __user *nm = NULL;
1432 	unsigned long nr_bits, alloc_size;
1433 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1434 
1435 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1436 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1437 
1438 	if (nmask) {
1439 		err = compat_get_bitmap(bm, nmask, nr_bits);
1440 		nm = compat_alloc_user_space(alloc_size);
1441 		err |= copy_to_user(nm, bm, alloc_size);
1442 	}
1443 
1444 	if (err)
1445 		return -EFAULT;
1446 
1447 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1448 }
1449 
1450 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1451 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1452 			     compat_ulong_t maxnode, compat_ulong_t flags)
1453 {
1454 	long err = 0;
1455 	unsigned long __user *nm = NULL;
1456 	unsigned long nr_bits, alloc_size;
1457 	nodemask_t bm;
1458 
1459 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1460 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1461 
1462 	if (nmask) {
1463 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1464 		nm = compat_alloc_user_space(alloc_size);
1465 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1466 	}
1467 
1468 	if (err)
1469 		return -EFAULT;
1470 
1471 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1472 }
1473 
1474 #endif
1475 
1476 /*
1477  * get_vma_policy(@task, @vma, @addr)
1478  * @task - task for fallback if vma policy == default
1479  * @vma   - virtual memory area whose policy is sought
1480  * @addr  - address in @vma for shared policy lookup
1481  *
1482  * Returns effective policy for a VMA at specified address.
1483  * Falls back to @task or system default policy, as necessary.
1484  * Current or other task's task mempolicy and non-shared vma policies
1485  * are protected by the task's mmap_sem, which must be held for read by
1486  * the caller.
1487  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1488  * count--added by the get_policy() vm_op, as appropriate--to protect against
1489  * freeing by another task.  It is the caller's responsibility to free the
1490  * extra reference for shared policies.
1491  */
1492 static struct mempolicy *get_vma_policy(struct task_struct *task,
1493 		struct vm_area_struct *vma, unsigned long addr)
1494 {
1495 	struct mempolicy *pol = task->mempolicy;
1496 
1497 	if (vma) {
1498 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1499 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1500 									addr);
1501 			if (vpol)
1502 				pol = vpol;
1503 		} else if (vma->vm_policy)
1504 			pol = vma->vm_policy;
1505 	}
1506 	if (!pol)
1507 		pol = &default_policy;
1508 	return pol;
1509 }
1510 
1511 /*
1512  * Return a nodemask representing a mempolicy for filtering nodes for
1513  * page allocation
1514  */
1515 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1516 {
1517 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1518 	if (unlikely(policy->mode == MPOL_BIND) &&
1519 			gfp_zone(gfp) >= policy_zone &&
1520 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1521 		return &policy->v.nodes;
1522 
1523 	return NULL;
1524 }
1525 
1526 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1527 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528 	int nd)
1529 {
1530 	switch (policy->mode) {
1531 	case MPOL_PREFERRED:
1532 		if (!(policy->flags & MPOL_F_LOCAL))
1533 			nd = policy->v.preferred_node;
1534 		break;
1535 	case MPOL_BIND:
1536 		/*
1537 		 * Normally, MPOL_BIND allocations are node-local within the
1538 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1539 		 * current node isn't part of the mask, we use the zonelist for
1540 		 * the first node in the mask instead.
1541 		 */
1542 		if (unlikely(gfp & __GFP_THISNODE) &&
1543 				unlikely(!node_isset(nd, policy->v.nodes)))
1544 			nd = first_node(policy->v.nodes);
1545 		break;
1546 	default:
1547 		BUG();
1548 	}
1549 	return node_zonelist(nd, gfp);
1550 }
1551 
1552 /* Do dynamic interleaving for a process */
1553 static unsigned interleave_nodes(struct mempolicy *policy)
1554 {
1555 	unsigned nid, next;
1556 	struct task_struct *me = current;
1557 
1558 	nid = me->il_next;
1559 	next = next_node(nid, policy->v.nodes);
1560 	if (next >= MAX_NUMNODES)
1561 		next = first_node(policy->v.nodes);
1562 	if (next < MAX_NUMNODES)
1563 		me->il_next = next;
1564 	return nid;
1565 }
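/*
 * Example: with policy->v.nodes = {0,2} and il_next = 2, this returns 2
 * and wraps il_next back to 0, so successive allocations alternate
 * 0, 2, 0, 2, ...
 */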
1566 
1567 /*
1568  * Depending on the memory policy provide a node from which to allocate the
1569  * next slab entry.
1570  * @policy must be protected from freeing by the caller.  If @policy is
1571  * the current task's mempolicy, this protection is implicit, as only the
1572  * task can change its policy.  The system default policy requires no
1573  * such protection.
1574  */
1575 unsigned slab_node(struct mempolicy *policy)
1576 {
1577 	if (!policy || policy->flags & MPOL_F_LOCAL)
1578 		return numa_node_id();
1579 
1580 	switch (policy->mode) {
1581 	case MPOL_PREFERRED:
1582 		/*
1583 		 * handled MPOL_F_LOCAL above
1584 		 */
1585 		return policy->v.preferred_node;
1586 
1587 	case MPOL_INTERLEAVE:
1588 		return interleave_nodes(policy);
1589 
1590 	case MPOL_BIND: {
1591 		/*
1592 		 * Follow bind policy behavior and start allocation at the
1593 		 * first node.
1594 		 */
1595 		struct zonelist *zonelist;
1596 		struct zone *zone;
1597 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1598 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1599 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1600 							&policy->v.nodes,
1601 							&zone);
1602 		return zone ? zone->node : numa_node_id();
1603 	}
1604 
1605 	default:
1606 		BUG();
1607 	}
1608 }
1609 
1610 /* Do static interleaving for a VMA with known offset. */
1611 static unsigned offset_il_node(struct mempolicy *pol,
1612 		struct vm_area_struct *vma, unsigned long off)
1613 {
1614 	unsigned nnodes = nodes_weight(pol->v.nodes);
1615 	unsigned target;
1616 	int c;
1617 	int nid = -1;
1618 
1619 	if (!nnodes)
1620 		return numa_node_id();
1621 	target = (unsigned int)off % nnodes;
1622 	c = 0;
1623 	do {
1624 		nid = next_node(nid, pol->v.nodes);
1625 		c++;
1626 	} while (c <= target);
1627 	return nid;
1628 }
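/*
 * Example: with pol->v.nodes = {0,2,5} (nnodes = 3) and off = 7,
 * target = 7 % 3 = 1, so the walk returns the second node in the mask,
 * node 2.  The same offset always maps to the same node, which is what
 * makes this interleaving "static".
 */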
1629 
1630 /* Determine a node number for interleave */
1631 static inline unsigned interleave_nid(struct mempolicy *pol,
1632 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1633 {
1634 	if (vma) {
1635 		unsigned long off;
1636 
1637 		/*
1638 		 * for small pages, there is no difference between
1639 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1640 		 * for huge pages, since vm_pgoff is in units of small
1641 		 * pages, we need to shift off the always 0 bits to get
1642 		 * a useful offset.
1643 		 */
1644 		BUG_ON(shift < PAGE_SHIFT);
1645 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1646 		off += (addr - vma->vm_start) >> shift;
1647 		return offset_il_node(pol, vma, off);
1648 	} else
1649 		return interleave_nodes(pol);
1650 }
1651 
1652 #ifdef CONFIG_HUGETLBFS
1653 /*
1654  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1655  * @vma = virtual memory area whose policy is sought
1656  * @addr = address in @vma for shared policy lookup and interleave policy
1657  * @gfp_flags = for requested zone
1658  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1659  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1660  *
1661  * Returns a zonelist suitable for a huge page allocation and a pointer
1662  * to the struct mempolicy for conditional unref after allocation.
1663  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1664  * @nodemask for filtering the zonelist.
1665  *
1666  * Must be protected by get_mems_allowed()
1667  */
1668 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1669 				gfp_t gfp_flags, struct mempolicy **mpol,
1670 				nodemask_t **nodemask)
1671 {
1672 	struct zonelist *zl;
1673 
1674 	*mpol = get_vma_policy(current, vma, addr);
1675 	*nodemask = NULL;	/* assume !MPOL_BIND */
1676 
1677 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1678 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1679 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1680 	} else {
1681 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1682 		if ((*mpol)->mode == MPOL_BIND)
1683 			*nodemask = &(*mpol)->v.nodes;
1684 	}
1685 	return zl;
1686 }
1687 
1688 /*
1689  * init_nodemask_of_mempolicy
1690  *
1691  * If the current task's mempolicy is "default" [NULL], return 'false'
1692  * to indicate default policy.  Otherwise, extract the policy nodemask
1693  * for 'bind' or 'interleave' policy into the argument nodemask, or
1694  * initialize the argument nodemask to contain the single node for
1695  * 'preferred' or 'local' policy and return 'true' to indicate presence
1696  * of non-default mempolicy.
1697  *
1698  * We don't bother with reference counting the mempolicy [mpol_get/put]
1699  * because the current task is examining its own mempolicy and a task's
1700  * mempolicy is only ever changed by the task itself.
1701  *
1702  * N.B., it is the caller's responsibility to free a returned nodemask.
1703  */
1704 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1705 {
1706 	struct mempolicy *mempolicy;
1707 	int nid;
1708 
1709 	if (!(mask && current->mempolicy))
1710 		return false;
1711 
1712 	task_lock(current);
1713 	mempolicy = current->mempolicy;
1714 	switch (mempolicy->mode) {
1715 	case MPOL_PREFERRED:
1716 		if (mempolicy->flags & MPOL_F_LOCAL)
1717 			nid = numa_node_id();
1718 		else
1719 			nid = mempolicy->v.preferred_node;
1720 		init_nodemask_of_node(mask, nid);
1721 		break;
1722 
1723 	case MPOL_BIND:
1724 		/* Fall through */
1725 	case MPOL_INTERLEAVE:
1726 		*mask =  mempolicy->v.nodes;
1727 		break;
1728 
1729 	default:
1730 		BUG();
1731 	}
1732 	task_unlock(current);
1733 
1734 	return true;
1735 }
1736 #endif
1737 
1738 /*
1739  * mempolicy_nodemask_intersects
1740  *
1741  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1742  * policy.  Otherwise, check for intersection between mask and the policy
1743  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1744  * policy, always return true since it may allocate elsewhere on fallback.
1745  *
1746  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1747  */
1748 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1749 					const nodemask_t *mask)
1750 {
1751 	struct mempolicy *mempolicy;
1752 	bool ret = true;
1753 
1754 	if (!mask)
1755 		return ret;
1756 	task_lock(tsk);
1757 	mempolicy = tsk->mempolicy;
1758 	if (!mempolicy)
1759 		goto out;
1760 
1761 	switch (mempolicy->mode) {
1762 	case MPOL_PREFERRED:
1763 		/*
1764 	 * MPOL_PREFERRED and MPOL_F_LOCAL only indicate preferred nodes to
1765 	 * allocate from; allocation may fall back to other nodes when OOM.
1766 		 * Thus, it's possible for tsk to have allocated memory from
1767 		 * nodes in mask.
1768 		 */
1769 		break;
1770 	case MPOL_BIND:
1771 	case MPOL_INTERLEAVE:
1772 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1773 		break;
1774 	default:
1775 		BUG();
1776 	}
1777 out:
1778 	task_unlock(tsk);
1779 	return ret;
1780 }
1781 
1782 /* Allocate a page in interleaved policy.
1783    Uses its own path because it needs to do special accounting. */
1784 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1785 					unsigned nid)
1786 {
1787 	struct zonelist *zl;
1788 	struct page *page;
1789 
1790 	zl = node_zonelist(nid, gfp);
1791 	page = __alloc_pages(gfp, order, zl);
1792 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1793 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1794 	return page;
1795 }
1796 
1797 /**
1798  * 	alloc_pages_vma	- Allocate a page for a VMA.
1799  *
1800  * 	@gfp:
1801  *      %GFP_USER    user allocation.
1802  *      %GFP_KERNEL  kernel allocations,
1803  *      %GFP_HIGHMEM highmem/user allocations,
1804  *      %GFP_FS      allocation should not call back into a file system.
1805  *      %GFP_ATOMIC  don't sleep.
1806  *
1807  *	@order: Order of the GFP allocation.
1808  * 	@vma:  Pointer to VMA or NULL if not available.
1809  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1810  *
1811  * 	This function allocates a page from the kernel page pool and applies
1812  *	a NUMA policy associated with the VMA or the current process.
1813  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1814  *	mm_struct of the VMA to prevent it from going away. Should be used for
1815  *	all allocations for pages that will be mapped into
1816  * 	user space. Returns NULL when no page can be allocated.
1817  *
1818  *	Should be called with the mmap_sem of the vma's mm held.
1819  */
1820 struct page *
1821 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1822 		unsigned long addr, int node)
1823 {
1824 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1825 	struct zonelist *zl;
1826 	struct page *page;
1827 
1828 	get_mems_allowed();
1829 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1830 		unsigned nid;
1831 
1832 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1833 		mpol_cond_put(pol);
1834 		page = alloc_page_interleave(gfp, order, nid);
1835 		put_mems_allowed();
1836 		return page;
1837 	}
1838 	zl = policy_zonelist(gfp, pol, node);
1839 	if (unlikely(mpol_needs_cond_ref(pol))) {
1840 		/*
1841 		 * slow path: ref counted shared policy
1842 		 */
		struct page *page = __alloc_pages_nodemask(gfp, order,
1844 						zl, policy_nodemask(gfp, pol));
1845 		__mpol_put(pol);
1846 		put_mems_allowed();
1847 		return page;
1848 	}
1849 	/*
1850 	 * fast path:  default or task policy
1851 	 */
1852 	page = __alloc_pages_nodemask(gfp, order, zl,
1853 				      policy_nodemask(gfp, pol));
1854 	put_mems_allowed();
1855 	return page;
1856 }
1857 
1858 /**
1859  * 	alloc_pages_current - Allocate pages.
1860  *
1861  *	@gfp:
1862  *		%GFP_USER   user allocation,
1863  *      	%GFP_KERNEL kernel allocation,
1864  *      	%GFP_HIGHMEM highmem allocation,
1865  *      	%GFP_FS     don't call back into a file system.
1866  *      	%GFP_ATOMIC don't sleep.
1867  *	@order: Power of two of allocation size in pages. 0 is a single page.
1868  *
1869  *	Allocate a page from the kernel page pool.  When not in
1870  *	interrupt context and apply the current process NUMA policy.
1871  *	Returns NULL when no page can be allocated.
1872  *
1873  *	Don't call cpuset_update_task_memory_state() unless
1874  *	1) it's ok to take cpuset_sem (can WAIT), and
1875  *	2) allocating for current task (not interrupt).
1876  */
1877 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1878 {
1879 	struct mempolicy *pol = current->mempolicy;
1880 	struct page *page;
1881 
1882 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1883 		pol = &default_policy;
1884 
1885 	get_mems_allowed();
1886 	/*
1887 	 * No reference counting needed for current->mempolicy
1888 	 * nor system default_policy
1889 	 */
1890 	if (pol->mode == MPOL_INTERLEAVE)
1891 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1892 	else
1893 		page = __alloc_pages_nodemask(gfp, order,
1894 				policy_zonelist(gfp, pol, numa_node_id()),
1895 				policy_nodemask(gfp, pol));
1896 	put_mems_allowed();
1897 	return page;
1898 }
1899 EXPORT_SYMBOL(alloc_pages_current);
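
/*
 * Illustrative note, not kernel code proper: on CONFIG_NUMA kernels the
 * generic alloc_pages()/alloc_page() helpers resolve to this function, so an
 * ordinary allocation already honours the calling task's policy:
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *
 *	if (page)
 *		__free_page(page);
 */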
1900 
1901 /*
1902  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after the task's cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we need not do the rebind work for the current task.
1910  */
1911 
1912 /* Slow path of a mempolicy duplicate */
1913 struct mempolicy *__mpol_dup(struct mempolicy *old)
1914 {
1915 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1916 
1917 	if (!new)
1918 		return ERR_PTR(-ENOMEM);
1919 
1920 	/* task's mempolicy is protected by alloc_lock */
1921 	if (old == current->mempolicy) {
1922 		task_lock(current);
1923 		*new = *old;
1924 		task_unlock(current);
1925 	} else
1926 		*new = *old;
1927 
1928 	rcu_read_lock();
1929 	if (current_cpuset_is_being_rebound()) {
1930 		nodemask_t mems = cpuset_mems_allowed(current);
1931 		if (new->flags & MPOL_F_REBINDING)
1932 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1933 		else
1934 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1935 	}
1936 	rcu_read_unlock();
1937 	atomic_set(&new->refcnt, 1);
1938 	return new;
1939 }
1940 
1941 /*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the MPOL_F_* flags that require a conditional ref and,
 * note well, drop the extra ref.  It is not safe to reference *frompol
 * directly after return; use the returned value instead.
1946  *
1947  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1948  * policy lookup, even if the policy needs/has extra ref on lookup.
1949  * shmem_readahead needs this.
1950  */
1951 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1952 						struct mempolicy *frompol)
1953 {
1954 	if (!mpol_needs_cond_ref(frompol))
1955 		return frompol;
1956 
1957 	*tompol = *frompol;
1958 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1959 	__mpol_put(frompol);
1960 	return tompol;
1961 }
1962 
1963 /* Slow path of a mempolicy comparison */
1964 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1965 {
1966 	if (!a || !b)
1967 		return 0;
1968 	if (a->mode != b->mode)
1969 		return 0;
1970 	if (a->flags != b->flags)
1971 		return 0;
1972 	if (mpol_store_user_nodemask(a))
1973 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1974 			return 0;
1975 
1976 	switch (a->mode) {
1977 	case MPOL_BIND:
1978 		/* Fall through */
1979 	case MPOL_INTERLEAVE:
1980 		return nodes_equal(a->v.nodes, b->v.nodes);
1981 	case MPOL_PREFERRED:
1982 		return a->v.preferred_node == b->v.preferred_node;
1983 	default:
1984 		BUG();
1985 		return 0;
1986 	}
1987 }
1988 
1989 /*
1990  * Shared memory backing store policy support.
1991  *
1992  * Remember policies even when nobody has shared memory mapped.
1993  * The policies are kept in Red-Black tree linked from the inode.
1994  * They are protected by the sp->lock spinlock, which should be held
1995  * for any accesses to the tree.
1996  */
1997 
1998 /* lookup first element intersecting start-end */
1999 /* Caller holds sp->lock */
2000 static struct sp_node *
2001 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2002 {
2003 	struct rb_node *n = sp->root.rb_node;
2004 
2005 	while (n) {
2006 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2007 
2008 		if (start >= p->end)
2009 			n = n->rb_right;
2010 		else if (end <= p->start)
2011 			n = n->rb_left;
2012 		else
2013 			break;
2014 	}
2015 	if (!n)
2016 		return NULL;
2017 	for (;;) {
2018 		struct sp_node *w = NULL;
2019 		struct rb_node *prev = rb_prev(n);
2020 		if (!prev)
2021 			break;
2022 		w = rb_entry(prev, struct sp_node, nd);
2023 		if (w->end <= start)
2024 			break;
2025 		n = prev;
2026 	}
2027 	return rb_entry(n, struct sp_node, nd);
2028 }
2029 
2030 /* Insert a new shared policy into the list. */
2031 /* Caller holds sp->lock */
2032 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2033 {
2034 	struct rb_node **p = &sp->root.rb_node;
2035 	struct rb_node *parent = NULL;
2036 	struct sp_node *nd;
2037 
2038 	while (*p) {
2039 		parent = *p;
2040 		nd = rb_entry(parent, struct sp_node, nd);
2041 		if (new->start < nd->start)
2042 			p = &(*p)->rb_left;
2043 		else if (new->end > nd->end)
2044 			p = &(*p)->rb_right;
2045 		else
2046 			BUG();
2047 	}
2048 	rb_link_node(&new->nd, parent, p);
2049 	rb_insert_color(&new->nd, &sp->root);
2050 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2051 		 new->policy ? new->policy->mode : 0);
2052 }
2053 
2054 /* Find shared policy intersecting idx */
2055 struct mempolicy *
2056 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2057 {
2058 	struct mempolicy *pol = NULL;
2059 	struct sp_node *sn;
2060 
2061 	if (!sp->root.rb_node)
2062 		return NULL;
2063 	spin_lock(&sp->lock);
2064 	sn = sp_lookup(sp, idx, idx+1);
2065 	if (sn) {
2066 		mpol_get(sn->policy);
2067 		pol = sn->policy;
2068 	}
2069 	spin_unlock(&sp->lock);
2070 	return pol;
2071 }
2072 
2073 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2074 {
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2076 	rb_erase(&n->nd, &sp->root);
2077 	mpol_put(n->policy);
2078 	kmem_cache_free(sn_cache, n);
2079 }
2080 
2081 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2082 				struct mempolicy *pol)
2083 {
2084 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2085 
2086 	if (!n)
2087 		return NULL;
2088 	n->start = start;
2089 	n->end = end;
2090 	mpol_get(pol);
2091 	pol->flags |= MPOL_F_SHARED;	/* for unref */
2092 	n->policy = pol;
2093 	return n;
2094 }
2095 
2096 /* Replace a policy range. */
2097 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2098 				 unsigned long end, struct sp_node *new)
2099 {
2100 	struct sp_node *n, *new2 = NULL;
2101 
2102 restart:
2103 	spin_lock(&sp->lock);
2104 	n = sp_lookup(sp, start, end);
2105 	/* Take care of old policies in the same range. */
2106 	while (n && n->start < end) {
2107 		struct rb_node *next = rb_next(&n->nd);
2108 		if (n->start >= start) {
2109 			if (n->end <= end)
2110 				sp_delete(sp, n);
2111 			else
2112 				n->start = end;
2113 		} else {
2114 			/* Old policy spanning whole new range. */
2115 			if (n->end > end) {
2116 				if (!new2) {
2117 					spin_unlock(&sp->lock);
2118 					new2 = sp_alloc(end, n->end, n->policy);
2119 					if (!new2)
2120 						return -ENOMEM;
2121 					goto restart;
2122 				}
2123 				n->end = start;
2124 				sp_insert(sp, new2);
2125 				new2 = NULL;
2126 				break;
2127 			} else
2128 				n->end = start;
2129 		}
2130 		if (!next)
2131 			break;
2132 		n = rb_entry(next, struct sp_node, nd);
2133 	}
2134 	if (new)
2135 		sp_insert(sp, new);
2136 	spin_unlock(&sp->lock);
2137 	if (new2) {
2138 		mpol_put(new2->policy);
2139 		kmem_cache_free(sn_cache, new2);
2140 	}
2141 	return 0;
2142 }
2143 
2144 /**
2145  * mpol_shared_policy_init - initialize shared policy for inode
2146  * @sp: pointer to inode shared policy
2147  * @mpol:  struct mempolicy to install
2148  *
2149  * Install non-NULL @mpol in inode's shared policy rb-tree.
2150  * On entry, the current task has a reference on a non-NULL @mpol.
2151  * This must be released on exit.
 * This is called during get_inode(), so GFP_KERNEL allocations may be used.
2153  */
2154 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2155 {
2156 	int ret;
2157 
2158 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2159 	spin_lock_init(&sp->lock);
2160 
2161 	if (mpol) {
2162 		struct vm_area_struct pvma;
2163 		struct mempolicy *new;
2164 		NODEMASK_SCRATCH(scratch);
2165 
2166 		if (!scratch)
2167 			goto put_mpol;
2168 		/* contextualize the tmpfs mount point mempolicy */
2169 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2170 		if (IS_ERR(new))
2171 			goto free_scratch; /* no valid nodemask intersection */
2172 
2173 		task_lock(current);
2174 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2175 		task_unlock(current);
2176 		if (ret)
2177 			goto put_new;
2178 
2179 		/* Create pseudo-vma that contains just the policy */
2180 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2181 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2182 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2183 
2184 put_new:
2185 		mpol_put(new);			/* drop initial ref */
2186 free_scratch:
2187 		NODEMASK_SCRATCH_FREE(scratch);
2188 put_mpol:
2189 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2190 	}
2191 }
2192 
2193 int mpol_set_shared_policy(struct shared_policy *info,
2194 			struct vm_area_struct *vma, struct mempolicy *npol)
2195 {
2196 	int err;
2197 	struct sp_node *new = NULL;
2198 	unsigned long sz = vma_pages(vma);
2199 
2200 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2201 		 vma->vm_pgoff,
2202 		 sz, npol ? npol->mode : -1,
2203 		 npol ? npol->flags : -1,
2204 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2205 
2206 	if (npol) {
2207 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2208 		if (!new)
2209 			return -ENOMEM;
2210 	}
2211 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2212 	if (err && new)
2213 		kmem_cache_free(sn_cache, new);
2214 	return err;
2215 }
2216 
2217 /* Free a backing policy store on inode delete. */
2218 void mpol_free_shared_policy(struct shared_policy *p)
2219 {
2220 	struct sp_node *n;
2221 	struct rb_node *next;
2222 
2223 	if (!p->root.rb_node)
2224 		return;
2225 	spin_lock(&p->lock);
2226 	next = rb_first(&p->root);
2227 	while (next) {
2228 		n = rb_entry(next, struct sp_node, nd);
2229 		next = rb_next(&n->nd);
2230 		rb_erase(&n->nd, &p->root);
2231 		mpol_put(n->policy);
2232 		kmem_cache_free(sn_cache, n);
2233 	}
2234 	spin_unlock(&p->lock);
2235 }
2236 
2237 /* assumes fs == KERNEL_DS */
2238 void __init numa_policy_init(void)
2239 {
2240 	nodemask_t interleave_nodes;
2241 	unsigned long largest = 0;
2242 	int nid, prefer = 0;
2243 
2244 	policy_cache = kmem_cache_create("numa_policy",
2245 					 sizeof(struct mempolicy),
2246 					 0, SLAB_PANIC, NULL);
2247 
2248 	sn_cache = kmem_cache_create("shared_policy_node",
2249 				     sizeof(struct sp_node),
2250 				     0, SLAB_PANIC, NULL);
2251 
2252 	/*
	 * Set the interleaving policy for system init.  Interleaving is only
	 * enabled across suitably sized nodes (>= 16MB of present memory); if
	 * all nodes are smaller, fall back to the largest node.
2256 	 */
2257 	nodes_clear(interleave_nodes);
2258 	for_each_node_state(nid, N_HIGH_MEMORY) {
2259 		unsigned long total_pages = node_present_pages(nid);
2260 
2261 		/* Preserve the largest node */
2262 		if (largest < total_pages) {
2263 			largest = total_pages;
2264 			prefer = nid;
2265 		}
2266 
2267 		/* Interleave this node? */
2268 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2269 			node_set(nid, interleave_nodes);
2270 	}
2271 
2272 	/* All too small, use the largest */
2273 	if (unlikely(nodes_empty(interleave_nodes)))
2274 		node_set(prefer, interleave_nodes);
2275 
2276 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2278 }
2279 
2280 /* Reset policy of current process to default */
2281 void numa_default_policy(void)
2282 {
2283 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2284 }
2285 
2286 /*
2287  * Parse and format mempolicy from/to strings
2288  */
2289 
2290 /*
2291  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2292  * Used only for mpol_parse_str() and mpol_to_str()
2293  */
2294 #define MPOL_LOCAL MPOL_MAX
2295 static const char * const policy_modes[] =
2296 {
2297 	[MPOL_DEFAULT]    = "default",
2298 	[MPOL_PREFERRED]  = "prefer",
2299 	[MPOL_BIND]       = "bind",
2300 	[MPOL_INTERLEAVE] = "interleave",
2301 	[MPOL_LOCAL]      = "local"
2302 };
2303 
2304 
2305 #ifdef CONFIG_TMPFS
2306 /**
2307  * mpol_parse_str - parse string to mempolicy
2308  * @str:  string containing mempolicy to parse
2309  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2310  * @no_context:  flag whether to "contextualize" the mempolicy
2311  *
2312  * Format of input:
2313  *	<mode>[=<flags>][:<nodelist>]
2314  *
2315  * if @no_context is true, save the input nodemask in w.user_nodemask in
2316  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2317  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2318  * mount option.  Note that if 'static' or 'relative' mode flags were
2319  * specified, the input nodemask will already have been saved.  Saving
2320  * it again is redundant, but safe.
2321  *
2322  * On success, returns 0, else 1
2323  */
2324 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2325 {
2326 	struct mempolicy *new = NULL;
2327 	unsigned short mode;
2328 	unsigned short uninitialized_var(mode_flags);
2329 	nodemask_t nodes;
2330 	char *nodelist = strchr(str, ':');
2331 	char *flags = strchr(str, '=');
2332 	int err = 1;
2333 
2334 	if (nodelist) {
2335 		/* NUL-terminate mode or flags string */
2336 		*nodelist++ = '\0';
2337 		if (nodelist_parse(nodelist, nodes))
2338 			goto out;
2339 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2340 			goto out;
2341 	} else
2342 		nodes_clear(nodes);
2343 
2344 	if (flags)
2345 		*flags++ = '\0';	/* terminate mode string */
2346 
2347 	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2348 		if (!strcmp(str, policy_modes[mode])) {
2349 			break;
2350 		}
2351 	}
2352 	if (mode > MPOL_LOCAL)
2353 		goto out;
2354 
2355 	switch (mode) {
2356 	case MPOL_PREFERRED:
2357 		/*
2358 		 * Insist on a nodelist of one node only
2359 		 */
2360 		if (nodelist) {
2361 			char *rest = nodelist;
2362 			while (isdigit(*rest))
2363 				rest++;
2364 			if (*rest)
2365 				goto out;
2366 		}
2367 		break;
2368 	case MPOL_INTERLEAVE:
2369 		/*
2370 		 * Default to online nodes with memory if no nodelist
2371 		 */
2372 		if (!nodelist)
2373 			nodes = node_states[N_HIGH_MEMORY];
2374 		break;
2375 	case MPOL_LOCAL:
2376 		/*
2377 		 * Don't allow a nodelist;  mpol_new() checks flags
2378 		 */
2379 		if (nodelist)
2380 			goto out;
2381 		mode = MPOL_PREFERRED;
2382 		break;
2383 	case MPOL_DEFAULT:
2384 		/*
		 * Insist on an empty nodelist
2386 		 */
2387 		if (!nodelist)
2388 			err = 0;
2389 		goto out;
2390 	case MPOL_BIND:
2391 		/*
2392 		 * Insist on a nodelist
2393 		 */
2394 		if (!nodelist)
2395 			goto out;
2396 	}
2397 
2398 	mode_flags = 0;
2399 	if (flags) {
2400 		/*
2401 		 * Currently, we only support two mutually exclusive
2402 		 * mode flags.
2403 		 */
2404 		if (!strcmp(flags, "static"))
2405 			mode_flags |= MPOL_F_STATIC_NODES;
2406 		else if (!strcmp(flags, "relative"))
2407 			mode_flags |= MPOL_F_RELATIVE_NODES;
2408 		else
2409 			goto out;
2410 	}
2411 
2412 	new = mpol_new(mode, mode_flags, &nodes);
2413 	if (IS_ERR(new))
2414 		goto out;
2415 
2416 	if (no_context) {
2417 		/* save for contextualization */
2418 		new->w.user_nodemask = nodes;
2419 	} else {
2420 		int ret;
2421 		NODEMASK_SCRATCH(scratch);
2422 		if (scratch) {
2423 			task_lock(current);
2424 			ret = mpol_set_nodemask(new, &nodes, scratch);
2425 			task_unlock(current);
2426 		} else
2427 			ret = -ENOMEM;
2428 		NODEMASK_SCRATCH_FREE(scratch);
2429 		if (ret) {
2430 			mpol_put(new);
2431 			goto out;
2432 		}
2433 	}
2434 	err = 0;
2435 
2436 out:
2437 	/* Restore string for error message */
2438 	if (nodelist)
2439 		*--nodelist = ':';
2440 	if (flags)
2441 		*--flags = '=';
2442 	if (!err)
2443 		*mpol = new;
2444 	return err;
2445 }
2446 #endif /* CONFIG_TMPFS */
2447 
2448 /**
2449  * mpol_to_str - format a mempolicy structure for printing
2450  * @buffer:  to contain formatted mempolicy string
2451  * @maxlen:  length of @buffer
2452  * @pol:  pointer to mempolicy to be formatted
2453  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2454  *
2455  * Convert a mempolicy into a string.
2456  * Returns the number of characters in buffer (if positive)
2457  * or an error (negative)
2458  */
2459 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2460 {
2461 	char *p = buffer;
2462 	int l;
2463 	nodemask_t nodes;
2464 	unsigned short mode;
2465 	unsigned short flags = pol ? pol->flags : 0;
2466 
2467 	/*
2468 	 * Sanity check:  room for longest mode, flag and some nodes
2469 	 */
2470 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2471 
2472 	if (!pol || pol == &default_policy)
2473 		mode = MPOL_DEFAULT;
2474 	else
2475 		mode = pol->mode;
2476 
2477 	switch (mode) {
2478 	case MPOL_DEFAULT:
2479 		nodes_clear(nodes);
2480 		break;
2481 
2482 	case MPOL_PREFERRED:
2483 		nodes_clear(nodes);
2484 		if (flags & MPOL_F_LOCAL)
2485 			mode = MPOL_LOCAL;	/* pseudo-policy */
2486 		else
2487 			node_set(pol->v.preferred_node, nodes);
2488 		break;
2489 
2490 	case MPOL_BIND:
2491 		/* Fall through */
2492 	case MPOL_INTERLEAVE:
2493 		if (no_context)
2494 			nodes = pol->w.user_nodemask;
2495 		else
2496 			nodes = pol->v.nodes;
2497 		break;
2498 
2499 	default:
2500 		BUG();
2501 	}
2502 
2503 	l = strlen(policy_modes[mode]);
2504 	if (buffer + maxlen < p + l + 1)
2505 		return -ENOSPC;
2506 
2507 	strcpy(p, policy_modes[mode]);
2508 	p += l;
2509 
2510 	if (flags & MPOL_MODE_FLAGS) {
2511 		if (buffer + maxlen < p + 2)
2512 			return -ENOSPC;
2513 		*p++ = '=';
2514 
2515 		/*
2516 		 * Currently, the only defined flags are mutually exclusive
2517 		 */
2518 		if (flags & MPOL_F_STATIC_NODES)
2519 			p += snprintf(p, buffer + maxlen - p, "static");
2520 		else if (flags & MPOL_F_RELATIVE_NODES)
2521 			p += snprintf(p, buffer + maxlen - p, "relative");
2522 	}
2523 
2524 	if (!nodes_empty(nodes)) {
2525 		if (buffer + maxlen < p + 2)
2526 			return -ENOSPC;
2527 		*p++ = ':';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2529 	}
2530 	return p - buffer;
2531 }
2532 
2533 struct numa_maps {
2534 	unsigned long pages;
2535 	unsigned long anon;
2536 	unsigned long active;
2537 	unsigned long writeback;
2538 	unsigned long mapcount_max;
2539 	unsigned long dirty;
2540 	unsigned long swapcache;
2541 	unsigned long node[MAX_NUMNODES];
2542 };
2543 
2544 static void gather_stats(struct page *page, void *private, int pte_dirty)
2545 {
2546 	struct numa_maps *md = private;
2547 	int count = page_mapcount(page);
2548 
2549 	md->pages++;
2550 	if (pte_dirty || PageDirty(page))
2551 		md->dirty++;
2552 
2553 	if (PageSwapCache(page))
2554 		md->swapcache++;
2555 
2556 	if (PageActive(page) || PageUnevictable(page))
2557 		md->active++;
2558 
2559 	if (PageWriteback(page))
2560 		md->writeback++;
2561 
2562 	if (PageAnon(page))
2563 		md->anon++;
2564 
2565 	if (count > md->mapcount_max)
2566 		md->mapcount_max = count;
2567 
2568 	md->node[page_to_nid(page)]++;
2569 }
2570 
2571 #ifdef CONFIG_HUGETLB_PAGE
2572 static void check_huge_range(struct vm_area_struct *vma,
2573 		unsigned long start, unsigned long end,
2574 		struct numa_maps *md)
2575 {
2576 	unsigned long addr;
2577 	struct page *page;
2578 	struct hstate *h = hstate_vma(vma);
2579 	unsigned long sz = huge_page_size(h);
2580 
2581 	for (addr = start; addr < end; addr += sz) {
2582 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2583 						addr & huge_page_mask(h));
2584 		pte_t pte;
2585 
2586 		if (!ptep)
2587 			continue;
2588 
2589 		pte = *ptep;
2590 		if (pte_none(pte))
2591 			continue;
2592 
2593 		page = pte_page(pte);
2594 		if (!page)
2595 			continue;
2596 
2597 		gather_stats(page, md, pte_dirty(*ptep));
2598 	}
2599 }
2600 #else
2601 static inline void check_huge_range(struct vm_area_struct *vma,
2602 		unsigned long start, unsigned long end,
2603 		struct numa_maps *md)
2604 {
2605 }
2606 #endif
2607 
2608 /*
2609  * Display pages allocated per node and memory policy via /proc.
2610  */
2611 int show_numa_map(struct seq_file *m, void *v)
2612 {
2613 	struct proc_maps_private *priv = m->private;
2614 	struct vm_area_struct *vma = v;
2615 	struct numa_maps *md;
2616 	struct file *file = vma->vm_file;
2617 	struct mm_struct *mm = vma->vm_mm;
2618 	struct mempolicy *pol;
2619 	int n;
2620 	char buffer[50];
2621 
2622 	if (!mm)
2623 		return 0;
2624 
2625 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2626 	if (!md)
2627 		return 0;
2628 
2629 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2630 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2631 	mpol_cond_put(pol);
2632 
2633 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2634 
2635 	if (file) {
2636 		seq_printf(m, " file=");
2637 		seq_path(m, &file->f_path, "\n\t= ");
2638 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2639 		seq_printf(m, " heap");
2640 	} else if (vma->vm_start <= mm->start_stack &&
2641 			vma->vm_end >= mm->start_stack) {
2642 		seq_printf(m, " stack");
2643 	}
2644 
2645 	if (is_vm_hugetlb_page(vma)) {
2646 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2647 		seq_printf(m, " huge");
2648 	} else {
2649 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2650 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2651 	}
2652 
2653 	if (!md->pages)
2654 		goto out;
2655 
2656 	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);
2658 
2659 	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);
2661 
2662 	if (md->pages != md->anon && md->pages != md->dirty)
2663 		seq_printf(m, " mapped=%lu", md->pages);
2664 
2665 	if (md->mapcount_max > 1)
2666 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2667 
2668 	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);
2670 
2671 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);
2673 
2674 	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);
2676 
2677 	for_each_node_state(n, N_HIGH_MEMORY)
2678 		if (md->node[n])
2679 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2680 out:
2681 	seq_putc(m, '\n');
2682 	kfree(md);
2683 
2684 	if (m->count < m->size)
2685 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2686 	return 0;
2687 }
2688