xref: /openbmc/linux/mm/mempolicy.c (revision 37012946da940521fb997a758a219d2f1ab56e51)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
58bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
61da177e4SLinus Torvalds  * Subject to the GNU Public License, version 2.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints about which node(s) memory
91da177e4SLinus Torvalds  * should be allocated on.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support four policies per VMA and per process (see the usage sketch below):
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA-based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy a process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
221da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
231da177e4SLinus Torvalds  *                no fallback.
248bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
258bccd85fSChristoph Lameter  *                and proceeding to the last. It would be better if bind truly
268bccd85fSChristoph Lameter  *                restricted the allocation to the specified memory nodes instead.
278bccd85fSChristoph Lameter  *
281da177e4SLinus Torvalds  * preferred      Try a specific node first before normal fallback.
291da177e4SLinus Torvalds  *                As a special case, node -1 here means do the allocation
301da177e4SLinus Torvalds  *                on the local node. This is normally identical to default,
311da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non-default
321da177e4SLinus Torvalds  *                process policy.
338bccd85fSChristoph Lameter  *
341da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
351da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
361da177e4SLinus Torvalds  *                in a NUMA-aware kernel and still does by, ahem, default.
371da177e4SLinus Torvalds  *
381da177e4SLinus Torvalds  * The process policy is applied for most non-interrupt memory allocations
391da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
401da177e4SLinus Torvalds  * try to allocate on the local node. The VMA policy is only applied to
411da177e4SLinus Torvalds  * memory allocations made for a VMA in the VM.
421da177e4SLinus Torvalds  *
431da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
441da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
451da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
461da177e4SLinus Torvalds  *
471da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
481da177e4SLinus Torvalds  * requesting a lower zone just use default policy. This implies that
491da177e4SLinus Torvalds  * on systems with highmem, kernel lowmem allocations don't get policied.
501da177e4SLinus Torvalds  * Same with GFP_DMA allocations.
511da177e4SLinus Torvalds  *
521da177e4SLinus Torvalds  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
531da177e4SLinus Torvalds  * all users and remembered even when nobody has the memory mapped.
541da177e4SLinus Torvalds  */
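
/*
 * Illustrative user-space sketch (added for exposition, not kernel code):
 * roughly how the policy modes described above are requested.  The
 * set_mempolicy()/mbind() wrappers and the MPOL_* constants are assumed
 * to come from libnuma's <numaif.h>; the node numbers are arbitrary
 * examples.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = 0x3;	// nodes 0 and 1
 *	unsigned long node0   = 0x1;	// node 0 only
 *	size_t len = 1 << 20;
 *
 *	// process policy: interleave new allocations over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// VMA policy: bind one mapping to node 0, with no fallback
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 *
 *	// back to the default (local) process policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */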
551da177e4SLinus Torvalds 
561da177e4SLinus Torvalds /* Notebook:
571da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
581da177e4SLinus Torvalds    object
591da177e4SLinus Torvalds    statistics for bigpages
601da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
611da177e4SLinus Torvalds    first item above.
621da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
631da177e4SLinus Torvalds    grows down?
641da177e4SLinus Torvalds    make bind policy root only? It can trigger oom much faster and the
651da177e4SLinus Torvalds    kernel is not always graceful with that.
661da177e4SLinus Torvalds */
671da177e4SLinus Torvalds 
681da177e4SLinus Torvalds #include <linux/mempolicy.h>
691da177e4SLinus Torvalds #include <linux/mm.h>
701da177e4SLinus Torvalds #include <linux/highmem.h>
711da177e4SLinus Torvalds #include <linux/hugetlb.h>
721da177e4SLinus Torvalds #include <linux/kernel.h>
731da177e4SLinus Torvalds #include <linux/sched.h>
741da177e4SLinus Torvalds #include <linux/nodemask.h>
751da177e4SLinus Torvalds #include <linux/cpuset.h>
761da177e4SLinus Torvalds #include <linux/gfp.h>
771da177e4SLinus Torvalds #include <linux/slab.h>
781da177e4SLinus Torvalds #include <linux/string.h>
791da177e4SLinus Torvalds #include <linux/module.h>
80b488893aSPavel Emelyanov #include <linux/nsproxy.h>
811da177e4SLinus Torvalds #include <linux/interrupt.h>
821da177e4SLinus Torvalds #include <linux/init.h>
831da177e4SLinus Torvalds #include <linux/compat.h>
84dc9aa5b9SChristoph Lameter #include <linux/swap.h>
851a75a6c8SChristoph Lameter #include <linux/seq_file.h>
861a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
87b20a3503SChristoph Lameter #include <linux/migrate.h>
8895a402c3SChristoph Lameter #include <linux/rmap.h>
8986c3a764SDavid Quigley #include <linux/security.h>
90dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
91dc9aa5b9SChristoph Lameter 
921da177e4SLinus Torvalds #include <asm/tlbflush.h>
931da177e4SLinus Torvalds #include <asm/uaccess.h>
941da177e4SLinus Torvalds 
9538e35860SChristoph Lameter /* Internal flags */
96dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
9738e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
981a75a6c8SChristoph Lameter #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
99dc9aa5b9SChristoph Lameter 
100fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
101fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1021da177e4SLinus Torvalds 
1031da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not
1041da177e4SLinus Torvalds    policied. */
1056267276fSChristoph Lameter enum zone_type policy_zone = 0;
1061da177e4SLinus Torvalds 
107d42c6997SAndi Kleen struct mempolicy default_policy = {
1081da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
1091da177e4SLinus Torvalds 	.policy = MPOL_DEFAULT,
1101da177e4SLinus Torvalds };
1111da177e4SLinus Torvalds 
112*37012946SDavid Rientjes static const struct mempolicy_operations {
113*37012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
114*37012946SDavid Rientjes 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
115*37012946SDavid Rientjes } mpol_ops[MPOL_MAX];
116*37012946SDavid Rientjes 
11719770b32SMel Gorman /* Check that the nodemask contains at least one populated zone */
118*37012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask)
1191da177e4SLinus Torvalds {
12019770b32SMel Gorman 	int nd, k;
1211da177e4SLinus Torvalds 
12219770b32SMel Gorman 	/* Check that there is something useful in this mask */
12319770b32SMel Gorman 	k = policy_zone;
12419770b32SMel Gorman 
12519770b32SMel Gorman 	for_each_node_mask(nd, *nodemask) {
12619770b32SMel Gorman 		struct zone *z;
12719770b32SMel Gorman 
12819770b32SMel Gorman 		for (k = 0; k <= policy_zone; k++) {
12919770b32SMel Gorman 			z = &NODE_DATA(nd)->node_zones[k];
130dd942ae3SAndi Kleen 			if (z->present_pages > 0)
13119770b32SMel Gorman 				return 1;
132dd942ae3SAndi Kleen 		}
133dd942ae3SAndi Kleen 	}
13419770b32SMel Gorman 
13519770b32SMel Gorman 	return 0;
1361da177e4SLinus Torvalds }
1371da177e4SLinus Torvalds 
138f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
139f5b087b5SDavid Rientjes {
1404c50bc01SDavid Rientjes 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
1414c50bc01SDavid Rientjes }
1424c50bc01SDavid Rientjes 
1434c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1444c50bc01SDavid Rientjes 				   const nodemask_t *rel)
1454c50bc01SDavid Rientjes {
1464c50bc01SDavid Rientjes 	nodemask_t tmp;
1474c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
1484c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
149f5b087b5SDavid Rientjes }
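
/*
 * Worked example (added for exposition): with a relative nodemask of
 * {0,2,5} and a cpuset that currently allows nodes {4,6,7} (weight 3),
 * nodes_fold() first wraps the relative node numbers modulo 3, giving
 * {0,2}, and nodes_onto() then maps those ordinals onto the set bits of
 * the allowed mask, so the resulting mask is {4,7}.
 */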
150f5b087b5SDavid Rientjes 
151*37012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
152*37012946SDavid Rientjes {
153*37012946SDavid Rientjes 	if (nodes_empty(*nodes))
154*37012946SDavid Rientjes 		return -EINVAL;
155*37012946SDavid Rientjes 	pol->v.nodes = *nodes;
156*37012946SDavid Rientjes 	return 0;
157*37012946SDavid Rientjes }
158*37012946SDavid Rientjes 
159*37012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
160*37012946SDavid Rientjes {
161*37012946SDavid Rientjes 	if (!nodes)
162*37012946SDavid Rientjes 		pol->v.preferred_node = -1;	/* local allocation */
163*37012946SDavid Rientjes 	else if (nodes_empty(*nodes))
164*37012946SDavid Rientjes 		return -EINVAL;			/*  no allowed nodes */
165*37012946SDavid Rientjes 	else
166*37012946SDavid Rientjes 		pol->v.preferred_node = first_node(*nodes);
167*37012946SDavid Rientjes 	return 0;
168*37012946SDavid Rientjes }
169*37012946SDavid Rientjes 
170*37012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
171*37012946SDavid Rientjes {
172*37012946SDavid Rientjes 	if (!is_valid_nodemask(nodes))
173*37012946SDavid Rientjes 		return -EINVAL;
174*37012946SDavid Rientjes 	pol->v.nodes = *nodes;
175*37012946SDavid Rientjes 	return 0;
176*37012946SDavid Rientjes }
177*37012946SDavid Rientjes 
1781da177e4SLinus Torvalds /* Create a new policy */
179028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
180028fec41SDavid Rientjes 				  nodemask_t *nodes)
1811da177e4SLinus Torvalds {
1821da177e4SLinus Torvalds 	struct mempolicy *policy;
183f5b087b5SDavid Rientjes 	nodemask_t cpuset_context_nmask;
184*37012946SDavid Rientjes 	int localalloc = 0;
185*37012946SDavid Rientjes 	int ret;
1861da177e4SLinus Torvalds 
187028fec41SDavid Rientjes 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
188028fec41SDavid Rientjes 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
189140d5a49SPaul Mundt 
1901da177e4SLinus Torvalds 	if (mode == MPOL_DEFAULT)
191*37012946SDavid Rientjes 		return NULL;
192*37012946SDavid Rientjes 	if (!nodes || nodes_empty(*nodes)) {
193*37012946SDavid Rientjes 		if (mode != MPOL_PREFERRED)
194*37012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
195*37012946SDavid Rientjes 		localalloc = 1;	/* special case:  no mode flags */
196*37012946SDavid Rientjes 	}
1971da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1981da177e4SLinus Torvalds 	if (!policy)
1991da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2001da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
201*37012946SDavid Rientjes 	policy->policy = mode;
202*37012946SDavid Rientjes 
203*37012946SDavid Rientjes 	if (!localalloc) {
204*37012946SDavid Rientjes 		policy->flags = flags;
205f5b087b5SDavid Rientjes 		cpuset_update_task_memory_state();
2064c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
2074c50bc01SDavid Rientjes 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
2084c50bc01SDavid Rientjes 					       &cpuset_current_mems_allowed);
2094c50bc01SDavid Rientjes 		else
2104c50bc01SDavid Rientjes 			nodes_and(cpuset_context_nmask, *nodes,
2114c50bc01SDavid Rientjes 				  cpuset_current_mems_allowed);
212f5b087b5SDavid Rientjes 		if (mpol_store_user_nodemask(policy))
213f5b087b5SDavid Rientjes 			policy->w.user_nodemask = *nodes;
214f5b087b5SDavid Rientjes 		else
215*37012946SDavid Rientjes 			policy->w.cpuset_mems_allowed =
216*37012946SDavid Rientjes 						cpuset_mems_allowed(current);
2171da177e4SLinus Torvalds 	}
2181da177e4SLinus Torvalds 
219*37012946SDavid Rientjes 	ret = mpol_ops[mode].create(policy,
220*37012946SDavid Rientjes 				localalloc ? NULL : &cpuset_context_nmask);
221*37012946SDavid Rientjes 	if (ret < 0) {
222*37012946SDavid Rientjes 		kmem_cache_free(policy_cache, policy);
223*37012946SDavid Rientjes 		return ERR_PTR(ret);
224*37012946SDavid Rientjes 	}
225*37012946SDavid Rientjes 	return policy;
226*37012946SDavid Rientjes }
227*37012946SDavid Rientjes 
228*37012946SDavid Rientjes static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
229*37012946SDavid Rientjes {
230*37012946SDavid Rientjes }
231*37012946SDavid Rientjes 
232*37012946SDavid Rientjes static void mpol_rebind_nodemask(struct mempolicy *pol,
233*37012946SDavid Rientjes 				 const nodemask_t *nodes)
2341d0d2680SDavid Rientjes {
2351d0d2680SDavid Rientjes 	nodemask_t tmp;
2361d0d2680SDavid Rientjes 
237*37012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
238*37012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
239*37012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
240*37012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2411d0d2680SDavid Rientjes 	else {
242*37012946SDavid Rientjes 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
243*37012946SDavid Rientjes 			    *nodes);
244*37012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2451d0d2680SDavid Rientjes 	}
246*37012946SDavid Rientjes 
2471d0d2680SDavid Rientjes 	pol->v.nodes = tmp;
2481d0d2680SDavid Rientjes 	if (!node_isset(current->il_next, tmp)) {
2491d0d2680SDavid Rientjes 		current->il_next = next_node(current->il_next, tmp);
2501d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2511d0d2680SDavid Rientjes 			current->il_next = first_node(tmp);
2521d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2531d0d2680SDavid Rientjes 			current->il_next = numa_node_id();
2541d0d2680SDavid Rientjes 	}
255*37012946SDavid Rientjes }
256*37012946SDavid Rientjes 
257*37012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
258*37012946SDavid Rientjes 				  const nodemask_t *nodes)
259*37012946SDavid Rientjes {
260*37012946SDavid Rientjes 	nodemask_t tmp;
261*37012946SDavid Rientjes 
262*37012946SDavid Rientjes 	/*
263*37012946SDavid Rientjes 	 * check MPOL_F_STATIC_NODES first, as preferred_node == -1 may be
264*37012946SDavid Rientjes 	 * a temporary, "fallback" state for this policy.
265*37012946SDavid Rientjes 	 */
266*37012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES) {
2671d0d2680SDavid Rientjes 		int node = first_node(pol->w.user_nodemask);
2681d0d2680SDavid Rientjes 
269*37012946SDavid Rientjes 		if (node_isset(node, *nodes))
2701d0d2680SDavid Rientjes 			pol->v.preferred_node = node;
2711d0d2680SDavid Rientjes 		else
2721d0d2680SDavid Rientjes 			pol->v.preferred_node = -1;
273*37012946SDavid Rientjes 	} else if (pol->v.preferred_node == -1) {
274*37012946SDavid Rientjes 		return;	/* no remap required for explicit local alloc */
275*37012946SDavid Rientjes 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
276*37012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2771d0d2680SDavid Rientjes 		pol->v.preferred_node = first_node(tmp);
2781d0d2680SDavid Rientjes 	} else {
2791d0d2680SDavid Rientjes 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
280*37012946SDavid Rientjes 						   pol->w.cpuset_mems_allowed,
281*37012946SDavid Rientjes 						   *nodes);
282*37012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2831d0d2680SDavid Rientjes 	}
2841d0d2680SDavid Rientjes }
285*37012946SDavid Rientjes 
286*37012946SDavid Rientjes /* Migrate a policy to a different set of nodes */
287*37012946SDavid Rientjes static void mpol_rebind_policy(struct mempolicy *pol,
288*37012946SDavid Rientjes 			       const nodemask_t *newmask)
289*37012946SDavid Rientjes {
290*37012946SDavid Rientjes 	if (!pol)
291*37012946SDavid Rientjes 		return;
292*37012946SDavid Rientjes 	if (!mpol_store_user_nodemask(pol) &&
293*37012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
294*37012946SDavid Rientjes 		return;
295*37012946SDavid Rientjes 	mpol_ops[pol->policy].rebind(pol, newmask);
2961d0d2680SDavid Rientjes }
2971d0d2680SDavid Rientjes 
2981d0d2680SDavid Rientjes /*
2991d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires task
3001d0d2680SDavid Rientjes  * pointer, and updates task mempolicy.
3011d0d2680SDavid Rientjes  */
3021d0d2680SDavid Rientjes 
3031d0d2680SDavid Rientjes void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
3041d0d2680SDavid Rientjes {
3051d0d2680SDavid Rientjes 	mpol_rebind_policy(tsk->mempolicy, new);
3061d0d2680SDavid Rientjes }
3071d0d2680SDavid Rientjes 
3081d0d2680SDavid Rientjes /*
3091d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
3101d0d2680SDavid Rientjes  *
3111d0d2680SDavid Rientjes  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
3121d0d2680SDavid Rientjes  */
3131d0d2680SDavid Rientjes 
3141d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
3151d0d2680SDavid Rientjes {
3161d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
3171d0d2680SDavid Rientjes 
3181d0d2680SDavid Rientjes 	down_write(&mm->mmap_sem);
3191d0d2680SDavid Rientjes 	for (vma = mm->mmap; vma; vma = vma->vm_next)
3201d0d2680SDavid Rientjes 		mpol_rebind_policy(vma->vm_policy, new);
3211d0d2680SDavid Rientjes 	up_write(&mm->mmap_sem);
3221d0d2680SDavid Rientjes }
3231d0d2680SDavid Rientjes 
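/*
 * Illustrative sketch (added for exposition, not a definitive caller):
 * the cpuset code is expected to drive the rebind helpers above when a
 * task's allowed memory nodes change, roughly:
 *
 *	mpol_rebind_task(tsk, &new_mems);
 *	mm = get_task_mm(tsk);
 *	if (mm) {
 *		mpol_rebind_mm(mm, &new_mems);
 *		mmput(mm);
 *	}
 */
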
324*37012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
325*37012946SDavid Rientjes 	[MPOL_DEFAULT] = {
326*37012946SDavid Rientjes 		.rebind = mpol_rebind_default,
327*37012946SDavid Rientjes 	},
328*37012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
329*37012946SDavid Rientjes 		.create = mpol_new_interleave,
330*37012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
331*37012946SDavid Rientjes 	},
332*37012946SDavid Rientjes 	[MPOL_PREFERRED] = {
333*37012946SDavid Rientjes 		.create = mpol_new_preferred,
334*37012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
335*37012946SDavid Rientjes 	},
336*37012946SDavid Rientjes 	[MPOL_BIND] = {
337*37012946SDavid Rientjes 		.create = mpol_new_bind,
338*37012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
339*37012946SDavid Rientjes 	},
340*37012946SDavid Rientjes };
341*37012946SDavid Rientjes 
342397874dfSChristoph Lameter static void gather_stats(struct page *, void *, int pte_dirty);
343fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
344fc301289SChristoph Lameter 				unsigned long flags);
3451a75a6c8SChristoph Lameter 
34638e35860SChristoph Lameter /* Scan through pages, checking whether they satisfy the given conditions. */
347b5810039SNick Piggin static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
348dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
349dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
35038e35860SChristoph Lameter 		void *private)
3511da177e4SLinus Torvalds {
35291612e0dSHugh Dickins 	pte_t *orig_pte;
35391612e0dSHugh Dickins 	pte_t *pte;
354705e87c0SHugh Dickins 	spinlock_t *ptl;
355941150a3SHugh Dickins 
356705e87c0SHugh Dickins 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
35791612e0dSHugh Dickins 	do {
3586aab341eSLinus Torvalds 		struct page *page;
35925ba77c1SAndy Whitcroft 		int nid;
36091612e0dSHugh Dickins 
36191612e0dSHugh Dickins 		if (!pte_present(*pte))
36291612e0dSHugh Dickins 			continue;
3636aab341eSLinus Torvalds 		page = vm_normal_page(vma, addr, *pte);
3646aab341eSLinus Torvalds 		if (!page)
36591612e0dSHugh Dickins 			continue;
366053837fcSNick Piggin 		/*
367053837fcSNick Piggin 		 * The check for PageReserved here is important to avoid
368053837fcSNick Piggin 		 * handling zero pages and other pages that may have been
369053837fcSNick Piggin 		 * marked special by the system.
370053837fcSNick Piggin 		 *
371053837fcSNick Piggin 		 * If PageReserved were not checked here then e.g.
372053837fcSNick Piggin 		 * the location of the zero page could have an influence
373053837fcSNick Piggin 		 * on MPOL_MF_STRICT, zero pages would be counted for
374053837fcSNick Piggin 		 * the per node stats, and there would be useless attempts
375053837fcSNick Piggin 		 * to put zero pages on the migration list.
376053837fcSNick Piggin 		 */
377f4598c8bSChristoph Lameter 		if (PageReserved(page))
378f4598c8bSChristoph Lameter 			continue;
3796aab341eSLinus Torvalds 		nid = page_to_nid(page);
38038e35860SChristoph Lameter 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
38138e35860SChristoph Lameter 			continue;
38238e35860SChristoph Lameter 
3831a75a6c8SChristoph Lameter 		if (flags & MPOL_MF_STATS)
384397874dfSChristoph Lameter 			gather_stats(page, private, pte_dirty(*pte));
385053837fcSNick Piggin 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
386fc301289SChristoph Lameter 			migrate_page_add(page, private, flags);
387dc9aa5b9SChristoph Lameter 		else
3881da177e4SLinus Torvalds 			break;
38991612e0dSHugh Dickins 	} while (pte++, addr += PAGE_SIZE, addr != end);
390705e87c0SHugh Dickins 	pte_unmap_unlock(orig_pte, ptl);
39191612e0dSHugh Dickins 	return addr != end;
39291612e0dSHugh Dickins }
39391612e0dSHugh Dickins 
394b5810039SNick Piggin static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
395dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
396dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
39738e35860SChristoph Lameter 		void *private)
39891612e0dSHugh Dickins {
39991612e0dSHugh Dickins 	pmd_t *pmd;
40091612e0dSHugh Dickins 	unsigned long next;
40191612e0dSHugh Dickins 
40291612e0dSHugh Dickins 	pmd = pmd_offset(pud, addr);
40391612e0dSHugh Dickins 	do {
40491612e0dSHugh Dickins 		next = pmd_addr_end(addr, end);
40591612e0dSHugh Dickins 		if (pmd_none_or_clear_bad(pmd))
40691612e0dSHugh Dickins 			continue;
407dc9aa5b9SChristoph Lameter 		if (check_pte_range(vma, pmd, addr, next, nodes,
40838e35860SChristoph Lameter 				    flags, private))
40991612e0dSHugh Dickins 			return -EIO;
41091612e0dSHugh Dickins 	} while (pmd++, addr = next, addr != end);
41191612e0dSHugh Dickins 	return 0;
41291612e0dSHugh Dickins }
41391612e0dSHugh Dickins 
414b5810039SNick Piggin static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
415dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
416dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
41738e35860SChristoph Lameter 		void *private)
41891612e0dSHugh Dickins {
41991612e0dSHugh Dickins 	pud_t *pud;
42091612e0dSHugh Dickins 	unsigned long next;
42191612e0dSHugh Dickins 
42291612e0dSHugh Dickins 	pud = pud_offset(pgd, addr);
42391612e0dSHugh Dickins 	do {
42491612e0dSHugh Dickins 		next = pud_addr_end(addr, end);
42591612e0dSHugh Dickins 		if (pud_none_or_clear_bad(pud))
42691612e0dSHugh Dickins 			continue;
427dc9aa5b9SChristoph Lameter 		if (check_pmd_range(vma, pud, addr, next, nodes,
42838e35860SChristoph Lameter 				    flags, private))
42991612e0dSHugh Dickins 			return -EIO;
43091612e0dSHugh Dickins 	} while (pud++, addr = next, addr != end);
43191612e0dSHugh Dickins 	return 0;
43291612e0dSHugh Dickins }
43391612e0dSHugh Dickins 
434b5810039SNick Piggin static inline int check_pgd_range(struct vm_area_struct *vma,
435dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
436dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
43738e35860SChristoph Lameter 		void *private)
43891612e0dSHugh Dickins {
43991612e0dSHugh Dickins 	pgd_t *pgd;
44091612e0dSHugh Dickins 	unsigned long next;
44191612e0dSHugh Dickins 
442b5810039SNick Piggin 	pgd = pgd_offset(vma->vm_mm, addr);
44391612e0dSHugh Dickins 	do {
44491612e0dSHugh Dickins 		next = pgd_addr_end(addr, end);
44591612e0dSHugh Dickins 		if (pgd_none_or_clear_bad(pgd))
44691612e0dSHugh Dickins 			continue;
447dc9aa5b9SChristoph Lameter 		if (check_pud_range(vma, pgd, addr, next, nodes,
44838e35860SChristoph Lameter 				    flags, private))
44991612e0dSHugh Dickins 			return -EIO;
45091612e0dSHugh Dickins 	} while (pgd++, addr = next, addr != end);
45191612e0dSHugh Dickins 	return 0;
4521da177e4SLinus Torvalds }
4531da177e4SLinus Torvalds 
454dc9aa5b9SChristoph Lameter /*
455dc9aa5b9SChristoph Lameter  * Check if all pages in a range are on a set of nodes.
456dc9aa5b9SChristoph Lameter  * If pagelist != NULL then isolate pages from the LRU and
457dc9aa5b9SChristoph Lameter  * put them on the pagelist.
458dc9aa5b9SChristoph Lameter  */
4591da177e4SLinus Torvalds static struct vm_area_struct *
4601da177e4SLinus Torvalds check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
46138e35860SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags, void *private)
4621da177e4SLinus Torvalds {
4631da177e4SLinus Torvalds 	int err;
4641da177e4SLinus Torvalds 	struct vm_area_struct *first, *vma, *prev;
4651da177e4SLinus Torvalds 
46690036ee5SChristoph Lameter 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
46790036ee5SChristoph Lameter 
468b20a3503SChristoph Lameter 		err = migrate_prep();
469b20a3503SChristoph Lameter 		if (err)
470b20a3503SChristoph Lameter 			return ERR_PTR(err);
47190036ee5SChristoph Lameter 	}
472053837fcSNick Piggin 
4731da177e4SLinus Torvalds 	first = find_vma(mm, start);
4741da177e4SLinus Torvalds 	if (!first)
4751da177e4SLinus Torvalds 		return ERR_PTR(-EFAULT);
4761da177e4SLinus Torvalds 	prev = NULL;
4771da177e4SLinus Torvalds 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
478dc9aa5b9SChristoph Lameter 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
4791da177e4SLinus Torvalds 			if (!vma->vm_next && vma->vm_end < end)
4801da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
4811da177e4SLinus Torvalds 			if (prev && prev->vm_end < vma->vm_start)
4821da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
483dc9aa5b9SChristoph Lameter 		}
484dc9aa5b9SChristoph Lameter 		if (!is_vm_hugetlb_page(vma) &&
485dc9aa5b9SChristoph Lameter 		    ((flags & MPOL_MF_STRICT) ||
486dc9aa5b9SChristoph Lameter 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
487dc9aa5b9SChristoph Lameter 				vma_migratable(vma)))) {
4885b952b3cSAndi Kleen 			unsigned long endvma = vma->vm_end;
489dc9aa5b9SChristoph Lameter 
4905b952b3cSAndi Kleen 			if (endvma > end)
4915b952b3cSAndi Kleen 				endvma = end;
4925b952b3cSAndi Kleen 			if (vma->vm_start > start)
4935b952b3cSAndi Kleen 				start = vma->vm_start;
494dc9aa5b9SChristoph Lameter 			err = check_pgd_range(vma, start, endvma, nodes,
49538e35860SChristoph Lameter 						flags, private);
4961da177e4SLinus Torvalds 			if (err) {
4971da177e4SLinus Torvalds 				first = ERR_PTR(err);
4981da177e4SLinus Torvalds 				break;
4991da177e4SLinus Torvalds 			}
5001da177e4SLinus Torvalds 		}
5011da177e4SLinus Torvalds 		prev = vma;
5021da177e4SLinus Torvalds 	}
5031da177e4SLinus Torvalds 	return first;
5041da177e4SLinus Torvalds }
5051da177e4SLinus Torvalds 
5061da177e4SLinus Torvalds /* Apply policy to a single VMA */
5071da177e4SLinus Torvalds static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
5081da177e4SLinus Torvalds {
5091da177e4SLinus Torvalds 	int err = 0;
5101da177e4SLinus Torvalds 	struct mempolicy *old = vma->vm_policy;
5111da177e4SLinus Torvalds 
512140d5a49SPaul Mundt 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
5131da177e4SLinus Torvalds 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
5141da177e4SLinus Torvalds 		 vma->vm_ops, vma->vm_file,
5151da177e4SLinus Torvalds 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
5161da177e4SLinus Torvalds 
5171da177e4SLinus Torvalds 	if (vma->vm_ops && vma->vm_ops->set_policy)
5181da177e4SLinus Torvalds 		err = vma->vm_ops->set_policy(vma, new);
5191da177e4SLinus Torvalds 	if (!err) {
5201da177e4SLinus Torvalds 		mpol_get(new);
5211da177e4SLinus Torvalds 		vma->vm_policy = new;
5221da177e4SLinus Torvalds 		mpol_free(old);
5231da177e4SLinus Torvalds 	}
5241da177e4SLinus Torvalds 	return err;
5251da177e4SLinus Torvalds }
5261da177e4SLinus Torvalds 
5271da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */
5281da177e4SLinus Torvalds static int mbind_range(struct vm_area_struct *vma, unsigned long start,
5291da177e4SLinus Torvalds 		       unsigned long end, struct mempolicy *new)
5301da177e4SLinus Torvalds {
5311da177e4SLinus Torvalds 	struct vm_area_struct *next;
5321da177e4SLinus Torvalds 	int err;
5331da177e4SLinus Torvalds 
5341da177e4SLinus Torvalds 	err = 0;
5351da177e4SLinus Torvalds 	for (; vma && vma->vm_start < end; vma = next) {
5361da177e4SLinus Torvalds 		next = vma->vm_next;
5371da177e4SLinus Torvalds 		if (vma->vm_start < start)
5381da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, start, 1);
5391da177e4SLinus Torvalds 		if (!err && vma->vm_end > end)
5401da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, end, 0);
5411da177e4SLinus Torvalds 		if (!err)
5421da177e4SLinus Torvalds 			err = policy_vma(vma, new);
5431da177e4SLinus Torvalds 		if (err)
5441da177e4SLinus Torvalds 			break;
5451da177e4SLinus Torvalds 	}
5461da177e4SLinus Torvalds 	return err;
5471da177e4SLinus Torvalds }
5481da177e4SLinus Torvalds 
549c61afb18SPaul Jackson /*
550c61afb18SPaul Jackson  * Update task->flags PF_MEMPOLICY bit: set iff non-default
551c61afb18SPaul Jackson  * mempolicy.  Allows more rapid checking of this (combined perhaps
552c61afb18SPaul Jackson  * with other PF_* flag bits) on memory allocation hot code paths.
553c61afb18SPaul Jackson  *
554c61afb18SPaul Jackson  * If called from outside this file, the task 'p' should -only- be
555c61afb18SPaul Jackson  * a newly forked child not yet visible on the task list, because
556c61afb18SPaul Jackson  * manipulating the task flags of a visible task is not safe.
557c61afb18SPaul Jackson  *
558c61afb18SPaul Jackson  * The above limitation is why this routine has the funny name
559c61afb18SPaul Jackson  * mpol_fix_fork_child_flag().
560c61afb18SPaul Jackson  *
561c61afb18SPaul Jackson  * It is also safe to call this with a task pointer of current,
562c61afb18SPaul Jackson  * which the static wrapper mpol_set_task_struct_flag() does,
563c61afb18SPaul Jackson  * for use within this file.
564c61afb18SPaul Jackson  */
565c61afb18SPaul Jackson 
566c61afb18SPaul Jackson void mpol_fix_fork_child_flag(struct task_struct *p)
567c61afb18SPaul Jackson {
568c61afb18SPaul Jackson 	if (p->mempolicy)
569c61afb18SPaul Jackson 		p->flags |= PF_MEMPOLICY;
570c61afb18SPaul Jackson 	else
571c61afb18SPaul Jackson 		p->flags &= ~PF_MEMPOLICY;
572c61afb18SPaul Jackson }
573c61afb18SPaul Jackson 
574c61afb18SPaul Jackson static void mpol_set_task_struct_flag(void)
575c61afb18SPaul Jackson {
576c61afb18SPaul Jackson 	mpol_fix_fork_child_flag(current);
577c61afb18SPaul Jackson }
578c61afb18SPaul Jackson 
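/*
 * Illustrative sketch (added for exposition): the fork path is expected
 * to call mpol_fix_fork_child_flag() on the new child right after the
 * parent's policy has been copied, roughly:
 *
 *	p->mempolicy = mpol_copy(p->mempolicy);
 *	mpol_fix_fork_child_flag(p);
 */
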
5791da177e4SLinus Torvalds /* Set the process memory policy */
580028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
581028fec41SDavid Rientjes 			     nodemask_t *nodes)
5821da177e4SLinus Torvalds {
5831da177e4SLinus Torvalds 	struct mempolicy *new;
5841da177e4SLinus Torvalds 
585028fec41SDavid Rientjes 	new = mpol_new(mode, flags, nodes);
5861da177e4SLinus Torvalds 	if (IS_ERR(new))
5871da177e4SLinus Torvalds 		return PTR_ERR(new);
5881da177e4SLinus Torvalds 	mpol_free(current->mempolicy);
5891da177e4SLinus Torvalds 	current->mempolicy = new;
590c61afb18SPaul Jackson 	mpol_set_task_struct_flag();
591f5b087b5SDavid Rientjes 	if (new && new->policy == MPOL_INTERLEAVE &&
592f5b087b5SDavid Rientjes 	    nodes_weight(new->v.nodes))
593dfcd3c0dSAndi Kleen 		current->il_next = first_node(new->v.nodes);
5941da177e4SLinus Torvalds 	return 0;
5951da177e4SLinus Torvalds }
5961da177e4SLinus Torvalds 
5971da177e4SLinus Torvalds /* Fill a node mask for a policy */
598dfcd3c0dSAndi Kleen static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
5991da177e4SLinus Torvalds {
600dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
6011da177e4SLinus Torvalds 	switch (p->policy) {
6021da177e4SLinus Torvalds 	case MPOL_DEFAULT:
6031da177e4SLinus Torvalds 		break;
60419770b32SMel Gorman 	case MPOL_BIND:
60519770b32SMel Gorman 		/* Fall through */
6061da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
607dfcd3c0dSAndi Kleen 		*nodes = p->v.nodes;
6081da177e4SLinus Torvalds 		break;
6091da177e4SLinus Torvalds 	case MPOL_PREFERRED:
61056bbd65dSChristoph Lameter 		/* or use current node instead of memory_map? */
6111da177e4SLinus Torvalds 		if (p->v.preferred_node < 0)
61256bbd65dSChristoph Lameter 			*nodes = node_states[N_HIGH_MEMORY];
6131da177e4SLinus Torvalds 		else
614dfcd3c0dSAndi Kleen 			node_set(p->v.preferred_node, *nodes);
6151da177e4SLinus Torvalds 		break;
6161da177e4SLinus Torvalds 	default:
6171da177e4SLinus Torvalds 		BUG();
6181da177e4SLinus Torvalds 	}
6191da177e4SLinus Torvalds }
6201da177e4SLinus Torvalds 
6211da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr)
6221da177e4SLinus Torvalds {
6231da177e4SLinus Torvalds 	struct page *p;
6241da177e4SLinus Torvalds 	int err;
6251da177e4SLinus Torvalds 
6261da177e4SLinus Torvalds 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
6271da177e4SLinus Torvalds 	if (err >= 0) {
6281da177e4SLinus Torvalds 		err = page_to_nid(p);
6291da177e4SLinus Torvalds 		put_page(p);
6301da177e4SLinus Torvalds 	}
6311da177e4SLinus Torvalds 	return err;
6321da177e4SLinus Torvalds }
6331da177e4SLinus Torvalds 
6341da177e4SLinus Torvalds /* Retrieve NUMA policy */
635dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
6361da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
6371da177e4SLinus Torvalds {
6388bccd85fSChristoph Lameter 	int err;
6391da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
6401da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
6411da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
6421da177e4SLinus Torvalds 
643cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
644754af6f5SLee Schermerhorn 	if (flags &
645754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
6461da177e4SLinus Torvalds 		return -EINVAL;
647754af6f5SLee Schermerhorn 
648754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
649754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
650754af6f5SLee Schermerhorn 			return -EINVAL;
651754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
652754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
653754af6f5SLee Schermerhorn 		return 0;
654754af6f5SLee Schermerhorn 	}
655754af6f5SLee Schermerhorn 
6561da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
6571da177e4SLinus Torvalds 		down_read(&mm->mmap_sem);
6581da177e4SLinus Torvalds 		vma = find_vma_intersection(mm, addr, addr+1);
6591da177e4SLinus Torvalds 		if (!vma) {
6601da177e4SLinus Torvalds 			up_read(&mm->mmap_sem);
6611da177e4SLinus Torvalds 			return -EFAULT;
6621da177e4SLinus Torvalds 		}
6631da177e4SLinus Torvalds 		if (vma->vm_ops && vma->vm_ops->get_policy)
6641da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
6651da177e4SLinus Torvalds 		else
6661da177e4SLinus Torvalds 			pol = vma->vm_policy;
6671da177e4SLinus Torvalds 	} else if (addr)
6681da177e4SLinus Torvalds 		return -EINVAL;
6691da177e4SLinus Torvalds 
6701da177e4SLinus Torvalds 	if (!pol)
6711da177e4SLinus Torvalds 		pol = &default_policy;
6721da177e4SLinus Torvalds 
6731da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
6741da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
6751da177e4SLinus Torvalds 			err = lookup_node(mm, addr);
6761da177e4SLinus Torvalds 			if (err < 0)
6771da177e4SLinus Torvalds 				goto out;
6788bccd85fSChristoph Lameter 			*policy = err;
6791da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
6801da177e4SLinus Torvalds 				pol->policy == MPOL_INTERLEAVE) {
6818bccd85fSChristoph Lameter 			*policy = current->il_next;
6821da177e4SLinus Torvalds 		} else {
6831da177e4SLinus Torvalds 			err = -EINVAL;
6841da177e4SLinus Torvalds 			goto out;
6851da177e4SLinus Torvalds 		}
6861da177e4SLinus Torvalds 	} else
687028fec41SDavid Rientjes 		*policy = pol->policy | pol->flags;
6881da177e4SLinus Torvalds 
6891da177e4SLinus Torvalds 	if (vma) {
6901da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
6911da177e4SLinus Torvalds 		vma = NULL;
6921da177e4SLinus Torvalds 	}
6931da177e4SLinus Torvalds 
6941da177e4SLinus Torvalds 	err = 0;
6958bccd85fSChristoph Lameter 	if (nmask)
6968bccd85fSChristoph Lameter 		get_zonemask(pol, nmask);
6971da177e4SLinus Torvalds 
6981da177e4SLinus Torvalds  out:
6991da177e4SLinus Torvalds 	if (vma)
7001da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
7011da177e4SLinus Torvalds 	return err;
7021da177e4SLinus Torvalds }
7031da177e4SLinus Torvalds 
704b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
7058bccd85fSChristoph Lameter /*
7066ce3c4c0SChristoph Lameter  * page migration
7076ce3c4c0SChristoph Lameter  */
708fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
709fc301289SChristoph Lameter 				unsigned long flags)
7106ce3c4c0SChristoph Lameter {
7116ce3c4c0SChristoph Lameter 	/*
712fc301289SChristoph Lameter 	 * Avoid migrating a page that is shared with others.
7136ce3c4c0SChristoph Lameter 	 */
714b20a3503SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
715b20a3503SChristoph Lameter 		isolate_lru_page(page, pagelist);
7166ce3c4c0SChristoph Lameter }
7176ce3c4c0SChristoph Lameter 
718742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x)
71995a402c3SChristoph Lameter {
720769848c0SMel Gorman 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
72195a402c3SChristoph Lameter }
72295a402c3SChristoph Lameter 
7236ce3c4c0SChristoph Lameter /*
7247e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
7257e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
7267e2ab150SChristoph Lameter  */
727dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
728dbcb0f19SAdrian Bunk 			   int flags)
7297e2ab150SChristoph Lameter {
7307e2ab150SChristoph Lameter 	nodemask_t nmask;
7317e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
7327e2ab150SChristoph Lameter 	int err = 0;
7337e2ab150SChristoph Lameter 
7347e2ab150SChristoph Lameter 	nodes_clear(nmask);
7357e2ab150SChristoph Lameter 	node_set(source, nmask);
7367e2ab150SChristoph Lameter 
7377e2ab150SChristoph Lameter 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
7387e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
7397e2ab150SChristoph Lameter 
7407e2ab150SChristoph Lameter 	if (!list_empty(&pagelist))
74195a402c3SChristoph Lameter 		err = migrate_pages(&pagelist, new_node_page, dest);
74295a402c3SChristoph Lameter 
7437e2ab150SChristoph Lameter 	return err;
7447e2ab150SChristoph Lameter }
7457e2ab150SChristoph Lameter 
7467e2ab150SChristoph Lameter /*
7477e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
7487e2ab150SChristoph Lameter  * layout as much as possible.
74939743889SChristoph Lameter  *
75039743889SChristoph Lameter  * Returns the number of pages that could not be moved.
75139743889SChristoph Lameter  */
75239743889SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
75339743889SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
75439743889SChristoph Lameter {
75539743889SChristoph Lameter 	LIST_HEAD(pagelist);
7567e2ab150SChristoph Lameter 	int busy = 0;
7577e2ab150SChristoph Lameter 	int err = 0;
7587e2ab150SChristoph Lameter 	nodemask_t tmp;
75939743889SChristoph Lameter 
76039743889SChristoph Lameter 	down_read(&mm->mmap_sem);
761d4984711SChristoph Lameter 
7627b2259b3SChristoph Lameter 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
7637b2259b3SChristoph Lameter 	if (err)
7647b2259b3SChristoph Lameter 		goto out;
7657b2259b3SChristoph Lameter 
7667e2ab150SChristoph Lameter /*
7677e2ab150SChristoph Lameter  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
7687e2ab150SChristoph Lameter  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
7697e2ab150SChristoph Lameter  * bit in 'tmp', and return that <source, dest> pair for migration.
7707e2ab150SChristoph Lameter  * The pair of nodemasks 'to' and 'from' define the map.
7717e2ab150SChristoph Lameter  *
7727e2ab150SChristoph Lameter  * If no pair of bits is found that way, fallback to picking some
7737e2ab150SChristoph Lameter  * pair of 'source' and 'dest' bits that are not the same.  If the
7747e2ab150SChristoph Lameter  * 'source' and 'dest' bits are the same, this represents a node
7757e2ab150SChristoph Lameter  * that will be migrating to itself, so no pages need move.
7767e2ab150SChristoph Lameter  *
7777e2ab150SChristoph Lameter  * If no bits are left in 'tmp', or if all remaining bits left
7787e2ab150SChristoph Lameter  * in 'tmp' correspond to the same bit in 'to', return false
7797e2ab150SChristoph Lameter  * (nothing left to migrate).
7807e2ab150SChristoph Lameter  *
7817e2ab150SChristoph Lameter  * This lets us pick a pair of nodes to migrate between, such that
7827e2ab150SChristoph Lameter  * if possible the dest node is not already occupied by some other
7837e2ab150SChristoph Lameter  * source node, minimizing the risk of overloading the memory on a
7847e2ab150SChristoph Lameter  * node that would happen if we migrated incoming memory to a node
7857e2ab150SChristoph Lameter  * before migrating outgoing memory away from that same node.
7867e2ab150SChristoph Lameter  *
7877e2ab150SChristoph Lameter  * A single scan of tmp is sufficient.  As we go, we remember the
7887e2ab150SChristoph Lameter  * most recent <s, d> pair that moved (s != d).  If we find a pair
7897e2ab150SChristoph Lameter  * that not only moved, but what's better, moved to an empty slot
7907e2ab150SChristoph Lameter  * (d is not set in tmp), then we break out with that pair.
7917e2ab150SChristoph Lameter  * Otherwise when we finish scanning tmp, we at least have the
7927e2ab150SChristoph Lameter  * most recent <s, d> pair that moved.  If we get all the way through
7937e2ab150SChristoph Lameter  * the scan of tmp without finding any node that moved, much less
7947e2ab150SChristoph Lameter  * moved to an empty node, then there is nothing left worth migrating.
7957e2ab150SChristoph Lameter  */
7967e2ab150SChristoph Lameter 
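	/*
	 * Worked example (added for exposition): migrating from nodes {0,1}
	 * to nodes {1,2} remaps 0 -> 1 and 1 -> 2.  The scan below first
	 * picks <1,2>, because destination node 2 is not among the remaining
	 * sources, so node 1 is drained into the empty node 2 before node 0
	 * is moved into the now-vacated node 1.
	 */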
7977e2ab150SChristoph Lameter 	tmp = *from_nodes;
7987e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
7997e2ab150SChristoph Lameter 		int s, d;
8007e2ab150SChristoph Lameter 		int source = -1;
8017e2ab150SChristoph Lameter 		int dest = 0;
8027e2ab150SChristoph Lameter 
8037e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
8047e2ab150SChristoph Lameter 			d = node_remap(s, *from_nodes, *to_nodes);
8057e2ab150SChristoph Lameter 			if (s == d)
8067e2ab150SChristoph Lameter 				continue;
8077e2ab150SChristoph Lameter 
8087e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
8097e2ab150SChristoph Lameter 			dest = d;
8107e2ab150SChristoph Lameter 
8117e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
8127e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
8137e2ab150SChristoph Lameter 				break;
8147e2ab150SChristoph Lameter 		}
8157e2ab150SChristoph Lameter 		if (source == -1)
8167e2ab150SChristoph Lameter 			break;
8177e2ab150SChristoph Lameter 
8187e2ab150SChristoph Lameter 		node_clear(source, tmp);
8197e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
8207e2ab150SChristoph Lameter 		if (err > 0)
8217e2ab150SChristoph Lameter 			busy += err;
8227e2ab150SChristoph Lameter 		if (err < 0)
8237e2ab150SChristoph Lameter 			break;
82439743889SChristoph Lameter 	}
8257b2259b3SChristoph Lameter out:
82639743889SChristoph Lameter 	up_read(&mm->mmap_sem);
8277e2ab150SChristoph Lameter 	if (err < 0)
8287e2ab150SChristoph Lameter 		return err;
8297e2ab150SChristoph Lameter 	return busy;
830b20a3503SChristoph Lameter 
83139743889SChristoph Lameter }
83239743889SChristoph Lameter 
8333ad33b24SLee Schermerhorn /*
8343ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
8353ad33b24SLee Schermerhorn  * Start assuming that page is mapped by vma pointed to by @private.
8363ad33b24SLee Schermerhorn  * Search forward from there, if not.  N.B., this assumes that the
8373ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
8383ad33b24SLee Schermerhorn  * is in virtual address order.
8393ad33b24SLee Schermerhorn  */
840742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
84195a402c3SChristoph Lameter {
84295a402c3SChristoph Lameter 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
8433ad33b24SLee Schermerhorn 	unsigned long uninitialized_var(address);
84495a402c3SChristoph Lameter 
8453ad33b24SLee Schermerhorn 	while (vma) {
8463ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
8473ad33b24SLee Schermerhorn 		if (address != -EFAULT)
8483ad33b24SLee Schermerhorn 			break;
8493ad33b24SLee Schermerhorn 		vma = vma->vm_next;
8503ad33b24SLee Schermerhorn 	}
8513ad33b24SLee Schermerhorn 
8523ad33b24SLee Schermerhorn 	/*
8533ad33b24SLee Schermerhorn 	 * if !vma, alloc_page_vma() will use task or system default policy
8543ad33b24SLee Schermerhorn 	 */
8553ad33b24SLee Schermerhorn 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
85695a402c3SChristoph Lameter }
857b20a3503SChristoph Lameter #else
858b20a3503SChristoph Lameter 
859b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
860b20a3503SChristoph Lameter 				unsigned long flags)
861b20a3503SChristoph Lameter {
862b20a3503SChristoph Lameter }
863b20a3503SChristoph Lameter 
864b20a3503SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
865b20a3503SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
866b20a3503SChristoph Lameter {
867b20a3503SChristoph Lameter 	return -ENOSYS;
868b20a3503SChristoph Lameter }
86995a402c3SChristoph Lameter 
87069939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
87195a402c3SChristoph Lameter {
87295a402c3SChristoph Lameter 	return NULL;
87395a402c3SChristoph Lameter }
874b20a3503SChristoph Lameter #endif
875b20a3503SChristoph Lameter 
876dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
877028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
878028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
8796ce3c4c0SChristoph Lameter {
8806ce3c4c0SChristoph Lameter 	struct vm_area_struct *vma;
8816ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
8826ce3c4c0SChristoph Lameter 	struct mempolicy *new;
8836ce3c4c0SChristoph Lameter 	unsigned long end;
8846ce3c4c0SChristoph Lameter 	int err;
8856ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
8866ce3c4c0SChristoph Lameter 
887a3b51e01SDavid Rientjes 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
8886ce3c4c0SChristoph Lameter 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
8896ce3c4c0SChristoph Lameter 		return -EINVAL;
89074c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
8916ce3c4c0SChristoph Lameter 		return -EPERM;
8926ce3c4c0SChristoph Lameter 
8936ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
8946ce3c4c0SChristoph Lameter 		return -EINVAL;
8956ce3c4c0SChristoph Lameter 
8966ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
8976ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
8986ce3c4c0SChristoph Lameter 
8996ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
9006ce3c4c0SChristoph Lameter 	end = start + len;
9016ce3c4c0SChristoph Lameter 
9026ce3c4c0SChristoph Lameter 	if (end < start)
9036ce3c4c0SChristoph Lameter 		return -EINVAL;
9046ce3c4c0SChristoph Lameter 	if (end == start)
9056ce3c4c0SChristoph Lameter 		return 0;
9066ce3c4c0SChristoph Lameter 
907028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
9086ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
9096ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
9106ce3c4c0SChristoph Lameter 
9116ce3c4c0SChristoph Lameter 	/*
9126ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operation
9136ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
9146ce3c4c0SChristoph Lameter 	 */
9156ce3c4c0SChristoph Lameter 	if (!new)
9166ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
9176ce3c4c0SChristoph Lameter 
918028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
919028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
920028fec41SDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : -1);
9216ce3c4c0SChristoph Lameter 
9226ce3c4c0SChristoph Lameter 	down_write(&mm->mmap_sem);
9236ce3c4c0SChristoph Lameter 	vma = check_range(mm, start, end, nmask,
9246ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
9256ce3c4c0SChristoph Lameter 
9266ce3c4c0SChristoph Lameter 	err = PTR_ERR(vma);
9276ce3c4c0SChristoph Lameter 	if (!IS_ERR(vma)) {
9286ce3c4c0SChristoph Lameter 		int nr_failed = 0;
9296ce3c4c0SChristoph Lameter 
9306ce3c4c0SChristoph Lameter 		err = mbind_range(vma, start, end, new);
9317e2ab150SChristoph Lameter 
9326ce3c4c0SChristoph Lameter 		if (!list_empty(&pagelist))
93395a402c3SChristoph Lameter 			nr_failed = migrate_pages(&pagelist, new_vma_page,
93495a402c3SChristoph Lameter 						(unsigned long)vma);
9356ce3c4c0SChristoph Lameter 
9366ce3c4c0SChristoph Lameter 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
9376ce3c4c0SChristoph Lameter 			err = -EIO;
9386ce3c4c0SChristoph Lameter 	}
939b20a3503SChristoph Lameter 
9406ce3c4c0SChristoph Lameter 	up_write(&mm->mmap_sem);
9416ce3c4c0SChristoph Lameter 	mpol_free(new);
9426ce3c4c0SChristoph Lameter 	return err;
9436ce3c4c0SChristoph Lameter }
9446ce3c4c0SChristoph Lameter 
94539743889SChristoph Lameter /*
9468bccd85fSChristoph Lameter  * User space interface with variable-sized bitmaps for nodelists.
9478bccd85fSChristoph Lameter  */
9488bccd85fSChristoph Lameter 
9498bccd85fSChristoph Lameter /* Copy a node mask from user space. */
95039743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
9518bccd85fSChristoph Lameter 		     unsigned long maxnode)
9528bccd85fSChristoph Lameter {
9538bccd85fSChristoph Lameter 	unsigned long k;
9548bccd85fSChristoph Lameter 	unsigned long nlongs;
9558bccd85fSChristoph Lameter 	unsigned long endmask;
9568bccd85fSChristoph Lameter 
9578bccd85fSChristoph Lameter 	--maxnode;
9588bccd85fSChristoph Lameter 	nodes_clear(*nodes);
9598bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
9608bccd85fSChristoph Lameter 		return 0;
961a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
962636f13c1SChris Wright 		return -EINVAL;
9638bccd85fSChristoph Lameter 
9648bccd85fSChristoph Lameter 	nlongs = BITS_TO_LONGS(maxnode);
9658bccd85fSChristoph Lameter 	if ((maxnode % BITS_PER_LONG) == 0)
9668bccd85fSChristoph Lameter 		endmask = ~0UL;
9678bccd85fSChristoph Lameter 	else
9688bccd85fSChristoph Lameter 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
9698bccd85fSChristoph Lameter 
9708bccd85fSChristoph Lameter 	/* When the user specified more nodes than supported, just check
9718bccd85fSChristoph Lameter 	   that the unsupported part is all zero. */
9728bccd85fSChristoph Lameter 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
9738bccd85fSChristoph Lameter 		if (nlongs > PAGE_SIZE/sizeof(long))
9748bccd85fSChristoph Lameter 			return -EINVAL;
9758bccd85fSChristoph Lameter 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
9768bccd85fSChristoph Lameter 			unsigned long t;
9778bccd85fSChristoph Lameter 			if (get_user(t, nmask + k))
9788bccd85fSChristoph Lameter 				return -EFAULT;
9798bccd85fSChristoph Lameter 			if (k == nlongs - 1) {
9808bccd85fSChristoph Lameter 				if (t & endmask)
9818bccd85fSChristoph Lameter 					return -EINVAL;
9828bccd85fSChristoph Lameter 			} else if (t)
9838bccd85fSChristoph Lameter 				return -EINVAL;
9848bccd85fSChristoph Lameter 		}
9858bccd85fSChristoph Lameter 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
9868bccd85fSChristoph Lameter 		endmask = ~0UL;
9878bccd85fSChristoph Lameter 	}
9888bccd85fSChristoph Lameter 
9898bccd85fSChristoph Lameter 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
9908bccd85fSChristoph Lameter 		return -EFAULT;
9918bccd85fSChristoph Lameter 	nodes_addr(*nodes)[nlongs-1] &= endmask;
9928bccd85fSChristoph Lameter 	return 0;
9938bccd85fSChristoph Lameter }
9948bccd85fSChristoph Lameter 
9958bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
9968bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
9978bccd85fSChristoph Lameter 			      nodemask_t *nodes)
9988bccd85fSChristoph Lameter {
9998bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
10008bccd85fSChristoph Lameter 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
10018bccd85fSChristoph Lameter 
10028bccd85fSChristoph Lameter 	if (copy > nbytes) {
10038bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
10048bccd85fSChristoph Lameter 			return -EINVAL;
10058bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
10068bccd85fSChristoph Lameter 			return -EFAULT;
10078bccd85fSChristoph Lameter 		copy = nbytes;
10088bccd85fSChristoph Lameter 	}
10098bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
10108bccd85fSChristoph Lameter }
10118bccd85fSChristoph Lameter 
10128bccd85fSChristoph Lameter asmlinkage long sys_mbind(unsigned long start, unsigned long len,
10138bccd85fSChristoph Lameter 			unsigned long mode,
10148bccd85fSChristoph Lameter 			unsigned long __user *nmask, unsigned long maxnode,
10158bccd85fSChristoph Lameter 			unsigned flags)
10168bccd85fSChristoph Lameter {
10178bccd85fSChristoph Lameter 	nodemask_t nodes;
10188bccd85fSChristoph Lameter 	int err;
1019028fec41SDavid Rientjes 	unsigned short mode_flags;
10208bccd85fSChristoph Lameter 
1021028fec41SDavid Rientjes 	mode_flags = mode & MPOL_MODE_FLAGS;
1022028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1023a3b51e01SDavid Rientjes 	if (mode >= MPOL_MAX)
1024a3b51e01SDavid Rientjes 		return -EINVAL;
10254c50bc01SDavid Rientjes 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
10264c50bc01SDavid Rientjes 	    (mode_flags & MPOL_F_RELATIVE_NODES))
10274c50bc01SDavid Rientjes 		return -EINVAL;
10288bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10298bccd85fSChristoph Lameter 	if (err)
10308bccd85fSChristoph Lameter 		return err;
1031028fec41SDavid Rientjes 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
10328bccd85fSChristoph Lameter }
10338bccd85fSChristoph Lameter 
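/*
 * Illustrative user-space sketch (added for exposition): the optional mode
 * flags are passed OR-ed into the mode argument and separated out again
 * above.  Constant names are as in the kernel's <linux/mempolicy.h>, and
 * addr/len describe an existing mapping:
 *
 *	unsigned long nodes = 0x5;	// nodes 0 and 2
 *	mbind(addr, len, MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
 *	      &nodes, 8 * sizeof(nodes), MPOL_MF_MOVE);
 */
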
10348bccd85fSChristoph Lameter /* Set the process memory policy */
10358bccd85fSChristoph Lameter asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
10368bccd85fSChristoph Lameter 		unsigned long maxnode)
10378bccd85fSChristoph Lameter {
10388bccd85fSChristoph Lameter 	int err;
10398bccd85fSChristoph Lameter 	nodemask_t nodes;
1040028fec41SDavid Rientjes 	unsigned short flags;
10418bccd85fSChristoph Lameter 
1042028fec41SDavid Rientjes 	flags = mode & MPOL_MODE_FLAGS;
1043028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1044028fec41SDavid Rientjes 	if ((unsigned int)mode >= MPOL_MAX)
10458bccd85fSChristoph Lameter 		return -EINVAL;
10464c50bc01SDavid Rientjes 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
10474c50bc01SDavid Rientjes 		return -EINVAL;
10488bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10498bccd85fSChristoph Lameter 	if (err)
10508bccd85fSChristoph Lameter 		return err;
1051028fec41SDavid Rientjes 	return do_set_mempolicy(mode, flags, &nodes);
10528bccd85fSChristoph Lameter }
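
A minimal user-space sketch of how the two syscalls above are typically driven, assuming libnuma's <numaif.h> wrappers (link with -lnuma) and a hypothetical two-node machine; illustrative only, not part of mempolicy.c:

	#include <numaif.h>		/* set_mempolicy(), mbind(), MPOL_* */
	#include <sys/mman.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long both = 0x3;	/* nodes 0 and 1 */
		unsigned long node0 = 0x1;	/* node 0 only */
		unsigned long maxnode = 8 * sizeof(unsigned long);
		void *p;

		/* Interleave all future allocations of this task over nodes 0-1. */
		if (set_mempolicy(MPOL_INTERLEAVE, &both, maxnode))
			perror("set_mempolicy");

		/* Bind one anonymous mapping to node 0 only. */
		p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p != MAP_FAILED &&
		    mbind(p, 1 << 20, MPOL_BIND, &node0, maxnode, 0))
			perror("mbind");
		return 0;
	}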
10538bccd85fSChristoph Lameter 
105439743889SChristoph Lameter asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
105539743889SChristoph Lameter 		const unsigned long __user *old_nodes,
105639743889SChristoph Lameter 		const unsigned long __user *new_nodes)
105739743889SChristoph Lameter {
105839743889SChristoph Lameter 	struct mm_struct *mm;
105939743889SChristoph Lameter 	struct task_struct *task;
106039743889SChristoph Lameter 	nodemask_t old;
106139743889SChristoph Lameter 	nodemask_t new;
106239743889SChristoph Lameter 	nodemask_t task_nodes;
106339743889SChristoph Lameter 	int err;
106439743889SChristoph Lameter 
106539743889SChristoph Lameter 	err = get_nodes(&old, old_nodes, maxnode);
106639743889SChristoph Lameter 	if (err)
106739743889SChristoph Lameter 		return err;
106839743889SChristoph Lameter 
106939743889SChristoph Lameter 	err = get_nodes(&new, new_nodes, maxnode);
107039743889SChristoph Lameter 	if (err)
107139743889SChristoph Lameter 		return err;
107239743889SChristoph Lameter 
107339743889SChristoph Lameter 	/* Find the mm_struct */
107439743889SChristoph Lameter 	read_lock(&tasklist_lock);
1075228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
107639743889SChristoph Lameter 	if (!task) {
107739743889SChristoph Lameter 		read_unlock(&tasklist_lock);
107839743889SChristoph Lameter 		return -ESRCH;
107939743889SChristoph Lameter 	}
108039743889SChristoph Lameter 	mm = get_task_mm(task);
108139743889SChristoph Lameter 	read_unlock(&tasklist_lock);
108239743889SChristoph Lameter 
108339743889SChristoph Lameter 	if (!mm)
108439743889SChristoph Lameter 		return -EINVAL;
108539743889SChristoph Lameter 
108639743889SChristoph Lameter 	/*
108739743889SChristoph Lameter 	 * Check if this process has the right to modify the specified
108839743889SChristoph Lameter 	 * process. The right exists if the process has administrative
10897f927fccSAlexey Dobriyan 	 * capabilities, superuser privileges, or the same
109039743889SChristoph Lameter 	 * userid as the target process.
109139743889SChristoph Lameter 	 */
109239743889SChristoph Lameter 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
109339743889SChristoph Lameter 	    (current->uid != task->suid) && (current->uid != task->uid) &&
109474c00241SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
109539743889SChristoph Lameter 		err = -EPERM;
109639743889SChristoph Lameter 		goto out;
109739743889SChristoph Lameter 	}
109839743889SChristoph Lameter 
109939743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
110039743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
110174c00241SChristoph Lameter 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
110239743889SChristoph Lameter 		err = -EPERM;
110339743889SChristoph Lameter 		goto out;
110439743889SChristoph Lameter 	}
110539743889SChristoph Lameter 
110637b07e41SLee Schermerhorn 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
11073b42d28bSChristoph Lameter 		err = -EINVAL;
11083b42d28bSChristoph Lameter 		goto out;
11093b42d28bSChristoph Lameter 	}
11103b42d28bSChristoph Lameter 
111186c3a764SDavid Quigley 	err = security_task_movememory(task);
111286c3a764SDavid Quigley 	if (err)
111386c3a764SDavid Quigley 		goto out;
111486c3a764SDavid Quigley 
1115511030bcSChristoph Lameter 	err = do_migrate_pages(mm, &old, &new,
111674c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
111739743889SChristoph Lameter out:
111839743889SChristoph Lameter 	mmput(mm);
111939743889SChristoph Lameter 	return err;
112039743889SChristoph Lameter }
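
A hedged user-space sketch of driving the syscall above through libnuma's migrate_pages() wrapper; the pid and node numbers are hypothetical and this is not part of mempolicy.c:

	#include <numaif.h>		/* migrate_pages() wrapper from libnuma */
	#include <stdio.h>
	#include <sys/types.h>

	/* Try to move every page of task @pid from node 0 to node 1. */
	static int move_task_to_node1(pid_t pid)
	{
		unsigned long from = 1UL << 0;	/* source nodes */
		unsigned long to   = 1UL << 1;	/* destination nodes */
		long ret;

		/* Returns the number of pages that could not be moved, or -1. */
		ret = migrate_pages(pid, 8 * sizeof(unsigned long), &from, &to);
		if (ret < 0)
			perror("migrate_pages");
		return ret < 0 ? -1 : 0;
	}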
112139743889SChristoph Lameter 
112239743889SChristoph Lameter 
11238bccd85fSChristoph Lameter /* Retrieve NUMA policy */
11248bccd85fSChristoph Lameter asmlinkage long sys_get_mempolicy(int __user *policy,
11258bccd85fSChristoph Lameter 				unsigned long __user *nmask,
11268bccd85fSChristoph Lameter 				unsigned long maxnode,
11278bccd85fSChristoph Lameter 				unsigned long addr, unsigned long flags)
11288bccd85fSChristoph Lameter {
1129dbcb0f19SAdrian Bunk 	int err;
1130dbcb0f19SAdrian Bunk 	int uninitialized_var(pval);
11318bccd85fSChristoph Lameter 	nodemask_t nodes;
11328bccd85fSChristoph Lameter 
11338bccd85fSChristoph Lameter 	if (nmask != NULL && maxnode < MAX_NUMNODES)
11348bccd85fSChristoph Lameter 		return -EINVAL;
11358bccd85fSChristoph Lameter 
11368bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
11378bccd85fSChristoph Lameter 
11388bccd85fSChristoph Lameter 	if (err)
11398bccd85fSChristoph Lameter 		return err;
11408bccd85fSChristoph Lameter 
11418bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
11428bccd85fSChristoph Lameter 		return -EFAULT;
11438bccd85fSChristoph Lameter 
11448bccd85fSChristoph Lameter 	if (nmask)
11458bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
11468bccd85fSChristoph Lameter 
11478bccd85fSChristoph Lameter 	return err;
11488bccd85fSChristoph Lameter }
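
For the retrieval side, a small user-space sketch, again assuming the <numaif.h> wrappers; illustrative only:

	#include <numaif.h>		/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
	#include <stdio.h>

	/* Report which node currently backs the page at @addr in this task. */
	static void print_backing_node(void *addr)
	{
		int node = -1;

		if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
			perror("get_mempolicy");
		else
			printf("%p is on node %d\n", addr, node);
	}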
11498bccd85fSChristoph Lameter 
11501da177e4SLinus Torvalds #ifdef CONFIG_COMPAT
11511da177e4SLinus Torvalds 
11521da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy,
11531da177e4SLinus Torvalds 				     compat_ulong_t __user *nmask,
11541da177e4SLinus Torvalds 				     compat_ulong_t maxnode,
11551da177e4SLinus Torvalds 				     compat_ulong_t addr, compat_ulong_t flags)
11561da177e4SLinus Torvalds {
11571da177e4SLinus Torvalds 	long err;
11581da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11591da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11601da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11611da177e4SLinus Torvalds 
11621da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11631da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
11641da177e4SLinus Torvalds 
11651da177e4SLinus Torvalds 	if (nmask)
11661da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
11671da177e4SLinus Torvalds 
11681da177e4SLinus Torvalds 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
11691da177e4SLinus Torvalds 
11701da177e4SLinus Torvalds 	if (!err && nmask) {
11711da177e4SLinus Torvalds 		err = copy_from_user(bm, nm, alloc_size);
11721da177e4SLinus Torvalds 		/* ensure entire bitmap is zeroed */
11731da177e4SLinus Torvalds 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
11741da177e4SLinus Torvalds 		err |= compat_put_bitmap(nmask, bm, nr_bits);
11751da177e4SLinus Torvalds 	}
11761da177e4SLinus Torvalds 
11771da177e4SLinus Torvalds 	return err;
11781da177e4SLinus Torvalds }
11791da177e4SLinus Torvalds 
11801da177e4SLinus Torvalds asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
11811da177e4SLinus Torvalds 				     compat_ulong_t maxnode)
11821da177e4SLinus Torvalds {
11831da177e4SLinus Torvalds 	long err = 0;
11841da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11851da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11861da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11871da177e4SLinus Torvalds 
11881da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11891da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
11901da177e4SLinus Torvalds 
11911da177e4SLinus Torvalds 	if (nmask) {
11921da177e4SLinus Torvalds 		err = compat_get_bitmap(bm, nmask, nr_bits);
11931da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
11941da177e4SLinus Torvalds 		err |= copy_to_user(nm, bm, alloc_size);
11951da177e4SLinus Torvalds 	}
11961da177e4SLinus Torvalds 
11971da177e4SLinus Torvalds 	if (err)
11981da177e4SLinus Torvalds 		return -EFAULT;
11991da177e4SLinus Torvalds 
12001da177e4SLinus Torvalds 	return sys_set_mempolicy(mode, nm, nr_bits+1);
12011da177e4SLinus Torvalds }
12021da177e4SLinus Torvalds 
12031da177e4SLinus Torvalds asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
12041da177e4SLinus Torvalds 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
12051da177e4SLinus Torvalds 			     compat_ulong_t maxnode, compat_ulong_t flags)
12061da177e4SLinus Torvalds {
12071da177e4SLinus Torvalds 	long err = 0;
12081da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12091da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
1210dfcd3c0dSAndi Kleen 	nodemask_t bm;
12111da177e4SLinus Torvalds 
12121da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12131da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12141da177e4SLinus Torvalds 
12151da177e4SLinus Torvalds 	if (nmask) {
1216dfcd3c0dSAndi Kleen 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
12171da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
1218dfcd3c0dSAndi Kleen 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
12191da177e4SLinus Torvalds 	}
12201da177e4SLinus Torvalds 
12211da177e4SLinus Torvalds 	if (err)
12221da177e4SLinus Torvalds 		return -EFAULT;
12231da177e4SLinus Torvalds 
12241da177e4SLinus Torvalds 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
12251da177e4SLinus Torvalds }
12261da177e4SLinus Torvalds 
12271da177e4SLinus Torvalds #endif
12281da177e4SLinus Torvalds 
1229480eccf9SLee Schermerhorn /*
1230480eccf9SLee Schermerhorn  * get_vma_policy(@task, @vma, @addr)
1231480eccf9SLee Schermerhorn  * @task - task for fallback if vma policy == default
1232480eccf9SLee Schermerhorn  * @vma   - virtual memory area whose policy is sought
1233480eccf9SLee Schermerhorn  * @addr  - address in @vma for shared policy lookup
1234480eccf9SLee Schermerhorn  *
1235480eccf9SLee Schermerhorn  * Returns the effective policy for a VMA at the specified address.
1236480eccf9SLee Schermerhorn  * Falls back to @task or the system default policy, as necessary.
1237480eccf9SLee Schermerhorn  * The returned policy has an extra reference count if it is a shared,
1238480eccf9SLee Schermerhorn  * vma, or another task's policy [show_numa_maps() can pass
1239480eccf9SLee Schermerhorn  * @task != current].  It is the caller's responsibility to
1240480eccf9SLee Schermerhorn  * free the reference in those cases.
1241480eccf9SLee Schermerhorn  */
124248fce342SChristoph Lameter static struct mempolicy * get_vma_policy(struct task_struct *task,
124348fce342SChristoph Lameter 		struct vm_area_struct *vma, unsigned long addr)
12441da177e4SLinus Torvalds {
12456e21c8f1SChristoph Lameter 	struct mempolicy *pol = task->mempolicy;
1246480eccf9SLee Schermerhorn 	int shared_pol = 0;
12471da177e4SLinus Torvalds 
12481da177e4SLinus Torvalds 	if (vma) {
1249480eccf9SLee Schermerhorn 		if (vma->vm_ops && vma->vm_ops->get_policy) {
12501da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
1251480eccf9SLee Schermerhorn 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1252480eccf9SLee Schermerhorn 		} else if (vma->vm_policy &&
12531da177e4SLinus Torvalds 				vma->vm_policy->policy != MPOL_DEFAULT)
12541da177e4SLinus Torvalds 			pol = vma->vm_policy;
12551da177e4SLinus Torvalds 	}
12561da177e4SLinus Torvalds 	if (!pol)
12571da177e4SLinus Torvalds 		pol = &default_policy;
1258480eccf9SLee Schermerhorn 	else if (!shared_pol && pol != current->mempolicy)
1259480eccf9SLee Schermerhorn 		mpol_get(pol);	/* vma or other task's policy */
12601da177e4SLinus Torvalds 	return pol;
12611da177e4SLinus Torvalds }
12621da177e4SLinus Torvalds 
126319770b32SMel Gorman /* Return a nodemask representing a mempolicy */
126419770b32SMel Gorman static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
126519770b32SMel Gorman {
126619770b32SMel Gorman 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
126719770b32SMel Gorman 	if (unlikely(policy->policy == MPOL_BIND) &&
126819770b32SMel Gorman 			gfp_zone(gfp) >= policy_zone &&
126919770b32SMel Gorman 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
127019770b32SMel Gorman 		return &policy->v.nodes;
127119770b32SMel Gorman 
127219770b32SMel Gorman 	return NULL;
127319770b32SMel Gorman }
127419770b32SMel Gorman 
12751da177e4SLinus Torvalds /* Return a zonelist representing a mempolicy */
1276dd0fc66fSAl Viro static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
12771da177e4SLinus Torvalds {
12781da177e4SLinus Torvalds 	int nd;
12791da177e4SLinus Torvalds 
12801da177e4SLinus Torvalds 	switch (policy->policy) {
12811da177e4SLinus Torvalds 	case MPOL_PREFERRED:
12821da177e4SLinus Torvalds 		nd = policy->v.preferred_node;
12831da177e4SLinus Torvalds 		if (nd < 0)
12841da177e4SLinus Torvalds 			nd = numa_node_id();
12851da177e4SLinus Torvalds 		break;
12861da177e4SLinus Torvalds 	case MPOL_BIND:
128719770b32SMel Gorman 		/*
128819770b32SMel Gorman 		 * Normally, MPOL_BIND allocations are node-local within the
128919770b32SMel Gorman 		 * allowed nodemask. However, if __GFP_THISNODE is set and the
129019770b32SMel Gorman 		 * current node is not part of the mask, we use the zonelist
129119770b32SMel Gorman 		 * for the first node in the mask instead.
129219770b32SMel Gorman 		 */
129319770b32SMel Gorman 		nd = numa_node_id();
129419770b32SMel Gorman 		if (unlikely(gfp & __GFP_THISNODE) &&
129519770b32SMel Gorman 				unlikely(!node_isset(nd, policy->v.nodes)))
129619770b32SMel Gorman 			nd = first_node(policy->v.nodes);
129719770b32SMel Gorman 		break;
12981da177e4SLinus Torvalds 	case MPOL_INTERLEAVE: /* should not happen */
12991da177e4SLinus Torvalds 	case MPOL_DEFAULT:
13001da177e4SLinus Torvalds 		nd = numa_node_id();
13011da177e4SLinus Torvalds 		break;
13021da177e4SLinus Torvalds 	default:
13031da177e4SLinus Torvalds 		nd = 0;
13041da177e4SLinus Torvalds 		BUG();
13051da177e4SLinus Torvalds 	}
13060e88460dSMel Gorman 	return node_zonelist(nd, gfp);
13071da177e4SLinus Torvalds }
13081da177e4SLinus Torvalds 
13091da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
13101da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
13111da177e4SLinus Torvalds {
13121da177e4SLinus Torvalds 	unsigned nid, next;
13131da177e4SLinus Torvalds 	struct task_struct *me = current;
13141da177e4SLinus Torvalds 
13151da177e4SLinus Torvalds 	nid = me->il_next;
1316dfcd3c0dSAndi Kleen 	next = next_node(nid, policy->v.nodes);
13171da177e4SLinus Torvalds 	if (next >= MAX_NUMNODES)
1318dfcd3c0dSAndi Kleen 		next = first_node(policy->v.nodes);
1319f5b087b5SDavid Rientjes 	if (next < MAX_NUMNODES)
13201da177e4SLinus Torvalds 		me->il_next = next;
13211da177e4SLinus Torvalds 	return nid;
13221da177e4SLinus Torvalds }
13231da177e4SLinus Torvalds 
1324dc85da15SChristoph Lameter /*
1325dc85da15SChristoph Lameter  * Depending on the memory policy, provide a node from which to allocate the
1326dc85da15SChristoph Lameter  * next slab entry.
1327dc85da15SChristoph Lameter  */
1328dc85da15SChristoph Lameter unsigned slab_node(struct mempolicy *policy)
1329dc85da15SChristoph Lameter {
1330a3b51e01SDavid Rientjes 	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1331765c4507SChristoph Lameter 
1332765c4507SChristoph Lameter 	switch (pol) {
1333dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1334dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1335dc85da15SChristoph Lameter 
1336dd1a239fSMel Gorman 	case MPOL_BIND: {
1337dc85da15SChristoph Lameter 		/*
1338dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1339dc85da15SChristoph Lameter 		 * first node.
1340dc85da15SChristoph Lameter 		 */
134119770b32SMel Gorman 		struct zonelist *zonelist;
134219770b32SMel Gorman 		struct zone *zone;
134319770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
134419770b32SMel Gorman 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
134519770b32SMel Gorman 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
134619770b32SMel Gorman 							&policy->v.nodes,
134719770b32SMel Gorman 							&zone);
134819770b32SMel Gorman 		return zone->node;
1349dd1a239fSMel Gorman 	}
1350dc85da15SChristoph Lameter 
1351dc85da15SChristoph Lameter 	case MPOL_PREFERRED:
1352dc85da15SChristoph Lameter 		if (policy->v.preferred_node >= 0)
1353dc85da15SChristoph Lameter 			return policy->v.preferred_node;
1354dc85da15SChristoph Lameter 		/* Fall through */
1355dc85da15SChristoph Lameter 
1356dc85da15SChristoph Lameter 	default:
1357dc85da15SChristoph Lameter 		return numa_node_id();
1358dc85da15SChristoph Lameter 	}
1359dc85da15SChristoph Lameter }
1360dc85da15SChristoph Lameter 
13611da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. */
13621da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol,
13631da177e4SLinus Torvalds 		struct vm_area_struct *vma, unsigned long off)
13641da177e4SLinus Torvalds {
1365dfcd3c0dSAndi Kleen 	unsigned nnodes = nodes_weight(pol->v.nodes);
1366f5b087b5SDavid Rientjes 	unsigned target;
13671da177e4SLinus Torvalds 	int c;
13681da177e4SLinus Torvalds 	int nid = -1;
13691da177e4SLinus Torvalds 
1370f5b087b5SDavid Rientjes 	if (!nnodes)
1371f5b087b5SDavid Rientjes 		return numa_node_id();
1372f5b087b5SDavid Rientjes 	target = (unsigned int)off % nnodes;
13731da177e4SLinus Torvalds 	c = 0;
13741da177e4SLinus Torvalds 	do {
1375dfcd3c0dSAndi Kleen 		nid = next_node(nid, pol->v.nodes);
13761da177e4SLinus Torvalds 		c++;
13771da177e4SLinus Torvalds 	} while (c <= target);
13781da177e4SLinus Torvalds 	return nid;
13791da177e4SLinus Torvalds }
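
A short worked example of the arithmetic above, using a hypothetical interleave mask; added only for illustration:

	/*
	 * Example: pol->v.nodes = {0,2,4} and off = 5.
	 *   nnodes = 3, target = 5 % 3 = 2
	 *   loop:  nid = 0 (c = 1), nid = 2 (c = 2), nid = 4 (c = 3 > target)
	 *   => returns node 4, i.e. the (target+1)-th set node of the mask.
	 */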
13801da177e4SLinus Torvalds 
13815da7ca86SChristoph Lameter /* Determine a node number for interleave */
13825da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
13835da7ca86SChristoph Lameter 		 struct vm_area_struct *vma, unsigned long addr, int shift)
13845da7ca86SChristoph Lameter {
13855da7ca86SChristoph Lameter 	if (vma) {
13865da7ca86SChristoph Lameter 		unsigned long off;
13875da7ca86SChristoph Lameter 
13883b98b087SNishanth Aravamudan 		/*
13893b98b087SNishanth Aravamudan 		 * for small pages, there is no difference between
13903b98b087SNishanth Aravamudan 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
13913b98b087SNishanth Aravamudan 		 * for huge pages, since vm_pgoff is in units of small
13923b98b087SNishanth Aravamudan 		 * pages, we need to shift off the always 0 bits to get
13933b98b087SNishanth Aravamudan 		 * a useful offset.
13943b98b087SNishanth Aravamudan 		 */
13953b98b087SNishanth Aravamudan 		BUG_ON(shift < PAGE_SHIFT);
13963b98b087SNishanth Aravamudan 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
13975da7ca86SChristoph Lameter 		off += (addr - vma->vm_start) >> shift;
13985da7ca86SChristoph Lameter 		return offset_il_node(pol, vma, off);
13995da7ca86SChristoph Lameter 	} else
14005da7ca86SChristoph Lameter 		return interleave_nodes(pol);
14015da7ca86SChristoph Lameter }
14025da7ca86SChristoph Lameter 
140300ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
1404480eccf9SLee Schermerhorn /*
1405480eccf9SLee Schermerhorn  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1406480eccf9SLee Schermerhorn  * @vma = virtual memory area whose policy is sought
1407480eccf9SLee Schermerhorn  * @addr = address in @vma for shared policy lookup and interleave policy
1408480eccf9SLee Schermerhorn  * @gfp_flags = for requested zone
140919770b32SMel Gorman  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
141019770b32SMel Gorman  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1411480eccf9SLee Schermerhorn  *
1412480eccf9SLee Schermerhorn  * Returns a zonelist suitable for a huge page allocation.
141319770b32SMel Gorman  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
141419770b32SMel Gorman  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1415480eccf9SLee Schermerhorn  * If it is also a policy for which get_vma_policy() returns an extra
141619770b32SMel Gorman  * reference, we must hold that reference until after the allocation.
1417480eccf9SLee Schermerhorn  * In that case, return policy via @mpol so hugetlb allocation can drop
1418480eccf9SLee Schermerhorn  * the reference. For non-'BIND referenced policies, we can/do drop the
1419480eccf9SLee Schermerhorn  * reference here, so the caller doesn't need to know about the special case
1420480eccf9SLee Schermerhorn  * for default and current task policy.
1421480eccf9SLee Schermerhorn  */
1422396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
142319770b32SMel Gorman 				gfp_t gfp_flags, struct mempolicy **mpol,
142419770b32SMel Gorman 				nodemask_t **nodemask)
14255da7ca86SChristoph Lameter {
14265da7ca86SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1427480eccf9SLee Schermerhorn 	struct zonelist *zl;
14285da7ca86SChristoph Lameter 
1429480eccf9SLee Schermerhorn 	*mpol = NULL;		/* probably no unref needed */
143019770b32SMel Gorman 	*nodemask = NULL;	/* assume !MPOL_BIND */
143119770b32SMel Gorman 	if (pol->policy == MPOL_BIND) {
143219770b32SMel Gorman 			*nodemask = &pol->v.nodes;
143319770b32SMel Gorman 	} else if (pol->policy == MPOL_INTERLEAVE) {
14345da7ca86SChristoph Lameter 		unsigned nid;
14355da7ca86SChristoph Lameter 
14365da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
143769682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
143869682d85SLee Schermerhorn 				pol != current->mempolicy))
1439480eccf9SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
14400e88460dSMel Gorman 		return node_zonelist(nid, gfp_flags);
14415da7ca86SChristoph Lameter 	}
1442480eccf9SLee Schermerhorn 
1443480eccf9SLee Schermerhorn 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1444480eccf9SLee Schermerhorn 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1445480eccf9SLee Schermerhorn 		if (pol->policy != MPOL_BIND)
1446480eccf9SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
1447480eccf9SLee Schermerhorn 		else
1448480eccf9SLee Schermerhorn 			*mpol = pol;	/* unref needed after allocation */
1449480eccf9SLee Schermerhorn 	}
1450480eccf9SLee Schermerhorn 	return zl;
14515da7ca86SChristoph Lameter }
145200ac59adSChen, Kenneth W #endif
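
A schematic caller of huge_zonelist(), showing the unref contract described in the comment block above; this is a hedged sketch of what a hugetlb allocator might do, not the actual code in hugetlb.c:

	static struct page *example_hugepage_alloc(struct vm_area_struct *vma,
						   unsigned long addr)
	{
		struct mempolicy *mpol;
		nodemask_t *nodemask;
		struct zonelist *zl;
		struct page *page;

		zl = huge_zonelist(vma, addr, GFP_HIGHUSER, &mpol, &nodemask);
		page = __alloc_pages_nodemask(GFP_HIGHUSER, HUGETLB_PAGE_ORDER,
					      zl, nodemask);
		mpol_free(mpol);	/* drops the ref only when @mpol was set */
		return page;
	}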
14535da7ca86SChristoph Lameter 
14541da177e4SLinus Torvalds /* Allocate a page in interleaved policy.
14551da177e4SLinus Torvalds    Own path because it needs to do special accounting. */
1456662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1457662f3a0bSAndi Kleen 					unsigned nid)
14581da177e4SLinus Torvalds {
14591da177e4SLinus Torvalds 	struct zonelist *zl;
14601da177e4SLinus Torvalds 	struct page *page;
14611da177e4SLinus Torvalds 
14620e88460dSMel Gorman 	zl = node_zonelist(nid, gfp);
14631da177e4SLinus Torvalds 	page = __alloc_pages(gfp, order, zl);
1464dd1a239fSMel Gorman 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1465ca889e6cSChristoph Lameter 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
14661da177e4SLinus Torvalds 	return page;
14671da177e4SLinus Torvalds }
14681da177e4SLinus Torvalds 
14691da177e4SLinus Torvalds /**
14701da177e4SLinus Torvalds  * 	alloc_page_vma	- Allocate a page for a VMA.
14711da177e4SLinus Torvalds  *
14721da177e4SLinus Torvalds  * 	@gfp:
14731da177e4SLinus Torvalds  *      %GFP_USER    user allocation.
14741da177e4SLinus Torvalds  *      %GFP_KERNEL  kernel allocations,
14751da177e4SLinus Torvalds  *      %GFP_HIGHMEM highmem/user allocations,
14761da177e4SLinus Torvalds  *      %GFP_FS      allocation should not call back into a file system.
14771da177e4SLinus Torvalds  *      %GFP_ATOMIC  don't sleep.
14781da177e4SLinus Torvalds  *
14791da177e4SLinus Torvalds  * 	@vma:  Pointer to VMA or NULL if not available.
14801da177e4SLinus Torvalds  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
14811da177e4SLinus Torvalds  *
14821da177e4SLinus Torvalds  * 	This function allocates a page from the kernel page pool and applies
14831da177e4SLinus Torvalds  *	a NUMA policy associated with the VMA or the current process.
14841da177e4SLinus Torvalds  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
14851da177e4SLinus Torvalds  *	mm_struct of the VMA to prevent it from going away. Should be used for
14861da177e4SLinus Torvalds  *	all allocations for pages that will be mapped into
14871da177e4SLinus Torvalds  * 	user space. Returns NULL when no page can be allocated.
14881da177e4SLinus Torvalds  *
14891da177e4SLinus Torvalds  *	Should be called with the mmap_sem of the vma held.
14901da177e4SLinus Torvalds  */
14911da177e4SLinus Torvalds struct page *
1492dd0fc66fSAl Viro alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
14931da177e4SLinus Torvalds {
14946e21c8f1SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1495480eccf9SLee Schermerhorn 	struct zonelist *zl;
14961da177e4SLinus Torvalds 
1497cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
14981da177e4SLinus Torvalds 
14991da177e4SLinus Torvalds 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
15001da177e4SLinus Torvalds 		unsigned nid;
15015da7ca86SChristoph Lameter 
15025da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
150369682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
150469682d85SLee Schermerhorn 				pol != current->mempolicy))
150569682d85SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
15061da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, 0, nid);
15071da177e4SLinus Torvalds 	}
1508480eccf9SLee Schermerhorn 	zl = zonelist_policy(gfp, pol);
1509480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy) {
1510480eccf9SLee Schermerhorn 		/*
1511480eccf9SLee Schermerhorn 		 * slow path: ref counted policy -- shared or vma
1512480eccf9SLee Schermerhorn 		 */
151319770b32SMel Gorman 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
151419770b32SMel Gorman 						zl, nodemask_policy(gfp, pol));
1515480eccf9SLee Schermerhorn 		__mpol_free(pol);
1516480eccf9SLee Schermerhorn 		return page;
1517480eccf9SLee Schermerhorn 	}
1518480eccf9SLee Schermerhorn 	/*
1519480eccf9SLee Schermerhorn 	 * fast path:  default or task policy
1520480eccf9SLee Schermerhorn 	 */
152119770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
15221da177e4SLinus Torvalds }
15231da177e4SLinus Torvalds 
15241da177e4SLinus Torvalds /**
15251da177e4SLinus Torvalds  * 	alloc_pages_current - Allocate pages.
15261da177e4SLinus Torvalds  *
15271da177e4SLinus Torvalds  *	@gfp:
15281da177e4SLinus Torvalds  *		%GFP_USER   user allocation,
15291da177e4SLinus Torvalds  *      	%GFP_KERNEL kernel allocation,
15301da177e4SLinus Torvalds  *      	%GFP_HIGHMEM highmem allocation,
15311da177e4SLinus Torvalds  *      	%GFP_FS     don't call back into a file system.
15321da177e4SLinus Torvalds  *      	%GFP_ATOMIC don't sleep.
15331da177e4SLinus Torvalds  *	@order: Power of two of allocation size in pages. 0 is a single page.
15341da177e4SLinus Torvalds  *
15351da177e4SLinus Torvalds  *	Allocate a page from the kernel page pool and, when not in
15361da177e4SLinus Torvalds  *	interrupt context, apply the current process' NUMA policy.
15371da177e4SLinus Torvalds  *	Returns NULL when no page can be allocated.
15381da177e4SLinus Torvalds  *
1539cf2a473cSPaul Jackson  *	Don't call cpuset_update_task_memory_state() unless
15401da177e4SLinus Torvalds  *	1) it's ok to take cpuset_sem (can WAIT), and
15411da177e4SLinus Torvalds  *	2) allocating for current task (not interrupt).
15421da177e4SLinus Torvalds  */
1543dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order)
15441da177e4SLinus Torvalds {
15451da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
15461da177e4SLinus Torvalds 
15471da177e4SLinus Torvalds 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1548cf2a473cSPaul Jackson 		cpuset_update_task_memory_state();
15499b819d20SChristoph Lameter 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
15501da177e4SLinus Torvalds 		pol = &default_policy;
15511da177e4SLinus Torvalds 	if (pol->policy == MPOL_INTERLEAVE)
15521da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
155319770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, order,
155419770b32SMel Gorman 			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
15551da177e4SLinus Torvalds }
15561da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current);
15571da177e4SLinus Torvalds 
15584225399aSPaul Jackson /*
15594225399aSPaul Jackson  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
15604225399aSPaul Jackson  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
15614225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
15624225399aSPaul Jackson  * keeps mempolicies cpuset-relative after the task's cpuset moves.  See
15634225399aSPaul Jackson  * further kernel/cpuset.c update_nodemask().
15644225399aSPaul Jackson  */
15654225399aSPaul Jackson 
15661da177e4SLinus Torvalds /* Slow path of a mempolicy copy */
15671da177e4SLinus Torvalds struct mempolicy *__mpol_copy(struct mempolicy *old)
15681da177e4SLinus Torvalds {
15691da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
15701da177e4SLinus Torvalds 
15711da177e4SLinus Torvalds 	if (!new)
15721da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
15734225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
15744225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
15754225399aSPaul Jackson 		mpol_rebind_policy(old, &mems);
15764225399aSPaul Jackson 	}
15771da177e4SLinus Torvalds 	*new = *old;
15781da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
15791da177e4SLinus Torvalds 	return new;
15801da177e4SLinus Torvalds }
15811da177e4SLinus Torvalds 
1582f5b087b5SDavid Rientjes static int mpol_match_intent(const struct mempolicy *a,
1583f5b087b5SDavid Rientjes 			     const struct mempolicy *b)
1584f5b087b5SDavid Rientjes {
1585f5b087b5SDavid Rientjes 	if (a->flags != b->flags)
1586f5b087b5SDavid Rientjes 		return 0;
1587f5b087b5SDavid Rientjes 	if (!mpol_store_user_nodemask(a))
1588f5b087b5SDavid Rientjes 		return 1;
1589f5b087b5SDavid Rientjes 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1590f5b087b5SDavid Rientjes }
1591f5b087b5SDavid Rientjes 
15921da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
15931da177e4SLinus Torvalds int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
15941da177e4SLinus Torvalds {
15951da177e4SLinus Torvalds 	if (!a || !b)
15961da177e4SLinus Torvalds 		return 0;
15971da177e4SLinus Torvalds 	if (a->policy != b->policy)
15981da177e4SLinus Torvalds 		return 0;
1599f5b087b5SDavid Rientjes 	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1600f5b087b5SDavid Rientjes 		return 0;
16011da177e4SLinus Torvalds 	switch (a->policy) {
16021da177e4SLinus Torvalds 	case MPOL_DEFAULT:
16031da177e4SLinus Torvalds 		return 1;
160419770b32SMel Gorman 	case MPOL_BIND:
160519770b32SMel Gorman 		/* Fall through */
16061da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
1607dfcd3c0dSAndi Kleen 		return nodes_equal(a->v.nodes, b->v.nodes);
16081da177e4SLinus Torvalds 	case MPOL_PREFERRED:
16091da177e4SLinus Torvalds 		return a->v.preferred_node == b->v.preferred_node;
16101da177e4SLinus Torvalds 	default:
16111da177e4SLinus Torvalds 		BUG();
16121da177e4SLinus Torvalds 		return 0;
16131da177e4SLinus Torvalds 	}
16141da177e4SLinus Torvalds }
16151da177e4SLinus Torvalds 
16161da177e4SLinus Torvalds /* Slow path of a mpol destructor. */
16171da177e4SLinus Torvalds void __mpol_free(struct mempolicy *p)
16181da177e4SLinus Torvalds {
16191da177e4SLinus Torvalds 	if (!atomic_dec_and_test(&p->refcnt))
16201da177e4SLinus Torvalds 		return;
16211da177e4SLinus Torvalds 	p->policy = MPOL_DEFAULT;
16221da177e4SLinus Torvalds 	kmem_cache_free(policy_cache, p);
16231da177e4SLinus Torvalds }
16241da177e4SLinus Torvalds 
16251da177e4SLinus Torvalds /*
16261da177e4SLinus Torvalds  * Shared memory backing store policy support.
16271da177e4SLinus Torvalds  *
16281da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
16291da177e4SLinus Torvalds  * The policies are kept in a red-black tree linked from the inode.
16301da177e4SLinus Torvalds  * They are protected by the sp->lock spinlock, which should be held
16311da177e4SLinus Torvalds  * for any accesses to the tree.
16321da177e4SLinus Torvalds  */
16331da177e4SLinus Torvalds 
16341da177e4SLinus Torvalds /* lookup first element intersecting start-end */
16351da177e4SLinus Torvalds /* Caller holds sp->lock */
16361da177e4SLinus Torvalds static struct sp_node *
16371da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
16381da177e4SLinus Torvalds {
16391da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
16401da177e4SLinus Torvalds 
16411da177e4SLinus Torvalds 	while (n) {
16421da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
16431da177e4SLinus Torvalds 
16441da177e4SLinus Torvalds 		if (start >= p->end)
16451da177e4SLinus Torvalds 			n = n->rb_right;
16461da177e4SLinus Torvalds 		else if (end <= p->start)
16471da177e4SLinus Torvalds 			n = n->rb_left;
16481da177e4SLinus Torvalds 		else
16491da177e4SLinus Torvalds 			break;
16501da177e4SLinus Torvalds 	}
16511da177e4SLinus Torvalds 	if (!n)
16521da177e4SLinus Torvalds 		return NULL;
16531da177e4SLinus Torvalds 	for (;;) {
16541da177e4SLinus Torvalds 		struct sp_node *w = NULL;
16551da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
16561da177e4SLinus Torvalds 		if (!prev)
16571da177e4SLinus Torvalds 			break;
16581da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
16591da177e4SLinus Torvalds 		if (w->end <= start)
16601da177e4SLinus Torvalds 			break;
16611da177e4SLinus Torvalds 		n = prev;
16621da177e4SLinus Torvalds 	}
16631da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
16641da177e4SLinus Torvalds }
16651da177e4SLinus Torvalds 
16661da177e4SLinus Torvalds /* Insert a new shared policy into the list. */
16671da177e4SLinus Torvalds /* Caller holds sp->lock */
16681da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
16691da177e4SLinus Torvalds {
16701da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
16711da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
16721da177e4SLinus Torvalds 	struct sp_node *nd;
16731da177e4SLinus Torvalds 
16741da177e4SLinus Torvalds 	while (*p) {
16751da177e4SLinus Torvalds 		parent = *p;
16761da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
16771da177e4SLinus Torvalds 		if (new->start < nd->start)
16781da177e4SLinus Torvalds 			p = &(*p)->rb_left;
16791da177e4SLinus Torvalds 		else if (new->end > nd->end)
16801da177e4SLinus Torvalds 			p = &(*p)->rb_right;
16811da177e4SLinus Torvalds 		else
16821da177e4SLinus Torvalds 			BUG();
16831da177e4SLinus Torvalds 	}
16841da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
16851da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
1686140d5a49SPaul Mundt 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
16871da177e4SLinus Torvalds 		 new->policy ? new->policy->policy : 0);
16881da177e4SLinus Torvalds }
16891da177e4SLinus Torvalds 
16901da177e4SLinus Torvalds /* Find shared policy intersecting idx */
16911da177e4SLinus Torvalds struct mempolicy *
16921da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
16931da177e4SLinus Torvalds {
16941da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
16951da177e4SLinus Torvalds 	struct sp_node *sn;
16961da177e4SLinus Torvalds 
16971da177e4SLinus Torvalds 	if (!sp->root.rb_node)
16981da177e4SLinus Torvalds 		return NULL;
16991da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17001da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
17011da177e4SLinus Torvalds 	if (sn) {
17021da177e4SLinus Torvalds 		mpol_get(sn->policy);
17031da177e4SLinus Torvalds 		pol = sn->policy;
17041da177e4SLinus Torvalds 	}
17051da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17061da177e4SLinus Torvalds 	return pol;
17071da177e4SLinus Torvalds }
17081da177e4SLinus Torvalds 
17091da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
17101da177e4SLinus Torvalds {
1711140d5a49SPaul Mundt 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
17121da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
17131da177e4SLinus Torvalds 	mpol_free(n->policy);
17141da177e4SLinus Torvalds 	kmem_cache_free(sn_cache, n);
17151da177e4SLinus Torvalds }
17161da177e4SLinus Torvalds 
1717dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1718dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
17191da177e4SLinus Torvalds {
17201da177e4SLinus Torvalds 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
17211da177e4SLinus Torvalds 
17221da177e4SLinus Torvalds 	if (!n)
17231da177e4SLinus Torvalds 		return NULL;
17241da177e4SLinus Torvalds 	n->start = start;
17251da177e4SLinus Torvalds 	n->end = end;
17261da177e4SLinus Torvalds 	mpol_get(pol);
17271da177e4SLinus Torvalds 	n->policy = pol;
17281da177e4SLinus Torvalds 	return n;
17291da177e4SLinus Torvalds }
17301da177e4SLinus Torvalds 
17311da177e4SLinus Torvalds /* Replace a policy range. */
17321da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
17331da177e4SLinus Torvalds 				 unsigned long end, struct sp_node *new)
17341da177e4SLinus Torvalds {
17351da177e4SLinus Torvalds 	struct sp_node *n, *new2 = NULL;
17361da177e4SLinus Torvalds 
17371da177e4SLinus Torvalds restart:
17381da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17391da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
17401da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
17411da177e4SLinus Torvalds 	while (n && n->start < end) {
17421da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
17431da177e4SLinus Torvalds 		if (n->start >= start) {
17441da177e4SLinus Torvalds 			if (n->end <= end)
17451da177e4SLinus Torvalds 				sp_delete(sp, n);
17461da177e4SLinus Torvalds 			else
17471da177e4SLinus Torvalds 				n->start = end;
17481da177e4SLinus Torvalds 		} else {
17491da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
17501da177e4SLinus Torvalds 			if (n->end > end) {
17511da177e4SLinus Torvalds 				if (!new2) {
17521da177e4SLinus Torvalds 					spin_unlock(&sp->lock);
17531da177e4SLinus Torvalds 					new2 = sp_alloc(end, n->end, n->policy);
17541da177e4SLinus Torvalds 					if (!new2)
17551da177e4SLinus Torvalds 						return -ENOMEM;
17561da177e4SLinus Torvalds 					goto restart;
17571da177e4SLinus Torvalds 				}
17581da177e4SLinus Torvalds 				n->end = start;
17591da177e4SLinus Torvalds 				sp_insert(sp, new2);
17601da177e4SLinus Torvalds 				new2 = NULL;
17611da177e4SLinus Torvalds 				break;
17621da177e4SLinus Torvalds 			} else
17631da177e4SLinus Torvalds 				n->end = start;
17641da177e4SLinus Torvalds 		}
17651da177e4SLinus Torvalds 		if (!next)
17661da177e4SLinus Torvalds 			break;
17671da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
17681da177e4SLinus Torvalds 	}
17691da177e4SLinus Torvalds 	if (new)
17701da177e4SLinus Torvalds 		sp_insert(sp, new);
17711da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17721da177e4SLinus Torvalds 	if (new2) {
17731da177e4SLinus Torvalds 		mpol_free(new2->policy);
17741da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new2);
17751da177e4SLinus Torvalds 	}
17761da177e4SLinus Torvalds 	return 0;
17771da177e4SLinus Torvalds }
17781da177e4SLinus Torvalds 
1779a3b51e01SDavid Rientjes void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1780028fec41SDavid Rientjes 			unsigned short flags, nodemask_t *policy_nodes)
17817339ff83SRobin Holt {
17827339ff83SRobin Holt 	info->root = RB_ROOT;
17837339ff83SRobin Holt 	spin_lock_init(&info->lock);
17847339ff83SRobin Holt 
17857339ff83SRobin Holt 	if (policy != MPOL_DEFAULT) {
17867339ff83SRobin Holt 		struct mempolicy *newpol;
17877339ff83SRobin Holt 
17887339ff83SRobin Holt 		/* Falls back to MPOL_DEFAULT on any error */
1789028fec41SDavid Rientjes 		newpol = mpol_new(policy, flags, policy_nodes);
17907339ff83SRobin Holt 		if (!IS_ERR(newpol)) {
17917339ff83SRobin Holt 			/* Create pseudo-vma that contains just the policy */
17927339ff83SRobin Holt 			struct vm_area_struct pvma;
17937339ff83SRobin Holt 
17947339ff83SRobin Holt 			memset(&pvma, 0, sizeof(struct vm_area_struct));
17957339ff83SRobin Holt 			/* Policy covers entire file */
17967339ff83SRobin Holt 			pvma.vm_end = TASK_SIZE;
17977339ff83SRobin Holt 			mpol_set_shared_policy(info, &pvma, newpol);
17987339ff83SRobin Holt 			mpol_free(newpol);
17997339ff83SRobin Holt 		}
18007339ff83SRobin Holt 	}
18017339ff83SRobin Holt }
18027339ff83SRobin Holt 
18031da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
18041da177e4SLinus Torvalds 			struct vm_area_struct *vma, struct mempolicy *npol)
18051da177e4SLinus Torvalds {
18061da177e4SLinus Torvalds 	int err;
18071da177e4SLinus Torvalds 	struct sp_node *new = NULL;
18081da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
18091da177e4SLinus Torvalds 
1810028fec41SDavid Rientjes 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
18111da177e4SLinus Torvalds 		 vma->vm_pgoff,
18121da177e4SLinus Torvalds 		 sz, npol ? npol->policy : -1,
1813028fec41SDavid Rientjes 		 npol ? npol->flags : -1,
1814dfcd3c0dSAndi Kleen 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
18151da177e4SLinus Torvalds 
18161da177e4SLinus Torvalds 	if (npol) {
18171da177e4SLinus Torvalds 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
18181da177e4SLinus Torvalds 		if (!new)
18191da177e4SLinus Torvalds 			return -ENOMEM;
18201da177e4SLinus Torvalds 	}
18211da177e4SLinus Torvalds 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
18221da177e4SLinus Torvalds 	if (err && new)
18231da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new);
18241da177e4SLinus Torvalds 	return err;
18251da177e4SLinus Torvalds }
18261da177e4SLinus Torvalds 
18271da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
18281da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
18291da177e4SLinus Torvalds {
18301da177e4SLinus Torvalds 	struct sp_node *n;
18311da177e4SLinus Torvalds 	struct rb_node *next;
18321da177e4SLinus Torvalds 
18331da177e4SLinus Torvalds 	if (!p->root.rb_node)
18341da177e4SLinus Torvalds 		return;
18351da177e4SLinus Torvalds 	spin_lock(&p->lock);
18361da177e4SLinus Torvalds 	next = rb_first(&p->root);
18371da177e4SLinus Torvalds 	while (next) {
18381da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
18391da177e4SLinus Torvalds 		next = rb_next(&n->nd);
184090c5029eSAndi Kleen 		rb_erase(&n->nd, &p->root);
18411da177e4SLinus Torvalds 		mpol_free(n->policy);
18421da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, n);
18431da177e4SLinus Torvalds 	}
18441da177e4SLinus Torvalds 	spin_unlock(&p->lock);
18451da177e4SLinus Torvalds }
18461da177e4SLinus Torvalds 
18471da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
18481da177e4SLinus Torvalds void __init numa_policy_init(void)
18491da177e4SLinus Torvalds {
1850b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
1851b71636e2SPaul Mundt 	unsigned long largest = 0;
1852b71636e2SPaul Mundt 	int nid, prefer = 0;
1853b71636e2SPaul Mundt 
18541da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
18551da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
185620c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
18571da177e4SLinus Torvalds 
18581da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
18591da177e4SLinus Torvalds 				     sizeof(struct sp_node),
186020c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
18611da177e4SLinus Torvalds 
1862b71636e2SPaul Mundt 	/*
1863b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
1864b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB),
1865b71636e2SPaul Mundt 	 * falling back to the largest node if they're all smaller.
1866b71636e2SPaul Mundt 	 */
1867b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
186856bbd65dSChristoph Lameter 	for_each_node_state(nid, N_HIGH_MEMORY) {
1869b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
18701da177e4SLinus Torvalds 
1871b71636e2SPaul Mundt 		/* Preserve the largest node */
1872b71636e2SPaul Mundt 		if (largest < total_pages) {
1873b71636e2SPaul Mundt 			largest = total_pages;
1874b71636e2SPaul Mundt 			prefer = nid;
1875b71636e2SPaul Mundt 		}
1876b71636e2SPaul Mundt 
1877b71636e2SPaul Mundt 		/* Interleave this node? */
1878b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1879b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
1880b71636e2SPaul Mundt 	}
1881b71636e2SPaul Mundt 
1882b71636e2SPaul Mundt 	/* All too small, use the largest */
1883b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
1884b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
1885b71636e2SPaul Mundt 
1886028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
18871da177e4SLinus Torvalds 		printk("numa_policy_init: interleaving failed\n");
18881da177e4SLinus Torvalds }
18891da177e4SLinus Torvalds 
18908bccd85fSChristoph Lameter /* Reset policy of current process to default */
18911da177e4SLinus Torvalds void numa_default_policy(void)
18921da177e4SLinus Torvalds {
1893028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
18941da177e4SLinus Torvalds }
189568860ec1SPaul Jackson 
18964225399aSPaul Jackson /*
18971a75a6c8SChristoph Lameter  * Display pages allocated per node and memory policy via /proc.
18981a75a6c8SChristoph Lameter  */
189915ad7cdcSHelge Deller static const char * const policy_types[] =
190015ad7cdcSHelge Deller 	{ "default", "prefer", "bind", "interleave" };
19011a75a6c8SChristoph Lameter 
19021a75a6c8SChristoph Lameter /*
19031a75a6c8SChristoph Lameter  * Convert a mempolicy into a string.
19041a75a6c8SChristoph Lameter  * Returns the number of characters in buffer (if positive)
19051a75a6c8SChristoph Lameter  * or an error (negative)
19061a75a6c8SChristoph Lameter  */
19071a75a6c8SChristoph Lameter static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
19081a75a6c8SChristoph Lameter {
19091a75a6c8SChristoph Lameter 	char *p = buffer;
19101a75a6c8SChristoph Lameter 	int l;
19111a75a6c8SChristoph Lameter 	nodemask_t nodes;
1912a3b51e01SDavid Rientjes 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1913f5b087b5SDavid Rientjes 	unsigned short flags = pol ? pol->flags : 0;
19141a75a6c8SChristoph Lameter 
19151a75a6c8SChristoph Lameter 	switch (mode) {
19161a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
19171a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19181a75a6c8SChristoph Lameter 		break;
19191a75a6c8SChristoph Lameter 
19201a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
19211a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19221a75a6c8SChristoph Lameter 		node_set(pol->v.preferred_node, nodes);
19231a75a6c8SChristoph Lameter 		break;
19241a75a6c8SChristoph Lameter 
19251a75a6c8SChristoph Lameter 	case MPOL_BIND:
192619770b32SMel Gorman 		/* Fall through */
19271a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
19281a75a6c8SChristoph Lameter 		nodes = pol->v.nodes;
19291a75a6c8SChristoph Lameter 		break;
19301a75a6c8SChristoph Lameter 
19311a75a6c8SChristoph Lameter 	default:
19321a75a6c8SChristoph Lameter 		BUG();
19331a75a6c8SChristoph Lameter 		return -EFAULT;
19341a75a6c8SChristoph Lameter 	}
19351a75a6c8SChristoph Lameter 
19361a75a6c8SChristoph Lameter 	l = strlen(policy_types[mode]);
19371a75a6c8SChristoph Lameter 	if (buffer + maxlen < p + l + 1)
19381a75a6c8SChristoph Lameter 		return -ENOSPC;
19391a75a6c8SChristoph Lameter 
19401a75a6c8SChristoph Lameter 	strcpy(p, policy_types[mode]);
19411a75a6c8SChristoph Lameter 	p += l;
19421a75a6c8SChristoph Lameter 
1943f5b087b5SDavid Rientjes 	if (flags) {
1944f5b087b5SDavid Rientjes 		int need_bar = 0;
1945f5b087b5SDavid Rientjes 
1946f5b087b5SDavid Rientjes 		if (buffer + maxlen < p + 2)
1947f5b087b5SDavid Rientjes 			return -ENOSPC;
1948f5b087b5SDavid Rientjes 		*p++ = '=';
1949f5b087b5SDavid Rientjes 
1950f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
1951f5b087b5SDavid Rientjes 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
19524c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
19534c50bc01SDavid Rientjes 			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1954f5b087b5SDavid Rientjes 	}
1955f5b087b5SDavid Rientjes 
19561a75a6c8SChristoph Lameter 	if (!nodes_empty(nodes)) {
19571a75a6c8SChristoph Lameter 		if (buffer + maxlen < p + 2)
19581a75a6c8SChristoph Lameter 			return -ENOSPC;
19591a75a6c8SChristoph Lameter 		*p++ = '=';
19601a75a6c8SChristoph Lameter 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
19611a75a6c8SChristoph Lameter 	}
19621a75a6c8SChristoph Lameter 	return p - buffer;
19631a75a6c8SChristoph Lameter }
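
Examples of the strings produced by mpol_to_str() above (policy name, optional mode flags, optional nodelist, each field introduced by '='); the node numbers are hypothetical:

	/*
	 *	default
	 *	prefer=1
	 *	bind=0-3
	 *	interleave=static=0-3
	 *	interleave=relative=0,2,4
	 */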
19641a75a6c8SChristoph Lameter 
19651a75a6c8SChristoph Lameter struct numa_maps {
19661a75a6c8SChristoph Lameter 	unsigned long pages;
19671a75a6c8SChristoph Lameter 	unsigned long anon;
1968397874dfSChristoph Lameter 	unsigned long active;
1969397874dfSChristoph Lameter 	unsigned long writeback;
19701a75a6c8SChristoph Lameter 	unsigned long mapcount_max;
1971397874dfSChristoph Lameter 	unsigned long dirty;
1972397874dfSChristoph Lameter 	unsigned long swapcache;
19731a75a6c8SChristoph Lameter 	unsigned long node[MAX_NUMNODES];
19741a75a6c8SChristoph Lameter };
19751a75a6c8SChristoph Lameter 
1976397874dfSChristoph Lameter static void gather_stats(struct page *page, void *private, int pte_dirty)
19771a75a6c8SChristoph Lameter {
19781a75a6c8SChristoph Lameter 	struct numa_maps *md = private;
19791a75a6c8SChristoph Lameter 	int count = page_mapcount(page);
19801a75a6c8SChristoph Lameter 
19811a75a6c8SChristoph Lameter 	md->pages++;
1982397874dfSChristoph Lameter 	if (pte_dirty || PageDirty(page))
1983397874dfSChristoph Lameter 		md->dirty++;
1984397874dfSChristoph Lameter 
1985397874dfSChristoph Lameter 	if (PageSwapCache(page))
1986397874dfSChristoph Lameter 		md->swapcache++;
1987397874dfSChristoph Lameter 
1988397874dfSChristoph Lameter 	if (PageActive(page))
1989397874dfSChristoph Lameter 		md->active++;
1990397874dfSChristoph Lameter 
1991397874dfSChristoph Lameter 	if (PageWriteback(page))
1992397874dfSChristoph Lameter 		md->writeback++;
19931a75a6c8SChristoph Lameter 
19941a75a6c8SChristoph Lameter 	if (PageAnon(page))
19951a75a6c8SChristoph Lameter 		md->anon++;
19961a75a6c8SChristoph Lameter 
1997397874dfSChristoph Lameter 	if (count > md->mapcount_max)
1998397874dfSChristoph Lameter 		md->mapcount_max = count;
1999397874dfSChristoph Lameter 
20001a75a6c8SChristoph Lameter 	md->node[page_to_nid(page)]++;
20011a75a6c8SChristoph Lameter }
20021a75a6c8SChristoph Lameter 
20037f709ed0SAndrew Morton #ifdef CONFIG_HUGETLB_PAGE
2004397874dfSChristoph Lameter static void check_huge_range(struct vm_area_struct *vma,
2005397874dfSChristoph Lameter 		unsigned long start, unsigned long end,
2006397874dfSChristoph Lameter 		struct numa_maps *md)
2007397874dfSChristoph Lameter {
2008397874dfSChristoph Lameter 	unsigned long addr;
2009397874dfSChristoph Lameter 	struct page *page;
2010397874dfSChristoph Lameter 
2011397874dfSChristoph Lameter 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2012397874dfSChristoph Lameter 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2013397874dfSChristoph Lameter 		pte_t pte;
2014397874dfSChristoph Lameter 
2015397874dfSChristoph Lameter 		if (!ptep)
2016397874dfSChristoph Lameter 			continue;
2017397874dfSChristoph Lameter 
2018397874dfSChristoph Lameter 		pte = *ptep;
2019397874dfSChristoph Lameter 		if (pte_none(pte))
2020397874dfSChristoph Lameter 			continue;
2021397874dfSChristoph Lameter 
2022397874dfSChristoph Lameter 		page = pte_page(pte);
2023397874dfSChristoph Lameter 		if (!page)
2024397874dfSChristoph Lameter 			continue;
2025397874dfSChristoph Lameter 
2026397874dfSChristoph Lameter 		gather_stats(page, md, pte_dirty(*ptep));
2027397874dfSChristoph Lameter 	}
2028397874dfSChristoph Lameter }
20297f709ed0SAndrew Morton #else
20307f709ed0SAndrew Morton static inline void check_huge_range(struct vm_area_struct *vma,
20317f709ed0SAndrew Morton 		unsigned long start, unsigned long end,
20327f709ed0SAndrew Morton 		struct numa_maps *md)
20337f709ed0SAndrew Morton {
20347f709ed0SAndrew Morton }
20357f709ed0SAndrew Morton #endif
2036397874dfSChristoph Lameter 
20371a75a6c8SChristoph Lameter int show_numa_map(struct seq_file *m, void *v)
20381a75a6c8SChristoph Lameter {
203999f89551SEric W. Biederman 	struct proc_maps_private *priv = m->private;
20401a75a6c8SChristoph Lameter 	struct vm_area_struct *vma = v;
20411a75a6c8SChristoph Lameter 	struct numa_maps *md;
2042397874dfSChristoph Lameter 	struct file *file = vma->vm_file;
2043397874dfSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
2044480eccf9SLee Schermerhorn 	struct mempolicy *pol;
20451a75a6c8SChristoph Lameter 	int n;
20461a75a6c8SChristoph Lameter 	char buffer[50];
20471a75a6c8SChristoph Lameter 
2048397874dfSChristoph Lameter 	if (!mm)
20491a75a6c8SChristoph Lameter 		return 0;
20501a75a6c8SChristoph Lameter 
20511a75a6c8SChristoph Lameter 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
20521a75a6c8SChristoph Lameter 	if (!md)
20531a75a6c8SChristoph Lameter 		return 0;
20541a75a6c8SChristoph Lameter 
2055480eccf9SLee Schermerhorn 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2056480eccf9SLee Schermerhorn 	mpol_to_str(buffer, sizeof(buffer), pol);
2057480eccf9SLee Schermerhorn 	/*
2058480eccf9SLee Schermerhorn 	 * unref shared or other task's mempolicy
2059480eccf9SLee Schermerhorn 	 */
2060480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy)
2061480eccf9SLee Schermerhorn 		__mpol_free(pol);
20621a75a6c8SChristoph Lameter 
2063397874dfSChristoph Lameter 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2064397874dfSChristoph Lameter 
2065397874dfSChristoph Lameter 	if (file) {
2066397874dfSChristoph Lameter 		seq_printf(m, " file=");
2067c32c2f63SJan Blunck 		seq_path(m, &file->f_path, "\n\t= ");
2068397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2069397874dfSChristoph Lameter 		seq_printf(m, " heap");
2070397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->start_stack &&
2071397874dfSChristoph Lameter 			vma->vm_end >= mm->start_stack) {
2072397874dfSChristoph Lameter 		seq_printf(m, " stack");
2073397874dfSChristoph Lameter 	}
2074397874dfSChristoph Lameter 
2075397874dfSChristoph Lameter 	if (is_vm_hugetlb_page(vma)) {
2076397874dfSChristoph Lameter 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2077397874dfSChristoph Lameter 		seq_printf(m, " huge");
2078397874dfSChristoph Lameter 	} else {
2079397874dfSChristoph Lameter 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
208056bbd65dSChristoph Lameter 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2081397874dfSChristoph Lameter 	}
2082397874dfSChristoph Lameter 
2083397874dfSChristoph Lameter 	if (!md->pages)
2084397874dfSChristoph Lameter 		goto out;
20851a75a6c8SChristoph Lameter 
20861a75a6c8SChristoph Lameter 	if (md->anon)
20871a75a6c8SChristoph Lameter 		seq_printf(m," anon=%lu",md->anon);
20881a75a6c8SChristoph Lameter 
2089397874dfSChristoph Lameter 	if (md->dirty)
2090397874dfSChristoph Lameter 		seq_printf(m," dirty=%lu",md->dirty);
2091397874dfSChristoph Lameter 
2092397874dfSChristoph Lameter 	if (md->pages != md->anon && md->pages != md->dirty)
2093397874dfSChristoph Lameter 		seq_printf(m, " mapped=%lu", md->pages);
2094397874dfSChristoph Lameter 
2095397874dfSChristoph Lameter 	if (md->mapcount_max > 1)
2096397874dfSChristoph Lameter 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2097397874dfSChristoph Lameter 
2098397874dfSChristoph Lameter 	if (md->swapcache)
2099397874dfSChristoph Lameter 		seq_printf(m," swapcache=%lu", md->swapcache);
2100397874dfSChristoph Lameter 
2101397874dfSChristoph Lameter 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2102397874dfSChristoph Lameter 		seq_printf(m," active=%lu", md->active);
2103397874dfSChristoph Lameter 
2104397874dfSChristoph Lameter 	if (md->writeback)
2105397874dfSChristoph Lameter 		seq_printf(m," writeback=%lu", md->writeback);
2106397874dfSChristoph Lameter 
210756bbd65dSChristoph Lameter 	for_each_node_state(n, N_HIGH_MEMORY)
21081a75a6c8SChristoph Lameter 		if (md->node[n])
21091a75a6c8SChristoph Lameter 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2110397874dfSChristoph Lameter out:
21111a75a6c8SChristoph Lameter 	seq_putc(m, '\n');
21121a75a6c8SChristoph Lameter 	kfree(md);
21131a75a6c8SChristoph Lameter 
21141a75a6c8SChristoph Lameter 	if (m->count < m->size)
211599f89551SEric W. Biederman 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
21161a75a6c8SChristoph Lameter 	return 0;
21171a75a6c8SChristoph Lameter }
2118