xref: /openbmc/linux/mm/mempolicy.c (revision b1de0d139c97a6078bbada6cf2d27c30ce127a97)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
58bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
61da177e4SLinus Torvalds  * Subject to the GNU Public License, version 2.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints about which node(s) memory should
91da177e4SLinus Torvalds  * be allocated on.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support four policies per VMA and per process:
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy a process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
221da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
231da177e4SLinus Torvalds  *                no fallback.
248bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
258bccd85fSChristoph Lameter  *                and proceeding to the last. It would be better if bind would
268bccd85fSChristoph Lameter  *                truly restrict the allocation to memory nodes instead.
278bccd85fSChristoph Lameter  *
281da177e4SLinus Torvalds  * preferred      Try a specific node first before normal fallback.
2900ef2d2fSDavid Rientjes  *                As a special case NUMA_NO_NODE here means do the allocation
301da177e4SLinus Torvalds  *                on the local CPU. This is normally identical to default,
311da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non-default
321da177e4SLinus Torvalds  *                process policy.
338bccd85fSChristoph Lameter  *
341da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
351da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
361da177e4SLinus Torvalds  *                in a NUMA-aware kernel and still does by, ahem, default.
371da177e4SLinus Torvalds  *
381da177e4SLinus Torvalds  * The process policy is applied for most non-interrupt memory allocations
391da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
401da177e4SLinus Torvalds  * try to allocate on the local CPU. The VMA policy is only applied to
411da177e4SLinus Torvalds  * memory allocations backing a VMA.
421da177e4SLinus Torvalds  *
431da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
441da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
451da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
461da177e4SLinus Torvalds  *
471da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
481da177e4SLinus Torvalds  * requesting a lower zone just use the default policy. This implies that
491da177e4SLinus Torvalds  * on systems with highmem, kernel lowmem allocations don't get policied.
501da177e4SLinus Torvalds  * The same applies to GFP_DMA allocations.
511da177e4SLinus Torvalds  *
521da177e4SLinus Torvalds  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
531da177e4SLinus Torvalds  * all users and remembered even when nobody has memory mapped.
541da177e4SLinus Torvalds  */
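/*
 * Illustrative userspace sketch (added for this write-up, not part of the
 * kernel sources): selecting the policies described above through the
 * mbind(2) and set_mempolicy(2) system calls.  It assumes libnuma's
 * <numaif.h> declarations and fewer than BITS_PER_LONG memory nodes;
 * error handling is omitted.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	static void *map_interleaved(size_t len)
 *	{
 *		unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		// VMA policy: interleave this mapping's pages over nodes 0-1.
 *		mbind(p, len, MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8, 0);
 *
 *		// Process policy: prefer the first node in the mask (node 0)
 *		// for the task's other future allocations.
 *		set_mempolicy(MPOL_PREFERRED, &nodes, sizeof(nodes) * 8);
 *		return p;
 *	}
 */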
551da177e4SLinus Torvalds 
561da177e4SLinus Torvalds /* Notebook:
571da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
581da177e4SLinus Torvalds    object
591da177e4SLinus Torvalds    statistics for bigpages
601da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
611da177e4SLinus Torvalds    first item above.
621da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
631da177e4SLinus Torvalds    grows down?
641da177e4SLinus Torvalds    make bind policy root only? It can trigger OOM much faster and the
651da177e4SLinus Torvalds    kernel is not always graceful about that.
661da177e4SLinus Torvalds */
671da177e4SLinus Torvalds 
68*b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69*b1de0d13SMitchel Humpherys 
701da177e4SLinus Torvalds #include <linux/mempolicy.h>
711da177e4SLinus Torvalds #include <linux/mm.h>
721da177e4SLinus Torvalds #include <linux/highmem.h>
731da177e4SLinus Torvalds #include <linux/hugetlb.h>
741da177e4SLinus Torvalds #include <linux/kernel.h>
751da177e4SLinus Torvalds #include <linux/sched.h>
761da177e4SLinus Torvalds #include <linux/nodemask.h>
771da177e4SLinus Torvalds #include <linux/cpuset.h>
781da177e4SLinus Torvalds #include <linux/slab.h>
791da177e4SLinus Torvalds #include <linux/string.h>
80b95f1b31SPaul Gortmaker #include <linux/export.h>
81b488893aSPavel Emelyanov #include <linux/nsproxy.h>
821da177e4SLinus Torvalds #include <linux/interrupt.h>
831da177e4SLinus Torvalds #include <linux/init.h>
841da177e4SLinus Torvalds #include <linux/compat.h>
85dc9aa5b9SChristoph Lameter #include <linux/swap.h>
861a75a6c8SChristoph Lameter #include <linux/seq_file.h>
871a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
88b20a3503SChristoph Lameter #include <linux/migrate.h>
8962b61f61SHugh Dickins #include <linux/ksm.h>
9095a402c3SChristoph Lameter #include <linux/rmap.h>
9186c3a764SDavid Quigley #include <linux/security.h>
92dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
93095f1fc4SLee Schermerhorn #include <linux/ctype.h>
946d9c285aSKOSAKI Motohiro #include <linux/mm_inline.h>
95b24f53a0SLee Schermerhorn #include <linux/mmu_notifier.h>
96*b1de0d13SMitchel Humpherys #include <linux/printk.h>
97dc9aa5b9SChristoph Lameter 
981da177e4SLinus Torvalds #include <asm/tlbflush.h>
991da177e4SLinus Torvalds #include <asm/uaccess.h>
100778d3b0fSMichal Hocko #include <linux/random.h>
1011da177e4SLinus Torvalds 
10262695a84SNick Piggin #include "internal.h"
10362695a84SNick Piggin 
10438e35860SChristoph Lameter /* Internal flags */
105dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
10638e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
107dc9aa5b9SChristoph Lameter 
108fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
109fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1101da177e4SLinus Torvalds 
1111da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not
1121da177e4SLinus Torvalds    policied. */
1136267276fSChristoph Lameter enum zone_type policy_zone = 0;
1141da177e4SLinus Torvalds 
115bea904d5SLee Schermerhorn /*
116bea904d5SLee Schermerhorn  * run-time system-wide default policy => local allocation
117bea904d5SLee Schermerhorn  */
118e754d79dSH Hartley Sweeten static struct mempolicy default_policy = {
1191da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
120bea904d5SLee Schermerhorn 	.mode = MPOL_PREFERRED,
121fc36b8d3SLee Schermerhorn 	.flags = MPOL_F_LOCAL,
1221da177e4SLinus Torvalds };
1231da177e4SLinus Torvalds 
1245606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES];
1255606e387SMel Gorman 
1265606e387SMel Gorman static struct mempolicy *get_task_policy(struct task_struct *p)
1275606e387SMel Gorman {
1285606e387SMel Gorman 	struct mempolicy *pol = p->mempolicy;
1295606e387SMel Gorman 
1305606e387SMel Gorman 	if (!pol) {
1311da6f0e1SJianguo Wu 		int node = numa_node_id();
1325606e387SMel Gorman 
1331da6f0e1SJianguo Wu 		if (node != NUMA_NO_NODE) {
1341da6f0e1SJianguo Wu 			pol = &preferred_node_policy[node];
1351da6f0e1SJianguo Wu 			/*
1361da6f0e1SJianguo Wu 			 * preferred_node_policy is not initialised early in
1371da6f0e1SJianguo Wu 			 * boot
1381da6f0e1SJianguo Wu 			 */
1395606e387SMel Gorman 			if (!pol->mode)
1405606e387SMel Gorman 				pol = NULL;
1415606e387SMel Gorman 		}
1421da6f0e1SJianguo Wu 	}
1435606e387SMel Gorman 
1445606e387SMel Gorman 	return pol;
1455606e387SMel Gorman }
1465606e387SMel Gorman 
14737012946SDavid Rientjes static const struct mempolicy_operations {
14837012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
149708c1bbcSMiao Xie 	/*
150708c1bbcSMiao Xie 	 * If the read-side task has no lock to protect task->mempolicy, the
151708c1bbcSMiao Xie 	 * write-side task will rebind task->mempolicy in two steps: first add
152708c1bbcSMiao Xie 	 * all the newly allowed nodes, then remove all the now-disallowed
153708c1bbcSMiao Xie 	 * nodes. This way there is never a window in which no node is
154708c1bbcSMiao Xie 	 * allowed for allocation (e.g. {0,1} -> {2,3} goes via {0,1,2,3}).
155708c1bbcSMiao Xie 	 * If the read side holds a lock protecting task->mempolicy, we
156708c1bbcSMiao Xie 	 * rebind directly.
157708c1bbcSMiao Xie 	 *
158708c1bbcSMiao Xie 	 * step:
159708c1bbcSMiao Xie 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
160708c1bbcSMiao Xie 	 * 	MPOL_REBIND_STEP1 - add all the newly allowed nodes
161708c1bbcSMiao Xie 	 * 	MPOL_REBIND_STEP2 - remove all the now-disallowed nodes
162708c1bbcSMiao Xie 	 */
163708c1bbcSMiao Xie 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
164708c1bbcSMiao Xie 			enum mpol_rebind_step step);
16537012946SDavid Rientjes } mpol_ops[MPOL_MAX];
16637012946SDavid Rientjes 
16719770b32SMel Gorman /* Check that the nodemask contains at least one node with memory */
16837012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask)
1691da177e4SLinus Torvalds {
170d3eb1570SLai Jiangshan 	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
1711da177e4SLinus Torvalds }
1721da177e4SLinus Torvalds 
173f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
174f5b087b5SDavid Rientjes {
1756d556294SBob Liu 	return pol->flags & MPOL_MODE_FLAGS;
1764c50bc01SDavid Rientjes }
1774c50bc01SDavid Rientjes 
1784c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1794c50bc01SDavid Rientjes 				   const nodemask_t *rel)
1804c50bc01SDavid Rientjes {
1814c50bc01SDavid Rientjes 	nodemask_t tmp;
1824c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
1834c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
184f5b087b5SDavid Rientjes }
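/*
 * Worked example (added for illustration) of the MPOL_F_RELATIVE_NODES
 * remapping above: for *orig = {0,2} and *rel = {4,5,6}, nodes_fold()
 * wraps orig modulo nodes_weight(*rel) = 3, which leaves {0,2}, and
 * nodes_onto() then maps bit n onto the n-th set bit of *rel, so
 * *ret = {4,6}.
 */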
185f5b087b5SDavid Rientjes 
18637012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
18737012946SDavid Rientjes {
18837012946SDavid Rientjes 	if (nodes_empty(*nodes))
18937012946SDavid Rientjes 		return -EINVAL;
19037012946SDavid Rientjes 	pol->v.nodes = *nodes;
19137012946SDavid Rientjes 	return 0;
19237012946SDavid Rientjes }
19337012946SDavid Rientjes 
19437012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
19537012946SDavid Rientjes {
19637012946SDavid Rientjes 	if (!nodes)
197fc36b8d3SLee Schermerhorn 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
19837012946SDavid Rientjes 	else if (nodes_empty(*nodes))
19937012946SDavid Rientjes 		return -EINVAL;			/*  no allowed nodes */
20037012946SDavid Rientjes 	else
20137012946SDavid Rientjes 		pol->v.preferred_node = first_node(*nodes);
20237012946SDavid Rientjes 	return 0;
20337012946SDavid Rientjes }
20437012946SDavid Rientjes 
20537012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
20637012946SDavid Rientjes {
20737012946SDavid Rientjes 	if (!is_valid_nodemask(nodes))
20837012946SDavid Rientjes 		return -EINVAL;
20937012946SDavid Rientjes 	pol->v.nodes = *nodes;
21037012946SDavid Rientjes 	return 0;
21137012946SDavid Rientjes }
21237012946SDavid Rientjes 
21358568d2aSMiao Xie /*
21458568d2aSMiao Xie  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
21558568d2aSMiao Xie  * any, for the new policy.  mpol_new() has already validated the nodes
21658568d2aSMiao Xie  * parameter with respect to the policy mode and flags.  But, we need to
21758568d2aSMiao Xie  * handle an empty nodemask with MPOL_PREFERRED here.
21858568d2aSMiao Xie  *
21958568d2aSMiao Xie  * Must be called holding task's alloc_lock to protect task's mems_allowed
22058568d2aSMiao Xie  * and mempolicy.  May also be called while holding mmap_sem for write.
22158568d2aSMiao Xie  */
2224bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol,
2234bfc4495SKAMEZAWA Hiroyuki 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
22458568d2aSMiao Xie {
22558568d2aSMiao Xie 	int ret;
22658568d2aSMiao Xie 
22758568d2aSMiao Xie 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
22858568d2aSMiao Xie 	if (pol == NULL)
22958568d2aSMiao Xie 		return 0;
23001f13bd6SLai Jiangshan 	/* Check N_MEMORY */
2314bfc4495SKAMEZAWA Hiroyuki 	nodes_and(nsc->mask1,
23201f13bd6SLai Jiangshan 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
23358568d2aSMiao Xie 
23458568d2aSMiao Xie 	VM_BUG_ON(!nodes);
23558568d2aSMiao Xie 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
23658568d2aSMiao Xie 		nodes = NULL;	/* explicit local allocation */
23758568d2aSMiao Xie 	else {
23858568d2aSMiao Xie 		if (pol->flags & MPOL_F_RELATIVE_NODES)
2394bfc4495SKAMEZAWA Hiroyuki 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
24058568d2aSMiao Xie 		else
2414bfc4495SKAMEZAWA Hiroyuki 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
2424bfc4495SKAMEZAWA Hiroyuki 
24358568d2aSMiao Xie 		if (mpol_store_user_nodemask(pol))
24458568d2aSMiao Xie 			pol->w.user_nodemask = *nodes;
24558568d2aSMiao Xie 		else
24658568d2aSMiao Xie 			pol->w.cpuset_mems_allowed =
24758568d2aSMiao Xie 						cpuset_current_mems_allowed;
24858568d2aSMiao Xie 	}
24958568d2aSMiao Xie 
2504bfc4495SKAMEZAWA Hiroyuki 	if (nodes)
2514bfc4495SKAMEZAWA Hiroyuki 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
2524bfc4495SKAMEZAWA Hiroyuki 	else
2534bfc4495SKAMEZAWA Hiroyuki 		ret = mpol_ops[pol->mode].create(pol, NULL);
25458568d2aSMiao Xie 	return ret;
25558568d2aSMiao Xie }
25658568d2aSMiao Xie 
25758568d2aSMiao Xie /*
25858568d2aSMiao Xie  * This function just creates a new policy, does some checks and simple
25958568d2aSMiao Xie  * initialization. The caller must invoke mpol_set_nodemask() to set the nodes.
26058568d2aSMiao Xie  */
261028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262028fec41SDavid Rientjes 				  nodemask_t *nodes)
2631da177e4SLinus Torvalds {
2641da177e4SLinus Torvalds 	struct mempolicy *policy;
2651da177e4SLinus Torvalds 
266028fec41SDavid Rientjes 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
26700ef2d2fSDavid Rientjes 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
268140d5a49SPaul Mundt 
2693e1f0645SDavid Rientjes 	if (mode == MPOL_DEFAULT) {
2703e1f0645SDavid Rientjes 		if (nodes && !nodes_empty(*nodes))
27137012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
272d3a71033SLee Schermerhorn 		return NULL;
27337012946SDavid Rientjes 	}
2743e1f0645SDavid Rientjes 	VM_BUG_ON(!nodes);
2753e1f0645SDavid Rientjes 
2763e1f0645SDavid Rientjes 	/*
2773e1f0645SDavid Rientjes 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
2783e1f0645SDavid Rientjes 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
2793e1f0645SDavid Rientjes 	 * All other modes require a valid pointer to a non-empty nodemask.
2803e1f0645SDavid Rientjes 	 */
2813e1f0645SDavid Rientjes 	if (mode == MPOL_PREFERRED) {
2823e1f0645SDavid Rientjes 		if (nodes_empty(*nodes)) {
2833e1f0645SDavid Rientjes 			if (((flags & MPOL_F_STATIC_NODES) ||
2843e1f0645SDavid Rientjes 			     (flags & MPOL_F_RELATIVE_NODES)))
2853e1f0645SDavid Rientjes 				return ERR_PTR(-EINVAL);
2863e1f0645SDavid Rientjes 		}
287479e2802SPeter Zijlstra 	} else if (mode == MPOL_LOCAL) {
288479e2802SPeter Zijlstra 		if (!nodes_empty(*nodes))
289479e2802SPeter Zijlstra 			return ERR_PTR(-EINVAL);
290479e2802SPeter Zijlstra 		mode = MPOL_PREFERRED;
2913e1f0645SDavid Rientjes 	} else if (nodes_empty(*nodes))
2923e1f0645SDavid Rientjes 		return ERR_PTR(-EINVAL);
2931da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2941da177e4SLinus Torvalds 	if (!policy)
2951da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2961da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
29745c4745aSLee Schermerhorn 	policy->mode = mode;
29837012946SDavid Rientjes 	policy->flags = flags;
2993e1f0645SDavid Rientjes 
30037012946SDavid Rientjes 	return policy;
30137012946SDavid Rientjes }
30237012946SDavid Rientjes 
30352cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. */
30452cd3b07SLee Schermerhorn void __mpol_put(struct mempolicy *p)
30552cd3b07SLee Schermerhorn {
30652cd3b07SLee Schermerhorn 	if (!atomic_dec_and_test(&p->refcnt))
30752cd3b07SLee Schermerhorn 		return;
30852cd3b07SLee Schermerhorn 	kmem_cache_free(policy_cache, p);
30952cd3b07SLee Schermerhorn }
31052cd3b07SLee Schermerhorn 
311708c1bbcSMiao Xie static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
312708c1bbcSMiao Xie 				enum mpol_rebind_step step)
31337012946SDavid Rientjes {
31437012946SDavid Rientjes }
31537012946SDavid Rientjes 
316708c1bbcSMiao Xie /*
317708c1bbcSMiao Xie  * step:
318708c1bbcSMiao Xie  * 	MPOL_REBIND_ONCE  - do the rebind work at once
319708c1bbcSMiao Xie  * 	MPOL_REBIND_STEP1 - add all the newly allowed nodes
320708c1bbcSMiao Xie  * 	MPOL_REBIND_STEP2 - remove all the now-disallowed nodes
321708c1bbcSMiao Xie  */
322708c1bbcSMiao Xie static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
323708c1bbcSMiao Xie 				 enum mpol_rebind_step step)
3241d0d2680SDavid Rientjes {
3251d0d2680SDavid Rientjes 	nodemask_t tmp;
3261d0d2680SDavid Rientjes 
32737012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
32837012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
32937012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
33037012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3311d0d2680SDavid Rientjes 	else {
332708c1bbcSMiao Xie 		/*
333708c1bbcSMiao Xie 		 * if step == MPOL_REBIND_STEP1, we use ->w.cpuset_mems_allowed
334708c1bbcSMiao Xie 		 * to cache the result
335708c1bbcSMiao Xie 		 */
336708c1bbcSMiao Xie 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
337708c1bbcSMiao Xie 			nodes_remap(tmp, pol->v.nodes,
338708c1bbcSMiao Xie 					pol->w.cpuset_mems_allowed, *nodes);
339708c1bbcSMiao Xie 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
340708c1bbcSMiao Xie 		} else if (step == MPOL_REBIND_STEP2) {
341708c1bbcSMiao Xie 			tmp = pol->w.cpuset_mems_allowed;
34237012946SDavid Rientjes 			pol->w.cpuset_mems_allowed = *nodes;
343708c1bbcSMiao Xie 		} else
344708c1bbcSMiao Xie 			BUG();
3451d0d2680SDavid Rientjes 	}
34637012946SDavid Rientjes 
347708c1bbcSMiao Xie 	if (nodes_empty(tmp))
348708c1bbcSMiao Xie 		tmp = *nodes;
349708c1bbcSMiao Xie 
350708c1bbcSMiao Xie 	if (step == MPOL_REBIND_STEP1)
351708c1bbcSMiao Xie 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
352708c1bbcSMiao Xie 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
3531d0d2680SDavid Rientjes 		pol->v.nodes = tmp;
354708c1bbcSMiao Xie 	else
355708c1bbcSMiao Xie 		BUG();
356708c1bbcSMiao Xie 
3571d0d2680SDavid Rientjes 	if (!node_isset(current->il_next, tmp)) {
3581d0d2680SDavid Rientjes 		current->il_next = next_node(current->il_next, tmp);
3591d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
3601d0d2680SDavid Rientjes 			current->il_next = first_node(tmp);
3611d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
3621d0d2680SDavid Rientjes 			current->il_next = numa_node_id();
3631d0d2680SDavid Rientjes 	}
36437012946SDavid Rientjes }
36537012946SDavid Rientjes 
36637012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
367708c1bbcSMiao Xie 				  const nodemask_t *nodes,
368708c1bbcSMiao Xie 				  enum mpol_rebind_step step)
36937012946SDavid Rientjes {
37037012946SDavid Rientjes 	nodemask_t tmp;
37137012946SDavid Rientjes 
37237012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES) {
3731d0d2680SDavid Rientjes 		int node = first_node(pol->w.user_nodemask);
3741d0d2680SDavid Rientjes 
375fc36b8d3SLee Schermerhorn 		if (node_isset(node, *nodes)) {
3761d0d2680SDavid Rientjes 			pol->v.preferred_node = node;
377fc36b8d3SLee Schermerhorn 			pol->flags &= ~MPOL_F_LOCAL;
378fc36b8d3SLee Schermerhorn 		} else
379fc36b8d3SLee Schermerhorn 			pol->flags |= MPOL_F_LOCAL;
38037012946SDavid Rientjes 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
38137012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3821d0d2680SDavid Rientjes 		pol->v.preferred_node = first_node(tmp);
383fc36b8d3SLee Schermerhorn 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
3841d0d2680SDavid Rientjes 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
38537012946SDavid Rientjes 						   pol->w.cpuset_mems_allowed,
38637012946SDavid Rientjes 						   *nodes);
38737012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
3881d0d2680SDavid Rientjes 	}
3891d0d2680SDavid Rientjes }
39037012946SDavid Rientjes 
391708c1bbcSMiao Xie /*
392708c1bbcSMiao Xie  * mpol_rebind_policy - Migrate a policy to a different set of nodes
393708c1bbcSMiao Xie  *
394708c1bbcSMiao Xie  * If the read-side task has no lock to protect task->mempolicy, the
395708c1bbcSMiao Xie  * write-side task will rebind task->mempolicy in two steps. The first
396708c1bbcSMiao Xie  * step adds all the newly allowed nodes, and the second step removes
397708c1bbcSMiao Xie  * all the now-disallowed nodes. This way there is never a window in
398708c1bbcSMiao Xie  * which no node is allowed for allocation.
399708c1bbcSMiao Xie  * If the read side holds a lock protecting task->mempolicy, we rebind
400708c1bbcSMiao Xie  * directly.
401708c1bbcSMiao Xie  *
402708c1bbcSMiao Xie  * step:
403708c1bbcSMiao Xie  * 	MPOL_REBIND_ONCE  - do the rebind work at once
404708c1bbcSMiao Xie  * 	MPOL_REBIND_STEP1 - add all the newly allowed nodes
405708c1bbcSMiao Xie  * 	MPOL_REBIND_STEP2 - remove all the now-disallowed nodes
406708c1bbcSMiao Xie  */
407708c1bbcSMiao Xie static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
408708c1bbcSMiao Xie 				enum mpol_rebind_step step)
40937012946SDavid Rientjes {
41037012946SDavid Rientjes 	if (!pol)
41137012946SDavid Rientjes 		return;
41289c522c7SWang Sheng-Hui 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
41337012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
41437012946SDavid Rientjes 		return;
415708c1bbcSMiao Xie 
416708c1bbcSMiao Xie 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
417708c1bbcSMiao Xie 		return;
418708c1bbcSMiao Xie 
419708c1bbcSMiao Xie 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
420708c1bbcSMiao Xie 		BUG();
421708c1bbcSMiao Xie 
422708c1bbcSMiao Xie 	if (step == MPOL_REBIND_STEP1)
423708c1bbcSMiao Xie 		pol->flags |= MPOL_F_REBINDING;
424708c1bbcSMiao Xie 	else if (step == MPOL_REBIND_STEP2)
425708c1bbcSMiao Xie 		pol->flags &= ~MPOL_F_REBINDING;
426708c1bbcSMiao Xie 	else if (step >= MPOL_REBIND_NSTEP)
427708c1bbcSMiao Xie 		BUG();
428708c1bbcSMiao Xie 
429708c1bbcSMiao Xie 	mpol_ops[pol->mode].rebind(pol, newmask, step);
4301d0d2680SDavid Rientjes }
4311d0d2680SDavid Rientjes 
4321d0d2680SDavid Rientjes /*
4331d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires task
4341d0d2680SDavid Rientjes  * pointer, and updates task mempolicy.
43558568d2aSMiao Xie  *
43658568d2aSMiao Xie  * Called with task's alloc_lock held.
4371d0d2680SDavid Rientjes  */
4381d0d2680SDavid Rientjes 
439708c1bbcSMiao Xie void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
440708c1bbcSMiao Xie 			enum mpol_rebind_step step)
4411d0d2680SDavid Rientjes {
442708c1bbcSMiao Xie 	mpol_rebind_policy(tsk->mempolicy, new, step);
4431d0d2680SDavid Rientjes }
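/*
 * Illustrative calling pattern (a sketch loosely modelled on the cpuset
 * update path; see kernel/cpuset.c for the authoritative caller): the
 * writer first widens mems_allowed and the policy, then narrows both, so
 * lockless readers never observe an empty mask.
 *
 *	task_lock(tsk);
 *	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 *	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 *	tsk->mems_allowed = *newmems;
 *	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 *	task_unlock(tsk);
 */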
4441d0d2680SDavid Rientjes 
4451d0d2680SDavid Rientjes /*
4461d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
4471d0d2680SDavid Rientjes  *
4481d0d2680SDavid Rientjes  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
4491d0d2680SDavid Rientjes  */
4501d0d2680SDavid Rientjes 
4511d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
4521d0d2680SDavid Rientjes {
4531d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
4541d0d2680SDavid Rientjes 
4551d0d2680SDavid Rientjes 	down_write(&mm->mmap_sem);
4561d0d2680SDavid Rientjes 	for (vma = mm->mmap; vma; vma = vma->vm_next)
457708c1bbcSMiao Xie 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
4581d0d2680SDavid Rientjes 	up_write(&mm->mmap_sem);
4591d0d2680SDavid Rientjes }
4601d0d2680SDavid Rientjes 
46137012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
46237012946SDavid Rientjes 	[MPOL_DEFAULT] = {
46337012946SDavid Rientjes 		.rebind = mpol_rebind_default,
46437012946SDavid Rientjes 	},
46537012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
46637012946SDavid Rientjes 		.create = mpol_new_interleave,
46737012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
46837012946SDavid Rientjes 	},
46937012946SDavid Rientjes 	[MPOL_PREFERRED] = {
47037012946SDavid Rientjes 		.create = mpol_new_preferred,
47137012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
47237012946SDavid Rientjes 	},
47337012946SDavid Rientjes 	[MPOL_BIND] = {
47437012946SDavid Rientjes 		.create = mpol_new_bind,
47537012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
47637012946SDavid Rientjes 	},
47737012946SDavid Rientjes };
47837012946SDavid Rientjes 
479fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
480fc301289SChristoph Lameter 				unsigned long flags);
4811a75a6c8SChristoph Lameter 
48298094945SNaoya Horiguchi /*
48398094945SNaoya Horiguchi  * Scan through the pages, checking whether each page matches the given
48498094945SNaoya Horiguchi  * conditions, and move those that do to the pagelist.
48598094945SNaoya Horiguchi  */
48698094945SNaoya Horiguchi static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
487dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
488dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
48938e35860SChristoph Lameter 		void *private)
4901da177e4SLinus Torvalds {
49191612e0dSHugh Dickins 	pte_t *orig_pte;
49291612e0dSHugh Dickins 	pte_t *pte;
493705e87c0SHugh Dickins 	spinlock_t *ptl;
494941150a3SHugh Dickins 
495705e87c0SHugh Dickins 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
49691612e0dSHugh Dickins 	do {
4976aab341eSLinus Torvalds 		struct page *page;
49825ba77c1SAndy Whitcroft 		int nid;
49991612e0dSHugh Dickins 
50091612e0dSHugh Dickins 		if (!pte_present(*pte))
50191612e0dSHugh Dickins 			continue;
5026aab341eSLinus Torvalds 		page = vm_normal_page(vma, addr, *pte);
5036aab341eSLinus Torvalds 		if (!page)
50491612e0dSHugh Dickins 			continue;
505053837fcSNick Piggin 		/*
50662b61f61SHugh Dickins 		 * vm_normal_page() filters out zero pages, but there might
50762b61f61SHugh Dickins 		 * still be PageReserved pages to skip, perhaps in a VDSO.
508053837fcSNick Piggin 		 */
509b79bc0a0SHugh Dickins 		if (PageReserved(page))
510f4598c8bSChristoph Lameter 			continue;
5116aab341eSLinus Torvalds 		nid = page_to_nid(page);
51238e35860SChristoph Lameter 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
51338e35860SChristoph Lameter 			continue;
51438e35860SChristoph Lameter 
515b1f72d18SStephen Wilson 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
516fc301289SChristoph Lameter 			migrate_page_add(page, private, flags);
517dc9aa5b9SChristoph Lameter 		else
5181da177e4SLinus Torvalds 			break;
51991612e0dSHugh Dickins 	} while (pte++, addr += PAGE_SIZE, addr != end);
520705e87c0SHugh Dickins 	pte_unmap_unlock(orig_pte, ptl);
52191612e0dSHugh Dickins 	return addr != end;
52291612e0dSHugh Dickins }
52391612e0dSHugh Dickins 
52498094945SNaoya Horiguchi static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
52598094945SNaoya Horiguchi 		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
526e2d8cf40SNaoya Horiguchi 				    void *private)
527e2d8cf40SNaoya Horiguchi {
528e2d8cf40SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE
529e2d8cf40SNaoya Horiguchi 	int nid;
530e2d8cf40SNaoya Horiguchi 	struct page *page;
531cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
532e2d8cf40SNaoya Horiguchi 
533cb900f41SKirill A. Shutemov 	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
534e2d8cf40SNaoya Horiguchi 	page = pte_page(huge_ptep_get((pte_t *)pmd));
535e2d8cf40SNaoya Horiguchi 	nid = page_to_nid(page);
536e2d8cf40SNaoya Horiguchi 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
537e2d8cf40SNaoya Horiguchi 		goto unlock;
538e2d8cf40SNaoya Horiguchi 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
539e2d8cf40SNaoya Horiguchi 	if (flags & (MPOL_MF_MOVE_ALL) ||
540e2d8cf40SNaoya Horiguchi 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
541e2d8cf40SNaoya Horiguchi 		isolate_huge_page(page, private);
542e2d8cf40SNaoya Horiguchi unlock:
543cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
544e2d8cf40SNaoya Horiguchi #else
545e2d8cf40SNaoya Horiguchi 	BUG();
546e2d8cf40SNaoya Horiguchi #endif
547e2d8cf40SNaoya Horiguchi }
548e2d8cf40SNaoya Horiguchi 
54998094945SNaoya Horiguchi static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
550dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
551dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
55238e35860SChristoph Lameter 		void *private)
55391612e0dSHugh Dickins {
55491612e0dSHugh Dickins 	pmd_t *pmd;
55591612e0dSHugh Dickins 	unsigned long next;
55691612e0dSHugh Dickins 
55791612e0dSHugh Dickins 	pmd = pmd_offset(pud, addr);
55891612e0dSHugh Dickins 	do {
55991612e0dSHugh Dickins 		next = pmd_addr_end(addr, end);
560e2d8cf40SNaoya Horiguchi 		if (!pmd_present(*pmd))
561e2d8cf40SNaoya Horiguchi 			continue;
562e2d8cf40SNaoya Horiguchi 		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
56398094945SNaoya Horiguchi 			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
564e2d8cf40SNaoya Horiguchi 						flags, private);
565e2d8cf40SNaoya Horiguchi 			continue;
566e2d8cf40SNaoya Horiguchi 		}
567e180377fSKirill A. Shutemov 		split_huge_page_pmd(vma, addr, pmd);
5681a5a9906SAndrea Arcangeli 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
56991612e0dSHugh Dickins 			continue;
57098094945SNaoya Horiguchi 		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
57138e35860SChristoph Lameter 				    flags, private))
57291612e0dSHugh Dickins 			return -EIO;
57391612e0dSHugh Dickins 	} while (pmd++, addr = next, addr != end);
57491612e0dSHugh Dickins 	return 0;
57591612e0dSHugh Dickins }
57691612e0dSHugh Dickins 
57798094945SNaoya Horiguchi static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
578dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
579dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
58038e35860SChristoph Lameter 		void *private)
58191612e0dSHugh Dickins {
58291612e0dSHugh Dickins 	pud_t *pud;
58391612e0dSHugh Dickins 	unsigned long next;
58491612e0dSHugh Dickins 
58591612e0dSHugh Dickins 	pud = pud_offset(pgd, addr);
58691612e0dSHugh Dickins 	do {
58791612e0dSHugh Dickins 		next = pud_addr_end(addr, end);
588e2d8cf40SNaoya Horiguchi 		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
589e2d8cf40SNaoya Horiguchi 			continue;
59091612e0dSHugh Dickins 		if (pud_none_or_clear_bad(pud))
59191612e0dSHugh Dickins 			continue;
59298094945SNaoya Horiguchi 		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
59338e35860SChristoph Lameter 				    flags, private))
59491612e0dSHugh Dickins 			return -EIO;
59591612e0dSHugh Dickins 	} while (pud++, addr = next, addr != end);
59691612e0dSHugh Dickins 	return 0;
59791612e0dSHugh Dickins }
59891612e0dSHugh Dickins 
59998094945SNaoya Horiguchi static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
600dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
601dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
60238e35860SChristoph Lameter 		void *private)
60391612e0dSHugh Dickins {
60491612e0dSHugh Dickins 	pgd_t *pgd;
60591612e0dSHugh Dickins 	unsigned long next;
60691612e0dSHugh Dickins 
607b5810039SNick Piggin 	pgd = pgd_offset(vma->vm_mm, addr);
60891612e0dSHugh Dickins 	do {
60991612e0dSHugh Dickins 		next = pgd_addr_end(addr, end);
61091612e0dSHugh Dickins 		if (pgd_none_or_clear_bad(pgd))
61191612e0dSHugh Dickins 			continue;
61298094945SNaoya Horiguchi 		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
61338e35860SChristoph Lameter 				    flags, private))
61491612e0dSHugh Dickins 			return -EIO;
61591612e0dSHugh Dickins 	} while (pgd++, addr = next, addr != end);
61691612e0dSHugh Dickins 	return 0;
6171da177e4SLinus Torvalds }
6181da177e4SLinus Torvalds 
6195877231fSAneesh Kumar K.V #ifdef CONFIG_NUMA_BALANCING
620b24f53a0SLee Schermerhorn /*
6214b10e7d5SMel Gorman  * This is used to mark a range of virtual addresses as inaccessible.
6224b10e7d5SMel Gorman  * These are later cleared by a NUMA hinting fault. Depending on these
6234b10e7d5SMel Gorman  * faults, pages may be migrated for better NUMA placement.
6244b10e7d5SMel Gorman  *
6254b10e7d5SMel Gorman  * This is assuming that NUMA faults are handled using PROT_NONE. If
6264b10e7d5SMel Gorman  * an architecture makes a different choice, it will need further
6274b10e7d5SMel Gorman  * changes to the core.
628b24f53a0SLee Schermerhorn  */
6294b10e7d5SMel Gorman unsigned long change_prot_numa(struct vm_area_struct *vma,
6304b10e7d5SMel Gorman 			unsigned long addr, unsigned long end)
631b24f53a0SLee Schermerhorn {
6324b10e7d5SMel Gorman 	int nr_updated;
633b24f53a0SLee Schermerhorn 
6344b10e7d5SMel Gorman 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
63503c5a6e1SMel Gorman 	if (nr_updated)
63603c5a6e1SMel Gorman 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
637b24f53a0SLee Schermerhorn 
6384b10e7d5SMel Gorman 	return nr_updated;
639b24f53a0SLee Schermerhorn }
640b24f53a0SLee Schermerhorn #else
641b24f53a0SLee Schermerhorn static unsigned long change_prot_numa(struct vm_area_struct *vma,
642b24f53a0SLee Schermerhorn 			unsigned long addr, unsigned long end)
643b24f53a0SLee Schermerhorn {
644b24f53a0SLee Schermerhorn 	return 0;
645b24f53a0SLee Schermerhorn }
6465877231fSAneesh Kumar K.V #endif /* CONFIG_NUMA_BALANCING */
647b24f53a0SLee Schermerhorn 
648dc9aa5b9SChristoph Lameter /*
64998094945SNaoya Horiguchi  * Walk through page tables and collect pages to be migrated.
65098094945SNaoya Horiguchi  *
65198094945SNaoya Horiguchi  * If pages found in a given range are on a set of nodes (determined by
65298094945SNaoya Horiguchi  * @nodes and @flags), they are isolated and queued to the pagelist
65398094945SNaoya Horiguchi  * passed via @private.
654dc9aa5b9SChristoph Lameter  */
6551da177e4SLinus Torvalds static struct vm_area_struct *
65698094945SNaoya Horiguchi queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
65738e35860SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags, void *private)
6581da177e4SLinus Torvalds {
6591da177e4SLinus Torvalds 	int err;
6601da177e4SLinus Torvalds 	struct vm_area_struct *first, *vma, *prev;
6611da177e4SLinus Torvalds 
662053837fcSNick Piggin 
6631da177e4SLinus Torvalds 	first = find_vma(mm, start);
6641da177e4SLinus Torvalds 	if (!first)
6651da177e4SLinus Torvalds 		return ERR_PTR(-EFAULT);
6661da177e4SLinus Torvalds 	prev = NULL;
6671da177e4SLinus Torvalds 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
6685b952b3cSAndi Kleen 		unsigned long endvma = vma->vm_end;
669dc9aa5b9SChristoph Lameter 
6705b952b3cSAndi Kleen 		if (endvma > end)
6715b952b3cSAndi Kleen 			endvma = end;
6725b952b3cSAndi Kleen 		if (vma->vm_start > start)
6735b952b3cSAndi Kleen 			start = vma->vm_start;
674b24f53a0SLee Schermerhorn 
675b24f53a0SLee Schermerhorn 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
676b24f53a0SLee Schermerhorn 			if (!vma->vm_next && vma->vm_end < end)
677b24f53a0SLee Schermerhorn 				return ERR_PTR(-EFAULT);
678b24f53a0SLee Schermerhorn 			if (prev && prev->vm_end < vma->vm_start)
679b24f53a0SLee Schermerhorn 				return ERR_PTR(-EFAULT);
680b24f53a0SLee Schermerhorn 		}
681b24f53a0SLee Schermerhorn 
682b24f53a0SLee Schermerhorn 		if (flags & MPOL_MF_LAZY) {
683b24f53a0SLee Schermerhorn 			change_prot_numa(vma, start, endvma);
684b24f53a0SLee Schermerhorn 			goto next;
685b24f53a0SLee Schermerhorn 		}
686b24f53a0SLee Schermerhorn 
687b24f53a0SLee Schermerhorn 		if ((flags & MPOL_MF_STRICT) ||
688b24f53a0SLee Schermerhorn 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
689b24f53a0SLee Schermerhorn 		      vma_migratable(vma))) {
690b24f53a0SLee Schermerhorn 
69198094945SNaoya Horiguchi 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
69238e35860SChristoph Lameter 						flags, private);
6931da177e4SLinus Torvalds 			if (err) {
6941da177e4SLinus Torvalds 				first = ERR_PTR(err);
6951da177e4SLinus Torvalds 				break;
6961da177e4SLinus Torvalds 			}
6971da177e4SLinus Torvalds 		}
698b24f53a0SLee Schermerhorn next:
6991da177e4SLinus Torvalds 		prev = vma;
7001da177e4SLinus Torvalds 	}
7011da177e4SLinus Torvalds 	return first;
7021da177e4SLinus Torvalds }
7031da177e4SLinus Torvalds 
704869833f2SKOSAKI Motohiro /*
705869833f2SKOSAKI Motohiro  * Apply policy to a single VMA
706869833f2SKOSAKI Motohiro  * This must be called with the mmap_sem held for writing.
707869833f2SKOSAKI Motohiro  */
708869833f2SKOSAKI Motohiro static int vma_replace_policy(struct vm_area_struct *vma,
709869833f2SKOSAKI Motohiro 						struct mempolicy *pol)
7108d34694cSKOSAKI Motohiro {
711869833f2SKOSAKI Motohiro 	int err;
712869833f2SKOSAKI Motohiro 	struct mempolicy *old;
713869833f2SKOSAKI Motohiro 	struct mempolicy *new;
7148d34694cSKOSAKI Motohiro 
7158d34694cSKOSAKI Motohiro 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
7168d34694cSKOSAKI Motohiro 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
7178d34694cSKOSAKI Motohiro 		 vma->vm_ops, vma->vm_file,
7188d34694cSKOSAKI Motohiro 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
7198d34694cSKOSAKI Motohiro 
720869833f2SKOSAKI Motohiro 	new = mpol_dup(pol);
721869833f2SKOSAKI Motohiro 	if (IS_ERR(new))
722869833f2SKOSAKI Motohiro 		return PTR_ERR(new);
723869833f2SKOSAKI Motohiro 
724869833f2SKOSAKI Motohiro 	if (vma->vm_ops && vma->vm_ops->set_policy) {
7258d34694cSKOSAKI Motohiro 		err = vma->vm_ops->set_policy(vma, new);
726869833f2SKOSAKI Motohiro 		if (err)
727869833f2SKOSAKI Motohiro 			goto err_out;
7288d34694cSKOSAKI Motohiro 	}
729869833f2SKOSAKI Motohiro 
730869833f2SKOSAKI Motohiro 	old = vma->vm_policy;
731869833f2SKOSAKI Motohiro 	vma->vm_policy = new; /* protected by mmap_sem */
732869833f2SKOSAKI Motohiro 	mpol_put(old);
733869833f2SKOSAKI Motohiro 
734869833f2SKOSAKI Motohiro 	return 0;
735869833f2SKOSAKI Motohiro  err_out:
736869833f2SKOSAKI Motohiro 	mpol_put(new);
7378d34694cSKOSAKI Motohiro 	return err;
7388d34694cSKOSAKI Motohiro }
7398d34694cSKOSAKI Motohiro 
7401da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */
7419d8cebd4SKOSAKI Motohiro static int mbind_range(struct mm_struct *mm, unsigned long start,
7429d8cebd4SKOSAKI Motohiro 		       unsigned long end, struct mempolicy *new_pol)
7431da177e4SLinus Torvalds {
7441da177e4SLinus Torvalds 	struct vm_area_struct *next;
7459d8cebd4SKOSAKI Motohiro 	struct vm_area_struct *prev;
7469d8cebd4SKOSAKI Motohiro 	struct vm_area_struct *vma;
7479d8cebd4SKOSAKI Motohiro 	int err = 0;
748e26a5114SKOSAKI Motohiro 	pgoff_t pgoff;
7499d8cebd4SKOSAKI Motohiro 	unsigned long vmstart;
7509d8cebd4SKOSAKI Motohiro 	unsigned long vmend;
7511da177e4SLinus Torvalds 
752097d5910SLinus Torvalds 	vma = find_vma(mm, start);
7539d8cebd4SKOSAKI Motohiro 	if (!vma || vma->vm_start > start)
7549d8cebd4SKOSAKI Motohiro 		return -EFAULT;
7559d8cebd4SKOSAKI Motohiro 
756097d5910SLinus Torvalds 	prev = vma->vm_prev;
757e26a5114SKOSAKI Motohiro 	if (start > vma->vm_start)
758e26a5114SKOSAKI Motohiro 		prev = vma;
759e26a5114SKOSAKI Motohiro 
7609d8cebd4SKOSAKI Motohiro 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
7611da177e4SLinus Torvalds 		next = vma->vm_next;
7629d8cebd4SKOSAKI Motohiro 		vmstart = max(start, vma->vm_start);
7639d8cebd4SKOSAKI Motohiro 		vmend   = min(end, vma->vm_end);
7649d8cebd4SKOSAKI Motohiro 
765e26a5114SKOSAKI Motohiro 		if (mpol_equal(vma_policy(vma), new_pol))
766e26a5114SKOSAKI Motohiro 			continue;
767e26a5114SKOSAKI Motohiro 
768e26a5114SKOSAKI Motohiro 		pgoff = vma->vm_pgoff +
769e26a5114SKOSAKI Motohiro 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
7709d8cebd4SKOSAKI Motohiro 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
771e26a5114SKOSAKI Motohiro 				  vma->anon_vma, vma->vm_file, pgoff,
7728aacc9f5SCaspar Zhang 				  new_pol);
7739d8cebd4SKOSAKI Motohiro 		if (prev) {
7749d8cebd4SKOSAKI Motohiro 			vma = prev;
7759d8cebd4SKOSAKI Motohiro 			next = vma->vm_next;
7763964acd0SOleg Nesterov 			if (mpol_equal(vma_policy(vma), new_pol))
7779d8cebd4SKOSAKI Motohiro 				continue;
7783964acd0SOleg Nesterov 			/* vma_merge() joined vma && vma->next, case 8 */
7793964acd0SOleg Nesterov 			goto replace;
7801da177e4SLinus Torvalds 		}
7819d8cebd4SKOSAKI Motohiro 		if (vma->vm_start != vmstart) {
7829d8cebd4SKOSAKI Motohiro 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
7839d8cebd4SKOSAKI Motohiro 			if (err)
7849d8cebd4SKOSAKI Motohiro 				goto out;
7859d8cebd4SKOSAKI Motohiro 		}
7869d8cebd4SKOSAKI Motohiro 		if (vma->vm_end != vmend) {
7879d8cebd4SKOSAKI Motohiro 			err = split_vma(vma->vm_mm, vma, vmend, 0);
7889d8cebd4SKOSAKI Motohiro 			if (err)
7899d8cebd4SKOSAKI Motohiro 				goto out;
7909d8cebd4SKOSAKI Motohiro 		}
7913964acd0SOleg Nesterov  replace:
792869833f2SKOSAKI Motohiro 		err = vma_replace_policy(vma, new_pol);
7939d8cebd4SKOSAKI Motohiro 		if (err)
7949d8cebd4SKOSAKI Motohiro 			goto out;
7959d8cebd4SKOSAKI Motohiro 	}
7969d8cebd4SKOSAKI Motohiro 
7979d8cebd4SKOSAKI Motohiro  out:
7981da177e4SLinus Torvalds 	return err;
7991da177e4SLinus Torvalds }
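/*
 * Worked example (added for illustration): an mbind() call that covers only
 * the middle of a single VMA takes both split_vma() branches above, leaving
 * three VMAs with new_pol installed on the middle one; if a neighbouring
 * VMA already carries an equal policy, vma_merge() coalesces it with the
 * affected range instead of splitting.
 */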
8001da177e4SLinus Torvalds 
8011da177e4SLinus Torvalds /* Set the process memory policy */
802028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
803028fec41SDavid Rientjes 			     nodemask_t *nodes)
8041da177e4SLinus Torvalds {
80558568d2aSMiao Xie 	struct mempolicy *new, *old;
806f4e53d91SLee Schermerhorn 	struct mm_struct *mm = current->mm;
8074bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH(scratch);
80858568d2aSMiao Xie 	int ret;
8091da177e4SLinus Torvalds 
8104bfc4495SKAMEZAWA Hiroyuki 	if (!scratch)
8114bfc4495SKAMEZAWA Hiroyuki 		return -ENOMEM;
812f4e53d91SLee Schermerhorn 
8134bfc4495SKAMEZAWA Hiroyuki 	new = mpol_new(mode, flags, nodes);
8144bfc4495SKAMEZAWA Hiroyuki 	if (IS_ERR(new)) {
8154bfc4495SKAMEZAWA Hiroyuki 		ret = PTR_ERR(new);
8164bfc4495SKAMEZAWA Hiroyuki 		goto out;
8174bfc4495SKAMEZAWA Hiroyuki 	}
818f4e53d91SLee Schermerhorn 	/*
819f4e53d91SLee Schermerhorn 	 * prevent changing our mempolicy while show_numa_maps()
820f4e53d91SLee Schermerhorn 	 * is using it.
821f4e53d91SLee Schermerhorn 	 * Note:  do_set_mempolicy() can be called at init time
822f4e53d91SLee Schermerhorn 	 * with no 'mm'.
823f4e53d91SLee Schermerhorn 	 */
824f4e53d91SLee Schermerhorn 	if (mm)
825f4e53d91SLee Schermerhorn 		down_write(&mm->mmap_sem);
82658568d2aSMiao Xie 	task_lock(current);
8274bfc4495SKAMEZAWA Hiroyuki 	ret = mpol_set_nodemask(new, nodes, scratch);
82858568d2aSMiao Xie 	if (ret) {
82958568d2aSMiao Xie 		task_unlock(current);
83058568d2aSMiao Xie 		if (mm)
83158568d2aSMiao Xie 			up_write(&mm->mmap_sem);
83258568d2aSMiao Xie 		mpol_put(new);
8334bfc4495SKAMEZAWA Hiroyuki 		goto out;
83458568d2aSMiao Xie 	}
83558568d2aSMiao Xie 	old = current->mempolicy;
8361da177e4SLinus Torvalds 	current->mempolicy = new;
83745c4745aSLee Schermerhorn 	if (new && new->mode == MPOL_INTERLEAVE &&
838f5b087b5SDavid Rientjes 	    nodes_weight(new->v.nodes))
839dfcd3c0dSAndi Kleen 		current->il_next = first_node(new->v.nodes);
84058568d2aSMiao Xie 	task_unlock(current);
841f4e53d91SLee Schermerhorn 	if (mm)
842f4e53d91SLee Schermerhorn 		up_write(&mm->mmap_sem);
843f4e53d91SLee Schermerhorn 
84458568d2aSMiao Xie 	mpol_put(old);
8454bfc4495SKAMEZAWA Hiroyuki 	ret = 0;
8464bfc4495SKAMEZAWA Hiroyuki out:
8474bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH_FREE(scratch);
8484bfc4495SKAMEZAWA Hiroyuki 	return ret;
8491da177e4SLinus Torvalds }
8501da177e4SLinus Torvalds 
851bea904d5SLee Schermerhorn /*
852bea904d5SLee Schermerhorn  * Return the nodemask for a policy, for the get_mempolicy() query
85358568d2aSMiao Xie  *
85458568d2aSMiao Xie  * Called with task's alloc_lock held
855bea904d5SLee Schermerhorn  */
856bea904d5SLee Schermerhorn static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
8571da177e4SLinus Torvalds {
858dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
859bea904d5SLee Schermerhorn 	if (p == &default_policy)
860bea904d5SLee Schermerhorn 		return;
861bea904d5SLee Schermerhorn 
86245c4745aSLee Schermerhorn 	switch (p->mode) {
86319770b32SMel Gorman 	case MPOL_BIND:
86419770b32SMel Gorman 		/* Fall through */
8651da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
866dfcd3c0dSAndi Kleen 		*nodes = p->v.nodes;
8671da177e4SLinus Torvalds 		break;
8681da177e4SLinus Torvalds 	case MPOL_PREFERRED:
869fc36b8d3SLee Schermerhorn 		if (!(p->flags & MPOL_F_LOCAL))
870dfcd3c0dSAndi Kleen 			node_set(p->v.preferred_node, *nodes);
87153f2556bSLee Schermerhorn 		/* else return empty node mask for local allocation */
8721da177e4SLinus Torvalds 		break;
8731da177e4SLinus Torvalds 	default:
8741da177e4SLinus Torvalds 		BUG();
8751da177e4SLinus Torvalds 	}
8761da177e4SLinus Torvalds }
8771da177e4SLinus Torvalds 
8781da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr)
8791da177e4SLinus Torvalds {
8801da177e4SLinus Torvalds 	struct page *p;
8811da177e4SLinus Torvalds 	int err;
8821da177e4SLinus Torvalds 
8831da177e4SLinus Torvalds 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
8841da177e4SLinus Torvalds 	if (err >= 0) {
8851da177e4SLinus Torvalds 		err = page_to_nid(p);
8861da177e4SLinus Torvalds 		put_page(p);
8871da177e4SLinus Torvalds 	}
8881da177e4SLinus Torvalds 	return err;
8891da177e4SLinus Torvalds }
8901da177e4SLinus Torvalds 
8911da177e4SLinus Torvalds /* Retrieve NUMA policy */
892dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
8931da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
8941da177e4SLinus Torvalds {
8958bccd85fSChristoph Lameter 	int err;
8961da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
8971da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
8981da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
8991da177e4SLinus Torvalds 
900754af6f5SLee Schermerhorn 	if (flags &
901754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
9021da177e4SLinus Torvalds 		return -EINVAL;
903754af6f5SLee Schermerhorn 
904754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
905754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
906754af6f5SLee Schermerhorn 			return -EINVAL;
907754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
90858568d2aSMiao Xie 		task_lock(current);
909754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
91058568d2aSMiao Xie 		task_unlock(current);
911754af6f5SLee Schermerhorn 		return 0;
912754af6f5SLee Schermerhorn 	}
913754af6f5SLee Schermerhorn 
9141da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
915bea904d5SLee Schermerhorn 		/*
916bea904d5SLee Schermerhorn 		 * Do NOT fall back to task policy if the
917bea904d5SLee Schermerhorn 		 * vma/shared policy at addr is NULL.  We
918bea904d5SLee Schermerhorn 		 * want to return MPOL_DEFAULT in this case.
919bea904d5SLee Schermerhorn 		 */
9201da177e4SLinus Torvalds 		down_read(&mm->mmap_sem);
9211da177e4SLinus Torvalds 		vma = find_vma_intersection(mm, addr, addr+1);
9221da177e4SLinus Torvalds 		if (!vma) {
9231da177e4SLinus Torvalds 			up_read(&mm->mmap_sem);
9241da177e4SLinus Torvalds 			return -EFAULT;
9251da177e4SLinus Torvalds 		}
9261da177e4SLinus Torvalds 		if (vma->vm_ops && vma->vm_ops->get_policy)
9271da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
9281da177e4SLinus Torvalds 		else
9291da177e4SLinus Torvalds 			pol = vma->vm_policy;
9301da177e4SLinus Torvalds 	} else if (addr)
9311da177e4SLinus Torvalds 		return -EINVAL;
9321da177e4SLinus Torvalds 
9331da177e4SLinus Torvalds 	if (!pol)
934bea904d5SLee Schermerhorn 		pol = &default_policy;	/* indicates default behavior */
9351da177e4SLinus Torvalds 
9361da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
9371da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
9381da177e4SLinus Torvalds 			err = lookup_node(mm, addr);
9391da177e4SLinus Torvalds 			if (err < 0)
9401da177e4SLinus Torvalds 				goto out;
9418bccd85fSChristoph Lameter 			*policy = err;
9421da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
94345c4745aSLee Schermerhorn 				pol->mode == MPOL_INTERLEAVE) {
9448bccd85fSChristoph Lameter 			*policy = current->il_next;
9451da177e4SLinus Torvalds 		} else {
9461da177e4SLinus Torvalds 			err = -EINVAL;
9471da177e4SLinus Torvalds 			goto out;
9481da177e4SLinus Torvalds 		}
949bea904d5SLee Schermerhorn 	} else {
950bea904d5SLee Schermerhorn 		*policy = pol == &default_policy ? MPOL_DEFAULT :
951bea904d5SLee Schermerhorn 						pol->mode;
952d79df630SDavid Rientjes 		/*
953d79df630SDavid Rientjes 		 * Internal mempolicy flags must be masked off before exposing
954d79df630SDavid Rientjes 		 * the policy to userspace.
955d79df630SDavid Rientjes 		 */
956d79df630SDavid Rientjes 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
957bea904d5SLee Schermerhorn 	}
9581da177e4SLinus Torvalds 
9591da177e4SLinus Torvalds 	if (vma) {
9601da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
9611da177e4SLinus Torvalds 		vma = NULL;
9621da177e4SLinus Torvalds 	}
9631da177e4SLinus Torvalds 
9641da177e4SLinus Torvalds 	err = 0;
96558568d2aSMiao Xie 	if (nmask) {
966c6b6ef8bSLee Schermerhorn 		if (mpol_store_user_nodemask(pol)) {
967c6b6ef8bSLee Schermerhorn 			*nmask = pol->w.user_nodemask;
968c6b6ef8bSLee Schermerhorn 		} else {
96958568d2aSMiao Xie 			task_lock(current);
970bea904d5SLee Schermerhorn 			get_policy_nodemask(pol, nmask);
97158568d2aSMiao Xie 			task_unlock(current);
97258568d2aSMiao Xie 		}
973c6b6ef8bSLee Schermerhorn 	}
9741da177e4SLinus Torvalds 
9751da177e4SLinus Torvalds  out:
97652cd3b07SLee Schermerhorn 	mpol_cond_put(pol);
9771da177e4SLinus Torvalds 	if (vma)
9781da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
9791da177e4SLinus Torvalds 	return err;
9801da177e4SLinus Torvalds }
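/*
 * Illustrative userspace sketch (added for this write-up, not part of the
 * kernel sources): querying the node backing an address, mirroring the
 * MPOL_F_NODE|MPOL_F_ADDR handling above.  It assumes libnuma's
 * <numaif.h>; error handling is omitted.
 *
 *	#include <numaif.h>
 *
 *	static int node_of(void *addr)
 *	{
 *		int node = -1;
 *
 *		// With MPOL_F_NODE|MPOL_F_ADDR the kernel reports the node
 *		// on which the page at addr is currently allocated.
 *		get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *		return node;
 *	}
 */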
9811da177e4SLinus Torvalds 
982b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
9838bccd85fSChristoph Lameter /*
9846ce3c4c0SChristoph Lameter  * page migration
9856ce3c4c0SChristoph Lameter  */
986fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
987fc301289SChristoph Lameter 				unsigned long flags)
9886ce3c4c0SChristoph Lameter {
9896ce3c4c0SChristoph Lameter 	/*
990fc301289SChristoph Lameter 	 * Avoid migrating a page that is shared with others.
9916ce3c4c0SChristoph Lameter 	 */
99262695a84SNick Piggin 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
99362695a84SNick Piggin 		if (!isolate_lru_page(page)) {
99462695a84SNick Piggin 			list_add_tail(&page->lru, pagelist);
9956d9c285aSKOSAKI Motohiro 			inc_zone_page_state(page, NR_ISOLATED_ANON +
9966d9c285aSKOSAKI Motohiro 					    page_is_file_cache(page));
99762695a84SNick Piggin 		}
99862695a84SNick Piggin 	}
9996ce3c4c0SChristoph Lameter }
10006ce3c4c0SChristoph Lameter 
1001742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x)
100295a402c3SChristoph Lameter {
1003e2d8cf40SNaoya Horiguchi 	if (PageHuge(page))
1004e2d8cf40SNaoya Horiguchi 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1005e2d8cf40SNaoya Horiguchi 					node);
1006e2d8cf40SNaoya Horiguchi 	else
10076484eb3eSMel Gorman 		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
100895a402c3SChristoph Lameter }
100995a402c3SChristoph Lameter 
10106ce3c4c0SChristoph Lameter /*
10117e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
10127e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
10137e2ab150SChristoph Lameter  */
1014dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1015dbcb0f19SAdrian Bunk 			   int flags)
10167e2ab150SChristoph Lameter {
10177e2ab150SChristoph Lameter 	nodemask_t nmask;
10187e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
10197e2ab150SChristoph Lameter 	int err = 0;
10207e2ab150SChristoph Lameter 
10217e2ab150SChristoph Lameter 	nodes_clear(nmask);
10227e2ab150SChristoph Lameter 	node_set(source, nmask);
10237e2ab150SChristoph Lameter 
102408270807SMinchan Kim 	/*
102508270807SMinchan Kim 	 * This does not "check" the range but isolates all pages that
102608270807SMinchan Kim 	 * need migration.  Between passing in the full user address
102708270807SMinchan Kim 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
102808270807SMinchan Kim 	 */
102908270807SMinchan Kim 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
103098094945SNaoya Horiguchi 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
10317e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
10327e2ab150SChristoph Lameter 
1033cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
103468711a74SDavid Rientjes 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
10359c620e2bSHugh Dickins 					MIGRATE_SYNC, MR_SYSCALL);
1036cf608ac1SMinchan Kim 		if (err)
1037e2d8cf40SNaoya Horiguchi 			putback_movable_pages(&pagelist);
1038cf608ac1SMinchan Kim 	}
103995a402c3SChristoph Lameter 
10407e2ab150SChristoph Lameter 	return err;
10417e2ab150SChristoph Lameter }
10427e2ab150SChristoph Lameter 
10437e2ab150SChristoph Lameter /*
10447e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
10457e2ab150SChristoph Lameter  * layout as much as possible.
104639743889SChristoph Lameter  *
104739743889SChristoph Lameter  * Returns the number of pages that could not be moved.
104839743889SChristoph Lameter  */
10490ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
10500ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
105139743889SChristoph Lameter {
10527e2ab150SChristoph Lameter 	int busy = 0;
10530aedadf9SChristoph Lameter 	int err;
10547e2ab150SChristoph Lameter 	nodemask_t tmp;
105539743889SChristoph Lameter 
10560aedadf9SChristoph Lameter 	err = migrate_prep();
10570aedadf9SChristoph Lameter 	if (err)
10580aedadf9SChristoph Lameter 		return err;
10590aedadf9SChristoph Lameter 
106039743889SChristoph Lameter 	down_read(&mm->mmap_sem);
1061d4984711SChristoph Lameter 
10620ce72d4fSAndrew Morton 	err = migrate_vmas(mm, from, to, flags);
10637b2259b3SChristoph Lameter 	if (err)
10647b2259b3SChristoph Lameter 		goto out;
10657b2259b3SChristoph Lameter 
10667e2ab150SChristoph Lameter 	/*
10677e2ab150SChristoph Lameter 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
10687e2ab150SChristoph Lameter 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
10697e2ab150SChristoph Lameter 	 * bit in 'tmp', and return that <source, dest> pair for migration.
10707e2ab150SChristoph Lameter 	 * The pair of nodemasks 'to' and 'from' define the map.
10717e2ab150SChristoph Lameter 	 *
10727e2ab150SChristoph Lameter 	 * If no pair of bits is found that way, fallback to picking some
10737e2ab150SChristoph Lameter 	 * pair of 'source' and 'dest' bits that are not the same.  If the
10747e2ab150SChristoph Lameter 	 * 'source' and 'dest' bits are the same, this represents a node
10757e2ab150SChristoph Lameter 	 * that will be migrating to itself, so no pages need move.
10767e2ab150SChristoph Lameter 	 *
10777e2ab150SChristoph Lameter 	 * If no bits are left in 'tmp', or if all remaining bits left
10787e2ab150SChristoph Lameter 	 * in 'tmp' correspond to the same bit in 'to', return false
10797e2ab150SChristoph Lameter 	 * (nothing left to migrate).
10807e2ab150SChristoph Lameter 	 *
10817e2ab150SChristoph Lameter 	 * This lets us pick a pair of nodes to migrate between, such that
10827e2ab150SChristoph Lameter 	 * if possible the dest node is not already occupied by some other
10837e2ab150SChristoph Lameter 	 * source node, minimizing the risk of overloading the memory on a
10847e2ab150SChristoph Lameter 	 * node that would happen if we migrated incoming memory to a node
10857e2ab150SChristoph Lameter 	 * before migrating outgoing memory off that same node.
10867e2ab150SChristoph Lameter 	 *
10877e2ab150SChristoph Lameter 	 * A single scan of tmp is sufficient.  As we go, we remember the
10887e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
10897e2ab150SChristoph Lameter 	 * that not only moved, but what's better, moved to an empty slot
10907e2ab150SChristoph Lameter 	 * (d is not set in tmp), then we break out with that pair.
1091ae0e47f0SJustin P. Mattock 	 * Otherwise when we finish scanning tmp, we at least have the
10927e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved.  If we get all the way through
10937e2ab150SChristoph Lameter 	 * the scan of tmp without finding any node that moved, much less
10947e2ab150SChristoph Lameter 	 * moved to an empty node, then there is nothing left worth migrating.
10957e2ab150SChristoph Lameter 	 */
10967e2ab150SChristoph Lameter 
10970ce72d4fSAndrew Morton 	tmp = *from;
10987e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
10997e2ab150SChristoph Lameter 		int s,d;
1100b76ac7e7SJianguo Wu 		int source = NUMA_NO_NODE;
11017e2ab150SChristoph Lameter 		int dest = 0;
11027e2ab150SChristoph Lameter 
11037e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
11044a5b18ccSLarry Woodman 
11054a5b18ccSLarry Woodman 			/*
11064a5b18ccSLarry Woodman 			 * do_migrate_pages() tries to maintain the relative
11074a5b18ccSLarry Woodman 			 * node relationship of the pages established between
11084a5b18ccSLarry Woodman 			 * threads and memory areas.
11094a5b18ccSLarry Woodman 			 *
11104a5b18ccSLarry Woodman 			 * However if the number of source nodes is not equal to
11114a5b18ccSLarry Woodman 			 * the number of destination nodes we can not preserve
11124a5b18ccSLarry Woodman 			 * this node relative relationship.  In that case, skip
11134a5b18ccSLarry Woodman 			 * copying memory from a node that is in the destination
11144a5b18ccSLarry Woodman 			 * mask.
11154a5b18ccSLarry Woodman 			 *
11164a5b18ccSLarry Woodman 			 * Example: [2,3,4] -> [3,4,5] moves everything.
11174a5b18ccSLarry Woodman 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
11184a5b18ccSLarry Woodman 			 */
11194a5b18ccSLarry Woodman 
11200ce72d4fSAndrew Morton 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
11210ce72d4fSAndrew Morton 						(node_isset(s, *to)))
11224a5b18ccSLarry Woodman 				continue;
11234a5b18ccSLarry Woodman 
11240ce72d4fSAndrew Morton 			d = node_remap(s, *from, *to);
11257e2ab150SChristoph Lameter 			if (s == d)
11267e2ab150SChristoph Lameter 				continue;
11277e2ab150SChristoph Lameter 
11287e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
11297e2ab150SChristoph Lameter 			dest = d;
11307e2ab150SChristoph Lameter 
11317e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
11327e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
11337e2ab150SChristoph Lameter 				break;
11347e2ab150SChristoph Lameter 		}
1135b76ac7e7SJianguo Wu 		if (source == NUMA_NO_NODE)
11367e2ab150SChristoph Lameter 			break;
11377e2ab150SChristoph Lameter 
11387e2ab150SChristoph Lameter 		node_clear(source, tmp);
11397e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
11407e2ab150SChristoph Lameter 		if (err > 0)
11417e2ab150SChristoph Lameter 			busy += err;
11427e2ab150SChristoph Lameter 		if (err < 0)
11437e2ab150SChristoph Lameter 			break;
114439743889SChristoph Lameter 	}
11457b2259b3SChristoph Lameter out:
114639743889SChristoph Lameter 	up_read(&mm->mmap_sem);
11477e2ab150SChristoph Lameter 	if (err < 0)
11487e2ab150SChristoph Lameter 		return err;
11497e2ab150SChristoph Lameter 	return busy;
1150b20a3503SChristoph Lameter 
115139743889SChristoph Lameter }
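/*
 * Illustrative worked example (not part of this file): with from = {0,1}
 * and to = {2,3}, node_remap(0, from, to) is 2 and node_remap(1, from, to)
 * is 3, so the loop above issues migrate_to_node(mm, 0, 2, flags) and then
 * migrate_to_node(mm, 1, 3, flags), clearing each source bit from 'tmp' as
 * it goes.  Neither destination lies in 'tmp', so both pairs take the
 * "moved to an empty slot" fast path described in the comment above.
 */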
115239743889SChristoph Lameter 
11533ad33b24SLee Schermerhorn /*
11543ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
11553ad33b24SLee Schermerhorn  * Start assuming that page is mapped by vma pointed to by @private.
11563ad33b24SLee Schermerhorn  * Search forward from there, if not.  N.B., this assumes that the
11573ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
11583ad33b24SLee Schermerhorn  * is in virtual address order.
11593ad33b24SLee Schermerhorn  */
1160742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
116195a402c3SChristoph Lameter {
116295a402c3SChristoph Lameter 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
11633ad33b24SLee Schermerhorn 	unsigned long uninitialized_var(address);
116495a402c3SChristoph Lameter 
11653ad33b24SLee Schermerhorn 	while (vma) {
11663ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
11673ad33b24SLee Schermerhorn 		if (address != -EFAULT)
11683ad33b24SLee Schermerhorn 			break;
11693ad33b24SLee Schermerhorn 		vma = vma->vm_next;
11703ad33b24SLee Schermerhorn 	}
11713ad33b24SLee Schermerhorn 
117211c731e8SWanpeng Li 	if (PageHuge(page)) {
1173cc81717eSMichal Hocko 		BUG_ON(!vma);
117474060e4dSNaoya Horiguchi 		return alloc_huge_page_noerr(vma, address, 1);
117511c731e8SWanpeng Li 	}
117611c731e8SWanpeng Li 	/*
117711c731e8SWanpeng Li 	 * if !vma, alloc_page_vma() will use task or system default policy
117811c731e8SWanpeng Li 	 */
11793ad33b24SLee Schermerhorn 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
118095a402c3SChristoph Lameter }
1181b20a3503SChristoph Lameter #else
1182b20a3503SChristoph Lameter 
1183b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
1184b20a3503SChristoph Lameter 				unsigned long flags)
1185b20a3503SChristoph Lameter {
1186b20a3503SChristoph Lameter }
1187b20a3503SChristoph Lameter 
11880ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
11890ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
1190b20a3503SChristoph Lameter {
1191b20a3503SChristoph Lameter 	return -ENOSYS;
1192b20a3503SChristoph Lameter }
119395a402c3SChristoph Lameter 
119469939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
119595a402c3SChristoph Lameter {
119695a402c3SChristoph Lameter 	return NULL;
119795a402c3SChristoph Lameter }
1198b20a3503SChristoph Lameter #endif
1199b20a3503SChristoph Lameter 
1200dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
1201028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
1202028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
12036ce3c4c0SChristoph Lameter {
12046ce3c4c0SChristoph Lameter 	struct vm_area_struct *vma;
12056ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
12066ce3c4c0SChristoph Lameter 	struct mempolicy *new;
12076ce3c4c0SChristoph Lameter 	unsigned long end;
12086ce3c4c0SChristoph Lameter 	int err;
12096ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
12106ce3c4c0SChristoph Lameter 
1211b24f53a0SLee Schermerhorn 	if (flags & ~(unsigned long)MPOL_MF_VALID)
12126ce3c4c0SChristoph Lameter 		return -EINVAL;
121374c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
12146ce3c4c0SChristoph Lameter 		return -EPERM;
12156ce3c4c0SChristoph Lameter 
12166ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
12176ce3c4c0SChristoph Lameter 		return -EINVAL;
12186ce3c4c0SChristoph Lameter 
12196ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
12206ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
12216ce3c4c0SChristoph Lameter 
12226ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
12236ce3c4c0SChristoph Lameter 	end = start + len;
12246ce3c4c0SChristoph Lameter 
12256ce3c4c0SChristoph Lameter 	if (end < start)
12266ce3c4c0SChristoph Lameter 		return -EINVAL;
12276ce3c4c0SChristoph Lameter 	if (end == start)
12286ce3c4c0SChristoph Lameter 		return 0;
12296ce3c4c0SChristoph Lameter 
1230028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
12316ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
12326ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
12336ce3c4c0SChristoph Lameter 
1234b24f53a0SLee Schermerhorn 	if (flags & MPOL_MF_LAZY)
1235b24f53a0SLee Schermerhorn 		new->flags |= MPOL_F_MOF;
1236b24f53a0SLee Schermerhorn 
12376ce3c4c0SChristoph Lameter 	/*
12386ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operations
12396ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
12406ce3c4c0SChristoph Lameter 	 */
12416ce3c4c0SChristoph Lameter 	if (!new)
12426ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
12436ce3c4c0SChristoph Lameter 
1244028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1245028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
124600ef2d2fSDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
12476ce3c4c0SChristoph Lameter 
12480aedadf9SChristoph Lameter 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
12490aedadf9SChristoph Lameter 
12500aedadf9SChristoph Lameter 		err = migrate_prep();
12510aedadf9SChristoph Lameter 		if (err)
1252b05ca738SKOSAKI Motohiro 			goto mpol_out;
12530aedadf9SChristoph Lameter 	}
12544bfc4495SKAMEZAWA Hiroyuki 	{
12554bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
12564bfc4495SKAMEZAWA Hiroyuki 		if (scratch) {
12576ce3c4c0SChristoph Lameter 			down_write(&mm->mmap_sem);
125858568d2aSMiao Xie 			task_lock(current);
12594bfc4495SKAMEZAWA Hiroyuki 			err = mpol_set_nodemask(new, nmask, scratch);
126058568d2aSMiao Xie 			task_unlock(current);
12614bfc4495SKAMEZAWA Hiroyuki 			if (err)
126258568d2aSMiao Xie 				up_write(&mm->mmap_sem);
12634bfc4495SKAMEZAWA Hiroyuki 		} else
12644bfc4495SKAMEZAWA Hiroyuki 			err = -ENOMEM;
12654bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
12664bfc4495SKAMEZAWA Hiroyuki 	}
1267b05ca738SKOSAKI Motohiro 	if (err)
1268b05ca738SKOSAKI Motohiro 		goto mpol_out;
1269b05ca738SKOSAKI Motohiro 
127098094945SNaoya Horiguchi 	vma = queue_pages_range(mm, start, end, nmask,
12716ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
12726ce3c4c0SChristoph Lameter 
1273b24f53a0SLee Schermerhorn 	err = PTR_ERR(vma);	/* maybe ... */
1274a720094dSMel Gorman 	if (!IS_ERR(vma))
12759d8cebd4SKOSAKI Motohiro 		err = mbind_range(mm, start, end, new);
12767e2ab150SChristoph Lameter 
1277b24f53a0SLee Schermerhorn 	if (!err) {
1278b24f53a0SLee Schermerhorn 		int nr_failed = 0;
1279b24f53a0SLee Schermerhorn 
1280cf608ac1SMinchan Kim 		if (!list_empty(&pagelist)) {
1281b24f53a0SLee Schermerhorn 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
128295a402c3SChristoph Lameter 			nr_failed = migrate_pages(&pagelist, new_vma_page,
128368711a74SDavid Rientjes 					NULL, (unsigned long)vma,
12849c620e2bSHugh Dickins 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1285cf608ac1SMinchan Kim 			if (nr_failed)
128674060e4dSNaoya Horiguchi 				putback_movable_pages(&pagelist);
1287cf608ac1SMinchan Kim 		}
12886ce3c4c0SChristoph Lameter 
1289b24f53a0SLee Schermerhorn 		if (nr_failed && (flags & MPOL_MF_STRICT))
12906ce3c4c0SChristoph Lameter 			err = -EIO;
1291ab8a3e14SKOSAKI Motohiro 	} else
1292b0e5fd73SJoonsoo Kim 		putback_movable_pages(&pagelist);
1293b20a3503SChristoph Lameter 
12946ce3c4c0SChristoph Lameter 	up_write(&mm->mmap_sem);
1295b05ca738SKOSAKI Motohiro  mpol_out:
1296f0be3d32SLee Schermerhorn 	mpol_put(new);
12976ce3c4c0SChristoph Lameter 	return err;
12986ce3c4c0SChristoph Lameter }
12996ce3c4c0SChristoph Lameter 
130039743889SChristoph Lameter /*
13018bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
13028bccd85fSChristoph Lameter  */
13038bccd85fSChristoph Lameter 
13048bccd85fSChristoph Lameter /* Copy a node mask from user space. */
130539743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
13068bccd85fSChristoph Lameter 		     unsigned long maxnode)
13078bccd85fSChristoph Lameter {
13088bccd85fSChristoph Lameter 	unsigned long k;
13098bccd85fSChristoph Lameter 	unsigned long nlongs;
13108bccd85fSChristoph Lameter 	unsigned long endmask;
13118bccd85fSChristoph Lameter 
13128bccd85fSChristoph Lameter 	--maxnode;
13138bccd85fSChristoph Lameter 	nodes_clear(*nodes);
13148bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
13158bccd85fSChristoph Lameter 		return 0;
1316a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1317636f13c1SChris Wright 		return -EINVAL;
13188bccd85fSChristoph Lameter 
13198bccd85fSChristoph Lameter 	nlongs = BITS_TO_LONGS(maxnode);
13208bccd85fSChristoph Lameter 	if ((maxnode % BITS_PER_LONG) == 0)
13218bccd85fSChristoph Lameter 		endmask = ~0UL;
13228bccd85fSChristoph Lameter 	else
13238bccd85fSChristoph Lameter 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
13248bccd85fSChristoph Lameter 
13258bccd85fSChristoph Lameter 	/* When the user specified more nodes than supported, just check
13268bccd85fSChristoph Lameter 	   that the unsupported part is all zero. */
13278bccd85fSChristoph Lameter 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
13288bccd85fSChristoph Lameter 		if (nlongs > PAGE_SIZE/sizeof(long))
13298bccd85fSChristoph Lameter 			return -EINVAL;
13308bccd85fSChristoph Lameter 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
13318bccd85fSChristoph Lameter 			unsigned long t;
13328bccd85fSChristoph Lameter 			if (get_user(t, nmask + k))
13338bccd85fSChristoph Lameter 				return -EFAULT;
13348bccd85fSChristoph Lameter 			if (k == nlongs - 1) {
13358bccd85fSChristoph Lameter 				if (t & endmask)
13368bccd85fSChristoph Lameter 					return -EINVAL;
13378bccd85fSChristoph Lameter 			} else if (t)
13388bccd85fSChristoph Lameter 				return -EINVAL;
13398bccd85fSChristoph Lameter 		}
13408bccd85fSChristoph Lameter 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
13418bccd85fSChristoph Lameter 		endmask = ~0UL;
13428bccd85fSChristoph Lameter 	}
13438bccd85fSChristoph Lameter 
13448bccd85fSChristoph Lameter 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
13458bccd85fSChristoph Lameter 		return -EFAULT;
13468bccd85fSChristoph Lameter 	nodes_addr(*nodes)[nlongs-1] &= endmask;
13478bccd85fSChristoph Lameter 	return 0;
13488bccd85fSChristoph Lameter }
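/*
 * Illustrative worked example (not part of this file): on a 64-bit kernel a
 * caller passing maxnode = 66 leaves maxnode = 65 after the decrement, so
 * nlongs = BITS_TO_LONGS(65) = 2 and endmask = (1UL << 1) - 1.  Two longs
 * are copied from user space and the second is ANDed with endmask, so of
 * that long only bit 0 (node 64) survives.  The "more nodes than supported"
 * loop above runs only when maxnode exceeds MAX_NUMNODES, and then any set
 * bit in the unsupported tail yields -EINVAL.
 */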
13498bccd85fSChristoph Lameter 
13508bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
13518bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
13528bccd85fSChristoph Lameter 			      nodemask_t *nodes)
13538bccd85fSChristoph Lameter {
13548bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
13558bccd85fSChristoph Lameter 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
13568bccd85fSChristoph Lameter 
13578bccd85fSChristoph Lameter 	if (copy > nbytes) {
13588bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
13598bccd85fSChristoph Lameter 			return -EINVAL;
13608bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
13618bccd85fSChristoph Lameter 			return -EFAULT;
13628bccd85fSChristoph Lameter 		copy = nbytes;
13638bccd85fSChristoph Lameter 	}
13648bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
13658bccd85fSChristoph Lameter }
13668bccd85fSChristoph Lameter 
1367938bb9f5SHeiko Carstens SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1368f7f28ca9SRasmus Villemoes 		unsigned long, mode, const unsigned long __user *, nmask,
1369938bb9f5SHeiko Carstens 		unsigned long, maxnode, unsigned, flags)
13708bccd85fSChristoph Lameter {
13718bccd85fSChristoph Lameter 	nodemask_t nodes;
13728bccd85fSChristoph Lameter 	int err;
1373028fec41SDavid Rientjes 	unsigned short mode_flags;
13748bccd85fSChristoph Lameter 
1375028fec41SDavid Rientjes 	mode_flags = mode & MPOL_MODE_FLAGS;
1376028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1377a3b51e01SDavid Rientjes 	if (mode >= MPOL_MAX)
1378a3b51e01SDavid Rientjes 		return -EINVAL;
13794c50bc01SDavid Rientjes 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
13804c50bc01SDavid Rientjes 	    (mode_flags & MPOL_F_RELATIVE_NODES))
13814c50bc01SDavid Rientjes 		return -EINVAL;
13828bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
13838bccd85fSChristoph Lameter 	if (err)
13848bccd85fSChristoph Lameter 		return err;
1385028fec41SDavid Rientjes 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
13868bccd85fSChristoph Lameter }
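/*
 * Illustrative sketch (not part of this file): a minimal user-space caller
 * of the mbind(2) wrapper declared in libnuma's <numaif.h>, binding an
 * anonymous mapping to node 0.  All names and values below are example-only.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	size_t len = 1UL << 20;
 *	unsigned long nodemask = 1UL << 0;	// node 0 only
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mbind(buf, len, MPOL_BIND, &nodemask,
 *		  sizeof(nodemask) * 8, MPOL_MF_STRICT | MPOL_MF_MOVE))
 *		perror("mbind");
 */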
13878bccd85fSChristoph Lameter 
13888bccd85fSChristoph Lameter /* Set the process memory policy */
138923c8902dSRasmus Villemoes SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1390938bb9f5SHeiko Carstens 		unsigned long, maxnode)
13918bccd85fSChristoph Lameter {
13928bccd85fSChristoph Lameter 	int err;
13938bccd85fSChristoph Lameter 	nodemask_t nodes;
1394028fec41SDavid Rientjes 	unsigned short flags;
13958bccd85fSChristoph Lameter 
1396028fec41SDavid Rientjes 	flags = mode & MPOL_MODE_FLAGS;
1397028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1398028fec41SDavid Rientjes 	if ((unsigned int)mode >= MPOL_MAX)
13998bccd85fSChristoph Lameter 		return -EINVAL;
14004c50bc01SDavid Rientjes 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
14014c50bc01SDavid Rientjes 		return -EINVAL;
14028bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
14038bccd85fSChristoph Lameter 	if (err)
14048bccd85fSChristoph Lameter 		return err;
1405028fec41SDavid Rientjes 	return do_set_mempolicy(mode, flags, &nodes);
14068bccd85fSChristoph Lameter }
14078bccd85fSChristoph Lameter 
1408938bb9f5SHeiko Carstens SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1409938bb9f5SHeiko Carstens 		const unsigned long __user *, old_nodes,
1410938bb9f5SHeiko Carstens 		const unsigned long __user *, new_nodes)
141139743889SChristoph Lameter {
1412c69e8d9cSDavid Howells 	const struct cred *cred = current_cred(), *tcred;
1413596d7cfaSKOSAKI Motohiro 	struct mm_struct *mm = NULL;
141439743889SChristoph Lameter 	struct task_struct *task;
141539743889SChristoph Lameter 	nodemask_t task_nodes;
141639743889SChristoph Lameter 	int err;
1417596d7cfaSKOSAKI Motohiro 	nodemask_t *old;
1418596d7cfaSKOSAKI Motohiro 	nodemask_t *new;
1419596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH(scratch);
142039743889SChristoph Lameter 
1421596d7cfaSKOSAKI Motohiro 	if (!scratch)
1422596d7cfaSKOSAKI Motohiro 		return -ENOMEM;
142339743889SChristoph Lameter 
1424596d7cfaSKOSAKI Motohiro 	old = &scratch->mask1;
1425596d7cfaSKOSAKI Motohiro 	new = &scratch->mask2;
1426596d7cfaSKOSAKI Motohiro 
1427596d7cfaSKOSAKI Motohiro 	err = get_nodes(old, old_nodes, maxnode);
142839743889SChristoph Lameter 	if (err)
1429596d7cfaSKOSAKI Motohiro 		goto out;
1430596d7cfaSKOSAKI Motohiro 
1431596d7cfaSKOSAKI Motohiro 	err = get_nodes(new, new_nodes, maxnode);
1432596d7cfaSKOSAKI Motohiro 	if (err)
1433596d7cfaSKOSAKI Motohiro 		goto out;
143439743889SChristoph Lameter 
143539743889SChristoph Lameter 	/* Find the mm_struct */
143655cfaa3cSZeng Zhaoming 	rcu_read_lock();
1437228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
143839743889SChristoph Lameter 	if (!task) {
143955cfaa3cSZeng Zhaoming 		rcu_read_unlock();
1440596d7cfaSKOSAKI Motohiro 		err = -ESRCH;
1441596d7cfaSKOSAKI Motohiro 		goto out;
144239743889SChristoph Lameter 	}
14433268c63eSChristoph Lameter 	get_task_struct(task);
144439743889SChristoph Lameter 
1445596d7cfaSKOSAKI Motohiro 	err = -EINVAL;
144639743889SChristoph Lameter 
144739743889SChristoph Lameter 	/*
144839743889SChristoph Lameter 	 * Check if this process has the right to modify the specified
144939743889SChristoph Lameter 	 * process. The right exists if the process has administrative
14507f927fccSAlexey Dobriyan 	 * capabilities, superuser privileges or the same
145139743889SChristoph Lameter 	 * userid as the target process.
145239743889SChristoph Lameter 	 */
1453c69e8d9cSDavid Howells 	tcred = __task_cred(task);
1454b38a86ebSEric W. Biederman 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1455b38a86ebSEric W. Biederman 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
145674c00241SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
1457c69e8d9cSDavid Howells 		rcu_read_unlock();
145839743889SChristoph Lameter 		err = -EPERM;
14593268c63eSChristoph Lameter 		goto out_put;
146039743889SChristoph Lameter 	}
1461c69e8d9cSDavid Howells 	rcu_read_unlock();
146239743889SChristoph Lameter 
146339743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
146439743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
1465596d7cfaSKOSAKI Motohiro 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
146639743889SChristoph Lameter 		err = -EPERM;
14673268c63eSChristoph Lameter 		goto out_put;
146839743889SChristoph Lameter 	}
146939743889SChristoph Lameter 
147001f13bd6SLai Jiangshan 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
14713b42d28bSChristoph Lameter 		err = -EINVAL;
14723268c63eSChristoph Lameter 		goto out_put;
14733b42d28bSChristoph Lameter 	}
14743b42d28bSChristoph Lameter 
147586c3a764SDavid Quigley 	err = security_task_movememory(task);
147686c3a764SDavid Quigley 	if (err)
14773268c63eSChristoph Lameter 		goto out_put;
147886c3a764SDavid Quigley 
14793268c63eSChristoph Lameter 	mm = get_task_mm(task);
14803268c63eSChristoph Lameter 	put_task_struct(task);
1481f2a9ef88SSasha Levin 
1482f2a9ef88SSasha Levin 	if (!mm) {
1483f2a9ef88SSasha Levin 		err = -EINVAL;
1484f2a9ef88SSasha Levin 		goto out;
1485f2a9ef88SSasha Levin 	}
1486f2a9ef88SSasha Levin 
1487596d7cfaSKOSAKI Motohiro 	err = do_migrate_pages(mm, old, new,
148874c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
14893268c63eSChristoph Lameter 
149039743889SChristoph Lameter 	mmput(mm);
14913268c63eSChristoph Lameter out:
1492596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH_FREE(scratch);
1493596d7cfaSKOSAKI Motohiro 
149439743889SChristoph Lameter 	return err;
14953268c63eSChristoph Lameter 
14963268c63eSChristoph Lameter out_put:
14973268c63eSChristoph Lameter 	put_task_struct(task);
14983268c63eSChristoph Lameter 	goto out;
14993268c63eSChristoph Lameter 
150039743889SChristoph Lameter }
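/*
 * Illustrative sketch (not part of this file): moving a target process's
 * pages from node 0 to node 1 via the migrate_pages(2) wrapper declared in
 * libnuma's <numaif.h>.  The target pid is assumed to be known; error
 * handling is minimal.
 *
 *	#include <numaif.h>
 *
 *	unsigned long old_nodes = 1UL << 0;	// move pages off node 0
 *	unsigned long new_nodes = 1UL << 1;	// ... onto node 1
 *	long left = migrate_pages(pid, sizeof(old_nodes) * 8 + 1,
 *				  &old_nodes, &new_nodes);
 *	// left < 0 is an error; left > 0 counts pages that were not moved
 */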
150139743889SChristoph Lameter 
150239743889SChristoph Lameter 
15038bccd85fSChristoph Lameter /* Retrieve NUMA policy */
1504938bb9f5SHeiko Carstens SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1505938bb9f5SHeiko Carstens 		unsigned long __user *, nmask, unsigned long, maxnode,
1506938bb9f5SHeiko Carstens 		unsigned long, addr, unsigned long, flags)
15078bccd85fSChristoph Lameter {
1508dbcb0f19SAdrian Bunk 	int err;
1509dbcb0f19SAdrian Bunk 	int uninitialized_var(pval);
15108bccd85fSChristoph Lameter 	nodemask_t nodes;
15118bccd85fSChristoph Lameter 
15128bccd85fSChristoph Lameter 	if (nmask != NULL && maxnode < MAX_NUMNODES)
15138bccd85fSChristoph Lameter 		return -EINVAL;
15148bccd85fSChristoph Lameter 
15158bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
15168bccd85fSChristoph Lameter 
15178bccd85fSChristoph Lameter 	if (err)
15188bccd85fSChristoph Lameter 		return err;
15198bccd85fSChristoph Lameter 
15208bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
15218bccd85fSChristoph Lameter 		return -EFAULT;
15228bccd85fSChristoph Lameter 
15238bccd85fSChristoph Lameter 	if (nmask)
15248bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
15258bccd85fSChristoph Lameter 
15268bccd85fSChristoph Lameter 	return err;
15278bccd85fSChristoph Lameter }
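/*
 * Illustrative sketch (not part of this file): asking which node currently
 * backs a mapped address with get_mempolicy(2) from libnuma's <numaif.h>.
 * 'addr' is assumed to point into a populated mapping.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */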
15288bccd85fSChristoph Lameter 
15291da177e4SLinus Torvalds #ifdef CONFIG_COMPAT
15301da177e4SLinus Torvalds 
1531c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1532c93e0f6cSHeiko Carstens 		       compat_ulong_t __user *, nmask,
1533c93e0f6cSHeiko Carstens 		       compat_ulong_t, maxnode,
1534c93e0f6cSHeiko Carstens 		       compat_ulong_t, addr, compat_ulong_t, flags)
15351da177e4SLinus Torvalds {
15361da177e4SLinus Torvalds 	long err;
15371da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
15381da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
15391da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
15401da177e4SLinus Torvalds 
15411da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
15421da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
15431da177e4SLinus Torvalds 
15441da177e4SLinus Torvalds 	if (nmask)
15451da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
15461da177e4SLinus Torvalds 
15471da177e4SLinus Torvalds 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
15481da177e4SLinus Torvalds 
15491da177e4SLinus Torvalds 	if (!err && nmask) {
15502bbff6c7SKAMEZAWA Hiroyuki 		unsigned long copy_size;
15512bbff6c7SKAMEZAWA Hiroyuki 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
15522bbff6c7SKAMEZAWA Hiroyuki 		err = copy_from_user(bm, nm, copy_size);
15531da177e4SLinus Torvalds 		/* ensure entire bitmap is zeroed */
15541da177e4SLinus Torvalds 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
15551da177e4SLinus Torvalds 		err |= compat_put_bitmap(nmask, bm, nr_bits);
15561da177e4SLinus Torvalds 	}
15571da177e4SLinus Torvalds 
15581da177e4SLinus Torvalds 	return err;
15591da177e4SLinus Torvalds }
15601da177e4SLinus Torvalds 
1561c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1562c93e0f6cSHeiko Carstens 		       compat_ulong_t, maxnode)
15631da177e4SLinus Torvalds {
15641da177e4SLinus Torvalds 	long err = 0;
15651da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
15661da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
15671da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
15681da177e4SLinus Torvalds 
15691da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
15701da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
15711da177e4SLinus Torvalds 
15721da177e4SLinus Torvalds 	if (nmask) {
15731da177e4SLinus Torvalds 		err = compat_get_bitmap(bm, nmask, nr_bits);
15741da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
15751da177e4SLinus Torvalds 		err |= copy_to_user(nm, bm, alloc_size);
15761da177e4SLinus Torvalds 	}
15771da177e4SLinus Torvalds 
15781da177e4SLinus Torvalds 	if (err)
15791da177e4SLinus Torvalds 		return -EFAULT;
15801da177e4SLinus Torvalds 
15811da177e4SLinus Torvalds 	return sys_set_mempolicy(mode, nm, nr_bits+1);
15821da177e4SLinus Torvalds }
15831da177e4SLinus Torvalds 
1584c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1585c93e0f6cSHeiko Carstens 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1586c93e0f6cSHeiko Carstens 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
15871da177e4SLinus Torvalds {
15881da177e4SLinus Torvalds 	long err = 0;
15891da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
15901da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
1591dfcd3c0dSAndi Kleen 	nodemask_t bm;
15921da177e4SLinus Torvalds 
15931da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
15941da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
15951da177e4SLinus Torvalds 
15961da177e4SLinus Torvalds 	if (nmask) {
1597dfcd3c0dSAndi Kleen 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
15981da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
1599dfcd3c0dSAndi Kleen 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
16001da177e4SLinus Torvalds 	}
16011da177e4SLinus Torvalds 
16021da177e4SLinus Torvalds 	if (err)
16031da177e4SLinus Torvalds 		return -EFAULT;
16041da177e4SLinus Torvalds 
16051da177e4SLinus Torvalds 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
16061da177e4SLinus Torvalds }
16071da177e4SLinus Torvalds 
16081da177e4SLinus Torvalds #endif
16091da177e4SLinus Torvalds 
1610480eccf9SLee Schermerhorn /*
1611480eccf9SLee Schermerhorn  * get_vma_policy(@task, @vma, @addr)
1612b46e14acSFabian Frederick  * @task: task for fallback if vma policy == default
1613b46e14acSFabian Frederick  * @vma: virtual memory area whose policy is sought
1614b46e14acSFabian Frederick  * @addr: address in @vma for shared policy lookup
1615480eccf9SLee Schermerhorn  *
1616480eccf9SLee Schermerhorn  * Returns effective policy for a VMA at specified address.
1617480eccf9SLee Schermerhorn  * Falls back to @task or system default policy, as necessary.
161832f8516aSDavid Rientjes  * Current or other task's task mempolicy and non-shared vma policies must be
161932f8516aSDavid Rientjes  * protected by task_lock(task) by the caller.
162052cd3b07SLee Schermerhorn  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
162152cd3b07SLee Schermerhorn  * count--added by the get_policy() vm_op, as appropriate--to protect against
162252cd3b07SLee Schermerhorn  * freeing by another task.  It is the caller's responsibility to free the
162352cd3b07SLee Schermerhorn  * extra reference for shared policies.
1624480eccf9SLee Schermerhorn  */
1625d98f6cb6SStephen Wilson struct mempolicy *get_vma_policy(struct task_struct *task,
162648fce342SChristoph Lameter 		struct vm_area_struct *vma, unsigned long addr)
16271da177e4SLinus Torvalds {
16285606e387SMel Gorman 	struct mempolicy *pol = get_task_policy(task);
16291da177e4SLinus Torvalds 
16301da177e4SLinus Torvalds 	if (vma) {
1631480eccf9SLee Schermerhorn 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1632ae4d8c16SLee Schermerhorn 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1633ae4d8c16SLee Schermerhorn 									addr);
1634ae4d8c16SLee Schermerhorn 			if (vpol)
1635ae4d8c16SLee Schermerhorn 				pol = vpol;
163600442ad0SMel Gorman 		} else if (vma->vm_policy) {
16371da177e4SLinus Torvalds 			pol = vma->vm_policy;
163800442ad0SMel Gorman 
163900442ad0SMel Gorman 			/*
164000442ad0SMel Gorman 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
164100442ad0SMel Gorman 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
164200442ad0SMel Gorman 			 * count on these policies which will be dropped by
164300442ad0SMel Gorman 			 * mpol_cond_put() later
164400442ad0SMel Gorman 			 */
164500442ad0SMel Gorman 			if (mpol_needs_cond_ref(pol))
164600442ad0SMel Gorman 				mpol_get(pol);
164700442ad0SMel Gorman 		}
16481da177e4SLinus Torvalds 	}
16491da177e4SLinus Torvalds 	if (!pol)
16501da177e4SLinus Torvalds 		pol = &default_policy;
16511da177e4SLinus Torvalds 	return pol;
16521da177e4SLinus Torvalds }
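/*
 * Illustrative sketch (not part of this file): the reference-count contract
 * described above is typically paired with mpol_cond_put(), which drops the
 * extra reference only for MPOL_F_SHARED policies, e.g.:
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *	// ... derive a zonelist/nodemask from pol ...
 *	mpol_cond_put(pol);
 */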
16531da177e4SLinus Torvalds 
1654fc314724SMel Gorman bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1655fc314724SMel Gorman {
1656fc314724SMel Gorman 	struct mempolicy *pol = get_task_policy(task);
1657fc314724SMel Gorman 	if (vma) {
1658fc314724SMel Gorman 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1659fc314724SMel Gorman 			bool ret = false;
1660fc314724SMel Gorman 
1661fc314724SMel Gorman 			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1662fc314724SMel Gorman 			if (pol && (pol->flags & MPOL_F_MOF))
1663fc314724SMel Gorman 				ret = true;
1664fc314724SMel Gorman 			mpol_cond_put(pol);
1665fc314724SMel Gorman 
1666fc314724SMel Gorman 			return ret;
1667fc314724SMel Gorman 		} else if (vma->vm_policy) {
1668fc314724SMel Gorman 			pol = vma->vm_policy;
1669fc314724SMel Gorman 		}
1670fc314724SMel Gorman 	}
1671fc314724SMel Gorman 
1672fc314724SMel Gorman 	if (!pol)
1673fc314724SMel Gorman 		return default_policy.flags & MPOL_F_MOF;
1674fc314724SMel Gorman 
1675fc314724SMel Gorman 	return pol->flags & MPOL_F_MOF;
1676fc314724SMel Gorman }
1677fc314724SMel Gorman 
1678d3eb1570SLai Jiangshan static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1679d3eb1570SLai Jiangshan {
1680d3eb1570SLai Jiangshan 	enum zone_type dynamic_policy_zone = policy_zone;
1681d3eb1570SLai Jiangshan 
1682d3eb1570SLai Jiangshan 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1683d3eb1570SLai Jiangshan 
1684d3eb1570SLai Jiangshan 	/*
1685d3eb1570SLai Jiangshan 	 * if policy->v.nodes has movable memory only,
1686d3eb1570SLai Jiangshan 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1687d3eb1570SLai Jiangshan 	 *
1688d3eb1570SLai Jiangshan 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1689d3eb1570SLai Jiangshan 	 * so if the following test fails, it implies
1690d3eb1570SLai Jiangshan 	 * policy->v.nodes has movable memory only.
1691d3eb1570SLai Jiangshan 	 */
1692d3eb1570SLai Jiangshan 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1693d3eb1570SLai Jiangshan 		dynamic_policy_zone = ZONE_MOVABLE;
1694d3eb1570SLai Jiangshan 
1695d3eb1570SLai Jiangshan 	return zone >= dynamic_policy_zone;
1696d3eb1570SLai Jiangshan }
1697d3eb1570SLai Jiangshan 
169852cd3b07SLee Schermerhorn /*
169952cd3b07SLee Schermerhorn  * Return a nodemask representing a mempolicy for filtering nodes for
170052cd3b07SLee Schermerhorn  * page allocation
170152cd3b07SLee Schermerhorn  */
170252cd3b07SLee Schermerhorn static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
170319770b32SMel Gorman {
170419770b32SMel Gorman 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
170545c4745aSLee Schermerhorn 	if (unlikely(policy->mode == MPOL_BIND) &&
1706d3eb1570SLai Jiangshan 			apply_policy_zone(policy, gfp_zone(gfp)) &&
170719770b32SMel Gorman 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
170819770b32SMel Gorman 		return &policy->v.nodes;
170919770b32SMel Gorman 
171019770b32SMel Gorman 	return NULL;
171119770b32SMel Gorman }
171219770b32SMel Gorman 
171352cd3b07SLee Schermerhorn /* Return a zonelist indicated by gfp for node representing a mempolicy */
17142f5f9486SAndi Kleen static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
17152f5f9486SAndi Kleen 	int nd)
17161da177e4SLinus Torvalds {
171745c4745aSLee Schermerhorn 	switch (policy->mode) {
17181da177e4SLinus Torvalds 	case MPOL_PREFERRED:
1719fc36b8d3SLee Schermerhorn 		if (!(policy->flags & MPOL_F_LOCAL))
17201da177e4SLinus Torvalds 			nd = policy->v.preferred_node;
17211da177e4SLinus Torvalds 		break;
17221da177e4SLinus Torvalds 	case MPOL_BIND:
172319770b32SMel Gorman 		/*
172452cd3b07SLee Schermerhorn 		 * Normally, MPOL_BIND allocations are node-local within the
172552cd3b07SLee Schermerhorn 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
17266eb27e1fSBob Liu 		 * current node isn't part of the mask, we use the zonelist for
172752cd3b07SLee Schermerhorn 		 * the first node in the mask instead.
172819770b32SMel Gorman 		 */
172919770b32SMel Gorman 		if (unlikely(gfp & __GFP_THISNODE) &&
173019770b32SMel Gorman 				unlikely(!node_isset(nd, policy->v.nodes)))
173119770b32SMel Gorman 			nd = first_node(policy->v.nodes);
173219770b32SMel Gorman 		break;
17331da177e4SLinus Torvalds 	default:
17341da177e4SLinus Torvalds 		BUG();
17351da177e4SLinus Torvalds 	}
17360e88460dSMel Gorman 	return node_zonelist(nd, gfp);
17371da177e4SLinus Torvalds }
17381da177e4SLinus Torvalds 
17391da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
17401da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
17411da177e4SLinus Torvalds {
17421da177e4SLinus Torvalds 	unsigned nid, next;
17431da177e4SLinus Torvalds 	struct task_struct *me = current;
17441da177e4SLinus Torvalds 
17451da177e4SLinus Torvalds 	nid = me->il_next;
1746dfcd3c0dSAndi Kleen 	next = next_node(nid, policy->v.nodes);
17471da177e4SLinus Torvalds 	if (next >= MAX_NUMNODES)
1748dfcd3c0dSAndi Kleen 		next = first_node(policy->v.nodes);
1749f5b087b5SDavid Rientjes 	if (next < MAX_NUMNODES)
17501da177e4SLinus Torvalds 		me->il_next = next;
17511da177e4SLinus Torvalds 	return nid;
17521da177e4SLinus Torvalds }
17531da177e4SLinus Torvalds 
1754dc85da15SChristoph Lameter /*
1755dc85da15SChristoph Lameter  * Depending on the memory policy provide a node from which to allocate the
1756dc85da15SChristoph Lameter  * next slab entry.
1757dc85da15SChristoph Lameter  */
17582a389610SDavid Rientjes unsigned int mempolicy_slab_node(void)
1759dc85da15SChristoph Lameter {
1760e7b691b0SAndi Kleen 	struct mempolicy *policy;
17612a389610SDavid Rientjes 	int node = numa_mem_id();
1762e7b691b0SAndi Kleen 
1763e7b691b0SAndi Kleen 	if (in_interrupt())
17642a389610SDavid Rientjes 		return node;
1765e7b691b0SAndi Kleen 
1766e7b691b0SAndi Kleen 	policy = current->mempolicy;
1767fc36b8d3SLee Schermerhorn 	if (!policy || policy->flags & MPOL_F_LOCAL)
17682a389610SDavid Rientjes 		return node;
1769765c4507SChristoph Lameter 
1770bea904d5SLee Schermerhorn 	switch (policy->mode) {
1771bea904d5SLee Schermerhorn 	case MPOL_PREFERRED:
1772fc36b8d3SLee Schermerhorn 		/*
1773fc36b8d3SLee Schermerhorn 		 * handled MPOL_F_LOCAL above
1774fc36b8d3SLee Schermerhorn 		 */
1775bea904d5SLee Schermerhorn 		return policy->v.preferred_node;
1776bea904d5SLee Schermerhorn 
1777dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1778dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1779dc85da15SChristoph Lameter 
1780dd1a239fSMel Gorman 	case MPOL_BIND: {
1781dc85da15SChristoph Lameter 		/*
1782dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1783dc85da15SChristoph Lameter 		 * first node.
1784dc85da15SChristoph Lameter 		 */
178519770b32SMel Gorman 		struct zonelist *zonelist;
178619770b32SMel Gorman 		struct zone *zone;
178719770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
17882a389610SDavid Rientjes 		zonelist = &NODE_DATA(node)->node_zonelists[0];
178919770b32SMel Gorman 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
179019770b32SMel Gorman 							&policy->v.nodes,
179119770b32SMel Gorman 							&zone);
17922a389610SDavid Rientjes 		return zone ? zone->node : node;
1793dd1a239fSMel Gorman 	}
1794dc85da15SChristoph Lameter 
1795dc85da15SChristoph Lameter 	default:
1796bea904d5SLee Schermerhorn 		BUG();
1797dc85da15SChristoph Lameter 	}
1798dc85da15SChristoph Lameter }
1799dc85da15SChristoph Lameter 
18001da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. */
18011da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol,
18021da177e4SLinus Torvalds 		struct vm_area_struct *vma, unsigned long off)
18031da177e4SLinus Torvalds {
1804dfcd3c0dSAndi Kleen 	unsigned nnodes = nodes_weight(pol->v.nodes);
1805f5b087b5SDavid Rientjes 	unsigned target;
18061da177e4SLinus Torvalds 	int c;
1807b76ac7e7SJianguo Wu 	int nid = NUMA_NO_NODE;
18081da177e4SLinus Torvalds 
1809f5b087b5SDavid Rientjes 	if (!nnodes)
1810f5b087b5SDavid Rientjes 		return numa_node_id();
1811f5b087b5SDavid Rientjes 	target = (unsigned int)off % nnodes;
18121da177e4SLinus Torvalds 	c = 0;
18131da177e4SLinus Torvalds 	do {
1814dfcd3c0dSAndi Kleen 		nid = next_node(nid, pol->v.nodes);
18151da177e4SLinus Torvalds 		c++;
18161da177e4SLinus Torvalds 	} while (c <= target);
18171da177e4SLinus Torvalds 	return nid;
18181da177e4SLinus Torvalds }
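/*
 * Illustrative worked example (not part of this file): with pol->v.nodes =
 * {0,2,5} (nnodes = 3) and off = 7, target = 7 % 3 = 1, so the do/while
 * loop above steps to the second set node and returns nid = 2.  A given
 * offset always maps to the same node, which is what makes this interleave
 * "static".
 */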
18191da177e4SLinus Torvalds 
18205da7ca86SChristoph Lameter /* Determine a node number for interleave */
18215da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
18225da7ca86SChristoph Lameter 		 struct vm_area_struct *vma, unsigned long addr, int shift)
18235da7ca86SChristoph Lameter {
18245da7ca86SChristoph Lameter 	if (vma) {
18255da7ca86SChristoph Lameter 		unsigned long off;
18265da7ca86SChristoph Lameter 
18273b98b087SNishanth Aravamudan 		/*
18283b98b087SNishanth Aravamudan 		 * for small pages, there is no difference between
18293b98b087SNishanth Aravamudan 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
18303b98b087SNishanth Aravamudan 		 * for huge pages, since vm_pgoff is in units of small
18313b98b087SNishanth Aravamudan 		 * pages, we need to shift off the always 0 bits to get
18323b98b087SNishanth Aravamudan 		 * a useful offset.
18333b98b087SNishanth Aravamudan 		 */
18343b98b087SNishanth Aravamudan 		BUG_ON(shift < PAGE_SHIFT);
18353b98b087SNishanth Aravamudan 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
18365da7ca86SChristoph Lameter 		off += (addr - vma->vm_start) >> shift;
18375da7ca86SChristoph Lameter 		return offset_il_node(pol, vma, off);
18385da7ca86SChristoph Lameter 	} else
18395da7ca86SChristoph Lameter 		return interleave_nodes(pol);
18405da7ca86SChristoph Lameter }
18415da7ca86SChristoph Lameter 
1842778d3b0fSMichal Hocko /*
1843778d3b0fSMichal Hocko  * Return the bit number of a random bit set in the nodemask.
1844b76ac7e7SJianguo Wu  * (returns NUMA_NO_NODE if nodemask is empty)
1845778d3b0fSMichal Hocko  */
1846778d3b0fSMichal Hocko int node_random(const nodemask_t *maskp)
1847778d3b0fSMichal Hocko {
1848b76ac7e7SJianguo Wu 	int w, bit = NUMA_NO_NODE;
1849778d3b0fSMichal Hocko 
1850778d3b0fSMichal Hocko 	w = nodes_weight(*maskp);
1851778d3b0fSMichal Hocko 	if (w)
1852778d3b0fSMichal Hocko 		bit = bitmap_ord_to_pos(maskp->bits,
1853778d3b0fSMichal Hocko 			get_random_int() % w, MAX_NUMNODES);
1854778d3b0fSMichal Hocko 	return bit;
1855778d3b0fSMichal Hocko }
1856778d3b0fSMichal Hocko 
185700ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
1858480eccf9SLee Schermerhorn /*
1859480eccf9SLee Schermerhorn  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1860b46e14acSFabian Frederick  * @vma: virtual memory area whose policy is sought
1861b46e14acSFabian Frederick  * @addr: address in @vma for shared policy lookup and interleave policy
1862b46e14acSFabian Frederick  * @gfp_flags: for requested zone
1863b46e14acSFabian Frederick  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1864b46e14acSFabian Frederick  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1865480eccf9SLee Schermerhorn  *
186652cd3b07SLee Schermerhorn  * Returns a zonelist suitable for a huge page allocation and a pointer
186752cd3b07SLee Schermerhorn  * to the struct mempolicy for conditional unref after allocation.
186852cd3b07SLee Schermerhorn  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
186952cd3b07SLee Schermerhorn  * @nodemask for filtering the zonelist.
1870c0ff7453SMiao Xie  *
1871d26914d1SMel Gorman  * Must be protected by read_mems_allowed_begin()
1872480eccf9SLee Schermerhorn  */
1873396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
187419770b32SMel Gorman 				gfp_t gfp_flags, struct mempolicy **mpol,
187519770b32SMel Gorman 				nodemask_t **nodemask)
18765da7ca86SChristoph Lameter {
1877480eccf9SLee Schermerhorn 	struct zonelist *zl;
18785da7ca86SChristoph Lameter 
187952cd3b07SLee Schermerhorn 	*mpol = get_vma_policy(current, vma, addr);
188019770b32SMel Gorman 	*nodemask = NULL;	/* assume !MPOL_BIND */
18815da7ca86SChristoph Lameter 
188252cd3b07SLee Schermerhorn 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
188352cd3b07SLee Schermerhorn 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1884a5516438SAndi Kleen 				huge_page_shift(hstate_vma(vma))), gfp_flags);
188552cd3b07SLee Schermerhorn 	} else {
18862f5f9486SAndi Kleen 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
188752cd3b07SLee Schermerhorn 		if ((*mpol)->mode == MPOL_BIND)
188852cd3b07SLee Schermerhorn 			*nodemask = &(*mpol)->v.nodes;
1889480eccf9SLee Schermerhorn 	}
1890480eccf9SLee Schermerhorn 	return zl;
18915da7ca86SChristoph Lameter }
189206808b08SLee Schermerhorn 
189306808b08SLee Schermerhorn /*
189406808b08SLee Schermerhorn  * init_nodemask_of_mempolicy
189506808b08SLee Schermerhorn  *
189606808b08SLee Schermerhorn  * If the current task's mempolicy is "default" [NULL], return 'false'
189706808b08SLee Schermerhorn  * to indicate default policy.  Otherwise, extract the policy nodemask
189806808b08SLee Schermerhorn  * for 'bind' or 'interleave' policy into the argument nodemask, or
189906808b08SLee Schermerhorn  * initialize the argument nodemask to contain the single node for
190006808b08SLee Schermerhorn  * 'preferred' or 'local' policy and return 'true' to indicate presence
190106808b08SLee Schermerhorn  * of non-default mempolicy.
190206808b08SLee Schermerhorn  *
190306808b08SLee Schermerhorn  * We don't bother with reference counting the mempolicy [mpol_get/put]
190406808b08SLee Schermerhorn  * because the current task is examining its own mempolicy and a task's
190506808b08SLee Schermerhorn  * mempolicy is only ever changed by the task itself.
190606808b08SLee Schermerhorn  *
190706808b08SLee Schermerhorn  * N.B., it is the caller's responsibility to free a returned nodemask.
190806808b08SLee Schermerhorn  */
190906808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask)
191006808b08SLee Schermerhorn {
191106808b08SLee Schermerhorn 	struct mempolicy *mempolicy;
191206808b08SLee Schermerhorn 	int nid;
191306808b08SLee Schermerhorn 
191406808b08SLee Schermerhorn 	if (!(mask && current->mempolicy))
191506808b08SLee Schermerhorn 		return false;
191606808b08SLee Schermerhorn 
1917c0ff7453SMiao Xie 	task_lock(current);
191806808b08SLee Schermerhorn 	mempolicy = current->mempolicy;
191906808b08SLee Schermerhorn 	switch (mempolicy->mode) {
192006808b08SLee Schermerhorn 	case MPOL_PREFERRED:
192106808b08SLee Schermerhorn 		if (mempolicy->flags & MPOL_F_LOCAL)
192206808b08SLee Schermerhorn 			nid = numa_node_id();
192306808b08SLee Schermerhorn 		else
192406808b08SLee Schermerhorn 			nid = mempolicy->v.preferred_node;
192506808b08SLee Schermerhorn 		init_nodemask_of_node(mask, nid);
192606808b08SLee Schermerhorn 		break;
192706808b08SLee Schermerhorn 
192806808b08SLee Schermerhorn 	case MPOL_BIND:
192906808b08SLee Schermerhorn 		/* Fall through */
193006808b08SLee Schermerhorn 	case MPOL_INTERLEAVE:
193106808b08SLee Schermerhorn 		*mask =  mempolicy->v.nodes;
193206808b08SLee Schermerhorn 		break;
193306808b08SLee Schermerhorn 
193406808b08SLee Schermerhorn 	default:
193506808b08SLee Schermerhorn 		BUG();
193606808b08SLee Schermerhorn 	}
1937c0ff7453SMiao Xie 	task_unlock(current);
193806808b08SLee Schermerhorn 
193906808b08SLee Schermerhorn 	return true;
194006808b08SLee Schermerhorn }
194100ac59adSChen, Kenneth W #endif
19425da7ca86SChristoph Lameter 
19436f48d0ebSDavid Rientjes /*
19446f48d0ebSDavid Rientjes  * mempolicy_nodemask_intersects
19456f48d0ebSDavid Rientjes  *
19466f48d0ebSDavid Rientjes  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
19476f48d0ebSDavid Rientjes  * policy.  Otherwise, check for intersection between mask and the policy
19486f48d0ebSDavid Rientjes  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
19496f48d0ebSDavid Rientjes  * policy, always return true since it may allocate elsewhere on fallback.
19506f48d0ebSDavid Rientjes  *
19516f48d0ebSDavid Rientjes  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
19526f48d0ebSDavid Rientjes  */
19536f48d0ebSDavid Rientjes bool mempolicy_nodemask_intersects(struct task_struct *tsk,
19546f48d0ebSDavid Rientjes 					const nodemask_t *mask)
19556f48d0ebSDavid Rientjes {
19566f48d0ebSDavid Rientjes 	struct mempolicy *mempolicy;
19576f48d0ebSDavid Rientjes 	bool ret = true;
19586f48d0ebSDavid Rientjes 
19596f48d0ebSDavid Rientjes 	if (!mask)
19606f48d0ebSDavid Rientjes 		return ret;
19616f48d0ebSDavid Rientjes 	task_lock(tsk);
19626f48d0ebSDavid Rientjes 	mempolicy = tsk->mempolicy;
19636f48d0ebSDavid Rientjes 	if (!mempolicy)
19646f48d0ebSDavid Rientjes 		goto out;
19656f48d0ebSDavid Rientjes 
19666f48d0ebSDavid Rientjes 	switch (mempolicy->mode) {
19676f48d0ebSDavid Rientjes 	case MPOL_PREFERRED:
19686f48d0ebSDavid Rientjes 		/*
19696f48d0ebSDavid Rientjes 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
19706f48d0ebSDavid Rientjes 		 * allocate from; they may fall back to other nodes when OOM.
19716f48d0ebSDavid Rientjes 		 * Thus, it's possible for tsk to have allocated memory from
19726f48d0ebSDavid Rientjes 		 * nodes in mask.
19736f48d0ebSDavid Rientjes 		 */
19746f48d0ebSDavid Rientjes 		break;
19756f48d0ebSDavid Rientjes 	case MPOL_BIND:
19766f48d0ebSDavid Rientjes 	case MPOL_INTERLEAVE:
19776f48d0ebSDavid Rientjes 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
19786f48d0ebSDavid Rientjes 		break;
19796f48d0ebSDavid Rientjes 	default:
19806f48d0ebSDavid Rientjes 		BUG();
19816f48d0ebSDavid Rientjes 	}
19826f48d0ebSDavid Rientjes out:
19836f48d0ebSDavid Rientjes 	task_unlock(tsk);
19846f48d0ebSDavid Rientjes 	return ret;
19856f48d0ebSDavid Rientjes }
19866f48d0ebSDavid Rientjes 
19871da177e4SLinus Torvalds /* Allocate a page in interleaved policy.
19881da177e4SLinus Torvalds    Own path because it needs to do special accounting. */
1989662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1990662f3a0bSAndi Kleen 					unsigned nid)
19911da177e4SLinus Torvalds {
19921da177e4SLinus Torvalds 	struct zonelist *zl;
19931da177e4SLinus Torvalds 	struct page *page;
19941da177e4SLinus Torvalds 
19950e88460dSMel Gorman 	zl = node_zonelist(nid, gfp);
19961da177e4SLinus Torvalds 	page = __alloc_pages(gfp, order, zl);
1997dd1a239fSMel Gorman 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1998ca889e6cSChristoph Lameter 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
19991da177e4SLinus Torvalds 	return page;
20001da177e4SLinus Torvalds }
20011da177e4SLinus Torvalds 
20021da177e4SLinus Torvalds /**
20030bbbc0b3SAndrea Arcangeli  * 	alloc_pages_vma	- Allocate a page for a VMA.
20041da177e4SLinus Torvalds  *
20051da177e4SLinus Torvalds  * 	@gfp:
20061da177e4SLinus Torvalds  *      %GFP_USER    user allocation.
20071da177e4SLinus Torvalds  *      %GFP_KERNEL  kernel allocations,
20081da177e4SLinus Torvalds  *      %GFP_HIGHMEM highmem/user allocations,
20091da177e4SLinus Torvalds  *      %GFP_FS      allocation should not call back into a file system.
20101da177e4SLinus Torvalds  *      %GFP_ATOMIC  don't sleep.
20111da177e4SLinus Torvalds  *
20120bbbc0b3SAndrea Arcangeli  *	@order: Order of the GFP allocation.
20131da177e4SLinus Torvalds  * 	@vma:  Pointer to VMA or NULL if not available.
20141da177e4SLinus Torvalds  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
20151da177e4SLinus Torvalds  *
20161da177e4SLinus Torvalds  * 	This function allocates a page from the kernel page pool and applies
20171da177e4SLinus Torvalds  *	a NUMA policy associated with the VMA or the current process.
20181da177e4SLinus Torvalds  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
20191da177e4SLinus Torvalds  *	mm_struct of the VMA to prevent it from going away. Should be used for
20201da177e4SLinus Torvalds  *	all allocations for pages that will be mapped into
20211da177e4SLinus Torvalds  * 	user space. Returns NULL when no page can be allocated.
20221da177e4SLinus Torvalds  *
20231da177e4SLinus Torvalds  *	Should be called with the mmap_sem of the vma held.
20241da177e4SLinus Torvalds  */
20251da177e4SLinus Torvalds struct page *
20260bbbc0b3SAndrea Arcangeli alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
20272f5f9486SAndi Kleen 		unsigned long addr, int node)
20281da177e4SLinus Torvalds {
2029cc9a6c87SMel Gorman 	struct mempolicy *pol;
2030c0ff7453SMiao Xie 	struct page *page;
2031cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
20321da177e4SLinus Torvalds 
2033cc9a6c87SMel Gorman retry_cpuset:
2034cc9a6c87SMel Gorman 	pol = get_vma_policy(current, vma, addr);
2035d26914d1SMel Gorman 	cpuset_mems_cookie = read_mems_allowed_begin();
2036cc9a6c87SMel Gorman 
203745c4745aSLee Schermerhorn 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
20381da177e4SLinus Torvalds 		unsigned nid;
20395da7ca86SChristoph Lameter 
20408eac563cSAndi Kleen 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
204152cd3b07SLee Schermerhorn 		mpol_cond_put(pol);
20420bbbc0b3SAndrea Arcangeli 		page = alloc_page_interleave(gfp, order, nid);
2043d26914d1SMel Gorman 		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2044cc9a6c87SMel Gorman 			goto retry_cpuset;
2045cc9a6c87SMel Gorman 
2046c0ff7453SMiao Xie 		return page;
20471da177e4SLinus Torvalds 	}
2048212a0a6fSDavid Rientjes 	page = __alloc_pages_nodemask(gfp, order,
2049212a0a6fSDavid Rientjes 				      policy_zonelist(gfp, pol, node),
20500bbbc0b3SAndrea Arcangeli 				      policy_nodemask(gfp, pol));
2051212a0a6fSDavid Rientjes 	if (unlikely(mpol_needs_cond_ref(pol)))
2052212a0a6fSDavid Rientjes 		__mpol_put(pol);
2053d26914d1SMel Gorman 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2054cc9a6c87SMel Gorman 		goto retry_cpuset;
2055c0ff7453SMiao Xie 	return page;
20561da177e4SLinus Torvalds }
20571da177e4SLinus Torvalds 
20581da177e4SLinus Torvalds /**
20591da177e4SLinus Torvalds  * 	alloc_pages_current - Allocate pages.
20601da177e4SLinus Torvalds  *
20611da177e4SLinus Torvalds  *	@gfp:
20621da177e4SLinus Torvalds  *		%GFP_USER   user allocation,
20631da177e4SLinus Torvalds  *      	%GFP_KERNEL kernel allocation,
20641da177e4SLinus Torvalds  *      	%GFP_HIGHMEM highmem allocation,
20651da177e4SLinus Torvalds  *      	%GFP_FS     don't call back into a file system.
20661da177e4SLinus Torvalds  *      	%GFP_ATOMIC don't sleep.
20671da177e4SLinus Torvalds  *	@order: Power of two of allocation size in pages. 0 is a single page.
20681da177e4SLinus Torvalds  *
20691da177e4SLinus Torvalds  *	Allocate a page from the kernel page pool.  When not in
20701da177e4SLinus Torvalds  *	interrupt context, apply the current process NUMA policy.
20711da177e4SLinus Torvalds  *	Returns NULL when no page can be allocated.
20721da177e4SLinus Torvalds  *
2073cf2a473cSPaul Jackson  *	Don't call cpuset_update_task_memory_state() unless
20741da177e4SLinus Torvalds  *	1) it's ok to take cpuset_sem (can WAIT), and
20751da177e4SLinus Torvalds  *	2) allocating for current task (not interrupt).
20761da177e4SLinus Torvalds  */
2077dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order)
20781da177e4SLinus Torvalds {
20795606e387SMel Gorman 	struct mempolicy *pol = get_task_policy(current);
2080c0ff7453SMiao Xie 	struct page *page;
2081cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
20821da177e4SLinus Torvalds 
20839b819d20SChristoph Lameter 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
20841da177e4SLinus Torvalds 		pol = &default_policy;
208552cd3b07SLee Schermerhorn 
2086cc9a6c87SMel Gorman retry_cpuset:
2087d26914d1SMel Gorman 	cpuset_mems_cookie = read_mems_allowed_begin();
2088cc9a6c87SMel Gorman 
208952cd3b07SLee Schermerhorn 	/*
209052cd3b07SLee Schermerhorn 	 * No reference counting needed for current->mempolicy
209152cd3b07SLee Schermerhorn 	 * nor system default_policy
209252cd3b07SLee Schermerhorn 	 */
209345c4745aSLee Schermerhorn 	if (pol->mode == MPOL_INTERLEAVE)
2094c0ff7453SMiao Xie 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2095c0ff7453SMiao Xie 	else
2096c0ff7453SMiao Xie 		page = __alloc_pages_nodemask(gfp, order,
20975c4b4be3SAndi Kleen 				policy_zonelist(gfp, pol, numa_node_id()),
20985c4b4be3SAndi Kleen 				policy_nodemask(gfp, pol));
2099cc9a6c87SMel Gorman 
2100d26914d1SMel Gorman 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2101cc9a6c87SMel Gorman 		goto retry_cpuset;
2102cc9a6c87SMel Gorman 
2103c0ff7453SMiao Xie 	return page;
21041da177e4SLinus Torvalds }
21051da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current);
21061da177e4SLinus Torvalds 
2107ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2108ef0855d3SOleg Nesterov {
2109ef0855d3SOleg Nesterov 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2110ef0855d3SOleg Nesterov 
2111ef0855d3SOleg Nesterov 	if (IS_ERR(pol))
2112ef0855d3SOleg Nesterov 		return PTR_ERR(pol);
2113ef0855d3SOleg Nesterov 	dst->vm_policy = pol;
2114ef0855d3SOleg Nesterov 	return 0;
2115ef0855d3SOleg Nesterov }
2116ef0855d3SOleg Nesterov 
21174225399aSPaul Jackson /*
2118846a16bfSLee Schermerhorn  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
21194225399aSPaul Jackson  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
21204225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
21214225399aSPaul Jackson  * keeps mempolicies cpuset-relative after its cpuset moves.  See also
21224225399aSPaul Jackson  * update_nodemask() in kernel/cpuset.c.
2123708c1bbcSMiao Xie  *
2124708c1bbcSMiao Xie  * current's mempolicy may be rebound by another task (the one that changes the
2125708c1bbcSMiao Xie  * cpuset's mems), so we needn't do the rebind work for the current task.
21264225399aSPaul Jackson  */
21274225399aSPaul Jackson 
2128846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
2129846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
21301da177e4SLinus Torvalds {
21311da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
21321da177e4SLinus Torvalds 
21331da177e4SLinus Torvalds 	if (!new)
21341da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2135708c1bbcSMiao Xie 
2136708c1bbcSMiao Xie 	/* task's mempolicy is protected by alloc_lock */
2137708c1bbcSMiao Xie 	if (old == current->mempolicy) {
2138708c1bbcSMiao Xie 		task_lock(current);
2139708c1bbcSMiao Xie 		*new = *old;
2140708c1bbcSMiao Xie 		task_unlock(current);
2141708c1bbcSMiao Xie 	} else
2142708c1bbcSMiao Xie 		*new = *old;
2143708c1bbcSMiao Xie 
214499ee4ca7SPaul E. McKenney 	rcu_read_lock();
21454225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
21464225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
2147708c1bbcSMiao Xie 		if (new->flags & MPOL_F_REBINDING)
2148708c1bbcSMiao Xie 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2149708c1bbcSMiao Xie 		else
2150708c1bbcSMiao Xie 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
21514225399aSPaul Jackson 	}
215299ee4ca7SPaul E. McKenney 	rcu_read_unlock();
21531da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
21541da177e4SLinus Torvalds 	return new;
21551da177e4SLinus Torvalds }
21561da177e4SLinus Torvalds 
21571da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
2158fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
21591da177e4SLinus Torvalds {
21601da177e4SLinus Torvalds 	if (!a || !b)
2161fcfb4dccSKOSAKI Motohiro 		return false;
216245c4745aSLee Schermerhorn 	if (a->mode != b->mode)
2163fcfb4dccSKOSAKI Motohiro 		return false;
216419800502SBob Liu 	if (a->flags != b->flags)
2165fcfb4dccSKOSAKI Motohiro 		return false;
216619800502SBob Liu 	if (mpol_store_user_nodemask(a))
216719800502SBob Liu 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2168fcfb4dccSKOSAKI Motohiro 			return false;
216919800502SBob Liu 
217045c4745aSLee Schermerhorn 	switch (a->mode) {
217119770b32SMel Gorman 	case MPOL_BIND:
217219770b32SMel Gorman 		/* Fall through */
21731da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
2174fcfb4dccSKOSAKI Motohiro 		return !!nodes_equal(a->v.nodes, b->v.nodes);
21751da177e4SLinus Torvalds 	case MPOL_PREFERRED:
217675719661SNamhyung Kim 		return a->v.preferred_node == b->v.preferred_node;
21771da177e4SLinus Torvalds 	default:
21781da177e4SLinus Torvalds 		BUG();
2179fcfb4dccSKOSAKI Motohiro 		return false;
21801da177e4SLinus Torvalds 	}
21811da177e4SLinus Torvalds }
21821da177e4SLinus Torvalds 
21831da177e4SLinus Torvalds /*
21841da177e4SLinus Torvalds  * Shared memory backing store policy support.
21851da177e4SLinus Torvalds  *
21861da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
21871da177e4SLinus Torvalds  * The policies are kept in Red-Black tree linked from the inode.
21881da177e4SLinus Torvalds  * They are protected by the sp->lock spinlock, which should be held
21891da177e4SLinus Torvalds  * for any accesses to the tree.
21901da177e4SLinus Torvalds  */
21911da177e4SLinus Torvalds 
21921da177e4SLinus Torvalds /* lookup first element intersecting start-end */
219342288fe3SMel Gorman /* Caller holds sp->lock */
21941da177e4SLinus Torvalds static struct sp_node *
21951da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
21961da177e4SLinus Torvalds {
21971da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
21981da177e4SLinus Torvalds 
21991da177e4SLinus Torvalds 	while (n) {
22001da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
22011da177e4SLinus Torvalds 
22021da177e4SLinus Torvalds 		if (start >= p->end)
22031da177e4SLinus Torvalds 			n = n->rb_right;
22041da177e4SLinus Torvalds 		else if (end <= p->start)
22051da177e4SLinus Torvalds 			n = n->rb_left;
22061da177e4SLinus Torvalds 		else
22071da177e4SLinus Torvalds 			break;
22081da177e4SLinus Torvalds 	}
22091da177e4SLinus Torvalds 	if (!n)
22101da177e4SLinus Torvalds 		return NULL;
22111da177e4SLinus Torvalds 	for (;;) {
22121da177e4SLinus Torvalds 		struct sp_node *w = NULL;
22131da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
22141da177e4SLinus Torvalds 		if (!prev)
22151da177e4SLinus Torvalds 			break;
22161da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
22171da177e4SLinus Torvalds 		if (w->end <= start)
22181da177e4SLinus Torvalds 			break;
22191da177e4SLinus Torvalds 		n = prev;
22201da177e4SLinus Torvalds 	}
22211da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
22221da177e4SLinus Torvalds }
22231da177e4SLinus Torvalds 
22241da177e4SLinus Torvalds /* Insert a new shared policy into the list. */
22251da177e4SLinus Torvalds /* Caller holds sp->lock */
22261da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
22271da177e4SLinus Torvalds {
22281da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
22291da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
22301da177e4SLinus Torvalds 	struct sp_node *nd;
22311da177e4SLinus Torvalds 
22321da177e4SLinus Torvalds 	while (*p) {
22331da177e4SLinus Torvalds 		parent = *p;
22341da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
22351da177e4SLinus Torvalds 		if (new->start < nd->start)
22361da177e4SLinus Torvalds 			p = &(*p)->rb_left;
22371da177e4SLinus Torvalds 		else if (new->end > nd->end)
22381da177e4SLinus Torvalds 			p = &(*p)->rb_right;
22391da177e4SLinus Torvalds 		else
22401da177e4SLinus Torvalds 			BUG();
22411da177e4SLinus Torvalds 	}
22421da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
22431da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
2244140d5a49SPaul Mundt 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
224545c4745aSLee Schermerhorn 		 new->policy ? new->policy->mode : 0);
22461da177e4SLinus Torvalds }
22471da177e4SLinus Torvalds 
22481da177e4SLinus Torvalds /* Find shared policy intersecting idx */
22491da177e4SLinus Torvalds struct mempolicy *
22501da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
22511da177e4SLinus Torvalds {
22521da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
22531da177e4SLinus Torvalds 	struct sp_node *sn;
22541da177e4SLinus Torvalds 
22551da177e4SLinus Torvalds 	if (!sp->root.rb_node)
22561da177e4SLinus Torvalds 		return NULL;
225742288fe3SMel Gorman 	spin_lock(&sp->lock);
22581da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
22591da177e4SLinus Torvalds 	if (sn) {
22601da177e4SLinus Torvalds 		mpol_get(sn->policy);
22611da177e4SLinus Torvalds 		pol = sn->policy;
22621da177e4SLinus Torvalds 	}
226342288fe3SMel Gorman 	spin_unlock(&sp->lock);
22641da177e4SLinus Torvalds 	return pol;
22651da177e4SLinus Torvalds }
22661da177e4SLinus Torvalds 
226763f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n)
226863f74ca2SKOSAKI Motohiro {
226963f74ca2SKOSAKI Motohiro 	mpol_put(n->policy);
227063f74ca2SKOSAKI Motohiro 	kmem_cache_free(sn_cache, n);
227163f74ca2SKOSAKI Motohiro }
227263f74ca2SKOSAKI Motohiro 
2273771fb4d8SLee Schermerhorn /**
2274771fb4d8SLee Schermerhorn  * mpol_misplaced - check whether current page node is valid in policy
2275771fb4d8SLee Schermerhorn  *
2276b46e14acSFabian Frederick  * @page: page to be checked
2277b46e14acSFabian Frederick  * @vma: vm area where page mapped
2278b46e14acSFabian Frederick  * @addr: virtual address where page mapped
2279771fb4d8SLee Schermerhorn  *
2280771fb4d8SLee Schermerhorn  * Look up the current policy node id for vma,addr and compare it to the
2281771fb4d8SLee Schermerhorn  * page's node id.
2282771fb4d8SLee Schermerhorn  *
2283771fb4d8SLee Schermerhorn  * Returns:
2284771fb4d8SLee Schermerhorn  *	-1	- not misplaced, page is in the right node
2285771fb4d8SLee Schermerhorn  *	node	- node id where the page should be
2286771fb4d8SLee Schermerhorn  *
2287771fb4d8SLee Schermerhorn  * Policy determination "mimics" alloc_page_vma().
2288771fb4d8SLee Schermerhorn  * Called from fault path where we know the vma and faulting address.
2289771fb4d8SLee Schermerhorn  */
2290771fb4d8SLee Schermerhorn int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2291771fb4d8SLee Schermerhorn {
2292771fb4d8SLee Schermerhorn 	struct mempolicy *pol;
2293771fb4d8SLee Schermerhorn 	struct zone *zone;
2294771fb4d8SLee Schermerhorn 	int curnid = page_to_nid(page);
2295771fb4d8SLee Schermerhorn 	unsigned long pgoff;
229690572890SPeter Zijlstra 	int thiscpu = raw_smp_processor_id();
229790572890SPeter Zijlstra 	int thisnid = cpu_to_node(thiscpu);
2298771fb4d8SLee Schermerhorn 	int polnid = -1;
2299771fb4d8SLee Schermerhorn 	int ret = -1;
2300771fb4d8SLee Schermerhorn 
2301771fb4d8SLee Schermerhorn 	BUG_ON(!vma);
2302771fb4d8SLee Schermerhorn 
2303771fb4d8SLee Schermerhorn 	pol = get_vma_policy(current, vma, addr);
2304771fb4d8SLee Schermerhorn 	if (!(pol->flags & MPOL_F_MOF))
2305771fb4d8SLee Schermerhorn 		goto out;
2306771fb4d8SLee Schermerhorn 
2307771fb4d8SLee Schermerhorn 	switch (pol->mode) {
2308771fb4d8SLee Schermerhorn 	case MPOL_INTERLEAVE:
2309771fb4d8SLee Schermerhorn 		BUG_ON(addr >= vma->vm_end);
2310771fb4d8SLee Schermerhorn 		BUG_ON(addr < vma->vm_start);
2311771fb4d8SLee Schermerhorn 
2312771fb4d8SLee Schermerhorn 		pgoff = vma->vm_pgoff;
2313771fb4d8SLee Schermerhorn 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2314771fb4d8SLee Schermerhorn 		polnid = offset_il_node(pol, vma, pgoff);
2315771fb4d8SLee Schermerhorn 		break;
2316771fb4d8SLee Schermerhorn 
2317771fb4d8SLee Schermerhorn 	case MPOL_PREFERRED:
2318771fb4d8SLee Schermerhorn 		if (pol->flags & MPOL_F_LOCAL)
2319771fb4d8SLee Schermerhorn 			polnid = numa_node_id();
2320771fb4d8SLee Schermerhorn 		else
2321771fb4d8SLee Schermerhorn 			polnid = pol->v.preferred_node;
2322771fb4d8SLee Schermerhorn 		break;
2323771fb4d8SLee Schermerhorn 
2324771fb4d8SLee Schermerhorn 	case MPOL_BIND:
2325771fb4d8SLee Schermerhorn 		/*
2326771fb4d8SLee Schermerhorn 		 * MPOL_BIND allows binding to multiple nodes.
2327771fb4d8SLee Schermerhorn 		 * Use the current page's node if it is in the policy nodemask,
2328771fb4d8SLee Schermerhorn 		 * else select the nearest allowed node, if any.
2329771fb4d8SLee Schermerhorn 		 * If there are no allowed nodes, use the current node [!misplaced].
2330771fb4d8SLee Schermerhorn 		 */
2331771fb4d8SLee Schermerhorn 		if (node_isset(curnid, pol->v.nodes))
2332771fb4d8SLee Schermerhorn 			goto out;
2333771fb4d8SLee Schermerhorn 		(void)first_zones_zonelist(
2334771fb4d8SLee Schermerhorn 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2335771fb4d8SLee Schermerhorn 				gfp_zone(GFP_HIGHUSER),
2336771fb4d8SLee Schermerhorn 				&pol->v.nodes, &zone);
2337771fb4d8SLee Schermerhorn 		polnid = zone->node;
2338771fb4d8SLee Schermerhorn 		break;
2339771fb4d8SLee Schermerhorn 
2340771fb4d8SLee Schermerhorn 	default:
2341771fb4d8SLee Schermerhorn 		BUG();
2342771fb4d8SLee Schermerhorn 	}
23435606e387SMel Gorman 
23445606e387SMel Gorman 	/* Migrate the page towards the node whose CPU is referencing it */
2345e42c8ff2SMel Gorman 	if (pol->flags & MPOL_F_MORON) {
234690572890SPeter Zijlstra 		polnid = thisnid;
23475606e387SMel Gorman 
234810f39042SRik van Riel 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2349de1c9ce6SRik van Riel 			goto out;
2350de1c9ce6SRik van Riel 	}
2351e42c8ff2SMel Gorman 
2352771fb4d8SLee Schermerhorn 	if (curnid != polnid)
2353771fb4d8SLee Schermerhorn 		ret = polnid;
2354771fb4d8SLee Schermerhorn out:
2355771fb4d8SLee Schermerhorn 	mpol_cond_put(pol);
2356771fb4d8SLee Schermerhorn 
2357771fb4d8SLee Schermerhorn 	return ret;
2358771fb4d8SLee Schermerhorn }
2359771fb4d8SLee Schermerhorn 
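/*
 * Illustrative sketch: how a NUMA hinting fault handler can consume the
 * return value of mpol_misplaced().  The "example_" name is hypothetical,
 * and the page reference counting and PTE locking done by the real fault
 * path are omitted here.
 */
static void example_numa_hint_fault(struct page *page,
				    struct vm_area_struct *vma,
				    unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid == -1)
		return;		/* page already sits on an acceptable node */

	/* Ask the migration code to move the page where the policy wants it */
	migrate_misplaced_page(page, vma, target_nid);
}
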
23601da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
23611da177e4SLinus Torvalds {
2362140d5a49SPaul Mundt 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
23631da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
236463f74ca2SKOSAKI Motohiro 	sp_free(n);
23651da177e4SLinus Torvalds }
23661da177e4SLinus Torvalds 
236742288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
236842288fe3SMel Gorman 			unsigned long end, struct mempolicy *pol)
236942288fe3SMel Gorman {
237042288fe3SMel Gorman 	node->start = start;
237142288fe3SMel Gorman 	node->end = end;
237242288fe3SMel Gorman 	node->policy = pol;
237342288fe3SMel Gorman }
237442288fe3SMel Gorman 
2375dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2376dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
23771da177e4SLinus Torvalds {
2378869833f2SKOSAKI Motohiro 	struct sp_node *n;
2379869833f2SKOSAKI Motohiro 	struct mempolicy *newpol;
23801da177e4SLinus Torvalds 
2381869833f2SKOSAKI Motohiro 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
23821da177e4SLinus Torvalds 	if (!n)
23831da177e4SLinus Torvalds 		return NULL;
2384869833f2SKOSAKI Motohiro 
2385869833f2SKOSAKI Motohiro 	newpol = mpol_dup(pol);
2386869833f2SKOSAKI Motohiro 	if (IS_ERR(newpol)) {
2387869833f2SKOSAKI Motohiro 		kmem_cache_free(sn_cache, n);
2388869833f2SKOSAKI Motohiro 		return NULL;
2389869833f2SKOSAKI Motohiro 	}
2390869833f2SKOSAKI Motohiro 	newpol->flags |= MPOL_F_SHARED;
239142288fe3SMel Gorman 	sp_node_init(n, start, end, newpol);
2392869833f2SKOSAKI Motohiro 
23931da177e4SLinus Torvalds 	return n;
23941da177e4SLinus Torvalds }
23951da177e4SLinus Torvalds 
23961da177e4SLinus Torvalds /* Replace a policy range. */
23971da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
23981da177e4SLinus Torvalds 				 unsigned long end, struct sp_node *new)
23991da177e4SLinus Torvalds {
2400b22d127aSMel Gorman 	struct sp_node *n;
240142288fe3SMel Gorman 	struct sp_node *n_new = NULL;
240242288fe3SMel Gorman 	struct mempolicy *mpol_new = NULL;
2403b22d127aSMel Gorman 	int ret = 0;
24041da177e4SLinus Torvalds 
240542288fe3SMel Gorman restart:
240642288fe3SMel Gorman 	spin_lock(&sp->lock);
24071da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
24081da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
24091da177e4SLinus Torvalds 	while (n && n->start < end) {
24101da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
24111da177e4SLinus Torvalds 		if (n->start >= start) {
24121da177e4SLinus Torvalds 			if (n->end <= end)
24131da177e4SLinus Torvalds 				sp_delete(sp, n);
24141da177e4SLinus Torvalds 			else
24151da177e4SLinus Torvalds 				n->start = end;
24161da177e4SLinus Torvalds 		} else {
24171da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
24181da177e4SLinus Torvalds 			if (n->end > end) {
241942288fe3SMel Gorman 				if (!n_new)
242042288fe3SMel Gorman 					goto alloc_new;
242142288fe3SMel Gorman 
242242288fe3SMel Gorman 				*mpol_new = *n->policy;
242342288fe3SMel Gorman 				atomic_set(&mpol_new->refcnt, 1);
24247880639cSKOSAKI Motohiro 				sp_node_init(n_new, end, n->end, mpol_new);
24251da177e4SLinus Torvalds 				n->end = start;
24265ca39575SHillf Danton 				sp_insert(sp, n_new);
242742288fe3SMel Gorman 				n_new = NULL;
242842288fe3SMel Gorman 				mpol_new = NULL;
24291da177e4SLinus Torvalds 				break;
24301da177e4SLinus Torvalds 			} else
24311da177e4SLinus Torvalds 				n->end = start;
24321da177e4SLinus Torvalds 		}
24331da177e4SLinus Torvalds 		if (!next)
24341da177e4SLinus Torvalds 			break;
24351da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
24361da177e4SLinus Torvalds 	}
24371da177e4SLinus Torvalds 	if (new)
24381da177e4SLinus Torvalds 		sp_insert(sp, new);
243942288fe3SMel Gorman 	spin_unlock(&sp->lock);
244042288fe3SMel Gorman 	ret = 0;
244142288fe3SMel Gorman 
244242288fe3SMel Gorman err_out:
244342288fe3SMel Gorman 	if (mpol_new)
244442288fe3SMel Gorman 		mpol_put(mpol_new);
244542288fe3SMel Gorman 	if (n_new)
244642288fe3SMel Gorman 		kmem_cache_free(sn_cache, n_new);
244742288fe3SMel Gorman 
2448b22d127aSMel Gorman 	return ret;
244942288fe3SMel Gorman 
245042288fe3SMel Gorman alloc_new:
245142288fe3SMel Gorman 	spin_unlock(&sp->lock);
245242288fe3SMel Gorman 	ret = -ENOMEM;
245342288fe3SMel Gorman 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
245442288fe3SMel Gorman 	if (!n_new)
245542288fe3SMel Gorman 		goto err_out;
245642288fe3SMel Gorman 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
245742288fe3SMel Gorman 	if (!mpol_new)
245842288fe3SMel Gorman 		goto err_out;
245942288fe3SMel Gorman 	goto restart;
24601da177e4SLinus Torvalds }
24611da177e4SLinus Torvalds 
246271fe804bSLee Schermerhorn /**
246371fe804bSLee Schermerhorn  * mpol_shared_policy_init - initialize shared policy for inode
246471fe804bSLee Schermerhorn  * @sp: pointer to inode shared policy
246571fe804bSLee Schermerhorn  * @mpol:  struct mempolicy to install
246671fe804bSLee Schermerhorn  *
246771fe804bSLee Schermerhorn  * Install non-NULL @mpol in inode's shared policy rb-tree.
246871fe804bSLee Schermerhorn  * On entry, the current task has a reference on a non-NULL @mpol.
246971fe804bSLee Schermerhorn  * This must be released on exit.
24704bfc4495SKAMEZAWA Hiroyuki  * This is called during get_inode() calls, so we can use GFP_KERNEL.
247171fe804bSLee Schermerhorn  */
247271fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
24737339ff83SRobin Holt {
247458568d2aSMiao Xie 	int ret;
247558568d2aSMiao Xie 
247671fe804bSLee Schermerhorn 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
247742288fe3SMel Gorman 	spin_lock_init(&sp->lock);
24787339ff83SRobin Holt 
247971fe804bSLee Schermerhorn 	if (mpol) {
24807339ff83SRobin Holt 		struct vm_area_struct pvma;
248171fe804bSLee Schermerhorn 		struct mempolicy *new;
24824bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
24837339ff83SRobin Holt 
24844bfc4495SKAMEZAWA Hiroyuki 		if (!scratch)
24855c0c1654SLee Schermerhorn 			goto put_mpol;
248671fe804bSLee Schermerhorn 		/* contextualize the tmpfs mount point mempolicy */
248771fe804bSLee Schermerhorn 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
248815d77835SLee Schermerhorn 		if (IS_ERR(new))
24890cae3457SDan Carpenter 			goto free_scratch; /* no valid nodemask intersection */
249058568d2aSMiao Xie 
249158568d2aSMiao Xie 		task_lock(current);
24924bfc4495SKAMEZAWA Hiroyuki 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
249358568d2aSMiao Xie 		task_unlock(current);
249415d77835SLee Schermerhorn 		if (ret)
24955c0c1654SLee Schermerhorn 			goto put_new;
249671fe804bSLee Schermerhorn 
249771fe804bSLee Schermerhorn 		/* Create pseudo-vma that contains just the policy */
24987339ff83SRobin Holt 		memset(&pvma, 0, sizeof(struct vm_area_struct));
249971fe804bSLee Schermerhorn 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
250071fe804bSLee Schermerhorn 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
250115d77835SLee Schermerhorn 
25025c0c1654SLee Schermerhorn put_new:
250371fe804bSLee Schermerhorn 		mpol_put(new);			/* drop initial ref */
25040cae3457SDan Carpenter free_scratch:
25054bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
25065c0c1654SLee Schermerhorn put_mpol:
25075c0c1654SLee Schermerhorn 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
25087339ff83SRobin Holt 	}
25097339ff83SRobin Holt }
25107339ff83SRobin Holt 
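/*
 * Illustrative sketch: the shared policy API as a tmpfs-like filesystem
 * might use it.  "example_inode_info" and the "example_" helpers are
 * hypothetical; shmem keeps an equivalent struct shared_policy in its own
 * per-inode information.
 */
struct example_inode_info {
	struct shared_policy	policy;
};

static void example_inode_init(struct example_inode_info *info,
			       struct mempolicy *sb_mpol)
{
	/* Consumes the reference on sb_mpol; sb_mpol may be NULL */
	mpol_shared_policy_init(&info->policy, sb_mpol);
}

static struct mempolicy *example_policy_at(struct example_inode_info *info,
					   pgoff_t index)
{
	/* Returns a referenced policy, or NULL to fall back to the default */
	return mpol_shared_policy_lookup(&info->policy, index);
}

static void example_inode_evict(struct example_inode_info *info)
{
	/* Drop all range policies when the inode goes away */
	mpol_free_shared_policy(&info->policy);
}
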
25111da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
25121da177e4SLinus Torvalds 			struct vm_area_struct *vma, struct mempolicy *npol)
25131da177e4SLinus Torvalds {
25141da177e4SLinus Torvalds 	int err;
25151da177e4SLinus Torvalds 	struct sp_node *new = NULL;
25161da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
25171da177e4SLinus Torvalds 
2518028fec41SDavid Rientjes 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
25191da177e4SLinus Torvalds 		 vma->vm_pgoff,
252045c4745aSLee Schermerhorn 		 sz, npol ? npol->mode : -1,
2521028fec41SDavid Rientjes 		 npol ? npol->flags : -1,
252200ef2d2fSDavid Rientjes 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
25231da177e4SLinus Torvalds 
25241da177e4SLinus Torvalds 	if (npol) {
25251da177e4SLinus Torvalds 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
25261da177e4SLinus Torvalds 		if (!new)
25271da177e4SLinus Torvalds 			return -ENOMEM;
25281da177e4SLinus Torvalds 	}
25291da177e4SLinus Torvalds 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
25301da177e4SLinus Torvalds 	if (err && new)
253163f74ca2SKOSAKI Motohiro 		sp_free(new);
25321da177e4SLinus Torvalds 	return err;
25331da177e4SLinus Torvalds }
25341da177e4SLinus Torvalds 
25351da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
25361da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
25371da177e4SLinus Torvalds {
25381da177e4SLinus Torvalds 	struct sp_node *n;
25391da177e4SLinus Torvalds 	struct rb_node *next;
25401da177e4SLinus Torvalds 
25411da177e4SLinus Torvalds 	if (!p->root.rb_node)
25421da177e4SLinus Torvalds 		return;
254342288fe3SMel Gorman 	spin_lock(&p->lock);
25441da177e4SLinus Torvalds 	next = rb_first(&p->root);
25451da177e4SLinus Torvalds 	while (next) {
25461da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
25471da177e4SLinus Torvalds 		next = rb_next(&n->nd);
254863f74ca2SKOSAKI Motohiro 		sp_delete(p, n);
25491da177e4SLinus Torvalds 	}
255042288fe3SMel Gorman 	spin_unlock(&p->lock);
25511da177e4SLinus Torvalds }
25521da177e4SLinus Torvalds 
25531a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
2554c297663cSMel Gorman static int __initdata numabalancing_override;
25551a687c2eSMel Gorman 
25561a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
25571a687c2eSMel Gorman {
25581a687c2eSMel Gorman 	bool numabalancing_default = false;
25591a687c2eSMel Gorman 
25601a687c2eSMel Gorman 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
25611a687c2eSMel Gorman 		numabalancing_default = true;
25621a687c2eSMel Gorman 
2563c297663cSMel Gorman 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2564c297663cSMel Gorman 	if (numabalancing_override)
2565c297663cSMel Gorman 		set_numabalancing_state(numabalancing_override == 1);
2566c297663cSMel Gorman 
25671a687c2eSMel Gorman 	if (nr_node_ids > 1 && !numabalancing_override) {
25684a404beaSAndrew Morton 		pr_info("%s automatic NUMA balancing. "
2569c297663cSMel Gorman 			"Configure with numa_balancing= or the "
2570c297663cSMel Gorman 			"kernel.numa_balancing sysctl\n",
2571c297663cSMel Gorman 			numabalancing_default ? "Enabling" : "Disabling");
25721a687c2eSMel Gorman 		set_numabalancing_state(numabalancing_default);
25731a687c2eSMel Gorman 	}
25741a687c2eSMel Gorman }
25751a687c2eSMel Gorman 
25761a687c2eSMel Gorman static int __init setup_numabalancing(char *str)
25771a687c2eSMel Gorman {
25781a687c2eSMel Gorman 	int ret = 0;
25791a687c2eSMel Gorman 	if (!str)
25801a687c2eSMel Gorman 		goto out;
25811a687c2eSMel Gorman 
25821a687c2eSMel Gorman 	if (!strcmp(str, "enable")) {
2583c297663cSMel Gorman 		numabalancing_override = 1;
25841a687c2eSMel Gorman 		ret = 1;
25851a687c2eSMel Gorman 	} else if (!strcmp(str, "disable")) {
2586c297663cSMel Gorman 		numabalancing_override = -1;
25871a687c2eSMel Gorman 		ret = 1;
25881a687c2eSMel Gorman 	}
25891a687c2eSMel Gorman out:
25901a687c2eSMel Gorman 	if (!ret)
25914a404beaSAndrew Morton 		pr_warn("Unable to parse numa_balancing=\n");
25921a687c2eSMel Gorman 
25931a687c2eSMel Gorman 	return ret;
25941a687c2eSMel Gorman }
25951a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing);
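
/*
 * Illustrative usage: setup_numabalancing() above accepts exactly the two
 * strings it compares against, so automatic NUMA balancing can be forced
 * from the kernel command line with
 *
 *	numa_balancing=enable
 *	numa_balancing=disable
 *
 * and, as check_numabalancing_enable() reports, toggled at run time via the
 * kernel.numa_balancing sysctl.
 */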
25961a687c2eSMel Gorman #else
25971a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void)
25981a687c2eSMel Gorman {
25991a687c2eSMel Gorman }
26001a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */
26011a687c2eSMel Gorman 
26021da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
26031da177e4SLinus Torvalds void __init numa_policy_init(void)
26041da177e4SLinus Torvalds {
2605b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
2606b71636e2SPaul Mundt 	unsigned long largest = 0;
2607b71636e2SPaul Mundt 	int nid, prefer = 0;
2608b71636e2SPaul Mundt 
26091da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
26101da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
261120c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
26121da177e4SLinus Torvalds 
26131da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
26141da177e4SLinus Torvalds 				     sizeof(struct sp_node),
261520c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
26161da177e4SLinus Torvalds 
26175606e387SMel Gorman 	for_each_node(nid) {
26185606e387SMel Gorman 		preferred_node_policy[nid] = (struct mempolicy) {
26195606e387SMel Gorman 			.refcnt = ATOMIC_INIT(1),
26205606e387SMel Gorman 			.mode = MPOL_PREFERRED,
26215606e387SMel Gorman 			.flags = MPOL_F_MOF | MPOL_F_MORON,
26225606e387SMel Gorman 			.v = { .preferred_node = nid, },
26235606e387SMel Gorman 		};
26245606e387SMel Gorman 	}
26255606e387SMel Gorman 
2626b71636e2SPaul Mundt 	/*
2627b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
2628b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB), or
2629b71636e2SPaul Mundt 	 * fall back to the largest node if they're all smaller.
2630b71636e2SPaul Mundt 	 */
2631b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
263201f13bd6SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
2633b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
26341da177e4SLinus Torvalds 
2635b71636e2SPaul Mundt 		/* Preserve the largest node */
2636b71636e2SPaul Mundt 		if (largest < total_pages) {
2637b71636e2SPaul Mundt 			largest = total_pages;
2638b71636e2SPaul Mundt 			prefer = nid;
2639b71636e2SPaul Mundt 		}
2640b71636e2SPaul Mundt 
2641b71636e2SPaul Mundt 		/* Interleave this node? */
2642b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2643b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
2644b71636e2SPaul Mundt 	}
2645b71636e2SPaul Mundt 
2646b71636e2SPaul Mundt 	/* All too small, use the largest */
2647b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
2648b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
2649b71636e2SPaul Mundt 
2650028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2651*b1de0d13SMitchel Humpherys 		pr_err("%s: interleaving failed\n", __func__);
26521a687c2eSMel Gorman 
26531a687c2eSMel Gorman 	check_numabalancing_enable();
26541da177e4SLinus Torvalds }
26551da177e4SLinus Torvalds 
26568bccd85fSChristoph Lameter /* Reset policy of current process to default */
26571da177e4SLinus Torvalds void numa_default_policy(void)
26581da177e4SLinus Torvalds {
2659028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
26601da177e4SLinus Torvalds }
266168860ec1SPaul Jackson 
26624225399aSPaul Jackson /*
2663095f1fc4SLee Schermerhorn  * Parse and format mempolicy from/to strings
2664095f1fc4SLee Schermerhorn  */
2665095f1fc4SLee Schermerhorn 
2666095f1fc4SLee Schermerhorn /*
2667f2a07f40SHugh Dickins  * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
26681a75a6c8SChristoph Lameter  */
2669345ace9cSLee Schermerhorn static const char * const policy_modes[] =
2670345ace9cSLee Schermerhorn {
2671345ace9cSLee Schermerhorn 	[MPOL_DEFAULT]    = "default",
2672345ace9cSLee Schermerhorn 	[MPOL_PREFERRED]  = "prefer",
2673345ace9cSLee Schermerhorn 	[MPOL_BIND]       = "bind",
2674345ace9cSLee Schermerhorn 	[MPOL_INTERLEAVE] = "interleave",
2675d3a71033SLee Schermerhorn 	[MPOL_LOCAL]      = "local",
2676345ace9cSLee Schermerhorn };
26771a75a6c8SChristoph Lameter 
2678095f1fc4SLee Schermerhorn 
2679095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS
2680095f1fc4SLee Schermerhorn /**
2681f2a07f40SHugh Dickins  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2682095f1fc4SLee Schermerhorn  * @str:  string containing mempolicy to parse
268371fe804bSLee Schermerhorn  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2684095f1fc4SLee Schermerhorn  *
2685095f1fc4SLee Schermerhorn  * Format of input:
2686095f1fc4SLee Schermerhorn  *	<mode>[=<flags>][:<nodelist>]
2687095f1fc4SLee Schermerhorn  *
268871fe804bSLee Schermerhorn  * On success, returns 0, else 1
2689095f1fc4SLee Schermerhorn  */
2690a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol)
2691095f1fc4SLee Schermerhorn {
269271fe804bSLee Schermerhorn 	struct mempolicy *new = NULL;
2693b4652e84SLee Schermerhorn 	unsigned short mode;
2694f2a07f40SHugh Dickins 	unsigned short mode_flags;
269571fe804bSLee Schermerhorn 	nodemask_t nodes;
2696095f1fc4SLee Schermerhorn 	char *nodelist = strchr(str, ':');
2697095f1fc4SLee Schermerhorn 	char *flags = strchr(str, '=');
2698095f1fc4SLee Schermerhorn 	int err = 1;
2699095f1fc4SLee Schermerhorn 
2700095f1fc4SLee Schermerhorn 	if (nodelist) {
2701095f1fc4SLee Schermerhorn 		/* NUL-terminate mode or flags string */
2702095f1fc4SLee Schermerhorn 		*nodelist++ = '\0';
270371fe804bSLee Schermerhorn 		if (nodelist_parse(nodelist, nodes))
2704095f1fc4SLee Schermerhorn 			goto out;
270501f13bd6SLai Jiangshan 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2706095f1fc4SLee Schermerhorn 			goto out;
270771fe804bSLee Schermerhorn 	} else
270871fe804bSLee Schermerhorn 		nodes_clear(nodes);
270971fe804bSLee Schermerhorn 
2710095f1fc4SLee Schermerhorn 	if (flags)
2711095f1fc4SLee Schermerhorn 		*flags++ = '\0';	/* terminate mode string */
2712095f1fc4SLee Schermerhorn 
2713479e2802SPeter Zijlstra 	for (mode = 0; mode < MPOL_MAX; mode++) {
2714345ace9cSLee Schermerhorn 		if (!strcmp(str, policy_modes[mode])) {
2715095f1fc4SLee Schermerhorn 			break;
2716095f1fc4SLee Schermerhorn 		}
2717095f1fc4SLee Schermerhorn 	}
2718a720094dSMel Gorman 	if (mode >= MPOL_MAX)
2719095f1fc4SLee Schermerhorn 		goto out;
2720095f1fc4SLee Schermerhorn 
272171fe804bSLee Schermerhorn 	switch (mode) {
2722095f1fc4SLee Schermerhorn 	case MPOL_PREFERRED:
272371fe804bSLee Schermerhorn 		/*
272471fe804bSLee Schermerhorn 		 * Insist on a nodelist of one node only
272571fe804bSLee Schermerhorn 		 */
2726095f1fc4SLee Schermerhorn 		if (nodelist) {
2727095f1fc4SLee Schermerhorn 			char *rest = nodelist;
2728095f1fc4SLee Schermerhorn 			while (isdigit(*rest))
2729095f1fc4SLee Schermerhorn 				rest++;
2730926f2ae0SKOSAKI Motohiro 			if (*rest)
2731926f2ae0SKOSAKI Motohiro 				goto out;
2732095f1fc4SLee Schermerhorn 		}
2733095f1fc4SLee Schermerhorn 		break;
2734095f1fc4SLee Schermerhorn 	case MPOL_INTERLEAVE:
2735095f1fc4SLee Schermerhorn 		/*
2736095f1fc4SLee Schermerhorn 		 * Default to online nodes with memory if no nodelist
2737095f1fc4SLee Schermerhorn 		 */
2738095f1fc4SLee Schermerhorn 		if (!nodelist)
273901f13bd6SLai Jiangshan 			nodes = node_states[N_MEMORY];
27403f226aa1SLee Schermerhorn 		break;
274171fe804bSLee Schermerhorn 	case MPOL_LOCAL:
27423f226aa1SLee Schermerhorn 		/*
274371fe804bSLee Schermerhorn 		 * Don't allow a nodelist;  mpol_new() checks flags
27443f226aa1SLee Schermerhorn 		 */
274571fe804bSLee Schermerhorn 		if (nodelist)
27463f226aa1SLee Schermerhorn 			goto out;
274771fe804bSLee Schermerhorn 		mode = MPOL_PREFERRED;
27483f226aa1SLee Schermerhorn 		break;
2749413b43deSRavikiran G Thirumalai 	case MPOL_DEFAULT:
2750413b43deSRavikiran G Thirumalai 		/*
2751413b43deSRavikiran G Thirumalai 		 * Insist on an empty nodelist
2752413b43deSRavikiran G Thirumalai 		 */
2753413b43deSRavikiran G Thirumalai 		if (!nodelist)
2754413b43deSRavikiran G Thirumalai 			err = 0;
2755413b43deSRavikiran G Thirumalai 		goto out;
2756d69b2e63SKOSAKI Motohiro 	case MPOL_BIND:
275771fe804bSLee Schermerhorn 		/*
2758d69b2e63SKOSAKI Motohiro 		 * Insist on a nodelist
275971fe804bSLee Schermerhorn 		 */
2760d69b2e63SKOSAKI Motohiro 		if (!nodelist)
2761d69b2e63SKOSAKI Motohiro 			goto out;
2762095f1fc4SLee Schermerhorn 	}
2763095f1fc4SLee Schermerhorn 
276471fe804bSLee Schermerhorn 	mode_flags = 0;
2765095f1fc4SLee Schermerhorn 	if (flags) {
2766095f1fc4SLee Schermerhorn 		/*
2767095f1fc4SLee Schermerhorn 		 * Currently, we only support two mutually exclusive
2768095f1fc4SLee Schermerhorn 		 * mode flags.
2769095f1fc4SLee Schermerhorn 		 */
2770095f1fc4SLee Schermerhorn 		if (!strcmp(flags, "static"))
277171fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_STATIC_NODES;
2772095f1fc4SLee Schermerhorn 		else if (!strcmp(flags, "relative"))
277371fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_RELATIVE_NODES;
2774095f1fc4SLee Schermerhorn 		else
2775926f2ae0SKOSAKI Motohiro 			goto out;
2776095f1fc4SLee Schermerhorn 	}
277771fe804bSLee Schermerhorn 
277871fe804bSLee Schermerhorn 	new = mpol_new(mode, mode_flags, &nodes);
277971fe804bSLee Schermerhorn 	if (IS_ERR(new))
2780926f2ae0SKOSAKI Motohiro 		goto out;
2781926f2ae0SKOSAKI Motohiro 
2782f2a07f40SHugh Dickins 	/*
2783f2a07f40SHugh Dickins 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2784f2a07f40SHugh Dickins 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2785f2a07f40SHugh Dickins 	 */
2786f2a07f40SHugh Dickins 	if (mode != MPOL_PREFERRED)
2787f2a07f40SHugh Dickins 		new->v.nodes = nodes;
2788f2a07f40SHugh Dickins 	else if (nodelist)
2789f2a07f40SHugh Dickins 		new->v.preferred_node = first_node(nodes);
2790f2a07f40SHugh Dickins 	else
2791f2a07f40SHugh Dickins 		new->flags |= MPOL_F_LOCAL;
2792f2a07f40SHugh Dickins 
2793f2a07f40SHugh Dickins 	/*
2794f2a07f40SHugh Dickins 	 * Save nodes for contextualization: this will be used to "clone"
2795f2a07f40SHugh Dickins 	 * the mempolicy in a specific context [cpuset] at a later time.
2796f2a07f40SHugh Dickins 	 */
2797e17f74afSLee Schermerhorn 	new->w.user_nodemask = nodes;
2798f2a07f40SHugh Dickins 
2799926f2ae0SKOSAKI Motohiro 	err = 0;
280071fe804bSLee Schermerhorn 
2801095f1fc4SLee Schermerhorn out:
2802095f1fc4SLee Schermerhorn 	/* Restore string for error message */
2803095f1fc4SLee Schermerhorn 	if (nodelist)
2804095f1fc4SLee Schermerhorn 		*--nodelist = ':';
2805095f1fc4SLee Schermerhorn 	if (flags)
2806095f1fc4SLee Schermerhorn 		*--flags = '=';
280771fe804bSLee Schermerhorn 	if (!err)
280871fe804bSLee Schermerhorn 		*mpol = new;
2809095f1fc4SLee Schermerhorn 	return err;
2810095f1fc4SLee Schermerhorn }
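
/*
 * Illustrative sketch: parsing a tmpfs-style mount option with the
 * <mode>[=<flags>][:<nodelist>] format documented above.  The "example_"
 * name and the option string are hypothetical.  Note that mpol_parse_str()
 * temporarily writes NUL bytes into the string, so it must be modifiable.
 */
static struct mempolicy *example_parse_mount_option(void)
{
	char opt[] = "interleave=static:0-3";	/* writable copy */
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(opt, &mpol))
		return NULL;		/* parse error */

	/* Caller now owns a reference; release it with mpol_put() when done */
	return mpol;
}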
2811095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */
2812095f1fc4SLee Schermerhorn 
281371fe804bSLee Schermerhorn /**
281471fe804bSLee Schermerhorn  * mpol_to_str - format a mempolicy structure for printing
281571fe804bSLee Schermerhorn  * @buffer:  to contain formatted mempolicy string
281671fe804bSLee Schermerhorn  * @maxlen:  length of @buffer
281771fe804bSLee Schermerhorn  * @pol:  pointer to mempolicy to be formatted
281871fe804bSLee Schermerhorn  *
2819948927eeSDavid Rientjes  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2820948927eeSDavid Rientjes  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2821948927eeSDavid Rientjes  * longest flag, "relative", and to display at least a few node ids.
28221a75a6c8SChristoph Lameter  */
2823948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
28241a75a6c8SChristoph Lameter {
28251a75a6c8SChristoph Lameter 	char *p = buffer;
2826948927eeSDavid Rientjes 	nodemask_t nodes = NODE_MASK_NONE;
2827948927eeSDavid Rientjes 	unsigned short mode = MPOL_DEFAULT;
2828948927eeSDavid Rientjes 	unsigned short flags = 0;
28291a75a6c8SChristoph Lameter 
28308790c71aSDavid Rientjes 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2831bea904d5SLee Schermerhorn 		mode = pol->mode;
2832948927eeSDavid Rientjes 		flags = pol->flags;
2833948927eeSDavid Rientjes 	}
2834bea904d5SLee Schermerhorn 
28351a75a6c8SChristoph Lameter 	switch (mode) {
28361a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
28371a75a6c8SChristoph Lameter 		break;
28381a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
2839fc36b8d3SLee Schermerhorn 		if (flags & MPOL_F_LOCAL)
2840f2a07f40SHugh Dickins 			mode = MPOL_LOCAL;
284153f2556bSLee Schermerhorn 		else
2842fc36b8d3SLee Schermerhorn 			node_set(pol->v.preferred_node, nodes);
28431a75a6c8SChristoph Lameter 		break;
28441a75a6c8SChristoph Lameter 	case MPOL_BIND:
28451a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
28461a75a6c8SChristoph Lameter 		nodes = pol->v.nodes;
28471a75a6c8SChristoph Lameter 		break;
28481a75a6c8SChristoph Lameter 	default:
2849948927eeSDavid Rientjes 		WARN_ON_ONCE(1);
2850948927eeSDavid Rientjes 		snprintf(p, maxlen, "unknown");
2851948927eeSDavid Rientjes 		return;
28521a75a6c8SChristoph Lameter 	}
28531a75a6c8SChristoph Lameter 
2854b7a9f420SDavid Rientjes 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
28551a75a6c8SChristoph Lameter 
2856fc36b8d3SLee Schermerhorn 	if (flags & MPOL_MODE_FLAGS) {
2857948927eeSDavid Rientjes 		p += snprintf(p, buffer + maxlen - p, "=");
2858f5b087b5SDavid Rientjes 
28592291990aSLee Schermerhorn 		/*
28602291990aSLee Schermerhorn 		 * Currently, the only defined flags are mutually exclusive
28612291990aSLee Schermerhorn 		 */
2862f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
28632291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "static");
28642291990aSLee Schermerhorn 		else if (flags & MPOL_F_RELATIVE_NODES)
28652291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "relative");
2866f5b087b5SDavid Rientjes 	}
2867f5b087b5SDavid Rientjes 
28681a75a6c8SChristoph Lameter 	if (!nodes_empty(nodes)) {
2869948927eeSDavid Rientjes 		p += snprintf(p, buffer + maxlen - p, ":");
28701a75a6c8SChristoph Lameter 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
28711a75a6c8SChristoph Lameter 	}
28721a75a6c8SChristoph Lameter }
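
/*
 * Illustrative sketch: formatting a policy for debug output with the
 * recommended buffer size.  The "example_" name is hypothetical; typical
 * results look like "default", "prefer=static:1" or "interleave:0-3".
 */
static void example_print_policy(const char *tag, struct mempolicy *pol)
{
	char buf[64];

	mpol_to_str(buf, sizeof(buf), pol);
	pr_info("%s: mempolicy %s\n", tag, buf);
}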
2873