xref: /openbmc/linux/mm/mempolicy.c (revision be1a13eb51077b2ec5f7f4306f93dfece503a3f1)
146aeb7e6SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
68bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints in which node(s) memory should
91da177e4SLinus Torvalds  * be allocated.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support four policies per VMA and per process:
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy an process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
221da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
231da177e4SLinus Torvalds  *                no fallback.
248bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
258bccd85fSChristoph Lameter  *                to the last. It would be better if bind would truly restrict
268bccd85fSChristoph Lameter  *                the allocation to memory nodes instead
278bccd85fSChristoph Lameter  *
281da177e4SLinus Torvalds  * preferred       Try a specific node first before normal fallback.
2900ef2d2fSDavid Rientjes  *                As a special case NUMA_NO_NODE here means do the allocation
301da177e4SLinus Torvalds  *                on the local CPU. This is normally identical to default,
311da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non default
321da177e4SLinus Torvalds  *                process policy.
338bccd85fSChristoph Lameter  *
34b27abaccSDave Hansen  * preferred many Try a set of nodes first before normal fallback. This is
35b27abaccSDave Hansen  *                similar to preferred without the special case.
36b27abaccSDave Hansen  *
371da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
381da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
391da177e4SLinus Torvalds  *		  in a NUMA aware kernel and still does by, ahem, default.
401da177e4SLinus Torvalds  *
411da177e4SLinus Torvalds  * The process policy is applied for most non interrupt memory allocations
421da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
431da177e4SLinus Torvalds  * try to allocate on the local CPU. The VMA policy is only applied for memory
441da177e4SLinus Torvalds  * allocations for a VMA in the VM.
451da177e4SLinus Torvalds  *
461da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
471da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
481da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
491da177e4SLinus Torvalds  *
501da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
511da177e4SLinus Torvalds  * requesting a lower zone just use default policy. This implies that
521da177e4SLinus Torvalds  * on systems with highmem kernel lowmem allocation don't get policied.
531da177e4SLinus Torvalds  * Same with GFP_DMA allocations.
541da177e4SLinus Torvalds  *
551da177e4SLinus Torvalds  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
561da177e4SLinus Torvalds  * all users and remembered even when nobody has memory mapped.
571da177e4SLinus Torvalds  */
581da177e4SLinus Torvalds 
591da177e4SLinus Torvalds /* Notebook:
601da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
611da177e4SLinus Torvalds    object
621da177e4SLinus Torvalds    statistics for bigpages
631da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
641da177e4SLinus Torvalds    first item above.
651da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
661da177e4SLinus Torvalds    grows down?
671da177e4SLinus Torvalds    make bind policy root only? It can trigger oom much faster and the
681da177e4SLinus Torvalds    kernel is not always grateful with that.
691da177e4SLinus Torvalds */
701da177e4SLinus Torvalds 
71b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72b1de0d13SMitchel Humpherys 
731da177e4SLinus Torvalds #include <linux/mempolicy.h>
74a520110eSChristoph Hellwig #include <linux/pagewalk.h>
751da177e4SLinus Torvalds #include <linux/highmem.h>
761da177e4SLinus Torvalds #include <linux/hugetlb.h>
771da177e4SLinus Torvalds #include <linux/kernel.h>
781da177e4SLinus Torvalds #include <linux/sched.h>
796e84f315SIngo Molnar #include <linux/sched/mm.h>
806a3827d7SIngo Molnar #include <linux/sched/numa_balancing.h>
81f719ff9bSIngo Molnar #include <linux/sched/task.h>
821da177e4SLinus Torvalds #include <linux/nodemask.h>
831da177e4SLinus Torvalds #include <linux/cpuset.h>
841da177e4SLinus Torvalds #include <linux/slab.h>
851da177e4SLinus Torvalds #include <linux/string.h>
86b95f1b31SPaul Gortmaker #include <linux/export.h>
87b488893aSPavel Emelyanov #include <linux/nsproxy.h>
881da177e4SLinus Torvalds #include <linux/interrupt.h>
891da177e4SLinus Torvalds #include <linux/init.h>
901da177e4SLinus Torvalds #include <linux/compat.h>
9131367466SOtto Ebeling #include <linux/ptrace.h>
92dc9aa5b9SChristoph Lameter #include <linux/swap.h>
931a75a6c8SChristoph Lameter #include <linux/seq_file.h>
941a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
95b20a3503SChristoph Lameter #include <linux/migrate.h>
9662b61f61SHugh Dickins #include <linux/ksm.h>
9795a402c3SChristoph Lameter #include <linux/rmap.h>
9886c3a764SDavid Quigley #include <linux/security.h>
99dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
100095f1fc4SLee Schermerhorn #include <linux/ctype.h>
1016d9c285aSKOSAKI Motohiro #include <linux/mm_inline.h>
102b24f53a0SLee Schermerhorn #include <linux/mmu_notifier.h>
103b1de0d13SMitchel Humpherys #include <linux/printk.h>
104c8633798SNaoya Horiguchi #include <linux/swapops.h>
105dc9aa5b9SChristoph Lameter 
1061da177e4SLinus Torvalds #include <asm/tlbflush.h>
1077c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
1081da177e4SLinus Torvalds 
10962695a84SNick Piggin #include "internal.h"
11062695a84SNick Piggin 
11138e35860SChristoph Lameter /* Internal flags */
112dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
11338e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
114dc9aa5b9SChristoph Lameter 
115fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
116fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1171da177e4SLinus Torvalds 
1181da177e4SLinus Torvalds /* Highest zone. An specific allocation for a zone below that is not
1191da177e4SLinus Torvalds    policied. */
1206267276fSChristoph Lameter enum zone_type policy_zone = 0;
1211da177e4SLinus Torvalds 
122bea904d5SLee Schermerhorn /*
123bea904d5SLee Schermerhorn  * run-time system-wide default policy => local allocation
124bea904d5SLee Schermerhorn  */
125e754d79dSH Hartley Sweeten static struct mempolicy default_policy = {
1261da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
1277858d7bcSFeng Tang 	.mode = MPOL_LOCAL,
1281da177e4SLinus Torvalds };
1291da177e4SLinus Torvalds 
1305606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES];
1315606e387SMel Gorman 
132b2ca916cSDan Williams /**
133b2ca916cSDan Williams  * numa_map_to_online_node - Find closest online node
134f6e92f40SKrzysztof Kozlowski  * @node: Node id to start the search
135b2ca916cSDan Williams  *
136b2ca916cSDan Williams  * Lookup the next closest node by distance if @nid is not online.
137b2ca916cSDan Williams  */
138b2ca916cSDan Williams int numa_map_to_online_node(int node)
139b2ca916cSDan Williams {
1404fcbe96eSDan Williams 	int min_dist = INT_MAX, dist, n, min_node;
141b2ca916cSDan Williams 
1424fcbe96eSDan Williams 	if (node == NUMA_NO_NODE || node_online(node))
1434fcbe96eSDan Williams 		return node;
144b2ca916cSDan Williams 
145b2ca916cSDan Williams 	min_node = node;
146b2ca916cSDan Williams 	for_each_online_node(n) {
147b2ca916cSDan Williams 		dist = node_distance(node, n);
148b2ca916cSDan Williams 		if (dist < min_dist) {
149b2ca916cSDan Williams 			min_dist = dist;
150b2ca916cSDan Williams 			min_node = n;
151b2ca916cSDan Williams 		}
152b2ca916cSDan Williams 	}
153b2ca916cSDan Williams 
154b2ca916cSDan Williams 	return min_node;
155b2ca916cSDan Williams }
156b2ca916cSDan Williams EXPORT_SYMBOL_GPL(numa_map_to_online_node);
157b2ca916cSDan Williams 
15874d2c3a0SOleg Nesterov struct mempolicy *get_task_policy(struct task_struct *p)
1595606e387SMel Gorman {
1605606e387SMel Gorman 	struct mempolicy *pol = p->mempolicy;
161f15ca78eSOleg Nesterov 	int node;
1625606e387SMel Gorman 
163f15ca78eSOleg Nesterov 	if (pol)
164f15ca78eSOleg Nesterov 		return pol;
1655606e387SMel Gorman 
166f15ca78eSOleg Nesterov 	node = numa_node_id();
1671da6f0e1SJianguo Wu 	if (node != NUMA_NO_NODE) {
1681da6f0e1SJianguo Wu 		pol = &preferred_node_policy[node];
169f15ca78eSOleg Nesterov 		/* preferred_node_policy is not initialised early in boot */
170f15ca78eSOleg Nesterov 		if (pol->mode)
171f15ca78eSOleg Nesterov 			return pol;
1721da6f0e1SJianguo Wu 	}
1735606e387SMel Gorman 
174f15ca78eSOleg Nesterov 	return &default_policy;
1755606e387SMel Gorman }
1765606e387SMel Gorman 
17737012946SDavid Rientjes static const struct mempolicy_operations {
17837012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
179213980c0SVlastimil Babka 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
18037012946SDavid Rientjes } mpol_ops[MPOL_MAX];
18137012946SDavid Rientjes 
182f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
183f5b087b5SDavid Rientjes {
1846d556294SBob Liu 	return pol->flags & MPOL_MODE_FLAGS;
1854c50bc01SDavid Rientjes }
1864c50bc01SDavid Rientjes 
1874c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1884c50bc01SDavid Rientjes 				   const nodemask_t *rel)
1894c50bc01SDavid Rientjes {
1904c50bc01SDavid Rientjes 	nodemask_t tmp;
1914c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
1924c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
193f5b087b5SDavid Rientjes }
194f5b087b5SDavid Rientjes 
195be897d48SFeng Tang static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
19637012946SDavid Rientjes {
19737012946SDavid Rientjes 	if (nodes_empty(*nodes))
19837012946SDavid Rientjes 		return -EINVAL;
199269fbe72SBen Widawsky 	pol->nodes = *nodes;
20037012946SDavid Rientjes 	return 0;
20137012946SDavid Rientjes }
20237012946SDavid Rientjes 
20337012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
20437012946SDavid Rientjes {
2057858d7bcSFeng Tang 	if (nodes_empty(*nodes))
2067858d7bcSFeng Tang 		return -EINVAL;
207269fbe72SBen Widawsky 
208269fbe72SBen Widawsky 	nodes_clear(pol->nodes);
209269fbe72SBen Widawsky 	node_set(first_node(*nodes), pol->nodes);
21037012946SDavid Rientjes 	return 0;
21137012946SDavid Rientjes }
21237012946SDavid Rientjes 
21358568d2aSMiao Xie /*
21458568d2aSMiao Xie  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
21558568d2aSMiao Xie  * any, for the new policy.  mpol_new() has already validated the nodes
2167858d7bcSFeng Tang  * parameter with respect to the policy mode and flags.
21758568d2aSMiao Xie  *
21858568d2aSMiao Xie  * Must be called holding task's alloc_lock to protect task's mems_allowed
219c1e8d7c6SMichel Lespinasse  * and mempolicy.  May also be called holding the mmap_lock for write.
22058568d2aSMiao Xie  */
2214bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol,
2224bfc4495SKAMEZAWA Hiroyuki 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
22358568d2aSMiao Xie {
22458568d2aSMiao Xie 	int ret;
22558568d2aSMiao Xie 
2267858d7bcSFeng Tang 	/*
2277858d7bcSFeng Tang 	 * Default (pol==NULL) resp. local memory policies are not a
2287858d7bcSFeng Tang 	 * subject of any remapping. They also do not need any special
2297858d7bcSFeng Tang 	 * constructor.
2307858d7bcSFeng Tang 	 */
2317858d7bcSFeng Tang 	if (!pol || pol->mode == MPOL_LOCAL)
23258568d2aSMiao Xie 		return 0;
2337858d7bcSFeng Tang 
23401f13bd6SLai Jiangshan 	/* Check N_MEMORY */
2354bfc4495SKAMEZAWA Hiroyuki 	nodes_and(nsc->mask1,
23601f13bd6SLai Jiangshan 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
23758568d2aSMiao Xie 
23858568d2aSMiao Xie 	VM_BUG_ON(!nodes);
2397858d7bcSFeng Tang 
24058568d2aSMiao Xie 	if (pol->flags & MPOL_F_RELATIVE_NODES)
2414bfc4495SKAMEZAWA Hiroyuki 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
24258568d2aSMiao Xie 	else
2434bfc4495SKAMEZAWA Hiroyuki 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
2444bfc4495SKAMEZAWA Hiroyuki 
24558568d2aSMiao Xie 	if (mpol_store_user_nodemask(pol))
24658568d2aSMiao Xie 		pol->w.user_nodemask = *nodes;
24758568d2aSMiao Xie 	else
2487858d7bcSFeng Tang 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
24958568d2aSMiao Xie 
2504bfc4495SKAMEZAWA Hiroyuki 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
25158568d2aSMiao Xie 	return ret;
25258568d2aSMiao Xie }
25358568d2aSMiao Xie 
25458568d2aSMiao Xie /*
25558568d2aSMiao Xie  * This function just creates a new policy, does some check and simple
25658568d2aSMiao Xie  * initialization. You must invoke mpol_set_nodemask() to set nodes.
25758568d2aSMiao Xie  */
258028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259028fec41SDavid Rientjes 				  nodemask_t *nodes)
2601da177e4SLinus Torvalds {
2611da177e4SLinus Torvalds 	struct mempolicy *policy;
2621da177e4SLinus Torvalds 
263028fec41SDavid Rientjes 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
26400ef2d2fSDavid Rientjes 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265140d5a49SPaul Mundt 
2663e1f0645SDavid Rientjes 	if (mode == MPOL_DEFAULT) {
2673e1f0645SDavid Rientjes 		if (nodes && !nodes_empty(*nodes))
26837012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
269d3a71033SLee Schermerhorn 		return NULL;
27037012946SDavid Rientjes 	}
2713e1f0645SDavid Rientjes 	VM_BUG_ON(!nodes);
2723e1f0645SDavid Rientjes 
2733e1f0645SDavid Rientjes 	/*
2743e1f0645SDavid Rientjes 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
2753e1f0645SDavid Rientjes 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
2763e1f0645SDavid Rientjes 	 * All other modes require a valid pointer to a non-empty nodemask.
2773e1f0645SDavid Rientjes 	 */
2783e1f0645SDavid Rientjes 	if (mode == MPOL_PREFERRED) {
2793e1f0645SDavid Rientjes 		if (nodes_empty(*nodes)) {
2803e1f0645SDavid Rientjes 			if (((flags & MPOL_F_STATIC_NODES) ||
2813e1f0645SDavid Rientjes 			     (flags & MPOL_F_RELATIVE_NODES)))
2823e1f0645SDavid Rientjes 				return ERR_PTR(-EINVAL);
2837858d7bcSFeng Tang 
2847858d7bcSFeng Tang 			mode = MPOL_LOCAL;
2853e1f0645SDavid Rientjes 		}
286479e2802SPeter Zijlstra 	} else if (mode == MPOL_LOCAL) {
2878d303e44SPiotr Kwapulinski 		if (!nodes_empty(*nodes) ||
2888d303e44SPiotr Kwapulinski 		    (flags & MPOL_F_STATIC_NODES) ||
2898d303e44SPiotr Kwapulinski 		    (flags & MPOL_F_RELATIVE_NODES))
290479e2802SPeter Zijlstra 			return ERR_PTR(-EINVAL);
2913e1f0645SDavid Rientjes 	} else if (nodes_empty(*nodes))
2923e1f0645SDavid Rientjes 		return ERR_PTR(-EINVAL);
2931da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2941da177e4SLinus Torvalds 	if (!policy)
2951da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2961da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
29745c4745aSLee Schermerhorn 	policy->mode = mode;
29837012946SDavid Rientjes 	policy->flags = flags;
2993e1f0645SDavid Rientjes 
30037012946SDavid Rientjes 	return policy;
30137012946SDavid Rientjes }
30237012946SDavid Rientjes 
30352cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. */
30452cd3b07SLee Schermerhorn void __mpol_put(struct mempolicy *p)
30552cd3b07SLee Schermerhorn {
30652cd3b07SLee Schermerhorn 	if (!atomic_dec_and_test(&p->refcnt))
30752cd3b07SLee Schermerhorn 		return;
30852cd3b07SLee Schermerhorn 	kmem_cache_free(policy_cache, p);
30952cd3b07SLee Schermerhorn }
31052cd3b07SLee Schermerhorn 
311213980c0SVlastimil Babka static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
31237012946SDavid Rientjes {
31337012946SDavid Rientjes }
31437012946SDavid Rientjes 
315213980c0SVlastimil Babka static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
3161d0d2680SDavid Rientjes {
3171d0d2680SDavid Rientjes 	nodemask_t tmp;
3181d0d2680SDavid Rientjes 
31937012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
32037012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
32137012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
32237012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3231d0d2680SDavid Rientjes 	else {
324269fbe72SBen Widawsky 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
325213980c0SVlastimil Babka 								*nodes);
32629b190faSzhong jiang 		pol->w.cpuset_mems_allowed = *nodes;
3271d0d2680SDavid Rientjes 	}
32837012946SDavid Rientjes 
329708c1bbcSMiao Xie 	if (nodes_empty(tmp))
330708c1bbcSMiao Xie 		tmp = *nodes;
331708c1bbcSMiao Xie 
332269fbe72SBen Widawsky 	pol->nodes = tmp;
33337012946SDavid Rientjes }
33437012946SDavid Rientjes 
33537012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
336213980c0SVlastimil Babka 						const nodemask_t *nodes)
33737012946SDavid Rientjes {
33837012946SDavid Rientjes 	pol->w.cpuset_mems_allowed = *nodes;
3391d0d2680SDavid Rientjes }
34037012946SDavid Rientjes 
341708c1bbcSMiao Xie /*
342708c1bbcSMiao Xie  * mpol_rebind_policy - Migrate a policy to a different set of nodes
343708c1bbcSMiao Xie  *
344c1e8d7c6SMichel Lespinasse  * Per-vma policies are protected by mmap_lock. Allocations using per-task
345213980c0SVlastimil Babka  * policies are protected by task->mems_allowed_seq to prevent a premature
346213980c0SVlastimil Babka  * OOM/allocation failure due to parallel nodemask modification.
347708c1bbcSMiao Xie  */
348213980c0SVlastimil Babka static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
34937012946SDavid Rientjes {
35037012946SDavid Rientjes 	if (!pol)
35137012946SDavid Rientjes 		return;
3527858d7bcSFeng Tang 	if (!mpol_store_user_nodemask(pol) &&
35337012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
35437012946SDavid Rientjes 		return;
355708c1bbcSMiao Xie 
356213980c0SVlastimil Babka 	mpol_ops[pol->mode].rebind(pol, newmask);
3571d0d2680SDavid Rientjes }
3581d0d2680SDavid Rientjes 
3591d0d2680SDavid Rientjes /*
3601d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires task
3611d0d2680SDavid Rientjes  * pointer, and updates task mempolicy.
36258568d2aSMiao Xie  *
36358568d2aSMiao Xie  * Called with task's alloc_lock held.
3641d0d2680SDavid Rientjes  */
3651d0d2680SDavid Rientjes 
366213980c0SVlastimil Babka void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
3671d0d2680SDavid Rientjes {
368213980c0SVlastimil Babka 	mpol_rebind_policy(tsk->mempolicy, new);
3691d0d2680SDavid Rientjes }
3701d0d2680SDavid Rientjes 
3711d0d2680SDavid Rientjes /*
3721d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
3731d0d2680SDavid Rientjes  *
374c1e8d7c6SMichel Lespinasse  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
3751d0d2680SDavid Rientjes  */
3761d0d2680SDavid Rientjes 
3771d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
3781d0d2680SDavid Rientjes {
3791d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
3801d0d2680SDavid Rientjes 
381d8ed45c5SMichel Lespinasse 	mmap_write_lock(mm);
3821d0d2680SDavid Rientjes 	for (vma = mm->mmap; vma; vma = vma->vm_next)
383213980c0SVlastimil Babka 		mpol_rebind_policy(vma->vm_policy, new);
384d8ed45c5SMichel Lespinasse 	mmap_write_unlock(mm);
3851d0d2680SDavid Rientjes }
3861d0d2680SDavid Rientjes 
38737012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
38837012946SDavid Rientjes 	[MPOL_DEFAULT] = {
38937012946SDavid Rientjes 		.rebind = mpol_rebind_default,
39037012946SDavid Rientjes 	},
39137012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
392be897d48SFeng Tang 		.create = mpol_new_nodemask,
39337012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
39437012946SDavid Rientjes 	},
39537012946SDavid Rientjes 	[MPOL_PREFERRED] = {
39637012946SDavid Rientjes 		.create = mpol_new_preferred,
39737012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
39837012946SDavid Rientjes 	},
39937012946SDavid Rientjes 	[MPOL_BIND] = {
400be897d48SFeng Tang 		.create = mpol_new_nodemask,
40137012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
40237012946SDavid Rientjes 	},
4037858d7bcSFeng Tang 	[MPOL_LOCAL] = {
4047858d7bcSFeng Tang 		.rebind = mpol_rebind_default,
4057858d7bcSFeng Tang 	},
406b27abaccSDave Hansen 	[MPOL_PREFERRED_MANY] = {
407be897d48SFeng Tang 		.create = mpol_new_nodemask,
408b27abaccSDave Hansen 		.rebind = mpol_rebind_preferred,
409b27abaccSDave Hansen 	},
41037012946SDavid Rientjes };
41137012946SDavid Rientjes 
412a53190a4SYang Shi static int migrate_page_add(struct page *page, struct list_head *pagelist,
413fc301289SChristoph Lameter 				unsigned long flags);
4141a75a6c8SChristoph Lameter 
4156f4576e3SNaoya Horiguchi struct queue_pages {
4166f4576e3SNaoya Horiguchi 	struct list_head *pagelist;
4176f4576e3SNaoya Horiguchi 	unsigned long flags;
4186f4576e3SNaoya Horiguchi 	nodemask_t *nmask;
419f18da660SLi Xinhai 	unsigned long start;
420f18da660SLi Xinhai 	unsigned long end;
421f18da660SLi Xinhai 	struct vm_area_struct *first;
4226f4576e3SNaoya Horiguchi };
4236f4576e3SNaoya Horiguchi 
42498094945SNaoya Horiguchi /*
42588aaa2a1SNaoya Horiguchi  * Check if the page's nid is in qp->nmask.
42688aaa2a1SNaoya Horiguchi  *
42788aaa2a1SNaoya Horiguchi  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
42888aaa2a1SNaoya Horiguchi  * in the invert of qp->nmask.
42988aaa2a1SNaoya Horiguchi  */
43088aaa2a1SNaoya Horiguchi static inline bool queue_pages_required(struct page *page,
43188aaa2a1SNaoya Horiguchi 					struct queue_pages *qp)
43288aaa2a1SNaoya Horiguchi {
43388aaa2a1SNaoya Horiguchi 	int nid = page_to_nid(page);
43488aaa2a1SNaoya Horiguchi 	unsigned long flags = qp->flags;
43588aaa2a1SNaoya Horiguchi 
43688aaa2a1SNaoya Horiguchi 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
43788aaa2a1SNaoya Horiguchi }
43888aaa2a1SNaoya Horiguchi 
439a7f40cfeSYang Shi /*
440d8835445SYang Shi  * queue_pages_pmd() has four possible return values:
441e5947d23SYang Shi  * 0 - pages are placed on the right node or queued successfully, or
442e5947d23SYang Shi  *     special page is met, i.e. huge zero page.
443d8835445SYang Shi  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
444d8835445SYang Shi  *     specified.
445d8835445SYang Shi  * 2 - THP was split.
446d8835445SYang Shi  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
447d8835445SYang Shi  *        existing page was already on a node that does not follow the
448d8835445SYang Shi  *        policy.
449a7f40cfeSYang Shi  */
450c8633798SNaoya Horiguchi static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
451c8633798SNaoya Horiguchi 				unsigned long end, struct mm_walk *walk)
452959a7e13SJules Irenge 	__releases(ptl)
453c8633798SNaoya Horiguchi {
454c8633798SNaoya Horiguchi 	int ret = 0;
455c8633798SNaoya Horiguchi 	struct page *page;
456c8633798SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
457c8633798SNaoya Horiguchi 	unsigned long flags;
458c8633798SNaoya Horiguchi 
459c8633798SNaoya Horiguchi 	if (unlikely(is_pmd_migration_entry(*pmd))) {
460a7f40cfeSYang Shi 		ret = -EIO;
461c8633798SNaoya Horiguchi 		goto unlock;
462c8633798SNaoya Horiguchi 	}
463c8633798SNaoya Horiguchi 	page = pmd_page(*pmd);
464c8633798SNaoya Horiguchi 	if (is_huge_zero_page(page)) {
465c8633798SNaoya Horiguchi 		spin_unlock(ptl);
466e5947d23SYang Shi 		walk->action = ACTION_CONTINUE;
467c8633798SNaoya Horiguchi 		goto out;
468c8633798SNaoya Horiguchi 	}
469d8835445SYang Shi 	if (!queue_pages_required(page, qp))
470c8633798SNaoya Horiguchi 		goto unlock;
471c8633798SNaoya Horiguchi 
472c8633798SNaoya Horiguchi 	flags = qp->flags;
473c8633798SNaoya Horiguchi 	/* go to thp migration */
474a7f40cfeSYang Shi 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
475a53190a4SYang Shi 		if (!vma_migratable(walk->vma) ||
476a53190a4SYang Shi 		    migrate_page_add(page, qp->pagelist, flags)) {
477d8835445SYang Shi 			ret = 1;
478a7f40cfeSYang Shi 			goto unlock;
479a7f40cfeSYang Shi 		}
480a7f40cfeSYang Shi 	} else
481a7f40cfeSYang Shi 		ret = -EIO;
482c8633798SNaoya Horiguchi unlock:
483c8633798SNaoya Horiguchi 	spin_unlock(ptl);
484c8633798SNaoya Horiguchi out:
485c8633798SNaoya Horiguchi 	return ret;
486c8633798SNaoya Horiguchi }
487c8633798SNaoya Horiguchi 
48888aaa2a1SNaoya Horiguchi /*
48998094945SNaoya Horiguchi  * Scan through pages checking if pages follow certain conditions,
49098094945SNaoya Horiguchi  * and move them to the pagelist if they do.
491d8835445SYang Shi  *
492d8835445SYang Shi  * queue_pages_pte_range() has three possible return values:
493e5947d23SYang Shi  * 0 - pages are placed on the right node or queued successfully, or
494e5947d23SYang Shi  *     special page is met, i.e. zero page.
495d8835445SYang Shi  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
496d8835445SYang Shi  *     specified.
497d8835445SYang Shi  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
498d8835445SYang Shi  *        on a node that does not follow the policy.
49998094945SNaoya Horiguchi  */
5006f4576e3SNaoya Horiguchi static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
5016f4576e3SNaoya Horiguchi 			unsigned long end, struct mm_walk *walk)
5021da177e4SLinus Torvalds {
5036f4576e3SNaoya Horiguchi 	struct vm_area_struct *vma = walk->vma;
5046f4576e3SNaoya Horiguchi 	struct page *page;
5056f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
5066f4576e3SNaoya Horiguchi 	unsigned long flags = qp->flags;
507c8633798SNaoya Horiguchi 	int ret;
508d8835445SYang Shi 	bool has_unmovable = false;
5093f088420SShijie Luo 	pte_t *pte, *mapped_pte;
510705e87c0SHugh Dickins 	spinlock_t *ptl;
511941150a3SHugh Dickins 
512c8633798SNaoya Horiguchi 	ptl = pmd_trans_huge_lock(pmd, vma);
513c8633798SNaoya Horiguchi 	if (ptl) {
514c8633798SNaoya Horiguchi 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
515d8835445SYang Shi 		if (ret != 2)
516a7f40cfeSYang Shi 			return ret;
517248db92dSKirill A. Shutemov 	}
518d8835445SYang Shi 	/* THP was split, fall through to pte walk */
51991612e0dSHugh Dickins 
520337d9abfSNaoya Horiguchi 	if (pmd_trans_unstable(pmd))
521337d9abfSNaoya Horiguchi 		return 0;
52294723aafSMichal Hocko 
5233f088420SShijie Luo 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
5246f4576e3SNaoya Horiguchi 	for (; addr != end; pte++, addr += PAGE_SIZE) {
52591612e0dSHugh Dickins 		if (!pte_present(*pte))
52691612e0dSHugh Dickins 			continue;
5276aab341eSLinus Torvalds 		page = vm_normal_page(vma, addr, *pte);
5286aab341eSLinus Torvalds 		if (!page)
52991612e0dSHugh Dickins 			continue;
530053837fcSNick Piggin 		/*
53162b61f61SHugh Dickins 		 * vm_normal_page() filters out zero pages, but there might
53262b61f61SHugh Dickins 		 * still be PageReserved pages to skip, perhaps in a VDSO.
533053837fcSNick Piggin 		 */
534b79bc0a0SHugh Dickins 		if (PageReserved(page))
535f4598c8bSChristoph Lameter 			continue;
53688aaa2a1SNaoya Horiguchi 		if (!queue_pages_required(page, qp))
53738e35860SChristoph Lameter 			continue;
538a7f40cfeSYang Shi 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
539d8835445SYang Shi 			/* MPOL_MF_STRICT must be specified if we get here */
540d8835445SYang Shi 			if (!vma_migratable(vma)) {
541d8835445SYang Shi 				has_unmovable = true;
542a7f40cfeSYang Shi 				break;
543d8835445SYang Shi 			}
544a53190a4SYang Shi 
545a53190a4SYang Shi 			/*
546a53190a4SYang Shi 			 * Do not abort immediately since there may be
547a53190a4SYang Shi 			 * temporary off LRU pages in the range.  Still
548a53190a4SYang Shi 			 * need migrate other LRU pages.
549a53190a4SYang Shi 			 */
550a53190a4SYang Shi 			if (migrate_page_add(page, qp->pagelist, flags))
551a53190a4SYang Shi 				has_unmovable = true;
552a7f40cfeSYang Shi 		} else
553a7f40cfeSYang Shi 			break;
5546f4576e3SNaoya Horiguchi 	}
5553f088420SShijie Luo 	pte_unmap_unlock(mapped_pte, ptl);
5566f4576e3SNaoya Horiguchi 	cond_resched();
557d8835445SYang Shi 
558d8835445SYang Shi 	if (has_unmovable)
559d8835445SYang Shi 		return 1;
560d8835445SYang Shi 
561a7f40cfeSYang Shi 	return addr != end ? -EIO : 0;
56291612e0dSHugh Dickins }
56391612e0dSHugh Dickins 
5646f4576e3SNaoya Horiguchi static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
5656f4576e3SNaoya Horiguchi 			       unsigned long addr, unsigned long end,
5666f4576e3SNaoya Horiguchi 			       struct mm_walk *walk)
567e2d8cf40SNaoya Horiguchi {
568dcf17635SLi Xinhai 	int ret = 0;
569e2d8cf40SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE
5706f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
571dcf17635SLi Xinhai 	unsigned long flags = (qp->flags & MPOL_MF_VALID);
572e2d8cf40SNaoya Horiguchi 	struct page *page;
573cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
574d4c54919SNaoya Horiguchi 	pte_t entry;
575e2d8cf40SNaoya Horiguchi 
5766f4576e3SNaoya Horiguchi 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
5776f4576e3SNaoya Horiguchi 	entry = huge_ptep_get(pte);
578d4c54919SNaoya Horiguchi 	if (!pte_present(entry))
579d4c54919SNaoya Horiguchi 		goto unlock;
580d4c54919SNaoya Horiguchi 	page = pte_page(entry);
58188aaa2a1SNaoya Horiguchi 	if (!queue_pages_required(page, qp))
582e2d8cf40SNaoya Horiguchi 		goto unlock;
583dcf17635SLi Xinhai 
584dcf17635SLi Xinhai 	if (flags == MPOL_MF_STRICT) {
585dcf17635SLi Xinhai 		/*
586dcf17635SLi Xinhai 		 * STRICT alone means only detecting misplaced page and no
587dcf17635SLi Xinhai 		 * need to further check other vma.
588dcf17635SLi Xinhai 		 */
589dcf17635SLi Xinhai 		ret = -EIO;
590dcf17635SLi Xinhai 		goto unlock;
591dcf17635SLi Xinhai 	}
592dcf17635SLi Xinhai 
593dcf17635SLi Xinhai 	if (!vma_migratable(walk->vma)) {
594dcf17635SLi Xinhai 		/*
595dcf17635SLi Xinhai 		 * Must be STRICT with MOVE*, otherwise .test_walk() have
596dcf17635SLi Xinhai 		 * stopped walking current vma.
597dcf17635SLi Xinhai 		 * Detecting misplaced page but allow migrating pages which
598dcf17635SLi Xinhai 		 * have been queued.
599dcf17635SLi Xinhai 		 */
600dcf17635SLi Xinhai 		ret = 1;
601dcf17635SLi Xinhai 		goto unlock;
602dcf17635SLi Xinhai 	}
603dcf17635SLi Xinhai 
604e2d8cf40SNaoya Horiguchi 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
605e2d8cf40SNaoya Horiguchi 	if (flags & (MPOL_MF_MOVE_ALL) ||
606dcf17635SLi Xinhai 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
607dcf17635SLi Xinhai 		if (!isolate_huge_page(page, qp->pagelist) &&
608dcf17635SLi Xinhai 			(flags & MPOL_MF_STRICT))
609dcf17635SLi Xinhai 			/*
610dcf17635SLi Xinhai 			 * Failed to isolate page but allow migrating pages
611dcf17635SLi Xinhai 			 * which have been queued.
612dcf17635SLi Xinhai 			 */
613dcf17635SLi Xinhai 			ret = 1;
614dcf17635SLi Xinhai 	}
615e2d8cf40SNaoya Horiguchi unlock:
616cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
617e2d8cf40SNaoya Horiguchi #else
618e2d8cf40SNaoya Horiguchi 	BUG();
619e2d8cf40SNaoya Horiguchi #endif
620dcf17635SLi Xinhai 	return ret;
6211da177e4SLinus Torvalds }
6221da177e4SLinus Torvalds 
6235877231fSAneesh Kumar K.V #ifdef CONFIG_NUMA_BALANCING
624b24f53a0SLee Schermerhorn /*
6254b10e7d5SMel Gorman  * This is used to mark a range of virtual addresses to be inaccessible.
6264b10e7d5SMel Gorman  * These are later cleared by a NUMA hinting fault. Depending on these
6274b10e7d5SMel Gorman  * faults, pages may be migrated for better NUMA placement.
6284b10e7d5SMel Gorman  *
6294b10e7d5SMel Gorman  * This is assuming that NUMA faults are handled using PROT_NONE. If
6304b10e7d5SMel Gorman  * an architecture makes a different choice, it will need further
6314b10e7d5SMel Gorman  * changes to the core.
632b24f53a0SLee Schermerhorn  */
6334b10e7d5SMel Gorman unsigned long change_prot_numa(struct vm_area_struct *vma,
6344b10e7d5SMel Gorman 			unsigned long addr, unsigned long end)
635b24f53a0SLee Schermerhorn {
6364b10e7d5SMel Gorman 	int nr_updated;
637b24f53a0SLee Schermerhorn 
63858705444SPeter Xu 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
63903c5a6e1SMel Gorman 	if (nr_updated)
64003c5a6e1SMel Gorman 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
641b24f53a0SLee Schermerhorn 
6424b10e7d5SMel Gorman 	return nr_updated;
643b24f53a0SLee Schermerhorn }
644b24f53a0SLee Schermerhorn #else
645b24f53a0SLee Schermerhorn static unsigned long change_prot_numa(struct vm_area_struct *vma,
646b24f53a0SLee Schermerhorn 			unsigned long addr, unsigned long end)
647b24f53a0SLee Schermerhorn {
648b24f53a0SLee Schermerhorn 	return 0;
649b24f53a0SLee Schermerhorn }
6505877231fSAneesh Kumar K.V #endif /* CONFIG_NUMA_BALANCING */
651b24f53a0SLee Schermerhorn 
6526f4576e3SNaoya Horiguchi static int queue_pages_test_walk(unsigned long start, unsigned long end,
6536f4576e3SNaoya Horiguchi 				struct mm_walk *walk)
6541da177e4SLinus Torvalds {
6556f4576e3SNaoya Horiguchi 	struct vm_area_struct *vma = walk->vma;
6566f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
6575b952b3cSAndi Kleen 	unsigned long endvma = vma->vm_end;
6586f4576e3SNaoya Horiguchi 	unsigned long flags = qp->flags;
659dc9aa5b9SChristoph Lameter 
660a18b3ac2SLi Xinhai 	/* range check first */
661ce33135cSMiaohe Lin 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
662f18da660SLi Xinhai 
663f18da660SLi Xinhai 	if (!qp->first) {
664f18da660SLi Xinhai 		qp->first = vma;
665f18da660SLi Xinhai 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
666f18da660SLi Xinhai 			(qp->start < vma->vm_start))
667f18da660SLi Xinhai 			/* hole at head side of range */
668a18b3ac2SLi Xinhai 			return -EFAULT;
669a18b3ac2SLi Xinhai 	}
670f18da660SLi Xinhai 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
671f18da660SLi Xinhai 		((vma->vm_end < qp->end) &&
672f18da660SLi Xinhai 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
673f18da660SLi Xinhai 		/* hole at middle or tail of range */
674f18da660SLi Xinhai 		return -EFAULT;
675a18b3ac2SLi Xinhai 
676a7f40cfeSYang Shi 	/*
677a7f40cfeSYang Shi 	 * Need check MPOL_MF_STRICT to return -EIO if possible
678a7f40cfeSYang Shi 	 * regardless of vma_migratable
679a7f40cfeSYang Shi 	 */
680a7f40cfeSYang Shi 	if (!vma_migratable(vma) &&
681a7f40cfeSYang Shi 	    !(flags & MPOL_MF_STRICT))
68248684a65SNaoya Horiguchi 		return 1;
68348684a65SNaoya Horiguchi 
6845b952b3cSAndi Kleen 	if (endvma > end)
6855b952b3cSAndi Kleen 		endvma = end;
686b24f53a0SLee Schermerhorn 
687b24f53a0SLee Schermerhorn 	if (flags & MPOL_MF_LAZY) {
6882c0346a3SMel Gorman 		/* Similar to task_numa_work, skip inaccessible VMAs */
6893122e80eSAnshuman Khandual 		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
6904355c018SLiang Chen 			!(vma->vm_flags & VM_MIXEDMAP))
691b24f53a0SLee Schermerhorn 			change_prot_numa(vma, start, endvma);
6926f4576e3SNaoya Horiguchi 		return 1;
693b24f53a0SLee Schermerhorn 	}
694b24f53a0SLee Schermerhorn 
6956f4576e3SNaoya Horiguchi 	/* queue pages from current vma */
696a7f40cfeSYang Shi 	if (flags & MPOL_MF_VALID)
6976f4576e3SNaoya Horiguchi 		return 0;
6986f4576e3SNaoya Horiguchi 	return 1;
6996f4576e3SNaoya Horiguchi }
700b24f53a0SLee Schermerhorn 
7017b86ac33SChristoph Hellwig static const struct mm_walk_ops queue_pages_walk_ops = {
7027b86ac33SChristoph Hellwig 	.hugetlb_entry		= queue_pages_hugetlb,
7037b86ac33SChristoph Hellwig 	.pmd_entry		= queue_pages_pte_range,
7047b86ac33SChristoph Hellwig 	.test_walk		= queue_pages_test_walk,
7057b86ac33SChristoph Hellwig };
7067b86ac33SChristoph Hellwig 
7076f4576e3SNaoya Horiguchi /*
7086f4576e3SNaoya Horiguchi  * Walk through page tables and collect pages to be migrated.
7096f4576e3SNaoya Horiguchi  *
7106f4576e3SNaoya Horiguchi  * If pages found in a given range are on a set of nodes (determined by
7116f4576e3SNaoya Horiguchi  * @nodes and @flags,) it's isolated and queued to the pagelist which is
712d8835445SYang Shi  * passed via @private.
713d8835445SYang Shi  *
714d8835445SYang Shi  * queue_pages_range() has three possible return values:
715d8835445SYang Shi  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
716d8835445SYang Shi  *     specified.
717d8835445SYang Shi  * 0 - queue pages successfully or no misplaced page.
718a85dfc30SYang Shi  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
719a85dfc30SYang Shi  *         memory range specified by nodemask and maxnode points outside
720a85dfc30SYang Shi  *         your accessible address space (-EFAULT)
7216f4576e3SNaoya Horiguchi  */
7226f4576e3SNaoya Horiguchi static int
7236f4576e3SNaoya Horiguchi queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
7246f4576e3SNaoya Horiguchi 		nodemask_t *nodes, unsigned long flags,
7256f4576e3SNaoya Horiguchi 		struct list_head *pagelist)
7266f4576e3SNaoya Horiguchi {
727f18da660SLi Xinhai 	int err;
7286f4576e3SNaoya Horiguchi 	struct queue_pages qp = {
7296f4576e3SNaoya Horiguchi 		.pagelist = pagelist,
7306f4576e3SNaoya Horiguchi 		.flags = flags,
7316f4576e3SNaoya Horiguchi 		.nmask = nodes,
732f18da660SLi Xinhai 		.start = start,
733f18da660SLi Xinhai 		.end = end,
734f18da660SLi Xinhai 		.first = NULL,
7356f4576e3SNaoya Horiguchi 	};
7366f4576e3SNaoya Horiguchi 
737f18da660SLi Xinhai 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
738f18da660SLi Xinhai 
739f18da660SLi Xinhai 	if (!qp.first)
740f18da660SLi Xinhai 		/* whole range in hole */
741f18da660SLi Xinhai 		err = -EFAULT;
742f18da660SLi Xinhai 
743f18da660SLi Xinhai 	return err;
7441da177e4SLinus Torvalds }
7451da177e4SLinus Torvalds 
746869833f2SKOSAKI Motohiro /*
747869833f2SKOSAKI Motohiro  * Apply policy to a single VMA
748c1e8d7c6SMichel Lespinasse  * This must be called with the mmap_lock held for writing.
749869833f2SKOSAKI Motohiro  */
750869833f2SKOSAKI Motohiro static int vma_replace_policy(struct vm_area_struct *vma,
751869833f2SKOSAKI Motohiro 						struct mempolicy *pol)
7528d34694cSKOSAKI Motohiro {
753869833f2SKOSAKI Motohiro 	int err;
754869833f2SKOSAKI Motohiro 	struct mempolicy *old;
755869833f2SKOSAKI Motohiro 	struct mempolicy *new;
7568d34694cSKOSAKI Motohiro 
7578d34694cSKOSAKI Motohiro 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
7588d34694cSKOSAKI Motohiro 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
7598d34694cSKOSAKI Motohiro 		 vma->vm_ops, vma->vm_file,
7608d34694cSKOSAKI Motohiro 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
7618d34694cSKOSAKI Motohiro 
762869833f2SKOSAKI Motohiro 	new = mpol_dup(pol);
763869833f2SKOSAKI Motohiro 	if (IS_ERR(new))
764869833f2SKOSAKI Motohiro 		return PTR_ERR(new);
765869833f2SKOSAKI Motohiro 
766869833f2SKOSAKI Motohiro 	if (vma->vm_ops && vma->vm_ops->set_policy) {
7678d34694cSKOSAKI Motohiro 		err = vma->vm_ops->set_policy(vma, new);
768869833f2SKOSAKI Motohiro 		if (err)
769869833f2SKOSAKI Motohiro 			goto err_out;
7708d34694cSKOSAKI Motohiro 	}
771869833f2SKOSAKI Motohiro 
772869833f2SKOSAKI Motohiro 	old = vma->vm_policy;
773c1e8d7c6SMichel Lespinasse 	vma->vm_policy = new; /* protected by mmap_lock */
774869833f2SKOSAKI Motohiro 	mpol_put(old);
775869833f2SKOSAKI Motohiro 
776869833f2SKOSAKI Motohiro 	return 0;
777869833f2SKOSAKI Motohiro  err_out:
778869833f2SKOSAKI Motohiro 	mpol_put(new);
7798d34694cSKOSAKI Motohiro 	return err;
7808d34694cSKOSAKI Motohiro }
7818d34694cSKOSAKI Motohiro 
7821da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */
7839d8cebd4SKOSAKI Motohiro static int mbind_range(struct mm_struct *mm, unsigned long start,
7849d8cebd4SKOSAKI Motohiro 		       unsigned long end, struct mempolicy *new_pol)
7851da177e4SLinus Torvalds {
7861da177e4SLinus Torvalds 	struct vm_area_struct *next;
7879d8cebd4SKOSAKI Motohiro 	struct vm_area_struct *prev;
7889d8cebd4SKOSAKI Motohiro 	struct vm_area_struct *vma;
7899d8cebd4SKOSAKI Motohiro 	int err = 0;
790e26a5114SKOSAKI Motohiro 	pgoff_t pgoff;
7919d8cebd4SKOSAKI Motohiro 	unsigned long vmstart;
7929d8cebd4SKOSAKI Motohiro 	unsigned long vmend;
7931da177e4SLinus Torvalds 
794097d5910SLinus Torvalds 	vma = find_vma(mm, start);
795f18da660SLi Xinhai 	VM_BUG_ON(!vma);
7969d8cebd4SKOSAKI Motohiro 
797097d5910SLinus Torvalds 	prev = vma->vm_prev;
798e26a5114SKOSAKI Motohiro 	if (start > vma->vm_start)
799e26a5114SKOSAKI Motohiro 		prev = vma;
800e26a5114SKOSAKI Motohiro 
8019d8cebd4SKOSAKI Motohiro 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
8021da177e4SLinus Torvalds 		next = vma->vm_next;
8039d8cebd4SKOSAKI Motohiro 		vmstart = max(start, vma->vm_start);
8049d8cebd4SKOSAKI Motohiro 		vmend   = min(end, vma->vm_end);
8059d8cebd4SKOSAKI Motohiro 
806e26a5114SKOSAKI Motohiro 		if (mpol_equal(vma_policy(vma), new_pol))
807e26a5114SKOSAKI Motohiro 			continue;
808e26a5114SKOSAKI Motohiro 
809e26a5114SKOSAKI Motohiro 		pgoff = vma->vm_pgoff +
810e26a5114SKOSAKI Motohiro 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
8119d8cebd4SKOSAKI Motohiro 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
812e26a5114SKOSAKI Motohiro 				 vma->anon_vma, vma->vm_file, pgoff,
8139a10064fSColin Cross 				 new_pol, vma->vm_userfaultfd_ctx,
8149a10064fSColin Cross 				 vma_anon_name(vma));
8159d8cebd4SKOSAKI Motohiro 		if (prev) {
8169d8cebd4SKOSAKI Motohiro 			vma = prev;
8179d8cebd4SKOSAKI Motohiro 			next = vma->vm_next;
8183964acd0SOleg Nesterov 			if (mpol_equal(vma_policy(vma), new_pol))
8199d8cebd4SKOSAKI Motohiro 				continue;
8203964acd0SOleg Nesterov 			/* vma_merge() joined vma && vma->next, case 8 */
8213964acd0SOleg Nesterov 			goto replace;
8221da177e4SLinus Torvalds 		}
8239d8cebd4SKOSAKI Motohiro 		if (vma->vm_start != vmstart) {
8249d8cebd4SKOSAKI Motohiro 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
8259d8cebd4SKOSAKI Motohiro 			if (err)
8269d8cebd4SKOSAKI Motohiro 				goto out;
8279d8cebd4SKOSAKI Motohiro 		}
8289d8cebd4SKOSAKI Motohiro 		if (vma->vm_end != vmend) {
8299d8cebd4SKOSAKI Motohiro 			err = split_vma(vma->vm_mm, vma, vmend, 0);
8309d8cebd4SKOSAKI Motohiro 			if (err)
8319d8cebd4SKOSAKI Motohiro 				goto out;
8329d8cebd4SKOSAKI Motohiro 		}
8333964acd0SOleg Nesterov  replace:
834869833f2SKOSAKI Motohiro 		err = vma_replace_policy(vma, new_pol);
8359d8cebd4SKOSAKI Motohiro 		if (err)
8369d8cebd4SKOSAKI Motohiro 			goto out;
8379d8cebd4SKOSAKI Motohiro 	}
8389d8cebd4SKOSAKI Motohiro 
8399d8cebd4SKOSAKI Motohiro  out:
8401da177e4SLinus Torvalds 	return err;
8411da177e4SLinus Torvalds }
8421da177e4SLinus Torvalds 
8431da177e4SLinus Torvalds /* Set the process memory policy */
844028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
845028fec41SDavid Rientjes 			     nodemask_t *nodes)
8461da177e4SLinus Torvalds {
84758568d2aSMiao Xie 	struct mempolicy *new, *old;
8484bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH(scratch);
84958568d2aSMiao Xie 	int ret;
8501da177e4SLinus Torvalds 
8514bfc4495SKAMEZAWA Hiroyuki 	if (!scratch)
8524bfc4495SKAMEZAWA Hiroyuki 		return -ENOMEM;
853f4e53d91SLee Schermerhorn 
8544bfc4495SKAMEZAWA Hiroyuki 	new = mpol_new(mode, flags, nodes);
8554bfc4495SKAMEZAWA Hiroyuki 	if (IS_ERR(new)) {
8564bfc4495SKAMEZAWA Hiroyuki 		ret = PTR_ERR(new);
8574bfc4495SKAMEZAWA Hiroyuki 		goto out;
8584bfc4495SKAMEZAWA Hiroyuki 	}
8592c7c3a7dSOleg Nesterov 
8604bfc4495SKAMEZAWA Hiroyuki 	ret = mpol_set_nodemask(new, nodes, scratch);
86158568d2aSMiao Xie 	if (ret) {
86258568d2aSMiao Xie 		mpol_put(new);
8634bfc4495SKAMEZAWA Hiroyuki 		goto out;
86458568d2aSMiao Xie 	}
86578b132e9SWei Yang 	task_lock(current);
86658568d2aSMiao Xie 	old = current->mempolicy;
8671da177e4SLinus Torvalds 	current->mempolicy = new;
86845816682SVlastimil Babka 	if (new && new->mode == MPOL_INTERLEAVE)
86945816682SVlastimil Babka 		current->il_prev = MAX_NUMNODES-1;
87058568d2aSMiao Xie 	task_unlock(current);
87158568d2aSMiao Xie 	mpol_put(old);
8724bfc4495SKAMEZAWA Hiroyuki 	ret = 0;
8734bfc4495SKAMEZAWA Hiroyuki out:
8744bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH_FREE(scratch);
8754bfc4495SKAMEZAWA Hiroyuki 	return ret;
8761da177e4SLinus Torvalds }
8771da177e4SLinus Torvalds 
878bea904d5SLee Schermerhorn /*
879bea904d5SLee Schermerhorn  * Return nodemask for policy for get_mempolicy() query
88058568d2aSMiao Xie  *
88158568d2aSMiao Xie  * Called with task's alloc_lock held
882bea904d5SLee Schermerhorn  */
883bea904d5SLee Schermerhorn static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
8841da177e4SLinus Torvalds {
885dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
886bea904d5SLee Schermerhorn 	if (p == &default_policy)
887bea904d5SLee Schermerhorn 		return;
888bea904d5SLee Schermerhorn 
88945c4745aSLee Schermerhorn 	switch (p->mode) {
89019770b32SMel Gorman 	case MPOL_BIND:
8911da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
892269fbe72SBen Widawsky 	case MPOL_PREFERRED:
893b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
894269fbe72SBen Widawsky 		*nodes = p->nodes;
8951da177e4SLinus Torvalds 		break;
8967858d7bcSFeng Tang 	case MPOL_LOCAL:
8977858d7bcSFeng Tang 		/* return empty node mask for local allocation */
8987858d7bcSFeng Tang 		break;
8991da177e4SLinus Torvalds 	default:
9001da177e4SLinus Torvalds 		BUG();
9011da177e4SLinus Torvalds 	}
9021da177e4SLinus Torvalds }
9031da177e4SLinus Torvalds 
9043b9aadf7SAndrea Arcangeli static int lookup_node(struct mm_struct *mm, unsigned long addr)
9051da177e4SLinus Torvalds {
906ba841078SPeter Xu 	struct page *p = NULL;
9071da177e4SLinus Torvalds 	int err;
9081da177e4SLinus Torvalds 
9093b9aadf7SAndrea Arcangeli 	int locked = 1;
9103b9aadf7SAndrea Arcangeli 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
9112d3a36a4SMichal Hocko 	if (err > 0) {
9121da177e4SLinus Torvalds 		err = page_to_nid(p);
9131da177e4SLinus Torvalds 		put_page(p);
9141da177e4SLinus Torvalds 	}
9153b9aadf7SAndrea Arcangeli 	if (locked)
916d8ed45c5SMichel Lespinasse 		mmap_read_unlock(mm);
9171da177e4SLinus Torvalds 	return err;
9181da177e4SLinus Torvalds }
9191da177e4SLinus Torvalds 
9201da177e4SLinus Torvalds /* Retrieve NUMA policy */
921dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
9221da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
9231da177e4SLinus Torvalds {
9248bccd85fSChristoph Lameter 	int err;
9251da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
9261da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
9273b9aadf7SAndrea Arcangeli 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
9281da177e4SLinus Torvalds 
929754af6f5SLee Schermerhorn 	if (flags &
930754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
9311da177e4SLinus Torvalds 		return -EINVAL;
932754af6f5SLee Schermerhorn 
933754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
934754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
935754af6f5SLee Schermerhorn 			return -EINVAL;
936754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
93758568d2aSMiao Xie 		task_lock(current);
938754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
93958568d2aSMiao Xie 		task_unlock(current);
940754af6f5SLee Schermerhorn 		return 0;
941754af6f5SLee Schermerhorn 	}
942754af6f5SLee Schermerhorn 
9431da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
944bea904d5SLee Schermerhorn 		/*
945bea904d5SLee Schermerhorn 		 * Do NOT fall back to task policy if the
946bea904d5SLee Schermerhorn 		 * vma/shared policy at addr is NULL.  We
947bea904d5SLee Schermerhorn 		 * want to return MPOL_DEFAULT in this case.
948bea904d5SLee Schermerhorn 		 */
949d8ed45c5SMichel Lespinasse 		mmap_read_lock(mm);
95033e3575cSLiam Howlett 		vma = vma_lookup(mm, addr);
9511da177e4SLinus Torvalds 		if (!vma) {
952d8ed45c5SMichel Lespinasse 			mmap_read_unlock(mm);
9531da177e4SLinus Torvalds 			return -EFAULT;
9541da177e4SLinus Torvalds 		}
9551da177e4SLinus Torvalds 		if (vma->vm_ops && vma->vm_ops->get_policy)
9561da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
9571da177e4SLinus Torvalds 		else
9581da177e4SLinus Torvalds 			pol = vma->vm_policy;
9591da177e4SLinus Torvalds 	} else if (addr)
9601da177e4SLinus Torvalds 		return -EINVAL;
9611da177e4SLinus Torvalds 
9621da177e4SLinus Torvalds 	if (!pol)
963bea904d5SLee Schermerhorn 		pol = &default_policy;	/* indicates default behavior */
9641da177e4SLinus Torvalds 
9651da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
9661da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
9673b9aadf7SAndrea Arcangeli 			/*
9683b9aadf7SAndrea Arcangeli 			 * Take a refcount on the mpol, lookup_node()
969baf2f90bSLu Jialin 			 * will drop the mmap_lock, so after calling
9703b9aadf7SAndrea Arcangeli 			 * lookup_node() only "pol" remains valid, "vma"
9713b9aadf7SAndrea Arcangeli 			 * is stale.
9723b9aadf7SAndrea Arcangeli 			 */
9733b9aadf7SAndrea Arcangeli 			pol_refcount = pol;
9743b9aadf7SAndrea Arcangeli 			vma = NULL;
9753b9aadf7SAndrea Arcangeli 			mpol_get(pol);
9763b9aadf7SAndrea Arcangeli 			err = lookup_node(mm, addr);
9771da177e4SLinus Torvalds 			if (err < 0)
9781da177e4SLinus Torvalds 				goto out;
9798bccd85fSChristoph Lameter 			*policy = err;
9801da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
98145c4745aSLee Schermerhorn 				pol->mode == MPOL_INTERLEAVE) {
982269fbe72SBen Widawsky 			*policy = next_node_in(current->il_prev, pol->nodes);
9831da177e4SLinus Torvalds 		} else {
9841da177e4SLinus Torvalds 			err = -EINVAL;
9851da177e4SLinus Torvalds 			goto out;
9861da177e4SLinus Torvalds 		}
987bea904d5SLee Schermerhorn 	} else {
988bea904d5SLee Schermerhorn 		*policy = pol == &default_policy ? MPOL_DEFAULT :
989bea904d5SLee Schermerhorn 						pol->mode;
990d79df630SDavid Rientjes 		/*
991d79df630SDavid Rientjes 		 * Internal mempolicy flags must be masked off before exposing
992d79df630SDavid Rientjes 		 * the policy to userspace.
993d79df630SDavid Rientjes 		 */
994d79df630SDavid Rientjes 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
995bea904d5SLee Schermerhorn 	}
9961da177e4SLinus Torvalds 
9971da177e4SLinus Torvalds 	err = 0;
99858568d2aSMiao Xie 	if (nmask) {
999c6b6ef8bSLee Schermerhorn 		if (mpol_store_user_nodemask(pol)) {
1000c6b6ef8bSLee Schermerhorn 			*nmask = pol->w.user_nodemask;
1001c6b6ef8bSLee Schermerhorn 		} else {
100258568d2aSMiao Xie 			task_lock(current);
1003bea904d5SLee Schermerhorn 			get_policy_nodemask(pol, nmask);
100458568d2aSMiao Xie 			task_unlock(current);
100558568d2aSMiao Xie 		}
1006c6b6ef8bSLee Schermerhorn 	}
10071da177e4SLinus Torvalds 
10081da177e4SLinus Torvalds  out:
100952cd3b07SLee Schermerhorn 	mpol_cond_put(pol);
10101da177e4SLinus Torvalds 	if (vma)
1011d8ed45c5SMichel Lespinasse 		mmap_read_unlock(mm);
10123b9aadf7SAndrea Arcangeli 	if (pol_refcount)
10133b9aadf7SAndrea Arcangeli 		mpol_put(pol_refcount);
10141da177e4SLinus Torvalds 	return err;
10151da177e4SLinus Torvalds }
10161da177e4SLinus Torvalds 
1017b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
10188bccd85fSChristoph Lameter /*
1019c8633798SNaoya Horiguchi  * page migration, thp tail pages can be passed.
10206ce3c4c0SChristoph Lameter  */
1021a53190a4SYang Shi static int migrate_page_add(struct page *page, struct list_head *pagelist,
1022fc301289SChristoph Lameter 				unsigned long flags)
10236ce3c4c0SChristoph Lameter {
1024c8633798SNaoya Horiguchi 	struct page *head = compound_head(page);
10256ce3c4c0SChristoph Lameter 	/*
1026fc301289SChristoph Lameter 	 * Avoid migrating a page that is shared with others.
10276ce3c4c0SChristoph Lameter 	 */
1028c8633798SNaoya Horiguchi 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1029c8633798SNaoya Horiguchi 		if (!isolate_lru_page(head)) {
1030c8633798SNaoya Horiguchi 			list_add_tail(&head->lru, pagelist);
1031c8633798SNaoya Horiguchi 			mod_node_page_state(page_pgdat(head),
10329de4f22aSHuang Ying 				NR_ISOLATED_ANON + page_is_file_lru(head),
10336c357848SMatthew Wilcox (Oracle) 				thp_nr_pages(head));
1034a53190a4SYang Shi 		} else if (flags & MPOL_MF_STRICT) {
1035a53190a4SYang Shi 			/*
1036a53190a4SYang Shi 			 * Non-movable page may reach here.  And, there may be
1037a53190a4SYang Shi 			 * temporary off LRU pages or non-LRU movable pages.
1038a53190a4SYang Shi 			 * Treat them as unmovable pages since they can't be
1039a53190a4SYang Shi 			 * isolated, so they can't be moved at the moment.  It
1040a53190a4SYang Shi 			 * should return -EIO for this case too.
1041a53190a4SYang Shi 			 */
1042a53190a4SYang Shi 			return -EIO;
104362695a84SNick Piggin 		}
104462695a84SNick Piggin 	}
1045a53190a4SYang Shi 
1046a53190a4SYang Shi 	return 0;
10476ce3c4c0SChristoph Lameter }
10486ce3c4c0SChristoph Lameter 
10496ce3c4c0SChristoph Lameter /*
10507e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
10517e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
10527e2ab150SChristoph Lameter  */
1053dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1054dbcb0f19SAdrian Bunk 			   int flags)
10557e2ab150SChristoph Lameter {
10567e2ab150SChristoph Lameter 	nodemask_t nmask;
10577e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
10587e2ab150SChristoph Lameter 	int err = 0;
1059a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1060a0976311SJoonsoo Kim 		.nid = dest,
1061a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1062a0976311SJoonsoo Kim 	};
10637e2ab150SChristoph Lameter 
10647e2ab150SChristoph Lameter 	nodes_clear(nmask);
10657e2ab150SChristoph Lameter 	node_set(source, nmask);
10667e2ab150SChristoph Lameter 
106708270807SMinchan Kim 	/*
106808270807SMinchan Kim 	 * This does not "check" the range but isolates all pages that
106908270807SMinchan Kim 	 * need migration.  Between passing in the full user address
107008270807SMinchan Kim 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
107108270807SMinchan Kim 	 */
107208270807SMinchan Kim 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
107398094945SNaoya Horiguchi 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
10747e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
10757e2ab150SChristoph Lameter 
1076cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
1077a0976311SJoonsoo Kim 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
10785ac95884SYang Shi 				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1079cf608ac1SMinchan Kim 		if (err)
1080e2d8cf40SNaoya Horiguchi 			putback_movable_pages(&pagelist);
1081cf608ac1SMinchan Kim 	}
108295a402c3SChristoph Lameter 
10837e2ab150SChristoph Lameter 	return err;
10847e2ab150SChristoph Lameter }
10857e2ab150SChristoph Lameter 
10867e2ab150SChristoph Lameter /*
10877e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
10887e2ab150SChristoph Lameter  * layout as much as possible.
108939743889SChristoph Lameter  *
109039743889SChristoph Lameter  * Returns the number of page that could not be moved.
109139743889SChristoph Lameter  */
10920ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
10930ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
109439743889SChristoph Lameter {
10957e2ab150SChristoph Lameter 	int busy = 0;
1096f555befdSJan Stancek 	int err = 0;
10977e2ab150SChristoph Lameter 	nodemask_t tmp;
109839743889SChristoph Lameter 
1099361a2a22SMinchan Kim 	lru_cache_disable();
11000aedadf9SChristoph Lameter 
1101d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
1102d4984711SChristoph Lameter 
11037e2ab150SChristoph Lameter 	/*
11047e2ab150SChristoph Lameter 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
11057e2ab150SChristoph Lameter 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
11067e2ab150SChristoph Lameter 	 * bit in 'tmp', and return that <source, dest> pair for migration.
11077e2ab150SChristoph Lameter 	 * The pair of nodemasks 'to' and 'from' define the map.
11087e2ab150SChristoph Lameter 	 *
11097e2ab150SChristoph Lameter 	 * If no pair of bits is found that way, fallback to picking some
11107e2ab150SChristoph Lameter 	 * pair of 'source' and 'dest' bits that are not the same.  If the
11117e2ab150SChristoph Lameter 	 * 'source' and 'dest' bits are the same, this represents a node
11127e2ab150SChristoph Lameter 	 * that will be migrating to itself, so no pages need move.
11137e2ab150SChristoph Lameter 	 *
11147e2ab150SChristoph Lameter 	 * If no bits are left in 'tmp', or if all remaining bits left
11157e2ab150SChristoph Lameter 	 * in 'tmp' correspond to the same bit in 'to', return false
11167e2ab150SChristoph Lameter 	 * (nothing left to migrate).
11177e2ab150SChristoph Lameter 	 *
11187e2ab150SChristoph Lameter 	 * This lets us pick a pair of nodes to migrate between, such that
11197e2ab150SChristoph Lameter 	 * if possible the dest node is not already occupied by some other
11207e2ab150SChristoph Lameter 	 * source node, minimizing the risk of overloading the memory on a
11217e2ab150SChristoph Lameter 	 * node that would happen if we migrated incoming memory to a node
11227e2ab150SChristoph Lameter 	 * before migrating outgoing memory source that same node.
11237e2ab150SChristoph Lameter 	 *
11247e2ab150SChristoph Lameter 	 * A single scan of tmp is sufficient.  As we go, we remember the
11257e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
11267e2ab150SChristoph Lameter 	 * that not only moved, but what's better, moved to an empty slot
11277e2ab150SChristoph Lameter 	 * (d is not set in tmp), then we break out then, with that pair.
1128ae0e47f0SJustin P. Mattock 	 * Otherwise when we finish scanning from_tmp, we at least have the
11297e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved.  If we get all the way through
11307e2ab150SChristoph Lameter 	 * the scan of tmp without finding any node that moved, much less
11317e2ab150SChristoph Lameter 	 * moved to an empty node, then there is nothing left worth migrating.
11327e2ab150SChristoph Lameter 	 */
11337e2ab150SChristoph Lameter 
11340ce72d4fSAndrew Morton 	tmp = *from;
11357e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
11367e2ab150SChristoph Lameter 		int s, d;
1137b76ac7e7SJianguo Wu 		int source = NUMA_NO_NODE;
11387e2ab150SChristoph Lameter 		int dest = 0;
11397e2ab150SChristoph Lameter 
11407e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
11414a5b18ccSLarry Woodman 
11424a5b18ccSLarry Woodman 			/*
11434a5b18ccSLarry Woodman 			 * do_migrate_pages() tries to maintain the relative
11444a5b18ccSLarry Woodman 			 * node relationship of the pages established between
11454a5b18ccSLarry Woodman 			 * threads and memory areas.
11464a5b18ccSLarry Woodman                          *
11474a5b18ccSLarry Woodman 			 * However if the number of source nodes is not equal to
11484a5b18ccSLarry Woodman 			 * the number of destination nodes we can not preserve
11494a5b18ccSLarry Woodman 			 * this node relative relationship.  In that case, skip
11504a5b18ccSLarry Woodman 			 * copying memory from a node that is in the destination
11514a5b18ccSLarry Woodman 			 * mask.
11524a5b18ccSLarry Woodman 			 *
11534a5b18ccSLarry Woodman 			 * Example: [2,3,4] -> [3,4,5] moves everything.
11544a5b18ccSLarry Woodman 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
11554a5b18ccSLarry Woodman 			 */
11564a5b18ccSLarry Woodman 
11570ce72d4fSAndrew Morton 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
11580ce72d4fSAndrew Morton 						(node_isset(s, *to)))
11594a5b18ccSLarry Woodman 				continue;
11604a5b18ccSLarry Woodman 
11610ce72d4fSAndrew Morton 			d = node_remap(s, *from, *to);
11627e2ab150SChristoph Lameter 			if (s == d)
11637e2ab150SChristoph Lameter 				continue;
11647e2ab150SChristoph Lameter 
11657e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
11667e2ab150SChristoph Lameter 			dest = d;
11677e2ab150SChristoph Lameter 
11687e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
11697e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
11707e2ab150SChristoph Lameter 				break;
11717e2ab150SChristoph Lameter 		}
1172b76ac7e7SJianguo Wu 		if (source == NUMA_NO_NODE)
11737e2ab150SChristoph Lameter 			break;
11747e2ab150SChristoph Lameter 
11757e2ab150SChristoph Lameter 		node_clear(source, tmp);
11767e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
11777e2ab150SChristoph Lameter 		if (err > 0)
11787e2ab150SChristoph Lameter 			busy += err;
11797e2ab150SChristoph Lameter 		if (err < 0)
11807e2ab150SChristoph Lameter 			break;
118139743889SChristoph Lameter 	}
1182d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
1183d479960eSMinchan Kim 
1184361a2a22SMinchan Kim 	lru_cache_enable();
11857e2ab150SChristoph Lameter 	if (err < 0)
11867e2ab150SChristoph Lameter 		return err;
11877e2ab150SChristoph Lameter 	return busy;
1188b20a3503SChristoph Lameter 
118939743889SChristoph Lameter }
119039743889SChristoph Lameter 
11913ad33b24SLee Schermerhorn /*
11923ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
1193d05f0cdcSHugh Dickins  * Start by assuming the page is mapped by the same vma as contains @start.
11943ad33b24SLee Schermerhorn  * Search forward from there, if not.  N.B., this assumes that the
11953ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
11963ad33b24SLee Schermerhorn  * is in virtual address order.
11973ad33b24SLee Schermerhorn  */
1198666feb21SMichal Hocko static struct page *new_page(struct page *page, unsigned long start)
119995a402c3SChristoph Lameter {
1200d05f0cdcSHugh Dickins 	struct vm_area_struct *vma;
12013f649ab7SKees Cook 	unsigned long address;
120295a402c3SChristoph Lameter 
1203d05f0cdcSHugh Dickins 	vma = find_vma(current->mm, start);
12043ad33b24SLee Schermerhorn 	while (vma) {
12053ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
12063ad33b24SLee Schermerhorn 		if (address != -EFAULT)
12073ad33b24SLee Schermerhorn 			break;
12083ad33b24SLee Schermerhorn 		vma = vma->vm_next;
12093ad33b24SLee Schermerhorn 	}
12103ad33b24SLee Schermerhorn 
121111c731e8SWanpeng Li 	if (PageHuge(page)) {
1212389c8178SMichal Hocko 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1213389c8178SMichal Hocko 				vma, address);
121494723aafSMichal Hocko 	} else if (PageTransHuge(page)) {
1215c8633798SNaoya Horiguchi 		struct page *thp;
1216c8633798SNaoya Horiguchi 
121719deb769SDavid Rientjes 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
121819deb769SDavid Rientjes 					 HPAGE_PMD_ORDER);
1219c8633798SNaoya Horiguchi 		if (!thp)
1220c8633798SNaoya Horiguchi 			return NULL;
1221c8633798SNaoya Horiguchi 		prep_transhuge_page(thp);
1222c8633798SNaoya Horiguchi 		return thp;
122311c731e8SWanpeng Li 	}
122411c731e8SWanpeng Li 	/*
122511c731e8SWanpeng Li 	 * if !vma, alloc_page_vma() will use task or system default policy
122611c731e8SWanpeng Li 	 */
12270f556856SMichal Hocko 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
12280f556856SMichal Hocko 			vma, address);
122995a402c3SChristoph Lameter }
1230b20a3503SChristoph Lameter #else
1231b20a3503SChristoph Lameter 
1232a53190a4SYang Shi static int migrate_page_add(struct page *page, struct list_head *pagelist,
1233b20a3503SChristoph Lameter 				unsigned long flags)
1234b20a3503SChristoph Lameter {
1235a53190a4SYang Shi 	return -EIO;
1236b20a3503SChristoph Lameter }
1237b20a3503SChristoph Lameter 
12380ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
12390ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
1240b20a3503SChristoph Lameter {
1241b20a3503SChristoph Lameter 	return -ENOSYS;
1242b20a3503SChristoph Lameter }
124395a402c3SChristoph Lameter 
1244666feb21SMichal Hocko static struct page *new_page(struct page *page, unsigned long start)
124595a402c3SChristoph Lameter {
124695a402c3SChristoph Lameter 	return NULL;
124795a402c3SChristoph Lameter }
1248b20a3503SChristoph Lameter #endif
1249b20a3503SChristoph Lameter 
1250dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
1251028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
1252028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
12536ce3c4c0SChristoph Lameter {
12546ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
12556ce3c4c0SChristoph Lameter 	struct mempolicy *new;
12566ce3c4c0SChristoph Lameter 	unsigned long end;
12576ce3c4c0SChristoph Lameter 	int err;
1258d8835445SYang Shi 	int ret;
12596ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
12606ce3c4c0SChristoph Lameter 
1261b24f53a0SLee Schermerhorn 	if (flags & ~(unsigned long)MPOL_MF_VALID)
12626ce3c4c0SChristoph Lameter 		return -EINVAL;
126374c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
12646ce3c4c0SChristoph Lameter 		return -EPERM;
12656ce3c4c0SChristoph Lameter 
12666ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
12676ce3c4c0SChristoph Lameter 		return -EINVAL;
12686ce3c4c0SChristoph Lameter 
12696ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
12706ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
12716ce3c4c0SChristoph Lameter 
12726ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
12736ce3c4c0SChristoph Lameter 	end = start + len;
12746ce3c4c0SChristoph Lameter 
12756ce3c4c0SChristoph Lameter 	if (end < start)
12766ce3c4c0SChristoph Lameter 		return -EINVAL;
12776ce3c4c0SChristoph Lameter 	if (end == start)
12786ce3c4c0SChristoph Lameter 		return 0;
12796ce3c4c0SChristoph Lameter 
1280028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
12816ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
12826ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
12836ce3c4c0SChristoph Lameter 
1284b24f53a0SLee Schermerhorn 	if (flags & MPOL_MF_LAZY)
1285b24f53a0SLee Schermerhorn 		new->flags |= MPOL_F_MOF;
1286b24f53a0SLee Schermerhorn 
12876ce3c4c0SChristoph Lameter 	/*
12886ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operation
12896ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
12906ce3c4c0SChristoph Lameter 	 */
12916ce3c4c0SChristoph Lameter 	if (!new)
12926ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
12936ce3c4c0SChristoph Lameter 
1294028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1295028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
129600ef2d2fSDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
12976ce3c4c0SChristoph Lameter 
12980aedadf9SChristoph Lameter 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
12990aedadf9SChristoph Lameter 
1300361a2a22SMinchan Kim 		lru_cache_disable();
13010aedadf9SChristoph Lameter 	}
13024bfc4495SKAMEZAWA Hiroyuki 	{
13034bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
13044bfc4495SKAMEZAWA Hiroyuki 		if (scratch) {
1305d8ed45c5SMichel Lespinasse 			mmap_write_lock(mm);
13064bfc4495SKAMEZAWA Hiroyuki 			err = mpol_set_nodemask(new, nmask, scratch);
13074bfc4495SKAMEZAWA Hiroyuki 			if (err)
1308d8ed45c5SMichel Lespinasse 				mmap_write_unlock(mm);
13094bfc4495SKAMEZAWA Hiroyuki 		} else
13104bfc4495SKAMEZAWA Hiroyuki 			err = -ENOMEM;
13114bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
13124bfc4495SKAMEZAWA Hiroyuki 	}
1313b05ca738SKOSAKI Motohiro 	if (err)
1314b05ca738SKOSAKI Motohiro 		goto mpol_out;
1315b05ca738SKOSAKI Motohiro 
1316d8835445SYang Shi 	ret = queue_pages_range(mm, start, end, nmask,
13176ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
1318d8835445SYang Shi 
1319d8835445SYang Shi 	if (ret < 0) {
1320a85dfc30SYang Shi 		err = ret;
1321d8835445SYang Shi 		goto up_out;
1322d8835445SYang Shi 	}
1323d8835445SYang Shi 
13249d8cebd4SKOSAKI Motohiro 	err = mbind_range(mm, start, end, new);
13257e2ab150SChristoph Lameter 
1326b24f53a0SLee Schermerhorn 	if (!err) {
1327b24f53a0SLee Schermerhorn 		int nr_failed = 0;
1328b24f53a0SLee Schermerhorn 
1329cf608ac1SMinchan Kim 		if (!list_empty(&pagelist)) {
1330b24f53a0SLee Schermerhorn 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1331d05f0cdcSHugh Dickins 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
13325ac95884SYang Shi 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1333cf608ac1SMinchan Kim 			if (nr_failed)
133474060e4dSNaoya Horiguchi 				putback_movable_pages(&pagelist);
1335cf608ac1SMinchan Kim 		}
13366ce3c4c0SChristoph Lameter 
1337d8835445SYang Shi 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
13386ce3c4c0SChristoph Lameter 			err = -EIO;
1339a85dfc30SYang Shi 	} else {
1340d8835445SYang Shi up_out:
1341a85dfc30SYang Shi 		if (!list_empty(&pagelist))
1342a85dfc30SYang Shi 			putback_movable_pages(&pagelist);
1343a85dfc30SYang Shi 	}
1344a85dfc30SYang Shi 
1345d8ed45c5SMichel Lespinasse 	mmap_write_unlock(mm);
1346b05ca738SKOSAKI Motohiro mpol_out:
1347f0be3d32SLee Schermerhorn 	mpol_put(new);
1348d479960eSMinchan Kim 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1349361a2a22SMinchan Kim 		lru_cache_enable();
13506ce3c4c0SChristoph Lameter 	return err;
13516ce3c4c0SChristoph Lameter }
13526ce3c4c0SChristoph Lameter 
135339743889SChristoph Lameter /*
13548bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
13558bccd85fSChristoph Lameter  */
1356e130242dSArnd Bergmann static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1357e130242dSArnd Bergmann 		      unsigned long maxnode)
1358e130242dSArnd Bergmann {
1359e130242dSArnd Bergmann 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1360e130242dSArnd Bergmann 	int ret;
1361e130242dSArnd Bergmann 
1362e130242dSArnd Bergmann 	if (in_compat_syscall())
1363e130242dSArnd Bergmann 		ret = compat_get_bitmap(mask,
1364e130242dSArnd Bergmann 					(const compat_ulong_t __user *)nmask,
1365e130242dSArnd Bergmann 					maxnode);
1366e130242dSArnd Bergmann 	else
1367e130242dSArnd Bergmann 		ret = copy_from_user(mask, nmask,
1368e130242dSArnd Bergmann 				     nlongs * sizeof(unsigned long));
1369e130242dSArnd Bergmann 
1370e130242dSArnd Bergmann 	if (ret)
1371e130242dSArnd Bergmann 		return -EFAULT;
1372e130242dSArnd Bergmann 
1373e130242dSArnd Bergmann 	if (maxnode % BITS_PER_LONG)
1374e130242dSArnd Bergmann 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1375e130242dSArnd Bergmann 
1376e130242dSArnd Bergmann 	return 0;
1377e130242dSArnd Bergmann }
13788bccd85fSChristoph Lameter 
13798bccd85fSChristoph Lameter /* Copy a node mask from user space. */
138039743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
13818bccd85fSChristoph Lameter 		     unsigned long maxnode)
13828bccd85fSChristoph Lameter {
13838bccd85fSChristoph Lameter 	--maxnode;
13848bccd85fSChristoph Lameter 	nodes_clear(*nodes);
13858bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
13868bccd85fSChristoph Lameter 		return 0;
1387a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1388636f13c1SChris Wright 		return -EINVAL;
13898bccd85fSChristoph Lameter 
139056521e7aSYisheng Xie 	/*
139156521e7aSYisheng Xie 	 * When the user specified more nodes than supported just check
1392e130242dSArnd Bergmann 	 * if the non supported part is all zero, one word at a time,
1393e130242dSArnd Bergmann 	 * starting at the end.
139456521e7aSYisheng Xie 	 */
1395e130242dSArnd Bergmann 	while (maxnode > MAX_NUMNODES) {
1396e130242dSArnd Bergmann 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1397e130242dSArnd Bergmann 		unsigned long t;
13988bccd85fSChristoph Lameter 
1399e130242dSArnd Bergmann 		if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
140056521e7aSYisheng Xie 			return -EFAULT;
1401e130242dSArnd Bergmann 
1402e130242dSArnd Bergmann 		if (maxnode - bits >= MAX_NUMNODES) {
1403e130242dSArnd Bergmann 			maxnode -= bits;
1404e130242dSArnd Bergmann 		} else {
1405e130242dSArnd Bergmann 			maxnode = MAX_NUMNODES;
1406e130242dSArnd Bergmann 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1407e130242dSArnd Bergmann 		}
1408e130242dSArnd Bergmann 		if (t)
140956521e7aSYisheng Xie 			return -EINVAL;
141056521e7aSYisheng Xie 	}
141156521e7aSYisheng Xie 
1412e130242dSArnd Bergmann 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
14138bccd85fSChristoph Lameter }
14148bccd85fSChristoph Lameter 
14158bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
14168bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
14178bccd85fSChristoph Lameter 			      nodemask_t *nodes)
14188bccd85fSChristoph Lameter {
14198bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1420050c17f2SRalph Campbell 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1421e130242dSArnd Bergmann 	bool compat = in_compat_syscall();
1422e130242dSArnd Bergmann 
1423e130242dSArnd Bergmann 	if (compat)
1424e130242dSArnd Bergmann 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
14258bccd85fSChristoph Lameter 
14268bccd85fSChristoph Lameter 	if (copy > nbytes) {
14278bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
14288bccd85fSChristoph Lameter 			return -EINVAL;
14298bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
14308bccd85fSChristoph Lameter 			return -EFAULT;
14318bccd85fSChristoph Lameter 		copy = nbytes;
1432e130242dSArnd Bergmann 		maxnode = nr_node_ids;
14338bccd85fSChristoph Lameter 	}
1434e130242dSArnd Bergmann 
1435e130242dSArnd Bergmann 	if (compat)
1436e130242dSArnd Bergmann 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1437e130242dSArnd Bergmann 					 nodes_addr(*nodes), maxnode);
1438e130242dSArnd Bergmann 
14398bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
14408bccd85fSChristoph Lameter }
14418bccd85fSChristoph Lameter 
144295837924SFeng Tang /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
144395837924SFeng Tang static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
144495837924SFeng Tang {
144595837924SFeng Tang 	*flags = *mode & MPOL_MODE_FLAGS;
144695837924SFeng Tang 	*mode &= ~MPOL_MODE_FLAGS;
1447b27abaccSDave Hansen 
1448a38a59fdSBen Widawsky 	if ((unsigned int)(*mode) >=  MPOL_MAX)
144995837924SFeng Tang 		return -EINVAL;
145095837924SFeng Tang 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
145195837924SFeng Tang 		return -EINVAL;
14526d2aec9eSEric Dumazet 	if (*flags & MPOL_F_NUMA_BALANCING) {
14536d2aec9eSEric Dumazet 		if (*mode != MPOL_BIND)
14546d2aec9eSEric Dumazet 			return -EINVAL;
14556d2aec9eSEric Dumazet 		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
14566d2aec9eSEric Dumazet 	}
145795837924SFeng Tang 	return 0;
145895837924SFeng Tang }
145995837924SFeng Tang 
1460e7dc9ad6SDominik Brodowski static long kernel_mbind(unsigned long start, unsigned long len,
1461e7dc9ad6SDominik Brodowski 			 unsigned long mode, const unsigned long __user *nmask,
1462e7dc9ad6SDominik Brodowski 			 unsigned long maxnode, unsigned int flags)
14638bccd85fSChristoph Lameter {
1464028fec41SDavid Rientjes 	unsigned short mode_flags;
146595837924SFeng Tang 	nodemask_t nodes;
146695837924SFeng Tang 	int lmode = mode;
146795837924SFeng Tang 	int err;
14688bccd85fSChristoph Lameter 
1469057d3389SAndrey Konovalov 	start = untagged_addr(start);
147095837924SFeng Tang 	err = sanitize_mpol_flags(&lmode, &mode_flags);
147195837924SFeng Tang 	if (err)
147295837924SFeng Tang 		return err;
147395837924SFeng Tang 
14748bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
14758bccd85fSChristoph Lameter 	if (err)
14768bccd85fSChristoph Lameter 		return err;
147795837924SFeng Tang 
147895837924SFeng Tang 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
14798bccd85fSChristoph Lameter }
14808bccd85fSChristoph Lameter 
1481e7dc9ad6SDominik Brodowski SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1482e7dc9ad6SDominik Brodowski 		unsigned long, mode, const unsigned long __user *, nmask,
1483e7dc9ad6SDominik Brodowski 		unsigned long, maxnode, unsigned int, flags)
1484e7dc9ad6SDominik Brodowski {
1485e7dc9ad6SDominik Brodowski 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1486e7dc9ad6SDominik Brodowski }
1487e7dc9ad6SDominik Brodowski 
14888bccd85fSChristoph Lameter /* Set the process memory policy */
1489af03c4acSDominik Brodowski static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1490af03c4acSDominik Brodowski 				 unsigned long maxnode)
14918bccd85fSChristoph Lameter {
149295837924SFeng Tang 	unsigned short mode_flags;
14938bccd85fSChristoph Lameter 	nodemask_t nodes;
149495837924SFeng Tang 	int lmode = mode;
149595837924SFeng Tang 	int err;
14968bccd85fSChristoph Lameter 
149795837924SFeng Tang 	err = sanitize_mpol_flags(&lmode, &mode_flags);
149895837924SFeng Tang 	if (err)
149995837924SFeng Tang 		return err;
150095837924SFeng Tang 
15018bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
15028bccd85fSChristoph Lameter 	if (err)
15038bccd85fSChristoph Lameter 		return err;
150495837924SFeng Tang 
150595837924SFeng Tang 	return do_set_mempolicy(lmode, mode_flags, &nodes);
15068bccd85fSChristoph Lameter }
15078bccd85fSChristoph Lameter 
1508af03c4acSDominik Brodowski SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1509af03c4acSDominik Brodowski 		unsigned long, maxnode)
1510af03c4acSDominik Brodowski {
1511af03c4acSDominik Brodowski 	return kernel_set_mempolicy(mode, nmask, maxnode);
1512af03c4acSDominik Brodowski }
1513af03c4acSDominik Brodowski 
1514b6e9b0baSDominik Brodowski static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1515b6e9b0baSDominik Brodowski 				const unsigned long __user *old_nodes,
1516b6e9b0baSDominik Brodowski 				const unsigned long __user *new_nodes)
151739743889SChristoph Lameter {
1518596d7cfaSKOSAKI Motohiro 	struct mm_struct *mm = NULL;
151939743889SChristoph Lameter 	struct task_struct *task;
152039743889SChristoph Lameter 	nodemask_t task_nodes;
152139743889SChristoph Lameter 	int err;
1522596d7cfaSKOSAKI Motohiro 	nodemask_t *old;
1523596d7cfaSKOSAKI Motohiro 	nodemask_t *new;
1524596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH(scratch);
152539743889SChristoph Lameter 
1526596d7cfaSKOSAKI Motohiro 	if (!scratch)
1527596d7cfaSKOSAKI Motohiro 		return -ENOMEM;
152839743889SChristoph Lameter 
1529596d7cfaSKOSAKI Motohiro 	old = &scratch->mask1;
1530596d7cfaSKOSAKI Motohiro 	new = &scratch->mask2;
1531596d7cfaSKOSAKI Motohiro 
1532596d7cfaSKOSAKI Motohiro 	err = get_nodes(old, old_nodes, maxnode);
153339743889SChristoph Lameter 	if (err)
1534596d7cfaSKOSAKI Motohiro 		goto out;
1535596d7cfaSKOSAKI Motohiro 
1536596d7cfaSKOSAKI Motohiro 	err = get_nodes(new, new_nodes, maxnode);
1537596d7cfaSKOSAKI Motohiro 	if (err)
1538596d7cfaSKOSAKI Motohiro 		goto out;
153939743889SChristoph Lameter 
154039743889SChristoph Lameter 	/* Find the mm_struct */
154155cfaa3cSZeng Zhaoming 	rcu_read_lock();
1542228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
154339743889SChristoph Lameter 	if (!task) {
154455cfaa3cSZeng Zhaoming 		rcu_read_unlock();
1545596d7cfaSKOSAKI Motohiro 		err = -ESRCH;
1546596d7cfaSKOSAKI Motohiro 		goto out;
154739743889SChristoph Lameter 	}
15483268c63eSChristoph Lameter 	get_task_struct(task);
154939743889SChristoph Lameter 
1550596d7cfaSKOSAKI Motohiro 	err = -EINVAL;
155139743889SChristoph Lameter 
155239743889SChristoph Lameter 	/*
155331367466SOtto Ebeling 	 * Check if this process has the right to modify the specified process.
155431367466SOtto Ebeling 	 * Use the regular "ptrace_may_access()" checks.
155539743889SChristoph Lameter 	 */
155631367466SOtto Ebeling 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1557c69e8d9cSDavid Howells 		rcu_read_unlock();
155839743889SChristoph Lameter 		err = -EPERM;
15593268c63eSChristoph Lameter 		goto out_put;
156039743889SChristoph Lameter 	}
1561c69e8d9cSDavid Howells 	rcu_read_unlock();
156239743889SChristoph Lameter 
156339743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
156439743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
1565596d7cfaSKOSAKI Motohiro 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
156639743889SChristoph Lameter 		err = -EPERM;
15673268c63eSChristoph Lameter 		goto out_put;
156839743889SChristoph Lameter 	}
156939743889SChristoph Lameter 
15700486a38bSYisheng Xie 	task_nodes = cpuset_mems_allowed(current);
15710486a38bSYisheng Xie 	nodes_and(*new, *new, task_nodes);
15720486a38bSYisheng Xie 	if (nodes_empty(*new))
15733268c63eSChristoph Lameter 		goto out_put;
15740486a38bSYisheng Xie 
157586c3a764SDavid Quigley 	err = security_task_movememory(task);
157686c3a764SDavid Quigley 	if (err)
15773268c63eSChristoph Lameter 		goto out_put;
157886c3a764SDavid Quigley 
15793268c63eSChristoph Lameter 	mm = get_task_mm(task);
15803268c63eSChristoph Lameter 	put_task_struct(task);
1581f2a9ef88SSasha Levin 
1582f2a9ef88SSasha Levin 	if (!mm) {
1583f2a9ef88SSasha Levin 		err = -EINVAL;
1584f2a9ef88SSasha Levin 		goto out;
1585f2a9ef88SSasha Levin 	}
1586f2a9ef88SSasha Levin 
1587596d7cfaSKOSAKI Motohiro 	err = do_migrate_pages(mm, old, new,
158874c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
15893268c63eSChristoph Lameter 
159039743889SChristoph Lameter 	mmput(mm);
15913268c63eSChristoph Lameter out:
1592596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH_FREE(scratch);
1593596d7cfaSKOSAKI Motohiro 
159439743889SChristoph Lameter 	return err;
15953268c63eSChristoph Lameter 
15963268c63eSChristoph Lameter out_put:
15973268c63eSChristoph Lameter 	put_task_struct(task);
15983268c63eSChristoph Lameter 	goto out;
15993268c63eSChristoph Lameter 
160039743889SChristoph Lameter }
160139743889SChristoph Lameter 
1602b6e9b0baSDominik Brodowski SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1603b6e9b0baSDominik Brodowski 		const unsigned long __user *, old_nodes,
1604b6e9b0baSDominik Brodowski 		const unsigned long __user *, new_nodes)
1605b6e9b0baSDominik Brodowski {
1606b6e9b0baSDominik Brodowski 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1607b6e9b0baSDominik Brodowski }
1608b6e9b0baSDominik Brodowski 
160939743889SChristoph Lameter 
16108bccd85fSChristoph Lameter /* Retrieve NUMA policy */
1611af03c4acSDominik Brodowski static int kernel_get_mempolicy(int __user *policy,
1612af03c4acSDominik Brodowski 				unsigned long __user *nmask,
1613af03c4acSDominik Brodowski 				unsigned long maxnode,
1614af03c4acSDominik Brodowski 				unsigned long addr,
1615af03c4acSDominik Brodowski 				unsigned long flags)
16168bccd85fSChristoph Lameter {
1617dbcb0f19SAdrian Bunk 	int err;
16183f649ab7SKees Cook 	int pval;
16198bccd85fSChristoph Lameter 	nodemask_t nodes;
16208bccd85fSChristoph Lameter 
1621050c17f2SRalph Campbell 	if (nmask != NULL && maxnode < nr_node_ids)
16228bccd85fSChristoph Lameter 		return -EINVAL;
16238bccd85fSChristoph Lameter 
16244605f057SWenchao Hao 	addr = untagged_addr(addr);
16254605f057SWenchao Hao 
16268bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
16278bccd85fSChristoph Lameter 
16288bccd85fSChristoph Lameter 	if (err)
16298bccd85fSChristoph Lameter 		return err;
16308bccd85fSChristoph Lameter 
16318bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
16328bccd85fSChristoph Lameter 		return -EFAULT;
16338bccd85fSChristoph Lameter 
16348bccd85fSChristoph Lameter 	if (nmask)
16358bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
16368bccd85fSChristoph Lameter 
16378bccd85fSChristoph Lameter 	return err;
16388bccd85fSChristoph Lameter }
16398bccd85fSChristoph Lameter 
1640af03c4acSDominik Brodowski SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1641af03c4acSDominik Brodowski 		unsigned long __user *, nmask, unsigned long, maxnode,
1642af03c4acSDominik Brodowski 		unsigned long, addr, unsigned long, flags)
1643af03c4acSDominik Brodowski {
1644af03c4acSDominik Brodowski 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1645af03c4acSDominik Brodowski }
1646af03c4acSDominik Brodowski 
164720ca87f2SLi Xinhai bool vma_migratable(struct vm_area_struct *vma)
164820ca87f2SLi Xinhai {
164920ca87f2SLi Xinhai 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
165020ca87f2SLi Xinhai 		return false;
165120ca87f2SLi Xinhai 
165220ca87f2SLi Xinhai 	/*
165320ca87f2SLi Xinhai 	 * DAX device mappings require predictable access latency, so avoid
165420ca87f2SLi Xinhai 	 * incurring periodic faults.
165520ca87f2SLi Xinhai 	 */
165620ca87f2SLi Xinhai 	if (vma_is_dax(vma))
165720ca87f2SLi Xinhai 		return false;
165820ca87f2SLi Xinhai 
165920ca87f2SLi Xinhai 	if (is_vm_hugetlb_page(vma) &&
166020ca87f2SLi Xinhai 		!hugepage_migration_supported(hstate_vma(vma)))
166120ca87f2SLi Xinhai 		return false;
166220ca87f2SLi Xinhai 
166320ca87f2SLi Xinhai 	/*
166420ca87f2SLi Xinhai 	 * Migration allocates pages in the highest zone. If we cannot
166520ca87f2SLi Xinhai 	 * do so then migration (at least from node to node) is not
166620ca87f2SLi Xinhai 	 * possible.
166720ca87f2SLi Xinhai 	 */
166820ca87f2SLi Xinhai 	if (vma->vm_file &&
166920ca87f2SLi Xinhai 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
167020ca87f2SLi Xinhai 			< policy_zone)
167120ca87f2SLi Xinhai 		return false;
167220ca87f2SLi Xinhai 	return true;
167320ca87f2SLi Xinhai }
167420ca87f2SLi Xinhai 
167574d2c3a0SOleg Nesterov struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
167674d2c3a0SOleg Nesterov 						unsigned long addr)
16771da177e4SLinus Torvalds {
16788d90274bSOleg Nesterov 	struct mempolicy *pol = NULL;
16791da177e4SLinus Torvalds 
16801da177e4SLinus Torvalds 	if (vma) {
1681480eccf9SLee Schermerhorn 		if (vma->vm_ops && vma->vm_ops->get_policy) {
16828d90274bSOleg Nesterov 			pol = vma->vm_ops->get_policy(vma, addr);
168300442ad0SMel Gorman 		} else if (vma->vm_policy) {
16841da177e4SLinus Torvalds 			pol = vma->vm_policy;
168500442ad0SMel Gorman 
168600442ad0SMel Gorman 			/*
168700442ad0SMel Gorman 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
168800442ad0SMel Gorman 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
168900442ad0SMel Gorman 			 * count on these policies which will be dropped by
169000442ad0SMel Gorman 			 * mpol_cond_put() later
169100442ad0SMel Gorman 			 */
169200442ad0SMel Gorman 			if (mpol_needs_cond_ref(pol))
169300442ad0SMel Gorman 				mpol_get(pol);
169400442ad0SMel Gorman 		}
16951da177e4SLinus Torvalds 	}
1696f15ca78eSOleg Nesterov 
169774d2c3a0SOleg Nesterov 	return pol;
169874d2c3a0SOleg Nesterov }
169974d2c3a0SOleg Nesterov 
170074d2c3a0SOleg Nesterov /*
1701dd6eecb9SOleg Nesterov  * get_vma_policy(@vma, @addr)
170274d2c3a0SOleg Nesterov  * @vma: virtual memory area whose policy is sought
170374d2c3a0SOleg Nesterov  * @addr: address in @vma for shared policy lookup
170474d2c3a0SOleg Nesterov  *
170574d2c3a0SOleg Nesterov  * Returns effective policy for a VMA at specified address.
1706dd6eecb9SOleg Nesterov  * Falls back to current->mempolicy or system default policy, as necessary.
170774d2c3a0SOleg Nesterov  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
170874d2c3a0SOleg Nesterov  * count--added by the get_policy() vm_op, as appropriate--to protect against
170974d2c3a0SOleg Nesterov  * freeing by another task.  It is the caller's responsibility to free the
171074d2c3a0SOleg Nesterov  * extra reference for shared policies.
171174d2c3a0SOleg Nesterov  */
1712ac79f78dSDavid Rientjes static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1713dd6eecb9SOleg Nesterov 						unsigned long addr)
171474d2c3a0SOleg Nesterov {
171574d2c3a0SOleg Nesterov 	struct mempolicy *pol = __get_vma_policy(vma, addr);
171674d2c3a0SOleg Nesterov 
17178d90274bSOleg Nesterov 	if (!pol)
1718dd6eecb9SOleg Nesterov 		pol = get_task_policy(current);
17198d90274bSOleg Nesterov 
17201da177e4SLinus Torvalds 	return pol;
17211da177e4SLinus Torvalds }
17221da177e4SLinus Torvalds 
17236b6482bbSOleg Nesterov bool vma_policy_mof(struct vm_area_struct *vma)
1724fc314724SMel Gorman {
17256b6482bbSOleg Nesterov 	struct mempolicy *pol;
1726f15ca78eSOleg Nesterov 
1727fc314724SMel Gorman 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1728fc314724SMel Gorman 		bool ret = false;
1729fc314724SMel Gorman 
1730fc314724SMel Gorman 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1731fc314724SMel Gorman 		if (pol && (pol->flags & MPOL_F_MOF))
1732fc314724SMel Gorman 			ret = true;
1733fc314724SMel Gorman 		mpol_cond_put(pol);
1734fc314724SMel Gorman 
1735fc314724SMel Gorman 		return ret;
17368d90274bSOleg Nesterov 	}
17378d90274bSOleg Nesterov 
1738fc314724SMel Gorman 	pol = vma->vm_policy;
17398d90274bSOleg Nesterov 	if (!pol)
17406b6482bbSOleg Nesterov 		pol = get_task_policy(current);
1741fc314724SMel Gorman 
1742fc314724SMel Gorman 	return pol->flags & MPOL_F_MOF;
1743fc314724SMel Gorman }
1744fc314724SMel Gorman 
1745d3eb1570SLai Jiangshan static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1746d3eb1570SLai Jiangshan {
1747d3eb1570SLai Jiangshan 	enum zone_type dynamic_policy_zone = policy_zone;
1748d3eb1570SLai Jiangshan 
1749d3eb1570SLai Jiangshan 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1750d3eb1570SLai Jiangshan 
1751d3eb1570SLai Jiangshan 	/*
1752269fbe72SBen Widawsky 	 * if policy->nodes has movable memory only,
1753d3eb1570SLai Jiangshan 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1754d3eb1570SLai Jiangshan 	 *
1755269fbe72SBen Widawsky 	 * policy->nodes is intersect with node_states[N_MEMORY].
1756f0953a1bSIngo Molnar 	 * so if the following test fails, it implies
1757269fbe72SBen Widawsky 	 * policy->nodes has movable memory only.
1758d3eb1570SLai Jiangshan 	 */
1759269fbe72SBen Widawsky 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1760d3eb1570SLai Jiangshan 		dynamic_policy_zone = ZONE_MOVABLE;
1761d3eb1570SLai Jiangshan 
1762d3eb1570SLai Jiangshan 	return zone >= dynamic_policy_zone;
1763d3eb1570SLai Jiangshan }
1764d3eb1570SLai Jiangshan 
176552cd3b07SLee Schermerhorn /*
176652cd3b07SLee Schermerhorn  * Return a nodemask representing a mempolicy for filtering nodes for
176752cd3b07SLee Schermerhorn  * page allocation
176852cd3b07SLee Schermerhorn  */
17698ca39e68SMuchun Song nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
177019770b32SMel Gorman {
1771b27abaccSDave Hansen 	int mode = policy->mode;
1772b27abaccSDave Hansen 
177319770b32SMel Gorman 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1774b27abaccSDave Hansen 	if (unlikely(mode == MPOL_BIND) &&
1775d3eb1570SLai Jiangshan 		apply_policy_zone(policy, gfp_zone(gfp)) &&
1776269fbe72SBen Widawsky 		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1777269fbe72SBen Widawsky 		return &policy->nodes;
177819770b32SMel Gorman 
1779b27abaccSDave Hansen 	if (mode == MPOL_PREFERRED_MANY)
1780b27abaccSDave Hansen 		return &policy->nodes;
1781b27abaccSDave Hansen 
178219770b32SMel Gorman 	return NULL;
178319770b32SMel Gorman }
178419770b32SMel Gorman 
1785b27abaccSDave Hansen /*
1786b27abaccSDave Hansen  * Return the  preferred node id for 'prefer' mempolicy, and return
1787b27abaccSDave Hansen  * the given id for all other policies.
1788b27abaccSDave Hansen  *
1789b27abaccSDave Hansen  * policy_node() is always coupled with policy_nodemask(), which
1790b27abaccSDave Hansen  * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1791b27abaccSDave Hansen  */
1792f8fd5253SWei Yang static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
17931da177e4SLinus Torvalds {
17947858d7bcSFeng Tang 	if (policy->mode == MPOL_PREFERRED) {
1795269fbe72SBen Widawsky 		nd = first_node(policy->nodes);
17967858d7bcSFeng Tang 	} else {
179719770b32SMel Gorman 		/*
17986d840958SMichal Hocko 		 * __GFP_THISNODE shouldn't even be used with the bind policy
17996d840958SMichal Hocko 		 * because we might easily break the expectation to stay on the
18006d840958SMichal Hocko 		 * requested node and not break the policy.
180119770b32SMel Gorman 		 */
18026d840958SMichal Hocko 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
18031da177e4SLinus Torvalds 	}
18046d840958SMichal Hocko 
180504ec6264SVlastimil Babka 	return nd;
18061da177e4SLinus Torvalds }
18071da177e4SLinus Torvalds 
18081da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
18091da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
18101da177e4SLinus Torvalds {
181145816682SVlastimil Babka 	unsigned next;
18121da177e4SLinus Torvalds 	struct task_struct *me = current;
18131da177e4SLinus Torvalds 
1814269fbe72SBen Widawsky 	next = next_node_in(me->il_prev, policy->nodes);
1815f5b087b5SDavid Rientjes 	if (next < MAX_NUMNODES)
181645816682SVlastimil Babka 		me->il_prev = next;
181745816682SVlastimil Babka 	return next;
18181da177e4SLinus Torvalds }
18191da177e4SLinus Torvalds 
1820dc85da15SChristoph Lameter /*
1821dc85da15SChristoph Lameter  * Depending on the memory policy provide a node from which to allocate the
1822dc85da15SChristoph Lameter  * next slab entry.
1823dc85da15SChristoph Lameter  */
18242a389610SDavid Rientjes unsigned int mempolicy_slab_node(void)
1825dc85da15SChristoph Lameter {
1826e7b691b0SAndi Kleen 	struct mempolicy *policy;
18272a389610SDavid Rientjes 	int node = numa_mem_id();
1828e7b691b0SAndi Kleen 
182938b031ddSVasily Averin 	if (!in_task())
18302a389610SDavid Rientjes 		return node;
1831e7b691b0SAndi Kleen 
1832e7b691b0SAndi Kleen 	policy = current->mempolicy;
18337858d7bcSFeng Tang 	if (!policy)
18342a389610SDavid Rientjes 		return node;
1835765c4507SChristoph Lameter 
1836bea904d5SLee Schermerhorn 	switch (policy->mode) {
1837bea904d5SLee Schermerhorn 	case MPOL_PREFERRED:
1838269fbe72SBen Widawsky 		return first_node(policy->nodes);
1839bea904d5SLee Schermerhorn 
1840dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1841dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1842dc85da15SChristoph Lameter 
1843b27abaccSDave Hansen 	case MPOL_BIND:
1844b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
1845b27abaccSDave Hansen 	{
1846c33d6c06SMel Gorman 		struct zoneref *z;
1847c33d6c06SMel Gorman 
1848dc85da15SChristoph Lameter 		/*
1849dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1850dc85da15SChristoph Lameter 		 * first node.
1851dc85da15SChristoph Lameter 		 */
185219770b32SMel Gorman 		struct zonelist *zonelist;
185319770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1854c9634cf0SAneesh Kumar K.V 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1855c33d6c06SMel Gorman 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1856269fbe72SBen Widawsky 							&policy->nodes);
1857c1093b74SPavel Tatashin 		return z->zone ? zone_to_nid(z->zone) : node;
1858dd1a239fSMel Gorman 	}
18597858d7bcSFeng Tang 	case MPOL_LOCAL:
18607858d7bcSFeng Tang 		return node;
1861dc85da15SChristoph Lameter 
1862dc85da15SChristoph Lameter 	default:
1863bea904d5SLee Schermerhorn 		BUG();
1864dc85da15SChristoph Lameter 	}
1865dc85da15SChristoph Lameter }
1866dc85da15SChristoph Lameter 
1867fee83b3aSAndrew Morton /*
1868fee83b3aSAndrew Morton  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1869269fbe72SBen Widawsky  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1870fee83b3aSAndrew Morton  * number of present nodes.
1871fee83b3aSAndrew Morton  */
187298c70baaSLaurent Dufour static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
18731da177e4SLinus Torvalds {
1874276aeee1Syanghui 	nodemask_t nodemask = pol->nodes;
1875276aeee1Syanghui 	unsigned int target, nnodes;
1876fee83b3aSAndrew Morton 	int i;
1877fee83b3aSAndrew Morton 	int nid;
1878276aeee1Syanghui 	/*
1879276aeee1Syanghui 	 * The barrier will stabilize the nodemask in a register or on
1880276aeee1Syanghui 	 * the stack so that it will stop changing under the code.
1881276aeee1Syanghui 	 *
1882276aeee1Syanghui 	 * Between first_node() and next_node(), pol->nodes could be changed
1883276aeee1Syanghui 	 * by other threads. So we put pol->nodes in a local stack.
1884276aeee1Syanghui 	 */
1885276aeee1Syanghui 	barrier();
18861da177e4SLinus Torvalds 
1887276aeee1Syanghui 	nnodes = nodes_weight(nodemask);
1888f5b087b5SDavid Rientjes 	if (!nnodes)
1889f5b087b5SDavid Rientjes 		return numa_node_id();
1890fee83b3aSAndrew Morton 	target = (unsigned int)n % nnodes;
1891276aeee1Syanghui 	nid = first_node(nodemask);
1892fee83b3aSAndrew Morton 	for (i = 0; i < target; i++)
1893276aeee1Syanghui 		nid = next_node(nid, nodemask);
18941da177e4SLinus Torvalds 	return nid;
18951da177e4SLinus Torvalds }
18961da177e4SLinus Torvalds 
18975da7ca86SChristoph Lameter /* Determine a node number for interleave */
18985da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
18995da7ca86SChristoph Lameter 		 struct vm_area_struct *vma, unsigned long addr, int shift)
19005da7ca86SChristoph Lameter {
19015da7ca86SChristoph Lameter 	if (vma) {
19025da7ca86SChristoph Lameter 		unsigned long off;
19035da7ca86SChristoph Lameter 
19043b98b087SNishanth Aravamudan 		/*
19053b98b087SNishanth Aravamudan 		 * for small pages, there is no difference between
19063b98b087SNishanth Aravamudan 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
19073b98b087SNishanth Aravamudan 		 * for huge pages, since vm_pgoff is in units of small
19083b98b087SNishanth Aravamudan 		 * pages, we need to shift off the always 0 bits to get
19093b98b087SNishanth Aravamudan 		 * a useful offset.
19103b98b087SNishanth Aravamudan 		 */
19113b98b087SNishanth Aravamudan 		BUG_ON(shift < PAGE_SHIFT);
19123b98b087SNishanth Aravamudan 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
19135da7ca86SChristoph Lameter 		off += (addr - vma->vm_start) >> shift;
191498c70baaSLaurent Dufour 		return offset_il_node(pol, off);
19155da7ca86SChristoph Lameter 	} else
19165da7ca86SChristoph Lameter 		return interleave_nodes(pol);
19175da7ca86SChristoph Lameter }
19185da7ca86SChristoph Lameter 
191900ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
1920480eccf9SLee Schermerhorn /*
192104ec6264SVlastimil Babka  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1922b46e14acSFabian Frederick  * @vma: virtual memory area whose policy is sought
1923b46e14acSFabian Frederick  * @addr: address in @vma for shared policy lookup and interleave policy
1924b46e14acSFabian Frederick  * @gfp_flags: for requested zone
1925b46e14acSFabian Frederick  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1926b27abaccSDave Hansen  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
1927480eccf9SLee Schermerhorn  *
192804ec6264SVlastimil Babka  * Returns a nid suitable for a huge page allocation and a pointer
192952cd3b07SLee Schermerhorn  * to the struct mempolicy for conditional unref after allocation.
1930b27abaccSDave Hansen  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1931b27abaccSDave Hansen  * to the mempolicy's @nodemask for filtering the zonelist.
1932c0ff7453SMiao Xie  *
1933d26914d1SMel Gorman  * Must be protected by read_mems_allowed_begin()
1934480eccf9SLee Schermerhorn  */
193504ec6264SVlastimil Babka int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
193604ec6264SVlastimil Babka 				struct mempolicy **mpol, nodemask_t **nodemask)
19375da7ca86SChristoph Lameter {
193804ec6264SVlastimil Babka 	int nid;
1939b27abaccSDave Hansen 	int mode;
19405da7ca86SChristoph Lameter 
1941dd6eecb9SOleg Nesterov 	*mpol = get_vma_policy(vma, addr);
1942b27abaccSDave Hansen 	*nodemask = NULL;
1943b27abaccSDave Hansen 	mode = (*mpol)->mode;
19445da7ca86SChristoph Lameter 
1945b27abaccSDave Hansen 	if (unlikely(mode == MPOL_INTERLEAVE)) {
194604ec6264SVlastimil Babka 		nid = interleave_nid(*mpol, vma, addr,
194704ec6264SVlastimil Babka 					huge_page_shift(hstate_vma(vma)));
194852cd3b07SLee Schermerhorn 	} else {
194904ec6264SVlastimil Babka 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1950b27abaccSDave Hansen 		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
1951269fbe72SBen Widawsky 			*nodemask = &(*mpol)->nodes;
1952480eccf9SLee Schermerhorn 	}
195304ec6264SVlastimil Babka 	return nid;
19545da7ca86SChristoph Lameter }
195506808b08SLee Schermerhorn 
195606808b08SLee Schermerhorn /*
195706808b08SLee Schermerhorn  * init_nodemask_of_mempolicy
195806808b08SLee Schermerhorn  *
195906808b08SLee Schermerhorn  * If the current task's mempolicy is "default" [NULL], return 'false'
196006808b08SLee Schermerhorn  * to indicate default policy.  Otherwise, extract the policy nodemask
196106808b08SLee Schermerhorn  * for 'bind' or 'interleave' policy into the argument nodemask, or
196206808b08SLee Schermerhorn  * initialize the argument nodemask to contain the single node for
196306808b08SLee Schermerhorn  * 'preferred' or 'local' policy and return 'true' to indicate presence
196406808b08SLee Schermerhorn  * of non-default mempolicy.
196506808b08SLee Schermerhorn  *
196606808b08SLee Schermerhorn  * We don't bother with reference counting the mempolicy [mpol_get/put]
196706808b08SLee Schermerhorn  * because the current task is examining it's own mempolicy and a task's
196806808b08SLee Schermerhorn  * mempolicy is only ever changed by the task itself.
196906808b08SLee Schermerhorn  *
197006808b08SLee Schermerhorn  * N.B., it is the caller's responsibility to free a returned nodemask.
197106808b08SLee Schermerhorn  */
197206808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask)
197306808b08SLee Schermerhorn {
197406808b08SLee Schermerhorn 	struct mempolicy *mempolicy;
197506808b08SLee Schermerhorn 
197606808b08SLee Schermerhorn 	if (!(mask && current->mempolicy))
197706808b08SLee Schermerhorn 		return false;
197806808b08SLee Schermerhorn 
1979c0ff7453SMiao Xie 	task_lock(current);
198006808b08SLee Schermerhorn 	mempolicy = current->mempolicy;
198106808b08SLee Schermerhorn 	switch (mempolicy->mode) {
198206808b08SLee Schermerhorn 	case MPOL_PREFERRED:
1983b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
198406808b08SLee Schermerhorn 	case MPOL_BIND:
198506808b08SLee Schermerhorn 	case MPOL_INTERLEAVE:
1986269fbe72SBen Widawsky 		*mask = mempolicy->nodes;
198706808b08SLee Schermerhorn 		break;
198806808b08SLee Schermerhorn 
19897858d7bcSFeng Tang 	case MPOL_LOCAL:
1990269fbe72SBen Widawsky 		init_nodemask_of_node(mask, numa_node_id());
19917858d7bcSFeng Tang 		break;
19927858d7bcSFeng Tang 
199306808b08SLee Schermerhorn 	default:
199406808b08SLee Schermerhorn 		BUG();
199506808b08SLee Schermerhorn 	}
1996c0ff7453SMiao Xie 	task_unlock(current);
199706808b08SLee Schermerhorn 
199806808b08SLee Schermerhorn 	return true;
199906808b08SLee Schermerhorn }
200000ac59adSChen, Kenneth W #endif
20015da7ca86SChristoph Lameter 
20026f48d0ebSDavid Rientjes /*
2003b26e517aSFeng Tang  * mempolicy_in_oom_domain
20046f48d0ebSDavid Rientjes  *
2005b26e517aSFeng Tang  * If tsk's mempolicy is "bind", check for intersection between mask and
2006b26e517aSFeng Tang  * the policy nodemask. Otherwise, return true for all other policies
2007b26e517aSFeng Tang  * including "interleave", as a tsk with "interleave" policy may have
2008b26e517aSFeng Tang  * memory allocated from all nodes in system.
20096f48d0ebSDavid Rientjes  *
20106f48d0ebSDavid Rientjes  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
20116f48d0ebSDavid Rientjes  */
2012b26e517aSFeng Tang bool mempolicy_in_oom_domain(struct task_struct *tsk,
20136f48d0ebSDavid Rientjes 					const nodemask_t *mask)
20146f48d0ebSDavid Rientjes {
20156f48d0ebSDavid Rientjes 	struct mempolicy *mempolicy;
20166f48d0ebSDavid Rientjes 	bool ret = true;
20176f48d0ebSDavid Rientjes 
20186f48d0ebSDavid Rientjes 	if (!mask)
20196f48d0ebSDavid Rientjes 		return ret;
2020b26e517aSFeng Tang 
20216f48d0ebSDavid Rientjes 	task_lock(tsk);
20226f48d0ebSDavid Rientjes 	mempolicy = tsk->mempolicy;
2023b26e517aSFeng Tang 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2024269fbe72SBen Widawsky 		ret = nodes_intersects(mempolicy->nodes, *mask);
20256f48d0ebSDavid Rientjes 	task_unlock(tsk);
2026b26e517aSFeng Tang 
20276f48d0ebSDavid Rientjes 	return ret;
20286f48d0ebSDavid Rientjes }
20296f48d0ebSDavid Rientjes 
20301da177e4SLinus Torvalds /* Allocate a page in interleaved policy.
20311da177e4SLinus Torvalds    Own path because it needs to do special accounting. */
2032662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2033662f3a0bSAndi Kleen 					unsigned nid)
20341da177e4SLinus Torvalds {
20351da177e4SLinus Torvalds 	struct page *page;
20361da177e4SLinus Torvalds 
203784172f4bSMatthew Wilcox (Oracle) 	page = __alloc_pages(gfp, order, nid, NULL);
20384518085eSKemi Wang 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
20394518085eSKemi Wang 	if (!static_branch_likely(&vm_numa_stat_key))
20404518085eSKemi Wang 		return page;
2041de55c8b2SAndrey Ryabinin 	if (page && page_to_nid(page) == nid) {
2042de55c8b2SAndrey Ryabinin 		preempt_disable();
2043f19298b9SMel Gorman 		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2044de55c8b2SAndrey Ryabinin 		preempt_enable();
2045de55c8b2SAndrey Ryabinin 	}
20461da177e4SLinus Torvalds 	return page;
20471da177e4SLinus Torvalds }
20481da177e4SLinus Torvalds 
20494c54d949SFeng Tang static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
20504c54d949SFeng Tang 						int nid, struct mempolicy *pol)
20514c54d949SFeng Tang {
20524c54d949SFeng Tang 	struct page *page;
20534c54d949SFeng Tang 	gfp_t preferred_gfp;
20544c54d949SFeng Tang 
20554c54d949SFeng Tang 	/*
20564c54d949SFeng Tang 	 * This is a two pass approach. The first pass will only try the
20574c54d949SFeng Tang 	 * preferred nodes but skip the direct reclaim and allow the
20584c54d949SFeng Tang 	 * allocation to fail, while the second pass will try all the
20594c54d949SFeng Tang 	 * nodes in system.
20604c54d949SFeng Tang 	 */
20614c54d949SFeng Tang 	preferred_gfp = gfp | __GFP_NOWARN;
20624c54d949SFeng Tang 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
20634c54d949SFeng Tang 	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
20644c54d949SFeng Tang 	if (!page)
20654c54d949SFeng Tang 		page = __alloc_pages(gfp, order, numa_node_id(), NULL);
20664c54d949SFeng Tang 
20674c54d949SFeng Tang 	return page;
20684c54d949SFeng Tang }
20694c54d949SFeng Tang 
20701da177e4SLinus Torvalds /**
20710bbbc0b3SAndrea Arcangeli  * alloc_pages_vma - Allocate a page for a VMA.
2072eb350739SMatthew Wilcox (Oracle)  * @gfp: GFP flags.
20730bbbc0b3SAndrea Arcangeli  * @order: Order of the GFP allocation.
20741da177e4SLinus Torvalds  * @vma: Pointer to VMA or NULL if not available.
2075eb350739SMatthew Wilcox (Oracle)  * @addr: Virtual address of the allocation.  Must be inside @vma.
2076be97a41bSVlastimil Babka  * @node: Which node to prefer for allocation (modulo policy).
2077eb350739SMatthew Wilcox (Oracle)  * @hugepage: For hugepages try only the preferred node if possible.
20781da177e4SLinus Torvalds  *
2079eb350739SMatthew Wilcox (Oracle)  * Allocate a page for a specific address in @vma, using the appropriate
2080eb350739SMatthew Wilcox (Oracle)  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2081eb350739SMatthew Wilcox (Oracle)  * of the mm_struct of the VMA to prevent it from going away.  Should be
2082eb350739SMatthew Wilcox (Oracle)  * used for all allocations for pages that will be mapped into user space.
2083eb350739SMatthew Wilcox (Oracle)  *
2084eb350739SMatthew Wilcox (Oracle)  * Return: The page on success or NULL if allocation fails.
20851da177e4SLinus Torvalds  */
2086eb350739SMatthew Wilcox (Oracle) struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2087*be1a13ebSMichal Hocko 		unsigned long addr, bool hugepage)
20881da177e4SLinus Torvalds {
2089cc9a6c87SMel Gorman 	struct mempolicy *pol;
2090*be1a13ebSMichal Hocko 	int node = numa_node_id();
2091c0ff7453SMiao Xie 	struct page *page;
209204ec6264SVlastimil Babka 	int preferred_nid;
2093be97a41bSVlastimil Babka 	nodemask_t *nmask;
20941da177e4SLinus Torvalds 
2095dd6eecb9SOleg Nesterov 	pol = get_vma_policy(vma, addr);
2096cc9a6c87SMel Gorman 
2097be97a41bSVlastimil Babka 	if (pol->mode == MPOL_INTERLEAVE) {
20981da177e4SLinus Torvalds 		unsigned nid;
20995da7ca86SChristoph Lameter 
21008eac563cSAndi Kleen 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
210152cd3b07SLee Schermerhorn 		mpol_cond_put(pol);
21020bbbc0b3SAndrea Arcangeli 		page = alloc_page_interleave(gfp, order, nid);
2103be97a41bSVlastimil Babka 		goto out;
21041da177e4SLinus Torvalds 	}
21051da177e4SLinus Torvalds 
21064c54d949SFeng Tang 	if (pol->mode == MPOL_PREFERRED_MANY) {
21074c54d949SFeng Tang 		page = alloc_pages_preferred_many(gfp, order, node, pol);
21084c54d949SFeng Tang 		mpol_cond_put(pol);
21094c54d949SFeng Tang 		goto out;
21104c54d949SFeng Tang 	}
21114c54d949SFeng Tang 
211219deb769SDavid Rientjes 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
211319deb769SDavid Rientjes 		int hpage_node = node;
211419deb769SDavid Rientjes 
211519deb769SDavid Rientjes 		/*
211619deb769SDavid Rientjes 		 * For hugepage allocation and non-interleave policy which
211719deb769SDavid Rientjes 		 * allows the current node (or other explicitly preferred
211819deb769SDavid Rientjes 		 * node) we only try to allocate from the current/preferred
211919deb769SDavid Rientjes 		 * node and don't fall back to other nodes, as the cost of
212019deb769SDavid Rientjes 		 * remote accesses would likely offset THP benefits.
212119deb769SDavid Rientjes 		 *
2122b27abaccSDave Hansen 		 * If the policy is interleave or does not allow the current
212319deb769SDavid Rientjes 		 * node in its nodemask, we allocate the standard way.
212419deb769SDavid Rientjes 		 */
21257858d7bcSFeng Tang 		if (pol->mode == MPOL_PREFERRED)
2126269fbe72SBen Widawsky 			hpage_node = first_node(pol->nodes);
212719deb769SDavid Rientjes 
212819deb769SDavid Rientjes 		nmask = policy_nodemask(gfp, pol);
212919deb769SDavid Rientjes 		if (!nmask || node_isset(hpage_node, *nmask)) {
213019deb769SDavid Rientjes 			mpol_cond_put(pol);
2131cc638f32SVlastimil Babka 			/*
2132cc638f32SVlastimil Babka 			 * First, try to allocate THP only on local node, but
2133cc638f32SVlastimil Babka 			 * don't reclaim unnecessarily, just compact.
2134cc638f32SVlastimil Babka 			 */
213519deb769SDavid Rientjes 			page = __alloc_pages_node(hpage_node,
2136cc638f32SVlastimil Babka 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
213776e654ccSDavid Rientjes 
213876e654ccSDavid Rientjes 			/*
213976e654ccSDavid Rientjes 			 * If hugepage allocations are configured to always
214076e654ccSDavid Rientjes 			 * synchronous compact or the vma has been madvised
214176e654ccSDavid Rientjes 			 * to prefer hugepage backing, retry allowing remote
2142cc638f32SVlastimil Babka 			 * memory with both reclaim and compact as well.
214376e654ccSDavid Rientjes 			 */
214476e654ccSDavid Rientjes 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
214533863534SAndrey Ryabinin 				page = __alloc_pages(gfp, order, hpage_node, nmask);
214676e654ccSDavid Rientjes 
214719deb769SDavid Rientjes 			goto out;
214819deb769SDavid Rientjes 		}
214919deb769SDavid Rientjes 	}
215019deb769SDavid Rientjes 
2151077fcf11SAneesh Kumar K.V 	nmask = policy_nodemask(gfp, pol);
215204ec6264SVlastimil Babka 	preferred_nid = policy_node(gfp, pol, node);
215384172f4bSMatthew Wilcox (Oracle) 	page = __alloc_pages(gfp, order, preferred_nid, nmask);
2154d51e9894SVlastimil Babka 	mpol_cond_put(pol);
2155be97a41bSVlastimil Babka out:
2156077fcf11SAneesh Kumar K.V 	return page;
2157077fcf11SAneesh Kumar K.V }
215869262215SChristoph Hellwig EXPORT_SYMBOL(alloc_pages_vma);
2159077fcf11SAneesh Kumar K.V 
21601da177e4SLinus Torvalds /**
2161d7f946d0SMatthew Wilcox (Oracle)  * alloc_pages - Allocate pages.
21626421ec76SMatthew Wilcox (Oracle)  * @gfp: GFP flags.
21636421ec76SMatthew Wilcox (Oracle)  * @order: Power of two of number of pages to allocate.
21641da177e4SLinus Torvalds  *
21656421ec76SMatthew Wilcox (Oracle)  * Allocate 1 << @order contiguous pages.  The physical address of the
21666421ec76SMatthew Wilcox (Oracle)  * first page is naturally aligned (eg an order-3 allocation will be aligned
21676421ec76SMatthew Wilcox (Oracle)  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
21686421ec76SMatthew Wilcox (Oracle)  * process is honoured when in process context.
21691da177e4SLinus Torvalds  *
21706421ec76SMatthew Wilcox (Oracle)  * Context: Can be called from any context, providing the appropriate GFP
21716421ec76SMatthew Wilcox (Oracle)  * flags are used.
21726421ec76SMatthew Wilcox (Oracle)  * Return: The page on success or NULL if allocation fails.
21731da177e4SLinus Torvalds  */
2174d7f946d0SMatthew Wilcox (Oracle) struct page *alloc_pages(gfp_t gfp, unsigned order)
21751da177e4SLinus Torvalds {
21768d90274bSOleg Nesterov 	struct mempolicy *pol = &default_policy;
2177c0ff7453SMiao Xie 	struct page *page;
21781da177e4SLinus Torvalds 
21798d90274bSOleg Nesterov 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
21808d90274bSOleg Nesterov 		pol = get_task_policy(current);
218152cd3b07SLee Schermerhorn 
218252cd3b07SLee Schermerhorn 	/*
218352cd3b07SLee Schermerhorn 	 * No reference counting needed for current->mempolicy
218452cd3b07SLee Schermerhorn 	 * nor system default_policy
218552cd3b07SLee Schermerhorn 	 */
218645c4745aSLee Schermerhorn 	if (pol->mode == MPOL_INTERLEAVE)
2187c0ff7453SMiao Xie 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
21884c54d949SFeng Tang 	else if (pol->mode == MPOL_PREFERRED_MANY)
21894c54d949SFeng Tang 		page = alloc_pages_preferred_many(gfp, order,
21904c54d949SFeng Tang 				numa_node_id(), pol);
2191c0ff7453SMiao Xie 	else
219284172f4bSMatthew Wilcox (Oracle) 		page = __alloc_pages(gfp, order,
219304ec6264SVlastimil Babka 				policy_node(gfp, pol, numa_node_id()),
21945c4b4be3SAndi Kleen 				policy_nodemask(gfp, pol));
2195cc9a6c87SMel Gorman 
2196c0ff7453SMiao Xie 	return page;
21971da177e4SLinus Torvalds }
2198d7f946d0SMatthew Wilcox (Oracle) EXPORT_SYMBOL(alloc_pages);
21991da177e4SLinus Torvalds 
2200cc09cb13SMatthew Wilcox (Oracle) struct folio *folio_alloc(gfp_t gfp, unsigned order)
2201cc09cb13SMatthew Wilcox (Oracle) {
2202cc09cb13SMatthew Wilcox (Oracle) 	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2203cc09cb13SMatthew Wilcox (Oracle) 
2204cc09cb13SMatthew Wilcox (Oracle) 	if (page && order > 1)
2205cc09cb13SMatthew Wilcox (Oracle) 		prep_transhuge_page(page);
2206cc09cb13SMatthew Wilcox (Oracle) 	return (struct folio *)page;
2207cc09cb13SMatthew Wilcox (Oracle) }
2208cc09cb13SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_alloc);
2209cc09cb13SMatthew Wilcox (Oracle) 
2210c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2211c00b6b96SChen Wandun 		struct mempolicy *pol, unsigned long nr_pages,
2212c00b6b96SChen Wandun 		struct page **page_array)
2213c00b6b96SChen Wandun {
2214c00b6b96SChen Wandun 	int nodes;
2215c00b6b96SChen Wandun 	unsigned long nr_pages_per_node;
2216c00b6b96SChen Wandun 	int delta;
2217c00b6b96SChen Wandun 	int i;
2218c00b6b96SChen Wandun 	unsigned long nr_allocated;
2219c00b6b96SChen Wandun 	unsigned long total_allocated = 0;
2220c00b6b96SChen Wandun 
2221c00b6b96SChen Wandun 	nodes = nodes_weight(pol->nodes);
2222c00b6b96SChen Wandun 	nr_pages_per_node = nr_pages / nodes;
2223c00b6b96SChen Wandun 	delta = nr_pages - nodes * nr_pages_per_node;
2224c00b6b96SChen Wandun 
2225c00b6b96SChen Wandun 	for (i = 0; i < nodes; i++) {
2226c00b6b96SChen Wandun 		if (delta) {
2227c00b6b96SChen Wandun 			nr_allocated = __alloc_pages_bulk(gfp,
2228c00b6b96SChen Wandun 					interleave_nodes(pol), NULL,
2229c00b6b96SChen Wandun 					nr_pages_per_node + 1, NULL,
2230c00b6b96SChen Wandun 					page_array);
2231c00b6b96SChen Wandun 			delta--;
2232c00b6b96SChen Wandun 		} else {
2233c00b6b96SChen Wandun 			nr_allocated = __alloc_pages_bulk(gfp,
2234c00b6b96SChen Wandun 					interleave_nodes(pol), NULL,
2235c00b6b96SChen Wandun 					nr_pages_per_node, NULL, page_array);
2236c00b6b96SChen Wandun 		}
2237c00b6b96SChen Wandun 
2238c00b6b96SChen Wandun 		page_array += nr_allocated;
2239c00b6b96SChen Wandun 		total_allocated += nr_allocated;
2240c00b6b96SChen Wandun 	}
2241c00b6b96SChen Wandun 
2242c00b6b96SChen Wandun 	return total_allocated;
2243c00b6b96SChen Wandun }
2244c00b6b96SChen Wandun 
2245c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2246c00b6b96SChen Wandun 		struct mempolicy *pol, unsigned long nr_pages,
2247c00b6b96SChen Wandun 		struct page **page_array)
2248c00b6b96SChen Wandun {
2249c00b6b96SChen Wandun 	gfp_t preferred_gfp;
2250c00b6b96SChen Wandun 	unsigned long nr_allocated = 0;
2251c00b6b96SChen Wandun 
2252c00b6b96SChen Wandun 	preferred_gfp = gfp | __GFP_NOWARN;
2253c00b6b96SChen Wandun 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2254c00b6b96SChen Wandun 
2255c00b6b96SChen Wandun 	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2256c00b6b96SChen Wandun 					   nr_pages, NULL, page_array);
2257c00b6b96SChen Wandun 
2258c00b6b96SChen Wandun 	if (nr_allocated < nr_pages)
2259c00b6b96SChen Wandun 		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2260c00b6b96SChen Wandun 				nr_pages - nr_allocated, NULL,
2261c00b6b96SChen Wandun 				page_array + nr_allocated);
2262c00b6b96SChen Wandun 	return nr_allocated;
2263c00b6b96SChen Wandun }
2264c00b6b96SChen Wandun 
2265c00b6b96SChen Wandun /* alloc pages bulk and mempolicy should be considered at the
2266c00b6b96SChen Wandun  * same time in some situation such as vmalloc.
2267c00b6b96SChen Wandun  *
2268c00b6b96SChen Wandun  * It can accelerate memory allocation especially interleaving
2269c00b6b96SChen Wandun  * allocate memory.
2270c00b6b96SChen Wandun  */
2271c00b6b96SChen Wandun unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2272c00b6b96SChen Wandun 		unsigned long nr_pages, struct page **page_array)
2273c00b6b96SChen Wandun {
2274c00b6b96SChen Wandun 	struct mempolicy *pol = &default_policy;
2275c00b6b96SChen Wandun 
2276c00b6b96SChen Wandun 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2277c00b6b96SChen Wandun 		pol = get_task_policy(current);
2278c00b6b96SChen Wandun 
2279c00b6b96SChen Wandun 	if (pol->mode == MPOL_INTERLEAVE)
2280c00b6b96SChen Wandun 		return alloc_pages_bulk_array_interleave(gfp, pol,
2281c00b6b96SChen Wandun 							 nr_pages, page_array);
2282c00b6b96SChen Wandun 
2283c00b6b96SChen Wandun 	if (pol->mode == MPOL_PREFERRED_MANY)
2284c00b6b96SChen Wandun 		return alloc_pages_bulk_array_preferred_many(gfp,
2285c00b6b96SChen Wandun 				numa_node_id(), pol, nr_pages, page_array);
2286c00b6b96SChen Wandun 
2287c00b6b96SChen Wandun 	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2288c00b6b96SChen Wandun 				  policy_nodemask(gfp, pol), nr_pages, NULL,
2289c00b6b96SChen Wandun 				  page_array);
2290c00b6b96SChen Wandun }
2291c00b6b96SChen Wandun 
2292ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2293ef0855d3SOleg Nesterov {
2294ef0855d3SOleg Nesterov 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2295ef0855d3SOleg Nesterov 
2296ef0855d3SOleg Nesterov 	if (IS_ERR(pol))
2297ef0855d3SOleg Nesterov 		return PTR_ERR(pol);
2298ef0855d3SOleg Nesterov 	dst->vm_policy = pol;
2299ef0855d3SOleg Nesterov 	return 0;
2300ef0855d3SOleg Nesterov }
2301ef0855d3SOleg Nesterov 
23024225399aSPaul Jackson /*
2303846a16bfSLee Schermerhorn  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
23044225399aSPaul Jackson  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
23054225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
23064225399aSPaul Jackson  * keeps mempolicies cpuset relative after its cpuset moves.  See
23074225399aSPaul Jackson  * further kernel/cpuset.c update_nodemask().
2308708c1bbcSMiao Xie  *
2309708c1bbcSMiao Xie  * current's mempolicy may be rebinded by the other task(the task that changes
2310708c1bbcSMiao Xie  * cpuset's mems), so we needn't do rebind work for current task.
23114225399aSPaul Jackson  */
23124225399aSPaul Jackson 
2313846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
2314846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
23151da177e4SLinus Torvalds {
23161da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
23171da177e4SLinus Torvalds 
23181da177e4SLinus Torvalds 	if (!new)
23191da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2320708c1bbcSMiao Xie 
2321708c1bbcSMiao Xie 	/* task's mempolicy is protected by alloc_lock */
2322708c1bbcSMiao Xie 	if (old == current->mempolicy) {
2323708c1bbcSMiao Xie 		task_lock(current);
2324708c1bbcSMiao Xie 		*new = *old;
2325708c1bbcSMiao Xie 		task_unlock(current);
2326708c1bbcSMiao Xie 	} else
2327708c1bbcSMiao Xie 		*new = *old;
2328708c1bbcSMiao Xie 
23294225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
23304225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
2331213980c0SVlastimil Babka 		mpol_rebind_policy(new, &mems);
23324225399aSPaul Jackson 	}
23331da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
23341da177e4SLinus Torvalds 	return new;
23351da177e4SLinus Torvalds }
23361da177e4SLinus Torvalds 
23371da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
2338fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
23391da177e4SLinus Torvalds {
23401da177e4SLinus Torvalds 	if (!a || !b)
2341fcfb4dccSKOSAKI Motohiro 		return false;
234245c4745aSLee Schermerhorn 	if (a->mode != b->mode)
2343fcfb4dccSKOSAKI Motohiro 		return false;
234419800502SBob Liu 	if (a->flags != b->flags)
2345fcfb4dccSKOSAKI Motohiro 		return false;
234619800502SBob Liu 	if (mpol_store_user_nodemask(a))
234719800502SBob Liu 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2348fcfb4dccSKOSAKI Motohiro 			return false;
234919800502SBob Liu 
235045c4745aSLee Schermerhorn 	switch (a->mode) {
235119770b32SMel Gorman 	case MPOL_BIND:
23521da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
23531da177e4SLinus Torvalds 	case MPOL_PREFERRED:
2354b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
2355269fbe72SBen Widawsky 		return !!nodes_equal(a->nodes, b->nodes);
23567858d7bcSFeng Tang 	case MPOL_LOCAL:
23577858d7bcSFeng Tang 		return true;
23581da177e4SLinus Torvalds 	default:
23591da177e4SLinus Torvalds 		BUG();
2360fcfb4dccSKOSAKI Motohiro 		return false;
23611da177e4SLinus Torvalds 	}
23621da177e4SLinus Torvalds }
23631da177e4SLinus Torvalds 
23641da177e4SLinus Torvalds /*
23651da177e4SLinus Torvalds  * Shared memory backing store policy support.
23661da177e4SLinus Torvalds  *
23671da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
23681da177e4SLinus Torvalds  * The policies are kept in Red-Black tree linked from the inode.
23694a8c7bb5SNathan Zimmer  * They are protected by the sp->lock rwlock, which should be held
23701da177e4SLinus Torvalds  * for any accesses to the tree.
23711da177e4SLinus Torvalds  */
23721da177e4SLinus Torvalds 
23734a8c7bb5SNathan Zimmer /*
23744a8c7bb5SNathan Zimmer  * lookup first element intersecting start-end.  Caller holds sp->lock for
23754a8c7bb5SNathan Zimmer  * reading or for writing
23764a8c7bb5SNathan Zimmer  */
23771da177e4SLinus Torvalds static struct sp_node *
23781da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
23791da177e4SLinus Torvalds {
23801da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
23811da177e4SLinus Torvalds 
23821da177e4SLinus Torvalds 	while (n) {
23831da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
23841da177e4SLinus Torvalds 
23851da177e4SLinus Torvalds 		if (start >= p->end)
23861da177e4SLinus Torvalds 			n = n->rb_right;
23871da177e4SLinus Torvalds 		else if (end <= p->start)
23881da177e4SLinus Torvalds 			n = n->rb_left;
23891da177e4SLinus Torvalds 		else
23901da177e4SLinus Torvalds 			break;
23911da177e4SLinus Torvalds 	}
23921da177e4SLinus Torvalds 	if (!n)
23931da177e4SLinus Torvalds 		return NULL;
23941da177e4SLinus Torvalds 	for (;;) {
23951da177e4SLinus Torvalds 		struct sp_node *w = NULL;
23961da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
23971da177e4SLinus Torvalds 		if (!prev)
23981da177e4SLinus Torvalds 			break;
23991da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
24001da177e4SLinus Torvalds 		if (w->end <= start)
24011da177e4SLinus Torvalds 			break;
24021da177e4SLinus Torvalds 		n = prev;
24031da177e4SLinus Torvalds 	}
24041da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
24051da177e4SLinus Torvalds }
24061da177e4SLinus Torvalds 
24074a8c7bb5SNathan Zimmer /*
24084a8c7bb5SNathan Zimmer  * Insert a new shared policy into the list.  Caller holds sp->lock for
24094a8c7bb5SNathan Zimmer  * writing.
24104a8c7bb5SNathan Zimmer  */
24111da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
24121da177e4SLinus Torvalds {
24131da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
24141da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
24151da177e4SLinus Torvalds 	struct sp_node *nd;
24161da177e4SLinus Torvalds 
24171da177e4SLinus Torvalds 	while (*p) {
24181da177e4SLinus Torvalds 		parent = *p;
24191da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
24201da177e4SLinus Torvalds 		if (new->start < nd->start)
24211da177e4SLinus Torvalds 			p = &(*p)->rb_left;
24221da177e4SLinus Torvalds 		else if (new->end > nd->end)
24231da177e4SLinus Torvalds 			p = &(*p)->rb_right;
24241da177e4SLinus Torvalds 		else
24251da177e4SLinus Torvalds 			BUG();
24261da177e4SLinus Torvalds 	}
24271da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
24281da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
2429140d5a49SPaul Mundt 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
243045c4745aSLee Schermerhorn 		 new->policy ? new->policy->mode : 0);
24311da177e4SLinus Torvalds }
24321da177e4SLinus Torvalds 
24331da177e4SLinus Torvalds /* Find shared policy intersecting idx */
24341da177e4SLinus Torvalds struct mempolicy *
24351da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
24361da177e4SLinus Torvalds {
24371da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
24381da177e4SLinus Torvalds 	struct sp_node *sn;
24391da177e4SLinus Torvalds 
24401da177e4SLinus Torvalds 	if (!sp->root.rb_node)
24411da177e4SLinus Torvalds 		return NULL;
24424a8c7bb5SNathan Zimmer 	read_lock(&sp->lock);
24431da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
24441da177e4SLinus Torvalds 	if (sn) {
24451da177e4SLinus Torvalds 		mpol_get(sn->policy);
24461da177e4SLinus Torvalds 		pol = sn->policy;
24471da177e4SLinus Torvalds 	}
24484a8c7bb5SNathan Zimmer 	read_unlock(&sp->lock);
24491da177e4SLinus Torvalds 	return pol;
24501da177e4SLinus Torvalds }
24511da177e4SLinus Torvalds 
245263f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n)
245363f74ca2SKOSAKI Motohiro {
245463f74ca2SKOSAKI Motohiro 	mpol_put(n->policy);
245563f74ca2SKOSAKI Motohiro 	kmem_cache_free(sn_cache, n);
245663f74ca2SKOSAKI Motohiro }
245763f74ca2SKOSAKI Motohiro 
2458771fb4d8SLee Schermerhorn /**
2459771fb4d8SLee Schermerhorn  * mpol_misplaced - check whether current page node is valid in policy
2460771fb4d8SLee Schermerhorn  *
2461b46e14acSFabian Frederick  * @page: page to be checked
2462b46e14acSFabian Frederick  * @vma: vm area where page mapped
2463b46e14acSFabian Frederick  * @addr: virtual address where page mapped
2464771fb4d8SLee Schermerhorn  *
2465771fb4d8SLee Schermerhorn  * Lookup current policy node id for vma,addr and "compare to" page's
24665f076944SMatthew Wilcox (Oracle)  * node id.  Policy determination "mimics" alloc_page_vma().
2467771fb4d8SLee Schermerhorn  * Called from fault path where we know the vma and faulting address.
24685f076944SMatthew Wilcox (Oracle)  *
2469062db293SBaolin Wang  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2470062db293SBaolin Wang  * policy, or a suitable node ID to allocate a replacement page from.
2471771fb4d8SLee Schermerhorn  */
2472771fb4d8SLee Schermerhorn int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2473771fb4d8SLee Schermerhorn {
2474771fb4d8SLee Schermerhorn 	struct mempolicy *pol;
2475c33d6c06SMel Gorman 	struct zoneref *z;
2476771fb4d8SLee Schermerhorn 	int curnid = page_to_nid(page);
2477771fb4d8SLee Schermerhorn 	unsigned long pgoff;
247890572890SPeter Zijlstra 	int thiscpu = raw_smp_processor_id();
247990572890SPeter Zijlstra 	int thisnid = cpu_to_node(thiscpu);
248098fa15f3SAnshuman Khandual 	int polnid = NUMA_NO_NODE;
2481062db293SBaolin Wang 	int ret = NUMA_NO_NODE;
2482771fb4d8SLee Schermerhorn 
2483dd6eecb9SOleg Nesterov 	pol = get_vma_policy(vma, addr);
2484771fb4d8SLee Schermerhorn 	if (!(pol->flags & MPOL_F_MOF))
2485771fb4d8SLee Schermerhorn 		goto out;
2486771fb4d8SLee Schermerhorn 
2487771fb4d8SLee Schermerhorn 	switch (pol->mode) {
2488771fb4d8SLee Schermerhorn 	case MPOL_INTERLEAVE:
2489771fb4d8SLee Schermerhorn 		pgoff = vma->vm_pgoff;
2490771fb4d8SLee Schermerhorn 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
249198c70baaSLaurent Dufour 		polnid = offset_il_node(pol, pgoff);
2492771fb4d8SLee Schermerhorn 		break;
2493771fb4d8SLee Schermerhorn 
2494771fb4d8SLee Schermerhorn 	case MPOL_PREFERRED:
2495b27abaccSDave Hansen 		if (node_isset(curnid, pol->nodes))
2496b27abaccSDave Hansen 			goto out;
2497269fbe72SBen Widawsky 		polnid = first_node(pol->nodes);
2498771fb4d8SLee Schermerhorn 		break;
2499771fb4d8SLee Schermerhorn 
25007858d7bcSFeng Tang 	case MPOL_LOCAL:
25017858d7bcSFeng Tang 		polnid = numa_node_id();
25027858d7bcSFeng Tang 		break;
25037858d7bcSFeng Tang 
2504771fb4d8SLee Schermerhorn 	case MPOL_BIND:
2505bda420b9SHuang Ying 		/* Optimize placement among multiple nodes via NUMA balancing */
2506bda420b9SHuang Ying 		if (pol->flags & MPOL_F_MORON) {
2507269fbe72SBen Widawsky 			if (node_isset(thisnid, pol->nodes))
2508bda420b9SHuang Ying 				break;
2509bda420b9SHuang Ying 			goto out;
2510bda420b9SHuang Ying 		}
2511b27abaccSDave Hansen 		fallthrough;
2512c33d6c06SMel Gorman 
2513b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
2514771fb4d8SLee Schermerhorn 		/*
2515771fb4d8SLee Schermerhorn 		 * use current page if in policy nodemask,
2516771fb4d8SLee Schermerhorn 		 * else select nearest allowed node, if any.
2517771fb4d8SLee Schermerhorn 		 * If no allowed nodes, use current [!misplaced].
2518771fb4d8SLee Schermerhorn 		 */
2519269fbe72SBen Widawsky 		if (node_isset(curnid, pol->nodes))
2520771fb4d8SLee Schermerhorn 			goto out;
2521c33d6c06SMel Gorman 		z = first_zones_zonelist(
2522771fb4d8SLee Schermerhorn 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2523771fb4d8SLee Schermerhorn 				gfp_zone(GFP_HIGHUSER),
2524269fbe72SBen Widawsky 				&pol->nodes);
2525c1093b74SPavel Tatashin 		polnid = zone_to_nid(z->zone);
2526771fb4d8SLee Schermerhorn 		break;
2527771fb4d8SLee Schermerhorn 
2528771fb4d8SLee Schermerhorn 	default:
2529771fb4d8SLee Schermerhorn 		BUG();
2530771fb4d8SLee Schermerhorn 	}
25315606e387SMel Gorman 
25325606e387SMel Gorman 	/* Migrate the page towards the node whose CPU is referencing it */
2533e42c8ff2SMel Gorman 	if (pol->flags & MPOL_F_MORON) {
253490572890SPeter Zijlstra 		polnid = thisnid;
25355606e387SMel Gorman 
253610f39042SRik van Riel 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2537de1c9ce6SRik van Riel 			goto out;
2538de1c9ce6SRik van Riel 	}
2539e42c8ff2SMel Gorman 
2540771fb4d8SLee Schermerhorn 	if (curnid != polnid)
2541771fb4d8SLee Schermerhorn 		ret = polnid;
2542771fb4d8SLee Schermerhorn out:
2543771fb4d8SLee Schermerhorn 	mpol_cond_put(pol);
2544771fb4d8SLee Schermerhorn 
2545771fb4d8SLee Schermerhorn 	return ret;
2546771fb4d8SLee Schermerhorn }
2547771fb4d8SLee Schermerhorn 
2548c11600e4SDavid Rientjes /*
2549c11600e4SDavid Rientjes  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2550c11600e4SDavid Rientjes  * dropped after task->mempolicy is set to NULL so that any allocation done as
2551c11600e4SDavid Rientjes  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2552c11600e4SDavid Rientjes  * policy.
2553c11600e4SDavid Rientjes  */
2554c11600e4SDavid Rientjes void mpol_put_task_policy(struct task_struct *task)
2555c11600e4SDavid Rientjes {
2556c11600e4SDavid Rientjes 	struct mempolicy *pol;
2557c11600e4SDavid Rientjes 
2558c11600e4SDavid Rientjes 	task_lock(task);
2559c11600e4SDavid Rientjes 	pol = task->mempolicy;
2560c11600e4SDavid Rientjes 	task->mempolicy = NULL;
2561c11600e4SDavid Rientjes 	task_unlock(task);
2562c11600e4SDavid Rientjes 	mpol_put(pol);
2563c11600e4SDavid Rientjes }
2564c11600e4SDavid Rientjes 
25651da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
25661da177e4SLinus Torvalds {
2567140d5a49SPaul Mundt 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
25681da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
256963f74ca2SKOSAKI Motohiro 	sp_free(n);
25701da177e4SLinus Torvalds }
25711da177e4SLinus Torvalds 
257242288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
257342288fe3SMel Gorman 			unsigned long end, struct mempolicy *pol)
257442288fe3SMel Gorman {
257542288fe3SMel Gorman 	node->start = start;
257642288fe3SMel Gorman 	node->end = end;
257742288fe3SMel Gorman 	node->policy = pol;
257842288fe3SMel Gorman }
257942288fe3SMel Gorman 
2580dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2581dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
25821da177e4SLinus Torvalds {
2583869833f2SKOSAKI Motohiro 	struct sp_node *n;
2584869833f2SKOSAKI Motohiro 	struct mempolicy *newpol;
25851da177e4SLinus Torvalds 
2586869833f2SKOSAKI Motohiro 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
25871da177e4SLinus Torvalds 	if (!n)
25881da177e4SLinus Torvalds 		return NULL;
2589869833f2SKOSAKI Motohiro 
2590869833f2SKOSAKI Motohiro 	newpol = mpol_dup(pol);
2591869833f2SKOSAKI Motohiro 	if (IS_ERR(newpol)) {
2592869833f2SKOSAKI Motohiro 		kmem_cache_free(sn_cache, n);
2593869833f2SKOSAKI Motohiro 		return NULL;
2594869833f2SKOSAKI Motohiro 	}
2595869833f2SKOSAKI Motohiro 	newpol->flags |= MPOL_F_SHARED;
259642288fe3SMel Gorman 	sp_node_init(n, start, end, newpol);
2597869833f2SKOSAKI Motohiro 
25981da177e4SLinus Torvalds 	return n;
25991da177e4SLinus Torvalds }
26001da177e4SLinus Torvalds 
26011da177e4SLinus Torvalds /* Replace a policy range. */
26021da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
26031da177e4SLinus Torvalds 				 unsigned long end, struct sp_node *new)
26041da177e4SLinus Torvalds {
2605b22d127aSMel Gorman 	struct sp_node *n;
260642288fe3SMel Gorman 	struct sp_node *n_new = NULL;
260742288fe3SMel Gorman 	struct mempolicy *mpol_new = NULL;
2608b22d127aSMel Gorman 	int ret = 0;
26091da177e4SLinus Torvalds 
261042288fe3SMel Gorman restart:
26114a8c7bb5SNathan Zimmer 	write_lock(&sp->lock);
26121da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
26131da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
26141da177e4SLinus Torvalds 	while (n && n->start < end) {
26151da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
26161da177e4SLinus Torvalds 		if (n->start >= start) {
26171da177e4SLinus Torvalds 			if (n->end <= end)
26181da177e4SLinus Torvalds 				sp_delete(sp, n);
26191da177e4SLinus Torvalds 			else
26201da177e4SLinus Torvalds 				n->start = end;
26211da177e4SLinus Torvalds 		} else {
26221da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
26231da177e4SLinus Torvalds 			if (n->end > end) {
262442288fe3SMel Gorman 				if (!n_new)
262542288fe3SMel Gorman 					goto alloc_new;
262642288fe3SMel Gorman 
262742288fe3SMel Gorman 				*mpol_new = *n->policy;
262842288fe3SMel Gorman 				atomic_set(&mpol_new->refcnt, 1);
26297880639cSKOSAKI Motohiro 				sp_node_init(n_new, end, n->end, mpol_new);
26301da177e4SLinus Torvalds 				n->end = start;
26315ca39575SHillf Danton 				sp_insert(sp, n_new);
263242288fe3SMel Gorman 				n_new = NULL;
263342288fe3SMel Gorman 				mpol_new = NULL;
26341da177e4SLinus Torvalds 				break;
26351da177e4SLinus Torvalds 			} else
26361da177e4SLinus Torvalds 				n->end = start;
26371da177e4SLinus Torvalds 		}
26381da177e4SLinus Torvalds 		if (!next)
26391da177e4SLinus Torvalds 			break;
26401da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
26411da177e4SLinus Torvalds 	}
26421da177e4SLinus Torvalds 	if (new)
26431da177e4SLinus Torvalds 		sp_insert(sp, new);
26444a8c7bb5SNathan Zimmer 	write_unlock(&sp->lock);
264542288fe3SMel Gorman 	ret = 0;
264642288fe3SMel Gorman 
264742288fe3SMel Gorman err_out:
264842288fe3SMel Gorman 	if (mpol_new)
264942288fe3SMel Gorman 		mpol_put(mpol_new);
265042288fe3SMel Gorman 	if (n_new)
265142288fe3SMel Gorman 		kmem_cache_free(sn_cache, n_new);
265242288fe3SMel Gorman 
2653b22d127aSMel Gorman 	return ret;
265442288fe3SMel Gorman 
265542288fe3SMel Gorman alloc_new:
26564a8c7bb5SNathan Zimmer 	write_unlock(&sp->lock);
265742288fe3SMel Gorman 	ret = -ENOMEM;
265842288fe3SMel Gorman 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
265942288fe3SMel Gorman 	if (!n_new)
266042288fe3SMel Gorman 		goto err_out;
266142288fe3SMel Gorman 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
266242288fe3SMel Gorman 	if (!mpol_new)
266342288fe3SMel Gorman 		goto err_out;
266442288fe3SMel Gorman 	goto restart;
26651da177e4SLinus Torvalds }
26661da177e4SLinus Torvalds 
266771fe804bSLee Schermerhorn /**
266871fe804bSLee Schermerhorn  * mpol_shared_policy_init - initialize shared policy for inode
266971fe804bSLee Schermerhorn  * @sp: pointer to inode shared policy
267071fe804bSLee Schermerhorn  * @mpol:  struct mempolicy to install
267171fe804bSLee Schermerhorn  *
267271fe804bSLee Schermerhorn  * Install non-NULL @mpol in inode's shared policy rb-tree.
267371fe804bSLee Schermerhorn  * On entry, the current task has a reference on a non-NULL @mpol.
267471fe804bSLee Schermerhorn  * This must be released on exit.
26754bfc4495SKAMEZAWA Hiroyuki  * This is called at get_inode() calls and we can use GFP_KERNEL.
267671fe804bSLee Schermerhorn  */
267771fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
26787339ff83SRobin Holt {
267958568d2aSMiao Xie 	int ret;
268058568d2aSMiao Xie 
268171fe804bSLee Schermerhorn 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
26824a8c7bb5SNathan Zimmer 	rwlock_init(&sp->lock);
26837339ff83SRobin Holt 
268471fe804bSLee Schermerhorn 	if (mpol) {
26857339ff83SRobin Holt 		struct vm_area_struct pvma;
268671fe804bSLee Schermerhorn 		struct mempolicy *new;
26874bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
26887339ff83SRobin Holt 
26894bfc4495SKAMEZAWA Hiroyuki 		if (!scratch)
26905c0c1654SLee Schermerhorn 			goto put_mpol;
269171fe804bSLee Schermerhorn 		/* contextualize the tmpfs mount point mempolicy */
269271fe804bSLee Schermerhorn 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
269315d77835SLee Schermerhorn 		if (IS_ERR(new))
26940cae3457SDan Carpenter 			goto free_scratch; /* no valid nodemask intersection */
269558568d2aSMiao Xie 
269658568d2aSMiao Xie 		task_lock(current);
26974bfc4495SKAMEZAWA Hiroyuki 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
269858568d2aSMiao Xie 		task_unlock(current);
269915d77835SLee Schermerhorn 		if (ret)
27005c0c1654SLee Schermerhorn 			goto put_new;
270171fe804bSLee Schermerhorn 
270271fe804bSLee Schermerhorn 		/* Create pseudo-vma that contains just the policy */
27032c4541e2SKirill A. Shutemov 		vma_init(&pvma, NULL);
270471fe804bSLee Schermerhorn 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
270571fe804bSLee Schermerhorn 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
270615d77835SLee Schermerhorn 
27075c0c1654SLee Schermerhorn put_new:
270871fe804bSLee Schermerhorn 		mpol_put(new);			/* drop initial ref */
27090cae3457SDan Carpenter free_scratch:
27104bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
27115c0c1654SLee Schermerhorn put_mpol:
27125c0c1654SLee Schermerhorn 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
27137339ff83SRobin Holt 	}
27147339ff83SRobin Holt }
27157339ff83SRobin Holt 
27161da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
27171da177e4SLinus Torvalds 			struct vm_area_struct *vma, struct mempolicy *npol)
27181da177e4SLinus Torvalds {
27191da177e4SLinus Torvalds 	int err;
27201da177e4SLinus Torvalds 	struct sp_node *new = NULL;
27211da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
27221da177e4SLinus Torvalds 
2723028fec41SDavid Rientjes 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
27241da177e4SLinus Torvalds 		 vma->vm_pgoff,
272545c4745aSLee Schermerhorn 		 sz, npol ? npol->mode : -1,
2726028fec41SDavid Rientjes 		 npol ? npol->flags : -1,
2727269fbe72SBen Widawsky 		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
27281da177e4SLinus Torvalds 
27291da177e4SLinus Torvalds 	if (npol) {
27301da177e4SLinus Torvalds 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
27311da177e4SLinus Torvalds 		if (!new)
27321da177e4SLinus Torvalds 			return -ENOMEM;
27331da177e4SLinus Torvalds 	}
27341da177e4SLinus Torvalds 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
27351da177e4SLinus Torvalds 	if (err && new)
273663f74ca2SKOSAKI Motohiro 		sp_free(new);
27371da177e4SLinus Torvalds 	return err;
27381da177e4SLinus Torvalds }
27391da177e4SLinus Torvalds 
27401da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
27411da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
27421da177e4SLinus Torvalds {
27431da177e4SLinus Torvalds 	struct sp_node *n;
27441da177e4SLinus Torvalds 	struct rb_node *next;
27451da177e4SLinus Torvalds 
27461da177e4SLinus Torvalds 	if (!p->root.rb_node)
27471da177e4SLinus Torvalds 		return;
27484a8c7bb5SNathan Zimmer 	write_lock(&p->lock);
27491da177e4SLinus Torvalds 	next = rb_first(&p->root);
27501da177e4SLinus Torvalds 	while (next) {
27511da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
27521da177e4SLinus Torvalds 		next = rb_next(&n->nd);
275363f74ca2SKOSAKI Motohiro 		sp_delete(p, n);
27541da177e4SLinus Torvalds 	}
27554a8c7bb5SNathan Zimmer 	write_unlock(&p->lock);
27561da177e4SLinus Torvalds }
27571da177e4SLinus Torvalds 
27581a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
2759c297663cSMel Gorman static int __initdata numabalancing_override;
27601a687c2eSMel Gorman 
27611a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
27621a687c2eSMel Gorman {
27631a687c2eSMel Gorman 	bool numabalancing_default = false;
27641a687c2eSMel Gorman 
27651a687c2eSMel Gorman 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
27661a687c2eSMel Gorman 		numabalancing_default = true;
27671a687c2eSMel Gorman 
2768c297663cSMel Gorman 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2769c297663cSMel Gorman 	if (numabalancing_override)
2770c297663cSMel Gorman 		set_numabalancing_state(numabalancing_override == 1);
2771c297663cSMel Gorman 
2772b0dc2b9bSMel Gorman 	if (num_online_nodes() > 1 && !numabalancing_override) {
2773756a025fSJoe Perches 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2774c297663cSMel Gorman 			numabalancing_default ? "Enabling" : "Disabling");
27751a687c2eSMel Gorman 		set_numabalancing_state(numabalancing_default);
27761a687c2eSMel Gorman 	}
27771a687c2eSMel Gorman }
27781a687c2eSMel Gorman 
27791a687c2eSMel Gorman static int __init setup_numabalancing(char *str)
27801a687c2eSMel Gorman {
27811a687c2eSMel Gorman 	int ret = 0;
27821a687c2eSMel Gorman 	if (!str)
27831a687c2eSMel Gorman 		goto out;
27841a687c2eSMel Gorman 
27851a687c2eSMel Gorman 	if (!strcmp(str, "enable")) {
2786c297663cSMel Gorman 		numabalancing_override = 1;
27871a687c2eSMel Gorman 		ret = 1;
27881a687c2eSMel Gorman 	} else if (!strcmp(str, "disable")) {
2789c297663cSMel Gorman 		numabalancing_override = -1;
27901a687c2eSMel Gorman 		ret = 1;
27911a687c2eSMel Gorman 	}
27921a687c2eSMel Gorman out:
27931a687c2eSMel Gorman 	if (!ret)
27944a404beaSAndrew Morton 		pr_warn("Unable to parse numa_balancing=\n");
27951a687c2eSMel Gorman 
27961a687c2eSMel Gorman 	return ret;
27971a687c2eSMel Gorman }
27981a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing);
27991a687c2eSMel Gorman #else
28001a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void)
28011a687c2eSMel Gorman {
28021a687c2eSMel Gorman }
28031a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */
28041a687c2eSMel Gorman 
28051da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
28061da177e4SLinus Torvalds void __init numa_policy_init(void)
28071da177e4SLinus Torvalds {
2808b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
2809b71636e2SPaul Mundt 	unsigned long largest = 0;
2810b71636e2SPaul Mundt 	int nid, prefer = 0;
2811b71636e2SPaul Mundt 
28121da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
28131da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
281420c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
28151da177e4SLinus Torvalds 
28161da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
28171da177e4SLinus Torvalds 				     sizeof(struct sp_node),
281820c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
28191da177e4SLinus Torvalds 
28205606e387SMel Gorman 	for_each_node(nid) {
28215606e387SMel Gorman 		preferred_node_policy[nid] = (struct mempolicy) {
28225606e387SMel Gorman 			.refcnt = ATOMIC_INIT(1),
28235606e387SMel Gorman 			.mode = MPOL_PREFERRED,
28245606e387SMel Gorman 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2825269fbe72SBen Widawsky 			.nodes = nodemask_of_node(nid),
28265606e387SMel Gorman 		};
28275606e387SMel Gorman 	}
28285606e387SMel Gorman 
2829b71636e2SPaul Mundt 	/*
2830b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
2831b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB), or
2832b71636e2SPaul Mundt 	 * fall back to the largest node if they're all smaller.
2833b71636e2SPaul Mundt 	 */
2834b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
283501f13bd6SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
2836b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
28371da177e4SLinus Torvalds 
2838b71636e2SPaul Mundt 		/* Preserve the largest node */
2839b71636e2SPaul Mundt 		if (largest < total_pages) {
2840b71636e2SPaul Mundt 			largest = total_pages;
2841b71636e2SPaul Mundt 			prefer = nid;
2842b71636e2SPaul Mundt 		}
2843b71636e2SPaul Mundt 
2844b71636e2SPaul Mundt 		/* Interleave this node? */
2845b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2846b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
2847b71636e2SPaul Mundt 	}
2848b71636e2SPaul Mundt 
2849b71636e2SPaul Mundt 	/* All too small, use the largest */
2850b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
2851b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
2852b71636e2SPaul Mundt 
2853028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2854b1de0d13SMitchel Humpherys 		pr_err("%s: interleaving failed\n", __func__);
28551a687c2eSMel Gorman 
28561a687c2eSMel Gorman 	check_numabalancing_enable();
28571da177e4SLinus Torvalds }
28581da177e4SLinus Torvalds 
28598bccd85fSChristoph Lameter /* Reset policy of current process to default */
28601da177e4SLinus Torvalds void numa_default_policy(void)
28611da177e4SLinus Torvalds {
2862028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
28631da177e4SLinus Torvalds }
286468860ec1SPaul Jackson 
28654225399aSPaul Jackson /*
2866095f1fc4SLee Schermerhorn  * Parse and format mempolicy from/to strings
2867095f1fc4SLee Schermerhorn  */
2868095f1fc4SLee Schermerhorn 
2869345ace9cSLee Schermerhorn static const char * const policy_modes[] =
2870345ace9cSLee Schermerhorn {
2871345ace9cSLee Schermerhorn 	[MPOL_DEFAULT]    = "default",
2872345ace9cSLee Schermerhorn 	[MPOL_PREFERRED]  = "prefer",
2873345ace9cSLee Schermerhorn 	[MPOL_BIND]       = "bind",
2874345ace9cSLee Schermerhorn 	[MPOL_INTERLEAVE] = "interleave",
2875d3a71033SLee Schermerhorn 	[MPOL_LOCAL]      = "local",
2876b27abaccSDave Hansen 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
2877345ace9cSLee Schermerhorn };
28781a75a6c8SChristoph Lameter 
2879095f1fc4SLee Schermerhorn 
2880095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS
2881095f1fc4SLee Schermerhorn /**
2882f2a07f40SHugh Dickins  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2883095f1fc4SLee Schermerhorn  * @str:  string containing mempolicy to parse
288471fe804bSLee Schermerhorn  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2885095f1fc4SLee Schermerhorn  *
2886095f1fc4SLee Schermerhorn  * Format of input:
2887095f1fc4SLee Schermerhorn  *	<mode>[=<flags>][:<nodelist>]
2888095f1fc4SLee Schermerhorn  *
288971fe804bSLee Schermerhorn  * On success, returns 0, else 1
2890095f1fc4SLee Schermerhorn  */
2891a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol)
2892095f1fc4SLee Schermerhorn {
289371fe804bSLee Schermerhorn 	struct mempolicy *new = NULL;
2894f2a07f40SHugh Dickins 	unsigned short mode_flags;
289571fe804bSLee Schermerhorn 	nodemask_t nodes;
2896095f1fc4SLee Schermerhorn 	char *nodelist = strchr(str, ':');
2897095f1fc4SLee Schermerhorn 	char *flags = strchr(str, '=');
2898dedf2c73Szhong jiang 	int err = 1, mode;
2899095f1fc4SLee Schermerhorn 
2900c7a91bc7SDan Carpenter 	if (flags)
2901c7a91bc7SDan Carpenter 		*flags++ = '\0';	/* terminate mode string */
2902c7a91bc7SDan Carpenter 
2903095f1fc4SLee Schermerhorn 	if (nodelist) {
2904095f1fc4SLee Schermerhorn 		/* NUL-terminate mode or flags string */
2905095f1fc4SLee Schermerhorn 		*nodelist++ = '\0';
290671fe804bSLee Schermerhorn 		if (nodelist_parse(nodelist, nodes))
2907095f1fc4SLee Schermerhorn 			goto out;
290801f13bd6SLai Jiangshan 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2909095f1fc4SLee Schermerhorn 			goto out;
291071fe804bSLee Schermerhorn 	} else
291171fe804bSLee Schermerhorn 		nodes_clear(nodes);
291271fe804bSLee Schermerhorn 
2913dedf2c73Szhong jiang 	mode = match_string(policy_modes, MPOL_MAX, str);
2914dedf2c73Szhong jiang 	if (mode < 0)
2915095f1fc4SLee Schermerhorn 		goto out;
2916095f1fc4SLee Schermerhorn 
291771fe804bSLee Schermerhorn 	switch (mode) {
2918095f1fc4SLee Schermerhorn 	case MPOL_PREFERRED:
291971fe804bSLee Schermerhorn 		/*
2920aa9f7d51SRandy Dunlap 		 * Insist on a nodelist of one node only, although later
2921aa9f7d51SRandy Dunlap 		 * we use first_node(nodes) to grab a single node, so here
2922aa9f7d51SRandy Dunlap 		 * nodelist (or nodes) cannot be empty.
292371fe804bSLee Schermerhorn 		 */
2924095f1fc4SLee Schermerhorn 		if (nodelist) {
2925095f1fc4SLee Schermerhorn 			char *rest = nodelist;
2926095f1fc4SLee Schermerhorn 			while (isdigit(*rest))
2927095f1fc4SLee Schermerhorn 				rest++;
2928926f2ae0SKOSAKI Motohiro 			if (*rest)
2929926f2ae0SKOSAKI Motohiro 				goto out;
2930aa9f7d51SRandy Dunlap 			if (nodes_empty(nodes))
2931aa9f7d51SRandy Dunlap 				goto out;
2932095f1fc4SLee Schermerhorn 		}
2933095f1fc4SLee Schermerhorn 		break;
2934095f1fc4SLee Schermerhorn 	case MPOL_INTERLEAVE:
2935095f1fc4SLee Schermerhorn 		/*
2936095f1fc4SLee Schermerhorn 		 * Default to online nodes with memory if no nodelist
2937095f1fc4SLee Schermerhorn 		 */
2938095f1fc4SLee Schermerhorn 		if (!nodelist)
293901f13bd6SLai Jiangshan 			nodes = node_states[N_MEMORY];
29403f226aa1SLee Schermerhorn 		break;
294171fe804bSLee Schermerhorn 	case MPOL_LOCAL:
29423f226aa1SLee Schermerhorn 		/*
294371fe804bSLee Schermerhorn 		 * Don't allow a nodelist;  mpol_new() checks flags
29443f226aa1SLee Schermerhorn 		 */
294571fe804bSLee Schermerhorn 		if (nodelist)
29463f226aa1SLee Schermerhorn 			goto out;
29473f226aa1SLee Schermerhorn 		break;
2948413b43deSRavikiran G Thirumalai 	case MPOL_DEFAULT:
2949413b43deSRavikiran G Thirumalai 		/*
2950413b43deSRavikiran G Thirumalai 		 * Insist on a empty nodelist
2951413b43deSRavikiran G Thirumalai 		 */
2952413b43deSRavikiran G Thirumalai 		if (!nodelist)
2953413b43deSRavikiran G Thirumalai 			err = 0;
2954413b43deSRavikiran G Thirumalai 		goto out;
2955b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
2956d69b2e63SKOSAKI Motohiro 	case MPOL_BIND:
295771fe804bSLee Schermerhorn 		/*
2958d69b2e63SKOSAKI Motohiro 		 * Insist on a nodelist
295971fe804bSLee Schermerhorn 		 */
2960d69b2e63SKOSAKI Motohiro 		if (!nodelist)
2961d69b2e63SKOSAKI Motohiro 			goto out;
2962095f1fc4SLee Schermerhorn 	}
2963095f1fc4SLee Schermerhorn 
296471fe804bSLee Schermerhorn 	mode_flags = 0;
2965095f1fc4SLee Schermerhorn 	if (flags) {
2966095f1fc4SLee Schermerhorn 		/*
2967095f1fc4SLee Schermerhorn 		 * Currently, we only support two mutually exclusive
2968095f1fc4SLee Schermerhorn 		 * mode flags.
2969095f1fc4SLee Schermerhorn 		 */
2970095f1fc4SLee Schermerhorn 		if (!strcmp(flags, "static"))
297171fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_STATIC_NODES;
2972095f1fc4SLee Schermerhorn 		else if (!strcmp(flags, "relative"))
297371fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_RELATIVE_NODES;
2974095f1fc4SLee Schermerhorn 		else
2975926f2ae0SKOSAKI Motohiro 			goto out;
2976095f1fc4SLee Schermerhorn 	}
297771fe804bSLee Schermerhorn 
297871fe804bSLee Schermerhorn 	new = mpol_new(mode, mode_flags, &nodes);
297971fe804bSLee Schermerhorn 	if (IS_ERR(new))
2980926f2ae0SKOSAKI Motohiro 		goto out;
2981926f2ae0SKOSAKI Motohiro 
2982f2a07f40SHugh Dickins 	/*
2983f2a07f40SHugh Dickins 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2984f2a07f40SHugh Dickins 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2985f2a07f40SHugh Dickins 	 */
2986269fbe72SBen Widawsky 	if (mode != MPOL_PREFERRED) {
2987269fbe72SBen Widawsky 		new->nodes = nodes;
2988269fbe72SBen Widawsky 	} else if (nodelist) {
2989269fbe72SBen Widawsky 		nodes_clear(new->nodes);
2990269fbe72SBen Widawsky 		node_set(first_node(nodes), new->nodes);
2991269fbe72SBen Widawsky 	} else {
29927858d7bcSFeng Tang 		new->mode = MPOL_LOCAL;
2993269fbe72SBen Widawsky 	}
2994f2a07f40SHugh Dickins 
2995f2a07f40SHugh Dickins 	/*
2996f2a07f40SHugh Dickins 	 * Save nodes for contextualization: this will be used to "clone"
2997f2a07f40SHugh Dickins 	 * the mempolicy in a specific context [cpuset] at a later time.
2998f2a07f40SHugh Dickins 	 */
2999e17f74afSLee Schermerhorn 	new->w.user_nodemask = nodes;
3000f2a07f40SHugh Dickins 
3001926f2ae0SKOSAKI Motohiro 	err = 0;
300271fe804bSLee Schermerhorn 
3003095f1fc4SLee Schermerhorn out:
3004095f1fc4SLee Schermerhorn 	/* Restore string for error message */
3005095f1fc4SLee Schermerhorn 	if (nodelist)
3006095f1fc4SLee Schermerhorn 		*--nodelist = ':';
3007095f1fc4SLee Schermerhorn 	if (flags)
3008095f1fc4SLee Schermerhorn 		*--flags = '=';
300971fe804bSLee Schermerhorn 	if (!err)
301071fe804bSLee Schermerhorn 		*mpol = new;
3011095f1fc4SLee Schermerhorn 	return err;
3012095f1fc4SLee Schermerhorn }
3013095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */
3014095f1fc4SLee Schermerhorn 
301571fe804bSLee Schermerhorn /**
301671fe804bSLee Schermerhorn  * mpol_to_str - format a mempolicy structure for printing
301771fe804bSLee Schermerhorn  * @buffer:  to contain formatted mempolicy string
301871fe804bSLee Schermerhorn  * @maxlen:  length of @buffer
301971fe804bSLee Schermerhorn  * @pol:  pointer to mempolicy to be formatted
302071fe804bSLee Schermerhorn  *
3021948927eeSDavid Rientjes  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3022948927eeSDavid Rientjes  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3023948927eeSDavid Rientjes  * longest flag, "relative", and to display at least a few node ids.
30241a75a6c8SChristoph Lameter  */
3025948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
30261a75a6c8SChristoph Lameter {
30271a75a6c8SChristoph Lameter 	char *p = buffer;
3028948927eeSDavid Rientjes 	nodemask_t nodes = NODE_MASK_NONE;
3029948927eeSDavid Rientjes 	unsigned short mode = MPOL_DEFAULT;
3030948927eeSDavid Rientjes 	unsigned short flags = 0;
30311a75a6c8SChristoph Lameter 
30328790c71aSDavid Rientjes 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3033bea904d5SLee Schermerhorn 		mode = pol->mode;
3034948927eeSDavid Rientjes 		flags = pol->flags;
3035948927eeSDavid Rientjes 	}
3036bea904d5SLee Schermerhorn 
30371a75a6c8SChristoph Lameter 	switch (mode) {
30381a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
30397858d7bcSFeng Tang 	case MPOL_LOCAL:
30401a75a6c8SChristoph Lameter 		break;
30411a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
3042b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
30431a75a6c8SChristoph Lameter 	case MPOL_BIND:
30441a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
3045269fbe72SBen Widawsky 		nodes = pol->nodes;
30461a75a6c8SChristoph Lameter 		break;
30471a75a6c8SChristoph Lameter 	default:
3048948927eeSDavid Rientjes 		WARN_ON_ONCE(1);
3049948927eeSDavid Rientjes 		snprintf(p, maxlen, "unknown");
3050948927eeSDavid Rientjes 		return;
30511a75a6c8SChristoph Lameter 	}
30521a75a6c8SChristoph Lameter 
3053b7a9f420SDavid Rientjes 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
30541a75a6c8SChristoph Lameter 
3055fc36b8d3SLee Schermerhorn 	if (flags & MPOL_MODE_FLAGS) {
3056948927eeSDavid Rientjes 		p += snprintf(p, buffer + maxlen - p, "=");
3057f5b087b5SDavid Rientjes 
30582291990aSLee Schermerhorn 		/*
30592291990aSLee Schermerhorn 		 * Currently, the only defined flags are mutually exclusive
30602291990aSLee Schermerhorn 		 */
3061f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
30622291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "static");
30632291990aSLee Schermerhorn 		else if (flags & MPOL_F_RELATIVE_NODES)
30642291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "relative");
3065f5b087b5SDavid Rientjes 	}
3066f5b087b5SDavid Rientjes 
30679e763e0fSTejun Heo 	if (!nodes_empty(nodes))
30689e763e0fSTejun Heo 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
30699e763e0fSTejun Heo 			       nodemask_pr_args(&nodes));
30701a75a6c8SChristoph Lameter }
3071