// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support the following policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
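
/*
 * Illustrative only, not part of the kernel proper: userspace selects the
 * policies documented above through the set_mempolicy(2) and mbind(2)
 * system calls (usually via libnuma).  A minimal sketch, assuming nodes 0
 * and 1 exist and "buf"/"len" describe an existing mapping:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask), 0);
 *
 * The first call installs the per-process policy handled below; the second
 * attaches a per-VMA policy, which takes precedence on page faults.
 */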

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in @state.
 *
 * Return: this @node if it is in @state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (state >= NR_NODE_STATES)
		return -EINVAL;

	if (node == NUMA_NO_NODE || node_state(node, state))
		return node;

	min_node = node;
	for_each_node_state(n, state) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
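
/*
 * Usage sketch (illustrative, not called from this file): a caller that
 * wants memory close to a given CPU could do
 *
 *	int nid = numa_nearest_node(cpu_to_node(cpu), N_MEMORY);
 *
 * which returns cpu_to_node(cpu) itself when that node has memory, or the
 * node with the smallest node_distance() that does.
 */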

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

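/*
 * Map a user-supplied "relative" nodemask onto the currently allowed nodes.
 * Worked example (illustrative): with *orig = {0,2} and *rel = {4,5,6},
 * nodes_fold() wraps *orig modulo nodes_weight(*rel) = 3, leaving {0,2},
 * and nodes_onto() then maps bit n of that onto the n-th node of *rel,
 * giving *ret = {4,6}.
 */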
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

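/*
 * Illustrative note for the default (no MPOL_F_STATIC/RELATIVE_NODES) case
 * of the rebind below: if pol->nodes = {1} was installed while the cpuset
 * allowed {0,1} and the cpuset now allows {2,3}, nodes_remap() maps node 1,
 * the second old allowed node, onto node 3, the second new allowed node.
 */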
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
			     unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
	bool has_unmovable;
};

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
					struct queue_pages *qp)
{
	int nid = folio_nid(folio);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_folios_pmd() has three possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing folio was already on a node that does not follow the
 *        policy.
 */
static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	folio = pfn_folio(pmd_pfn(*pmd));
	if (is_huge_zero_page(&folio->page)) {
		walk->action = ACTION_CONTINUE;
		goto unlock;
	}
	if (!queue_folio_required(folio, qp))
		goto unlock;

	flags = qp->flags;
	/* go to folio migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->has_unmovable = true;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_folios_pte_range() has three possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
 *        on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl)
		return queue_folios_pmd(pmd, ptl, addr, end, walk);

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/*
			 * MPOL_MF_STRICT must be specified if we get here.
			 * Continue walking vmas due to MPOL_MF_MOVE* flags.
			 */
			if (!vma_migratable(vma))
				qp->has_unmovable = true;

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need migrate other LRU pages.
			 */
			if (migrate_folio_add(folio, qp->pagelist, flags))
				qp->has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	return addr != end ? -EIO : 0;
}

static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced folio and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * Detecting misplaced folio but allow migrating folios which
		 * have been queued.
		 */
		qp->has_unmovable = true;
		goto unlock;
	}

	/*
	 * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
	 * is shared it is likely not worth migrating.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated mapcount of the folio instead.
	 */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
	     !hugetlb_pmd_shared(pte))) {
		if (!isolate_hugetlb(folio, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate folio but allow migrating pages
			 * which have been queued.
			 */
			qp->has_unmovable = true;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			       unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				 struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		    (qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
	    ((vma->vm_end < qp->end) &&
	     (!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
		    !(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which is
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		  nodemask_t *nodes, unsigned long flags,
		  struct list_head *pagelist, bool lock_vma)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
		.has_unmovable = false,
	};
	const struct mm_walk_ops *ops = lock_vma ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (qp.has_unmovable)
		err = 1;
	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
			      struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		       struct vm_area_struct **prev, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *merged;
	unsigned long vmstart, vmend;
	pgoff_t pgoff;
	int err;

	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	if (mpol_equal(vma_policy(vma), new_pol)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
			   vma->anon_vma, vma->vm_file, pgoff, new_pol,
			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (merged) {
		*prev = merged;
		return vma_replace_policy(merged, new_pol);
	}

	if (vma->vm_start != vmstart) {
		err = split_vma(vmi, vma, vmstart, 1);
		if (err)
			return err;
	}

	if (vma->vm_end != vmend) {
		err = split_vma(vmi, vma, vmend, 0);
		if (err)
			return err;
	}

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		*nodes = p->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						   pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

1034b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)10354a64981dSVishal Moola (Oracle) static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1036fc301289SChristoph Lameter unsigned long flags)
10376ce3c4c0SChristoph Lameter {
10386ce3c4c0SChristoph Lameter /*
10394a64981dSVishal Moola (Oracle) * We try to migrate only unshared folios. If it is shared it
10404a64981dSVishal Moola (Oracle) * is likely not worth migrating.
10414a64981dSVishal Moola (Oracle) *
10424a64981dSVishal Moola (Oracle) * To check if the folio is shared, ideally we want to make sure
10434a64981dSVishal Moola (Oracle) * every page is mapped to the same process. Doing that is very
10444a64981dSVishal Moola (Oracle) * expensive, so check the estimated mapcount of the folio instead.
10456ce3c4c0SChristoph Lameter */
10464a64981dSVishal Moola (Oracle) if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1047be2d5756SBaolin Wang if (folio_isolate_lru(folio)) {
10484a64981dSVishal Moola (Oracle) list_add_tail(&folio->lru, foliolist);
10494a64981dSVishal Moola (Oracle) node_stat_mod_folio(folio,
10504a64981dSVishal Moola (Oracle) NR_ISOLATED_ANON + folio_is_file_lru(folio),
10514a64981dSVishal Moola (Oracle) folio_nr_pages(folio));
1052a53190a4SYang Shi } else if (flags & MPOL_MF_STRICT) {
1053a53190a4SYang Shi /*
10544a64981dSVishal Moola (Oracle) * Non-movable folio may reach here. And, there may be
10554a64981dSVishal Moola (Oracle) * temporary off LRU folios or non-LRU movable folios.
10564a64981dSVishal Moola (Oracle) * Treat them as unmovable folios since they can't be
1057a53190a4SYang Shi * isolated, so they can't be moved at the moment. It
1058a53190a4SYang Shi * should return -EIO for this case too.
1059a53190a4SYang Shi */
1060a53190a4SYang Shi return -EIO;
106162695a84SNick Piggin }
106262695a84SNick Piggin }
1063a53190a4SYang Shi
1064a53190a4SYang Shi return 0;
10656ce3c4c0SChristoph Lameter }
10666ce3c4c0SChristoph Lameter
10676ce3c4c0SChristoph Lameter /*
10687e2ab150SChristoph Lameter * Migrate pages from one node to a target node.
10697e2ab150SChristoph Lameter * Returns error or the number of pages not migrated.
10707e2ab150SChristoph Lameter */
1071dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1072dbcb0f19SAdrian Bunk int flags)
10737e2ab150SChristoph Lameter {
10747e2ab150SChristoph Lameter nodemask_t nmask;
107566850be5SLiam R. Howlett struct vm_area_struct *vma;
10767e2ab150SChristoph Lameter LIST_HEAD(pagelist);
10777e2ab150SChristoph Lameter int err = 0;
1078a0976311SJoonsoo Kim struct migration_target_control mtc = {
1079a0976311SJoonsoo Kim .nid = dest,
1080a0976311SJoonsoo Kim .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1081a0976311SJoonsoo Kim };
10827e2ab150SChristoph Lameter
10837e2ab150SChristoph Lameter nodes_clear(nmask);
10847e2ab150SChristoph Lameter node_set(source, nmask);
10857e2ab150SChristoph Lameter
108608270807SMinchan Kim /*
108708270807SMinchan Kim * This does not "check" the range but isolates all pages that
108808270807SMinchan Kim * need migration. Between passing in the full user address
108908270807SMinchan Kim * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
109008270807SMinchan Kim */
109166850be5SLiam R. Howlett vma = find_vma(mm, 0);
109208270807SMinchan Kim VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
109366850be5SLiam R. Howlett queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
109449b06385SSuren Baghdasaryan flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
10957e2ab150SChristoph Lameter
1096cf608ac1SMinchan Kim if (!list_empty(&pagelist)) {
1097a0976311SJoonsoo Kim err = migrate_pages(&pagelist, alloc_migration_target, NULL,
10985ac95884SYang Shi (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1099cf608ac1SMinchan Kim if (err)
1100e2d8cf40SNaoya Horiguchi putback_movable_pages(&pagelist);
1101cf608ac1SMinchan Kim }
110295a402c3SChristoph Lameter
11037e2ab150SChristoph Lameter return err;
11047e2ab150SChristoph Lameter }
11057e2ab150SChristoph Lameter
11067e2ab150SChristoph Lameter /*
11077e2ab150SChristoph Lameter * Move pages between the two nodesets so as to preserve the physical
11087e2ab150SChristoph Lameter * layout as much as possible.
110939743889SChristoph Lameter *
111039743889SChristoph Lameter * Returns the number of pages that could not be moved.
111139743889SChristoph Lameter */
11120ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
11130ce72d4fSAndrew Morton const nodemask_t *to, int flags)
111439743889SChristoph Lameter {
11157e2ab150SChristoph Lameter int busy = 0;
1116f555befdSJan Stancek int err = 0;
11177e2ab150SChristoph Lameter nodemask_t tmp;
111839743889SChristoph Lameter
1119361a2a22SMinchan Kim lru_cache_disable();
11200aedadf9SChristoph Lameter
1121d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
1122d4984711SChristoph Lameter
11237e2ab150SChristoph Lameter /*
11247e2ab150SChristoph Lameter * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
11257e2ab150SChristoph Lameter * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
11267e2ab150SChristoph Lameter * bit in 'tmp', and return that <source, dest> pair for migration.
11277e2ab150SChristoph Lameter * The pair of nodemasks 'to' and 'from' define the map.
11287e2ab150SChristoph Lameter *
11297e2ab150SChristoph Lameter * If no pair of bits is found that way, fallback to picking some
11307e2ab150SChristoph Lameter * pair of 'source' and 'dest' bits that are not the same. If the
11317e2ab150SChristoph Lameter * 'source' and 'dest' bits are the same, this represents a node
11327e2ab150SChristoph Lameter * that will be migrating to itself, so no pages need move.
11337e2ab150SChristoph Lameter *
11347e2ab150SChristoph Lameter * If no bits are left in 'tmp', or if all remaining bits left
11357e2ab150SChristoph Lameter * in 'tmp' correspond to the same bit in 'to', return false
11367e2ab150SChristoph Lameter * (nothing left to migrate).
11377e2ab150SChristoph Lameter *
11387e2ab150SChristoph Lameter * This lets us pick a pair of nodes to migrate between, such that
11397e2ab150SChristoph Lameter * if possible the dest node is not already occupied by some other
11407e2ab150SChristoph Lameter * source node, minimizing the risk of overloading the memory on a
11417e2ab150SChristoph Lameter * node that would happen if we migrated incoming memory to a node
11427e2ab150SChristoph Lameter * before migrating outgoing memory source that same node.
11437e2ab150SChristoph Lameter *
11447e2ab150SChristoph Lameter * A single scan of tmp is sufficient. As we go, we remember the
11457e2ab150SChristoph Lameter * most recent <s, d> pair that moved (s != d). If we find a pair
11467e2ab150SChristoph Lameter * that not only moved, but what's better, moved to an empty slot
11477e2ab150SChristoph Lameter * (d is not set in tmp), then we break out immediately, with that pair.
1148ae0e47f0SJustin P. Mattock * Otherwise when we finish scanning 'tmp', we at least have the
11497e2ab150SChristoph Lameter * most recent <s, d> pair that moved. If we get all the way through
11507e2ab150SChristoph Lameter * the scan of tmp without finding any node that moved, much less
11517e2ab150SChristoph Lameter * moved to an empty node, then there is nothing left worth migrating.
11527e2ab150SChristoph Lameter */
11537e2ab150SChristoph Lameter
11540ce72d4fSAndrew Morton tmp = *from;
11557e2ab150SChristoph Lameter while (!nodes_empty(tmp)) {
11567e2ab150SChristoph Lameter int s, d;
1157b76ac7e7SJianguo Wu int source = NUMA_NO_NODE;
11587e2ab150SChristoph Lameter int dest = 0;
11597e2ab150SChristoph Lameter
11607e2ab150SChristoph Lameter for_each_node_mask(s, tmp) {
11614a5b18ccSLarry Woodman
11624a5b18ccSLarry Woodman /*
11634a5b18ccSLarry Woodman * do_migrate_pages() tries to maintain the relative
11644a5b18ccSLarry Woodman * node relationship of the pages established between
11654a5b18ccSLarry Woodman * threads and memory areas.
11664a5b18ccSLarry Woodman *
11674a5b18ccSLarry Woodman * However if the number of source nodes is not equal to
11684a5b18ccSLarry Woodman * the number of destination nodes we can not preserve
11694a5b18ccSLarry Woodman * this node relative relationship. In that case, skip
11704a5b18ccSLarry Woodman * copying memory from a node that is in the destination
11714a5b18ccSLarry Woodman * mask.
11724a5b18ccSLarry Woodman *
11734a5b18ccSLarry Woodman * Example: [2,3,4] -> [3,4,5] moves everything.
11744a5b18ccSLarry Woodman * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
11754a5b18ccSLarry Woodman */
11764a5b18ccSLarry Woodman
11770ce72d4fSAndrew Morton if ((nodes_weight(*from) != nodes_weight(*to)) &&
11780ce72d4fSAndrew Morton (node_isset(s, *to)))
11794a5b18ccSLarry Woodman continue;
11804a5b18ccSLarry Woodman
11810ce72d4fSAndrew Morton d = node_remap(s, *from, *to);
11827e2ab150SChristoph Lameter if (s == d)
11837e2ab150SChristoph Lameter continue;
11847e2ab150SChristoph Lameter
11857e2ab150SChristoph Lameter source = s; /* Node moved. Memorize */
11867e2ab150SChristoph Lameter dest = d;
11877e2ab150SChristoph Lameter
11887e2ab150SChristoph Lameter /* dest not in remaining from nodes? */
11897e2ab150SChristoph Lameter if (!node_isset(dest, tmp))
11907e2ab150SChristoph Lameter break;
11917e2ab150SChristoph Lameter }
1192b76ac7e7SJianguo Wu if (source == NUMA_NO_NODE)
11937e2ab150SChristoph Lameter break;
11947e2ab150SChristoph Lameter
11957e2ab150SChristoph Lameter node_clear(source, tmp);
11967e2ab150SChristoph Lameter err = migrate_to_node(mm, source, dest, flags);
11977e2ab150SChristoph Lameter if (err > 0)
11987e2ab150SChristoph Lameter busy += err;
11997e2ab150SChristoph Lameter if (err < 0)
12007e2ab150SChristoph Lameter break;
120139743889SChristoph Lameter }
1202d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1203d479960eSMinchan Kim
1204361a2a22SMinchan Kim lru_cache_enable();
12057e2ab150SChristoph Lameter if (err < 0)
12067e2ab150SChristoph Lameter return err;
12077e2ab150SChristoph Lameter return busy;
1208b20a3503SChristoph Lameter
120939743889SChristoph Lameter }
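/*
 * Illustrative trace of the pair-picking loop above, for reference only
 * (not part of the kernel source).  Assume from = {0,1} and to = {2,3};
 * node_remap() maps the n-th set bit of 'from' to the n-th set bit of 'to':
 *
 *	tmp = {0,1}
 *	pass 1: s = 0, d = node_remap(0, {0,1}, {2,3}) = 2
 *	        2 is not in tmp -> break out early, migrate 0 -> 2
 *	tmp = {1}
 *	pass 2: s = 1, d = node_remap(1, {0,1}, {2,3}) = 3
 *	        3 is not in tmp -> migrate 1 -> 3
 *	tmp = {} -> done; 'busy' accumulates the pages that could not move
 */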
121039743889SChristoph Lameter
12113ad33b24SLee Schermerhorn /*
12123ad33b24SLee Schermerhorn * Allocate a new page for page migration based on vma policy.
1213d05f0cdcSHugh Dickins * Start by assuming the page is mapped by the same vma that contains @start.
12143ad33b24SLee Schermerhorn * Search forward from there, if not. N.B., this assumes that the
12153ad33b24SLee Schermerhorn * list of pages handed to migrate_pages()--which is how we get here--
12163ad33b24SLee Schermerhorn * is in virtual address order.
12173ad33b24SLee Schermerhorn */
12184e096ae1SMatthew Wilcox (Oracle) static struct folio *new_folio(struct folio *src, unsigned long start)
121995a402c3SChristoph Lameter {
1220d05f0cdcSHugh Dickins struct vm_area_struct *vma;
12213f649ab7SKees Cook unsigned long address;
122266850be5SLiam R. Howlett VMA_ITERATOR(vmi, current->mm, start);
1223ec4858e0SMatthew Wilcox (Oracle) gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
122495a402c3SChristoph Lameter
122566850be5SLiam R. Howlett for_each_vma(vmi, vma) {
12264e096ae1SMatthew Wilcox (Oracle) address = page_address_in_vma(&src->page, vma);
12273ad33b24SLee Schermerhorn if (address != -EFAULT)
12283ad33b24SLee Schermerhorn break;
12293ad33b24SLee Schermerhorn }
12303ad33b24SLee Schermerhorn
1231d0ce0e47SSidhartha Kumar if (folio_test_hugetlb(src)) {
12324e096ae1SMatthew Wilcox (Oracle) return alloc_hugetlb_folio_vma(folio_hstate(src),
1233389c8178SMichal Hocko vma, address);
1234d0ce0e47SSidhartha Kumar }
1235c8633798SNaoya Horiguchi
1236ec4858e0SMatthew Wilcox (Oracle) if (folio_test_large(src))
1237ec4858e0SMatthew Wilcox (Oracle) gfp = GFP_TRANSHUGE;
1238ec4858e0SMatthew Wilcox (Oracle)
123911c731e8SWanpeng Li /*
1240ec4858e0SMatthew Wilcox (Oracle) * if !vma, vma_alloc_folio() will use task or system default policy
124111c731e8SWanpeng Li */
12424e096ae1SMatthew Wilcox (Oracle) return vma_alloc_folio(gfp, folio_order(src), vma, address,
1243ec4858e0SMatthew Wilcox (Oracle) folio_test_large(src));
124495a402c3SChristoph Lameter }
1245b20a3503SChristoph Lameter #else
1246b20a3503SChristoph Lameter
12474a64981dSVishal Moola (Oracle) static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1248b20a3503SChristoph Lameter unsigned long flags)
1249b20a3503SChristoph Lameter {
1250a53190a4SYang Shi return -EIO;
1251b20a3503SChristoph Lameter }
1252b20a3503SChristoph Lameter
12530ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
12540ce72d4fSAndrew Morton const nodemask_t *to, int flags)
1255b20a3503SChristoph Lameter {
1256b20a3503SChristoph Lameter return -ENOSYS;
1257b20a3503SChristoph Lameter }
125895a402c3SChristoph Lameter
12594e096ae1SMatthew Wilcox (Oracle) static struct folio *new_folio(struct folio *src, unsigned long start)
126095a402c3SChristoph Lameter {
126195a402c3SChristoph Lameter return NULL;
126295a402c3SChristoph Lameter }
1263b20a3503SChristoph Lameter #endif
1264b20a3503SChristoph Lameter
1265dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
1266028fec41SDavid Rientjes unsigned short mode, unsigned short mode_flags,
1267028fec41SDavid Rientjes nodemask_t *nmask, unsigned long flags)
12686ce3c4c0SChristoph Lameter {
12696ce3c4c0SChristoph Lameter struct mm_struct *mm = current->mm;
1270f4e9e0e6SLiam R. Howlett struct vm_area_struct *vma, *prev;
1271f4e9e0e6SLiam R. Howlett struct vma_iterator vmi;
12726ce3c4c0SChristoph Lameter struct mempolicy *new;
12736ce3c4c0SChristoph Lameter unsigned long end;
12746ce3c4c0SChristoph Lameter int err;
1275d8835445SYang Shi int ret;
12766ce3c4c0SChristoph Lameter LIST_HEAD(pagelist);
12776ce3c4c0SChristoph Lameter
1278b24f53a0SLee Schermerhorn if (flags & ~(unsigned long)MPOL_MF_VALID)
12796ce3c4c0SChristoph Lameter return -EINVAL;
128074c00241SChristoph Lameter if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
12816ce3c4c0SChristoph Lameter return -EPERM;
12826ce3c4c0SChristoph Lameter
12836ce3c4c0SChristoph Lameter if (start & ~PAGE_MASK)
12846ce3c4c0SChristoph Lameter return -EINVAL;
12856ce3c4c0SChristoph Lameter
12866ce3c4c0SChristoph Lameter if (mode == MPOL_DEFAULT)
12876ce3c4c0SChristoph Lameter flags &= ~MPOL_MF_STRICT;
12886ce3c4c0SChristoph Lameter
1289aaa31e05Sze zuo len = PAGE_ALIGN(len);
12906ce3c4c0SChristoph Lameter end = start + len;
12916ce3c4c0SChristoph Lameter
12926ce3c4c0SChristoph Lameter if (end < start)
12936ce3c4c0SChristoph Lameter return -EINVAL;
12946ce3c4c0SChristoph Lameter if (end == start)
12956ce3c4c0SChristoph Lameter return 0;
12966ce3c4c0SChristoph Lameter
1297028fec41SDavid Rientjes new = mpol_new(mode, mode_flags, nmask);
12986ce3c4c0SChristoph Lameter if (IS_ERR(new))
12996ce3c4c0SChristoph Lameter return PTR_ERR(new);
13006ce3c4c0SChristoph Lameter
1301b24f53a0SLee Schermerhorn if (flags & MPOL_MF_LAZY)
1302b24f53a0SLee Schermerhorn new->flags |= MPOL_F_MOF;
1303b24f53a0SLee Schermerhorn
13046ce3c4c0SChristoph Lameter /*
13056ce3c4c0SChristoph Lameter * If we are using the default policy then operation
13066ce3c4c0SChristoph Lameter * on discontinuous address spaces is okay after all
13076ce3c4c0SChristoph Lameter */
13086ce3c4c0SChristoph Lameter if (!new)
13096ce3c4c0SChristoph Lameter flags |= MPOL_MF_DISCONTIG_OK;
13106ce3c4c0SChristoph Lameter
1311028fec41SDavid Rientjes pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1312028fec41SDavid Rientjes start, start + len, mode, mode_flags,
131300ef2d2fSDavid Rientjes nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
13146ce3c4c0SChristoph Lameter
13150aedadf9SChristoph Lameter if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
13160aedadf9SChristoph Lameter
1317361a2a22SMinchan Kim lru_cache_disable();
13180aedadf9SChristoph Lameter }
13194bfc4495SKAMEZAWA Hiroyuki {
13204bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch);
13214bfc4495SKAMEZAWA Hiroyuki if (scratch) {
1322d8ed45c5SMichel Lespinasse mmap_write_lock(mm);
13234bfc4495SKAMEZAWA Hiroyuki err = mpol_set_nodemask(new, nmask, scratch);
13244bfc4495SKAMEZAWA Hiroyuki if (err)
1325d8ed45c5SMichel Lespinasse mmap_write_unlock(mm);
13264bfc4495SKAMEZAWA Hiroyuki } else
13274bfc4495SKAMEZAWA Hiroyuki err = -ENOMEM;
13284bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch);
13294bfc4495SKAMEZAWA Hiroyuki }
1330b05ca738SKOSAKI Motohiro if (err)
1331b05ca738SKOSAKI Motohiro goto mpol_out;
1332b05ca738SKOSAKI Motohiro
13336c21e066SJann Horn /*
13346c21e066SJann Horn * Lock the VMAs before scanning for pages to migrate, to ensure we don't
13356c21e066SJann Horn * miss a concurrently inserted page.
13366c21e066SJann Horn */
1337d8835445SYang Shi ret = queue_pages_range(mm, start, end, nmask,
133849b06385SSuren Baghdasaryan flags | MPOL_MF_INVERT, &pagelist, true);
1339d8835445SYang Shi
1340d8835445SYang Shi if (ret < 0) {
1341a85dfc30SYang Shi err = ret;
1342d8835445SYang Shi goto up_out;
1343d8835445SYang Shi }
1344d8835445SYang Shi
1345f4e9e0e6SLiam R. Howlett vma_iter_init(&vmi, mm, start);
1346f4e9e0e6SLiam R. Howlett prev = vma_prev(&vmi);
1347f4e9e0e6SLiam R. Howlett for_each_vma_range(vmi, vma, end) {
1348f4e9e0e6SLiam R. Howlett err = mbind_range(&vmi, vma, &prev, start, end, new);
1349f4e9e0e6SLiam R. Howlett if (err)
1350f4e9e0e6SLiam R. Howlett break;
1351f4e9e0e6SLiam R. Howlett }
13527e2ab150SChristoph Lameter
1353b24f53a0SLee Schermerhorn if (!err) {
1354b24f53a0SLee Schermerhorn int nr_failed = 0;
1355b24f53a0SLee Schermerhorn
1356cf608ac1SMinchan Kim if (!list_empty(&pagelist)) {
1357b24f53a0SLee Schermerhorn WARN_ON_ONCE(flags & MPOL_MF_LAZY);
13584e096ae1SMatthew Wilcox (Oracle) nr_failed = migrate_pages(&pagelist, new_folio, NULL,
13595ac95884SYang Shi start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1360cf608ac1SMinchan Kim if (nr_failed)
136174060e4dSNaoya Horiguchi putback_movable_pages(&pagelist);
1362cf608ac1SMinchan Kim }
13636ce3c4c0SChristoph Lameter
136424526268SYang Shi if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
13656ce3c4c0SChristoph Lameter err = -EIO;
1366a85dfc30SYang Shi } else {
1367d8835445SYang Shi up_out:
1368a85dfc30SYang Shi if (!list_empty(&pagelist))
1369a85dfc30SYang Shi putback_movable_pages(&pagelist);
1370a85dfc30SYang Shi }
1371a85dfc30SYang Shi
1372d8ed45c5SMichel Lespinasse mmap_write_unlock(mm);
1373b05ca738SKOSAKI Motohiro mpol_out:
1374f0be3d32SLee Schermerhorn mpol_put(new);
1375d479960eSMinchan Kim if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1376361a2a22SMinchan Kim lru_cache_enable();
13776ce3c4c0SChristoph Lameter return err;
13786ce3c4c0SChristoph Lameter }
13796ce3c4c0SChristoph Lameter
138039743889SChristoph Lameter /*
13818bccd85fSChristoph Lameter * User space interface with variable sized bitmaps for nodelists.
13828bccd85fSChristoph Lameter */
1383e130242dSArnd Bergmann static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1384e130242dSArnd Bergmann unsigned long maxnode)
1385e130242dSArnd Bergmann {
1386e130242dSArnd Bergmann unsigned long nlongs = BITS_TO_LONGS(maxnode);
1387e130242dSArnd Bergmann int ret;
1388e130242dSArnd Bergmann
1389e130242dSArnd Bergmann if (in_compat_syscall())
1390e130242dSArnd Bergmann ret = compat_get_bitmap(mask,
1391e130242dSArnd Bergmann (const compat_ulong_t __user *)nmask,
1392e130242dSArnd Bergmann maxnode);
1393e130242dSArnd Bergmann else
1394e130242dSArnd Bergmann ret = copy_from_user(mask, nmask,
1395e130242dSArnd Bergmann nlongs * sizeof(unsigned long));
1396e130242dSArnd Bergmann
1397e130242dSArnd Bergmann if (ret)
1398e130242dSArnd Bergmann return -EFAULT;
1399e130242dSArnd Bergmann
1400e130242dSArnd Bergmann if (maxnode % BITS_PER_LONG)
1401e130242dSArnd Bergmann mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1402e130242dSArnd Bergmann
1403e130242dSArnd Bergmann return 0;
1404e130242dSArnd Bergmann }
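/*
 * Illustrative example, for reference only (not part of the kernel
 * source), assuming BITS_PER_LONG == 64: a caller passing maxnode == 70
 * makes nlongs == 2, so two words are copied and the second word is
 * masked with (1UL << (70 % 64)) - 1 == (1UL << 6) - 1, ensuring bits
 * the caller never specified cannot leak into the destination mask.
 */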
14058bccd85fSChristoph Lameter
14068bccd85fSChristoph Lameter /* Copy a node mask from user space. */
140739743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
14088bccd85fSChristoph Lameter unsigned long maxnode)
14098bccd85fSChristoph Lameter {
14108bccd85fSChristoph Lameter --maxnode;
14118bccd85fSChristoph Lameter nodes_clear(*nodes);
14128bccd85fSChristoph Lameter if (maxnode == 0 || !nmask)
14138bccd85fSChristoph Lameter return 0;
1414a9c930baSAndi Kleen if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1415636f13c1SChris Wright return -EINVAL;
14168bccd85fSChristoph Lameter
141756521e7aSYisheng Xie /*
141856521e7aSYisheng Xie * When the user specified more nodes than supported, just check
1419e130242dSArnd Bergmann * that the unsupported part is all zero, one word at a time,
1420e130242dSArnd Bergmann * starting at the end.
142156521e7aSYisheng Xie */
1422e130242dSArnd Bergmann while (maxnode > MAX_NUMNODES) {
1423e130242dSArnd Bergmann unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1424e130242dSArnd Bergmann unsigned long t;
14258bccd85fSChristoph Lameter
1426000eca5dSTianyu Li if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
142756521e7aSYisheng Xie return -EFAULT;
1428e130242dSArnd Bergmann
1429e130242dSArnd Bergmann if (maxnode - bits >= MAX_NUMNODES) {
1430e130242dSArnd Bergmann maxnode -= bits;
1431e130242dSArnd Bergmann } else {
1432e130242dSArnd Bergmann maxnode = MAX_NUMNODES;
1433e130242dSArnd Bergmann t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1434e130242dSArnd Bergmann }
1435e130242dSArnd Bergmann if (t)
143656521e7aSYisheng Xie return -EINVAL;
143756521e7aSYisheng Xie }
143856521e7aSYisheng Xie
1439e130242dSArnd Bergmann return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
14408bccd85fSChristoph Lameter }
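/*
 * Illustrative trace of get_nodes(), for reference only (not part of the
 * kernel source), assuming MAX_NUMNODES == 64 and BITS_PER_LONG == 64
 * purely for the example.  A caller passes maxnode = 129, i.e. two full
 * words of user nodemask plus one trailing bit:
 *
 *	--maxnode                        -> maxnode = 128
 *	128 > MAX_NUMNODES               -> bits = 64
 *	get_bitmap(&t, &nmask[1], 64)    -> fetch the unsupported word
 *	maxnode -= 64                    -> maxnode = 64, loop will exit
 *	t != 0                           -> return -EINVAL
 *	otherwise                        -> get_bitmap(nodes, nmask, 64)
 *	                                    copies the one supported word
 */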
14418bccd85fSChristoph Lameter
14428bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
14438bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
14448bccd85fSChristoph Lameter nodemask_t *nodes)
14458bccd85fSChristoph Lameter {
14468bccd85fSChristoph Lameter unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1447050c17f2SRalph Campbell unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1448e130242dSArnd Bergmann bool compat = in_compat_syscall();
1449e130242dSArnd Bergmann
1450e130242dSArnd Bergmann if (compat)
1451e130242dSArnd Bergmann nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
14528bccd85fSChristoph Lameter
14538bccd85fSChristoph Lameter if (copy > nbytes) {
14548bccd85fSChristoph Lameter if (copy > PAGE_SIZE)
14558bccd85fSChristoph Lameter return -EINVAL;
14568bccd85fSChristoph Lameter if (clear_user((char __user *)mask + nbytes, copy - nbytes))
14578bccd85fSChristoph Lameter return -EFAULT;
14588bccd85fSChristoph Lameter copy = nbytes;
1459e130242dSArnd Bergmann maxnode = nr_node_ids;
14608bccd85fSChristoph Lameter }
1461e130242dSArnd Bergmann
1462e130242dSArnd Bergmann if (compat)
1463e130242dSArnd Bergmann return compat_put_bitmap((compat_ulong_t __user *)mask,
1464e130242dSArnd Bergmann nodes_addr(*nodes), maxnode);
1465e130242dSArnd Bergmann
14668bccd85fSChristoph Lameter return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
14678bccd85fSChristoph Lameter }
14688bccd85fSChristoph Lameter
146995837924SFeng Tang /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
147095837924SFeng Tang static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
147195837924SFeng Tang {
147295837924SFeng Tang *flags = *mode & MPOL_MODE_FLAGS;
147395837924SFeng Tang *mode &= ~MPOL_MODE_FLAGS;
1474b27abaccSDave Hansen
1475a38a59fdSBen Widawsky if ((unsigned int)(*mode) >= MPOL_MAX)
147695837924SFeng Tang return -EINVAL;
147795837924SFeng Tang if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
147895837924SFeng Tang return -EINVAL;
14796d2aec9eSEric Dumazet if (*flags & MPOL_F_NUMA_BALANCING) {
14806d2aec9eSEric Dumazet if (*mode != MPOL_BIND)
14816d2aec9eSEric Dumazet return -EINVAL;
14826d2aec9eSEric Dumazet *flags |= (MPOL_F_MOF | MPOL_F_MORON);
14836d2aec9eSEric Dumazet }
148495837924SFeng Tang return 0;
148595837924SFeng Tang }
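/*
 * Illustrative example, for reference only (not part of the kernel
 * source): userspace packs the optional mode flags into the same integer
 * as the mode, e.g.
 *
 *	mode = MPOL_INTERLEAVE | MPOL_F_STATIC_NODES;
 *
 * which sanitize_mpol_flags() splits into *mode == MPOL_INTERLEAVE and
 * *flags == MPOL_F_STATIC_NODES.  Combining MPOL_F_STATIC_NODES with
 * MPOL_F_RELATIVE_NODES, or using MPOL_F_NUMA_BALANCING with any mode
 * other than MPOL_BIND, is rejected with -EINVAL.
 */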
148695837924SFeng Tang
1487e7dc9ad6SDominik Brodowski static long kernel_mbind(unsigned long start, unsigned long len,
1488e7dc9ad6SDominik Brodowski unsigned long mode, const unsigned long __user *nmask,
1489e7dc9ad6SDominik Brodowski unsigned long maxnode, unsigned int flags)
14908bccd85fSChristoph Lameter {
1491028fec41SDavid Rientjes unsigned short mode_flags;
149295837924SFeng Tang nodemask_t nodes;
149395837924SFeng Tang int lmode = mode;
149495837924SFeng Tang int err;
14958bccd85fSChristoph Lameter
1496057d3389SAndrey Konovalov start = untagged_addr(start);
149795837924SFeng Tang err = sanitize_mpol_flags(&lmode, &mode_flags);
149895837924SFeng Tang if (err)
149995837924SFeng Tang return err;
150095837924SFeng Tang
15018bccd85fSChristoph Lameter err = get_nodes(&nodes, nmask, maxnode);
15028bccd85fSChristoph Lameter if (err)
15038bccd85fSChristoph Lameter return err;
150495837924SFeng Tang
150595837924SFeng Tang return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
15068bccd85fSChristoph Lameter }
15078bccd85fSChristoph Lameter
1508c6018b4bSAneesh Kumar K.V SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1509c6018b4bSAneesh Kumar K.V unsigned long, home_node, unsigned long, flags)
1510c6018b4bSAneesh Kumar K.V {
1511c6018b4bSAneesh Kumar K.V struct mm_struct *mm = current->mm;
1512f4e9e0e6SLiam R. Howlett struct vm_area_struct *vma, *prev;
1513e976936cSMichal Hocko struct mempolicy *new, *old;
1514c6018b4bSAneesh Kumar K.V unsigned long end;
1515c6018b4bSAneesh Kumar K.V int err = -ENOENT;
151666850be5SLiam R. Howlett VMA_ITERATOR(vmi, mm, start);
1517c6018b4bSAneesh Kumar K.V
1518c6018b4bSAneesh Kumar K.V start = untagged_addr(start);
1519c6018b4bSAneesh Kumar K.V if (start & ~PAGE_MASK)
1520c6018b4bSAneesh Kumar K.V return -EINVAL;
1521c6018b4bSAneesh Kumar K.V /*
1522c6018b4bSAneesh Kumar K.V * flags is reserved for future extensions and must be zero for now.
1523c6018b4bSAneesh Kumar K.V */
1524c6018b4bSAneesh Kumar K.V if (flags != 0)
1525c6018b4bSAneesh Kumar K.V return -EINVAL;
1526c6018b4bSAneesh Kumar K.V
1527c6018b4bSAneesh Kumar K.V /*
1528c6018b4bSAneesh Kumar K.V * Check home_node is online to avoid accessing uninitialized
1529c6018b4bSAneesh Kumar K.V * NODE_DATA.
1530c6018b4bSAneesh Kumar K.V */
1531c6018b4bSAneesh Kumar K.V if (home_node >= MAX_NUMNODES || !node_online(home_node))
1532c6018b4bSAneesh Kumar K.V return -EINVAL;
1533c6018b4bSAneesh Kumar K.V
1534aaa31e05Sze zuo len = PAGE_ALIGN(len);
1535c6018b4bSAneesh Kumar K.V end = start + len;
1536c6018b4bSAneesh Kumar K.V
1537c6018b4bSAneesh Kumar K.V if (end < start)
1538c6018b4bSAneesh Kumar K.V return -EINVAL;
1539c6018b4bSAneesh Kumar K.V if (end == start)
1540c6018b4bSAneesh Kumar K.V return 0;
1541c6018b4bSAneesh Kumar K.V mmap_write_lock(mm);
1542f4e9e0e6SLiam R. Howlett prev = vma_prev(&vmi);
154366850be5SLiam R. Howlett for_each_vma_range(vmi, vma, end) {
1544c6018b4bSAneesh Kumar K.V /*
1545c6018b4bSAneesh Kumar K.V * If any vma in the range has a policy other than MPOL_BIND
1546c6018b4bSAneesh Kumar K.V * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1547c6018b4bSAneesh Kumar K.V * the home node for vmas we already updated before.
1548c6018b4bSAneesh Kumar K.V */
1549e976936cSMichal Hocko old = vma_policy(vma);
155051f62537SLiam R. Howlett if (!old) {
155151f62537SLiam R. Howlett prev = vma;
1552e976936cSMichal Hocko continue;
155351f62537SLiam R. Howlett }
1554e976936cSMichal Hocko if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1555c6018b4bSAneesh Kumar K.V err = -EOPNOTSUPP;
1556c6018b4bSAneesh Kumar K.V break;
1557c6018b4bSAneesh Kumar K.V }
1558e976936cSMichal Hocko new = mpol_dup(old);
1559e976936cSMichal Hocko if (IS_ERR(new)) {
1560e976936cSMichal Hocko err = PTR_ERR(new);
1561e976936cSMichal Hocko break;
1562e976936cSMichal Hocko }
1563c6018b4bSAneesh Kumar K.V
15646c21e066SJann Horn vma_start_write(vma);
1565c6018b4bSAneesh Kumar K.V new->home_node = home_node;
1566f4e9e0e6SLiam R. Howlett err = mbind_range(&vmi, vma, &prev, start, end, new);
1567c6018b4bSAneesh Kumar K.V mpol_put(new);
1568c6018b4bSAneesh Kumar K.V if (err)
1569c6018b4bSAneesh Kumar K.V break;
1570c6018b4bSAneesh Kumar K.V }
1571c6018b4bSAneesh Kumar K.V mmap_write_unlock(mm);
1572c6018b4bSAneesh Kumar K.V return err;
1573c6018b4bSAneesh Kumar K.V }
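/*
 * Illustrative userspace sketch, for reference only (not part of the
 * kernel source).  There is no glibc wrapper for this syscall at the time
 * of writing, so the raw syscall is issued directly, assuming the local
 * <sys/syscall.h> defines __NR_set_mempolicy_home_node.  'addr' and 'len'
 * are a range previously given an MPOL_BIND or MPOL_PREFERRED_MANY policy
 * via mbind(2); flags must be 0.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long set_home_node(void *addr, unsigned long len, int node)
 *	{
 *		return syscall(__NR_set_mempolicy_home_node,
 *			       (unsigned long)addr, len,
 *			       (unsigned long)node, 0UL);
 *	}
 */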
1574c6018b4bSAneesh Kumar K.V
1575e7dc9ad6SDominik Brodowski SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1576e7dc9ad6SDominik Brodowski unsigned long, mode, const unsigned long __user *, nmask,
1577e7dc9ad6SDominik Brodowski unsigned long, maxnode, unsigned int, flags)
1578e7dc9ad6SDominik Brodowski {
1579e7dc9ad6SDominik Brodowski return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1580e7dc9ad6SDominik Brodowski }
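/*
 * Illustrative userspace sketch, for reference only (not part of the
 * kernel source), assuming libnuma's <numaif.h> wrappers (link with
 * -lnuma) and a machine with at least two nodes: interleave an anonymous
 * mapping across nodes 0 and 1 before the pages are first touched, so
 * the policy takes effect at allocation time.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 64UL << 20;
 *		unsigned long nodes = 0x3;	// nodes 0 and 1
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		if (mbind(p, len, MPOL_INTERLEAVE, &nodes,
 *			  8 * sizeof(nodes), 0))
 *			perror("mbind");
 *		return 0;
 *	}
 */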
1581e7dc9ad6SDominik Brodowski
15828bccd85fSChristoph Lameter /* Set the process memory policy */
1583af03c4acSDominik Brodowski static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1584af03c4acSDominik Brodowski unsigned long maxnode)
15858bccd85fSChristoph Lameter {
158695837924SFeng Tang unsigned short mode_flags;
15878bccd85fSChristoph Lameter nodemask_t nodes;
158895837924SFeng Tang int lmode = mode;
158995837924SFeng Tang int err;
15908bccd85fSChristoph Lameter
159195837924SFeng Tang err = sanitize_mpol_flags(&lmode, &mode_flags);
159295837924SFeng Tang if (err)
159395837924SFeng Tang return err;
159495837924SFeng Tang
15958bccd85fSChristoph Lameter err = get_nodes(&nodes, nmask, maxnode);
15968bccd85fSChristoph Lameter if (err)
15978bccd85fSChristoph Lameter return err;
159895837924SFeng Tang
159995837924SFeng Tang return do_set_mempolicy(lmode, mode_flags, &nodes);
16008bccd85fSChristoph Lameter }
16018bccd85fSChristoph Lameter
1602af03c4acSDominik Brodowski SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1603af03c4acSDominik Brodowski unsigned long, maxnode)
1604af03c4acSDominik Brodowski {
1605af03c4acSDominik Brodowski return kernel_set_mempolicy(mode, nmask, maxnode);
1606af03c4acSDominik Brodowski }
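/*
 * Illustrative userspace sketch, for reference only (not part of the
 * kernel source), assuming libnuma's <numaif.h> wrapper (link with
 * -lnuma): make all further allocations of the calling task prefer
 * node 0, with normal fallback if that node runs out of memory.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long nodes = 0x1;	// node 0 only
 *
 *		if (set_mempolicy(MPOL_PREFERRED, &nodes, 8 * sizeof(nodes)))
 *			perror("set_mempolicy");
 *		return 0;
 *	}
 */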
1607af03c4acSDominik Brodowski
1608b6e9b0baSDominik Brodowski static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1609b6e9b0baSDominik Brodowski const unsigned long __user *old_nodes,
1610b6e9b0baSDominik Brodowski const unsigned long __user *new_nodes)
161139743889SChristoph Lameter {
1612596d7cfaSKOSAKI Motohiro struct mm_struct *mm = NULL;
161339743889SChristoph Lameter struct task_struct *task;
161439743889SChristoph Lameter nodemask_t task_nodes;
161539743889SChristoph Lameter int err;
1616596d7cfaSKOSAKI Motohiro nodemask_t *old;
1617596d7cfaSKOSAKI Motohiro nodemask_t *new;
1618596d7cfaSKOSAKI Motohiro NODEMASK_SCRATCH(scratch);
161939743889SChristoph Lameter
1620596d7cfaSKOSAKI Motohiro if (!scratch)
1621596d7cfaSKOSAKI Motohiro return -ENOMEM;
162239743889SChristoph Lameter
1623596d7cfaSKOSAKI Motohiro old = &scratch->mask1;
1624596d7cfaSKOSAKI Motohiro new = &scratch->mask2;
1625596d7cfaSKOSAKI Motohiro
1626596d7cfaSKOSAKI Motohiro err = get_nodes(old, old_nodes, maxnode);
162739743889SChristoph Lameter if (err)
1628596d7cfaSKOSAKI Motohiro goto out;
1629596d7cfaSKOSAKI Motohiro
1630596d7cfaSKOSAKI Motohiro err = get_nodes(new, new_nodes, maxnode);
1631596d7cfaSKOSAKI Motohiro if (err)
1632596d7cfaSKOSAKI Motohiro goto out;
163339743889SChristoph Lameter
163439743889SChristoph Lameter /* Find the mm_struct */
163555cfaa3cSZeng Zhaoming rcu_read_lock();
1636228ebcbeSPavel Emelyanov task = pid ? find_task_by_vpid(pid) : current;
163739743889SChristoph Lameter if (!task) {
163855cfaa3cSZeng Zhaoming rcu_read_unlock();
1639596d7cfaSKOSAKI Motohiro err = -ESRCH;
1640596d7cfaSKOSAKI Motohiro goto out;
164139743889SChristoph Lameter }
16423268c63eSChristoph Lameter get_task_struct(task);
164339743889SChristoph Lameter
1644596d7cfaSKOSAKI Motohiro err = -EINVAL;
164539743889SChristoph Lameter
164639743889SChristoph Lameter /*
164731367466SOtto Ebeling * Check if this process has the right to modify the specified process.
164831367466SOtto Ebeling * Use the regular "ptrace_may_access()" checks.
164939743889SChristoph Lameter */
165031367466SOtto Ebeling if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1651c69e8d9cSDavid Howells rcu_read_unlock();
165239743889SChristoph Lameter err = -EPERM;
16533268c63eSChristoph Lameter goto out_put;
165439743889SChristoph Lameter }
1655c69e8d9cSDavid Howells rcu_read_unlock();
165639743889SChristoph Lameter
165739743889SChristoph Lameter task_nodes = cpuset_mems_allowed(task);
165839743889SChristoph Lameter /* Is the user allowed to access the target nodes? */
1659596d7cfaSKOSAKI Motohiro if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
166039743889SChristoph Lameter err = -EPERM;
16613268c63eSChristoph Lameter goto out_put;
166239743889SChristoph Lameter }
166339743889SChristoph Lameter
16640486a38bSYisheng Xie task_nodes = cpuset_mems_allowed(current);
16650486a38bSYisheng Xie nodes_and(*new, *new, task_nodes);
16660486a38bSYisheng Xie if (nodes_empty(*new))
16673268c63eSChristoph Lameter goto out_put;
16680486a38bSYisheng Xie
166986c3a764SDavid Quigley err = security_task_movememory(task);
167086c3a764SDavid Quigley if (err)
16713268c63eSChristoph Lameter goto out_put;
167286c3a764SDavid Quigley
16733268c63eSChristoph Lameter mm = get_task_mm(task);
16743268c63eSChristoph Lameter put_task_struct(task);
1675f2a9ef88SSasha Levin
1676f2a9ef88SSasha Levin if (!mm) {
1677f2a9ef88SSasha Levin err = -EINVAL;
1678f2a9ef88SSasha Levin goto out;
1679f2a9ef88SSasha Levin }
1680f2a9ef88SSasha Levin
1681596d7cfaSKOSAKI Motohiro err = do_migrate_pages(mm, old, new,
168274c00241SChristoph Lameter capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
16833268c63eSChristoph Lameter
168439743889SChristoph Lameter mmput(mm);
16853268c63eSChristoph Lameter out:
1686596d7cfaSKOSAKI Motohiro NODEMASK_SCRATCH_FREE(scratch);
1687596d7cfaSKOSAKI Motohiro
168839743889SChristoph Lameter return err;
16893268c63eSChristoph Lameter
16903268c63eSChristoph Lameter out_put:
16913268c63eSChristoph Lameter put_task_struct(task);
16923268c63eSChristoph Lameter goto out;
16933268c63eSChristoph Lameter
169439743889SChristoph Lameter }
169539743889SChristoph Lameter
1696b6e9b0baSDominik Brodowski SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1697b6e9b0baSDominik Brodowski const unsigned long __user *, old_nodes,
1698b6e9b0baSDominik Brodowski const unsigned long __user *, new_nodes)
1699b6e9b0baSDominik Brodowski {
1700b6e9b0baSDominik Brodowski return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1701b6e9b0baSDominik Brodowski }
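/*
 * Illustrative userspace sketch, for reference only (not part of the
 * kernel source), assuming libnuma's <numaif.h> wrapper (link with
 * -lnuma): move the pages of process 'pid' from node 0 to node 1.
 * Moving another process's pages requires the usual ptrace-style
 * permission checks described above.  A positive return value is the
 * number of pages that could not be moved.
 *
 *	#include <numaif.h>
 *	#include <sys/types.h>
 *	#include <stdio.h>
 *
 *	static void move_to_node1(pid_t pid)
 *	{
 *		unsigned long from = 1UL << 0;
 *		unsigned long to = 1UL << 1;
 *		long left = migrate_pages(pid, 8 * sizeof(from), &from, &to);
 *
 *		if (left < 0)
 *			perror("migrate_pages");
 *		else if (left > 0)
 *			printf("%ld pages not moved\n", left);
 *	}
 */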
1702b6e9b0baSDominik Brodowski
170339743889SChristoph Lameter
17048bccd85fSChristoph Lameter /* Retrieve NUMA policy */
1705af03c4acSDominik Brodowski static int kernel_get_mempolicy(int __user *policy,
1706af03c4acSDominik Brodowski unsigned long __user *nmask,
1707af03c4acSDominik Brodowski unsigned long maxnode,
1708af03c4acSDominik Brodowski unsigned long addr,
1709af03c4acSDominik Brodowski unsigned long flags)
17108bccd85fSChristoph Lameter {
1711dbcb0f19SAdrian Bunk int err;
17123f649ab7SKees Cook int pval;
17138bccd85fSChristoph Lameter nodemask_t nodes;
17148bccd85fSChristoph Lameter
1715050c17f2SRalph Campbell if (nmask != NULL && maxnode < nr_node_ids)
17168bccd85fSChristoph Lameter return -EINVAL;
17178bccd85fSChristoph Lameter
17184605f057SWenchao Hao addr = untagged_addr(addr);
17194605f057SWenchao Hao
17208bccd85fSChristoph Lameter err = do_get_mempolicy(&pval, &nodes, addr, flags);
17218bccd85fSChristoph Lameter
17228bccd85fSChristoph Lameter if (err)
17238bccd85fSChristoph Lameter return err;
17248bccd85fSChristoph Lameter
17258bccd85fSChristoph Lameter if (policy && put_user(pval, policy))
17268bccd85fSChristoph Lameter return -EFAULT;
17278bccd85fSChristoph Lameter
17288bccd85fSChristoph Lameter if (nmask)
17298bccd85fSChristoph Lameter err = copy_nodes_to_user(nmask, maxnode, &nodes);
17308bccd85fSChristoph Lameter
17318bccd85fSChristoph Lameter return err;
17328bccd85fSChristoph Lameter }
17338bccd85fSChristoph Lameter
1734af03c4acSDominik Brodowski SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1735af03c4acSDominik Brodowski unsigned long __user *, nmask, unsigned long, maxnode,
1736af03c4acSDominik Brodowski unsigned long, addr, unsigned long, flags)
1737af03c4acSDominik Brodowski {
1738af03c4acSDominik Brodowski return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1739af03c4acSDominik Brodowski }
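/*
 * Illustrative userspace sketch, for reference only (not part of the
 * kernel source), assuming libnuma's <numaif.h> wrapper (link with
 * -lnuma): ask which node currently backs the page at 'p' by combining
 * MPOL_F_NODE with MPOL_F_ADDR.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	static void print_node_of(void *p)
 *	{
 *		int node = -1;
 *
 *		if (get_mempolicy(&node, NULL, 0, p,
 *				  MPOL_F_NODE | MPOL_F_ADDR))
 *			perror("get_mempolicy");
 *		else
 *			printf("%p is on node %d\n", p, node);
 *	}
 */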
1740af03c4acSDominik Brodowski
174120ca87f2SLi Xinhai bool vma_migratable(struct vm_area_struct *vma)
174220ca87f2SLi Xinhai {
174320ca87f2SLi Xinhai if (vma->vm_flags & (VM_IO | VM_PFNMAP))
174420ca87f2SLi Xinhai return false;
174520ca87f2SLi Xinhai
174620ca87f2SLi Xinhai /*
174720ca87f2SLi Xinhai * DAX device mappings require predictable access latency, so avoid
174820ca87f2SLi Xinhai * incurring periodic faults.
174920ca87f2SLi Xinhai */
175020ca87f2SLi Xinhai if (vma_is_dax(vma))
175120ca87f2SLi Xinhai return false;
175220ca87f2SLi Xinhai
175320ca87f2SLi Xinhai if (is_vm_hugetlb_page(vma) &&
175420ca87f2SLi Xinhai !hugepage_migration_supported(hstate_vma(vma)))
175520ca87f2SLi Xinhai return false;
175620ca87f2SLi Xinhai
175720ca87f2SLi Xinhai /*
175820ca87f2SLi Xinhai * Migration allocates pages in the highest zone. If we cannot
175920ca87f2SLi Xinhai * do so then migration (at least from node to node) is not
176020ca87f2SLi Xinhai * possible.
176120ca87f2SLi Xinhai */
176220ca87f2SLi Xinhai if (vma->vm_file &&
176320ca87f2SLi Xinhai gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
176420ca87f2SLi Xinhai < policy_zone)
176520ca87f2SLi Xinhai return false;
176620ca87f2SLi Xinhai return true;
176720ca87f2SLi Xinhai }
176820ca87f2SLi Xinhai
176974d2c3a0SOleg Nesterov struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
177074d2c3a0SOleg Nesterov unsigned long addr)
17711da177e4SLinus Torvalds {
17728d90274bSOleg Nesterov struct mempolicy *pol = NULL;
17731da177e4SLinus Torvalds
17741da177e4SLinus Torvalds if (vma) {
1775480eccf9SLee Schermerhorn if (vma->vm_ops && vma->vm_ops->get_policy) {
17768d90274bSOleg Nesterov pol = vma->vm_ops->get_policy(vma, addr);
177700442ad0SMel Gorman } else if (vma->vm_policy) {
17781da177e4SLinus Torvalds pol = vma->vm_policy;
177900442ad0SMel Gorman
178000442ad0SMel Gorman /*
178100442ad0SMel Gorman * shmem_alloc_page() passes MPOL_F_SHARED policy with
178200442ad0SMel Gorman * a pseudo vma whose vma->vm_ops=NULL. Take a reference
178300442ad0SMel Gorman * count on these policies which will be dropped by
178400442ad0SMel Gorman * mpol_cond_put() later
178500442ad0SMel Gorman */
178600442ad0SMel Gorman if (mpol_needs_cond_ref(pol))
178700442ad0SMel Gorman mpol_get(pol);
178800442ad0SMel Gorman }
17891da177e4SLinus Torvalds }
1790f15ca78eSOleg Nesterov
179174d2c3a0SOleg Nesterov return pol;
179274d2c3a0SOleg Nesterov }
179374d2c3a0SOleg Nesterov
179474d2c3a0SOleg Nesterov /*
1795dd6eecb9SOleg Nesterov * get_vma_policy(@vma, @addr)
179674d2c3a0SOleg Nesterov * @vma: virtual memory area whose policy is sought
179774d2c3a0SOleg Nesterov * @addr: address in @vma for shared policy lookup
179874d2c3a0SOleg Nesterov *
179974d2c3a0SOleg Nesterov * Returns effective policy for a VMA at specified address.
1800dd6eecb9SOleg Nesterov * Falls back to current->mempolicy or system default policy, as necessary.
180174d2c3a0SOleg Nesterov * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
180274d2c3a0SOleg Nesterov * count--added by the get_policy() vm_op, as appropriate--to protect against
180374d2c3a0SOleg Nesterov * freeing by another task. It is the caller's responsibility to free the
180474d2c3a0SOleg Nesterov * extra reference for shared policies.
180574d2c3a0SOleg Nesterov */
1806ac79f78dSDavid Rientjes static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1807dd6eecb9SOleg Nesterov unsigned long addr)
180874d2c3a0SOleg Nesterov {
180974d2c3a0SOleg Nesterov struct mempolicy *pol = __get_vma_policy(vma, addr);
181074d2c3a0SOleg Nesterov
18118d90274bSOleg Nesterov if (!pol)
1812dd6eecb9SOleg Nesterov pol = get_task_policy(current);
18138d90274bSOleg Nesterov
18141da177e4SLinus Torvalds return pol;
18151da177e4SLinus Torvalds }
18161da177e4SLinus Torvalds
18176b6482bbSOleg Nesterov bool vma_policy_mof(struct vm_area_struct *vma)
1818fc314724SMel Gorman {
18196b6482bbSOleg Nesterov struct mempolicy *pol;
1820f15ca78eSOleg Nesterov
1821fc314724SMel Gorman if (vma->vm_ops && vma->vm_ops->get_policy) {
1822fc314724SMel Gorman bool ret = false;
1823fc314724SMel Gorman
1824fc314724SMel Gorman pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1825fc314724SMel Gorman if (pol && (pol->flags & MPOL_F_MOF))
1826fc314724SMel Gorman ret = true;
1827fc314724SMel Gorman mpol_cond_put(pol);
1828fc314724SMel Gorman
1829fc314724SMel Gorman return ret;
18308d90274bSOleg Nesterov }
18318d90274bSOleg Nesterov
1832fc314724SMel Gorman pol = vma->vm_policy;
18338d90274bSOleg Nesterov if (!pol)
18346b6482bbSOleg Nesterov pol = get_task_policy(current);
1835fc314724SMel Gorman
1836fc314724SMel Gorman return pol->flags & MPOL_F_MOF;
1837fc314724SMel Gorman }
1838fc314724SMel Gorman
1839d2226ebdSFeng Tang bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1840d3eb1570SLai Jiangshan {
1841d3eb1570SLai Jiangshan enum zone_type dynamic_policy_zone = policy_zone;
1842d3eb1570SLai Jiangshan
1843d3eb1570SLai Jiangshan BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1844d3eb1570SLai Jiangshan
1845d3eb1570SLai Jiangshan /*
1846269fbe72SBen Widawsky * if policy->nodes has movable memory only,
1847d3eb1570SLai Jiangshan * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1848d3eb1570SLai Jiangshan *
1849269fbe72SBen Widawsky * policy->nodes is intersect with node_states[N_MEMORY].
1850f0953a1bSIngo Molnar * so if the following test fails, it implies
1851269fbe72SBen Widawsky * policy->nodes has movable memory only.
1852d3eb1570SLai Jiangshan */
1853269fbe72SBen Widawsky if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1854d3eb1570SLai Jiangshan dynamic_policy_zone = ZONE_MOVABLE;
1855d3eb1570SLai Jiangshan
1856d3eb1570SLai Jiangshan return zone >= dynamic_policy_zone;
1857d3eb1570SLai Jiangshan }
1858d3eb1570SLai Jiangshan
185952cd3b07SLee Schermerhorn /*
186052cd3b07SLee Schermerhorn * Return a nodemask representing a mempolicy for filtering nodes for
186152cd3b07SLee Schermerhorn * page allocation
186252cd3b07SLee Schermerhorn */
18638ca39e68SMuchun Song nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
186419770b32SMel Gorman {
1865b27abaccSDave Hansen int mode = policy->mode;
1866b27abaccSDave Hansen
186719770b32SMel Gorman /* Lower zones don't get a nodemask applied for MPOL_BIND */
1868b27abaccSDave Hansen if (unlikely(mode == MPOL_BIND) &&
1869d3eb1570SLai Jiangshan apply_policy_zone(policy, gfp_zone(gfp)) &&
1870269fbe72SBen Widawsky cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1871269fbe72SBen Widawsky return &policy->nodes;
187219770b32SMel Gorman
1873b27abaccSDave Hansen if (mode == MPOL_PREFERRED_MANY)
1874b27abaccSDave Hansen return &policy->nodes;
1875b27abaccSDave Hansen
187619770b32SMel Gorman return NULL;
187719770b32SMel Gorman }
187819770b32SMel Gorman
1879b27abaccSDave Hansen /*
1880b27abaccSDave Hansen * Return the preferred node id for 'prefer' mempolicy, and return
1881b27abaccSDave Hansen * the given id for all other policies.
1882b27abaccSDave Hansen *
1883b27abaccSDave Hansen * policy_node() is always coupled with policy_nodemask(), which
1884b27abaccSDave Hansen * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1885b27abaccSDave Hansen */
1886f8fd5253SWei Yang static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
18871da177e4SLinus Torvalds {
18887858d7bcSFeng Tang if (policy->mode == MPOL_PREFERRED) {
1889269fbe72SBen Widawsky nd = first_node(policy->nodes);
18907858d7bcSFeng Tang } else {
189119770b32SMel Gorman /*
18926d840958SMichal Hocko * __GFP_THISNODE shouldn't even be used with the bind policy
18936d840958SMichal Hocko * because we might easily break the expectation to stay on the
18946d840958SMichal Hocko * requested node and not break the policy.
189519770b32SMel Gorman */
18966d840958SMichal Hocko WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
18971da177e4SLinus Torvalds }
18986d840958SMichal Hocko
1899c6018b4bSAneesh Kumar K.V if ((policy->mode == MPOL_BIND ||
1900c6018b4bSAneesh Kumar K.V policy->mode == MPOL_PREFERRED_MANY) &&
1901c6018b4bSAneesh Kumar K.V policy->home_node != NUMA_NO_NODE)
1902c6018b4bSAneesh Kumar K.V return policy->home_node;
1903c6018b4bSAneesh Kumar K.V
190404ec6264SVlastimil Babka return nd;
19051da177e4SLinus Torvalds }
19061da177e4SLinus Torvalds
19071da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
19081da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
19091da177e4SLinus Torvalds {
191045816682SVlastimil Babka unsigned next;
19111da177e4SLinus Torvalds struct task_struct *me = current;
19121da177e4SLinus Torvalds
1913269fbe72SBen Widawsky next = next_node_in(me->il_prev, policy->nodes);
1914f5b087b5SDavid Rientjes if (next < MAX_NUMNODES)
191545816682SVlastimil Babka me->il_prev = next;
191645816682SVlastimil Babka return next;
19171da177e4SLinus Torvalds }
19181da177e4SLinus Torvalds
1919dc85da15SChristoph Lameter /*
1920dc85da15SChristoph Lameter * Depending on the memory policy provide a node from which to allocate the
1921dc85da15SChristoph Lameter * next slab entry.
1922dc85da15SChristoph Lameter */
19232a389610SDavid Rientjes unsigned int mempolicy_slab_node(void)
1924dc85da15SChristoph Lameter {
1925e7b691b0SAndi Kleen struct mempolicy *policy;
19262a389610SDavid Rientjes int node = numa_mem_id();
1927e7b691b0SAndi Kleen
192838b031ddSVasily Averin if (!in_task())
19292a389610SDavid Rientjes return node;
1930e7b691b0SAndi Kleen
1931e7b691b0SAndi Kleen policy = current->mempolicy;
19327858d7bcSFeng Tang if (!policy)
19332a389610SDavid Rientjes return node;
1934765c4507SChristoph Lameter
1935bea904d5SLee Schermerhorn switch (policy->mode) {
1936bea904d5SLee Schermerhorn case MPOL_PREFERRED:
1937269fbe72SBen Widawsky return first_node(policy->nodes);
1938bea904d5SLee Schermerhorn
1939dc85da15SChristoph Lameter case MPOL_INTERLEAVE:
1940dc85da15SChristoph Lameter return interleave_nodes(policy);
1941dc85da15SChristoph Lameter
1942b27abaccSDave Hansen case MPOL_BIND:
1943b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
1944b27abaccSDave Hansen {
1945c33d6c06SMel Gorman struct zoneref *z;
1946c33d6c06SMel Gorman
1947dc85da15SChristoph Lameter /*
1948dc85da15SChristoph Lameter * Follow bind policy behavior and start allocation at the
1949dc85da15SChristoph Lameter * first node.
1950dc85da15SChristoph Lameter */
195119770b32SMel Gorman struct zonelist *zonelist;
195219770b32SMel Gorman enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1953c9634cf0SAneesh Kumar K.V zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1954c33d6c06SMel Gorman z = first_zones_zonelist(zonelist, highest_zoneidx,
1955269fbe72SBen Widawsky &policy->nodes);
1956c1093b74SPavel Tatashin return z->zone ? zone_to_nid(z->zone) : node;
1957dd1a239fSMel Gorman }
19587858d7bcSFeng Tang case MPOL_LOCAL:
19597858d7bcSFeng Tang return node;
1960dc85da15SChristoph Lameter
1961dc85da15SChristoph Lameter default:
1962bea904d5SLee Schermerhorn BUG();
1963dc85da15SChristoph Lameter }
1964dc85da15SChristoph Lameter }
1965dc85da15SChristoph Lameter
1966fee83b3aSAndrew Morton /*
1967fee83b3aSAndrew Morton * Do static interleaving for a VMA with known offset @n. Returns the n'th
1968269fbe72SBen Widawsky * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1969fee83b3aSAndrew Morton * number of present nodes.
1970fee83b3aSAndrew Morton */
197198c70baaSLaurent Dufour static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
19721da177e4SLinus Torvalds {
1973276aeee1Syanghui nodemask_t nodemask = pol->nodes;
1974276aeee1Syanghui unsigned int target, nnodes;
1975fee83b3aSAndrew Morton int i;
1976fee83b3aSAndrew Morton int nid;
1977276aeee1Syanghui /*
1978276aeee1Syanghui * The barrier will stabilize the nodemask in a register or on
1979276aeee1Syanghui * the stack so that it will stop changing under the code.
1980276aeee1Syanghui *
1981276aeee1Syanghui * Between first_node() and next_node(), pol->nodes could be changed
1982276aeee1Syanghui * by other threads. So we put pol->nodes in a local stack.
1983276aeee1Syanghui */
1984276aeee1Syanghui barrier();
19851da177e4SLinus Torvalds
1986276aeee1Syanghui nnodes = nodes_weight(nodemask);
1987f5b087b5SDavid Rientjes if (!nnodes)
1988f5b087b5SDavid Rientjes return numa_node_id();
1989fee83b3aSAndrew Morton target = (unsigned int)n % nnodes;
1990276aeee1Syanghui nid = first_node(nodemask);
1991fee83b3aSAndrew Morton for (i = 0; i < target; i++)
1992276aeee1Syanghui nid = next_node(nid, nodemask);
19931da177e4SLinus Torvalds return nid;
19941da177e4SLinus Torvalds }
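/*
 * Illustrative example, for reference only (not part of the kernel
 * source): with pol->nodes = {1,3,5} and n = 7, nnodes = 3 and
 * target = 7 % 3 = 1, so the walk starts at node 1 and takes one
 * next_node() step, returning node 3.
 */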
19951da177e4SLinus Torvalds
19965da7ca86SChristoph Lameter /* Determine a node number for interleave */
19975da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
19985da7ca86SChristoph Lameter struct vm_area_struct *vma, unsigned long addr, int shift)
19995da7ca86SChristoph Lameter {
20005da7ca86SChristoph Lameter if (vma) {
20015da7ca86SChristoph Lameter unsigned long off;
20025da7ca86SChristoph Lameter
20033b98b087SNishanth Aravamudan /*
20043b98b087SNishanth Aravamudan * for small pages, there is no difference between
20053b98b087SNishanth Aravamudan * shift and PAGE_SHIFT, so the bit-shift is safe.
20063b98b087SNishanth Aravamudan * for huge pages, since vm_pgoff is in units of small
20073b98b087SNishanth Aravamudan * pages, we need to shift off the always 0 bits to get
20083b98b087SNishanth Aravamudan * a useful offset.
20093b98b087SNishanth Aravamudan */
20103b98b087SNishanth Aravamudan BUG_ON(shift < PAGE_SHIFT);
20113b98b087SNishanth Aravamudan off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
20125da7ca86SChristoph Lameter off += (addr - vma->vm_start) >> shift;
201398c70baaSLaurent Dufour return offset_il_node(pol, off);
20145da7ca86SChristoph Lameter } else
20155da7ca86SChristoph Lameter return interleave_nodes(pol);
20165da7ca86SChristoph Lameter }
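/*
 * Illustrative example, for reference only (not part of the kernel
 * source), assuming 4KB base pages (PAGE_SHIFT == 12) and a 2MB huge
 * page VMA (shift == 21) with vma->vm_pgoff == 0x400, i.e. a 4MB file
 * offset expressed in base pages:
 *
 *	off  = 0x400 >> (21 - 12)      = 2   huge pages of file offset
 *	off += (addr - vm_start) >> 21 = +3  for an addr 6MB into the vma
 *	interleave node = offset_il_node(pol, 5)
 */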
20175da7ca86SChristoph Lameter
201800ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
2019480eccf9SLee Schermerhorn /*
202004ec6264SVlastimil Babka * huge_node(@vma, @addr, @gfp_flags, @mpol)
2021b46e14acSFabian Frederick * @vma: virtual memory area whose policy is sought
2022b46e14acSFabian Frederick * @addr: address in @vma for shared policy lookup and interleave policy
2023b46e14acSFabian Frederick * @gfp_flags: for requested zone
2024b46e14acSFabian Frederick * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2025b27abaccSDave Hansen * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2026480eccf9SLee Schermerhorn *
202704ec6264SVlastimil Babka * Returns a nid suitable for a huge page allocation and a pointer
202852cd3b07SLee Schermerhorn * to the struct mempolicy for conditional unref after allocation.
2029b27abaccSDave Hansen * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2030b27abaccSDave Hansen * to the mempolicy's @nodemask for filtering the zonelist.
2031c0ff7453SMiao Xie *
2032d26914d1SMel Gorman * Must be protected by read_mems_allowed_begin()
2033480eccf9SLee Schermerhorn */
203504ec6264SVlastimil Babka int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
203504ec6264SVlastimil Babka struct mempolicy **mpol, nodemask_t **nodemask)
20365da7ca86SChristoph Lameter {
203704ec6264SVlastimil Babka int nid;
2038b27abaccSDave Hansen int mode;
20395da7ca86SChristoph Lameter
2040dd6eecb9SOleg Nesterov *mpol = get_vma_policy(vma, addr);
2041b27abaccSDave Hansen *nodemask = NULL;
2042b27abaccSDave Hansen mode = (*mpol)->mode;
20435da7ca86SChristoph Lameter
2044b27abaccSDave Hansen if (unlikely(mode == MPOL_INTERLEAVE)) {
204504ec6264SVlastimil Babka nid = interleave_nid(*mpol, vma, addr,
204604ec6264SVlastimil Babka huge_page_shift(hstate_vma(vma)));
204752cd3b07SLee Schermerhorn } else {
204804ec6264SVlastimil Babka nid = policy_node(gfp_flags, *mpol, numa_node_id());
2049b27abaccSDave Hansen if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2050269fbe72SBen Widawsky *nodemask = &(*mpol)->nodes;
2051480eccf9SLee Schermerhorn }
205204ec6264SVlastimil Babka return nid;
20535da7ca86SChristoph Lameter }
205406808b08SLee Schermerhorn
205506808b08SLee Schermerhorn /*
205606808b08SLee Schermerhorn * init_nodemask_of_mempolicy
205706808b08SLee Schermerhorn *
205806808b08SLee Schermerhorn * If the current task's mempolicy is "default" [NULL], return 'false'
205906808b08SLee Schermerhorn * to indicate default policy. Otherwise, extract the policy nodemask
206006808b08SLee Schermerhorn * for 'bind' or 'interleave' policy into the argument nodemask, or
206106808b08SLee Schermerhorn * initialize the argument nodemask to contain the single node for
206206808b08SLee Schermerhorn * 'preferred' or 'local' policy and return 'true' to indicate presence
206306808b08SLee Schermerhorn * of non-default mempolicy.
206406808b08SLee Schermerhorn *
206506808b08SLee Schermerhorn * We don't bother with reference counting the mempolicy [mpol_get/put]
206606808b08SLee Schermerhorn * because the current task is examining its own mempolicy and a task's
206706808b08SLee Schermerhorn * mempolicy is only ever changed by the task itself.
206806808b08SLee Schermerhorn *
206906808b08SLee Schermerhorn * N.B., it is the caller's responsibility to free a returned nodemask.
207006808b08SLee Schermerhorn */
207106808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask)
207206808b08SLee Schermerhorn {
207306808b08SLee Schermerhorn struct mempolicy *mempolicy;
207406808b08SLee Schermerhorn
207506808b08SLee Schermerhorn if (!(mask && current->mempolicy))
207606808b08SLee Schermerhorn return false;
207706808b08SLee Schermerhorn
2078c0ff7453SMiao Xie task_lock(current);
207906808b08SLee Schermerhorn mempolicy = current->mempolicy;
208006808b08SLee Schermerhorn switch (mempolicy->mode) {
208106808b08SLee Schermerhorn case MPOL_PREFERRED:
2082b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
208306808b08SLee Schermerhorn case MPOL_BIND:
208406808b08SLee Schermerhorn case MPOL_INTERLEAVE:
2085269fbe72SBen Widawsky *mask = mempolicy->nodes;
208606808b08SLee Schermerhorn break;
208706808b08SLee Schermerhorn
20887858d7bcSFeng Tang case MPOL_LOCAL:
2089269fbe72SBen Widawsky init_nodemask_of_node(mask, numa_node_id());
20907858d7bcSFeng Tang break;
20917858d7bcSFeng Tang
209206808b08SLee Schermerhorn default:
209306808b08SLee Schermerhorn BUG();
209406808b08SLee Schermerhorn }
2095c0ff7453SMiao Xie task_unlock(current);
209606808b08SLee Schermerhorn
209706808b08SLee Schermerhorn return true;
209806808b08SLee Schermerhorn }
209900ac59adSChen, Kenneth W #endif
21005da7ca86SChristoph Lameter
21016f48d0ebSDavid Rientjes /*
2102b26e517aSFeng Tang * mempolicy_in_oom_domain
21036f48d0ebSDavid Rientjes *
2104b26e517aSFeng Tang * If tsk's mempolicy is "bind", check for intersection between mask and
2105b26e517aSFeng Tang * the policy nodemask. Otherwise, return true for all other policies
2106b26e517aSFeng Tang * including "interleave", as a tsk with "interleave" policy may have
2107b26e517aSFeng Tang * memory allocated from all nodes in the system.
21086f48d0ebSDavid Rientjes *
21096f48d0ebSDavid Rientjes * Takes task_lock(tsk) to prevent freeing of its mempolicy.
21106f48d0ebSDavid Rientjes */
2111b26e517aSFeng Tang bool mempolicy_in_oom_domain(struct task_struct *tsk,
21126f48d0ebSDavid Rientjes const nodemask_t *mask)
21136f48d0ebSDavid Rientjes {
21146f48d0ebSDavid Rientjes struct mempolicy *mempolicy;
21156f48d0ebSDavid Rientjes bool ret = true;
21166f48d0ebSDavid Rientjes
21176f48d0ebSDavid Rientjes if (!mask)
21186f48d0ebSDavid Rientjes return ret;
2119b26e517aSFeng Tang
21206f48d0ebSDavid Rientjes task_lock(tsk);
21216f48d0ebSDavid Rientjes mempolicy = tsk->mempolicy;
2122b26e517aSFeng Tang if (mempolicy && mempolicy->mode == MPOL_BIND)
2123269fbe72SBen Widawsky ret = nodes_intersects(mempolicy->nodes, *mask);
21246f48d0ebSDavid Rientjes task_unlock(tsk);
2125b26e517aSFeng Tang
21266f48d0ebSDavid Rientjes return ret;
21276f48d0ebSDavid Rientjes }
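/*
 * Illustrative sketch, not part of this file: the OOM killer can use the
 * check above to skip tasks whose MPOL_BIND policy cannot allocate from the
 * constrained nodemask anyway.  example_oom_skip_task() is a hypothetical
 * name.
 *
 *	static bool example_oom_skip_task(struct task_struct *tsk,
 *					  const nodemask_t *oom_mask)
 *	{
 *		return !mempolicy_in_oom_domain(tsk, oom_mask);
 *	}
 */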
21286f48d0ebSDavid Rientjes
21291da177e4SLinus Torvalds /* Allocate a page under the interleave policy.
21301da177e4SLinus Torvalds Uses its own path because it needs to do special accounting. */
2131662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2132662f3a0bSAndi Kleen unsigned nid)
21331da177e4SLinus Torvalds {
21341da177e4SLinus Torvalds struct page *page;
21351da177e4SLinus Torvalds
213684172f4bSMatthew Wilcox (Oracle) page = __alloc_pages(gfp, order, nid, NULL);
21374518085eSKemi Wang /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
21384518085eSKemi Wang if (!static_branch_likely(&vm_numa_stat_key))
21394518085eSKemi Wang return page;
2140de55c8b2SAndrey Ryabinin if (page && page_to_nid(page) == nid) {
2141de55c8b2SAndrey Ryabinin preempt_disable();
2142f19298b9SMel Gorman __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2143de55c8b2SAndrey Ryabinin preempt_enable();
2144de55c8b2SAndrey Ryabinin }
21451da177e4SLinus Torvalds return page;
21461da177e4SLinus Torvalds }
21471da177e4SLinus Torvalds
21484c54d949SFeng Tang static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
21494c54d949SFeng Tang int nid, struct mempolicy *pol)
21504c54d949SFeng Tang {
21514c54d949SFeng Tang struct page *page;
21524c54d949SFeng Tang gfp_t preferred_gfp;
21534c54d949SFeng Tang
21544c54d949SFeng Tang /*
21554c54d949SFeng Tang * This is a two pass approach. The first pass will only try the
21564c54d949SFeng Tang * preferred nodes but skip the direct reclaim and allow the
21574c54d949SFeng Tang * allocation to fail, while the second pass will try all the
21584c54d949SFeng Tang * nodes in the system.
21594c54d949SFeng Tang */
21604c54d949SFeng Tang preferred_gfp = gfp | __GFP_NOWARN;
21614c54d949SFeng Tang preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
21624c54d949SFeng Tang page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
21634c54d949SFeng Tang if (!page)
2164c0455116SAneesh Kumar K.V page = __alloc_pages(gfp, order, nid, NULL);
21654c54d949SFeng Tang
21664c54d949SFeng Tang return page;
21674c54d949SFeng Tang }
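/*
 * Worked example, a sketch rather than authoritative documentation: for a
 * GFP_KERNEL caller the first pass above effectively uses
 *
 *	preferred_gfp = (GFP_KERNEL | __GFP_NOWARN)
 *			& ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
 *
 * limited to pol->nodes, so it may fail quickly without reclaiming on the
 * preferred nodes; only the second pass retries with the caller's original
 * gfp mask and no nodemask restriction.
 */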
21684c54d949SFeng Tang
21691da177e4SLinus Torvalds /**
2170adf88aa8SMatthew Wilcox (Oracle) * vma_alloc_folio - Allocate a folio for a VMA.
2171eb350739SMatthew Wilcox (Oracle) * @gfp: GFP flags.
2172adf88aa8SMatthew Wilcox (Oracle) * @order: Order of the folio.
21731da177e4SLinus Torvalds * @vma: Pointer to VMA or NULL if not available.
2174eb350739SMatthew Wilcox (Oracle) * @addr: Virtual address of the allocation. Must be inside @vma.
2175eb350739SMatthew Wilcox (Oracle) * @hugepage: For hugepages try only the preferred node if possible.
21761da177e4SLinus Torvalds *
2177adf88aa8SMatthew Wilcox (Oracle) * Allocate a folio for a specific address in @vma, using the appropriate
2178eb350739SMatthew Wilcox (Oracle) * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2179eb350739SMatthew Wilcox (Oracle) * of the mm_struct of the VMA to prevent it from going away. Should be
2180adf88aa8SMatthew Wilcox (Oracle) * used for all allocations for folios that will be mapped into user space.
2181eb350739SMatthew Wilcox (Oracle) *
2182adf88aa8SMatthew Wilcox (Oracle) * Return: The folio on success or NULL if allocation fails.
21831da177e4SLinus Torvalds */
2184adf88aa8SMatthew Wilcox (Oracle) struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2185be1a13ebSMichal Hocko unsigned long addr, bool hugepage)
21861da177e4SLinus Torvalds {
2187cc9a6c87SMel Gorman struct mempolicy *pol;
2188be1a13ebSMichal Hocko int node = numa_node_id();
2189adf88aa8SMatthew Wilcox (Oracle) struct folio *folio;
219004ec6264SVlastimil Babka int preferred_nid;
2191be97a41bSVlastimil Babka nodemask_t *nmask;
21921da177e4SLinus Torvalds
2193dd6eecb9SOleg Nesterov pol = get_vma_policy(vma, addr);
2194cc9a6c87SMel Gorman
2195be97a41bSVlastimil Babka if (pol->mode == MPOL_INTERLEAVE) {
2196adf88aa8SMatthew Wilcox (Oracle) struct page *page;
21971da177e4SLinus Torvalds unsigned nid;
21985da7ca86SChristoph Lameter
21998eac563cSAndi Kleen nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
220052cd3b07SLee Schermerhorn mpol_cond_put(pol);
2201adf88aa8SMatthew Wilcox (Oracle) gfp |= __GFP_COMP;
22020bbbc0b3SAndrea Arcangeli page = alloc_page_interleave(gfp, order, nid);
2203adf88aa8SMatthew Wilcox (Oracle) folio = (struct folio *)page;
2204da6e7bf3SMatthew Wilcox (Oracle) if (folio && order > 1)
2205da6e7bf3SMatthew Wilcox (Oracle) folio_prep_large_rmappable(folio);
2206be97a41bSVlastimil Babka goto out;
22071da177e4SLinus Torvalds }
22081da177e4SLinus Torvalds
22094c54d949SFeng Tang if (pol->mode == MPOL_PREFERRED_MANY) {
2210adf88aa8SMatthew Wilcox (Oracle) struct page *page;
2211adf88aa8SMatthew Wilcox (Oracle)
2212c0455116SAneesh Kumar K.V node = policy_node(gfp, pol, node);
2213adf88aa8SMatthew Wilcox (Oracle) gfp |= __GFP_COMP;
22144c54d949SFeng Tang page = alloc_pages_preferred_many(gfp, order, node, pol);
22154c54d949SFeng Tang mpol_cond_put(pol);
2216adf88aa8SMatthew Wilcox (Oracle) folio = (struct folio *)page;
2217da6e7bf3SMatthew Wilcox (Oracle) if (folio && order > 1)
2218da6e7bf3SMatthew Wilcox (Oracle) folio_prep_large_rmappable(folio);
22194c54d949SFeng Tang goto out;
22204c54d949SFeng Tang }
22214c54d949SFeng Tang
222219deb769SDavid Rientjes if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
222319deb769SDavid Rientjes int hpage_node = node;
222419deb769SDavid Rientjes
222519deb769SDavid Rientjes /*
222619deb769SDavid Rientjes * For hugepage allocation and non-interleave policy which
222719deb769SDavid Rientjes * allows the current node (or other explicitly preferred
222819deb769SDavid Rientjes * node) we only try to allocate from the current/preferred
222919deb769SDavid Rientjes * node and don't fall back to other nodes, as the cost of
223019deb769SDavid Rientjes * remote accesses would likely offset THP benefits.
223119deb769SDavid Rientjes *
2232b27abaccSDave Hansen * If the policy is interleave or does not allow the current
223319deb769SDavid Rientjes * node in its nodemask, we allocate the standard way.
223419deb769SDavid Rientjes */
22357858d7bcSFeng Tang if (pol->mode == MPOL_PREFERRED)
2236269fbe72SBen Widawsky hpage_node = first_node(pol->nodes);
223719deb769SDavid Rientjes
223819deb769SDavid Rientjes nmask = policy_nodemask(gfp, pol);
223919deb769SDavid Rientjes if (!nmask || node_isset(hpage_node, *nmask)) {
224019deb769SDavid Rientjes mpol_cond_put(pol);
2241cc638f32SVlastimil Babka /*
2242cc638f32SVlastimil Babka * First, try to allocate THP only on local node, but
2243cc638f32SVlastimil Babka * don't reclaim unnecessarily, just compact.
2244cc638f32SVlastimil Babka */
2245adf88aa8SMatthew Wilcox (Oracle) folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2246adf88aa8SMatthew Wilcox (Oracle) __GFP_NORETRY, order, hpage_node);
224776e654ccSDavid Rientjes
224876e654ccSDavid Rientjes /*
224976e654ccSDavid Rientjes * If hugepage allocations are configured to always
225076e654ccSDavid Rientjes * synchronous compact or the vma has been madvised
225176e654ccSDavid Rientjes * to prefer hugepage backing, retry allowing remote
2252cc638f32SVlastimil Babka * memory with both reclaim and compact as well.
225376e654ccSDavid Rientjes */
2254adf88aa8SMatthew Wilcox (Oracle) if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2255adf88aa8SMatthew Wilcox (Oracle) folio = __folio_alloc(gfp, order, hpage_node,
2256adf88aa8SMatthew Wilcox (Oracle) nmask);
225776e654ccSDavid Rientjes
225819deb769SDavid Rientjes goto out;
225919deb769SDavid Rientjes }
226019deb769SDavid Rientjes }
226119deb769SDavid Rientjes
2262077fcf11SAneesh Kumar K.V nmask = policy_nodemask(gfp, pol);
226304ec6264SVlastimil Babka preferred_nid = policy_node(gfp, pol, node);
2264adf88aa8SMatthew Wilcox (Oracle) folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2265d51e9894SVlastimil Babka mpol_cond_put(pol);
2266be97a41bSVlastimil Babka out:
2267f584b680SMatthew Wilcox (Oracle) return folio;
2268f584b680SMatthew Wilcox (Oracle) }
2269adf88aa8SMatthew Wilcox (Oracle) EXPORT_SYMBOL(vma_alloc_folio);
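/*
 * Illustrative sketch, not part of this file: a typical anonymous page
 * fault caller allocates through vma_alloc_folio() so that the VMA/task
 * policy resolved above is honoured.  example_alloc_anon_folio() is a
 * hypothetical name; the fault path already holds the mmap_lock.
 *
 *	static struct folio *example_alloc_anon_folio(struct vm_area_struct *vma,
 *						      unsigned long addr)
 *	{
 *		return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 *	}
 */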
2270f584b680SMatthew Wilcox (Oracle)
22711da177e4SLinus Torvalds /**
2272d7f946d0SMatthew Wilcox (Oracle) * alloc_pages - Allocate pages.
22736421ec76SMatthew Wilcox (Oracle) * @gfp: GFP flags.
22746421ec76SMatthew Wilcox (Oracle) * @order: Power of two of number of pages to allocate.
22751da177e4SLinus Torvalds *
22766421ec76SMatthew Wilcox (Oracle) * Allocate 1 << @order contiguous pages. The physical address of the
22776421ec76SMatthew Wilcox (Oracle) * first page is naturally aligned (eg an order-3 allocation will be aligned
22786421ec76SMatthew Wilcox (Oracle) * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
22796421ec76SMatthew Wilcox (Oracle) * process is honoured when in process context.
22801da177e4SLinus Torvalds *
22816421ec76SMatthew Wilcox (Oracle) * Context: Can be called from any context, providing the appropriate GFP
22826421ec76SMatthew Wilcox (Oracle) * flags are used.
22836421ec76SMatthew Wilcox (Oracle) * Return: The page on success or NULL if allocation fails.
22841da177e4SLinus Torvalds */
2285d7f946d0SMatthew Wilcox (Oracle) struct page *alloc_pages(gfp_t gfp, unsigned order)
22861da177e4SLinus Torvalds {
22878d90274bSOleg Nesterov struct mempolicy *pol = &default_policy;
2288c0ff7453SMiao Xie struct page *page;
22891da177e4SLinus Torvalds
22908d90274bSOleg Nesterov if (!in_interrupt() && !(gfp & __GFP_THISNODE))
22918d90274bSOleg Nesterov pol = get_task_policy(current);
229252cd3b07SLee Schermerhorn
229352cd3b07SLee Schermerhorn /*
229452cd3b07SLee Schermerhorn * No reference counting needed for current->mempolicy
229552cd3b07SLee Schermerhorn * nor system default_policy
229652cd3b07SLee Schermerhorn */
229745c4745aSLee Schermerhorn if (pol->mode == MPOL_INTERLEAVE)
2298c0ff7453SMiao Xie page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
22994c54d949SFeng Tang else if (pol->mode == MPOL_PREFERRED_MANY)
23004c54d949SFeng Tang page = alloc_pages_preferred_many(gfp, order,
2301c0455116SAneesh Kumar K.V policy_node(gfp, pol, numa_node_id()), pol);
2302c0ff7453SMiao Xie else
230384172f4bSMatthew Wilcox (Oracle) page = __alloc_pages(gfp, order,
230404ec6264SVlastimil Babka policy_node(gfp, pol, numa_node_id()),
23055c4b4be3SAndi Kleen policy_nodemask(gfp, pol));
2306cc9a6c87SMel Gorman
2307c0ff7453SMiao Xie return page;
23081da177e4SLinus Torvalds }
2309d7f946d0SMatthew Wilcox (Oracle) EXPORT_SYMBOL(alloc_pages);
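/*
 * Illustrative sketch, not part of this file: allocating and freeing an
 * order-2 block through the policy-aware interface above.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	(use the four contiguous pages, then)
 *	__free_pages(page, 2);
 */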
23101da177e4SLinus Torvalds
2311cc09cb13SMatthew Wilcox (Oracle) struct folio *folio_alloc(gfp_t gfp, unsigned order)
2312cc09cb13SMatthew Wilcox (Oracle) {
2313cc09cb13SMatthew Wilcox (Oracle) struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2314da6e7bf3SMatthew Wilcox (Oracle) struct folio *folio = (struct folio *)page;
2315cc09cb13SMatthew Wilcox (Oracle)
2316da6e7bf3SMatthew Wilcox (Oracle) if (folio && order > 1)
2317da6e7bf3SMatthew Wilcox (Oracle) folio_prep_large_rmappable(folio);
2318da6e7bf3SMatthew Wilcox (Oracle) return folio;
2319cc09cb13SMatthew Wilcox (Oracle) }
2320cc09cb13SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_alloc);
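/*
 * Illustrative sketch, not part of this file: folio_alloc() always adds
 * __GFP_COMP, so the result is a single compound folio and the matching
 * release is folio_put().
 *
 *	struct folio *folio = folio_alloc(GFP_KERNEL, 4);
 *
 *	if (folio)
 *		folio_put(folio);
 */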
2321cc09cb13SMatthew Wilcox (Oracle)
2322c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2323c00b6b96SChen Wandun struct mempolicy *pol, unsigned long nr_pages,
2324c00b6b96SChen Wandun struct page **page_array)
2325c00b6b96SChen Wandun {
2326c00b6b96SChen Wandun int nodes;
2327c00b6b96SChen Wandun unsigned long nr_pages_per_node;
2328c00b6b96SChen Wandun int delta;
2329c00b6b96SChen Wandun int i;
2330c00b6b96SChen Wandun unsigned long nr_allocated;
2331c00b6b96SChen Wandun unsigned long total_allocated = 0;
2332c00b6b96SChen Wandun
2333c00b6b96SChen Wandun nodes = nodes_weight(pol->nodes);
2334c00b6b96SChen Wandun nr_pages_per_node = nr_pages / nodes;
2335c00b6b96SChen Wandun delta = nr_pages - nodes * nr_pages_per_node;
2336c00b6b96SChen Wandun
2337c00b6b96SChen Wandun for (i = 0; i < nodes; i++) {
2338c00b6b96SChen Wandun if (delta) {
2339c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(gfp,
2340c00b6b96SChen Wandun interleave_nodes(pol), NULL,
2341c00b6b96SChen Wandun nr_pages_per_node + 1, NULL,
2342c00b6b96SChen Wandun page_array);
2343c00b6b96SChen Wandun delta--;
2344c00b6b96SChen Wandun } else {
2345c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(gfp,
2346c00b6b96SChen Wandun interleave_nodes(pol), NULL,
2347c00b6b96SChen Wandun nr_pages_per_node, NULL, page_array);
2348c00b6b96SChen Wandun }
2349c00b6b96SChen Wandun
2350c00b6b96SChen Wandun page_array += nr_allocated;
2351c00b6b96SChen Wandun total_allocated += nr_allocated;
2352c00b6b96SChen Wandun }
2353c00b6b96SChen Wandun
2354c00b6b96SChen Wandun return total_allocated;
2355c00b6b96SChen Wandun }
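/*
 * Worked example, a sketch rather than authoritative documentation: with
 * nodes_weight(pol->nodes) == 3 and nr_pages == 10, nr_pages_per_node is 3
 * and delta is 1, so the loop above asks the next three interleave nodes
 * for 4, 3 and 3 pages respectively, advancing page_array by however many
 * pages each bulk call actually returned.
 */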
2356c00b6b96SChen Wandun
2357c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2358c00b6b96SChen Wandun struct mempolicy *pol, unsigned long nr_pages,
2359c00b6b96SChen Wandun struct page **page_array)
2360c00b6b96SChen Wandun {
2361c00b6b96SChen Wandun gfp_t preferred_gfp;
2362c00b6b96SChen Wandun unsigned long nr_allocated = 0;
2363c00b6b96SChen Wandun
2364c00b6b96SChen Wandun preferred_gfp = gfp | __GFP_NOWARN;
2365c00b6b96SChen Wandun preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2366c00b6b96SChen Wandun
2367c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2368c00b6b96SChen Wandun nr_pages, NULL, page_array);
2369c00b6b96SChen Wandun
2370c00b6b96SChen Wandun if (nr_allocated < nr_pages)
2371c00b6b96SChen Wandun nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2372c00b6b96SChen Wandun nr_pages - nr_allocated, NULL,
2373c00b6b96SChen Wandun page_array + nr_allocated);
2374c00b6b96SChen Wandun return nr_allocated;
2375c00b6b96SChen Wandun }
2376c00b6b96SChen Wandun
2377c00b6b96SChen Wandun /* Allocating pages in bulk needs to take the mempolicy into account at the
2378c00b6b96SChen Wandun * same time, in situations such as vmalloc.
2379c00b6b96SChen Wandun *
2380c00b6b96SChen Wandun * This can speed up memory allocation considerably, especially for
2381c00b6b96SChen Wandun * interleaved allocations.
2382c00b6b96SChen Wandun */
2383c00b6b96SChen Wandun unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2384c00b6b96SChen Wandun unsigned long nr_pages, struct page **page_array)
2385c00b6b96SChen Wandun {
2386c00b6b96SChen Wandun struct mempolicy *pol = &default_policy;
2387c00b6b96SChen Wandun
2388c00b6b96SChen Wandun if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2389c00b6b96SChen Wandun pol = get_task_policy(current);
2390c00b6b96SChen Wandun
2391c00b6b96SChen Wandun if (pol->mode == MPOL_INTERLEAVE)
2392c00b6b96SChen Wandun return alloc_pages_bulk_array_interleave(gfp, pol,
2393c00b6b96SChen Wandun nr_pages, page_array);
2394c00b6b96SChen Wandun
2395c00b6b96SChen Wandun if (pol->mode == MPOL_PREFERRED_MANY)
2396c00b6b96SChen Wandun return alloc_pages_bulk_array_preferred_many(gfp,
2397c00b6b96SChen Wandun numa_node_id(), pol, nr_pages, page_array);
2398c00b6b96SChen Wandun
2399c00b6b96SChen Wandun return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2400c00b6b96SChen Wandun policy_nodemask(gfp, pol), nr_pages, NULL,
2401c00b6b96SChen Wandun page_array);
2402c00b6b96SChen Wandun }
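/*
 * Illustrative sketch, not part of this file: a vmalloc-style caller filling
 * a page array in one shot; 'nr' and the error handling are hypothetical.
 *
 *	struct page **pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
 *	if (got < nr)
 *		(allocate the remaining nr - got pages individually)
 */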
2403c00b6b96SChen Wandun
2404ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2405ef0855d3SOleg Nesterov {
2406ef0855d3SOleg Nesterov struct mempolicy *pol = mpol_dup(vma_policy(src));
2407ef0855d3SOleg Nesterov
2408ef0855d3SOleg Nesterov if (IS_ERR(pol))
2409ef0855d3SOleg Nesterov return PTR_ERR(pol);
2410ef0855d3SOleg Nesterov dst->vm_policy = pol;
2411ef0855d3SOleg Nesterov return 0;
2412ef0855d3SOleg Nesterov }
2413ef0855d3SOleg Nesterov
24144225399aSPaul Jackson /*
2415846a16bfSLee Schermerhorn * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
24164225399aSPaul Jackson * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
24174225399aSPaul Jackson * with the mems_allowed returned by cpuset_mems_allowed(). This
24184225399aSPaul Jackson * keeps mempolicies cpuset relative after its cpuset moves. See
24194225399aSPaul Jackson * further kernel/cpuset.c update_nodemask().
2420708c1bbcSMiao Xie *
2421708c1bbcSMiao Xie * current's mempolicy may be rebound by another task (the task that changes
2422708c1bbcSMiao Xie * the cpuset's mems), so we needn't do the rebind work for the current task.
24234225399aSPaul Jackson */
24244225399aSPaul Jackson
2425846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
2426846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
24271da177e4SLinus Torvalds {
24281da177e4SLinus Torvalds struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
24291da177e4SLinus Torvalds
24301da177e4SLinus Torvalds if (!new)
24311da177e4SLinus Torvalds return ERR_PTR(-ENOMEM);
2432708c1bbcSMiao Xie
2433708c1bbcSMiao Xie /* task's mempolicy is protected by alloc_lock */
2434708c1bbcSMiao Xie if (old == current->mempolicy) {
2435708c1bbcSMiao Xie task_lock(current);
2436708c1bbcSMiao Xie *new = *old;
2437708c1bbcSMiao Xie task_unlock(current);
2438708c1bbcSMiao Xie } else
2439708c1bbcSMiao Xie *new = *old;
2440708c1bbcSMiao Xie
24414225399aSPaul Jackson if (current_cpuset_is_being_rebound()) {
24424225399aSPaul Jackson nodemask_t mems = cpuset_mems_allowed(current);
2443213980c0SVlastimil Babka mpol_rebind_policy(new, &mems);
24444225399aSPaul Jackson }
24451da177e4SLinus Torvalds atomic_set(&new->refcnt, 1);
24461da177e4SLinus Torvalds return new;
24471da177e4SLinus Torvalds }
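/*
 * Illustrative sketch, not part of this file: callers normally go through
 * the mpol_dup() wrapper (as vma_dup_policy() above does) and must balance
 * the copy with mpol_put().
 *
 *	struct mempolicy *copy = mpol_dup(vma_policy(vma));
 *
 *	if (IS_ERR(copy))
 *		return PTR_ERR(copy);
 *	(use copy, then)
 *	mpol_put(copy);
 */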
24481da177e4SLinus Torvalds
24491da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
2450fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
24511da177e4SLinus Torvalds {
24521da177e4SLinus Torvalds if (!a || !b)
2453fcfb4dccSKOSAKI Motohiro return false;
245445c4745aSLee Schermerhorn if (a->mode != b->mode)
2455fcfb4dccSKOSAKI Motohiro return false;
245619800502SBob Liu if (a->flags != b->flags)
2457fcfb4dccSKOSAKI Motohiro return false;
2458c6018b4bSAneesh Kumar K.V if (a->home_node != b->home_node)
2459c6018b4bSAneesh Kumar K.V return false;
246019800502SBob Liu if (mpol_store_user_nodemask(a))
246119800502SBob Liu if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2462fcfb4dccSKOSAKI Motohiro return false;
246319800502SBob Liu
246445c4745aSLee Schermerhorn switch (a->mode) {
246519770b32SMel Gorman case MPOL_BIND:
24661da177e4SLinus Torvalds case MPOL_INTERLEAVE:
24671da177e4SLinus Torvalds case MPOL_PREFERRED:
2468b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
2469269fbe72SBen Widawsky return !!nodes_equal(a->nodes, b->nodes);
24707858d7bcSFeng Tang case MPOL_LOCAL:
24717858d7bcSFeng Tang return true;
24721da177e4SLinus Torvalds default:
24731da177e4SLinus Torvalds BUG();
2474fcfb4dccSKOSAKI Motohiro return false;
24751da177e4SLinus Torvalds }
24761da177e4SLinus Torvalds }
24771da177e4SLinus Torvalds
24781da177e4SLinus Torvalds /*
24791da177e4SLinus Torvalds * Shared memory backing store policy support.
24801da177e4SLinus Torvalds *
24811da177e4SLinus Torvalds * Remember policies even when nobody has shared memory mapped.
24821da177e4SLinus Torvalds * The policies are kept in Red-Black tree linked from the inode.
24834a8c7bb5SNathan Zimmer * They are protected by the sp->lock rwlock, which should be held
24841da177e4SLinus Torvalds * for any accesses to the tree.
24851da177e4SLinus Torvalds */
24861da177e4SLinus Torvalds
24874a8c7bb5SNathan Zimmer /*
24884a8c7bb5SNathan Zimmer * lookup first element intersecting start-end. Caller holds sp->lock for
24894a8c7bb5SNathan Zimmer * reading or for writing
24904a8c7bb5SNathan Zimmer */
24911da177e4SLinus Torvalds static struct sp_node *
24921da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
24931da177e4SLinus Torvalds {
24941da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node;
24951da177e4SLinus Torvalds
24961da177e4SLinus Torvalds while (n) {
24971da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd);
24981da177e4SLinus Torvalds
24991da177e4SLinus Torvalds if (start >= p->end)
25001da177e4SLinus Torvalds n = n->rb_right;
25011da177e4SLinus Torvalds else if (end <= p->start)
25021da177e4SLinus Torvalds n = n->rb_left;
25031da177e4SLinus Torvalds else
25041da177e4SLinus Torvalds break;
25051da177e4SLinus Torvalds }
25061da177e4SLinus Torvalds if (!n)
25071da177e4SLinus Torvalds return NULL;
25081da177e4SLinus Torvalds for (;;) {
25091da177e4SLinus Torvalds struct sp_node *w = NULL;
25101da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n);
25111da177e4SLinus Torvalds if (!prev)
25121da177e4SLinus Torvalds break;
25131da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd);
25141da177e4SLinus Torvalds if (w->end <= start)
25151da177e4SLinus Torvalds break;
25161da177e4SLinus Torvalds n = prev;
25171da177e4SLinus Torvalds }
25181da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd);
25191da177e4SLinus Torvalds }
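/*
 * Worked example, a sketch rather than authoritative documentation: with
 * stored ranges [2,4) and [4,8), sp_lookup(sp, 3, 6) first descends to some
 * intersecting node -- possibly [4,8) -- and the backward walk above then
 * steps to the earliest node still overlapping the start, so [2,4) is
 * returned.
 */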
25201da177e4SLinus Torvalds
25214a8c7bb5SNathan Zimmer /*
25224a8c7bb5SNathan Zimmer * Insert a new shared policy into the list. Caller holds sp->lock for
25234a8c7bb5SNathan Zimmer * writing.
25244a8c7bb5SNathan Zimmer */
25251da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
25261da177e4SLinus Torvalds {
25271da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node;
25281da177e4SLinus Torvalds struct rb_node *parent = NULL;
25291da177e4SLinus Torvalds struct sp_node *nd;
25301da177e4SLinus Torvalds
25311da177e4SLinus Torvalds while (*p) {
25321da177e4SLinus Torvalds parent = *p;
25331da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd);
25341da177e4SLinus Torvalds if (new->start < nd->start)
25351da177e4SLinus Torvalds p = &(*p)->rb_left;
25361da177e4SLinus Torvalds else if (new->end > nd->end)
25371da177e4SLinus Torvalds p = &(*p)->rb_right;
25381da177e4SLinus Torvalds else
25391da177e4SLinus Torvalds BUG();
25401da177e4SLinus Torvalds }
25411da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p);
25421da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root);
2543140d5a49SPaul Mundt pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
254445c4745aSLee Schermerhorn new->policy ? new->policy->mode : 0);
25451da177e4SLinus Torvalds }
25461da177e4SLinus Torvalds
25471da177e4SLinus Torvalds /* Find shared policy intersecting idx */
25481da177e4SLinus Torvalds struct mempolicy *
25491da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
25501da177e4SLinus Torvalds {
25511da177e4SLinus Torvalds struct mempolicy *pol = NULL;
25521da177e4SLinus Torvalds struct sp_node *sn;
25531da177e4SLinus Torvalds
25541da177e4SLinus Torvalds if (!sp->root.rb_node)
25551da177e4SLinus Torvalds return NULL;
25564a8c7bb5SNathan Zimmer read_lock(&sp->lock);
25571da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1);
25581da177e4SLinus Torvalds if (sn) {
25591da177e4SLinus Torvalds mpol_get(sn->policy);
25601da177e4SLinus Torvalds pol = sn->policy;
25611da177e4SLinus Torvalds }
25624a8c7bb5SNathan Zimmer read_unlock(&sp->lock);
25631da177e4SLinus Torvalds return pol;
25641da177e4SLinus Torvalds }
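/*
 * Illustrative sketch, not part of this file: a shmem-style caller looks up
 * the policy for a file page index and drops the reference taken by the
 * mpol_get() above when done; 'info' standing for the inode's shared policy
 * is hypothetical here.
 *
 *	struct mempolicy *pol = mpol_shared_policy_lookup(&info->policy, index);
 *
 *	(allocate the page according to pol, or the default policy if NULL)
 *	if (pol)
 *		mpol_put(pol);
 */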
25651da177e4SLinus Torvalds
256663f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n)
256763f74ca2SKOSAKI Motohiro {
256863f74ca2SKOSAKI Motohiro mpol_put(n->policy);
256963f74ca2SKOSAKI Motohiro kmem_cache_free(sn_cache, n);
257063f74ca2SKOSAKI Motohiro }
257163f74ca2SKOSAKI Motohiro
2572771fb4d8SLee Schermerhorn /**
2573771fb4d8SLee Schermerhorn * mpol_misplaced - check whether current page node is valid in policy
2574771fb4d8SLee Schermerhorn *
2575b46e14acSFabian Frederick * @page: page to be checked
2576b46e14acSFabian Frederick * @vma: vm area where page mapped
2577b46e14acSFabian Frederick * @addr: virtual address where page mapped
2578771fb4d8SLee Schermerhorn *
2579771fb4d8SLee Schermerhorn * Look up the current policy node id for vma,addr and compare it to the
25805f076944SMatthew Wilcox (Oracle) * page's node id. Policy determination "mimics" alloc_page_vma().
2581771fb4d8SLee Schermerhorn * Called from fault path where we know the vma and faulting address.
25825f076944SMatthew Wilcox (Oracle) *
2583062db293SBaolin Wang * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2584062db293SBaolin Wang * policy, or a suitable node ID to allocate a replacement page from.
2585771fb4d8SLee Schermerhorn */
2586771fb4d8SLee Schermerhorn int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2587771fb4d8SLee Schermerhorn {
2588771fb4d8SLee Schermerhorn struct mempolicy *pol;
2589c33d6c06SMel Gorman struct zoneref *z;
2590771fb4d8SLee Schermerhorn int curnid = page_to_nid(page);
2591771fb4d8SLee Schermerhorn unsigned long pgoff;
259290572890SPeter Zijlstra int thiscpu = raw_smp_processor_id();
259390572890SPeter Zijlstra int thisnid = cpu_to_node(thiscpu);
259498fa15f3SAnshuman Khandual int polnid = NUMA_NO_NODE;
2595062db293SBaolin Wang int ret = NUMA_NO_NODE;
2596771fb4d8SLee Schermerhorn
2597dd6eecb9SOleg Nesterov pol = get_vma_policy(vma, addr);
2598771fb4d8SLee Schermerhorn if (!(pol->flags & MPOL_F_MOF))
2599771fb4d8SLee Schermerhorn goto out;
2600771fb4d8SLee Schermerhorn
2601771fb4d8SLee Schermerhorn switch (pol->mode) {
2602771fb4d8SLee Schermerhorn case MPOL_INTERLEAVE:
2603771fb4d8SLee Schermerhorn pgoff = vma->vm_pgoff;
2604771fb4d8SLee Schermerhorn pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
260598c70baaSLaurent Dufour polnid = offset_il_node(pol, pgoff);
2606771fb4d8SLee Schermerhorn break;
2607771fb4d8SLee Schermerhorn
2608771fb4d8SLee Schermerhorn case MPOL_PREFERRED:
2609b27abaccSDave Hansen if (node_isset(curnid, pol->nodes))
2610b27abaccSDave Hansen goto out;
2611269fbe72SBen Widawsky polnid = first_node(pol->nodes);
2612771fb4d8SLee Schermerhorn break;
2613771fb4d8SLee Schermerhorn
26147858d7bcSFeng Tang case MPOL_LOCAL:
26157858d7bcSFeng Tang polnid = numa_node_id();
26167858d7bcSFeng Tang break;
26177858d7bcSFeng Tang
2618771fb4d8SLee Schermerhorn case MPOL_BIND:
2619bda420b9SHuang Ying /* Optimize placement among multiple nodes via NUMA balancing */
2620bda420b9SHuang Ying if (pol->flags & MPOL_F_MORON) {
2621269fbe72SBen Widawsky if (node_isset(thisnid, pol->nodes))
2622bda420b9SHuang Ying break;
2623bda420b9SHuang Ying goto out;
2624bda420b9SHuang Ying }
2625b27abaccSDave Hansen fallthrough;
2626c33d6c06SMel Gorman
2627b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
2628771fb4d8SLee Schermerhorn /*
2629771fb4d8SLee Schermerhorn * use current page if in policy nodemask,
2630771fb4d8SLee Schermerhorn * else select nearest allowed node, if any.
2631771fb4d8SLee Schermerhorn * If no allowed nodes, use current [!misplaced].
2632771fb4d8SLee Schermerhorn */
2633269fbe72SBen Widawsky if (node_isset(curnid, pol->nodes))
2634771fb4d8SLee Schermerhorn goto out;
2635c33d6c06SMel Gorman z = first_zones_zonelist(
2636771fb4d8SLee Schermerhorn node_zonelist(numa_node_id(), GFP_HIGHUSER),
2637771fb4d8SLee Schermerhorn gfp_zone(GFP_HIGHUSER),
2638269fbe72SBen Widawsky &pol->nodes);
2639c1093b74SPavel Tatashin polnid = zone_to_nid(z->zone);
2640771fb4d8SLee Schermerhorn break;
2641771fb4d8SLee Schermerhorn
2642771fb4d8SLee Schermerhorn default:
2643771fb4d8SLee Schermerhorn BUG();
2644771fb4d8SLee Schermerhorn }
26455606e387SMel Gorman
26465606e387SMel Gorman /* Migrate the page towards the node whose CPU is referencing it */
2647e42c8ff2SMel Gorman if (pol->flags & MPOL_F_MORON) {
264890572890SPeter Zijlstra polnid = thisnid;
26495606e387SMel Gorman
265010f39042SRik van Riel if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2651de1c9ce6SRik van Riel goto out;
2652de1c9ce6SRik van Riel }
2653e42c8ff2SMel Gorman
2654771fb4d8SLee Schermerhorn if (curnid != polnid)
2655771fb4d8SLee Schermerhorn ret = polnid;
2656771fb4d8SLee Schermerhorn out:
2657771fb4d8SLee Schermerhorn mpol_cond_put(pol);
2658771fb4d8SLee Schermerhorn
2659771fb4d8SLee Schermerhorn return ret;
2660771fb4d8SLee Schermerhorn }
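/*
 * Illustrative sketch, not part of this file: a NUMA hinting fault handler
 * (do_numa_page()-style code) consumes the return value roughly like this;
 * the migration step itself is only indicated.
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid == NUMA_NO_NODE)
 *		(page is already on an acceptable node, nothing to do)
 *	else
 *		(try to migrate the page towards target_nid)
 */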
2661771fb4d8SLee Schermerhorn
2662c11600e4SDavid Rientjes /*
2663c11600e4SDavid Rientjes * Drop the (possibly final) reference to task->mempolicy. It needs to be
2664c11600e4SDavid Rientjes * dropped after task->mempolicy is set to NULL so that any allocation done as
2665c11600e4SDavid Rientjes * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2666c11600e4SDavid Rientjes * policy.
2667c11600e4SDavid Rientjes */
2668c11600e4SDavid Rientjes void mpol_put_task_policy(struct task_struct *task)
2669c11600e4SDavid Rientjes {
2670c11600e4SDavid Rientjes struct mempolicy *pol;
2671c11600e4SDavid Rientjes
2672c11600e4SDavid Rientjes task_lock(task);
2673c11600e4SDavid Rientjes pol = task->mempolicy;
2674c11600e4SDavid Rientjes task->mempolicy = NULL;
2675c11600e4SDavid Rientjes task_unlock(task);
2676c11600e4SDavid Rientjes mpol_put(pol);
2677c11600e4SDavid Rientjes }
2678c11600e4SDavid Rientjes
26791da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
26801da177e4SLinus Torvalds {
2681140d5a49SPaul Mundt pr_debug("deleting %lx-%lx\n", n->start, n->end);
26821da177e4SLinus Torvalds rb_erase(&n->nd, &sp->root);
268363f74ca2SKOSAKI Motohiro sp_free(n);
26841da177e4SLinus Torvalds }
26851da177e4SLinus Torvalds
268642288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
268742288fe3SMel Gorman unsigned long end, struct mempolicy *pol)
268842288fe3SMel Gorman {
268942288fe3SMel Gorman node->start = start;
269042288fe3SMel Gorman node->end = end;
269142288fe3SMel Gorman node->policy = pol;
269242288fe3SMel Gorman }
269342288fe3SMel Gorman
2694dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2695dbcb0f19SAdrian Bunk struct mempolicy *pol)
26961da177e4SLinus Torvalds {
2697869833f2SKOSAKI Motohiro struct sp_node *n;
2698869833f2SKOSAKI Motohiro struct mempolicy *newpol;
26991da177e4SLinus Torvalds
2700869833f2SKOSAKI Motohiro n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
27011da177e4SLinus Torvalds if (!n)
27021da177e4SLinus Torvalds return NULL;
2703869833f2SKOSAKI Motohiro
2704869833f2SKOSAKI Motohiro newpol = mpol_dup(pol);
2705869833f2SKOSAKI Motohiro if (IS_ERR(newpol)) {
2706869833f2SKOSAKI Motohiro kmem_cache_free(sn_cache, n);
2707869833f2SKOSAKI Motohiro return NULL;
2708869833f2SKOSAKI Motohiro }
2709869833f2SKOSAKI Motohiro newpol->flags |= MPOL_F_SHARED;
271042288fe3SMel Gorman sp_node_init(n, start, end, newpol);
2711869833f2SKOSAKI Motohiro
27121da177e4SLinus Torvalds return n;
27131da177e4SLinus Torvalds }
27141da177e4SLinus Torvalds
27151da177e4SLinus Torvalds /* Replace a policy range. */
27161da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
27171da177e4SLinus Torvalds unsigned long end, struct sp_node *new)
27181da177e4SLinus Torvalds {
2719b22d127aSMel Gorman struct sp_node *n;
272042288fe3SMel Gorman struct sp_node *n_new = NULL;
272142288fe3SMel Gorman struct mempolicy *mpol_new = NULL;
2722b22d127aSMel Gorman int ret = 0;
27231da177e4SLinus Torvalds
272442288fe3SMel Gorman restart:
27254a8c7bb5SNathan Zimmer write_lock(&sp->lock);
27261da177e4SLinus Torvalds n = sp_lookup(sp, start, end);
27271da177e4SLinus Torvalds /* Take care of old policies in the same range. */
27281da177e4SLinus Torvalds while (n && n->start < end) {
27291da177e4SLinus Torvalds struct rb_node *next = rb_next(&n->nd);
27301da177e4SLinus Torvalds if (n->start >= start) {
27311da177e4SLinus Torvalds if (n->end <= end)
27321da177e4SLinus Torvalds sp_delete(sp, n);
27331da177e4SLinus Torvalds else
27341da177e4SLinus Torvalds n->start = end;
27351da177e4SLinus Torvalds } else {
27361da177e4SLinus Torvalds /* Old policy spanning whole new range. */
27371da177e4SLinus Torvalds if (n->end > end) {
273842288fe3SMel Gorman if (!n_new)
273942288fe3SMel Gorman goto alloc_new;
274042288fe3SMel Gorman
274142288fe3SMel Gorman *mpol_new = *n->policy;
274242288fe3SMel Gorman atomic_set(&mpol_new->refcnt, 1);
27437880639cSKOSAKI Motohiro sp_node_init(n_new, end, n->end, mpol_new);
27441da177e4SLinus Torvalds n->end = start;
27455ca39575SHillf Danton sp_insert(sp, n_new);
274642288fe3SMel Gorman n_new = NULL;
274742288fe3SMel Gorman mpol_new = NULL;
27481da177e4SLinus Torvalds break;
27491da177e4SLinus Torvalds } else
27501da177e4SLinus Torvalds n->end = start;
27511da177e4SLinus Torvalds }
27521da177e4SLinus Torvalds if (!next)
27531da177e4SLinus Torvalds break;
27541da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd);
27551da177e4SLinus Torvalds }
27561da177e4SLinus Torvalds if (new)
27571da177e4SLinus Torvalds sp_insert(sp, new);
27584a8c7bb5SNathan Zimmer write_unlock(&sp->lock);
275942288fe3SMel Gorman ret = 0;
276042288fe3SMel Gorman
276142288fe3SMel Gorman err_out:
276242288fe3SMel Gorman if (mpol_new)
276342288fe3SMel Gorman mpol_put(mpol_new);
276442288fe3SMel Gorman if (n_new)
276542288fe3SMel Gorman kmem_cache_free(sn_cache, n_new);
276642288fe3SMel Gorman
2767b22d127aSMel Gorman return ret;
276842288fe3SMel Gorman
276942288fe3SMel Gorman alloc_new:
27704a8c7bb5SNathan Zimmer write_unlock(&sp->lock);
277142288fe3SMel Gorman ret = -ENOMEM;
277242288fe3SMel Gorman n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
277342288fe3SMel Gorman if (!n_new)
277442288fe3SMel Gorman goto err_out;
277542288fe3SMel Gorman mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277642288fe3SMel Gorman if (!mpol_new)
277742288fe3SMel Gorman goto err_out;
27784ad09955SMiaohe Lin atomic_set(&mpol_new->refcnt, 1);
277942288fe3SMel Gorman goto restart;
27801da177e4SLinus Torvalds }
27811da177e4SLinus Torvalds
278271fe804bSLee Schermerhorn /**
278371fe804bSLee Schermerhorn * mpol_shared_policy_init - initialize shared policy for inode
278471fe804bSLee Schermerhorn * @sp: pointer to inode shared policy
278571fe804bSLee Schermerhorn * @mpol: struct mempolicy to install
278671fe804bSLee Schermerhorn *
278771fe804bSLee Schermerhorn * Install non-NULL @mpol in inode's shared policy rb-tree.
278871fe804bSLee Schermerhorn * On entry, the current task has a reference on a non-NULL @mpol.
278971fe804bSLee Schermerhorn * This must be released on exit.
27904bfc4495SKAMEZAWA Hiroyuki * This is called at get_inode() calls and we can use GFP_KERNEL.
279171fe804bSLee Schermerhorn */
279271fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
27937339ff83SRobin Holt {
279458568d2aSMiao Xie int ret;
279558568d2aSMiao Xie
279671fe804bSLee Schermerhorn sp->root = RB_ROOT; /* empty tree == default mempolicy */
27974a8c7bb5SNathan Zimmer rwlock_init(&sp->lock);
27987339ff83SRobin Holt
279971fe804bSLee Schermerhorn if (mpol) {
28007339ff83SRobin Holt struct vm_area_struct pvma;
280171fe804bSLee Schermerhorn struct mempolicy *new;
28024bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch);
28037339ff83SRobin Holt
28044bfc4495SKAMEZAWA Hiroyuki if (!scratch)
28055c0c1654SLee Schermerhorn goto put_mpol;
280671fe804bSLee Schermerhorn /* contextualize the tmpfs mount point mempolicy */
280771fe804bSLee Schermerhorn new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
280815d77835SLee Schermerhorn if (IS_ERR(new))
28090cae3457SDan Carpenter goto free_scratch; /* no valid nodemask intersection */
281058568d2aSMiao Xie
281158568d2aSMiao Xie task_lock(current);
28124bfc4495SKAMEZAWA Hiroyuki ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
281358568d2aSMiao Xie task_unlock(current);
281415d77835SLee Schermerhorn if (ret)
28155c0c1654SLee Schermerhorn goto put_new;
281671fe804bSLee Schermerhorn
281771fe804bSLee Schermerhorn /* Create pseudo-vma that contains just the policy */
28182c4541e2SKirill A. Shutemov vma_init(&pvma, NULL);
281971fe804bSLee Schermerhorn pvma.vm_end = TASK_SIZE; /* policy covers entire file */
282071fe804bSLee Schermerhorn mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
282115d77835SLee Schermerhorn
28225c0c1654SLee Schermerhorn put_new:
282371fe804bSLee Schermerhorn mpol_put(new); /* drop initial ref */
28240cae3457SDan Carpenter free_scratch:
28254bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch);
28265c0c1654SLee Schermerhorn put_mpol:
28275c0c1654SLee Schermerhorn mpol_put(mpol); /* drop our incoming ref on sb mpol */
28287339ff83SRobin Holt }
28297339ff83SRobin Holt }
28307339ff83SRobin Holt
28311da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
28321da177e4SLinus Torvalds struct vm_area_struct *vma, struct mempolicy *npol)
28331da177e4SLinus Torvalds {
28341da177e4SLinus Torvalds int err;
28351da177e4SLinus Torvalds struct sp_node *new = NULL;
28361da177e4SLinus Torvalds unsigned long sz = vma_pages(vma);
28371da177e4SLinus Torvalds
2838028fec41SDavid Rientjes pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
28391da177e4SLinus Torvalds vma->vm_pgoff,
284045c4745aSLee Schermerhorn sz, npol ? npol->mode : -1,
2841028fec41SDavid Rientjes npol ? npol->flags : -1,
2842269fbe72SBen Widawsky npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
28431da177e4SLinus Torvalds
28441da177e4SLinus Torvalds if (npol) {
28451da177e4SLinus Torvalds new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
28461da177e4SLinus Torvalds if (!new)
28471da177e4SLinus Torvalds return -ENOMEM;
28481da177e4SLinus Torvalds }
28491da177e4SLinus Torvalds err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
28501da177e4SLinus Torvalds if (err && new)
285163f74ca2SKOSAKI Motohiro sp_free(new);
28521da177e4SLinus Torvalds return err;
28531da177e4SLinus Torvalds }
28541da177e4SLinus Torvalds
28551da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
28561da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
28571da177e4SLinus Torvalds {
28581da177e4SLinus Torvalds struct sp_node *n;
28591da177e4SLinus Torvalds struct rb_node *next;
28601da177e4SLinus Torvalds
28611da177e4SLinus Torvalds if (!p->root.rb_node)
28621da177e4SLinus Torvalds return;
28634a8c7bb5SNathan Zimmer write_lock(&p->lock);
28641da177e4SLinus Torvalds next = rb_first(&p->root);
28651da177e4SLinus Torvalds while (next) {
28661da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd);
28671da177e4SLinus Torvalds next = rb_next(&n->nd);
286863f74ca2SKOSAKI Motohiro sp_delete(p, n);
28691da177e4SLinus Torvalds }
28704a8c7bb5SNathan Zimmer write_unlock(&p->lock);
28711da177e4SLinus Torvalds }
28721da177e4SLinus Torvalds
28731a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
2874c297663cSMel Gorman static int __initdata numabalancing_override;
28751a687c2eSMel Gorman
28761a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
28771a687c2eSMel Gorman {
28781a687c2eSMel Gorman bool numabalancing_default = false;
28791a687c2eSMel Gorman
28801a687c2eSMel Gorman if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
28811a687c2eSMel Gorman numabalancing_default = true;
28821a687c2eSMel Gorman
2883c297663cSMel Gorman /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2884c297663cSMel Gorman if (numabalancing_override)
2885c297663cSMel Gorman set_numabalancing_state(numabalancing_override == 1);
2886c297663cSMel Gorman
2887b0dc2b9bSMel Gorman if (num_online_nodes() > 1 && !numabalancing_override) {
2888756a025fSJoe Perches pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2889c297663cSMel Gorman numabalancing_default ? "Enabling" : "Disabling");
28901a687c2eSMel Gorman set_numabalancing_state(numabalancing_default);
28911a687c2eSMel Gorman }
28921a687c2eSMel Gorman }
28931a687c2eSMel Gorman
28941a687c2eSMel Gorman static int __init setup_numabalancing(char *str)
28951a687c2eSMel Gorman {
28961a687c2eSMel Gorman int ret = 0;
28971a687c2eSMel Gorman if (!str)
28981a687c2eSMel Gorman goto out;
28991a687c2eSMel Gorman
29001a687c2eSMel Gorman if (!strcmp(str, "enable")) {
2901c297663cSMel Gorman numabalancing_override = 1;
29021a687c2eSMel Gorman ret = 1;
29031a687c2eSMel Gorman } else if (!strcmp(str, "disable")) {
2904c297663cSMel Gorman numabalancing_override = -1;
29051a687c2eSMel Gorman ret = 1;
29061a687c2eSMel Gorman }
29071a687c2eSMel Gorman out:
29081a687c2eSMel Gorman if (!ret)
29094a404beaSAndrew Morton pr_warn("Unable to parse numa_balancing=\n");
29101a687c2eSMel Gorman
29111a687c2eSMel Gorman return ret;
29121a687c2eSMel Gorman }
29131a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing);
29141a687c2eSMel Gorman #else
29151a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void)
29161a687c2eSMel Gorman {
29171a687c2eSMel Gorman }
29181a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */
29191a687c2eSMel Gorman
29201da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
29211da177e4SLinus Torvalds void __init numa_policy_init(void)
29221da177e4SLinus Torvalds {
2923b71636e2SPaul Mundt nodemask_t interleave_nodes;
2924b71636e2SPaul Mundt unsigned long largest = 0;
2925b71636e2SPaul Mundt int nid, prefer = 0;
2926b71636e2SPaul Mundt
29271da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy",
29281da177e4SLinus Torvalds sizeof(struct mempolicy),
292920c2df83SPaul Mundt 0, SLAB_PANIC, NULL);
29301da177e4SLinus Torvalds
29311da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node",
29321da177e4SLinus Torvalds sizeof(struct sp_node),
293320c2df83SPaul Mundt 0, SLAB_PANIC, NULL);
29341da177e4SLinus Torvalds
29355606e387SMel Gorman for_each_node(nid) {
29365606e387SMel Gorman preferred_node_policy[nid] = (struct mempolicy) {
29375606e387SMel Gorman .refcnt = ATOMIC_INIT(1),
29385606e387SMel Gorman .mode = MPOL_PREFERRED,
29395606e387SMel Gorman .flags = MPOL_F_MOF | MPOL_F_MORON,
2940269fbe72SBen Widawsky .nodes = nodemask_of_node(nid),
29415606e387SMel Gorman };
29425606e387SMel Gorman }
29435606e387SMel Gorman
2944b71636e2SPaul Mundt /*
2945b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only
2946b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB),
2947b71636e2SPaul Mundt * falling back to the largest node if they're all smaller.
2948b71636e2SPaul Mundt */
2949b71636e2SPaul Mundt nodes_clear(interleave_nodes);
295001f13bd6SLai Jiangshan for_each_node_state(nid, N_MEMORY) {
2951b71636e2SPaul Mundt unsigned long total_pages = node_present_pages(nid);
29521da177e4SLinus Torvalds
2953b71636e2SPaul Mundt /* Preserve the largest node */
2954b71636e2SPaul Mundt if (largest < total_pages) {
2955b71636e2SPaul Mundt largest = total_pages;
2956b71636e2SPaul Mundt prefer = nid;
2957b71636e2SPaul Mundt }
2958b71636e2SPaul Mundt
2959b71636e2SPaul Mundt /* Interleave this node? */
2960b71636e2SPaul Mundt if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2961b71636e2SPaul Mundt node_set(nid, interleave_nodes);
2962b71636e2SPaul Mundt }
2963b71636e2SPaul Mundt
2964b71636e2SPaul Mundt /* All too small, use the largest */
2965b71636e2SPaul Mundt if (unlikely(nodes_empty(interleave_nodes)))
2966b71636e2SPaul Mundt node_set(prefer, interleave_nodes);
2967b71636e2SPaul Mundt
2968028fec41SDavid Rientjes if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2969b1de0d13SMitchel Humpherys pr_err("%s: interleaving failed\n", __func__);
29701a687c2eSMel Gorman
29711a687c2eSMel Gorman check_numabalancing_enable();
29721da177e4SLinus Torvalds }
29731da177e4SLinus Torvalds
29748bccd85fSChristoph Lameter /* Reset policy of current process to default */
29751da177e4SLinus Torvalds void numa_default_policy(void)
29761da177e4SLinus Torvalds {
2977028fec41SDavid Rientjes do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
29781da177e4SLinus Torvalds }
297968860ec1SPaul Jackson
29804225399aSPaul Jackson /*
2981095f1fc4SLee Schermerhorn * Parse and format mempolicy from/to strings
2982095f1fc4SLee Schermerhorn */
2983095f1fc4SLee Schermerhorn
2984345ace9cSLee Schermerhorn static const char * const policy_modes[] =
2985345ace9cSLee Schermerhorn {
2986345ace9cSLee Schermerhorn [MPOL_DEFAULT] = "default",
2987345ace9cSLee Schermerhorn [MPOL_PREFERRED] = "prefer",
2988345ace9cSLee Schermerhorn [MPOL_BIND] = "bind",
2989345ace9cSLee Schermerhorn [MPOL_INTERLEAVE] = "interleave",
2990d3a71033SLee Schermerhorn [MPOL_LOCAL] = "local",
2991b27abaccSDave Hansen [MPOL_PREFERRED_MANY] = "prefer (many)",
2992345ace9cSLee Schermerhorn };
29931a75a6c8SChristoph Lameter
2994095f1fc4SLee Schermerhorn
2995095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS
2996095f1fc4SLee Schermerhorn /**
2997f2a07f40SHugh Dickins * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2998095f1fc4SLee Schermerhorn * @str: string containing mempolicy to parse
299971fe804bSLee Schermerhorn * @mpol: pointer to struct mempolicy pointer, returned on success.
3000095f1fc4SLee Schermerhorn *
3001095f1fc4SLee Schermerhorn * Format of input:
3002095f1fc4SLee Schermerhorn * <mode>[=<flags>][:<nodelist>]
3003095f1fc4SLee Schermerhorn *
3004dad5b023SRandy Dunlap * Return: %0 on success, else %1
3005095f1fc4SLee Schermerhorn */
3006a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol)
3007095f1fc4SLee Schermerhorn {
300871fe804bSLee Schermerhorn struct mempolicy *new = NULL;
3009f2a07f40SHugh Dickins unsigned short mode_flags;
301071fe804bSLee Schermerhorn nodemask_t nodes;
3011095f1fc4SLee Schermerhorn char *nodelist = strchr(str, ':');
3012095f1fc4SLee Schermerhorn char *flags = strchr(str, '=');
3013dedf2c73Szhong jiang int err = 1, mode;
3014095f1fc4SLee Schermerhorn
3015c7a91bc7SDan Carpenter if (flags)
3016c7a91bc7SDan Carpenter *flags++ = '\0'; /* terminate mode string */
3017c7a91bc7SDan Carpenter
3018095f1fc4SLee Schermerhorn if (nodelist) {
3019095f1fc4SLee Schermerhorn /* NUL-terminate mode or flags string */
3020095f1fc4SLee Schermerhorn *nodelist++ = '\0';
302171fe804bSLee Schermerhorn if (nodelist_parse(nodelist, nodes))
3022095f1fc4SLee Schermerhorn goto out;
302301f13bd6SLai Jiangshan if (!nodes_subset(nodes, node_states[N_MEMORY]))
3024095f1fc4SLee Schermerhorn goto out;
302571fe804bSLee Schermerhorn } else
302671fe804bSLee Schermerhorn nodes_clear(nodes);
302771fe804bSLee Schermerhorn
3028dedf2c73Szhong jiang mode = match_string(policy_modes, MPOL_MAX, str);
3029dedf2c73Szhong jiang if (mode < 0)
3030095f1fc4SLee Schermerhorn goto out;
3031095f1fc4SLee Schermerhorn
303271fe804bSLee Schermerhorn switch (mode) {
3033095f1fc4SLee Schermerhorn case MPOL_PREFERRED:
303471fe804bSLee Schermerhorn /*
3035aa9f7d51SRandy Dunlap * Insist on a nodelist of one node only, although later
3036aa9f7d51SRandy Dunlap * we use first_node(nodes) to grab a single node, so here
3037aa9f7d51SRandy Dunlap * nodelist (or nodes) cannot be empty.
303871fe804bSLee Schermerhorn */
3039095f1fc4SLee Schermerhorn if (nodelist) {
3040095f1fc4SLee Schermerhorn char *rest = nodelist;
3041095f1fc4SLee Schermerhorn while (isdigit(*rest))
3042095f1fc4SLee Schermerhorn rest++;
3043926f2ae0SKOSAKI Motohiro if (*rest)
3044926f2ae0SKOSAKI Motohiro goto out;
3045aa9f7d51SRandy Dunlap if (nodes_empty(nodes))
3046aa9f7d51SRandy Dunlap goto out;
3047095f1fc4SLee Schermerhorn }
3048095f1fc4SLee Schermerhorn break;
3049095f1fc4SLee Schermerhorn case MPOL_INTERLEAVE:
3050095f1fc4SLee Schermerhorn /*
3051095f1fc4SLee Schermerhorn * Default to online nodes with memory if no nodelist
3052095f1fc4SLee Schermerhorn */
3053095f1fc4SLee Schermerhorn if (!nodelist)
305401f13bd6SLai Jiangshan nodes = node_states[N_MEMORY];
30553f226aa1SLee Schermerhorn break;
305671fe804bSLee Schermerhorn case MPOL_LOCAL:
30573f226aa1SLee Schermerhorn /*
305871fe804bSLee Schermerhorn * Don't allow a nodelist; mpol_new() checks flags
30593f226aa1SLee Schermerhorn */
306071fe804bSLee Schermerhorn if (nodelist)
30613f226aa1SLee Schermerhorn goto out;
30623f226aa1SLee Schermerhorn break;
3063413b43deSRavikiran G Thirumalai case MPOL_DEFAULT:
3064413b43deSRavikiran G Thirumalai /*
3065413b43deSRavikiran G Thirumalai * Insist on an empty nodelist
3066413b43deSRavikiran G Thirumalai */
3067413b43deSRavikiran G Thirumalai if (!nodelist)
3068413b43deSRavikiran G Thirumalai err = 0;
3069413b43deSRavikiran G Thirumalai goto out;
3070b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
3071d69b2e63SKOSAKI Motohiro case MPOL_BIND:
307271fe804bSLee Schermerhorn /*
3073d69b2e63SKOSAKI Motohiro * Insist on a nodelist
307471fe804bSLee Schermerhorn */
3075d69b2e63SKOSAKI Motohiro if (!nodelist)
3076d69b2e63SKOSAKI Motohiro goto out;
3077095f1fc4SLee Schermerhorn }
3078095f1fc4SLee Schermerhorn
307971fe804bSLee Schermerhorn mode_flags = 0;
3080095f1fc4SLee Schermerhorn if (flags) {
3081095f1fc4SLee Schermerhorn /*
3082095f1fc4SLee Schermerhorn * Currently, we only support two mutually exclusive
3083095f1fc4SLee Schermerhorn * mode flags.
3084095f1fc4SLee Schermerhorn */
3085095f1fc4SLee Schermerhorn if (!strcmp(flags, "static"))
308671fe804bSLee Schermerhorn mode_flags |= MPOL_F_STATIC_NODES;
3087095f1fc4SLee Schermerhorn else if (!strcmp(flags, "relative"))
308871fe804bSLee Schermerhorn mode_flags |= MPOL_F_RELATIVE_NODES;
3089095f1fc4SLee Schermerhorn else
3090926f2ae0SKOSAKI Motohiro goto out;
3091095f1fc4SLee Schermerhorn }
309271fe804bSLee Schermerhorn
309371fe804bSLee Schermerhorn new = mpol_new(mode, mode_flags, &nodes);
309471fe804bSLee Schermerhorn if (IS_ERR(new))
3095926f2ae0SKOSAKI Motohiro goto out;
3096926f2ae0SKOSAKI Motohiro
3097f2a07f40SHugh Dickins /*
3098f2a07f40SHugh Dickins * Save nodes for mpol_to_str() to show the tmpfs mount options
3099f2a07f40SHugh Dickins * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3100f2a07f40SHugh Dickins */
3101269fbe72SBen Widawsky if (mode != MPOL_PREFERRED) {
3102269fbe72SBen Widawsky new->nodes = nodes;
3103269fbe72SBen Widawsky } else if (nodelist) {
3104269fbe72SBen Widawsky nodes_clear(new->nodes);
3105269fbe72SBen Widawsky node_set(first_node(nodes), new->nodes);
3106269fbe72SBen Widawsky } else {
31077858d7bcSFeng Tang new->mode = MPOL_LOCAL;
3108269fbe72SBen Widawsky }
3109f2a07f40SHugh Dickins
3110f2a07f40SHugh Dickins /*
3111f2a07f40SHugh Dickins * Save nodes for contextualization: this will be used to "clone"
3112f2a07f40SHugh Dickins * the mempolicy in a specific context [cpuset] at a later time.
3113f2a07f40SHugh Dickins */
3114e17f74afSLee Schermerhorn new->w.user_nodemask = nodes;
3115f2a07f40SHugh Dickins
3116926f2ae0SKOSAKI Motohiro err = 0;
311771fe804bSLee Schermerhorn
3118095f1fc4SLee Schermerhorn out:
3119095f1fc4SLee Schermerhorn /* Restore string for error message */
3120095f1fc4SLee Schermerhorn if (nodelist)
3121095f1fc4SLee Schermerhorn *--nodelist = ':';
3122095f1fc4SLee Schermerhorn if (flags)
3123095f1fc4SLee Schermerhorn *--flags = '=';
312471fe804bSLee Schermerhorn if (!err)
312571fe804bSLee Schermerhorn *mpol = new;
3126095f1fc4SLee Schermerhorn return err;
3127095f1fc4SLee Schermerhorn }
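/*
 * Illustrative sketch, not part of this file: parsing a tmpfs "mpol=" mount
 * option.  The buffer must be writable because the parser NUL-terminates
 * the mode and flags substrings in place (and restores them on the way out).
 *
 *	char str[] = "interleave=static:0-3";
 *	struct mempolicy *mpol;
 *
 *	if (mpol_parse_str(str, &mpol))
 *		return -EINVAL;		(parse error)
 *	(install mpol, e.g. via mpol_shared_policy_init())
 */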
3128095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */
3129095f1fc4SLee Schermerhorn
313071fe804bSLee Schermerhorn /**
313171fe804bSLee Schermerhorn * mpol_to_str - format a mempolicy structure for printing
313271fe804bSLee Schermerhorn * @buffer: to contain formatted mempolicy string
313371fe804bSLee Schermerhorn * @maxlen: length of @buffer
313471fe804bSLee Schermerhorn * @pol: pointer to mempolicy to be formatted
313571fe804bSLee Schermerhorn *
3136948927eeSDavid Rientjes * Convert @pol into a string. If @buffer is too short, truncate the string.
3137948927eeSDavid Rientjes * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3138948927eeSDavid Rientjes * longest flag, "relative", and to display at least a few node ids.
31391a75a6c8SChristoph Lameter */
3140948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
31411a75a6c8SChristoph Lameter {
31421a75a6c8SChristoph Lameter char *p = buffer;
3143948927eeSDavid Rientjes nodemask_t nodes = NODE_MASK_NONE;
3144948927eeSDavid Rientjes unsigned short mode = MPOL_DEFAULT;
3145948927eeSDavid Rientjes unsigned short flags = 0;
31461a75a6c8SChristoph Lameter
31478790c71aSDavid Rientjes if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3148bea904d5SLee Schermerhorn mode = pol->mode;
3149948927eeSDavid Rientjes flags = pol->flags;
3150948927eeSDavid Rientjes }
3151bea904d5SLee Schermerhorn
31521a75a6c8SChristoph Lameter switch (mode) {
31531a75a6c8SChristoph Lameter case MPOL_DEFAULT:
31547858d7bcSFeng Tang case MPOL_LOCAL:
31551a75a6c8SChristoph Lameter break;
31561a75a6c8SChristoph Lameter case MPOL_PREFERRED:
3157b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
31581a75a6c8SChristoph Lameter case MPOL_BIND:
31591a75a6c8SChristoph Lameter case MPOL_INTERLEAVE:
3160269fbe72SBen Widawsky nodes = pol->nodes;
31611a75a6c8SChristoph Lameter break;
31621a75a6c8SChristoph Lameter default:
3163948927eeSDavid Rientjes WARN_ON_ONCE(1);
3164948927eeSDavid Rientjes snprintf(p, maxlen, "unknown");
3165948927eeSDavid Rientjes return;
31661a75a6c8SChristoph Lameter }
31671a75a6c8SChristoph Lameter
3168b7a9f420SDavid Rientjes p += snprintf(p, maxlen, "%s", policy_modes[mode]);
31691a75a6c8SChristoph Lameter
3170fc36b8d3SLee Schermerhorn if (flags & MPOL_MODE_FLAGS) {
3171948927eeSDavid Rientjes p += snprintf(p, buffer + maxlen - p, "=");
3172f5b087b5SDavid Rientjes
31732291990aSLee Schermerhorn /*
31742291990aSLee Schermerhorn * Currently, the only defined flags are mutually exclusive
31752291990aSLee Schermerhorn */
3176f5b087b5SDavid Rientjes if (flags & MPOL_F_STATIC_NODES)
31772291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "static");
31782291990aSLee Schermerhorn else if (flags & MPOL_F_RELATIVE_NODES)
31792291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "relative");
3180f5b087b5SDavid Rientjes }
3181f5b087b5SDavid Rientjes
31829e763e0fSTejun Heo if (!nodes_empty(nodes))
31839e763e0fSTejun Heo p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
31849e763e0fSTejun Heo nodemask_pr_args(&nodes));
31851a75a6c8SChristoph Lameter }
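/*
 * Illustrative sketch, not part of this file: formatting a policy for /proc
 * style output; 'm' is a hypothetical seq_file.  For an interleave policy
 * over nodes 0-3 with the "static" flag this yields "interleave=static:0-3".
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *	seq_printf(m, "mpol=%s ", buf);
 */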
3186