11da177e4SLinus Torvalds /* 21da177e4SLinus Torvalds * Simple NUMA memory policy for the Linux kernel. 31da177e4SLinus Torvalds * 41da177e4SLinus Torvalds * Copyright 2003,2004 Andi Kleen, SuSE Labs. 58bccd85fSChristoph Lameter * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 61da177e4SLinus Torvalds * Subject to the GNU Public License, version 2. 71da177e4SLinus Torvalds * 81da177e4SLinus Torvalds * NUMA policy allows the user to give hints about which node(s) memory should 91da177e4SLinus Torvalds * be allocated on. 101da177e4SLinus Torvalds * 111da177e4SLinus Torvalds * Supports four policies, per VMA and per process: 121da177e4SLinus Torvalds * 131da177e4SLinus Torvalds * The VMA policy has priority over the process policy for a page fault. 141da177e4SLinus Torvalds * 151da177e4SLinus Torvalds * interleave Allocate memory interleaved over a set of nodes, 161da177e4SLinus Torvalds * with normal fallback if it fails. 171da177e4SLinus Torvalds * For VMA based allocations this interleaves based on the 181da177e4SLinus Torvalds * offset into the backing object or offset into the mapping 191da177e4SLinus Torvalds * for anonymous memory. For process policy a per-process counter 201da177e4SLinus Torvalds * is used. 218bccd85fSChristoph Lameter * 221da177e4SLinus Torvalds * bind Only allocate memory on a specific set of nodes, 231da177e4SLinus Torvalds * no fallback. 248bccd85fSChristoph Lameter * FIXME: memory is allocated starting with the first node 258bccd85fSChristoph Lameter * to the last. It would be better if bind truly restricted 268bccd85fSChristoph Lameter * the allocation to the given memory nodes instead. 278bccd85fSChristoph Lameter * 281da177e4SLinus Torvalds * preferred Try a specific node first before normal fallback. 2900ef2d2fSDavid Rientjes * As a special case, NUMA_NO_NODE here means do the allocation 301da177e4SLinus Torvalds * on the local CPU. This is normally identical to default, 311da177e4SLinus Torvalds * but useful to set in a VMA when you have a non-default 321da177e4SLinus Torvalds * process policy. 338bccd85fSChristoph Lameter * 341da177e4SLinus Torvalds * default Allocate on the local node first, or when on a VMA 351da177e4SLinus Torvalds * use the process policy. This is what Linux always did 361da177e4SLinus Torvalds * in a NUMA-aware kernel and still does by, ahem, default. 371da177e4SLinus Torvalds * 381da177e4SLinus Torvalds * The process policy is applied for most non-interrupt memory allocations 391da177e4SLinus Torvalds * in that process' context. Interrupts ignore the policies and always 401da177e4SLinus Torvalds * try to allocate on the local CPU. The VMA policy is only applied for memory 411da177e4SLinus Torvalds * allocations for a VMA in the VM. 421da177e4SLinus Torvalds * 431da177e4SLinus Torvalds * Currently there are a few corner cases in swapping where the policy 441da177e4SLinus Torvalds * is not applied, but the majority should be handled. When process policy 451da177e4SLinus Torvalds * is used, it is not remembered across swap outs/swap ins. 461da177e4SLinus Torvalds * 471da177e4SLinus Torvalds * Only the highest zone in the zone hierarchy gets policied. Allocations 481da177e4SLinus Torvalds * requesting a lower zone just use the default policy. This implies that 491da177e4SLinus Torvalds * on systems with highmem, kernel lowmem allocations don't get policied. 501da177e4SLinus Torvalds * Same with GFP_DMA allocations.
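 *
 * As a rough userspace illustration (not part of this file; the variable
 * names are only an example, and the exact maxnode conventions are those
 * documented in the set_mempolicy(2) and mbind(2) man pages, with the
 * wrappers coming from libnuma's <numaif.h>):
 *
 *	unsigned long interleave_nodes = (1UL << 0) | (1UL << 1);
 *	unsigned long bind_node = 1UL << 0;
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
 *		      8 * sizeof(interleave_nodes));
 *	mbind(addr, length, MPOL_BIND, &bind_node, 8 * sizeof(bind_node), 0);
 *
 * would interleave most of the process's allocations over nodes 0 and 1
 * while restricting the mapping at addr to node 0 (error handling omitted).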
511da177e4SLinus Torvalds * 521da177e4SLinus Torvalds * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 531da177e4SLinus Torvalds * all users and remembered even when nobody has memory mapped. 541da177e4SLinus Torvalds */ 551da177e4SLinus Torvalds 561da177e4SLinus Torvalds /* Notebook: 571da177e4SLinus Torvalds fix mmap readahead to honour policy and enable policy for any page cache 581da177e4SLinus Torvalds object 591da177e4SLinus Torvalds statistics for bigpages 601da177e4SLinus Torvalds global policy for page cache? currently it uses process policy. Requires 611da177e4SLinus Torvalds first item above. 621da177e4SLinus Torvalds handle mremap for shared memory (currently ignored for the policy) 631da177e4SLinus Torvalds grows down? 641da177e4SLinus Torvalds make bind policy root only? It can trigger oom much faster and the 651da177e4SLinus Torvalds kernel does not always cope gracefully with that. 661da177e4SLinus Torvalds */ 671da177e4SLinus Torvalds 68*b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 69*b1de0d13SMitchel Humpherys 701da177e4SLinus Torvalds #include <linux/mempolicy.h> 711da177e4SLinus Torvalds #include <linux/mm.h> 721da177e4SLinus Torvalds #include <linux/highmem.h> 731da177e4SLinus Torvalds #include <linux/hugetlb.h> 741da177e4SLinus Torvalds #include <linux/kernel.h> 751da177e4SLinus Torvalds #include <linux/sched.h> 761da177e4SLinus Torvalds #include <linux/nodemask.h> 771da177e4SLinus Torvalds #include <linux/cpuset.h> 781da177e4SLinus Torvalds #include <linux/slab.h> 791da177e4SLinus Torvalds #include <linux/string.h> 80b95f1b31SPaul Gortmaker #include <linux/export.h> 81b488893aSPavel Emelyanov #include <linux/nsproxy.h> 821da177e4SLinus Torvalds #include <linux/interrupt.h> 831da177e4SLinus Torvalds #include <linux/init.h> 841da177e4SLinus Torvalds #include <linux/compat.h> 85dc9aa5b9SChristoph Lameter #include <linux/swap.h> 861a75a6c8SChristoph Lameter #include <linux/seq_file.h> 871a75a6c8SChristoph Lameter #include <linux/proc_fs.h> 88b20a3503SChristoph Lameter #include <linux/migrate.h> 8962b61f61SHugh Dickins #include <linux/ksm.h> 9095a402c3SChristoph Lameter #include <linux/rmap.h> 9186c3a764SDavid Quigley #include <linux/security.h> 92dbcb0f19SAdrian Bunk #include <linux/syscalls.h> 93095f1fc4SLee Schermerhorn #include <linux/ctype.h> 946d9c285aSKOSAKI Motohiro #include <linux/mm_inline.h> 95b24f53a0SLee Schermerhorn #include <linux/mmu_notifier.h> 96*b1de0d13SMitchel Humpherys #include <linux/printk.h> 97dc9aa5b9SChristoph Lameter 981da177e4SLinus Torvalds #include <asm/tlbflush.h> 991da177e4SLinus Torvalds #include <asm/uaccess.h> 100778d3b0fSMichal Hocko #include <linux/random.h> 1011da177e4SLinus Torvalds 10262695a84SNick Piggin #include "internal.h" 10362695a84SNick Piggin 10438e35860SChristoph Lameter /* Internal flags */ 105dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 10638e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 107dc9aa5b9SChristoph Lameter 108fcc234f8SPekka Enberg static struct kmem_cache *policy_cache; 109fcc234f8SPekka Enberg static struct kmem_cache *sn_cache; 1101da177e4SLinus Torvalds 1111da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not 1121da177e4SLinus Torvalds policied.
*/ 1136267276fSChristoph Lameter enum zone_type policy_zone = 0; 1141da177e4SLinus Torvalds 115bea904d5SLee Schermerhorn /* 116bea904d5SLee Schermerhorn * run-time system-wide default policy => local allocation 117bea904d5SLee Schermerhorn */ 118e754d79dSH Hartley Sweeten static struct mempolicy default_policy = { 1191da177e4SLinus Torvalds .refcnt = ATOMIC_INIT(1), /* never free it */ 120bea904d5SLee Schermerhorn .mode = MPOL_PREFERRED, 121fc36b8d3SLee Schermerhorn .flags = MPOL_F_LOCAL, 1221da177e4SLinus Torvalds }; 1231da177e4SLinus Torvalds 1245606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 1255606e387SMel Gorman 1265606e387SMel Gorman static struct mempolicy *get_task_policy(struct task_struct *p) 1275606e387SMel Gorman { 1285606e387SMel Gorman struct mempolicy *pol = p->mempolicy; 1295606e387SMel Gorman 1305606e387SMel Gorman if (!pol) { 1311da6f0e1SJianguo Wu int node = numa_node_id(); 1325606e387SMel Gorman 1331da6f0e1SJianguo Wu if (node != NUMA_NO_NODE) { 1341da6f0e1SJianguo Wu pol = &preferred_node_policy[node]; 1351da6f0e1SJianguo Wu /* 1361da6f0e1SJianguo Wu * preferred_node_policy is not initialised early in 1371da6f0e1SJianguo Wu * boot 1381da6f0e1SJianguo Wu */ 1395606e387SMel Gorman if (!pol->mode) 1405606e387SMel Gorman pol = NULL; 1415606e387SMel Gorman } 1421da6f0e1SJianguo Wu } 1435606e387SMel Gorman 1445606e387SMel Gorman return pol; 1455606e387SMel Gorman } 1465606e387SMel Gorman 14737012946SDavid Rientjes static const struct mempolicy_operations { 14837012946SDavid Rientjes int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 149708c1bbcSMiao Xie /* 150708c1bbcSMiao Xie * If read-side task has no lock to protect task->mempolicy, write-side 151708c1bbcSMiao Xie * task will rebind the task->mempolicy by two step. The first step is 152708c1bbcSMiao Xie * setting all the newly nodes, and the second step is cleaning all the 153708c1bbcSMiao Xie * disallowed nodes. In this way, we can avoid finding no node to alloc 154708c1bbcSMiao Xie * page. 155708c1bbcSMiao Xie * If we have a lock to protect task->mempolicy in read-side, we do 156708c1bbcSMiao Xie * rebind directly. 
157708c1bbcSMiao Xie * 158708c1bbcSMiao Xie * step: 159708c1bbcSMiao Xie * MPOL_REBIND_ONCE - do rebind work at once 160708c1bbcSMiao Xie * MPOL_REBIND_STEP1 - set all the newly nodes 161708c1bbcSMiao Xie * MPOL_REBIND_STEP2 - clean all the disallowed nodes 162708c1bbcSMiao Xie */ 163708c1bbcSMiao Xie void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, 164708c1bbcSMiao Xie enum mpol_rebind_step step); 16537012946SDavid Rientjes } mpol_ops[MPOL_MAX]; 16637012946SDavid Rientjes 16719770b32SMel Gorman /* Check that the nodemask contains at least one populated zone */ 16837012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask) 1691da177e4SLinus Torvalds { 170d3eb1570SLai Jiangshan return nodes_intersects(*nodemask, node_states[N_MEMORY]); 1711da177e4SLinus Torvalds } 1721da177e4SLinus Torvalds 173f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 174f5b087b5SDavid Rientjes { 1756d556294SBob Liu return pol->flags & MPOL_MODE_FLAGS; 1764c50bc01SDavid Rientjes } 1774c50bc01SDavid Rientjes 1784c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 1794c50bc01SDavid Rientjes const nodemask_t *rel) 1804c50bc01SDavid Rientjes { 1814c50bc01SDavid Rientjes nodemask_t tmp; 1824c50bc01SDavid Rientjes nodes_fold(tmp, *orig, nodes_weight(*rel)); 1834c50bc01SDavid Rientjes nodes_onto(*ret, tmp, *rel); 184f5b087b5SDavid Rientjes } 185f5b087b5SDavid Rientjes 18637012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) 18737012946SDavid Rientjes { 18837012946SDavid Rientjes if (nodes_empty(*nodes)) 18937012946SDavid Rientjes return -EINVAL; 19037012946SDavid Rientjes pol->v.nodes = *nodes; 19137012946SDavid Rientjes return 0; 19237012946SDavid Rientjes } 19337012946SDavid Rientjes 19437012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 19537012946SDavid Rientjes { 19637012946SDavid Rientjes if (!nodes) 197fc36b8d3SLee Schermerhorn pol->flags |= MPOL_F_LOCAL; /* local allocation */ 19837012946SDavid Rientjes else if (nodes_empty(*nodes)) 19937012946SDavid Rientjes return -EINVAL; /* no allowed nodes */ 20037012946SDavid Rientjes else 20137012946SDavid Rientjes pol->v.preferred_node = first_node(*nodes); 20237012946SDavid Rientjes return 0; 20337012946SDavid Rientjes } 20437012946SDavid Rientjes 20537012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) 20637012946SDavid Rientjes { 20737012946SDavid Rientjes if (!is_valid_nodemask(nodes)) 20837012946SDavid Rientjes return -EINVAL; 20937012946SDavid Rientjes pol->v.nodes = *nodes; 21037012946SDavid Rientjes return 0; 21137012946SDavid Rientjes } 21237012946SDavid Rientjes 21358568d2aSMiao Xie /* 21458568d2aSMiao Xie * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 21558568d2aSMiao Xie * any, for the new policy. mpol_new() has already validated the nodes 21658568d2aSMiao Xie * parameter with respect to the policy mode and flags. But, we need to 21758568d2aSMiao Xie * handle an empty nodemask with MPOL_PREFERRED here. 21858568d2aSMiao Xie * 21958568d2aSMiao Xie * Must be called holding task's alloc_lock to protect task's mems_allowed 22058568d2aSMiao Xie * and mempolicy. May also be called holding the mmap_semaphore for write. 
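 *
 * For illustration (hypothetical numbers): with MPOL_F_RELATIVE_NODES, a
 * user nodemask of {0,1} is folded onto a cpuset mems_allowed of {4,5,6}
 * by mpol_relative_nodemask(), giving an effective nodemask of {4,5};
 * otherwise the user nodemask is simply intersected with mems_allowed.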
22158568d2aSMiao Xie */ 2224bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol, 2234bfc4495SKAMEZAWA Hiroyuki const nodemask_t *nodes, struct nodemask_scratch *nsc) 22458568d2aSMiao Xie { 22558568d2aSMiao Xie int ret; 22658568d2aSMiao Xie 22758568d2aSMiao Xie /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 22858568d2aSMiao Xie if (pol == NULL) 22958568d2aSMiao Xie return 0; 23001f13bd6SLai Jiangshan /* Check N_MEMORY */ 2314bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask1, 23201f13bd6SLai Jiangshan cpuset_current_mems_allowed, node_states[N_MEMORY]); 23358568d2aSMiao Xie 23458568d2aSMiao Xie VM_BUG_ON(!nodes); 23558568d2aSMiao Xie if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 23658568d2aSMiao Xie nodes = NULL; /* explicit local allocation */ 23758568d2aSMiao Xie else { 23858568d2aSMiao Xie if (pol->flags & MPOL_F_RELATIVE_NODES) 2394bfc4495SKAMEZAWA Hiroyuki mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); 24058568d2aSMiao Xie else 2414bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask2, *nodes, nsc->mask1); 2424bfc4495SKAMEZAWA Hiroyuki 24358568d2aSMiao Xie if (mpol_store_user_nodemask(pol)) 24458568d2aSMiao Xie pol->w.user_nodemask = *nodes; 24558568d2aSMiao Xie else 24658568d2aSMiao Xie pol->w.cpuset_mems_allowed = 24758568d2aSMiao Xie cpuset_current_mems_allowed; 24858568d2aSMiao Xie } 24958568d2aSMiao Xie 2504bfc4495SKAMEZAWA Hiroyuki if (nodes) 2514bfc4495SKAMEZAWA Hiroyuki ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 2524bfc4495SKAMEZAWA Hiroyuki else 2534bfc4495SKAMEZAWA Hiroyuki ret = mpol_ops[pol->mode].create(pol, NULL); 25458568d2aSMiao Xie return ret; 25558568d2aSMiao Xie } 25658568d2aSMiao Xie 25758568d2aSMiao Xie /* 25858568d2aSMiao Xie * This function just creates a new policy, does some check and simple 25958568d2aSMiao Xie * initialization. You must invoke mpol_set_nodemask() to set nodes. 26058568d2aSMiao Xie */ 261028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 262028fec41SDavid Rientjes nodemask_t *nodes) 2631da177e4SLinus Torvalds { 2641da177e4SLinus Torvalds struct mempolicy *policy; 2651da177e4SLinus Torvalds 266028fec41SDavid Rientjes pr_debug("setting mode %d flags %d nodes[0] %lx\n", 26700ef2d2fSDavid Rientjes mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); 268140d5a49SPaul Mundt 2693e1f0645SDavid Rientjes if (mode == MPOL_DEFAULT) { 2703e1f0645SDavid Rientjes if (nodes && !nodes_empty(*nodes)) 27137012946SDavid Rientjes return ERR_PTR(-EINVAL); 272d3a71033SLee Schermerhorn return NULL; 27337012946SDavid Rientjes } 2743e1f0645SDavid Rientjes VM_BUG_ON(!nodes); 2753e1f0645SDavid Rientjes 2763e1f0645SDavid Rientjes /* 2773e1f0645SDavid Rientjes * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 2783e1f0645SDavid Rientjes * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 2793e1f0645SDavid Rientjes * All other modes require a valid pointer to a non-empty nodemask. 
2803e1f0645SDavid Rientjes */ 2813e1f0645SDavid Rientjes if (mode == MPOL_PREFERRED) { 2823e1f0645SDavid Rientjes if (nodes_empty(*nodes)) { 2833e1f0645SDavid Rientjes if (((flags & MPOL_F_STATIC_NODES) || 2843e1f0645SDavid Rientjes (flags & MPOL_F_RELATIVE_NODES))) 2853e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 2863e1f0645SDavid Rientjes } 287479e2802SPeter Zijlstra } else if (mode == MPOL_LOCAL) { 288479e2802SPeter Zijlstra if (!nodes_empty(*nodes)) 289479e2802SPeter Zijlstra return ERR_PTR(-EINVAL); 290479e2802SPeter Zijlstra mode = MPOL_PREFERRED; 2913e1f0645SDavid Rientjes } else if (nodes_empty(*nodes)) 2923e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 2931da177e4SLinus Torvalds policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2941da177e4SLinus Torvalds if (!policy) 2951da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 2961da177e4SLinus Torvalds atomic_set(&policy->refcnt, 1); 29745c4745aSLee Schermerhorn policy->mode = mode; 29837012946SDavid Rientjes policy->flags = flags; 2993e1f0645SDavid Rientjes 30037012946SDavid Rientjes return policy; 30137012946SDavid Rientjes } 30237012946SDavid Rientjes 30352cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. */ 30452cd3b07SLee Schermerhorn void __mpol_put(struct mempolicy *p) 30552cd3b07SLee Schermerhorn { 30652cd3b07SLee Schermerhorn if (!atomic_dec_and_test(&p->refcnt)) 30752cd3b07SLee Schermerhorn return; 30852cd3b07SLee Schermerhorn kmem_cache_free(policy_cache, p); 30952cd3b07SLee Schermerhorn } 31052cd3b07SLee Schermerhorn 311708c1bbcSMiao Xie static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, 312708c1bbcSMiao Xie enum mpol_rebind_step step) 31337012946SDavid Rientjes { 31437012946SDavid Rientjes } 31537012946SDavid Rientjes 316708c1bbcSMiao Xie /* 317708c1bbcSMiao Xie * step: 318708c1bbcSMiao Xie * MPOL_REBIND_ONCE - do rebind work at once 319708c1bbcSMiao Xie * MPOL_REBIND_STEP1 - set all the newly nodes 320708c1bbcSMiao Xie * MPOL_REBIND_STEP2 - clean all the disallowed nodes 321708c1bbcSMiao Xie */ 322708c1bbcSMiao Xie static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, 323708c1bbcSMiao Xie enum mpol_rebind_step step) 3241d0d2680SDavid Rientjes { 3251d0d2680SDavid Rientjes nodemask_t tmp; 3261d0d2680SDavid Rientjes 32737012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES) 32837012946SDavid Rientjes nodes_and(tmp, pol->w.user_nodemask, *nodes); 32937012946SDavid Rientjes else if (pol->flags & MPOL_F_RELATIVE_NODES) 33037012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 3311d0d2680SDavid Rientjes else { 332708c1bbcSMiao Xie /* 333708c1bbcSMiao Xie * if step == 1, we use ->w.cpuset_mems_allowed to cache the 334708c1bbcSMiao Xie * result 335708c1bbcSMiao Xie */ 336708c1bbcSMiao Xie if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { 337708c1bbcSMiao Xie nodes_remap(tmp, pol->v.nodes, 338708c1bbcSMiao Xie pol->w.cpuset_mems_allowed, *nodes); 339708c1bbcSMiao Xie pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; 340708c1bbcSMiao Xie } else if (step == MPOL_REBIND_STEP2) { 341708c1bbcSMiao Xie tmp = pol->w.cpuset_mems_allowed; 34237012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes; 343708c1bbcSMiao Xie } else 344708c1bbcSMiao Xie BUG(); 3451d0d2680SDavid Rientjes } 34637012946SDavid Rientjes 347708c1bbcSMiao Xie if (nodes_empty(tmp)) 348708c1bbcSMiao Xie tmp = *nodes; 349708c1bbcSMiao Xie 350708c1bbcSMiao Xie if (step == MPOL_REBIND_STEP1) 351708c1bbcSMiao Xie nodes_or(pol->v.nodes, pol->v.nodes, tmp); 352708c1bbcSMiao Xie else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) 3531d0d2680SDavid Rientjes pol->v.nodes = tmp; 354708c1bbcSMiao Xie else 355708c1bbcSMiao Xie BUG(); 356708c1bbcSMiao Xie 3571d0d2680SDavid Rientjes if (!node_isset(current->il_next, tmp)) { 3581d0d2680SDavid Rientjes current->il_next = next_node(current->il_next, tmp); 3591d0d2680SDavid Rientjes if (current->il_next >= MAX_NUMNODES) 3601d0d2680SDavid Rientjes current->il_next = first_node(tmp); 3611d0d2680SDavid Rientjes if (current->il_next >= MAX_NUMNODES) 3621d0d2680SDavid Rientjes current->il_next = numa_node_id(); 3631d0d2680SDavid Rientjes } 36437012946SDavid Rientjes } 36537012946SDavid Rientjes 36637012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol, 367708c1bbcSMiao Xie const nodemask_t *nodes, 368708c1bbcSMiao Xie enum mpol_rebind_step step) 36937012946SDavid Rientjes { 37037012946SDavid Rientjes nodemask_t tmp; 37137012946SDavid Rientjes 37237012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES) { 3731d0d2680SDavid Rientjes int node = first_node(pol->w.user_nodemask); 3741d0d2680SDavid Rientjes 375fc36b8d3SLee Schermerhorn if (node_isset(node, *nodes)) { 3761d0d2680SDavid Rientjes pol->v.preferred_node = node; 377fc36b8d3SLee Schermerhorn pol->flags &= ~MPOL_F_LOCAL; 378fc36b8d3SLee Schermerhorn } else 379fc36b8d3SLee Schermerhorn pol->flags |= MPOL_F_LOCAL; 38037012946SDavid Rientjes } else if (pol->flags & MPOL_F_RELATIVE_NODES) { 38137012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 3821d0d2680SDavid Rientjes pol->v.preferred_node = first_node(tmp); 383fc36b8d3SLee Schermerhorn } else if (!(pol->flags & MPOL_F_LOCAL)) { 3841d0d2680SDavid Rientjes pol->v.preferred_node = node_remap(pol->v.preferred_node, 38537012946SDavid Rientjes pol->w.cpuset_mems_allowed, 38637012946SDavid Rientjes *nodes); 38737012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes; 3881d0d2680SDavid Rientjes } 3891d0d2680SDavid Rientjes } 39037012946SDavid Rientjes 391708c1bbcSMiao Xie /* 392708c1bbcSMiao Xie * mpol_rebind_policy - Migrate a policy to a different set of nodes 393708c1bbcSMiao Xie * 394708c1bbcSMiao Xie * If read-side task has no lock to protect task->mempolicy, write-side 395708c1bbcSMiao Xie * task will rebind the task->mempolicy by two step. The first step is 396708c1bbcSMiao Xie * setting all the newly nodes, and the second step is cleaning all the 397708c1bbcSMiao Xie * disallowed nodes. In this way, we can avoid finding no node to alloc 398708c1bbcSMiao Xie * page. 399708c1bbcSMiao Xie * If we have a lock to protect task->mempolicy in read-side, we do 400708c1bbcSMiao Xie * rebind directly. 
401708c1bbcSMiao Xie * 402708c1bbcSMiao Xie * step: 403708c1bbcSMiao Xie * MPOL_REBIND_ONCE - do rebind work at once 404708c1bbcSMiao Xie * MPOL_REBIND_STEP1 - set all the newly nodes 405708c1bbcSMiao Xie * MPOL_REBIND_STEP2 - clean all the disallowed nodes 406708c1bbcSMiao Xie */ 407708c1bbcSMiao Xie static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, 408708c1bbcSMiao Xie enum mpol_rebind_step step) 40937012946SDavid Rientjes { 41037012946SDavid Rientjes if (!pol) 41137012946SDavid Rientjes return; 41289c522c7SWang Sheng-Hui if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && 41337012946SDavid Rientjes nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 41437012946SDavid Rientjes return; 415708c1bbcSMiao Xie 416708c1bbcSMiao Xie if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) 417708c1bbcSMiao Xie return; 418708c1bbcSMiao Xie 419708c1bbcSMiao Xie if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) 420708c1bbcSMiao Xie BUG(); 421708c1bbcSMiao Xie 422708c1bbcSMiao Xie if (step == MPOL_REBIND_STEP1) 423708c1bbcSMiao Xie pol->flags |= MPOL_F_REBINDING; 424708c1bbcSMiao Xie else if (step == MPOL_REBIND_STEP2) 425708c1bbcSMiao Xie pol->flags &= ~MPOL_F_REBINDING; 426708c1bbcSMiao Xie else if (step >= MPOL_REBIND_NSTEP) 427708c1bbcSMiao Xie BUG(); 428708c1bbcSMiao Xie 429708c1bbcSMiao Xie mpol_ops[pol->mode].rebind(pol, newmask, step); 4301d0d2680SDavid Rientjes } 4311d0d2680SDavid Rientjes 4321d0d2680SDavid Rientjes /* 4331d0d2680SDavid Rientjes * Wrapper for mpol_rebind_policy() that just requires task 4341d0d2680SDavid Rientjes * pointer, and updates task mempolicy. 43558568d2aSMiao Xie * 43658568d2aSMiao Xie * Called with task's alloc_lock held. 4371d0d2680SDavid Rientjes */ 4381d0d2680SDavid Rientjes 439708c1bbcSMiao Xie void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, 440708c1bbcSMiao Xie enum mpol_rebind_step step) 4411d0d2680SDavid Rientjes { 442708c1bbcSMiao Xie mpol_rebind_policy(tsk->mempolicy, new, step); 4431d0d2680SDavid Rientjes } 4441d0d2680SDavid Rientjes 4451d0d2680SDavid Rientjes /* 4461d0d2680SDavid Rientjes * Rebind each vma in mm to new nodemask. 4471d0d2680SDavid Rientjes * 4481d0d2680SDavid Rientjes * Call holding a reference to mm. Takes mm->mmap_sem during call. 
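 *
 * Each VMA policy is rebound in a single MPOL_REBIND_ONCE pass; holding
 * mmap_sem for write already keeps readers of the per-VMA policies away,
 * so the two-step rebind described above is not needed here.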
4491d0d2680SDavid Rientjes */ 4501d0d2680SDavid Rientjes 4511d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 4521d0d2680SDavid Rientjes { 4531d0d2680SDavid Rientjes struct vm_area_struct *vma; 4541d0d2680SDavid Rientjes 4551d0d2680SDavid Rientjes down_write(&mm->mmap_sem); 4561d0d2680SDavid Rientjes for (vma = mm->mmap; vma; vma = vma->vm_next) 457708c1bbcSMiao Xie mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); 4581d0d2680SDavid Rientjes up_write(&mm->mmap_sem); 4591d0d2680SDavid Rientjes } 4601d0d2680SDavid Rientjes 46137012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 46237012946SDavid Rientjes [MPOL_DEFAULT] = { 46337012946SDavid Rientjes .rebind = mpol_rebind_default, 46437012946SDavid Rientjes }, 46537012946SDavid Rientjes [MPOL_INTERLEAVE] = { 46637012946SDavid Rientjes .create = mpol_new_interleave, 46737012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 46837012946SDavid Rientjes }, 46937012946SDavid Rientjes [MPOL_PREFERRED] = { 47037012946SDavid Rientjes .create = mpol_new_preferred, 47137012946SDavid Rientjes .rebind = mpol_rebind_preferred, 47237012946SDavid Rientjes }, 47337012946SDavid Rientjes [MPOL_BIND] = { 47437012946SDavid Rientjes .create = mpol_new_bind, 47537012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 47637012946SDavid Rientjes }, 47737012946SDavid Rientjes }; 47837012946SDavid Rientjes 479fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist, 480fc301289SChristoph Lameter unsigned long flags); 4811a75a6c8SChristoph Lameter 48298094945SNaoya Horiguchi /* 48398094945SNaoya Horiguchi * Scan through pages checking if pages follow certain conditions, 48498094945SNaoya Horiguchi * and move them to the pagelist if they do. 48598094945SNaoya Horiguchi */ 48698094945SNaoya Horiguchi static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 487dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 488dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 48938e35860SChristoph Lameter void *private) 4901da177e4SLinus Torvalds { 49191612e0dSHugh Dickins pte_t *orig_pte; 49291612e0dSHugh Dickins pte_t *pte; 493705e87c0SHugh Dickins spinlock_t *ptl; 494941150a3SHugh Dickins 495705e87c0SHugh Dickins orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 49691612e0dSHugh Dickins do { 4976aab341eSLinus Torvalds struct page *page; 49825ba77c1SAndy Whitcroft int nid; 49991612e0dSHugh Dickins 50091612e0dSHugh Dickins if (!pte_present(*pte)) 50191612e0dSHugh Dickins continue; 5026aab341eSLinus Torvalds page = vm_normal_page(vma, addr, *pte); 5036aab341eSLinus Torvalds if (!page) 50491612e0dSHugh Dickins continue; 505053837fcSNick Piggin /* 50662b61f61SHugh Dickins * vm_normal_page() filters out zero pages, but there might 50762b61f61SHugh Dickins * still be PageReserved pages to skip, perhaps in a VDSO. 
508053837fcSNick Piggin */ 509b79bc0a0SHugh Dickins if (PageReserved(page)) 510f4598c8bSChristoph Lameter continue; 5116aab341eSLinus Torvalds nid = page_to_nid(page); 51238e35860SChristoph Lameter if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 51338e35860SChristoph Lameter continue; 51438e35860SChristoph Lameter 515b1f72d18SStephen Wilson if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 516fc301289SChristoph Lameter migrate_page_add(page, private, flags); 517dc9aa5b9SChristoph Lameter else 5181da177e4SLinus Torvalds break; 51991612e0dSHugh Dickins } while (pte++, addr += PAGE_SIZE, addr != end); 520705e87c0SHugh Dickins pte_unmap_unlock(orig_pte, ptl); 52191612e0dSHugh Dickins return addr != end; 52291612e0dSHugh Dickins } 52391612e0dSHugh Dickins 52498094945SNaoya Horiguchi static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, 52598094945SNaoya Horiguchi pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, 526e2d8cf40SNaoya Horiguchi void *private) 527e2d8cf40SNaoya Horiguchi { 528e2d8cf40SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE 529e2d8cf40SNaoya Horiguchi int nid; 530e2d8cf40SNaoya Horiguchi struct page *page; 531cb900f41SKirill A. Shutemov spinlock_t *ptl; 532e2d8cf40SNaoya Horiguchi 533cb900f41SKirill A. Shutemov ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); 534e2d8cf40SNaoya Horiguchi page = pte_page(huge_ptep_get((pte_t *)pmd)); 535e2d8cf40SNaoya Horiguchi nid = page_to_nid(page); 536e2d8cf40SNaoya Horiguchi if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 537e2d8cf40SNaoya Horiguchi goto unlock; 538e2d8cf40SNaoya Horiguchi /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 539e2d8cf40SNaoya Horiguchi if (flags & (MPOL_MF_MOVE_ALL) || 540e2d8cf40SNaoya Horiguchi (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 541e2d8cf40SNaoya Horiguchi isolate_huge_page(page, private); 542e2d8cf40SNaoya Horiguchi unlock: 543cb900f41SKirill A. Shutemov spin_unlock(ptl); 544e2d8cf40SNaoya Horiguchi #else 545e2d8cf40SNaoya Horiguchi BUG(); 546e2d8cf40SNaoya Horiguchi #endif 547e2d8cf40SNaoya Horiguchi } 548e2d8cf40SNaoya Horiguchi 54998094945SNaoya Horiguchi static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, 550dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 551dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 55238e35860SChristoph Lameter void *private) 55391612e0dSHugh Dickins { 55491612e0dSHugh Dickins pmd_t *pmd; 55591612e0dSHugh Dickins unsigned long next; 55691612e0dSHugh Dickins 55791612e0dSHugh Dickins pmd = pmd_offset(pud, addr); 55891612e0dSHugh Dickins do { 55991612e0dSHugh Dickins next = pmd_addr_end(addr, end); 560e2d8cf40SNaoya Horiguchi if (!pmd_present(*pmd)) 561e2d8cf40SNaoya Horiguchi continue; 562e2d8cf40SNaoya Horiguchi if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { 56398094945SNaoya Horiguchi queue_pages_hugetlb_pmd_range(vma, pmd, nodes, 564e2d8cf40SNaoya Horiguchi flags, private); 565e2d8cf40SNaoya Horiguchi continue; 566e2d8cf40SNaoya Horiguchi } 567e180377fSKirill A. 
Shutemov split_huge_page_pmd(vma, addr, pmd); 5681a5a9906SAndrea Arcangeli if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 56991612e0dSHugh Dickins continue; 57098094945SNaoya Horiguchi if (queue_pages_pte_range(vma, pmd, addr, next, nodes, 57138e35860SChristoph Lameter flags, private)) 57291612e0dSHugh Dickins return -EIO; 57391612e0dSHugh Dickins } while (pmd++, addr = next, addr != end); 57491612e0dSHugh Dickins return 0; 57591612e0dSHugh Dickins } 57691612e0dSHugh Dickins 57798094945SNaoya Horiguchi static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 578dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 579dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 58038e35860SChristoph Lameter void *private) 58191612e0dSHugh Dickins { 58291612e0dSHugh Dickins pud_t *pud; 58391612e0dSHugh Dickins unsigned long next; 58491612e0dSHugh Dickins 58591612e0dSHugh Dickins pud = pud_offset(pgd, addr); 58691612e0dSHugh Dickins do { 58791612e0dSHugh Dickins next = pud_addr_end(addr, end); 588e2d8cf40SNaoya Horiguchi if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) 589e2d8cf40SNaoya Horiguchi continue; 59091612e0dSHugh Dickins if (pud_none_or_clear_bad(pud)) 59191612e0dSHugh Dickins continue; 59298094945SNaoya Horiguchi if (queue_pages_pmd_range(vma, pud, addr, next, nodes, 59338e35860SChristoph Lameter flags, private)) 59491612e0dSHugh Dickins return -EIO; 59591612e0dSHugh Dickins } while (pud++, addr = next, addr != end); 59691612e0dSHugh Dickins return 0; 59791612e0dSHugh Dickins } 59891612e0dSHugh Dickins 59998094945SNaoya Horiguchi static inline int queue_pages_pgd_range(struct vm_area_struct *vma, 600dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 601dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 60238e35860SChristoph Lameter void *private) 60391612e0dSHugh Dickins { 60491612e0dSHugh Dickins pgd_t *pgd; 60591612e0dSHugh Dickins unsigned long next; 60691612e0dSHugh Dickins 607b5810039SNick Piggin pgd = pgd_offset(vma->vm_mm, addr); 60891612e0dSHugh Dickins do { 60991612e0dSHugh Dickins next = pgd_addr_end(addr, end); 61091612e0dSHugh Dickins if (pgd_none_or_clear_bad(pgd)) 61191612e0dSHugh Dickins continue; 61298094945SNaoya Horiguchi if (queue_pages_pud_range(vma, pgd, addr, next, nodes, 61338e35860SChristoph Lameter flags, private)) 61491612e0dSHugh Dickins return -EIO; 61591612e0dSHugh Dickins } while (pgd++, addr = next, addr != end); 61691612e0dSHugh Dickins return 0; 6171da177e4SLinus Torvalds } 6181da177e4SLinus Torvalds 6195877231fSAneesh Kumar K.V #ifdef CONFIG_NUMA_BALANCING 620b24f53a0SLee Schermerhorn /* 6214b10e7d5SMel Gorman * This is used to mark a range of virtual addresses to be inaccessible. 6224b10e7d5SMel Gorman * These are later cleared by a NUMA hinting fault. Depending on these 6234b10e7d5SMel Gorman * faults, pages may be migrated for better NUMA placement. 6244b10e7d5SMel Gorman * 6254b10e7d5SMel Gorman * This is assuming that NUMA faults are handled using PROT_NONE. If 6264b10e7d5SMel Gorman * an architecture makes a different choice, it will need further 6274b10e7d5SMel Gorman * changes to the core. 
628b24f53a0SLee Schermerhorn */ 6294b10e7d5SMel Gorman unsigned long change_prot_numa(struct vm_area_struct *vma, 6304b10e7d5SMel Gorman unsigned long addr, unsigned long end) 631b24f53a0SLee Schermerhorn { 6324b10e7d5SMel Gorman int nr_updated; 633b24f53a0SLee Schermerhorn 6344b10e7d5SMel Gorman nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 63503c5a6e1SMel Gorman if (nr_updated) 63603c5a6e1SMel Gorman count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 637b24f53a0SLee Schermerhorn 6384b10e7d5SMel Gorman return nr_updated; 639b24f53a0SLee Schermerhorn } 640b24f53a0SLee Schermerhorn #else 641b24f53a0SLee Schermerhorn static unsigned long change_prot_numa(struct vm_area_struct *vma, 642b24f53a0SLee Schermerhorn unsigned long addr, unsigned long end) 643b24f53a0SLee Schermerhorn { 644b24f53a0SLee Schermerhorn return 0; 645b24f53a0SLee Schermerhorn } 6465877231fSAneesh Kumar K.V #endif /* CONFIG_NUMA_BALANCING */ 647b24f53a0SLee Schermerhorn 648dc9aa5b9SChristoph Lameter /* 64998094945SNaoya Horiguchi * Walk through page tables and collect pages to be migrated. 65098094945SNaoya Horiguchi * 65198094945SNaoya Horiguchi * If pages found in a given range are on a set of nodes (determined by 65298094945SNaoya Horiguchi * @nodes and @flags,) it's isolated and queued to the pagelist which is 65398094945SNaoya Horiguchi * passed via @private.) 654dc9aa5b9SChristoph Lameter */ 6551da177e4SLinus Torvalds static struct vm_area_struct * 65698094945SNaoya Horiguchi queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 65738e35860SChristoph Lameter const nodemask_t *nodes, unsigned long flags, void *private) 6581da177e4SLinus Torvalds { 6591da177e4SLinus Torvalds int err; 6601da177e4SLinus Torvalds struct vm_area_struct *first, *vma, *prev; 6611da177e4SLinus Torvalds 662053837fcSNick Piggin 6631da177e4SLinus Torvalds first = find_vma(mm, start); 6641da177e4SLinus Torvalds if (!first) 6651da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 6661da177e4SLinus Torvalds prev = NULL; 6671da177e4SLinus Torvalds for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 6685b952b3cSAndi Kleen unsigned long endvma = vma->vm_end; 669dc9aa5b9SChristoph Lameter 6705b952b3cSAndi Kleen if (endvma > end) 6715b952b3cSAndi Kleen endvma = end; 6725b952b3cSAndi Kleen if (vma->vm_start > start) 6735b952b3cSAndi Kleen start = vma->vm_start; 674b24f53a0SLee Schermerhorn 675b24f53a0SLee Schermerhorn if (!(flags & MPOL_MF_DISCONTIG_OK)) { 676b24f53a0SLee Schermerhorn if (!vma->vm_next && vma->vm_end < end) 677b24f53a0SLee Schermerhorn return ERR_PTR(-EFAULT); 678b24f53a0SLee Schermerhorn if (prev && prev->vm_end < vma->vm_start) 679b24f53a0SLee Schermerhorn return ERR_PTR(-EFAULT); 680b24f53a0SLee Schermerhorn } 681b24f53a0SLee Schermerhorn 682b24f53a0SLee Schermerhorn if (flags & MPOL_MF_LAZY) { 683b24f53a0SLee Schermerhorn change_prot_numa(vma, start, endvma); 684b24f53a0SLee Schermerhorn goto next; 685b24f53a0SLee Schermerhorn } 686b24f53a0SLee Schermerhorn 687b24f53a0SLee Schermerhorn if ((flags & MPOL_MF_STRICT) || 688b24f53a0SLee Schermerhorn ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 689b24f53a0SLee Schermerhorn vma_migratable(vma))) { 690b24f53a0SLee Schermerhorn 69198094945SNaoya Horiguchi err = queue_pages_pgd_range(vma, start, endvma, nodes, 69238e35860SChristoph Lameter flags, private); 6931da177e4SLinus Torvalds if (err) { 6941da177e4SLinus Torvalds first = ERR_PTR(err); 6951da177e4SLinus Torvalds break; 6961da177e4SLinus Torvalds } 6971da177e4SLinus 
Torvalds } 698b24f53a0SLee Schermerhorn next: 6991da177e4SLinus Torvalds prev = vma; 7001da177e4SLinus Torvalds } 7011da177e4SLinus Torvalds return first; 7021da177e4SLinus Torvalds } 7031da177e4SLinus Torvalds 704869833f2SKOSAKI Motohiro /* 705869833f2SKOSAKI Motohiro * Apply policy to a single VMA 706869833f2SKOSAKI Motohiro * This must be called with the mmap_sem held for writing. 707869833f2SKOSAKI Motohiro */ 708869833f2SKOSAKI Motohiro static int vma_replace_policy(struct vm_area_struct *vma, 709869833f2SKOSAKI Motohiro struct mempolicy *pol) 7108d34694cSKOSAKI Motohiro { 711869833f2SKOSAKI Motohiro int err; 712869833f2SKOSAKI Motohiro struct mempolicy *old; 713869833f2SKOSAKI Motohiro struct mempolicy *new; 7148d34694cSKOSAKI Motohiro 7158d34694cSKOSAKI Motohiro pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 7168d34694cSKOSAKI Motohiro vma->vm_start, vma->vm_end, vma->vm_pgoff, 7178d34694cSKOSAKI Motohiro vma->vm_ops, vma->vm_file, 7188d34694cSKOSAKI Motohiro vma->vm_ops ? vma->vm_ops->set_policy : NULL); 7198d34694cSKOSAKI Motohiro 720869833f2SKOSAKI Motohiro new = mpol_dup(pol); 721869833f2SKOSAKI Motohiro if (IS_ERR(new)) 722869833f2SKOSAKI Motohiro return PTR_ERR(new); 723869833f2SKOSAKI Motohiro 724869833f2SKOSAKI Motohiro if (vma->vm_ops && vma->vm_ops->set_policy) { 7258d34694cSKOSAKI Motohiro err = vma->vm_ops->set_policy(vma, new); 726869833f2SKOSAKI Motohiro if (err) 727869833f2SKOSAKI Motohiro goto err_out; 7288d34694cSKOSAKI Motohiro } 729869833f2SKOSAKI Motohiro 730869833f2SKOSAKI Motohiro old = vma->vm_policy; 731869833f2SKOSAKI Motohiro vma->vm_policy = new; /* protected by mmap_sem */ 732869833f2SKOSAKI Motohiro mpol_put(old); 733869833f2SKOSAKI Motohiro 734869833f2SKOSAKI Motohiro return 0; 735869833f2SKOSAKI Motohiro err_out: 736869833f2SKOSAKI Motohiro mpol_put(new); 7378d34694cSKOSAKI Motohiro return err; 7388d34694cSKOSAKI Motohiro } 7398d34694cSKOSAKI Motohiro 7401da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. 
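 *
 * mbind_range() walks the VMAs covering [start, end): it first tries to
 * merge each VMA with its neighbours under the new policy, then splits
 * VMAs that straddle the range boundaries, and finally installs the new
 * policy with vma_replace_policy().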
*/ 7419d8cebd4SKOSAKI Motohiro static int mbind_range(struct mm_struct *mm, unsigned long start, 7429d8cebd4SKOSAKI Motohiro unsigned long end, struct mempolicy *new_pol) 7431da177e4SLinus Torvalds { 7441da177e4SLinus Torvalds struct vm_area_struct *next; 7459d8cebd4SKOSAKI Motohiro struct vm_area_struct *prev; 7469d8cebd4SKOSAKI Motohiro struct vm_area_struct *vma; 7479d8cebd4SKOSAKI Motohiro int err = 0; 748e26a5114SKOSAKI Motohiro pgoff_t pgoff; 7499d8cebd4SKOSAKI Motohiro unsigned long vmstart; 7509d8cebd4SKOSAKI Motohiro unsigned long vmend; 7511da177e4SLinus Torvalds 752097d5910SLinus Torvalds vma = find_vma(mm, start); 7539d8cebd4SKOSAKI Motohiro if (!vma || vma->vm_start > start) 7549d8cebd4SKOSAKI Motohiro return -EFAULT; 7559d8cebd4SKOSAKI Motohiro 756097d5910SLinus Torvalds prev = vma->vm_prev; 757e26a5114SKOSAKI Motohiro if (start > vma->vm_start) 758e26a5114SKOSAKI Motohiro prev = vma; 759e26a5114SKOSAKI Motohiro 7609d8cebd4SKOSAKI Motohiro for (; vma && vma->vm_start < end; prev = vma, vma = next) { 7611da177e4SLinus Torvalds next = vma->vm_next; 7629d8cebd4SKOSAKI Motohiro vmstart = max(start, vma->vm_start); 7639d8cebd4SKOSAKI Motohiro vmend = min(end, vma->vm_end); 7649d8cebd4SKOSAKI Motohiro 765e26a5114SKOSAKI Motohiro if (mpol_equal(vma_policy(vma), new_pol)) 766e26a5114SKOSAKI Motohiro continue; 767e26a5114SKOSAKI Motohiro 768e26a5114SKOSAKI Motohiro pgoff = vma->vm_pgoff + 769e26a5114SKOSAKI Motohiro ((vmstart - vma->vm_start) >> PAGE_SHIFT); 7709d8cebd4SKOSAKI Motohiro prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 771e26a5114SKOSAKI Motohiro vma->anon_vma, vma->vm_file, pgoff, 7728aacc9f5SCaspar Zhang new_pol); 7739d8cebd4SKOSAKI Motohiro if (prev) { 7749d8cebd4SKOSAKI Motohiro vma = prev; 7759d8cebd4SKOSAKI Motohiro next = vma->vm_next; 7763964acd0SOleg Nesterov if (mpol_equal(vma_policy(vma), new_pol)) 7779d8cebd4SKOSAKI Motohiro continue; 7783964acd0SOleg Nesterov /* vma_merge() joined vma && vma->next, case 8 */ 7793964acd0SOleg Nesterov goto replace; 7801da177e4SLinus Torvalds } 7819d8cebd4SKOSAKI Motohiro if (vma->vm_start != vmstart) { 7829d8cebd4SKOSAKI Motohiro err = split_vma(vma->vm_mm, vma, vmstart, 1); 7839d8cebd4SKOSAKI Motohiro if (err) 7849d8cebd4SKOSAKI Motohiro goto out; 7859d8cebd4SKOSAKI Motohiro } 7869d8cebd4SKOSAKI Motohiro if (vma->vm_end != vmend) { 7879d8cebd4SKOSAKI Motohiro err = split_vma(vma->vm_mm, vma, vmend, 0); 7889d8cebd4SKOSAKI Motohiro if (err) 7899d8cebd4SKOSAKI Motohiro goto out; 7909d8cebd4SKOSAKI Motohiro } 7913964acd0SOleg Nesterov replace: 792869833f2SKOSAKI Motohiro err = vma_replace_policy(vma, new_pol); 7939d8cebd4SKOSAKI Motohiro if (err) 7949d8cebd4SKOSAKI Motohiro goto out; 7959d8cebd4SKOSAKI Motohiro } 7969d8cebd4SKOSAKI Motohiro 7979d8cebd4SKOSAKI Motohiro out: 7981da177e4SLinus Torvalds return err; 7991da177e4SLinus Torvalds } 8001da177e4SLinus Torvalds 8011da177e4SLinus Torvalds /* Set the process memory policy */ 802028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags, 803028fec41SDavid Rientjes nodemask_t *nodes) 8041da177e4SLinus Torvalds { 80558568d2aSMiao Xie struct mempolicy *new, *old; 806f4e53d91SLee Schermerhorn struct mm_struct *mm = current->mm; 8074bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch); 80858568d2aSMiao Xie int ret; 8091da177e4SLinus Torvalds 8104bfc4495SKAMEZAWA Hiroyuki if (!scratch) 8114bfc4495SKAMEZAWA Hiroyuki return -ENOMEM; 812f4e53d91SLee Schermerhorn 8134bfc4495SKAMEZAWA Hiroyuki new = mpol_new(mode, flags, nodes); 
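	/*
	 * mpol_new() only does basic validation and allocation; the
	 * effective nodemask is applied below by mpol_set_nodemask()
	 * under task_lock().
	 */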
8144bfc4495SKAMEZAWA Hiroyuki if (IS_ERR(new)) { 8154bfc4495SKAMEZAWA Hiroyuki ret = PTR_ERR(new); 8164bfc4495SKAMEZAWA Hiroyuki goto out; 8174bfc4495SKAMEZAWA Hiroyuki } 818f4e53d91SLee Schermerhorn /* 819f4e53d91SLee Schermerhorn * prevent changing our mempolicy while show_numa_maps() 820f4e53d91SLee Schermerhorn * is using it. 821f4e53d91SLee Schermerhorn * Note: do_set_mempolicy() can be called at init time 822f4e53d91SLee Schermerhorn * with no 'mm'. 823f4e53d91SLee Schermerhorn */ 824f4e53d91SLee Schermerhorn if (mm) 825f4e53d91SLee Schermerhorn down_write(&mm->mmap_sem); 82658568d2aSMiao Xie task_lock(current); 8274bfc4495SKAMEZAWA Hiroyuki ret = mpol_set_nodemask(new, nodes, scratch); 82858568d2aSMiao Xie if (ret) { 82958568d2aSMiao Xie task_unlock(current); 83058568d2aSMiao Xie if (mm) 83158568d2aSMiao Xie up_write(&mm->mmap_sem); 83258568d2aSMiao Xie mpol_put(new); 8334bfc4495SKAMEZAWA Hiroyuki goto out; 83458568d2aSMiao Xie } 83558568d2aSMiao Xie old = current->mempolicy; 8361da177e4SLinus Torvalds current->mempolicy = new; 83745c4745aSLee Schermerhorn if (new && new->mode == MPOL_INTERLEAVE && 838f5b087b5SDavid Rientjes nodes_weight(new->v.nodes)) 839dfcd3c0dSAndi Kleen current->il_next = first_node(new->v.nodes); 84058568d2aSMiao Xie task_unlock(current); 841f4e53d91SLee Schermerhorn if (mm) 842f4e53d91SLee Schermerhorn up_write(&mm->mmap_sem); 843f4e53d91SLee Schermerhorn 84458568d2aSMiao Xie mpol_put(old); 8454bfc4495SKAMEZAWA Hiroyuki ret = 0; 8464bfc4495SKAMEZAWA Hiroyuki out: 8474bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch); 8484bfc4495SKAMEZAWA Hiroyuki return ret; 8491da177e4SLinus Torvalds } 8501da177e4SLinus Torvalds 851bea904d5SLee Schermerhorn /* 852bea904d5SLee Schermerhorn * Return nodemask for policy for get_mempolicy() query 85358568d2aSMiao Xie * 85458568d2aSMiao Xie * Called with task's alloc_lock held 855bea904d5SLee Schermerhorn */ 856bea904d5SLee Schermerhorn static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 8571da177e4SLinus Torvalds { 858dfcd3c0dSAndi Kleen nodes_clear(*nodes); 859bea904d5SLee Schermerhorn if (p == &default_policy) 860bea904d5SLee Schermerhorn return; 861bea904d5SLee Schermerhorn 86245c4745aSLee Schermerhorn switch (p->mode) { 86319770b32SMel Gorman case MPOL_BIND: 86419770b32SMel Gorman /* Fall through */ 8651da177e4SLinus Torvalds case MPOL_INTERLEAVE: 866dfcd3c0dSAndi Kleen *nodes = p->v.nodes; 8671da177e4SLinus Torvalds break; 8681da177e4SLinus Torvalds case MPOL_PREFERRED: 869fc36b8d3SLee Schermerhorn if (!(p->flags & MPOL_F_LOCAL)) 870dfcd3c0dSAndi Kleen node_set(p->v.preferred_node, *nodes); 87153f2556bSLee Schermerhorn /* else return empty node mask for local allocation */ 8721da177e4SLinus Torvalds break; 8731da177e4SLinus Torvalds default: 8741da177e4SLinus Torvalds BUG(); 8751da177e4SLinus Torvalds } 8761da177e4SLinus Torvalds } 8771da177e4SLinus Torvalds 8781da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr) 8791da177e4SLinus Torvalds { 8801da177e4SLinus Torvalds struct page *p; 8811da177e4SLinus Torvalds int err; 8821da177e4SLinus Torvalds 8831da177e4SLinus Torvalds err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 8841da177e4SLinus Torvalds if (err >= 0) { 8851da177e4SLinus Torvalds err = page_to_nid(p); 8861da177e4SLinus Torvalds put_page(p); 8871da177e4SLinus Torvalds } 8881da177e4SLinus Torvalds return err; 8891da177e4SLinus Torvalds } 8901da177e4SLinus Torvalds 8911da177e4SLinus Torvalds /* Retrieve NUMA policy */ 
 */
892dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask, 8931da177e4SLinus Torvalds unsigned long addr, unsigned long flags) 8941da177e4SLinus Torvalds { 8958bccd85fSChristoph Lameter int err; 8961da177e4SLinus Torvalds struct mm_struct *mm = current->mm; 8971da177e4SLinus Torvalds struct vm_area_struct *vma = NULL; 8981da177e4SLinus Torvalds struct mempolicy *pol = current->mempolicy; 8991da177e4SLinus Torvalds 900754af6f5SLee Schermerhorn if (flags & 901754af6f5SLee Schermerhorn ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 9021da177e4SLinus Torvalds return -EINVAL; 903754af6f5SLee Schermerhorn 904754af6f5SLee Schermerhorn if (flags & MPOL_F_MEMS_ALLOWED) { 905754af6f5SLee Schermerhorn if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 906754af6f5SLee Schermerhorn return -EINVAL; 907754af6f5SLee Schermerhorn *policy = 0; /* just so it's initialized */ 90858568d2aSMiao Xie task_lock(current); 909754af6f5SLee Schermerhorn *nmask = cpuset_current_mems_allowed; 91058568d2aSMiao Xie task_unlock(current); 911754af6f5SLee Schermerhorn return 0; 912754af6f5SLee Schermerhorn } 913754af6f5SLee Schermerhorn 9141da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 915bea904d5SLee Schermerhorn /* 916bea904d5SLee Schermerhorn * Do NOT fall back to task policy if the 917bea904d5SLee Schermerhorn * vma/shared policy at addr is NULL. We 918bea904d5SLee Schermerhorn * want to return MPOL_DEFAULT in this case. 919bea904d5SLee Schermerhorn */ 9201da177e4SLinus Torvalds down_read(&mm->mmap_sem); 9211da177e4SLinus Torvalds vma = find_vma_intersection(mm, addr, addr+1); 9221da177e4SLinus Torvalds if (!vma) { 9231da177e4SLinus Torvalds up_read(&mm->mmap_sem); 9241da177e4SLinus Torvalds return -EFAULT; 9251da177e4SLinus Torvalds } 9261da177e4SLinus Torvalds if (vma->vm_ops && vma->vm_ops->get_policy) 9271da177e4SLinus Torvalds pol = vma->vm_ops->get_policy(vma, addr); 9281da177e4SLinus Torvalds else 9291da177e4SLinus Torvalds pol = vma->vm_policy; 9301da177e4SLinus Torvalds } else if (addr) 9311da177e4SLinus Torvalds return -EINVAL; 9321da177e4SLinus Torvalds 9331da177e4SLinus Torvalds if (!pol) 934bea904d5SLee Schermerhorn pol = &default_policy; /* indicates default behavior */ 9351da177e4SLinus Torvalds 9361da177e4SLinus Torvalds if (flags & MPOL_F_NODE) { 9371da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 9381da177e4SLinus Torvalds err = lookup_node(mm, addr); 9391da177e4SLinus Torvalds if (err < 0) 9401da177e4SLinus Torvalds goto out; 9418bccd85fSChristoph Lameter *policy = err; 9421da177e4SLinus Torvalds } else if (pol == current->mempolicy && 94345c4745aSLee Schermerhorn pol->mode == MPOL_INTERLEAVE) { 9448bccd85fSChristoph Lameter *policy = current->il_next; 9451da177e4SLinus Torvalds } else { 9461da177e4SLinus Torvalds err = -EINVAL; 9471da177e4SLinus Torvalds goto out; 9481da177e4SLinus Torvalds } 949bea904d5SLee Schermerhorn } else { 950bea904d5SLee Schermerhorn *policy = pol == &default_policy ? MPOL_DEFAULT : 951bea904d5SLee Schermerhorn pol->mode; 952d79df630SDavid Rientjes /* 953d79df630SDavid Rientjes * Internal mempolicy flags must be masked off before exposing 954d79df630SDavid Rientjes * the policy to userspace. 
955d79df630SDavid Rientjes */ 956d79df630SDavid Rientjes *policy |= (pol->flags & MPOL_MODE_FLAGS); 957bea904d5SLee Schermerhorn } 9581da177e4SLinus Torvalds 9591da177e4SLinus Torvalds if (vma) { 9601da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 9611da177e4SLinus Torvalds vma = NULL; 9621da177e4SLinus Torvalds } 9631da177e4SLinus Torvalds 9641da177e4SLinus Torvalds err = 0; 96558568d2aSMiao Xie if (nmask) { 966c6b6ef8bSLee Schermerhorn if (mpol_store_user_nodemask(pol)) { 967c6b6ef8bSLee Schermerhorn *nmask = pol->w.user_nodemask; 968c6b6ef8bSLee Schermerhorn } else { 96958568d2aSMiao Xie task_lock(current); 970bea904d5SLee Schermerhorn get_policy_nodemask(pol, nmask); 97158568d2aSMiao Xie task_unlock(current); 97258568d2aSMiao Xie } 973c6b6ef8bSLee Schermerhorn } 9741da177e4SLinus Torvalds 9751da177e4SLinus Torvalds out: 97652cd3b07SLee Schermerhorn mpol_cond_put(pol); 9771da177e4SLinus Torvalds if (vma) 9781da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 9791da177e4SLinus Torvalds return err; 9801da177e4SLinus Torvalds } 9811da177e4SLinus Torvalds 982b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION 9838bccd85fSChristoph Lameter /* 9846ce3c4c0SChristoph Lameter * page migration 9856ce3c4c0SChristoph Lameter */ 986fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist, 987fc301289SChristoph Lameter unsigned long flags) 9886ce3c4c0SChristoph Lameter { 9896ce3c4c0SChristoph Lameter /* 990fc301289SChristoph Lameter * Avoid migrating a page that is shared with others. 9916ce3c4c0SChristoph Lameter */ 99262695a84SNick Piggin if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 99362695a84SNick Piggin if (!isolate_lru_page(page)) { 99462695a84SNick Piggin list_add_tail(&page->lru, pagelist); 9956d9c285aSKOSAKI Motohiro inc_zone_page_state(page, NR_ISOLATED_ANON + 9966d9c285aSKOSAKI Motohiro page_is_file_cache(page)); 99762695a84SNick Piggin } 99862695a84SNick Piggin } 9996ce3c4c0SChristoph Lameter } 10006ce3c4c0SChristoph Lameter 1001742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x) 100295a402c3SChristoph Lameter { 1003e2d8cf40SNaoya Horiguchi if (PageHuge(page)) 1004e2d8cf40SNaoya Horiguchi return alloc_huge_page_node(page_hstate(compound_head(page)), 1005e2d8cf40SNaoya Horiguchi node); 1006e2d8cf40SNaoya Horiguchi else 10076484eb3eSMel Gorman return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 100895a402c3SChristoph Lameter } 100995a402c3SChristoph Lameter 10106ce3c4c0SChristoph Lameter /* 10117e2ab150SChristoph Lameter * Migrate pages from one node to a target node. 10127e2ab150SChristoph Lameter * Returns error or the number of pages not migrated. 10137e2ab150SChristoph Lameter */ 1014dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest, 1015dbcb0f19SAdrian Bunk int flags) 10167e2ab150SChristoph Lameter { 10177e2ab150SChristoph Lameter nodemask_t nmask; 10187e2ab150SChristoph Lameter LIST_HEAD(pagelist); 10197e2ab150SChristoph Lameter int err = 0; 10207e2ab150SChristoph Lameter 10217e2ab150SChristoph Lameter nodes_clear(nmask); 10227e2ab150SChristoph Lameter node_set(source, nmask); 10237e2ab150SChristoph Lameter 102408270807SMinchan Kim /* 102508270807SMinchan Kim * This does not "check" the range but isolates all pages that 102608270807SMinchan Kim * need migration. Between passing in the full user address 102708270807SMinchan Kim * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 
102808270807SMinchan Kim */ 102908270807SMinchan Kim VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 103098094945SNaoya Horiguchi queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 10317e2ab150SChristoph Lameter flags | MPOL_MF_DISCONTIG_OK, &pagelist); 10327e2ab150SChristoph Lameter 1033cf608ac1SMinchan Kim if (!list_empty(&pagelist)) { 103468711a74SDavid Rientjes err = migrate_pages(&pagelist, new_node_page, NULL, dest, 10359c620e2bSHugh Dickins MIGRATE_SYNC, MR_SYSCALL); 1036cf608ac1SMinchan Kim if (err) 1037e2d8cf40SNaoya Horiguchi putback_movable_pages(&pagelist); 1038cf608ac1SMinchan Kim } 103995a402c3SChristoph Lameter 10407e2ab150SChristoph Lameter return err; 10417e2ab150SChristoph Lameter } 10427e2ab150SChristoph Lameter 10437e2ab150SChristoph Lameter /* 10447e2ab150SChristoph Lameter * Move pages between the two nodesets so as to preserve the physical 10457e2ab150SChristoph Lameter * layout as much as possible. 104639743889SChristoph Lameter * 104739743889SChristoph Lameter * Returns the number of page that could not be moved. 104839743889SChristoph Lameter */ 10490ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 10500ce72d4fSAndrew Morton const nodemask_t *to, int flags) 105139743889SChristoph Lameter { 10527e2ab150SChristoph Lameter int busy = 0; 10530aedadf9SChristoph Lameter int err; 10547e2ab150SChristoph Lameter nodemask_t tmp; 105539743889SChristoph Lameter 10560aedadf9SChristoph Lameter err = migrate_prep(); 10570aedadf9SChristoph Lameter if (err) 10580aedadf9SChristoph Lameter return err; 10590aedadf9SChristoph Lameter 106039743889SChristoph Lameter down_read(&mm->mmap_sem); 1061d4984711SChristoph Lameter 10620ce72d4fSAndrew Morton err = migrate_vmas(mm, from, to, flags); 10637b2259b3SChristoph Lameter if (err) 10647b2259b3SChristoph Lameter goto out; 10657b2259b3SChristoph Lameter 10667e2ab150SChristoph Lameter /* 10677e2ab150SChristoph Lameter * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 10687e2ab150SChristoph Lameter * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 10697e2ab150SChristoph Lameter * bit in 'tmp', and return that <source, dest> pair for migration. 10707e2ab150SChristoph Lameter * The pair of nodemasks 'to' and 'from' define the map. 10717e2ab150SChristoph Lameter * 10727e2ab150SChristoph Lameter * If no pair of bits is found that way, fallback to picking some 10737e2ab150SChristoph Lameter * pair of 'source' and 'dest' bits that are not the same. If the 10747e2ab150SChristoph Lameter * 'source' and 'dest' bits are the same, this represents a node 10757e2ab150SChristoph Lameter * that will be migrating to itself, so no pages need move. 10767e2ab150SChristoph Lameter * 10777e2ab150SChristoph Lameter * If no bits are left in 'tmp', or if all remaining bits left 10787e2ab150SChristoph Lameter * in 'tmp' correspond to the same bit in 'to', return false 10797e2ab150SChristoph Lameter * (nothing left to migrate). 10807e2ab150SChristoph Lameter * 10817e2ab150SChristoph Lameter * This lets us pick a pair of nodes to migrate between, such that 10827e2ab150SChristoph Lameter * if possible the dest node is not already occupied by some other 10837e2ab150SChristoph Lameter * source node, minimizing the risk of overloading the memory on a 10847e2ab150SChristoph Lameter * node that would happen if we migrated incoming memory to a node 10857e2ab150SChristoph Lameter * before migrating outgoing memory source that same node. 
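 *
 * Illustrative run (hypothetical masks): from = {0,1}, to = {1,2}. The
 * scan first finds 0 -> 1, but node 1 is still a source, so it keeps
 * looking and finds 1 -> 2, whose destination is an empty slot. 1 -> 2
 * is therefore migrated first, and only then 0 -> 1, so node 1 is
 * drained before it receives node 0's pages.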
10867e2ab150SChristoph Lameter * 10877e2ab150SChristoph Lameter * A single scan of tmp is sufficient. As we go, we remember the 10887e2ab150SChristoph Lameter * most recent <s, d> pair that moved (s != d). If we find a pair 10897e2ab150SChristoph Lameter * that not only moved, but what's better, moved to an empty slot 10907e2ab150SChristoph Lameter * (d is not set in tmp), then we break out then, with that pair. 1091ae0e47f0SJustin P. Mattock * Otherwise when we finish scanning from_tmp, we at least have the 10927e2ab150SChristoph Lameter * most recent <s, d> pair that moved. If we get all the way through 10937e2ab150SChristoph Lameter * the scan of tmp without finding any node that moved, much less 10947e2ab150SChristoph Lameter * moved to an empty node, then there is nothing left worth migrating. 10957e2ab150SChristoph Lameter */ 10967e2ab150SChristoph Lameter 10970ce72d4fSAndrew Morton tmp = *from; 10987e2ab150SChristoph Lameter while (!nodes_empty(tmp)) { 10997e2ab150SChristoph Lameter int s,d; 1100b76ac7e7SJianguo Wu int source = NUMA_NO_NODE; 11017e2ab150SChristoph Lameter int dest = 0; 11027e2ab150SChristoph Lameter 11037e2ab150SChristoph Lameter for_each_node_mask(s, tmp) { 11044a5b18ccSLarry Woodman 11054a5b18ccSLarry Woodman /* 11064a5b18ccSLarry Woodman * do_migrate_pages() tries to maintain the relative 11074a5b18ccSLarry Woodman * node relationship of the pages established between 11084a5b18ccSLarry Woodman * threads and memory areas. 11094a5b18ccSLarry Woodman * 11104a5b18ccSLarry Woodman * However if the number of source nodes is not equal to 11114a5b18ccSLarry Woodman * the number of destination nodes we can not preserve 11124a5b18ccSLarry Woodman * this node relative relationship. In that case, skip 11134a5b18ccSLarry Woodman * copying memory from a node that is in the destination 11144a5b18ccSLarry Woodman * mask. 11154a5b18ccSLarry Woodman * 11164a5b18ccSLarry Woodman * Example: [2,3,4] -> [3,4,5] moves everything. 11174a5b18ccSLarry Woodman * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 11184a5b18ccSLarry Woodman */ 11194a5b18ccSLarry Woodman 11200ce72d4fSAndrew Morton if ((nodes_weight(*from) != nodes_weight(*to)) && 11210ce72d4fSAndrew Morton (node_isset(s, *to))) 11224a5b18ccSLarry Woodman continue; 11234a5b18ccSLarry Woodman 11240ce72d4fSAndrew Morton d = node_remap(s, *from, *to); 11257e2ab150SChristoph Lameter if (s == d) 11267e2ab150SChristoph Lameter continue; 11277e2ab150SChristoph Lameter 11287e2ab150SChristoph Lameter source = s; /* Node moved. Memorize */ 11297e2ab150SChristoph Lameter dest = d; 11307e2ab150SChristoph Lameter 11317e2ab150SChristoph Lameter /* dest not in remaining from nodes? 
*/ 11327e2ab150SChristoph Lameter if (!node_isset(dest, tmp)) 11337e2ab150SChristoph Lameter break; 11347e2ab150SChristoph Lameter } 1135b76ac7e7SJianguo Wu if (source == NUMA_NO_NODE) 11367e2ab150SChristoph Lameter break; 11377e2ab150SChristoph Lameter 11387e2ab150SChristoph Lameter node_clear(source, tmp); 11397e2ab150SChristoph Lameter err = migrate_to_node(mm, source, dest, flags); 11407e2ab150SChristoph Lameter if (err > 0) 11417e2ab150SChristoph Lameter busy += err; 11427e2ab150SChristoph Lameter if (err < 0) 11437e2ab150SChristoph Lameter break; 114439743889SChristoph Lameter } 11457b2259b3SChristoph Lameter out: 114639743889SChristoph Lameter up_read(&mm->mmap_sem); 11477e2ab150SChristoph Lameter if (err < 0) 11487e2ab150SChristoph Lameter return err; 11497e2ab150SChristoph Lameter return busy; 1150b20a3503SChristoph Lameter 115139743889SChristoph Lameter } 115239743889SChristoph Lameter 11533ad33b24SLee Schermerhorn /* 11543ad33b24SLee Schermerhorn * Allocate a new page for page migration based on vma policy. 11553ad33b24SLee Schermerhorn * Start assuming that page is mapped by vma pointed to by @private. 11563ad33b24SLee Schermerhorn * Search forward from there, if not. N.B., this assumes that the 11573ad33b24SLee Schermerhorn * list of pages handed to migrate_pages()--which is how we get here-- 11583ad33b24SLee Schermerhorn * is in virtual address order. 11593ad33b24SLee Schermerhorn */ 1160742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 116195a402c3SChristoph Lameter { 116295a402c3SChristoph Lameter struct vm_area_struct *vma = (struct vm_area_struct *)private; 11633ad33b24SLee Schermerhorn unsigned long uninitialized_var(address); 116495a402c3SChristoph Lameter 11653ad33b24SLee Schermerhorn while (vma) { 11663ad33b24SLee Schermerhorn address = page_address_in_vma(page, vma); 11673ad33b24SLee Schermerhorn if (address != -EFAULT) 11683ad33b24SLee Schermerhorn break; 11693ad33b24SLee Schermerhorn vma = vma->vm_next; 11703ad33b24SLee Schermerhorn } 11713ad33b24SLee Schermerhorn 117211c731e8SWanpeng Li if (PageHuge(page)) { 1173cc81717eSMichal Hocko BUG_ON(!vma); 117474060e4dSNaoya Horiguchi return alloc_huge_page_noerr(vma, address, 1); 117511c731e8SWanpeng Li } 117611c731e8SWanpeng Li /* 117711c731e8SWanpeng Li * if !vma, alloc_page_vma() will use task or system default policy 117811c731e8SWanpeng Li */ 11793ad33b24SLee Schermerhorn return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 118095a402c3SChristoph Lameter } 1181b20a3503SChristoph Lameter #else 1182b20a3503SChristoph Lameter 1183b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist, 1184b20a3503SChristoph Lameter unsigned long flags) 1185b20a3503SChristoph Lameter { 1186b20a3503SChristoph Lameter } 1187b20a3503SChristoph Lameter 11880ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 11890ce72d4fSAndrew Morton const nodemask_t *to, int flags) 1190b20a3503SChristoph Lameter { 1191b20a3503SChristoph Lameter return -ENOSYS; 1192b20a3503SChristoph Lameter } 119395a402c3SChristoph Lameter 119469939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 119595a402c3SChristoph Lameter { 119695a402c3SChristoph Lameter return NULL; 119795a402c3SChristoph Lameter } 1198b20a3503SChristoph Lameter #endif 1199b20a3503SChristoph Lameter 1200dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len, 1201028fec41SDavid 
Rientjes unsigned short mode, unsigned short mode_flags, 1202028fec41SDavid Rientjes nodemask_t *nmask, unsigned long flags) 12036ce3c4c0SChristoph Lameter { 12046ce3c4c0SChristoph Lameter struct vm_area_struct *vma; 12056ce3c4c0SChristoph Lameter struct mm_struct *mm = current->mm; 12066ce3c4c0SChristoph Lameter struct mempolicy *new; 12076ce3c4c0SChristoph Lameter unsigned long end; 12086ce3c4c0SChristoph Lameter int err; 12096ce3c4c0SChristoph Lameter LIST_HEAD(pagelist); 12106ce3c4c0SChristoph Lameter 1211b24f53a0SLee Schermerhorn if (flags & ~(unsigned long)MPOL_MF_VALID) 12126ce3c4c0SChristoph Lameter return -EINVAL; 121374c00241SChristoph Lameter if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 12146ce3c4c0SChristoph Lameter return -EPERM; 12156ce3c4c0SChristoph Lameter 12166ce3c4c0SChristoph Lameter if (start & ~PAGE_MASK) 12176ce3c4c0SChristoph Lameter return -EINVAL; 12186ce3c4c0SChristoph Lameter 12196ce3c4c0SChristoph Lameter if (mode == MPOL_DEFAULT) 12206ce3c4c0SChristoph Lameter flags &= ~MPOL_MF_STRICT; 12216ce3c4c0SChristoph Lameter 12226ce3c4c0SChristoph Lameter len = (len + PAGE_SIZE - 1) & PAGE_MASK; 12236ce3c4c0SChristoph Lameter end = start + len; 12246ce3c4c0SChristoph Lameter 12256ce3c4c0SChristoph Lameter if (end < start) 12266ce3c4c0SChristoph Lameter return -EINVAL; 12276ce3c4c0SChristoph Lameter if (end == start) 12286ce3c4c0SChristoph Lameter return 0; 12296ce3c4c0SChristoph Lameter 1230028fec41SDavid Rientjes new = mpol_new(mode, mode_flags, nmask); 12316ce3c4c0SChristoph Lameter if (IS_ERR(new)) 12326ce3c4c0SChristoph Lameter return PTR_ERR(new); 12336ce3c4c0SChristoph Lameter 1234b24f53a0SLee Schermerhorn if (flags & MPOL_MF_LAZY) 1235b24f53a0SLee Schermerhorn new->flags |= MPOL_F_MOF; 1236b24f53a0SLee Schermerhorn 12376ce3c4c0SChristoph Lameter /* 12386ce3c4c0SChristoph Lameter * If we are using the default policy then operation 12396ce3c4c0SChristoph Lameter * on discontinuous address spaces is okay after all 12406ce3c4c0SChristoph Lameter */ 12416ce3c4c0SChristoph Lameter if (!new) 12426ce3c4c0SChristoph Lameter flags |= MPOL_MF_DISCONTIG_OK; 12436ce3c4c0SChristoph Lameter 1244028fec41SDavid Rientjes pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1245028fec41SDavid Rientjes start, start + len, mode, mode_flags, 124600ef2d2fSDavid Rientjes nmask ? 
nodes_addr(*nmask)[0] : NUMA_NO_NODE); 12476ce3c4c0SChristoph Lameter 12480aedadf9SChristoph Lameter if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 12490aedadf9SChristoph Lameter 12500aedadf9SChristoph Lameter err = migrate_prep(); 12510aedadf9SChristoph Lameter if (err) 1252b05ca738SKOSAKI Motohiro goto mpol_out; 12530aedadf9SChristoph Lameter } 12544bfc4495SKAMEZAWA Hiroyuki { 12554bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch); 12564bfc4495SKAMEZAWA Hiroyuki if (scratch) { 12576ce3c4c0SChristoph Lameter down_write(&mm->mmap_sem); 125858568d2aSMiao Xie task_lock(current); 12594bfc4495SKAMEZAWA Hiroyuki err = mpol_set_nodemask(new, nmask, scratch); 126058568d2aSMiao Xie task_unlock(current); 12614bfc4495SKAMEZAWA Hiroyuki if (err) 126258568d2aSMiao Xie up_write(&mm->mmap_sem); 12634bfc4495SKAMEZAWA Hiroyuki } else 12644bfc4495SKAMEZAWA Hiroyuki err = -ENOMEM; 12654bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch); 12664bfc4495SKAMEZAWA Hiroyuki } 1267b05ca738SKOSAKI Motohiro if (err) 1268b05ca738SKOSAKI Motohiro goto mpol_out; 1269b05ca738SKOSAKI Motohiro 127098094945SNaoya Horiguchi vma = queue_pages_range(mm, start, end, nmask, 12716ce3c4c0SChristoph Lameter flags | MPOL_MF_INVERT, &pagelist); 12726ce3c4c0SChristoph Lameter 1273b24f53a0SLee Schermerhorn err = PTR_ERR(vma); /* maybe ... */ 1274a720094dSMel Gorman if (!IS_ERR(vma)) 12759d8cebd4SKOSAKI Motohiro err = mbind_range(mm, start, end, new); 12767e2ab150SChristoph Lameter 1277b24f53a0SLee Schermerhorn if (!err) { 1278b24f53a0SLee Schermerhorn int nr_failed = 0; 1279b24f53a0SLee Schermerhorn 1280cf608ac1SMinchan Kim if (!list_empty(&pagelist)) { 1281b24f53a0SLee Schermerhorn WARN_ON_ONCE(flags & MPOL_MF_LAZY); 128295a402c3SChristoph Lameter nr_failed = migrate_pages(&pagelist, new_vma_page, 128368711a74SDavid Rientjes NULL, (unsigned long)vma, 12849c620e2bSHugh Dickins MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1285cf608ac1SMinchan Kim if (nr_failed) 128674060e4dSNaoya Horiguchi putback_movable_pages(&pagelist); 1287cf608ac1SMinchan Kim } 12886ce3c4c0SChristoph Lameter 1289b24f53a0SLee Schermerhorn if (nr_failed && (flags & MPOL_MF_STRICT)) 12906ce3c4c0SChristoph Lameter err = -EIO; 1291ab8a3e14SKOSAKI Motohiro } else 1292b0e5fd73SJoonsoo Kim putback_movable_pages(&pagelist); 1293b20a3503SChristoph Lameter 12946ce3c4c0SChristoph Lameter up_write(&mm->mmap_sem); 1295b05ca738SKOSAKI Motohiro mpol_out: 1296f0be3d32SLee Schermerhorn mpol_put(new); 12976ce3c4c0SChristoph Lameter return err; 12986ce3c4c0SChristoph Lameter } 12996ce3c4c0SChristoph Lameter 130039743889SChristoph Lameter /* 13018bccd85fSChristoph Lameter * User space interface with variable sized bitmaps for nodelists. 13028bccd85fSChristoph Lameter */ 13038bccd85fSChristoph Lameter 13048bccd85fSChristoph Lameter /* Copy a node mask from user space. 
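 * For example, a user-supplied maxnode of 3 becomes 2 after the initial
 * decrement, giving endmask = 0x3: only bits 0 and 1 are kept, and any
 * higher bits within the copied word are silently cleared.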
*/ 130539743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 13068bccd85fSChristoph Lameter unsigned long maxnode) 13078bccd85fSChristoph Lameter { 13088bccd85fSChristoph Lameter unsigned long k; 13098bccd85fSChristoph Lameter unsigned long nlongs; 13108bccd85fSChristoph Lameter unsigned long endmask; 13118bccd85fSChristoph Lameter 13128bccd85fSChristoph Lameter --maxnode; 13138bccd85fSChristoph Lameter nodes_clear(*nodes); 13148bccd85fSChristoph Lameter if (maxnode == 0 || !nmask) 13158bccd85fSChristoph Lameter return 0; 1316a9c930baSAndi Kleen if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1317636f13c1SChris Wright return -EINVAL; 13188bccd85fSChristoph Lameter 13198bccd85fSChristoph Lameter nlongs = BITS_TO_LONGS(maxnode); 13208bccd85fSChristoph Lameter if ((maxnode % BITS_PER_LONG) == 0) 13218bccd85fSChristoph Lameter endmask = ~0UL; 13228bccd85fSChristoph Lameter else 13238bccd85fSChristoph Lameter endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 13248bccd85fSChristoph Lameter 13258bccd85fSChristoph Lameter /* When the user specified more nodes than supported just check 13268bccd85fSChristoph Lameter if the non supported part is all zero. */ 13278bccd85fSChristoph Lameter if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 13288bccd85fSChristoph Lameter if (nlongs > PAGE_SIZE/sizeof(long)) 13298bccd85fSChristoph Lameter return -EINVAL; 13308bccd85fSChristoph Lameter for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 13318bccd85fSChristoph Lameter unsigned long t; 13328bccd85fSChristoph Lameter if (get_user(t, nmask + k)) 13338bccd85fSChristoph Lameter return -EFAULT; 13348bccd85fSChristoph Lameter if (k == nlongs - 1) { 13358bccd85fSChristoph Lameter if (t & endmask) 13368bccd85fSChristoph Lameter return -EINVAL; 13378bccd85fSChristoph Lameter } else if (t) 13388bccd85fSChristoph Lameter return -EINVAL; 13398bccd85fSChristoph Lameter } 13408bccd85fSChristoph Lameter nlongs = BITS_TO_LONGS(MAX_NUMNODES); 13418bccd85fSChristoph Lameter endmask = ~0UL; 13428bccd85fSChristoph Lameter } 13438bccd85fSChristoph Lameter 13448bccd85fSChristoph Lameter if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) 13458bccd85fSChristoph Lameter return -EFAULT; 13468bccd85fSChristoph Lameter nodes_addr(*nodes)[nlongs-1] &= endmask; 13478bccd85fSChristoph Lameter return 0; 13488bccd85fSChristoph Lameter } 13498bccd85fSChristoph Lameter 13508bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */ 13518bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 13528bccd85fSChristoph Lameter nodemask_t *nodes) 13538bccd85fSChristoph Lameter { 13548bccd85fSChristoph Lameter unsigned long copy = ALIGN(maxnode-1, 64) / 8; 13558bccd85fSChristoph Lameter const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); 13568bccd85fSChristoph Lameter 13578bccd85fSChristoph Lameter if (copy > nbytes) { 13588bccd85fSChristoph Lameter if (copy > PAGE_SIZE) 13598bccd85fSChristoph Lameter return -EINVAL; 13608bccd85fSChristoph Lameter if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 13618bccd85fSChristoph Lameter return -EFAULT; 13628bccd85fSChristoph Lameter copy = nbytes; 13638bccd85fSChristoph Lameter } 13648bccd85fSChristoph Lameter return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; 13658bccd85fSChristoph Lameter } 13668bccd85fSChristoph Lameter 1367938bb9f5SHeiko Carstens SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1368f7f28ca9SRasmus Villemoes unsigned long, mode, const unsigned long __user *, nmask, 1369938bb9f5SHeiko Carstens unsigned long, maxnode, unsigned, flags) 13708bccd85fSChristoph Lameter { 13718bccd85fSChristoph Lameter nodemask_t nodes; 13728bccd85fSChristoph Lameter int err; 1373028fec41SDavid Rientjes unsigned short mode_flags; 13748bccd85fSChristoph Lameter 1375028fec41SDavid Rientjes mode_flags = mode & MPOL_MODE_FLAGS; 1376028fec41SDavid Rientjes mode &= ~MPOL_MODE_FLAGS; 1377a3b51e01SDavid Rientjes if (mode >= MPOL_MAX) 1378a3b51e01SDavid Rientjes return -EINVAL; 13794c50bc01SDavid Rientjes if ((mode_flags & MPOL_F_STATIC_NODES) && 13804c50bc01SDavid Rientjes (mode_flags & MPOL_F_RELATIVE_NODES)) 13814c50bc01SDavid Rientjes return -EINVAL; 13828bccd85fSChristoph Lameter err = get_nodes(&nodes, nmask, maxnode); 13838bccd85fSChristoph Lameter if (err) 13848bccd85fSChristoph Lameter return err; 1385028fec41SDavid Rientjes return do_mbind(start, len, mode, mode_flags, &nodes, flags); 13868bccd85fSChristoph Lameter } 13878bccd85fSChristoph Lameter 13888bccd85fSChristoph Lameter /* Set the process memory policy */ 138923c8902dSRasmus Villemoes SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, 1390938bb9f5SHeiko Carstens unsigned long, maxnode) 13918bccd85fSChristoph Lameter { 13928bccd85fSChristoph Lameter int err; 13938bccd85fSChristoph Lameter nodemask_t nodes; 1394028fec41SDavid Rientjes unsigned short flags; 13958bccd85fSChristoph Lameter 1396028fec41SDavid Rientjes flags = mode & MPOL_MODE_FLAGS; 1397028fec41SDavid Rientjes mode &= ~MPOL_MODE_FLAGS; 1398028fec41SDavid Rientjes if ((unsigned int)mode >= MPOL_MAX) 13998bccd85fSChristoph Lameter return -EINVAL; 14004c50bc01SDavid Rientjes if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) 14014c50bc01SDavid Rientjes return -EINVAL; 14028bccd85fSChristoph Lameter err = get_nodes(&nodes, nmask, maxnode); 14038bccd85fSChristoph Lameter if (err) 14048bccd85fSChristoph Lameter return err; 1405028fec41SDavid Rientjes return do_set_mempolicy(mode, flags, &nodes); 14068bccd85fSChristoph Lameter } 14078bccd85fSChristoph Lameter 1408938bb9f5SHeiko Carstens SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1409938bb9f5SHeiko Carstens const unsigned long __user *, old_nodes, 1410938bb9f5SHeiko Carstens const unsigned long __user *, new_nodes) 141139743889SChristoph Lameter { 1412c69e8d9cSDavid Howells const struct cred *cred = current_cred(), *tcred; 1413596d7cfaSKOSAKI Motohiro struct mm_struct *mm = NULL; 141439743889SChristoph Lameter struct task_struct *task; 141539743889SChristoph Lameter nodemask_t task_nodes; 141639743889SChristoph Lameter int err; 1417596d7cfaSKOSAKI Motohiro nodemask_t *old; 1418596d7cfaSKOSAKI Motohiro nodemask_t *new; 1419596d7cfaSKOSAKI Motohiro NODEMASK_SCRATCH(scratch); 142039743889SChristoph Lameter 1421596d7cfaSKOSAKI Motohiro if (!scratch) 1422596d7cfaSKOSAKI Motohiro return -ENOMEM; 142339743889SChristoph Lameter 1424596d7cfaSKOSAKI Motohiro old = &scratch->mask1; 1425596d7cfaSKOSAKI Motohiro new = &scratch->mask2; 1426596d7cfaSKOSAKI Motohiro 1427596d7cfaSKOSAKI Motohiro err = get_nodes(old, old_nodes, maxnode); 142839743889SChristoph Lameter if (err) 1429596d7cfaSKOSAKI Motohiro goto out; 1430596d7cfaSKOSAKI Motohiro 1431596d7cfaSKOSAKI Motohiro err = get_nodes(new, new_nodes, 
maxnode); 1432596d7cfaSKOSAKI Motohiro if (err) 1433596d7cfaSKOSAKI Motohiro goto out; 143439743889SChristoph Lameter 143539743889SChristoph Lameter /* Find the mm_struct */ 143655cfaa3cSZeng Zhaoming rcu_read_lock(); 1437228ebcbeSPavel Emelyanov task = pid ? find_task_by_vpid(pid) : current; 143839743889SChristoph Lameter if (!task) { 143955cfaa3cSZeng Zhaoming rcu_read_unlock(); 1440596d7cfaSKOSAKI Motohiro err = -ESRCH; 1441596d7cfaSKOSAKI Motohiro goto out; 144239743889SChristoph Lameter } 14433268c63eSChristoph Lameter get_task_struct(task); 144439743889SChristoph Lameter 1445596d7cfaSKOSAKI Motohiro err = -EINVAL; 144639743889SChristoph Lameter 144739743889SChristoph Lameter /* 144839743889SChristoph Lameter * Check if this process has the right to modify the specified 144939743889SChristoph Lameter * process. The right exists if the process has administrative 14507f927fccSAlexey Dobriyan * capabilities, superuser privileges or the same 145139743889SChristoph Lameter * userid as the target process. 145239743889SChristoph Lameter */ 1453c69e8d9cSDavid Howells tcred = __task_cred(task); 1454b38a86ebSEric W. Biederman if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && 1455b38a86ebSEric W. Biederman !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && 145674c00241SChristoph Lameter !capable(CAP_SYS_NICE)) { 1457c69e8d9cSDavid Howells rcu_read_unlock(); 145839743889SChristoph Lameter err = -EPERM; 14593268c63eSChristoph Lameter goto out_put; 146039743889SChristoph Lameter } 1461c69e8d9cSDavid Howells rcu_read_unlock(); 146239743889SChristoph Lameter 146339743889SChristoph Lameter task_nodes = cpuset_mems_allowed(task); 146439743889SChristoph Lameter /* Is the user allowed to access the target nodes? */ 1465596d7cfaSKOSAKI Motohiro if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 146639743889SChristoph Lameter err = -EPERM; 14673268c63eSChristoph Lameter goto out_put; 146839743889SChristoph Lameter } 146939743889SChristoph Lameter 147001f13bd6SLai Jiangshan if (!nodes_subset(*new, node_states[N_MEMORY])) { 14713b42d28bSChristoph Lameter err = -EINVAL; 14723268c63eSChristoph Lameter goto out_put; 14733b42d28bSChristoph Lameter } 14743b42d28bSChristoph Lameter 147586c3a764SDavid Quigley err = security_task_movememory(task); 147686c3a764SDavid Quigley if (err) 14773268c63eSChristoph Lameter goto out_put; 147886c3a764SDavid Quigley 14793268c63eSChristoph Lameter mm = get_task_mm(task); 14803268c63eSChristoph Lameter put_task_struct(task); 1481f2a9ef88SSasha Levin 1482f2a9ef88SSasha Levin if (!mm) { 1483f2a9ef88SSasha Levin err = -EINVAL; 1484f2a9ef88SSasha Levin goto out; 1485f2a9ef88SSasha Levin } 1486f2a9ef88SSasha Levin 1487596d7cfaSKOSAKI Motohiro err = do_migrate_pages(mm, old, new, 148874c00241SChristoph Lameter capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 14893268c63eSChristoph Lameter 149039743889SChristoph Lameter mmput(mm); 14913268c63eSChristoph Lameter out: 1492596d7cfaSKOSAKI Motohiro NODEMASK_SCRATCH_FREE(scratch); 1493596d7cfaSKOSAKI Motohiro 149439743889SChristoph Lameter return err; 14953268c63eSChristoph Lameter 14963268c63eSChristoph Lameter out_put: 14973268c63eSChristoph Lameter put_task_struct(task); 14983268c63eSChristoph Lameter goto out; 14993268c63eSChristoph Lameter 150039743889SChristoph Lameter } 150139743889SChristoph Lameter 150239743889SChristoph Lameter 15038bccd85fSChristoph Lameter /* Retrieve NUMA policy */ 1504938bb9f5SHeiko Carstens SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1505938bb9f5SHeiko Carstens unsigned long __user *, nmask, unsigned long, maxnode, 1506938bb9f5SHeiko Carstens unsigned long, addr, unsigned long, flags) 15078bccd85fSChristoph Lameter { 1508dbcb0f19SAdrian Bunk int err; 1509dbcb0f19SAdrian Bunk int uninitialized_var(pval); 15108bccd85fSChristoph Lameter nodemask_t nodes; 15118bccd85fSChristoph Lameter 15128bccd85fSChristoph Lameter if (nmask != NULL && maxnode < MAX_NUMNODES) 15138bccd85fSChristoph Lameter return -EINVAL; 15148bccd85fSChristoph Lameter 15158bccd85fSChristoph Lameter err = do_get_mempolicy(&pval, &nodes, addr, flags); 15168bccd85fSChristoph Lameter 15178bccd85fSChristoph Lameter if (err) 15188bccd85fSChristoph Lameter return err; 15198bccd85fSChristoph Lameter 15208bccd85fSChristoph Lameter if (policy && put_user(pval, policy)) 15218bccd85fSChristoph Lameter return -EFAULT; 15228bccd85fSChristoph Lameter 15238bccd85fSChristoph Lameter if (nmask) 15248bccd85fSChristoph Lameter err = copy_nodes_to_user(nmask, maxnode, &nodes); 15258bccd85fSChristoph Lameter 15268bccd85fSChristoph Lameter return err; 15278bccd85fSChristoph Lameter } 15288bccd85fSChristoph Lameter 15291da177e4SLinus Torvalds #ifdef CONFIG_COMPAT 15301da177e4SLinus Torvalds 1531c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1532c93e0f6cSHeiko Carstens compat_ulong_t __user *, nmask, 1533c93e0f6cSHeiko Carstens compat_ulong_t, maxnode, 1534c93e0f6cSHeiko Carstens compat_ulong_t, addr, compat_ulong_t, flags) 15351da177e4SLinus Torvalds { 15361da177e4SLinus Torvalds long err; 15371da177e4SLinus Torvalds unsigned long __user *nm = NULL; 15381da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 15391da177e4SLinus Torvalds DECLARE_BITMAP(bm, MAX_NUMNODES); 15401da177e4SLinus Torvalds 15411da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 15421da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 15431da177e4SLinus Torvalds 15441da177e4SLinus Torvalds if (nmask) 15451da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 15461da177e4SLinus Torvalds 15471da177e4SLinus Torvalds err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 15481da177e4SLinus Torvalds 15491da177e4SLinus Torvalds if (!err && nmask) { 15502bbff6c7SKAMEZAWA Hiroyuki unsigned long copy_size; 15512bbff6c7SKAMEZAWA Hiroyuki copy_size = min_t(unsigned long, sizeof(bm), alloc_size); 15522bbff6c7SKAMEZAWA Hiroyuki err = copy_from_user(bm, nm, copy_size); 15531da177e4SLinus Torvalds /* ensure entire bitmap is zeroed */ 15541da177e4SLinus Torvalds err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 15551da177e4SLinus Torvalds err |= compat_put_bitmap(nmask, bm, nr_bits); 15561da177e4SLinus Torvalds } 15571da177e4SLinus Torvalds 15581da177e4SLinus Torvalds return err; 15591da177e4SLinus Torvalds } 
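/*
 * Minimal userspace sketch (not part of this file or the kernel build;
 * assumes libnuma's <numaif.h> syscall wrappers, linked with -lnuma, and a
 * machine that actually has memory nodes 0 and 1) exercising the
 * set_mempolicy()/get_mempolicy() paths above:
 *
 *	#include <numaif.h>		// set_mempolicy(), get_mempolicy()
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// 16 longs = 1024 node bits, comfortably >= MAX_NUMNODES
 *		unsigned long mask[16] = { 0 };
 *		unsigned long maxnode = 8 * sizeof(mask);
 *		int mode;
 *
 *		mask[0] = 1UL << 0 | 1UL << 1;	// interleave over nodes 0, 1
 *		if (set_mempolicy(MPOL_INTERLEAVE, mask, maxnode))
 *			perror("set_mempolicy");
 *
 *		mask[0] = 0;
 *		if (get_mempolicy(&mode, mask, maxnode, NULL, 0))
 *			perror("get_mempolicy");
 *		else
 *			printf("mode=%d nodemask[0]=%#lx\n", mode, mask[0]);
 *		return 0;
 *	}
 *
 * Passing a mask wider than MAX_NUMNODES is fine: get_nodes() only
 * requires the excess bits to be clear, and get_mempolicy() insists on
 * maxnode >= MAX_NUMNODES whenever a nodemask is requested.
 */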
15601da177e4SLinus Torvalds 1561c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, 1562c93e0f6cSHeiko Carstens compat_ulong_t, maxnode) 15631da177e4SLinus Torvalds { 15641da177e4SLinus Torvalds long err = 0; 15651da177e4SLinus Torvalds unsigned long __user *nm = NULL; 15661da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 15671da177e4SLinus Torvalds DECLARE_BITMAP(bm, MAX_NUMNODES); 15681da177e4SLinus Torvalds 15691da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 15701da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 15711da177e4SLinus Torvalds 15721da177e4SLinus Torvalds if (nmask) { 15731da177e4SLinus Torvalds err = compat_get_bitmap(bm, nmask, nr_bits); 15741da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 15751da177e4SLinus Torvalds err |= copy_to_user(nm, bm, alloc_size); 15761da177e4SLinus Torvalds } 15771da177e4SLinus Torvalds 15781da177e4SLinus Torvalds if (err) 15791da177e4SLinus Torvalds return -EFAULT; 15801da177e4SLinus Torvalds 15811da177e4SLinus Torvalds return sys_set_mempolicy(mode, nm, nr_bits+1); 15821da177e4SLinus Torvalds } 15831da177e4SLinus Torvalds 1584c93e0f6cSHeiko Carstens COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, 1585c93e0f6cSHeiko Carstens compat_ulong_t, mode, compat_ulong_t __user *, nmask, 1586c93e0f6cSHeiko Carstens compat_ulong_t, maxnode, compat_ulong_t, flags) 15871da177e4SLinus Torvalds { 15881da177e4SLinus Torvalds long err = 0; 15891da177e4SLinus Torvalds unsigned long __user *nm = NULL; 15901da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 1591dfcd3c0dSAndi Kleen nodemask_t bm; 15921da177e4SLinus Torvalds 15931da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 15941da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 15951da177e4SLinus Torvalds 15961da177e4SLinus Torvalds if (nmask) { 1597dfcd3c0dSAndi Kleen err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 15981da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 1599dfcd3c0dSAndi Kleen err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 16001da177e4SLinus Torvalds } 16011da177e4SLinus Torvalds 16021da177e4SLinus Torvalds if (err) 16031da177e4SLinus Torvalds return -EFAULT; 16041da177e4SLinus Torvalds 16051da177e4SLinus Torvalds return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 16061da177e4SLinus Torvalds } 16071da177e4SLinus Torvalds 16081da177e4SLinus Torvalds #endif 16091da177e4SLinus Torvalds 1610480eccf9SLee Schermerhorn /* 1611480eccf9SLee Schermerhorn * get_vma_policy(@task, @vma, @addr) 1612b46e14acSFabian Frederick * @task: task for fallback if vma policy == default 1613b46e14acSFabian Frederick * @vma: virtual memory area whose policy is sought 1614b46e14acSFabian Frederick * @addr: address in @vma for shared policy lookup 1615480eccf9SLee Schermerhorn * 1616480eccf9SLee Schermerhorn * Returns effective policy for a VMA at specified address. 1617480eccf9SLee Schermerhorn * Falls back to @task or system default policy, as necessary. 161832f8516aSDavid Rientjes * Current or other task's task mempolicy and non-shared vma policies must be 161932f8516aSDavid Rientjes * protected by task_lock(task) by the caller. 
162052cd3b07SLee Schermerhorn * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 162152cd3b07SLee Schermerhorn * count--added by the get_policy() vm_op, as appropriate--to protect against 162252cd3b07SLee Schermerhorn * freeing by another task. It is the caller's responsibility to free the 162352cd3b07SLee Schermerhorn * extra reference for shared policies. 1624480eccf9SLee Schermerhorn */ 1625d98f6cb6SStephen Wilson struct mempolicy *get_vma_policy(struct task_struct *task, 162648fce342SChristoph Lameter struct vm_area_struct *vma, unsigned long addr) 16271da177e4SLinus Torvalds { 16285606e387SMel Gorman struct mempolicy *pol = get_task_policy(task); 16291da177e4SLinus Torvalds 16301da177e4SLinus Torvalds if (vma) { 1631480eccf9SLee Schermerhorn if (vma->vm_ops && vma->vm_ops->get_policy) { 1632ae4d8c16SLee Schermerhorn struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1633ae4d8c16SLee Schermerhorn addr); 1634ae4d8c16SLee Schermerhorn if (vpol) 1635ae4d8c16SLee Schermerhorn pol = vpol; 163600442ad0SMel Gorman } else if (vma->vm_policy) { 16371da177e4SLinus Torvalds pol = vma->vm_policy; 163800442ad0SMel Gorman 163900442ad0SMel Gorman /* 164000442ad0SMel Gorman * shmem_alloc_page() passes MPOL_F_SHARED policy with 164100442ad0SMel Gorman * a pseudo vma whose vma->vm_ops=NULL. Take a reference 164200442ad0SMel Gorman * count on these policies which will be dropped by 164300442ad0SMel Gorman * mpol_cond_put() later 164400442ad0SMel Gorman */ 164500442ad0SMel Gorman if (mpol_needs_cond_ref(pol)) 164600442ad0SMel Gorman mpol_get(pol); 164700442ad0SMel Gorman } 16481da177e4SLinus Torvalds } 16491da177e4SLinus Torvalds if (!pol) 16501da177e4SLinus Torvalds pol = &default_policy; 16511da177e4SLinus Torvalds return pol; 16521da177e4SLinus Torvalds } 16531da177e4SLinus Torvalds 1654fc314724SMel Gorman bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1655fc314724SMel Gorman { 1656fc314724SMel Gorman struct mempolicy *pol = get_task_policy(task); 1657fc314724SMel Gorman if (vma) { 1658fc314724SMel Gorman if (vma->vm_ops && vma->vm_ops->get_policy) { 1659fc314724SMel Gorman bool ret = false; 1660fc314724SMel Gorman 1661fc314724SMel Gorman pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1662fc314724SMel Gorman if (pol && (pol->flags & MPOL_F_MOF)) 1663fc314724SMel Gorman ret = true; 1664fc314724SMel Gorman mpol_cond_put(pol); 1665fc314724SMel Gorman 1666fc314724SMel Gorman return ret; 1667fc314724SMel Gorman } else if (vma->vm_policy) { 1668fc314724SMel Gorman pol = vma->vm_policy; 1669fc314724SMel Gorman } 1670fc314724SMel Gorman } 1671fc314724SMel Gorman 1672fc314724SMel Gorman if (!pol) 1673fc314724SMel Gorman return default_policy.flags & MPOL_F_MOF; 1674fc314724SMel Gorman 1675fc314724SMel Gorman return pol->flags & MPOL_F_MOF; 1676fc314724SMel Gorman } 1677fc314724SMel Gorman 1678d3eb1570SLai Jiangshan static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1679d3eb1570SLai Jiangshan { 1680d3eb1570SLai Jiangshan enum zone_type dynamic_policy_zone = policy_zone; 1681d3eb1570SLai Jiangshan 1682d3eb1570SLai Jiangshan BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 1683d3eb1570SLai Jiangshan 1684d3eb1570SLai Jiangshan /* 1685d3eb1570SLai Jiangshan * if policy->v.nodes has movable memory only, 1686d3eb1570SLai Jiangshan * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 1687d3eb1570SLai Jiangshan * 1688d3eb1570SLai Jiangshan * policy->v.nodes is intersect with node_states[N_MEMORY]. 
1689d3eb1570SLai Jiangshan * so if the following test faile, it implies 1690d3eb1570SLai Jiangshan * policy->v.nodes has movable memory only. 1691d3eb1570SLai Jiangshan */ 1692d3eb1570SLai Jiangshan if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) 1693d3eb1570SLai Jiangshan dynamic_policy_zone = ZONE_MOVABLE; 1694d3eb1570SLai Jiangshan 1695d3eb1570SLai Jiangshan return zone >= dynamic_policy_zone; 1696d3eb1570SLai Jiangshan } 1697d3eb1570SLai Jiangshan 169852cd3b07SLee Schermerhorn /* 169952cd3b07SLee Schermerhorn * Return a nodemask representing a mempolicy for filtering nodes for 170052cd3b07SLee Schermerhorn * page allocation 170152cd3b07SLee Schermerhorn */ 170252cd3b07SLee Schermerhorn static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) 170319770b32SMel Gorman { 170419770b32SMel Gorman /* Lower zones don't get a nodemask applied for MPOL_BIND */ 170545c4745aSLee Schermerhorn if (unlikely(policy->mode == MPOL_BIND) && 1706d3eb1570SLai Jiangshan apply_policy_zone(policy, gfp_zone(gfp)) && 170719770b32SMel Gorman cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 170819770b32SMel Gorman return &policy->v.nodes; 170919770b32SMel Gorman 171019770b32SMel Gorman return NULL; 171119770b32SMel Gorman } 171219770b32SMel Gorman 171352cd3b07SLee Schermerhorn /* Return a zonelist indicated by gfp for node representing a mempolicy */ 17142f5f9486SAndi Kleen static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, 17152f5f9486SAndi Kleen int nd) 17161da177e4SLinus Torvalds { 171745c4745aSLee Schermerhorn switch (policy->mode) { 17181da177e4SLinus Torvalds case MPOL_PREFERRED: 1719fc36b8d3SLee Schermerhorn if (!(policy->flags & MPOL_F_LOCAL)) 17201da177e4SLinus Torvalds nd = policy->v.preferred_node; 17211da177e4SLinus Torvalds break; 17221da177e4SLinus Torvalds case MPOL_BIND: 172319770b32SMel Gorman /* 172452cd3b07SLee Schermerhorn * Normally, MPOL_BIND allocations are node-local within the 172552cd3b07SLee Schermerhorn * allowed nodemask. However, if __GFP_THISNODE is set and the 17266eb27e1fSBob Liu * current node isn't part of the mask, we use the zonelist for 172752cd3b07SLee Schermerhorn * the first node in the mask instead. 
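	 *
	 * For example, a __GFP_THISNODE allocation issued with nd = 0 while
	 * the policy is bound to nodes {2,3} comes back with node 2's
	 * zonelist.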
172819770b32SMel Gorman */ 172919770b32SMel Gorman if (unlikely(gfp & __GFP_THISNODE) && 173019770b32SMel Gorman unlikely(!node_isset(nd, policy->v.nodes))) 173119770b32SMel Gorman nd = first_node(policy->v.nodes); 173219770b32SMel Gorman break; 17331da177e4SLinus Torvalds default: 17341da177e4SLinus Torvalds BUG(); 17351da177e4SLinus Torvalds } 17360e88460dSMel Gorman return node_zonelist(nd, gfp); 17371da177e4SLinus Torvalds } 17381da177e4SLinus Torvalds 17391da177e4SLinus Torvalds /* Do dynamic interleaving for a process */ 17401da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy) 17411da177e4SLinus Torvalds { 17421da177e4SLinus Torvalds unsigned nid, next; 17431da177e4SLinus Torvalds struct task_struct *me = current; 17441da177e4SLinus Torvalds 17451da177e4SLinus Torvalds nid = me->il_next; 1746dfcd3c0dSAndi Kleen next = next_node(nid, policy->v.nodes); 17471da177e4SLinus Torvalds if (next >= MAX_NUMNODES) 1748dfcd3c0dSAndi Kleen next = first_node(policy->v.nodes); 1749f5b087b5SDavid Rientjes if (next < MAX_NUMNODES) 17501da177e4SLinus Torvalds me->il_next = next; 17511da177e4SLinus Torvalds return nid; 17521da177e4SLinus Torvalds } 17531da177e4SLinus Torvalds 1754dc85da15SChristoph Lameter /* 1755dc85da15SChristoph Lameter * Depending on the memory policy provide a node from which to allocate the 1756dc85da15SChristoph Lameter * next slab entry. 1757dc85da15SChristoph Lameter */ 17582a389610SDavid Rientjes unsigned int mempolicy_slab_node(void) 1759dc85da15SChristoph Lameter { 1760e7b691b0SAndi Kleen struct mempolicy *policy; 17612a389610SDavid Rientjes int node = numa_mem_id(); 1762e7b691b0SAndi Kleen 1763e7b691b0SAndi Kleen if (in_interrupt()) 17642a389610SDavid Rientjes return node; 1765e7b691b0SAndi Kleen 1766e7b691b0SAndi Kleen policy = current->mempolicy; 1767fc36b8d3SLee Schermerhorn if (!policy || policy->flags & MPOL_F_LOCAL) 17682a389610SDavid Rientjes return node; 1769765c4507SChristoph Lameter 1770bea904d5SLee Schermerhorn switch (policy->mode) { 1771bea904d5SLee Schermerhorn case MPOL_PREFERRED: 1772fc36b8d3SLee Schermerhorn /* 1773fc36b8d3SLee Schermerhorn * handled MPOL_F_LOCAL above 1774fc36b8d3SLee Schermerhorn */ 1775bea904d5SLee Schermerhorn return policy->v.preferred_node; 1776bea904d5SLee Schermerhorn 1777dc85da15SChristoph Lameter case MPOL_INTERLEAVE: 1778dc85da15SChristoph Lameter return interleave_nodes(policy); 1779dc85da15SChristoph Lameter 1780dd1a239fSMel Gorman case MPOL_BIND: { 1781dc85da15SChristoph Lameter /* 1782dc85da15SChristoph Lameter * Follow bind policy behavior and start allocation at the 1783dc85da15SChristoph Lameter * first node. 1784dc85da15SChristoph Lameter */ 178519770b32SMel Gorman struct zonelist *zonelist; 178619770b32SMel Gorman struct zone *zone; 178719770b32SMel Gorman enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 17882a389610SDavid Rientjes zonelist = &NODE_DATA(node)->node_zonelists[0]; 178919770b32SMel Gorman (void)first_zones_zonelist(zonelist, highest_zoneidx, 179019770b32SMel Gorman &policy->v.nodes, 179119770b32SMel Gorman &zone); 17922a389610SDavid Rientjes return zone ? zone->node : node; 1793dd1a239fSMel Gorman } 1794dc85da15SChristoph Lameter 1795dc85da15SChristoph Lameter default: 1796bea904d5SLee Schermerhorn BUG(); 1797dc85da15SChristoph Lameter } 1798dc85da15SChristoph Lameter } 1799dc85da15SChristoph Lameter 18001da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. 
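 * For example, with pol->v.nodes = {0,2,5} (nnodes = 3) and off = 7,
 * target = 7 % 3 = 1 and the loop below stops at the second set node,
 * i.e. node 2.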
*/ 18011da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol, 18021da177e4SLinus Torvalds struct vm_area_struct *vma, unsigned long off) 18031da177e4SLinus Torvalds { 1804dfcd3c0dSAndi Kleen unsigned nnodes = nodes_weight(pol->v.nodes); 1805f5b087b5SDavid Rientjes unsigned target; 18061da177e4SLinus Torvalds int c; 1807b76ac7e7SJianguo Wu int nid = NUMA_NO_NODE; 18081da177e4SLinus Torvalds 1809f5b087b5SDavid Rientjes if (!nnodes) 1810f5b087b5SDavid Rientjes return numa_node_id(); 1811f5b087b5SDavid Rientjes target = (unsigned int)off % nnodes; 18121da177e4SLinus Torvalds c = 0; 18131da177e4SLinus Torvalds do { 1814dfcd3c0dSAndi Kleen nid = next_node(nid, pol->v.nodes); 18151da177e4SLinus Torvalds c++; 18161da177e4SLinus Torvalds } while (c <= target); 18171da177e4SLinus Torvalds return nid; 18181da177e4SLinus Torvalds } 18191da177e4SLinus Torvalds 18205da7ca86SChristoph Lameter /* Determine a node number for interleave */ 18215da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol, 18225da7ca86SChristoph Lameter struct vm_area_struct *vma, unsigned long addr, int shift) 18235da7ca86SChristoph Lameter { 18245da7ca86SChristoph Lameter if (vma) { 18255da7ca86SChristoph Lameter unsigned long off; 18265da7ca86SChristoph Lameter 18273b98b087SNishanth Aravamudan /* 18283b98b087SNishanth Aravamudan * for small pages, there is no difference between 18293b98b087SNishanth Aravamudan * shift and PAGE_SHIFT, so the bit-shift is safe. 18303b98b087SNishanth Aravamudan * for huge pages, since vm_pgoff is in units of small 18313b98b087SNishanth Aravamudan * pages, we need to shift off the always 0 bits to get 18323b98b087SNishanth Aravamudan * a useful offset. 18333b98b087SNishanth Aravamudan */ 18343b98b087SNishanth Aravamudan BUG_ON(shift < PAGE_SHIFT); 18353b98b087SNishanth Aravamudan off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 18365da7ca86SChristoph Lameter off += (addr - vma->vm_start) >> shift; 18375da7ca86SChristoph Lameter return offset_il_node(pol, vma, off); 18385da7ca86SChristoph Lameter } else 18395da7ca86SChristoph Lameter return interleave_nodes(pol); 18405da7ca86SChristoph Lameter } 18415da7ca86SChristoph Lameter 1842778d3b0fSMichal Hocko /* 1843778d3b0fSMichal Hocko * Return the bit number of a random bit set in the nodemask. 
1844b76ac7e7SJianguo Wu * (returns NUMA_NO_NODE if nodemask is empty) 1845778d3b0fSMichal Hocko */ 1846778d3b0fSMichal Hocko int node_random(const nodemask_t *maskp) 1847778d3b0fSMichal Hocko { 1848b76ac7e7SJianguo Wu int w, bit = NUMA_NO_NODE; 1849778d3b0fSMichal Hocko 1850778d3b0fSMichal Hocko w = nodes_weight(*maskp); 1851778d3b0fSMichal Hocko if (w) 1852778d3b0fSMichal Hocko bit = bitmap_ord_to_pos(maskp->bits, 1853778d3b0fSMichal Hocko get_random_int() % w, MAX_NUMNODES); 1854778d3b0fSMichal Hocko return bit; 1855778d3b0fSMichal Hocko } 1856778d3b0fSMichal Hocko 185700ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS 1858480eccf9SLee Schermerhorn /* 1859480eccf9SLee Schermerhorn * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1860b46e14acSFabian Frederick * @vma: virtual memory area whose policy is sought 1861b46e14acSFabian Frederick * @addr: address in @vma for shared policy lookup and interleave policy 1862b46e14acSFabian Frederick * @gfp_flags: for requested zone 1863b46e14acSFabian Frederick * @mpol: pointer to mempolicy pointer for reference counted mempolicy 1864b46e14acSFabian Frederick * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask 1865480eccf9SLee Schermerhorn * 186652cd3b07SLee Schermerhorn * Returns a zonelist suitable for a huge page allocation and a pointer 186752cd3b07SLee Schermerhorn * to the struct mempolicy for conditional unref after allocation. 186852cd3b07SLee Schermerhorn * If the effective policy is 'BIND, returns a pointer to the mempolicy's 186952cd3b07SLee Schermerhorn * @nodemask for filtering the zonelist. 1870c0ff7453SMiao Xie * 1871d26914d1SMel Gorman * Must be protected by read_mems_allowed_begin() 1872480eccf9SLee Schermerhorn */ 1873396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 187419770b32SMel Gorman gfp_t gfp_flags, struct mempolicy **mpol, 187519770b32SMel Gorman nodemask_t **nodemask) 18765da7ca86SChristoph Lameter { 1877480eccf9SLee Schermerhorn struct zonelist *zl; 18785da7ca86SChristoph Lameter 187952cd3b07SLee Schermerhorn *mpol = get_vma_policy(current, vma, addr); 188019770b32SMel Gorman *nodemask = NULL; /* assume !MPOL_BIND */ 18815da7ca86SChristoph Lameter 188252cd3b07SLee Schermerhorn if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 188352cd3b07SLee Schermerhorn zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1884a5516438SAndi Kleen huge_page_shift(hstate_vma(vma))), gfp_flags); 188552cd3b07SLee Schermerhorn } else { 18862f5f9486SAndi Kleen zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); 188752cd3b07SLee Schermerhorn if ((*mpol)->mode == MPOL_BIND) 188852cd3b07SLee Schermerhorn *nodemask = &(*mpol)->v.nodes; 1889480eccf9SLee Schermerhorn } 1890480eccf9SLee Schermerhorn return zl; 18915da7ca86SChristoph Lameter } 189206808b08SLee Schermerhorn 189306808b08SLee Schermerhorn /* 189406808b08SLee Schermerhorn * init_nodemask_of_mempolicy 189506808b08SLee Schermerhorn * 189606808b08SLee Schermerhorn * If the current task's mempolicy is "default" [NULL], return 'false' 189706808b08SLee Schermerhorn * to indicate default policy. Otherwise, extract the policy nodemask 189806808b08SLee Schermerhorn * for 'bind' or 'interleave' policy into the argument nodemask, or 189906808b08SLee Schermerhorn * initialize the argument nodemask to contain the single node for 190006808b08SLee Schermerhorn * 'preferred' or 'local' policy and return 'true' to indicate presence 190106808b08SLee Schermerhorn * of non-default mempolicy. 
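 *
 * For example, MPOL_INTERLEAVE over nodes {0,1,3} copies {0,1,3} into
 * the argument nodemask, while MPOL_PREFERRED with preferred node 2
 * initializes it to the single node {2}.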
190206808b08SLee Schermerhorn * 190306808b08SLee Schermerhorn * We don't bother with reference counting the mempolicy [mpol_get/put] 190406808b08SLee Schermerhorn * because the current task is examining it's own mempolicy and a task's 190506808b08SLee Schermerhorn * mempolicy is only ever changed by the task itself. 190606808b08SLee Schermerhorn * 190706808b08SLee Schermerhorn * N.B., it is the caller's responsibility to free a returned nodemask. 190806808b08SLee Schermerhorn */ 190906808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask) 191006808b08SLee Schermerhorn { 191106808b08SLee Schermerhorn struct mempolicy *mempolicy; 191206808b08SLee Schermerhorn int nid; 191306808b08SLee Schermerhorn 191406808b08SLee Schermerhorn if (!(mask && current->mempolicy)) 191506808b08SLee Schermerhorn return false; 191606808b08SLee Schermerhorn 1917c0ff7453SMiao Xie task_lock(current); 191806808b08SLee Schermerhorn mempolicy = current->mempolicy; 191906808b08SLee Schermerhorn switch (mempolicy->mode) { 192006808b08SLee Schermerhorn case MPOL_PREFERRED: 192106808b08SLee Schermerhorn if (mempolicy->flags & MPOL_F_LOCAL) 192206808b08SLee Schermerhorn nid = numa_node_id(); 192306808b08SLee Schermerhorn else 192406808b08SLee Schermerhorn nid = mempolicy->v.preferred_node; 192506808b08SLee Schermerhorn init_nodemask_of_node(mask, nid); 192606808b08SLee Schermerhorn break; 192706808b08SLee Schermerhorn 192806808b08SLee Schermerhorn case MPOL_BIND: 192906808b08SLee Schermerhorn /* Fall through */ 193006808b08SLee Schermerhorn case MPOL_INTERLEAVE: 193106808b08SLee Schermerhorn *mask = mempolicy->v.nodes; 193206808b08SLee Schermerhorn break; 193306808b08SLee Schermerhorn 193406808b08SLee Schermerhorn default: 193506808b08SLee Schermerhorn BUG(); 193606808b08SLee Schermerhorn } 1937c0ff7453SMiao Xie task_unlock(current); 193806808b08SLee Schermerhorn 193906808b08SLee Schermerhorn return true; 194006808b08SLee Schermerhorn } 194100ac59adSChen, Kenneth W #endif 19425da7ca86SChristoph Lameter 19436f48d0ebSDavid Rientjes /* 19446f48d0ebSDavid Rientjes * mempolicy_nodemask_intersects 19456f48d0ebSDavid Rientjes * 19466f48d0ebSDavid Rientjes * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default 19476f48d0ebSDavid Rientjes * policy. Otherwise, check for intersection between mask and the policy 19486f48d0ebSDavid Rientjes * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' 19496f48d0ebSDavid Rientjes * policy, always return true since it may allocate elsewhere on fallback. 19506f48d0ebSDavid Rientjes * 19516f48d0ebSDavid Rientjes * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
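 *
 * For example, an MPOL_BIND policy over nodes {0,2} intersects a mask of
 * {2,3} but not a mask of {1,3}, while a 'preferred' or 'local' policy
 * always reports an intersection.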
19526f48d0ebSDavid Rientjes */ 19536f48d0ebSDavid Rientjes bool mempolicy_nodemask_intersects(struct task_struct *tsk, 19546f48d0ebSDavid Rientjes const nodemask_t *mask) 19556f48d0ebSDavid Rientjes { 19566f48d0ebSDavid Rientjes struct mempolicy *mempolicy; 19576f48d0ebSDavid Rientjes bool ret = true; 19586f48d0ebSDavid Rientjes 19596f48d0ebSDavid Rientjes if (!mask) 19606f48d0ebSDavid Rientjes return ret; 19616f48d0ebSDavid Rientjes task_lock(tsk); 19626f48d0ebSDavid Rientjes mempolicy = tsk->mempolicy; 19636f48d0ebSDavid Rientjes if (!mempolicy) 19646f48d0ebSDavid Rientjes goto out; 19656f48d0ebSDavid Rientjes 19666f48d0ebSDavid Rientjes switch (mempolicy->mode) { 19676f48d0ebSDavid Rientjes case MPOL_PREFERRED: 19686f48d0ebSDavid Rientjes /* 19696f48d0ebSDavid Rientjes * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to 19706f48d0ebSDavid Rientjes * allocate from, they may fallback to other nodes when oom. 19716f48d0ebSDavid Rientjes * Thus, it's possible for tsk to have allocated memory from 19726f48d0ebSDavid Rientjes * nodes in mask. 19736f48d0ebSDavid Rientjes */ 19746f48d0ebSDavid Rientjes break; 19756f48d0ebSDavid Rientjes case MPOL_BIND: 19766f48d0ebSDavid Rientjes case MPOL_INTERLEAVE: 19776f48d0ebSDavid Rientjes ret = nodes_intersects(mempolicy->v.nodes, *mask); 19786f48d0ebSDavid Rientjes break; 19796f48d0ebSDavid Rientjes default: 19806f48d0ebSDavid Rientjes BUG(); 19816f48d0ebSDavid Rientjes } 19826f48d0ebSDavid Rientjes out: 19836f48d0ebSDavid Rientjes task_unlock(tsk); 19846f48d0ebSDavid Rientjes return ret; 19856f48d0ebSDavid Rientjes } 19866f48d0ebSDavid Rientjes 19871da177e4SLinus Torvalds /* Allocate a page in interleaved policy. 19881da177e4SLinus Torvalds Own path because it needs to do special accounting. */ 1989662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1990662f3a0bSAndi Kleen unsigned nid) 19911da177e4SLinus Torvalds { 19921da177e4SLinus Torvalds struct zonelist *zl; 19931da177e4SLinus Torvalds struct page *page; 19941da177e4SLinus Torvalds 19950e88460dSMel Gorman zl = node_zonelist(nid, gfp); 19961da177e4SLinus Torvalds page = __alloc_pages(gfp, order, zl); 1997dd1a239fSMel Gorman if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) 1998ca889e6cSChristoph Lameter inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 19991da177e4SLinus Torvalds return page; 20001da177e4SLinus Torvalds } 20011da177e4SLinus Torvalds 20021da177e4SLinus Torvalds /** 20030bbbc0b3SAndrea Arcangeli * alloc_pages_vma - Allocate a page for a VMA. 20041da177e4SLinus Torvalds * 20051da177e4SLinus Torvalds * @gfp: 20061da177e4SLinus Torvalds * %GFP_USER user allocation. 20071da177e4SLinus Torvalds * %GFP_KERNEL kernel allocations, 20081da177e4SLinus Torvalds * %GFP_HIGHMEM highmem/user allocations, 20091da177e4SLinus Torvalds * %GFP_FS allocation should not call back into a file system. 20101da177e4SLinus Torvalds * %GFP_ATOMIC don't sleep. 20111da177e4SLinus Torvalds * 20120bbbc0b3SAndrea Arcangeli * @order:Order of the GFP allocation. 20131da177e4SLinus Torvalds * @vma: Pointer to VMA or NULL if not available. 20141da177e4SLinus Torvalds * @addr: Virtual Address of the allocation. Must be inside the VMA. 20151da177e4SLinus Torvalds * 20161da177e4SLinus Torvalds * This function allocates a page from the kernel page pool and applies 20171da177e4SLinus Torvalds * a NUMA policy associated with the VMA or the current process. 
20181da177e4SLinus Torvalds * When VMA is not NULL caller must hold down_read on the mmap_sem of the 20191da177e4SLinus Torvalds * mm_struct of the VMA to prevent it from going away. Should be used for 20201da177e4SLinus Torvalds * all allocations for pages that will be mapped into 20211da177e4SLinus Torvalds * user space. Returns NULL when no page can be allocated. 20221da177e4SLinus Torvalds * 20231da177e4SLinus Torvalds * Should be called with the mm_sem of the vma hold. 20241da177e4SLinus Torvalds */ 20251da177e4SLinus Torvalds struct page * 20260bbbc0b3SAndrea Arcangeli alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 20272f5f9486SAndi Kleen unsigned long addr, int node) 20281da177e4SLinus Torvalds { 2029cc9a6c87SMel Gorman struct mempolicy *pol; 2030c0ff7453SMiao Xie struct page *page; 2031cc9a6c87SMel Gorman unsigned int cpuset_mems_cookie; 20321da177e4SLinus Torvalds 2033cc9a6c87SMel Gorman retry_cpuset: 2034cc9a6c87SMel Gorman pol = get_vma_policy(current, vma, addr); 2035d26914d1SMel Gorman cpuset_mems_cookie = read_mems_allowed_begin(); 2036cc9a6c87SMel Gorman 203745c4745aSLee Schermerhorn if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 20381da177e4SLinus Torvalds unsigned nid; 20395da7ca86SChristoph Lameter 20408eac563cSAndi Kleen nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 204152cd3b07SLee Schermerhorn mpol_cond_put(pol); 20420bbbc0b3SAndrea Arcangeli page = alloc_page_interleave(gfp, order, nid); 2043d26914d1SMel Gorman if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2044cc9a6c87SMel Gorman goto retry_cpuset; 2045cc9a6c87SMel Gorman 2046c0ff7453SMiao Xie return page; 20471da177e4SLinus Torvalds } 2048212a0a6fSDavid Rientjes page = __alloc_pages_nodemask(gfp, order, 2049212a0a6fSDavid Rientjes policy_zonelist(gfp, pol, node), 20500bbbc0b3SAndrea Arcangeli policy_nodemask(gfp, pol)); 2051212a0a6fSDavid Rientjes if (unlikely(mpol_needs_cond_ref(pol))) 2052212a0a6fSDavid Rientjes __mpol_put(pol); 2053d26914d1SMel Gorman if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2054cc9a6c87SMel Gorman goto retry_cpuset; 2055c0ff7453SMiao Xie return page; 20561da177e4SLinus Torvalds } 20571da177e4SLinus Torvalds 20581da177e4SLinus Torvalds /** 20591da177e4SLinus Torvalds * alloc_pages_current - Allocate pages. 20601da177e4SLinus Torvalds * 20611da177e4SLinus Torvalds * @gfp: 20621da177e4SLinus Torvalds * %GFP_USER user allocation, 20631da177e4SLinus Torvalds * %GFP_KERNEL kernel allocation, 20641da177e4SLinus Torvalds * %GFP_HIGHMEM highmem allocation, 20651da177e4SLinus Torvalds * %GFP_FS don't call back into a file system. 20661da177e4SLinus Torvalds * %GFP_ATOMIC don't sleep. 20671da177e4SLinus Torvalds * @order: Power of two of allocation size in pages. 0 is a single page. 20681da177e4SLinus Torvalds * 20691da177e4SLinus Torvalds * Allocate a page from the kernel page pool. When not in 20701da177e4SLinus Torvalds * interrupt context and apply the current process NUMA policy. 20711da177e4SLinus Torvalds * Returns NULL when no page can be allocated. 20721da177e4SLinus Torvalds * 2073cf2a473cSPaul Jackson * Don't call cpuset_update_task_memory_state() unless 20741da177e4SLinus Torvalds * 1) it's ok to take cpuset_sem (can WAIT), and 20751da177e4SLinus Torvalds * 2) allocating for current task (not interrupt). 
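 *
 * For example, on a NUMA kernel a plain alloc_pages(GFP_KERNEL, 0) call
 * resolves to alloc_pages_current() and returns a single page placed
 * according to current->mempolicy, or according to default_policy when
 * called from interrupt context or with __GFP_THISNODE.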
20761da177e4SLinus Torvalds */ 2077dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order) 20781da177e4SLinus Torvalds { 20795606e387SMel Gorman struct mempolicy *pol = get_task_policy(current); 2080c0ff7453SMiao Xie struct page *page; 2081cc9a6c87SMel Gorman unsigned int cpuset_mems_cookie; 20821da177e4SLinus Torvalds 20839b819d20SChristoph Lameter if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 20841da177e4SLinus Torvalds pol = &default_policy; 208552cd3b07SLee Schermerhorn 2086cc9a6c87SMel Gorman retry_cpuset: 2087d26914d1SMel Gorman cpuset_mems_cookie = read_mems_allowed_begin(); 2088cc9a6c87SMel Gorman 208952cd3b07SLee Schermerhorn /* 209052cd3b07SLee Schermerhorn * No reference counting needed for current->mempolicy 209152cd3b07SLee Schermerhorn * nor system default_policy 209252cd3b07SLee Schermerhorn */ 209345c4745aSLee Schermerhorn if (pol->mode == MPOL_INTERLEAVE) 2094c0ff7453SMiao Xie page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 2095c0ff7453SMiao Xie else 2096c0ff7453SMiao Xie page = __alloc_pages_nodemask(gfp, order, 20975c4b4be3SAndi Kleen policy_zonelist(gfp, pol, numa_node_id()), 20985c4b4be3SAndi Kleen policy_nodemask(gfp, pol)); 2099cc9a6c87SMel Gorman 2100d26914d1SMel Gorman if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2101cc9a6c87SMel Gorman goto retry_cpuset; 2102cc9a6c87SMel Gorman 2103c0ff7453SMiao Xie return page; 21041da177e4SLinus Torvalds } 21051da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current); 21061da177e4SLinus Torvalds 2107ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2108ef0855d3SOleg Nesterov { 2109ef0855d3SOleg Nesterov struct mempolicy *pol = mpol_dup(vma_policy(src)); 2110ef0855d3SOleg Nesterov 2111ef0855d3SOleg Nesterov if (IS_ERR(pol)) 2112ef0855d3SOleg Nesterov return PTR_ERR(pol); 2113ef0855d3SOleg Nesterov dst->vm_policy = pol; 2114ef0855d3SOleg Nesterov return 0; 2115ef0855d3SOleg Nesterov } 2116ef0855d3SOleg Nesterov 21174225399aSPaul Jackson /* 2118846a16bfSLee Schermerhorn * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 21194225399aSPaul Jackson * rebinds the mempolicy its copying by calling mpol_rebind_policy() 21204225399aSPaul Jackson * with the mems_allowed returned by cpuset_mems_allowed(). This 21214225399aSPaul Jackson * keeps mempolicies cpuset relative after its cpuset moves. See 21224225399aSPaul Jackson * further kernel/cpuset.c update_nodemask(). 2123708c1bbcSMiao Xie * 2124708c1bbcSMiao Xie * current's mempolicy may be rebinded by the other task(the task that changes 2125708c1bbcSMiao Xie * cpuset's mems), so we needn't do rebind work for current task. 21264225399aSPaul Jackson */ 21274225399aSPaul Jackson 2128846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */ 2129846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old) 21301da177e4SLinus Torvalds { 21311da177e4SLinus Torvalds struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 21321da177e4SLinus Torvalds 21331da177e4SLinus Torvalds if (!new) 21341da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 2135708c1bbcSMiao Xie 2136708c1bbcSMiao Xie /* task's mempolicy is protected by alloc_lock */ 2137708c1bbcSMiao Xie if (old == current->mempolicy) { 2138708c1bbcSMiao Xie task_lock(current); 2139708c1bbcSMiao Xie *new = *old; 2140708c1bbcSMiao Xie task_unlock(current); 2141708c1bbcSMiao Xie } else 2142708c1bbcSMiao Xie *new = *old; 2143708c1bbcSMiao Xie 214499ee4ca7SPaul E. 
McKenney rcu_read_lock(); 21454225399aSPaul Jackson if (current_cpuset_is_being_rebound()) { 21464225399aSPaul Jackson nodemask_t mems = cpuset_mems_allowed(current); 2147708c1bbcSMiao Xie if (new->flags & MPOL_F_REBINDING) 2148708c1bbcSMiao Xie mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); 2149708c1bbcSMiao Xie else 2150708c1bbcSMiao Xie mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 21514225399aSPaul Jackson } 215299ee4ca7SPaul E. McKenney rcu_read_unlock(); 21531da177e4SLinus Torvalds atomic_set(&new->refcnt, 1); 21541da177e4SLinus Torvalds return new; 21551da177e4SLinus Torvalds } 21561da177e4SLinus Torvalds 21571da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */ 2158fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 21591da177e4SLinus Torvalds { 21601da177e4SLinus Torvalds if (!a || !b) 2161fcfb4dccSKOSAKI Motohiro return false; 216245c4745aSLee Schermerhorn if (a->mode != b->mode) 2163fcfb4dccSKOSAKI Motohiro return false; 216419800502SBob Liu if (a->flags != b->flags) 2165fcfb4dccSKOSAKI Motohiro return false; 216619800502SBob Liu if (mpol_store_user_nodemask(a)) 216719800502SBob Liu if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2168fcfb4dccSKOSAKI Motohiro return false; 216919800502SBob Liu 217045c4745aSLee Schermerhorn switch (a->mode) { 217119770b32SMel Gorman case MPOL_BIND: 217219770b32SMel Gorman /* Fall through */ 21731da177e4SLinus Torvalds case MPOL_INTERLEAVE: 2174fcfb4dccSKOSAKI Motohiro return !!nodes_equal(a->v.nodes, b->v.nodes); 21751da177e4SLinus Torvalds case MPOL_PREFERRED: 217675719661SNamhyung Kim return a->v.preferred_node == b->v.preferred_node; 21771da177e4SLinus Torvalds default: 21781da177e4SLinus Torvalds BUG(); 2179fcfb4dccSKOSAKI Motohiro return false; 21801da177e4SLinus Torvalds } 21811da177e4SLinus Torvalds } 21821da177e4SLinus Torvalds 21831da177e4SLinus Torvalds /* 21841da177e4SLinus Torvalds * Shared memory backing store policy support. 21851da177e4SLinus Torvalds * 21861da177e4SLinus Torvalds * Remember policies even when nobody has shared memory mapped. 21871da177e4SLinus Torvalds * The policies are kept in Red-Black tree linked from the inode. 21881da177e4SLinus Torvalds * They are protected by the sp->lock spinlock, which should be held 21891da177e4SLinus Torvalds * for any accesses to the tree. 
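 *
 * Each sp_node covers a [start, end) range of page indices into the
 * backing object, so e.g. two mbind() calls on different ranges of the
 * same tmpfs file leave two separate tree nodes, each carrying its own
 * mempolicy.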
21901da177e4SLinus Torvalds */ 21911da177e4SLinus Torvalds 21921da177e4SLinus Torvalds /* lookup first element intersecting start-end */ 219342288fe3SMel Gorman /* Caller holds sp->lock */ 21941da177e4SLinus Torvalds static struct sp_node * 21951da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 21961da177e4SLinus Torvalds { 21971da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node; 21981da177e4SLinus Torvalds 21991da177e4SLinus Torvalds while (n) { 22001da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd); 22011da177e4SLinus Torvalds 22021da177e4SLinus Torvalds if (start >= p->end) 22031da177e4SLinus Torvalds n = n->rb_right; 22041da177e4SLinus Torvalds else if (end <= p->start) 22051da177e4SLinus Torvalds n = n->rb_left; 22061da177e4SLinus Torvalds else 22071da177e4SLinus Torvalds break; 22081da177e4SLinus Torvalds } 22091da177e4SLinus Torvalds if (!n) 22101da177e4SLinus Torvalds return NULL; 22111da177e4SLinus Torvalds for (;;) { 22121da177e4SLinus Torvalds struct sp_node *w = NULL; 22131da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n); 22141da177e4SLinus Torvalds if (!prev) 22151da177e4SLinus Torvalds break; 22161da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd); 22171da177e4SLinus Torvalds if (w->end <= start) 22181da177e4SLinus Torvalds break; 22191da177e4SLinus Torvalds n = prev; 22201da177e4SLinus Torvalds } 22211da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd); 22221da177e4SLinus Torvalds } 22231da177e4SLinus Torvalds 22241da177e4SLinus Torvalds /* Insert a new shared policy into the list. */ 22251da177e4SLinus Torvalds /* Caller holds sp->lock */ 22261da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new) 22271da177e4SLinus Torvalds { 22281da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node; 22291da177e4SLinus Torvalds struct rb_node *parent = NULL; 22301da177e4SLinus Torvalds struct sp_node *nd; 22311da177e4SLinus Torvalds 22321da177e4SLinus Torvalds while (*p) { 22331da177e4SLinus Torvalds parent = *p; 22341da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd); 22351da177e4SLinus Torvalds if (new->start < nd->start) 22361da177e4SLinus Torvalds p = &(*p)->rb_left; 22371da177e4SLinus Torvalds else if (new->end > nd->end) 22381da177e4SLinus Torvalds p = &(*p)->rb_right; 22391da177e4SLinus Torvalds else 22401da177e4SLinus Torvalds BUG(); 22411da177e4SLinus Torvalds } 22421da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p); 22431da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root); 2244140d5a49SPaul Mundt pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 224545c4745aSLee Schermerhorn new->policy ? 
new->policy->mode : 0); 22461da177e4SLinus Torvalds } 22471da177e4SLinus Torvalds 22481da177e4SLinus Torvalds /* Find shared policy intersecting idx */ 22491da177e4SLinus Torvalds struct mempolicy * 22501da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 22511da177e4SLinus Torvalds { 22521da177e4SLinus Torvalds struct mempolicy *pol = NULL; 22531da177e4SLinus Torvalds struct sp_node *sn; 22541da177e4SLinus Torvalds 22551da177e4SLinus Torvalds if (!sp->root.rb_node) 22561da177e4SLinus Torvalds return NULL; 225742288fe3SMel Gorman spin_lock(&sp->lock); 22581da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1); 22591da177e4SLinus Torvalds if (sn) { 22601da177e4SLinus Torvalds mpol_get(sn->policy); 22611da177e4SLinus Torvalds pol = sn->policy; 22621da177e4SLinus Torvalds } 226342288fe3SMel Gorman spin_unlock(&sp->lock); 22641da177e4SLinus Torvalds return pol; 22651da177e4SLinus Torvalds } 22661da177e4SLinus Torvalds 226763f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n) 226863f74ca2SKOSAKI Motohiro { 226963f74ca2SKOSAKI Motohiro mpol_put(n->policy); 227063f74ca2SKOSAKI Motohiro kmem_cache_free(sn_cache, n); 227163f74ca2SKOSAKI Motohiro } 227263f74ca2SKOSAKI Motohiro 2273771fb4d8SLee Schermerhorn /** 2274771fb4d8SLee Schermerhorn * mpol_misplaced - check whether current page node is valid in policy 2275771fb4d8SLee Schermerhorn * 2276b46e14acSFabian Frederick * @page: page to be checked 2277b46e14acSFabian Frederick * @vma: vm area where page mapped 2278b46e14acSFabian Frederick * @addr: virtual address where page mapped 2279771fb4d8SLee Schermerhorn * 2280771fb4d8SLee Schermerhorn * Lookup current policy node id for vma,addr and "compare to" page's 2281771fb4d8SLee Schermerhorn * node id. 2282771fb4d8SLee Schermerhorn * 2283771fb4d8SLee Schermerhorn * Returns: 2284771fb4d8SLee Schermerhorn * -1 - not misplaced, page is in the right node 2285771fb4d8SLee Schermerhorn * node - node id where the page should be 2286771fb4d8SLee Schermerhorn * 2287771fb4d8SLee Schermerhorn * Policy determination "mimics" alloc_page_vma(). 2288771fb4d8SLee Schermerhorn * Called from fault path where we know the vma and faulting address. 
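 *
 * Illustrative example: under an MPOL_BIND policy over nodes 0-1, a page
 * currently on node 2 yields the nearest allowed node, while a page that
 * is already on node 0 or 1 is reported as not misplaced (-1).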
2289771fb4d8SLee Schermerhorn */ 2290771fb4d8SLee Schermerhorn int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) 2291771fb4d8SLee Schermerhorn { 2292771fb4d8SLee Schermerhorn struct mempolicy *pol; 2293771fb4d8SLee Schermerhorn struct zone *zone; 2294771fb4d8SLee Schermerhorn int curnid = page_to_nid(page); 2295771fb4d8SLee Schermerhorn unsigned long pgoff; 229690572890SPeter Zijlstra int thiscpu = raw_smp_processor_id(); 229790572890SPeter Zijlstra int thisnid = cpu_to_node(thiscpu); 2298771fb4d8SLee Schermerhorn int polnid = -1; 2299771fb4d8SLee Schermerhorn int ret = -1; 2300771fb4d8SLee Schermerhorn 2301771fb4d8SLee Schermerhorn BUG_ON(!vma); 2302771fb4d8SLee Schermerhorn 2303771fb4d8SLee Schermerhorn pol = get_vma_policy(current, vma, addr); 2304771fb4d8SLee Schermerhorn if (!(pol->flags & MPOL_F_MOF)) 2305771fb4d8SLee Schermerhorn goto out; 2306771fb4d8SLee Schermerhorn 2307771fb4d8SLee Schermerhorn switch (pol->mode) { 2308771fb4d8SLee Schermerhorn case MPOL_INTERLEAVE: 2309771fb4d8SLee Schermerhorn BUG_ON(addr >= vma->vm_end); 2310771fb4d8SLee Schermerhorn BUG_ON(addr < vma->vm_start); 2311771fb4d8SLee Schermerhorn 2312771fb4d8SLee Schermerhorn pgoff = vma->vm_pgoff; 2313771fb4d8SLee Schermerhorn pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; 2314771fb4d8SLee Schermerhorn polnid = offset_il_node(pol, vma, pgoff); 2315771fb4d8SLee Schermerhorn break; 2316771fb4d8SLee Schermerhorn 2317771fb4d8SLee Schermerhorn case MPOL_PREFERRED: 2318771fb4d8SLee Schermerhorn if (pol->flags & MPOL_F_LOCAL) 2319771fb4d8SLee Schermerhorn polnid = numa_node_id(); 2320771fb4d8SLee Schermerhorn else 2321771fb4d8SLee Schermerhorn polnid = pol->v.preferred_node; 2322771fb4d8SLee Schermerhorn break; 2323771fb4d8SLee Schermerhorn 2324771fb4d8SLee Schermerhorn case MPOL_BIND: 2325771fb4d8SLee Schermerhorn /* 2326771fb4d8SLee Schermerhorn * allows binding to multiple nodes. 2327771fb4d8SLee Schermerhorn * use current page if in policy nodemask, 2328771fb4d8SLee Schermerhorn * else select nearest allowed node, if any. 2329771fb4d8SLee Schermerhorn * If no allowed nodes, use current [!misplaced]. 
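		 * ("nearest" here means the first allowed node in the local
		 * node's zonelist order, see first_zones_zonelist() below)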
2330771fb4d8SLee Schermerhorn 		 */
2331771fb4d8SLee Schermerhorn 		if (node_isset(curnid, pol->v.nodes))
2332771fb4d8SLee Schermerhorn 			goto out;
2333771fb4d8SLee Schermerhorn 		(void)first_zones_zonelist(
2334771fb4d8SLee Schermerhorn 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2335771fb4d8SLee Schermerhorn 				gfp_zone(GFP_HIGHUSER),
2336771fb4d8SLee Schermerhorn 				&pol->v.nodes, &zone);
2337771fb4d8SLee Schermerhorn 		polnid = zone->node;
2338771fb4d8SLee Schermerhorn 		break;
2339771fb4d8SLee Schermerhorn 
2340771fb4d8SLee Schermerhorn 	default:
2341771fb4d8SLee Schermerhorn 		BUG();
2342771fb4d8SLee Schermerhorn 	}
23435606e387SMel Gorman 
23445606e387SMel Gorman 	/* Migrate the page towards the node whose CPU is referencing it */
2345e42c8ff2SMel Gorman 	if (pol->flags & MPOL_F_MORON) {
234690572890SPeter Zijlstra 		polnid = thisnid;
23475606e387SMel Gorman 
234810f39042SRik van Riel 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2349de1c9ce6SRik van Riel 			goto out;
2350de1c9ce6SRik van Riel 	}
2351e42c8ff2SMel Gorman 
2352771fb4d8SLee Schermerhorn 	if (curnid != polnid)
2353771fb4d8SLee Schermerhorn 		ret = polnid;
2354771fb4d8SLee Schermerhorn out:
2355771fb4d8SLee Schermerhorn 	mpol_cond_put(pol);
2356771fb4d8SLee Schermerhorn 
2357771fb4d8SLee Schermerhorn 	return ret;
2358771fb4d8SLee Schermerhorn }
2359771fb4d8SLee Schermerhorn 
23601da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
23611da177e4SLinus Torvalds {
2362140d5a49SPaul Mundt 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
23631da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
236463f74ca2SKOSAKI Motohiro 	sp_free(n);
23651da177e4SLinus Torvalds }
23661da177e4SLinus Torvalds 
236742288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
236842288fe3SMel Gorman 			unsigned long end, struct mempolicy *pol)
236942288fe3SMel Gorman {
237042288fe3SMel Gorman 	node->start = start;
237142288fe3SMel Gorman 	node->end = end;
237242288fe3SMel Gorman 	node->policy = pol;
237342288fe3SMel Gorman }
237442288fe3SMel Gorman 
2375dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2376dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
23771da177e4SLinus Torvalds {
2378869833f2SKOSAKI Motohiro 	struct sp_node *n;
2379869833f2SKOSAKI Motohiro 	struct mempolicy *newpol;
23801da177e4SLinus Torvalds 
2381869833f2SKOSAKI Motohiro 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
23821da177e4SLinus Torvalds 	if (!n)
23831da177e4SLinus Torvalds 		return NULL;
2384869833f2SKOSAKI Motohiro 
2385869833f2SKOSAKI Motohiro 	newpol = mpol_dup(pol);
2386869833f2SKOSAKI Motohiro 	if (IS_ERR(newpol)) {
2387869833f2SKOSAKI Motohiro 		kmem_cache_free(sn_cache, n);
2388869833f2SKOSAKI Motohiro 		return NULL;
2389869833f2SKOSAKI Motohiro 	}
2390869833f2SKOSAKI Motohiro 	newpol->flags |= MPOL_F_SHARED;
239142288fe3SMel Gorman 	sp_node_init(n, start, end, newpol);
2392869833f2SKOSAKI Motohiro 
23931da177e4SLinus Torvalds 	return n;
23941da177e4SLinus Torvalds }
23951da177e4SLinus Torvalds 
23961da177e4SLinus Torvalds /* Replace a policy range.
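 *
 * An old range that completely spans the new [start, end) has to be split
 * in two, which requires a fresh sp_node plus a copy of its mempolicy.
 * Since sp->lock is a spinlock, those objects are allocated with the lock
 * dropped (alloc_new below) and the walk is then restarted.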
*/ 23971da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 23981da177e4SLinus Torvalds unsigned long end, struct sp_node *new) 23991da177e4SLinus Torvalds { 2400b22d127aSMel Gorman struct sp_node *n; 240142288fe3SMel Gorman struct sp_node *n_new = NULL; 240242288fe3SMel Gorman struct mempolicy *mpol_new = NULL; 2403b22d127aSMel Gorman int ret = 0; 24041da177e4SLinus Torvalds 240542288fe3SMel Gorman restart: 240642288fe3SMel Gorman spin_lock(&sp->lock); 24071da177e4SLinus Torvalds n = sp_lookup(sp, start, end); 24081da177e4SLinus Torvalds /* Take care of old policies in the same range. */ 24091da177e4SLinus Torvalds while (n && n->start < end) { 24101da177e4SLinus Torvalds struct rb_node *next = rb_next(&n->nd); 24111da177e4SLinus Torvalds if (n->start >= start) { 24121da177e4SLinus Torvalds if (n->end <= end) 24131da177e4SLinus Torvalds sp_delete(sp, n); 24141da177e4SLinus Torvalds else 24151da177e4SLinus Torvalds n->start = end; 24161da177e4SLinus Torvalds } else { 24171da177e4SLinus Torvalds /* Old policy spanning whole new range. */ 24181da177e4SLinus Torvalds if (n->end > end) { 241942288fe3SMel Gorman if (!n_new) 242042288fe3SMel Gorman goto alloc_new; 242142288fe3SMel Gorman 242242288fe3SMel Gorman *mpol_new = *n->policy; 242342288fe3SMel Gorman atomic_set(&mpol_new->refcnt, 1); 24247880639cSKOSAKI Motohiro sp_node_init(n_new, end, n->end, mpol_new); 24251da177e4SLinus Torvalds n->end = start; 24265ca39575SHillf Danton sp_insert(sp, n_new); 242742288fe3SMel Gorman n_new = NULL; 242842288fe3SMel Gorman mpol_new = NULL; 24291da177e4SLinus Torvalds break; 24301da177e4SLinus Torvalds } else 24311da177e4SLinus Torvalds n->end = start; 24321da177e4SLinus Torvalds } 24331da177e4SLinus Torvalds if (!next) 24341da177e4SLinus Torvalds break; 24351da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 24361da177e4SLinus Torvalds } 24371da177e4SLinus Torvalds if (new) 24381da177e4SLinus Torvalds sp_insert(sp, new); 243942288fe3SMel Gorman spin_unlock(&sp->lock); 244042288fe3SMel Gorman ret = 0; 244142288fe3SMel Gorman 244242288fe3SMel Gorman err_out: 244342288fe3SMel Gorman if (mpol_new) 244442288fe3SMel Gorman mpol_put(mpol_new); 244542288fe3SMel Gorman if (n_new) 244642288fe3SMel Gorman kmem_cache_free(sn_cache, n_new); 244742288fe3SMel Gorman 2448b22d127aSMel Gorman return ret; 244942288fe3SMel Gorman 245042288fe3SMel Gorman alloc_new: 245142288fe3SMel Gorman spin_unlock(&sp->lock); 245242288fe3SMel Gorman ret = -ENOMEM; 245342288fe3SMel Gorman n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 245442288fe3SMel Gorman if (!n_new) 245542288fe3SMel Gorman goto err_out; 245642288fe3SMel Gorman mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 245742288fe3SMel Gorman if (!mpol_new) 245842288fe3SMel Gorman goto err_out; 245942288fe3SMel Gorman goto restart; 24601da177e4SLinus Torvalds } 24611da177e4SLinus Torvalds 246271fe804bSLee Schermerhorn /** 246371fe804bSLee Schermerhorn * mpol_shared_policy_init - initialize shared policy for inode 246471fe804bSLee Schermerhorn * @sp: pointer to inode shared policy 246571fe804bSLee Schermerhorn * @mpol: struct mempolicy to install 246671fe804bSLee Schermerhorn * 246771fe804bSLee Schermerhorn * Install non-NULL @mpol in inode's shared policy rb-tree. 246871fe804bSLee Schermerhorn * On entry, the current task has a reference on a non-NULL @mpol. 246971fe804bSLee Schermerhorn * This must be released on exit. 
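 *
 * The incoming @mpol is typically a tmpfs superblock policy (for example
 * one parsed from an "mpol=" mount option); it is re-contextualized
 * against the caller's cpuset and installed to cover the whole file.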
24704bfc4495SKAMEZAWA Hiroyuki * This is called at get_inode() calls and we can use GFP_KERNEL. 247171fe804bSLee Schermerhorn */ 247271fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 24737339ff83SRobin Holt { 247458568d2aSMiao Xie int ret; 247558568d2aSMiao Xie 247671fe804bSLee Schermerhorn sp->root = RB_ROOT; /* empty tree == default mempolicy */ 247742288fe3SMel Gorman spin_lock_init(&sp->lock); 24787339ff83SRobin Holt 247971fe804bSLee Schermerhorn if (mpol) { 24807339ff83SRobin Holt struct vm_area_struct pvma; 248171fe804bSLee Schermerhorn struct mempolicy *new; 24824bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch); 24837339ff83SRobin Holt 24844bfc4495SKAMEZAWA Hiroyuki if (!scratch) 24855c0c1654SLee Schermerhorn goto put_mpol; 248671fe804bSLee Schermerhorn /* contextualize the tmpfs mount point mempolicy */ 248771fe804bSLee Schermerhorn new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 248815d77835SLee Schermerhorn if (IS_ERR(new)) 24890cae3457SDan Carpenter goto free_scratch; /* no valid nodemask intersection */ 249058568d2aSMiao Xie 249158568d2aSMiao Xie task_lock(current); 24924bfc4495SKAMEZAWA Hiroyuki ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 249358568d2aSMiao Xie task_unlock(current); 249415d77835SLee Schermerhorn if (ret) 24955c0c1654SLee Schermerhorn goto put_new; 249671fe804bSLee Schermerhorn 249771fe804bSLee Schermerhorn /* Create pseudo-vma that contains just the policy */ 24987339ff83SRobin Holt memset(&pvma, 0, sizeof(struct vm_area_struct)); 249971fe804bSLee Schermerhorn pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 250071fe804bSLee Schermerhorn mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 250115d77835SLee Schermerhorn 25025c0c1654SLee Schermerhorn put_new: 250371fe804bSLee Schermerhorn mpol_put(new); /* drop initial ref */ 25040cae3457SDan Carpenter free_scratch: 25054bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch); 25065c0c1654SLee Schermerhorn put_mpol: 25075c0c1654SLee Schermerhorn mpol_put(mpol); /* drop our incoming ref on sb mpol */ 25087339ff83SRobin Holt } 25097339ff83SRobin Holt } 25107339ff83SRobin Holt 25111da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info, 25121da177e4SLinus Torvalds struct vm_area_struct *vma, struct mempolicy *npol) 25131da177e4SLinus Torvalds { 25141da177e4SLinus Torvalds int err; 25151da177e4SLinus Torvalds struct sp_node *new = NULL; 25161da177e4SLinus Torvalds unsigned long sz = vma_pages(vma); 25171da177e4SLinus Torvalds 2518028fec41SDavid Rientjes pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", 25191da177e4SLinus Torvalds vma->vm_pgoff, 252045c4745aSLee Schermerhorn sz, npol ? npol->mode : -1, 2521028fec41SDavid Rientjes npol ? npol->flags : -1, 252200ef2d2fSDavid Rientjes npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); 25231da177e4SLinus Torvalds 25241da177e4SLinus Torvalds if (npol) { 25251da177e4SLinus Torvalds new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 25261da177e4SLinus Torvalds if (!new) 25271da177e4SLinus Torvalds return -ENOMEM; 25281da177e4SLinus Torvalds } 25291da177e4SLinus Torvalds err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 25301da177e4SLinus Torvalds if (err && new) 253163f74ca2SKOSAKI Motohiro sp_free(new); 25321da177e4SLinus Torvalds return err; 25331da177e4SLinus Torvalds } 25341da177e4SLinus Torvalds 25351da177e4SLinus Torvalds /* Free a backing policy store on inode delete. 
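 * Walks the whole rb-tree under sp->lock and drops each node's policy
 * reference via sp_delete()/sp_free().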
 */
25361da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
25371da177e4SLinus Torvalds {
25381da177e4SLinus Torvalds 	struct sp_node *n;
25391da177e4SLinus Torvalds 	struct rb_node *next;
25401da177e4SLinus Torvalds 
25411da177e4SLinus Torvalds 	if (!p->root.rb_node)
25421da177e4SLinus Torvalds 		return;
254342288fe3SMel Gorman 	spin_lock(&p->lock);
25441da177e4SLinus Torvalds 	next = rb_first(&p->root);
25451da177e4SLinus Torvalds 	while (next) {
25461da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
25471da177e4SLinus Torvalds 		next = rb_next(&n->nd);
254863f74ca2SKOSAKI Motohiro 		sp_delete(p, n);
25491da177e4SLinus Torvalds 	}
255042288fe3SMel Gorman 	spin_unlock(&p->lock);
25511da177e4SLinus Torvalds }
25521da177e4SLinus Torvalds 
25531a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
2554c297663cSMel Gorman static int __initdata numabalancing_override;
25551a687c2eSMel Gorman 
25561a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
25571a687c2eSMel Gorman {
25581a687c2eSMel Gorman 	bool numabalancing_default = false;
25591a687c2eSMel Gorman 
25601a687c2eSMel Gorman 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
25611a687c2eSMel Gorman 		numabalancing_default = true;
25621a687c2eSMel Gorman 
2563c297663cSMel Gorman 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2564c297663cSMel Gorman 	if (numabalancing_override)
2565c297663cSMel Gorman 		set_numabalancing_state(numabalancing_override == 1);
2566c297663cSMel Gorman 
25671a687c2eSMel Gorman 	if (nr_node_ids > 1 && !numabalancing_override) {
25684a404beaSAndrew Morton 		pr_info("%s automatic NUMA balancing. "
2569c297663cSMel Gorman 			"Configure with numa_balancing= or the "
2570c297663cSMel Gorman 			"kernel.numa_balancing sysctl\n",
2571c297663cSMel Gorman 			numabalancing_default ?
"Enabling" : "Disabling"); 25721a687c2eSMel Gorman set_numabalancing_state(numabalancing_default); 25731a687c2eSMel Gorman } 25741a687c2eSMel Gorman } 25751a687c2eSMel Gorman 25761a687c2eSMel Gorman static int __init setup_numabalancing(char *str) 25771a687c2eSMel Gorman { 25781a687c2eSMel Gorman int ret = 0; 25791a687c2eSMel Gorman if (!str) 25801a687c2eSMel Gorman goto out; 25811a687c2eSMel Gorman 25821a687c2eSMel Gorman if (!strcmp(str, "enable")) { 2583c297663cSMel Gorman numabalancing_override = 1; 25841a687c2eSMel Gorman ret = 1; 25851a687c2eSMel Gorman } else if (!strcmp(str, "disable")) { 2586c297663cSMel Gorman numabalancing_override = -1; 25871a687c2eSMel Gorman ret = 1; 25881a687c2eSMel Gorman } 25891a687c2eSMel Gorman out: 25901a687c2eSMel Gorman if (!ret) 25914a404beaSAndrew Morton pr_warn("Unable to parse numa_balancing=\n"); 25921a687c2eSMel Gorman 25931a687c2eSMel Gorman return ret; 25941a687c2eSMel Gorman } 25951a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing); 25961a687c2eSMel Gorman #else 25971a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void) 25981a687c2eSMel Gorman { 25991a687c2eSMel Gorman } 26001a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */ 26011a687c2eSMel Gorman 26021da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */ 26031da177e4SLinus Torvalds void __init numa_policy_init(void) 26041da177e4SLinus Torvalds { 2605b71636e2SPaul Mundt nodemask_t interleave_nodes; 2606b71636e2SPaul Mundt unsigned long largest = 0; 2607b71636e2SPaul Mundt int nid, prefer = 0; 2608b71636e2SPaul Mundt 26091da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy", 26101da177e4SLinus Torvalds sizeof(struct mempolicy), 261120c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 26121da177e4SLinus Torvalds 26131da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node", 26141da177e4SLinus Torvalds sizeof(struct sp_node), 261520c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 26161da177e4SLinus Torvalds 26175606e387SMel Gorman for_each_node(nid) { 26185606e387SMel Gorman preferred_node_policy[nid] = (struct mempolicy) { 26195606e387SMel Gorman .refcnt = ATOMIC_INIT(1), 26205606e387SMel Gorman .mode = MPOL_PREFERRED, 26215606e387SMel Gorman .flags = MPOL_F_MOF | MPOL_F_MORON, 26225606e387SMel Gorman .v = { .preferred_node = nid, }, 26235606e387SMel Gorman }; 26245606e387SMel Gorman } 26255606e387SMel Gorman 2626b71636e2SPaul Mundt /* 2627b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only 2628b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB), or 2629b71636e2SPaul Mundt * fall back to the largest node if they're all smaller. 2630b71636e2SPaul Mundt */ 2631b71636e2SPaul Mundt nodes_clear(interleave_nodes); 263201f13bd6SLai Jiangshan for_each_node_state(nid, N_MEMORY) { 2633b71636e2SPaul Mundt unsigned long total_pages = node_present_pages(nid); 26341da177e4SLinus Torvalds 2635b71636e2SPaul Mundt /* Preserve the largest node */ 2636b71636e2SPaul Mundt if (largest < total_pages) { 2637b71636e2SPaul Mundt largest = total_pages; 2638b71636e2SPaul Mundt prefer = nid; 2639b71636e2SPaul Mundt } 2640b71636e2SPaul Mundt 2641b71636e2SPaul Mundt /* Interleave this node? 
*/ 2642b71636e2SPaul Mundt if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 2643b71636e2SPaul Mundt node_set(nid, interleave_nodes); 2644b71636e2SPaul Mundt } 2645b71636e2SPaul Mundt 2646b71636e2SPaul Mundt /* All too small, use the largest */ 2647b71636e2SPaul Mundt if (unlikely(nodes_empty(interleave_nodes))) 2648b71636e2SPaul Mundt node_set(prefer, interleave_nodes); 2649b71636e2SPaul Mundt 2650028fec41SDavid Rientjes if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2651*b1de0d13SMitchel Humpherys pr_err("%s: interleaving failed\n", __func__); 26521a687c2eSMel Gorman 26531a687c2eSMel Gorman check_numabalancing_enable(); 26541da177e4SLinus Torvalds } 26551da177e4SLinus Torvalds 26568bccd85fSChristoph Lameter /* Reset policy of current process to default */ 26571da177e4SLinus Torvalds void numa_default_policy(void) 26581da177e4SLinus Torvalds { 2659028fec41SDavid Rientjes do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 26601da177e4SLinus Torvalds } 266168860ec1SPaul Jackson 26624225399aSPaul Jackson /* 2663095f1fc4SLee Schermerhorn * Parse and format mempolicy from/to strings 2664095f1fc4SLee Schermerhorn */ 2665095f1fc4SLee Schermerhorn 2666095f1fc4SLee Schermerhorn /* 2667f2a07f40SHugh Dickins * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. 26681a75a6c8SChristoph Lameter */ 2669345ace9cSLee Schermerhorn static const char * const policy_modes[] = 2670345ace9cSLee Schermerhorn { 2671345ace9cSLee Schermerhorn [MPOL_DEFAULT] = "default", 2672345ace9cSLee Schermerhorn [MPOL_PREFERRED] = "prefer", 2673345ace9cSLee Schermerhorn [MPOL_BIND] = "bind", 2674345ace9cSLee Schermerhorn [MPOL_INTERLEAVE] = "interleave", 2675d3a71033SLee Schermerhorn [MPOL_LOCAL] = "local", 2676345ace9cSLee Schermerhorn }; 26771a75a6c8SChristoph Lameter 2678095f1fc4SLee Schermerhorn 2679095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS 2680095f1fc4SLee Schermerhorn /** 2681f2a07f40SHugh Dickins * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 2682095f1fc4SLee Schermerhorn * @str: string containing mempolicy to parse 268371fe804bSLee Schermerhorn * @mpol: pointer to struct mempolicy pointer, returned on success. 
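 *
 * Illustrative examples of accepted strings (using the format below):
 * "interleave:0-3", "prefer=static:2", "bind=relative:0,1" and "local".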
2684095f1fc4SLee Schermerhorn * 2685095f1fc4SLee Schermerhorn * Format of input: 2686095f1fc4SLee Schermerhorn * <mode>[=<flags>][:<nodelist>] 2687095f1fc4SLee Schermerhorn * 268871fe804bSLee Schermerhorn * On success, returns 0, else 1 2689095f1fc4SLee Schermerhorn */ 2690a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol) 2691095f1fc4SLee Schermerhorn { 269271fe804bSLee Schermerhorn struct mempolicy *new = NULL; 2693b4652e84SLee Schermerhorn unsigned short mode; 2694f2a07f40SHugh Dickins unsigned short mode_flags; 269571fe804bSLee Schermerhorn nodemask_t nodes; 2696095f1fc4SLee Schermerhorn char *nodelist = strchr(str, ':'); 2697095f1fc4SLee Schermerhorn char *flags = strchr(str, '='); 2698095f1fc4SLee Schermerhorn int err = 1; 2699095f1fc4SLee Schermerhorn 2700095f1fc4SLee Schermerhorn if (nodelist) { 2701095f1fc4SLee Schermerhorn /* NUL-terminate mode or flags string */ 2702095f1fc4SLee Schermerhorn *nodelist++ = '\0'; 270371fe804bSLee Schermerhorn if (nodelist_parse(nodelist, nodes)) 2704095f1fc4SLee Schermerhorn goto out; 270501f13bd6SLai Jiangshan if (!nodes_subset(nodes, node_states[N_MEMORY])) 2706095f1fc4SLee Schermerhorn goto out; 270771fe804bSLee Schermerhorn } else 270871fe804bSLee Schermerhorn nodes_clear(nodes); 270971fe804bSLee Schermerhorn 2710095f1fc4SLee Schermerhorn if (flags) 2711095f1fc4SLee Schermerhorn *flags++ = '\0'; /* terminate mode string */ 2712095f1fc4SLee Schermerhorn 2713479e2802SPeter Zijlstra for (mode = 0; mode < MPOL_MAX; mode++) { 2714345ace9cSLee Schermerhorn if (!strcmp(str, policy_modes[mode])) { 2715095f1fc4SLee Schermerhorn break; 2716095f1fc4SLee Schermerhorn } 2717095f1fc4SLee Schermerhorn } 2718a720094dSMel Gorman if (mode >= MPOL_MAX) 2719095f1fc4SLee Schermerhorn goto out; 2720095f1fc4SLee Schermerhorn 272171fe804bSLee Schermerhorn switch (mode) { 2722095f1fc4SLee Schermerhorn case MPOL_PREFERRED: 272371fe804bSLee Schermerhorn /* 272471fe804bSLee Schermerhorn * Insist on a nodelist of one node only 272571fe804bSLee Schermerhorn */ 2726095f1fc4SLee Schermerhorn if (nodelist) { 2727095f1fc4SLee Schermerhorn char *rest = nodelist; 2728095f1fc4SLee Schermerhorn while (isdigit(*rest)) 2729095f1fc4SLee Schermerhorn rest++; 2730926f2ae0SKOSAKI Motohiro if (*rest) 2731926f2ae0SKOSAKI Motohiro goto out; 2732095f1fc4SLee Schermerhorn } 2733095f1fc4SLee Schermerhorn break; 2734095f1fc4SLee Schermerhorn case MPOL_INTERLEAVE: 2735095f1fc4SLee Schermerhorn /* 2736095f1fc4SLee Schermerhorn * Default to online nodes with memory if no nodelist 2737095f1fc4SLee Schermerhorn */ 2738095f1fc4SLee Schermerhorn if (!nodelist) 273901f13bd6SLai Jiangshan nodes = node_states[N_MEMORY]; 27403f226aa1SLee Schermerhorn break; 274171fe804bSLee Schermerhorn case MPOL_LOCAL: 27423f226aa1SLee Schermerhorn /* 274371fe804bSLee Schermerhorn * Don't allow a nodelist; mpol_new() checks flags 27443f226aa1SLee Schermerhorn */ 274571fe804bSLee Schermerhorn if (nodelist) 27463f226aa1SLee Schermerhorn goto out; 274771fe804bSLee Schermerhorn mode = MPOL_PREFERRED; 27483f226aa1SLee Schermerhorn break; 2749413b43deSRavikiran G Thirumalai case MPOL_DEFAULT: 2750413b43deSRavikiran G Thirumalai /* 2751413b43deSRavikiran G Thirumalai * Insist on a empty nodelist 2752413b43deSRavikiran G Thirumalai */ 2753413b43deSRavikiran G Thirumalai if (!nodelist) 2754413b43deSRavikiran G Thirumalai err = 0; 2755413b43deSRavikiran G Thirumalai goto out; 2756d69b2e63SKOSAKI Motohiro case MPOL_BIND: 275771fe804bSLee Schermerhorn /* 2758d69b2e63SKOSAKI Motohiro * Insist on a nodelist 
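		 * (e.g. bare "bind" is rejected; "bind:0-3" is accepted)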
275971fe804bSLee Schermerhorn */ 2760d69b2e63SKOSAKI Motohiro if (!nodelist) 2761d69b2e63SKOSAKI Motohiro goto out; 2762095f1fc4SLee Schermerhorn } 2763095f1fc4SLee Schermerhorn 276471fe804bSLee Schermerhorn mode_flags = 0; 2765095f1fc4SLee Schermerhorn if (flags) { 2766095f1fc4SLee Schermerhorn /* 2767095f1fc4SLee Schermerhorn * Currently, we only support two mutually exclusive 2768095f1fc4SLee Schermerhorn * mode flags. 2769095f1fc4SLee Schermerhorn */ 2770095f1fc4SLee Schermerhorn if (!strcmp(flags, "static")) 277171fe804bSLee Schermerhorn mode_flags |= MPOL_F_STATIC_NODES; 2772095f1fc4SLee Schermerhorn else if (!strcmp(flags, "relative")) 277371fe804bSLee Schermerhorn mode_flags |= MPOL_F_RELATIVE_NODES; 2774095f1fc4SLee Schermerhorn else 2775926f2ae0SKOSAKI Motohiro goto out; 2776095f1fc4SLee Schermerhorn } 277771fe804bSLee Schermerhorn 277871fe804bSLee Schermerhorn new = mpol_new(mode, mode_flags, &nodes); 277971fe804bSLee Schermerhorn if (IS_ERR(new)) 2780926f2ae0SKOSAKI Motohiro goto out; 2781926f2ae0SKOSAKI Motohiro 2782f2a07f40SHugh Dickins /* 2783f2a07f40SHugh Dickins * Save nodes for mpol_to_str() to show the tmpfs mount options 2784f2a07f40SHugh Dickins * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 2785f2a07f40SHugh Dickins */ 2786f2a07f40SHugh Dickins if (mode != MPOL_PREFERRED) 2787f2a07f40SHugh Dickins new->v.nodes = nodes; 2788f2a07f40SHugh Dickins else if (nodelist) 2789f2a07f40SHugh Dickins new->v.preferred_node = first_node(nodes); 2790f2a07f40SHugh Dickins else 2791f2a07f40SHugh Dickins new->flags |= MPOL_F_LOCAL; 2792f2a07f40SHugh Dickins 2793f2a07f40SHugh Dickins /* 2794f2a07f40SHugh Dickins * Save nodes for contextualization: this will be used to "clone" 2795f2a07f40SHugh Dickins * the mempolicy in a specific context [cpuset] at a later time. 2796f2a07f40SHugh Dickins */ 2797e17f74afSLee Schermerhorn new->w.user_nodemask = nodes; 2798f2a07f40SHugh Dickins 2799926f2ae0SKOSAKI Motohiro err = 0; 280071fe804bSLee Schermerhorn 2801095f1fc4SLee Schermerhorn out: 2802095f1fc4SLee Schermerhorn /* Restore string for error message */ 2803095f1fc4SLee Schermerhorn if (nodelist) 2804095f1fc4SLee Schermerhorn *--nodelist = ':'; 2805095f1fc4SLee Schermerhorn if (flags) 2806095f1fc4SLee Schermerhorn *--flags = '='; 280771fe804bSLee Schermerhorn if (!err) 280871fe804bSLee Schermerhorn *mpol = new; 2809095f1fc4SLee Schermerhorn return err; 2810095f1fc4SLee Schermerhorn } 2811095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */ 2812095f1fc4SLee Schermerhorn 281371fe804bSLee Schermerhorn /** 281471fe804bSLee Schermerhorn * mpol_to_str - format a mempolicy structure for printing 281571fe804bSLee Schermerhorn * @buffer: to contain formatted mempolicy string 281671fe804bSLee Schermerhorn * @maxlen: length of @buffer 281771fe804bSLee Schermerhorn * @pol: pointer to mempolicy to be formatted 281871fe804bSLee Schermerhorn * 2819948927eeSDavid Rientjes * Convert @pol into a string. If @buffer is too short, truncate the string. 2820948927eeSDavid Rientjes * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the 2821948927eeSDavid Rientjes * longest flag, "relative", and to display at least a few node ids. 
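 *
 * Illustrative output: a bind policy with static nodes 0 and 3 is rendered
 * as "bind=static:0,3"; a default policy prints just "default".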
28221a75a6c8SChristoph Lameter */ 2823948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 28241a75a6c8SChristoph Lameter { 28251a75a6c8SChristoph Lameter char *p = buffer; 2826948927eeSDavid Rientjes nodemask_t nodes = NODE_MASK_NONE; 2827948927eeSDavid Rientjes unsigned short mode = MPOL_DEFAULT; 2828948927eeSDavid Rientjes unsigned short flags = 0; 28291a75a6c8SChristoph Lameter 28308790c71aSDavid Rientjes if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { 2831bea904d5SLee Schermerhorn mode = pol->mode; 2832948927eeSDavid Rientjes flags = pol->flags; 2833948927eeSDavid Rientjes } 2834bea904d5SLee Schermerhorn 28351a75a6c8SChristoph Lameter switch (mode) { 28361a75a6c8SChristoph Lameter case MPOL_DEFAULT: 28371a75a6c8SChristoph Lameter break; 28381a75a6c8SChristoph Lameter case MPOL_PREFERRED: 2839fc36b8d3SLee Schermerhorn if (flags & MPOL_F_LOCAL) 2840f2a07f40SHugh Dickins mode = MPOL_LOCAL; 284153f2556bSLee Schermerhorn else 2842fc36b8d3SLee Schermerhorn node_set(pol->v.preferred_node, nodes); 28431a75a6c8SChristoph Lameter break; 28441a75a6c8SChristoph Lameter case MPOL_BIND: 28451a75a6c8SChristoph Lameter case MPOL_INTERLEAVE: 28461a75a6c8SChristoph Lameter nodes = pol->v.nodes; 28471a75a6c8SChristoph Lameter break; 28481a75a6c8SChristoph Lameter default: 2849948927eeSDavid Rientjes WARN_ON_ONCE(1); 2850948927eeSDavid Rientjes snprintf(p, maxlen, "unknown"); 2851948927eeSDavid Rientjes return; 28521a75a6c8SChristoph Lameter } 28531a75a6c8SChristoph Lameter 2854b7a9f420SDavid Rientjes p += snprintf(p, maxlen, "%s", policy_modes[mode]); 28551a75a6c8SChristoph Lameter 2856fc36b8d3SLee Schermerhorn if (flags & MPOL_MODE_FLAGS) { 2857948927eeSDavid Rientjes p += snprintf(p, buffer + maxlen - p, "="); 2858f5b087b5SDavid Rientjes 28592291990aSLee Schermerhorn /* 28602291990aSLee Schermerhorn * Currently, the only defined flags are mutually exclusive 28612291990aSLee Schermerhorn */ 2862f5b087b5SDavid Rientjes if (flags & MPOL_F_STATIC_NODES) 28632291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "static"); 28642291990aSLee Schermerhorn else if (flags & MPOL_F_RELATIVE_NODES) 28652291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "relative"); 2866f5b087b5SDavid Rientjes } 2867f5b087b5SDavid Rientjes 28681a75a6c8SChristoph Lameter if (!nodes_empty(nodes)) { 2869948927eeSDavid Rientjes p += snprintf(p, buffer + maxlen - p, ":"); 28701a75a6c8SChristoph Lameter p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 28711a75a6c8SChristoph Lameter } 28721a75a6c8SChristoph Lameter } 2873
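
/*
 * Example (sketch): the policy-string handling above backs the tmpfs
 * "mpol=" mount option.  A mount along the lines of
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 *
 * has its policy parsed by mpol_parse_str(), installed per inode via
 * mpol_shared_policy_init(), consulted at fault time through
 * mpol_shared_policy_lookup(), and reported back (e.g. in /proc/mounts)
 * by mpol_to_str().
 */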