/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
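/*
 * Illustrative sketch (userspace, not part of this file): the policy modes
 * described in the header comment above are requested through the
 * set_mempolicy(2) and mbind(2) system calls.  Assuming the <numaif.h>
 * wrappers from libnuma (link with -lnuma), usage looks roughly like:
 *
 *	#include <numaif.h>
 *
 *	// interleave this process' future allocations across nodes 0 and 1
 *	unsigned long interleave_nodes = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
 *		      sizeof(interleave_nodes) * 8 + 1);
 *
 *	// bind an existing mapping to node 0 only, moving misplaced pages
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, sizeof(node0) * 8 + 1,
 *	      MPOL_MF_MOVE);
 *
 *	// revert the process policy to default local allocation
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */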
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask */
	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->v.preferred_node = -1;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}
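/*
 * Worked example (illustrative only, node numbers chosen arbitrarily):
 * mpol_relative_nodemask() above implements MPOL_F_RELATIVE_NODES remapping,
 * which mpol_new() below applies when contextualizing a user nodemask.
 * The user's mask is first folded modulo the weight of the allowed set,
 * then mapped onto that set.  E.g. with user nodes {0,1} and cpuset
 * mems_allowed {4,5,6}: the weight is 3, folding leaves {0,1}, and
 * nodes_onto() maps relative bit 0 -> node 4 and bit 1 -> node 5,
 * giving {4,5}.  A user node 3 would fold to 0 and land on node 4 again.
 */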
/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;
	nodemask_t cpuset_context_nmask;
	int ret;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
			nodes = NULL;	/* flag local alloc */
		}
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->policy = mode;
	policy->flags = flags;

	if (nodes) {
		/*
		 * cpuset related setup doesn't apply to local allocation
		 */
		cpuset_update_task_memory_state();
		if (flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);
		if (mpol_store_user_nodemask(policy))
			policy->w.user_nodemask = *nodes;
		else
			policy->w.cpuset_mems_allowed =
						cpuset_mems_allowed(current);
	}

	ret = mpol_ops[mode].create(policy,
				nodes ? &cpuset_context_nmask : NULL);
	if (ret < 0) {
		kmem_cache_free(policy_cache, policy);
		return ERR_PTR(ret);
	}
	return policy;
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes))
			pol->v.preferred_node = node;
		else
			pol->v.preferred_node = -1;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (pol->v.preferred_node != -1) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->policy].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If the PageReserved would not be checked here then f.e.
		 * the location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->policy == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of memory_map? */
		if (p->v.preferred_node < 0)
			*nodes = node_states[N_HIGH_MEMORY];
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy | pol->flags;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory sourced from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning 'tmp', we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
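	 *
	 * Illustrative example (node numbers chosen arbitrarily, not from
	 * the original comment): with from_nodes = {0,1,2} and
	 * to_nodes = {1,2,3}, node_remap() pairs 0->1, 1->2 and 2->3.
	 * The first scan breaks out at <2,3> because node 3 is not in tmp
	 * (an empty slot), so node 2 is migrated to 3 and cleared; the
	 * following scans then pick <1,2> and finally <0,1>, each time
	 * moving into a node that has already been drained.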
	 */

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						  (unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
find_task_by_vpid(pid) : current; 108639743889SChristoph Lameter if (!task) { 108739743889SChristoph Lameter read_unlock(&tasklist_lock); 108839743889SChristoph Lameter return -ESRCH; 108939743889SChristoph Lameter } 109039743889SChristoph Lameter mm = get_task_mm(task); 109139743889SChristoph Lameter read_unlock(&tasklist_lock); 109239743889SChristoph Lameter 109339743889SChristoph Lameter if (!mm) 109439743889SChristoph Lameter return -EINVAL; 109539743889SChristoph Lameter 109639743889SChristoph Lameter /* 109739743889SChristoph Lameter * Check if this process has the right to modify the specified 109839743889SChristoph Lameter * process. The right exists if the process has administrative 10997f927fccSAlexey Dobriyan * capabilities, superuser privileges or the same 110039743889SChristoph Lameter * userid as the target process. 110139743889SChristoph Lameter */ 110239743889SChristoph Lameter if ((current->euid != task->suid) && (current->euid != task->uid) && 110339743889SChristoph Lameter (current->uid != task->suid) && (current->uid != task->uid) && 110474c00241SChristoph Lameter !capable(CAP_SYS_NICE)) { 110539743889SChristoph Lameter err = -EPERM; 110639743889SChristoph Lameter goto out; 110739743889SChristoph Lameter } 110839743889SChristoph Lameter 110939743889SChristoph Lameter task_nodes = cpuset_mems_allowed(task); 111039743889SChristoph Lameter /* Is the user allowed to access the target nodes? */ 111174c00241SChristoph Lameter if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 111239743889SChristoph Lameter err = -EPERM; 111339743889SChristoph Lameter goto out; 111439743889SChristoph Lameter } 111539743889SChristoph Lameter 111637b07e41SLee Schermerhorn if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 11173b42d28bSChristoph Lameter err = -EINVAL; 11183b42d28bSChristoph Lameter goto out; 11193b42d28bSChristoph Lameter } 11203b42d28bSChristoph Lameter 112186c3a764SDavid Quigley err = security_task_movememory(task); 112286c3a764SDavid Quigley if (err) 112386c3a764SDavid Quigley goto out; 112486c3a764SDavid Quigley 1125511030bcSChristoph Lameter err = do_migrate_pages(mm, &old, &new, 112674c00241SChristoph Lameter capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 112739743889SChristoph Lameter out: 112839743889SChristoph Lameter mmput(mm); 112939743889SChristoph Lameter return err; 113039743889SChristoph Lameter } 113139743889SChristoph Lameter 113239743889SChristoph Lameter 11338bccd85fSChristoph Lameter /* Retrieve NUMA policy */ 11348bccd85fSChristoph Lameter asmlinkage long sys_get_mempolicy(int __user *policy, 11358bccd85fSChristoph Lameter unsigned long __user *nmask, 11368bccd85fSChristoph Lameter unsigned long maxnode, 11378bccd85fSChristoph Lameter unsigned long addr, unsigned long flags) 11388bccd85fSChristoph Lameter { 1139dbcb0f19SAdrian Bunk int err; 1140dbcb0f19SAdrian Bunk int uninitialized_var(pval); 11418bccd85fSChristoph Lameter nodemask_t nodes; 11428bccd85fSChristoph Lameter 11438bccd85fSChristoph Lameter if (nmask != NULL && maxnode < MAX_NUMNODES) 11448bccd85fSChristoph Lameter return -EINVAL; 11458bccd85fSChristoph Lameter 11468bccd85fSChristoph Lameter err = do_get_mempolicy(&pval, &nodes, addr, flags); 11478bccd85fSChristoph Lameter 11488bccd85fSChristoph Lameter if (err) 11498bccd85fSChristoph Lameter return err; 11508bccd85fSChristoph Lameter 11518bccd85fSChristoph Lameter if (policy && put_user(pval, policy)) 11528bccd85fSChristoph Lameter return -EFAULT; 11538bccd85fSChristoph Lameter 11548bccd85fSChristoph Lameter if (nmask) 11558bccd85fSChristoph Lameter err = copy_nodes_to_user(nmask, maxnode, &nodes); 11568bccd85fSChristoph Lameter 11578bccd85fSChristoph Lameter return err; 11588bccd85fSChristoph Lameter } 11598bccd85fSChristoph Lameter 11601da177e4SLinus Torvalds #ifdef CONFIG_COMPAT 11611da177e4SLinus Torvalds 11621da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy, 11631da177e4SLinus Torvalds compat_ulong_t __user *nmask, 11641da177e4SLinus Torvalds compat_ulong_t maxnode, 11651da177e4SLinus Torvalds compat_ulong_t addr, compat_ulong_t flags) 11661da177e4SLinus Torvalds { 11671da177e4SLinus Torvalds long err; 11681da177e4SLinus Torvalds unsigned long __user *nm = NULL; 11691da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 11701da177e4SLinus Torvalds DECLARE_BITMAP(bm, MAX_NUMNODES); 11711da177e4SLinus Torvalds 11721da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 11731da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 11741da177e4SLinus Torvalds 11751da177e4SLinus Torvalds if (nmask) 11761da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 11771da177e4SLinus Torvalds 11781da177e4SLinus Torvalds err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 11791da177e4SLinus Torvalds 11801da177e4SLinus Torvalds if (!err && nmask) { 11811da177e4SLinus Torvalds err = copy_from_user(bm, nm, alloc_size); 11821da177e4SLinus Torvalds /* ensure entire bitmap is zeroed */ 11831da177e4SLinus Torvalds err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 11841da177e4SLinus Torvalds err |= compat_put_bitmap(nmask, bm, nr_bits); 11851da177e4SLinus Torvalds } 11861da177e4SLinus Torvalds 11871da177e4SLinus Torvalds return err; 11881da177e4SLinus Torvalds } 11891da177e4SLinus Torvalds 11901da177e4SLinus Torvalds asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, 11911da177e4SLinus Torvalds compat_ulong_t maxnode) 11921da177e4SLinus Torvalds { 11931da177e4SLinus Torvalds long err = 0; 11941da177e4SLinus Torvalds unsigned long __user *nm = NULL; 11951da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 11961da177e4SLinus Torvalds DECLARE_BITMAP(bm, MAX_NUMNODES); 
11971da177e4SLinus Torvalds 11981da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 11991da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 12001da177e4SLinus Torvalds 12011da177e4SLinus Torvalds if (nmask) { 12021da177e4SLinus Torvalds err = compat_get_bitmap(bm, nmask, nr_bits); 12031da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 12041da177e4SLinus Torvalds err |= copy_to_user(nm, bm, alloc_size); 12051da177e4SLinus Torvalds } 12061da177e4SLinus Torvalds 12071da177e4SLinus Torvalds if (err) 12081da177e4SLinus Torvalds return -EFAULT; 12091da177e4SLinus Torvalds 12101da177e4SLinus Torvalds return sys_set_mempolicy(mode, nm, nr_bits+1); 12111da177e4SLinus Torvalds } 12121da177e4SLinus Torvalds 12131da177e4SLinus Torvalds asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, 12141da177e4SLinus Torvalds compat_ulong_t mode, compat_ulong_t __user *nmask, 12151da177e4SLinus Torvalds compat_ulong_t maxnode, compat_ulong_t flags) 12161da177e4SLinus Torvalds { 12171da177e4SLinus Torvalds long err = 0; 12181da177e4SLinus Torvalds unsigned long __user *nm = NULL; 12191da177e4SLinus Torvalds unsigned long nr_bits, alloc_size; 1220dfcd3c0dSAndi Kleen nodemask_t bm; 12211da177e4SLinus Torvalds 12221da177e4SLinus Torvalds nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 12231da177e4SLinus Torvalds alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 12241da177e4SLinus Torvalds 12251da177e4SLinus Torvalds if (nmask) { 1226dfcd3c0dSAndi Kleen err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 12271da177e4SLinus Torvalds nm = compat_alloc_user_space(alloc_size); 1228dfcd3c0dSAndi Kleen err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 12291da177e4SLinus Torvalds } 12301da177e4SLinus Torvalds 12311da177e4SLinus Torvalds if (err) 12321da177e4SLinus Torvalds return -EFAULT; 12331da177e4SLinus Torvalds 12341da177e4SLinus Torvalds return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 12351da177e4SLinus Torvalds } 12361da177e4SLinus Torvalds 12371da177e4SLinus Torvalds #endif 12381da177e4SLinus Torvalds 1239480eccf9SLee Schermerhorn /* 1240480eccf9SLee Schermerhorn * get_vma_policy(@task, @vma, @addr) 1241480eccf9SLee Schermerhorn * @task - task for fallback if vma policy == default 1242480eccf9SLee Schermerhorn * @vma - virtual memory area whose policy is sought 1243480eccf9SLee Schermerhorn * @addr - address in @vma for shared policy lookup 1244480eccf9SLee Schermerhorn * 1245480eccf9SLee Schermerhorn * Returns effective policy for a VMA at specified address. 1246480eccf9SLee Schermerhorn * Falls back to @task or system default policy, as necessary. 1247480eccf9SLee Schermerhorn * Returned policy has extra reference count if shared, vma, 1248480eccf9SLee Schermerhorn * or some other task's policy [show_numa_maps() can pass 1249480eccf9SLee Schermerhorn * @task != current]. It is the caller's responsibility to 1250480eccf9SLee Schermerhorn * free the reference in these cases. 
1251480eccf9SLee Schermerhorn */
125248fce342SChristoph Lameter static struct mempolicy * get_vma_policy(struct task_struct *task,
125348fce342SChristoph Lameter struct vm_area_struct *vma, unsigned long addr)
12541da177e4SLinus Torvalds {
12556e21c8f1SChristoph Lameter struct mempolicy *pol = task->mempolicy;
1256480eccf9SLee Schermerhorn int shared_pol = 0;
12571da177e4SLinus Torvalds 
12581da177e4SLinus Torvalds if (vma) {
1259480eccf9SLee Schermerhorn if (vma->vm_ops && vma->vm_ops->get_policy) {
12601da177e4SLinus Torvalds pol = vma->vm_ops->get_policy(vma, addr);
1261480eccf9SLee Schermerhorn shared_pol = 1; /* if pol non-NULL, add ref below */
1262480eccf9SLee Schermerhorn } else if (vma->vm_policy &&
12631da177e4SLinus Torvalds vma->vm_policy->policy != MPOL_DEFAULT)
12641da177e4SLinus Torvalds pol = vma->vm_policy;
12651da177e4SLinus Torvalds }
12661da177e4SLinus Torvalds if (!pol)
12671da177e4SLinus Torvalds pol = &default_policy;
1268480eccf9SLee Schermerhorn else if (!shared_pol && pol != current->mempolicy)
1269480eccf9SLee Schermerhorn mpol_get(pol); /* vma or other task's policy */
12701da177e4SLinus Torvalds return pol;
12711da177e4SLinus Torvalds }
12721da177e4SLinus Torvalds 
127319770b32SMel Gorman /* Return a nodemask representing a mempolicy */
127419770b32SMel Gorman static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
127519770b32SMel Gorman {
127619770b32SMel Gorman /* Lower zones don't get a nodemask applied for MPOL_BIND */
127719770b32SMel Gorman if (unlikely(policy->policy == MPOL_BIND) &&
127819770b32SMel Gorman gfp_zone(gfp) >= policy_zone &&
127919770b32SMel Gorman cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
128019770b32SMel Gorman return &policy->v.nodes;
128119770b32SMel Gorman 
128219770b32SMel Gorman return NULL;
128319770b32SMel Gorman }
128419770b32SMel Gorman 
12851da177e4SLinus Torvalds /* Return a zonelist representing a mempolicy */
1286dd0fc66fSAl Viro static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
12871da177e4SLinus Torvalds {
12881da177e4SLinus Torvalds int nd;
12891da177e4SLinus Torvalds 
12901da177e4SLinus Torvalds switch (policy->policy) {
12911da177e4SLinus Torvalds case MPOL_PREFERRED:
12921da177e4SLinus Torvalds nd = policy->v.preferred_node;
12931da177e4SLinus Torvalds if (nd < 0)
12941da177e4SLinus Torvalds nd = numa_node_id();
12951da177e4SLinus Torvalds break;
12961da177e4SLinus Torvalds case MPOL_BIND:
129719770b32SMel Gorman /*
129819770b32SMel Gorman * Normally, MPOL_BIND allocations are node-local within the
129919770b32SMel Gorman * allowed nodemask. However, if __GFP_THISNODE is set and the
130019770b32SMel Gorman * current node is not part of the mask, we use the zonelist
130119770b32SMel Gorman * for the first node in the mask instead.
130219770b32SMel Gorman */ 130319770b32SMel Gorman nd = numa_node_id(); 130419770b32SMel Gorman if (unlikely(gfp & __GFP_THISNODE) && 130519770b32SMel Gorman unlikely(!node_isset(nd, policy->v.nodes))) 130619770b32SMel Gorman nd = first_node(policy->v.nodes); 130719770b32SMel Gorman break; 13081da177e4SLinus Torvalds case MPOL_INTERLEAVE: /* should not happen */ 13091da177e4SLinus Torvalds case MPOL_DEFAULT: 13101da177e4SLinus Torvalds nd = numa_node_id(); 13111da177e4SLinus Torvalds break; 13121da177e4SLinus Torvalds default: 13131da177e4SLinus Torvalds nd = 0; 13141da177e4SLinus Torvalds BUG(); 13151da177e4SLinus Torvalds } 13160e88460dSMel Gorman return node_zonelist(nd, gfp); 13171da177e4SLinus Torvalds } 13181da177e4SLinus Torvalds 13191da177e4SLinus Torvalds /* Do dynamic interleaving for a process */ 13201da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy) 13211da177e4SLinus Torvalds { 13221da177e4SLinus Torvalds unsigned nid, next; 13231da177e4SLinus Torvalds struct task_struct *me = current; 13241da177e4SLinus Torvalds 13251da177e4SLinus Torvalds nid = me->il_next; 1326dfcd3c0dSAndi Kleen next = next_node(nid, policy->v.nodes); 13271da177e4SLinus Torvalds if (next >= MAX_NUMNODES) 1328dfcd3c0dSAndi Kleen next = first_node(policy->v.nodes); 1329f5b087b5SDavid Rientjes if (next < MAX_NUMNODES) 13301da177e4SLinus Torvalds me->il_next = next; 13311da177e4SLinus Torvalds return nid; 13321da177e4SLinus Torvalds } 13331da177e4SLinus Torvalds 1334dc85da15SChristoph Lameter /* 1335dc85da15SChristoph Lameter * Depending on the memory policy provide a node from which to allocate the 1336dc85da15SChristoph Lameter * next slab entry. 1337dc85da15SChristoph Lameter */ 1338dc85da15SChristoph Lameter unsigned slab_node(struct mempolicy *policy) 1339dc85da15SChristoph Lameter { 1340a3b51e01SDavid Rientjes unsigned short pol = policy ? policy->policy : MPOL_DEFAULT; 1341765c4507SChristoph Lameter 1342765c4507SChristoph Lameter switch (pol) { 1343dc85da15SChristoph Lameter case MPOL_INTERLEAVE: 1344dc85da15SChristoph Lameter return interleave_nodes(policy); 1345dc85da15SChristoph Lameter 1346dd1a239fSMel Gorman case MPOL_BIND: { 1347dc85da15SChristoph Lameter /* 1348dc85da15SChristoph Lameter * Follow bind policy behavior and start allocation at the 1349dc85da15SChristoph Lameter * first node. 1350dc85da15SChristoph Lameter */ 135119770b32SMel Gorman struct zonelist *zonelist; 135219770b32SMel Gorman struct zone *zone; 135319770b32SMel Gorman enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 135419770b32SMel Gorman zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; 135519770b32SMel Gorman (void)first_zones_zonelist(zonelist, highest_zoneidx, 135619770b32SMel Gorman &policy->v.nodes, 135719770b32SMel Gorman &zone); 135819770b32SMel Gorman return zone->node; 1359dd1a239fSMel Gorman } 1360dc85da15SChristoph Lameter 1361dc85da15SChristoph Lameter case MPOL_PREFERRED: 1362dc85da15SChristoph Lameter if (policy->v.preferred_node >= 0) 1363dc85da15SChristoph Lameter return policy->v.preferred_node; 1364dc85da15SChristoph Lameter /* Fall through */ 1365dc85da15SChristoph Lameter 1366dc85da15SChristoph Lameter default: 1367dc85da15SChristoph Lameter return numa_node_id(); 1368dc85da15SChristoph Lameter } 1369dc85da15SChristoph Lameter } 1370dc85da15SChristoph Lameter 13711da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. 
*/ 13721da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol, 13731da177e4SLinus Torvalds struct vm_area_struct *vma, unsigned long off) 13741da177e4SLinus Torvalds { 1375dfcd3c0dSAndi Kleen unsigned nnodes = nodes_weight(pol->v.nodes); 1376f5b087b5SDavid Rientjes unsigned target; 13771da177e4SLinus Torvalds int c; 13781da177e4SLinus Torvalds int nid = -1; 13791da177e4SLinus Torvalds 1380f5b087b5SDavid Rientjes if (!nnodes) 1381f5b087b5SDavid Rientjes return numa_node_id(); 1382f5b087b5SDavid Rientjes target = (unsigned int)off % nnodes; 13831da177e4SLinus Torvalds c = 0; 13841da177e4SLinus Torvalds do { 1385dfcd3c0dSAndi Kleen nid = next_node(nid, pol->v.nodes); 13861da177e4SLinus Torvalds c++; 13871da177e4SLinus Torvalds } while (c <= target); 13881da177e4SLinus Torvalds return nid; 13891da177e4SLinus Torvalds } 13901da177e4SLinus Torvalds 13915da7ca86SChristoph Lameter /* Determine a node number for interleave */ 13925da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol, 13935da7ca86SChristoph Lameter struct vm_area_struct *vma, unsigned long addr, int shift) 13945da7ca86SChristoph Lameter { 13955da7ca86SChristoph Lameter if (vma) { 13965da7ca86SChristoph Lameter unsigned long off; 13975da7ca86SChristoph Lameter 13983b98b087SNishanth Aravamudan /* 13993b98b087SNishanth Aravamudan * for small pages, there is no difference between 14003b98b087SNishanth Aravamudan * shift and PAGE_SHIFT, so the bit-shift is safe. 14013b98b087SNishanth Aravamudan * for huge pages, since vm_pgoff is in units of small 14023b98b087SNishanth Aravamudan * pages, we need to shift off the always 0 bits to get 14033b98b087SNishanth Aravamudan * a useful offset. 14043b98b087SNishanth Aravamudan */ 14053b98b087SNishanth Aravamudan BUG_ON(shift < PAGE_SHIFT); 14063b98b087SNishanth Aravamudan off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 14075da7ca86SChristoph Lameter off += (addr - vma->vm_start) >> shift; 14085da7ca86SChristoph Lameter return offset_il_node(pol, vma, off); 14095da7ca86SChristoph Lameter } else 14105da7ca86SChristoph Lameter return interleave_nodes(pol); 14115da7ca86SChristoph Lameter } 14125da7ca86SChristoph Lameter 141300ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS 1414480eccf9SLee Schermerhorn /* 1415480eccf9SLee Schermerhorn * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1416480eccf9SLee Schermerhorn * @vma = virtual memory area whose policy is sought 1417480eccf9SLee Schermerhorn * @addr = address in @vma for shared policy lookup and interleave policy 1418480eccf9SLee Schermerhorn * @gfp_flags = for requested zone 141919770b32SMel Gorman * @mpol = pointer to mempolicy pointer for reference counted mempolicy 142019770b32SMel Gorman * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1421480eccf9SLee Schermerhorn * 1422480eccf9SLee Schermerhorn * Returns a zonelist suitable for a huge page allocation. 142319770b32SMel Gorman * If the effective policy is 'BIND, returns pointer to local node's zonelist, 142419770b32SMel Gorman * and a pointer to the mempolicy's @nodemask for filtering the zonelist. 1425480eccf9SLee Schermerhorn * If it is also a policy for which get_vma_policy() returns an extra 142619770b32SMel Gorman * reference, we must hold that reference until after the allocation. 1427480eccf9SLee Schermerhorn * In that case, return policy via @mpol so hugetlb allocation can drop 1428480eccf9SLee Schermerhorn * the reference. 
For non-'BIND referenced policies, we can/do drop the 1429480eccf9SLee Schermerhorn * reference here, so the caller doesn't need to know about the special case 1430480eccf9SLee Schermerhorn * for default and current task policy. 1431480eccf9SLee Schermerhorn */ 1432396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 143319770b32SMel Gorman gfp_t gfp_flags, struct mempolicy **mpol, 143419770b32SMel Gorman nodemask_t **nodemask) 14355da7ca86SChristoph Lameter { 14365da7ca86SChristoph Lameter struct mempolicy *pol = get_vma_policy(current, vma, addr); 1437480eccf9SLee Schermerhorn struct zonelist *zl; 14385da7ca86SChristoph Lameter 1439480eccf9SLee Schermerhorn *mpol = NULL; /* probably no unref needed */ 144019770b32SMel Gorman *nodemask = NULL; /* assume !MPOL_BIND */ 144119770b32SMel Gorman if (pol->policy == MPOL_BIND) { 144219770b32SMel Gorman *nodemask = &pol->v.nodes; 144319770b32SMel Gorman } else if (pol->policy == MPOL_INTERLEAVE) { 14445da7ca86SChristoph Lameter unsigned nid; 14455da7ca86SChristoph Lameter 14465da7ca86SChristoph Lameter nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); 144769682d85SLee Schermerhorn if (unlikely(pol != &default_policy && 144869682d85SLee Schermerhorn pol != current->mempolicy)) 1449480eccf9SLee Schermerhorn __mpol_free(pol); /* finished with pol */ 14500e88460dSMel Gorman return node_zonelist(nid, gfp_flags); 14515da7ca86SChristoph Lameter } 1452480eccf9SLee Schermerhorn 1453480eccf9SLee Schermerhorn zl = zonelist_policy(GFP_HIGHUSER, pol); 1454480eccf9SLee Schermerhorn if (unlikely(pol != &default_policy && pol != current->mempolicy)) { 1455480eccf9SLee Schermerhorn if (pol->policy != MPOL_BIND) 1456480eccf9SLee Schermerhorn __mpol_free(pol); /* finished with pol */ 1457480eccf9SLee Schermerhorn else 1458480eccf9SLee Schermerhorn *mpol = pol; /* unref needed after allocation */ 1459480eccf9SLee Schermerhorn } 1460480eccf9SLee Schermerhorn return zl; 14615da7ca86SChristoph Lameter } 146200ac59adSChen, Kenneth W #endif 14635da7ca86SChristoph Lameter 14641da177e4SLinus Torvalds /* Allocate a page in interleaved policy. 14651da177e4SLinus Torvalds Own path because it needs to do special accounting. */ 1466662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1467662f3a0bSAndi Kleen unsigned nid) 14681da177e4SLinus Torvalds { 14691da177e4SLinus Torvalds struct zonelist *zl; 14701da177e4SLinus Torvalds struct page *page; 14711da177e4SLinus Torvalds 14720e88460dSMel Gorman zl = node_zonelist(nid, gfp); 14731da177e4SLinus Torvalds page = __alloc_pages(gfp, order, zl); 1474dd1a239fSMel Gorman if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) 1475ca889e6cSChristoph Lameter inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 14761da177e4SLinus Torvalds return page; 14771da177e4SLinus Torvalds } 14781da177e4SLinus Torvalds 14791da177e4SLinus Torvalds /** 14801da177e4SLinus Torvalds * alloc_page_vma - Allocate a page for a VMA. 14811da177e4SLinus Torvalds * 14821da177e4SLinus Torvalds * @gfp: 14831da177e4SLinus Torvalds * %GFP_USER user allocation. 14841da177e4SLinus Torvalds * %GFP_KERNEL kernel allocations, 14851da177e4SLinus Torvalds * %GFP_HIGHMEM highmem/user allocations, 14861da177e4SLinus Torvalds * %GFP_FS allocation should not call back into a file system. 14871da177e4SLinus Torvalds * %GFP_ATOMIC don't sleep. 14881da177e4SLinus Torvalds * 14891da177e4SLinus Torvalds * @vma: Pointer to VMA or NULL if not available. 
14901da177e4SLinus Torvalds * @addr: Virtual Address of the allocation. Must be inside the VMA.
14911da177e4SLinus Torvalds *
14921da177e4SLinus Torvalds * This function allocates a page from the kernel page pool and applies
14931da177e4SLinus Torvalds * a NUMA policy associated with the VMA or the current process.
14941da177e4SLinus Torvalds * When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
14951da177e4SLinus Torvalds * mm_struct of the VMA to prevent it from going away. Should be used for
14961da177e4SLinus Torvalds * all allocations for pages that will be mapped into
14971da177e4SLinus Torvalds * user space. Returns NULL when no page can be allocated.
14981da177e4SLinus Torvalds *
14991da177e4SLinus Torvalds * Should be called with the mmap_sem of the vma held.
15001da177e4SLinus Torvalds */
15011da177e4SLinus Torvalds struct page *
1502dd0fc66fSAl Viro alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
15031da177e4SLinus Torvalds {
15046e21c8f1SChristoph Lameter struct mempolicy *pol = get_vma_policy(current, vma, addr);
1505480eccf9SLee Schermerhorn struct zonelist *zl;
15061da177e4SLinus Torvalds 
1507cf2a473cSPaul Jackson cpuset_update_task_memory_state();
15081da177e4SLinus Torvalds 
15091da177e4SLinus Torvalds if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
15101da177e4SLinus Torvalds unsigned nid;
15115da7ca86SChristoph Lameter 
15125da7ca86SChristoph Lameter nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
151369682d85SLee Schermerhorn if (unlikely(pol != &default_policy &&
151469682d85SLee Schermerhorn pol != current->mempolicy))
151569682d85SLee Schermerhorn __mpol_free(pol); /* finished with pol */
15161da177e4SLinus Torvalds return alloc_page_interleave(gfp, 0, nid);
15171da177e4SLinus Torvalds }
1518480eccf9SLee Schermerhorn zl = zonelist_policy(gfp, pol);
1519480eccf9SLee Schermerhorn if (pol != &default_policy && pol != current->mempolicy) {
1520480eccf9SLee Schermerhorn /*
1521480eccf9SLee Schermerhorn * slow path: ref counted policy -- shared or vma
1522480eccf9SLee Schermerhorn */
152319770b32SMel Gorman struct page *page = __alloc_pages_nodemask(gfp, 0,
152419770b32SMel Gorman zl, nodemask_policy(gfp, pol));
1525480eccf9SLee Schermerhorn __mpol_free(pol);
1526480eccf9SLee Schermerhorn return page;
1527480eccf9SLee Schermerhorn }
1528480eccf9SLee Schermerhorn /*
1529480eccf9SLee Schermerhorn * fast path: default or task policy
1530480eccf9SLee Schermerhorn */
153119770b32SMel Gorman return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
15321da177e4SLinus Torvalds }
15331da177e4SLinus Torvalds 
15341da177e4SLinus Torvalds /**
15351da177e4SLinus Torvalds * alloc_pages_current - Allocate pages.
15361da177e4SLinus Torvalds *
15371da177e4SLinus Torvalds * @gfp:
15381da177e4SLinus Torvalds * %GFP_USER user allocation,
15391da177e4SLinus Torvalds * %GFP_KERNEL kernel allocation,
15401da177e4SLinus Torvalds * %GFP_HIGHMEM highmem allocation,
15411da177e4SLinus Torvalds * %GFP_FS don't call back into a file system.
15421da177e4SLinus Torvalds * %GFP_ATOMIC don't sleep.
15431da177e4SLinus Torvalds * @order: Power of two of allocation size in pages. 0 is a single page.
15441da177e4SLinus Torvalds *
15451da177e4SLinus Torvalds * Allocate a page from the kernel page pool. When not in
15461da177e4SLinus Torvalds * interrupt context, the current process' NUMA policy is applied.
15471da177e4SLinus Torvalds * Returns NULL when no page can be allocated.
15481da177e4SLinus Torvalds * 1549cf2a473cSPaul Jackson * Don't call cpuset_update_task_memory_state() unless 15501da177e4SLinus Torvalds * 1) it's ok to take cpuset_sem (can WAIT), and 15511da177e4SLinus Torvalds * 2) allocating for current task (not interrupt). 15521da177e4SLinus Torvalds */ 1553dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order) 15541da177e4SLinus Torvalds { 15551da177e4SLinus Torvalds struct mempolicy *pol = current->mempolicy; 15561da177e4SLinus Torvalds 15571da177e4SLinus Torvalds if ((gfp & __GFP_WAIT) && !in_interrupt()) 1558cf2a473cSPaul Jackson cpuset_update_task_memory_state(); 15599b819d20SChristoph Lameter if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 15601da177e4SLinus Torvalds pol = &default_policy; 15611da177e4SLinus Torvalds if (pol->policy == MPOL_INTERLEAVE) 15621da177e4SLinus Torvalds return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 156319770b32SMel Gorman return __alloc_pages_nodemask(gfp, order, 156419770b32SMel Gorman zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); 15651da177e4SLinus Torvalds } 15661da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current); 15671da177e4SLinus Torvalds 15684225399aSPaul Jackson /* 15694225399aSPaul Jackson * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it 15704225399aSPaul Jackson * rebinds the mempolicy its copying by calling mpol_rebind_policy() 15714225399aSPaul Jackson * with the mems_allowed returned by cpuset_mems_allowed(). This 15724225399aSPaul Jackson * keeps mempolicies cpuset relative after its cpuset moves. See 15734225399aSPaul Jackson * further kernel/cpuset.c update_nodemask(). 15744225399aSPaul Jackson */ 15754225399aSPaul Jackson 15761da177e4SLinus Torvalds /* Slow path of a mempolicy copy */ 15771da177e4SLinus Torvalds struct mempolicy *__mpol_copy(struct mempolicy *old) 15781da177e4SLinus Torvalds { 15791da177e4SLinus Torvalds struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 15801da177e4SLinus Torvalds 15811da177e4SLinus Torvalds if (!new) 15821da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 15834225399aSPaul Jackson if (current_cpuset_is_being_rebound()) { 15844225399aSPaul Jackson nodemask_t mems = cpuset_mems_allowed(current); 15854225399aSPaul Jackson mpol_rebind_policy(old, &mems); 15864225399aSPaul Jackson } 15871da177e4SLinus Torvalds *new = *old; 15881da177e4SLinus Torvalds atomic_set(&new->refcnt, 1); 15891da177e4SLinus Torvalds return new; 15901da177e4SLinus Torvalds } 15911da177e4SLinus Torvalds 1592f5b087b5SDavid Rientjes static int mpol_match_intent(const struct mempolicy *a, 1593f5b087b5SDavid Rientjes const struct mempolicy *b) 1594f5b087b5SDavid Rientjes { 1595f5b087b5SDavid Rientjes if (a->flags != b->flags) 1596f5b087b5SDavid Rientjes return 0; 1597f5b087b5SDavid Rientjes if (!mpol_store_user_nodemask(a)) 1598f5b087b5SDavid Rientjes return 1; 1599f5b087b5SDavid Rientjes return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); 1600f5b087b5SDavid Rientjes } 1601f5b087b5SDavid Rientjes 16021da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */ 16031da177e4SLinus Torvalds int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 16041da177e4SLinus Torvalds { 16051da177e4SLinus Torvalds if (!a || !b) 16061da177e4SLinus Torvalds return 0; 16071da177e4SLinus Torvalds if (a->policy != b->policy) 16081da177e4SLinus Torvalds return 0; 1609f5b087b5SDavid Rientjes if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b)) 1610f5b087b5SDavid Rientjes return 0; 16111da177e4SLinus 
Torvalds switch (a->policy) { 16121da177e4SLinus Torvalds case MPOL_DEFAULT: 16131da177e4SLinus Torvalds return 1; 161419770b32SMel Gorman case MPOL_BIND: 161519770b32SMel Gorman /* Fall through */ 16161da177e4SLinus Torvalds case MPOL_INTERLEAVE: 1617dfcd3c0dSAndi Kleen return nodes_equal(a->v.nodes, b->v.nodes); 16181da177e4SLinus Torvalds case MPOL_PREFERRED: 16191da177e4SLinus Torvalds return a->v.preferred_node == b->v.preferred_node; 16201da177e4SLinus Torvalds default: 16211da177e4SLinus Torvalds BUG(); 16221da177e4SLinus Torvalds return 0; 16231da177e4SLinus Torvalds } 16241da177e4SLinus Torvalds } 16251da177e4SLinus Torvalds 16261da177e4SLinus Torvalds /* Slow path of a mpol destructor. */ 16271da177e4SLinus Torvalds void __mpol_free(struct mempolicy *p) 16281da177e4SLinus Torvalds { 16291da177e4SLinus Torvalds if (!atomic_dec_and_test(&p->refcnt)) 16301da177e4SLinus Torvalds return; 16311da177e4SLinus Torvalds p->policy = MPOL_DEFAULT; 16321da177e4SLinus Torvalds kmem_cache_free(policy_cache, p); 16331da177e4SLinus Torvalds } 16341da177e4SLinus Torvalds 16351da177e4SLinus Torvalds /* 16361da177e4SLinus Torvalds * Shared memory backing store policy support. 16371da177e4SLinus Torvalds * 16381da177e4SLinus Torvalds * Remember policies even when nobody has shared memory mapped. 16391da177e4SLinus Torvalds * The policies are kept in Red-Black tree linked from the inode. 16401da177e4SLinus Torvalds * They are protected by the sp->lock spinlock, which should be held 16411da177e4SLinus Torvalds * for any accesses to the tree. 16421da177e4SLinus Torvalds */ 16431da177e4SLinus Torvalds 16441da177e4SLinus Torvalds /* lookup first element intersecting start-end */ 16451da177e4SLinus Torvalds /* Caller holds sp->lock */ 16461da177e4SLinus Torvalds static struct sp_node * 16471da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 16481da177e4SLinus Torvalds { 16491da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node; 16501da177e4SLinus Torvalds 16511da177e4SLinus Torvalds while (n) { 16521da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd); 16531da177e4SLinus Torvalds 16541da177e4SLinus Torvalds if (start >= p->end) 16551da177e4SLinus Torvalds n = n->rb_right; 16561da177e4SLinus Torvalds else if (end <= p->start) 16571da177e4SLinus Torvalds n = n->rb_left; 16581da177e4SLinus Torvalds else 16591da177e4SLinus Torvalds break; 16601da177e4SLinus Torvalds } 16611da177e4SLinus Torvalds if (!n) 16621da177e4SLinus Torvalds return NULL; 16631da177e4SLinus Torvalds for (;;) { 16641da177e4SLinus Torvalds struct sp_node *w = NULL; 16651da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n); 16661da177e4SLinus Torvalds if (!prev) 16671da177e4SLinus Torvalds break; 16681da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd); 16691da177e4SLinus Torvalds if (w->end <= start) 16701da177e4SLinus Torvalds break; 16711da177e4SLinus Torvalds n = prev; 16721da177e4SLinus Torvalds } 16731da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd); 16741da177e4SLinus Torvalds } 16751da177e4SLinus Torvalds 16761da177e4SLinus Torvalds /* Insert a new shared policy into the list. 
*/
16771da177e4SLinus Torvalds /* Caller holds sp->lock */
16781da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
16791da177e4SLinus Torvalds {
16801da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node;
16811da177e4SLinus Torvalds struct rb_node *parent = NULL;
16821da177e4SLinus Torvalds struct sp_node *nd;
16831da177e4SLinus Torvalds 
16841da177e4SLinus Torvalds while (*p) {
16851da177e4SLinus Torvalds parent = *p;
16861da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd);
16871da177e4SLinus Torvalds if (new->start < nd->start)
16881da177e4SLinus Torvalds p = &(*p)->rb_left;
16891da177e4SLinus Torvalds else if (new->end > nd->end)
16901da177e4SLinus Torvalds p = &(*p)->rb_right;
16911da177e4SLinus Torvalds else
16921da177e4SLinus Torvalds BUG();
16931da177e4SLinus Torvalds }
16941da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p);
16951da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root);
1696140d5a49SPaul Mundt pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
16971da177e4SLinus Torvalds new->policy ? new->policy->policy : 0);
16981da177e4SLinus Torvalds }
16991da177e4SLinus Torvalds 
17001da177e4SLinus Torvalds /* Find shared policy intersecting idx */
17011da177e4SLinus Torvalds struct mempolicy *
17021da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
17031da177e4SLinus Torvalds {
17041da177e4SLinus Torvalds struct mempolicy *pol = NULL;
17051da177e4SLinus Torvalds struct sp_node *sn;
17061da177e4SLinus Torvalds 
17071da177e4SLinus Torvalds if (!sp->root.rb_node)
17081da177e4SLinus Torvalds return NULL;
17091da177e4SLinus Torvalds spin_lock(&sp->lock);
17101da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1);
17111da177e4SLinus Torvalds if (sn) {
17121da177e4SLinus Torvalds mpol_get(sn->policy);
17131da177e4SLinus Torvalds pol = sn->policy;
17141da177e4SLinus Torvalds }
17151da177e4SLinus Torvalds spin_unlock(&sp->lock);
17161da177e4SLinus Torvalds return pol;
17171da177e4SLinus Torvalds }
17181da177e4SLinus Torvalds 
17191da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
17201da177e4SLinus Torvalds {
1721140d5a49SPaul Mundt pr_debug("deleting %lx-%lx\n", n->start, n->end);
17221da177e4SLinus Torvalds rb_erase(&n->nd, &sp->root);
17231da177e4SLinus Torvalds mpol_free(n->policy);
17241da177e4SLinus Torvalds kmem_cache_free(sn_cache, n);
17251da177e4SLinus Torvalds }
17261da177e4SLinus Torvalds 
1727dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1728dbcb0f19SAdrian Bunk struct mempolicy *pol)
17291da177e4SLinus Torvalds {
17301da177e4SLinus Torvalds struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
17311da177e4SLinus Torvalds 
17321da177e4SLinus Torvalds if (!n)
17331da177e4SLinus Torvalds return NULL;
17341da177e4SLinus Torvalds n->start = start;
17351da177e4SLinus Torvalds n->end = end;
17361da177e4SLinus Torvalds mpol_get(pol);
17371da177e4SLinus Torvalds n->policy = pol;
17381da177e4SLinus Torvalds return n;
17391da177e4SLinus Torvalds }
17401da177e4SLinus Torvalds 
17411da177e4SLinus Torvalds /* Replace a policy range.
*/ 17421da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 17431da177e4SLinus Torvalds unsigned long end, struct sp_node *new) 17441da177e4SLinus Torvalds { 17451da177e4SLinus Torvalds struct sp_node *n, *new2 = NULL; 17461da177e4SLinus Torvalds 17471da177e4SLinus Torvalds restart: 17481da177e4SLinus Torvalds spin_lock(&sp->lock); 17491da177e4SLinus Torvalds n = sp_lookup(sp, start, end); 17501da177e4SLinus Torvalds /* Take care of old policies in the same range. */ 17511da177e4SLinus Torvalds while (n && n->start < end) { 17521da177e4SLinus Torvalds struct rb_node *next = rb_next(&n->nd); 17531da177e4SLinus Torvalds if (n->start >= start) { 17541da177e4SLinus Torvalds if (n->end <= end) 17551da177e4SLinus Torvalds sp_delete(sp, n); 17561da177e4SLinus Torvalds else 17571da177e4SLinus Torvalds n->start = end; 17581da177e4SLinus Torvalds } else { 17591da177e4SLinus Torvalds /* Old policy spanning whole new range. */ 17601da177e4SLinus Torvalds if (n->end > end) { 17611da177e4SLinus Torvalds if (!new2) { 17621da177e4SLinus Torvalds spin_unlock(&sp->lock); 17631da177e4SLinus Torvalds new2 = sp_alloc(end, n->end, n->policy); 17641da177e4SLinus Torvalds if (!new2) 17651da177e4SLinus Torvalds return -ENOMEM; 17661da177e4SLinus Torvalds goto restart; 17671da177e4SLinus Torvalds } 17681da177e4SLinus Torvalds n->end = start; 17691da177e4SLinus Torvalds sp_insert(sp, new2); 17701da177e4SLinus Torvalds new2 = NULL; 17711da177e4SLinus Torvalds break; 17721da177e4SLinus Torvalds } else 17731da177e4SLinus Torvalds n->end = start; 17741da177e4SLinus Torvalds } 17751da177e4SLinus Torvalds if (!next) 17761da177e4SLinus Torvalds break; 17771da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 17781da177e4SLinus Torvalds } 17791da177e4SLinus Torvalds if (new) 17801da177e4SLinus Torvalds sp_insert(sp, new); 17811da177e4SLinus Torvalds spin_unlock(&sp->lock); 17821da177e4SLinus Torvalds if (new2) { 17831da177e4SLinus Torvalds mpol_free(new2->policy); 17841da177e4SLinus Torvalds kmem_cache_free(sn_cache, new2); 17851da177e4SLinus Torvalds } 17861da177e4SLinus Torvalds return 0; 17871da177e4SLinus Torvalds } 17881da177e4SLinus Torvalds 1789a3b51e01SDavid Rientjes void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, 1790028fec41SDavid Rientjes unsigned short flags, nodemask_t *policy_nodes) 17917339ff83SRobin Holt { 17927339ff83SRobin Holt info->root = RB_ROOT; 17937339ff83SRobin Holt spin_lock_init(&info->lock); 17947339ff83SRobin Holt 17957339ff83SRobin Holt if (policy != MPOL_DEFAULT) { 17967339ff83SRobin Holt struct mempolicy *newpol; 17977339ff83SRobin Holt 17987339ff83SRobin Holt /* Falls back to MPOL_DEFAULT on any error */ 1799028fec41SDavid Rientjes newpol = mpol_new(policy, flags, policy_nodes); 18007339ff83SRobin Holt if (!IS_ERR(newpol)) { 18017339ff83SRobin Holt /* Create pseudo-vma that contains just the policy */ 18027339ff83SRobin Holt struct vm_area_struct pvma; 18037339ff83SRobin Holt 18047339ff83SRobin Holt memset(&pvma, 0, sizeof(struct vm_area_struct)); 18057339ff83SRobin Holt /* Policy covers entire file */ 18067339ff83SRobin Holt pvma.vm_end = TASK_SIZE; 18077339ff83SRobin Holt mpol_set_shared_policy(info, &pvma, newpol); 18087339ff83SRobin Holt mpol_free(newpol); 18097339ff83SRobin Holt } 18107339ff83SRobin Holt } 18117339ff83SRobin Holt } 18127339ff83SRobin Holt 18131da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info, 18141da177e4SLinus Torvalds struct vm_area_struct *vma, 
struct mempolicy *npol) 18151da177e4SLinus Torvalds { 18161da177e4SLinus Torvalds int err; 18171da177e4SLinus Torvalds struct sp_node *new = NULL; 18181da177e4SLinus Torvalds unsigned long sz = vma_pages(vma); 18191da177e4SLinus Torvalds 1820028fec41SDavid Rientjes pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", 18211da177e4SLinus Torvalds vma->vm_pgoff, 18221da177e4SLinus Torvalds sz, npol ? npol->policy : -1, 1823028fec41SDavid Rientjes npol ? npol->flags : -1, 1824dfcd3c0dSAndi Kleen npol ? nodes_addr(npol->v.nodes)[0] : -1); 18251da177e4SLinus Torvalds 18261da177e4SLinus Torvalds if (npol) { 18271da177e4SLinus Torvalds new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 18281da177e4SLinus Torvalds if (!new) 18291da177e4SLinus Torvalds return -ENOMEM; 18301da177e4SLinus Torvalds } 18311da177e4SLinus Torvalds err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 18321da177e4SLinus Torvalds if (err && new) 18331da177e4SLinus Torvalds kmem_cache_free(sn_cache, new); 18341da177e4SLinus Torvalds return err; 18351da177e4SLinus Torvalds } 18361da177e4SLinus Torvalds 18371da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */ 18381da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p) 18391da177e4SLinus Torvalds { 18401da177e4SLinus Torvalds struct sp_node *n; 18411da177e4SLinus Torvalds struct rb_node *next; 18421da177e4SLinus Torvalds 18431da177e4SLinus Torvalds if (!p->root.rb_node) 18441da177e4SLinus Torvalds return; 18451da177e4SLinus Torvalds spin_lock(&p->lock); 18461da177e4SLinus Torvalds next = rb_first(&p->root); 18471da177e4SLinus Torvalds while (next) { 18481da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 18491da177e4SLinus Torvalds next = rb_next(&n->nd); 185090c5029eSAndi Kleen rb_erase(&n->nd, &p->root); 18511da177e4SLinus Torvalds mpol_free(n->policy); 18521da177e4SLinus Torvalds kmem_cache_free(sn_cache, n); 18531da177e4SLinus Torvalds } 18541da177e4SLinus Torvalds spin_unlock(&p->lock); 18551da177e4SLinus Torvalds } 18561da177e4SLinus Torvalds 18571da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */ 18581da177e4SLinus Torvalds void __init numa_policy_init(void) 18591da177e4SLinus Torvalds { 1860b71636e2SPaul Mundt nodemask_t interleave_nodes; 1861b71636e2SPaul Mundt unsigned long largest = 0; 1862b71636e2SPaul Mundt int nid, prefer = 0; 1863b71636e2SPaul Mundt 18641da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy", 18651da177e4SLinus Torvalds sizeof(struct mempolicy), 186620c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 18671da177e4SLinus Torvalds 18681da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node", 18691da177e4SLinus Torvalds sizeof(struct sp_node), 187020c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 18711da177e4SLinus Torvalds 1872b71636e2SPaul Mundt /* 1873b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only 1874b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB), or 1875b71636e2SPaul Mundt * fall back to the largest node if they're all smaller. 
1876b71636e2SPaul Mundt */ 1877b71636e2SPaul Mundt nodes_clear(interleave_nodes); 187856bbd65dSChristoph Lameter for_each_node_state(nid, N_HIGH_MEMORY) { 1879b71636e2SPaul Mundt unsigned long total_pages = node_present_pages(nid); 18801da177e4SLinus Torvalds 1881b71636e2SPaul Mundt /* Preserve the largest node */ 1882b71636e2SPaul Mundt if (largest < total_pages) { 1883b71636e2SPaul Mundt largest = total_pages; 1884b71636e2SPaul Mundt prefer = nid; 1885b71636e2SPaul Mundt } 1886b71636e2SPaul Mundt 1887b71636e2SPaul Mundt /* Interleave this node? */ 1888b71636e2SPaul Mundt if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 1889b71636e2SPaul Mundt node_set(nid, interleave_nodes); 1890b71636e2SPaul Mundt } 1891b71636e2SPaul Mundt 1892b71636e2SPaul Mundt /* All too small, use the largest */ 1893b71636e2SPaul Mundt if (unlikely(nodes_empty(interleave_nodes))) 1894b71636e2SPaul Mundt node_set(prefer, interleave_nodes); 1895b71636e2SPaul Mundt 1896028fec41SDavid Rientjes if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 18971da177e4SLinus Torvalds printk("numa_policy_init: interleaving failed\n"); 18981da177e4SLinus Torvalds } 18991da177e4SLinus Torvalds 19008bccd85fSChristoph Lameter /* Reset policy of current process to default */ 19011da177e4SLinus Torvalds void numa_default_policy(void) 19021da177e4SLinus Torvalds { 1903028fec41SDavid Rientjes do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 19041da177e4SLinus Torvalds } 190568860ec1SPaul Jackson 19064225399aSPaul Jackson /* 19071a75a6c8SChristoph Lameter * Display pages allocated per node and memory policy via /proc. 19081a75a6c8SChristoph Lameter */ 190915ad7cdcSHelge Deller static const char * const policy_types[] = 191015ad7cdcSHelge Deller { "default", "prefer", "bind", "interleave" }; 19111a75a6c8SChristoph Lameter 19121a75a6c8SChristoph Lameter /* 19131a75a6c8SChristoph Lameter * Convert a mempolicy into a string. 19141a75a6c8SChristoph Lameter * Returns the number of characters in buffer (if positive) 19151a75a6c8SChristoph Lameter * or an error (negative) 19161a75a6c8SChristoph Lameter */ 19171a75a6c8SChristoph Lameter static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 19181a75a6c8SChristoph Lameter { 19191a75a6c8SChristoph Lameter char *p = buffer; 19201a75a6c8SChristoph Lameter int l; 19211a75a6c8SChristoph Lameter nodemask_t nodes; 1922a3b51e01SDavid Rientjes unsigned short mode = pol ? pol->policy : MPOL_DEFAULT; 1923f5b087b5SDavid Rientjes unsigned short flags = pol ? 
pol->flags : 0; 19241a75a6c8SChristoph Lameter 19251a75a6c8SChristoph Lameter switch (mode) { 19261a75a6c8SChristoph Lameter case MPOL_DEFAULT: 19271a75a6c8SChristoph Lameter nodes_clear(nodes); 19281a75a6c8SChristoph Lameter break; 19291a75a6c8SChristoph Lameter 19301a75a6c8SChristoph Lameter case MPOL_PREFERRED: 19311a75a6c8SChristoph Lameter nodes_clear(nodes); 19321a75a6c8SChristoph Lameter node_set(pol->v.preferred_node, nodes); 19331a75a6c8SChristoph Lameter break; 19341a75a6c8SChristoph Lameter 19351a75a6c8SChristoph Lameter case MPOL_BIND: 193619770b32SMel Gorman /* Fall through */ 19371a75a6c8SChristoph Lameter case MPOL_INTERLEAVE: 19381a75a6c8SChristoph Lameter nodes = pol->v.nodes; 19391a75a6c8SChristoph Lameter break; 19401a75a6c8SChristoph Lameter 19411a75a6c8SChristoph Lameter default: 19421a75a6c8SChristoph Lameter BUG(); 19431a75a6c8SChristoph Lameter return -EFAULT; 19441a75a6c8SChristoph Lameter } 19451a75a6c8SChristoph Lameter 19461a75a6c8SChristoph Lameter l = strlen(policy_types[mode]); 19471a75a6c8SChristoph Lameter if (buffer + maxlen < p + l + 1) 19481a75a6c8SChristoph Lameter return -ENOSPC; 19491a75a6c8SChristoph Lameter 19501a75a6c8SChristoph Lameter strcpy(p, policy_types[mode]); 19511a75a6c8SChristoph Lameter p += l; 19521a75a6c8SChristoph Lameter 1953f5b087b5SDavid Rientjes if (flags) { 1954f5b087b5SDavid Rientjes int need_bar = 0; 1955f5b087b5SDavid Rientjes 1956f5b087b5SDavid Rientjes if (buffer + maxlen < p + 2) 1957f5b087b5SDavid Rientjes return -ENOSPC; 1958f5b087b5SDavid Rientjes *p++ = '='; 1959f5b087b5SDavid Rientjes 1960f5b087b5SDavid Rientjes if (flags & MPOL_F_STATIC_NODES) 1961f5b087b5SDavid Rientjes p += sprintf(p, "%sstatic", need_bar++ ? "|" : ""); 19624c50bc01SDavid Rientjes if (flags & MPOL_F_RELATIVE_NODES) 19634c50bc01SDavid Rientjes p += sprintf(p, "%srelative", need_bar++ ? 
"|" : ""); 1964f5b087b5SDavid Rientjes } 1965f5b087b5SDavid Rientjes 19661a75a6c8SChristoph Lameter if (!nodes_empty(nodes)) { 19671a75a6c8SChristoph Lameter if (buffer + maxlen < p + 2) 19681a75a6c8SChristoph Lameter return -ENOSPC; 19691a75a6c8SChristoph Lameter *p++ = '='; 19701a75a6c8SChristoph Lameter p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 19711a75a6c8SChristoph Lameter } 19721a75a6c8SChristoph Lameter return p - buffer; 19731a75a6c8SChristoph Lameter } 19741a75a6c8SChristoph Lameter 19751a75a6c8SChristoph Lameter struct numa_maps { 19761a75a6c8SChristoph Lameter unsigned long pages; 19771a75a6c8SChristoph Lameter unsigned long anon; 1978397874dfSChristoph Lameter unsigned long active; 1979397874dfSChristoph Lameter unsigned long writeback; 19801a75a6c8SChristoph Lameter unsigned long mapcount_max; 1981397874dfSChristoph Lameter unsigned long dirty; 1982397874dfSChristoph Lameter unsigned long swapcache; 19831a75a6c8SChristoph Lameter unsigned long node[MAX_NUMNODES]; 19841a75a6c8SChristoph Lameter }; 19851a75a6c8SChristoph Lameter 1986397874dfSChristoph Lameter static void gather_stats(struct page *page, void *private, int pte_dirty) 19871a75a6c8SChristoph Lameter { 19881a75a6c8SChristoph Lameter struct numa_maps *md = private; 19891a75a6c8SChristoph Lameter int count = page_mapcount(page); 19901a75a6c8SChristoph Lameter 19911a75a6c8SChristoph Lameter md->pages++; 1992397874dfSChristoph Lameter if (pte_dirty || PageDirty(page)) 1993397874dfSChristoph Lameter md->dirty++; 1994397874dfSChristoph Lameter 1995397874dfSChristoph Lameter if (PageSwapCache(page)) 1996397874dfSChristoph Lameter md->swapcache++; 1997397874dfSChristoph Lameter 1998397874dfSChristoph Lameter if (PageActive(page)) 1999397874dfSChristoph Lameter md->active++; 2000397874dfSChristoph Lameter 2001397874dfSChristoph Lameter if (PageWriteback(page)) 2002397874dfSChristoph Lameter md->writeback++; 20031a75a6c8SChristoph Lameter 20041a75a6c8SChristoph Lameter if (PageAnon(page)) 20051a75a6c8SChristoph Lameter md->anon++; 20061a75a6c8SChristoph Lameter 2007397874dfSChristoph Lameter if (count > md->mapcount_max) 2008397874dfSChristoph Lameter md->mapcount_max = count; 2009397874dfSChristoph Lameter 20101a75a6c8SChristoph Lameter md->node[page_to_nid(page)]++; 20111a75a6c8SChristoph Lameter } 20121a75a6c8SChristoph Lameter 20137f709ed0SAndrew Morton #ifdef CONFIG_HUGETLB_PAGE 2014397874dfSChristoph Lameter static void check_huge_range(struct vm_area_struct *vma, 2015397874dfSChristoph Lameter unsigned long start, unsigned long end, 2016397874dfSChristoph Lameter struct numa_maps *md) 2017397874dfSChristoph Lameter { 2018397874dfSChristoph Lameter unsigned long addr; 2019397874dfSChristoph Lameter struct page *page; 2020397874dfSChristoph Lameter 2021397874dfSChristoph Lameter for (addr = start; addr < end; addr += HPAGE_SIZE) { 2022397874dfSChristoph Lameter pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2023397874dfSChristoph Lameter pte_t pte; 2024397874dfSChristoph Lameter 2025397874dfSChristoph Lameter if (!ptep) 2026397874dfSChristoph Lameter continue; 2027397874dfSChristoph Lameter 2028397874dfSChristoph Lameter pte = *ptep; 2029397874dfSChristoph Lameter if (pte_none(pte)) 2030397874dfSChristoph Lameter continue; 2031397874dfSChristoph Lameter 2032397874dfSChristoph Lameter page = pte_page(pte); 2033397874dfSChristoph Lameter if (!page) 2034397874dfSChristoph Lameter continue; 2035397874dfSChristoph Lameter 2036397874dfSChristoph Lameter gather_stats(page, md, pte_dirty(*ptep)); 
2037397874dfSChristoph Lameter } 2038397874dfSChristoph Lameter } 20397f709ed0SAndrew Morton #else 20407f709ed0SAndrew Morton static inline void check_huge_range(struct vm_area_struct *vma, 20417f709ed0SAndrew Morton unsigned long start, unsigned long end, 20427f709ed0SAndrew Morton struct numa_maps *md) 20437f709ed0SAndrew Morton { 20447f709ed0SAndrew Morton } 20457f709ed0SAndrew Morton #endif 2046397874dfSChristoph Lameter 20471a75a6c8SChristoph Lameter int show_numa_map(struct seq_file *m, void *v) 20481a75a6c8SChristoph Lameter { 204999f89551SEric W. Biederman struct proc_maps_private *priv = m->private; 20501a75a6c8SChristoph Lameter struct vm_area_struct *vma = v; 20511a75a6c8SChristoph Lameter struct numa_maps *md; 2052397874dfSChristoph Lameter struct file *file = vma->vm_file; 2053397874dfSChristoph Lameter struct mm_struct *mm = vma->vm_mm; 2054480eccf9SLee Schermerhorn struct mempolicy *pol; 20551a75a6c8SChristoph Lameter int n; 20561a75a6c8SChristoph Lameter char buffer[50]; 20571a75a6c8SChristoph Lameter 2058397874dfSChristoph Lameter if (!mm) 20591a75a6c8SChristoph Lameter return 0; 20601a75a6c8SChristoph Lameter 20611a75a6c8SChristoph Lameter md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); 20621a75a6c8SChristoph Lameter if (!md) 20631a75a6c8SChristoph Lameter return 0; 20641a75a6c8SChristoph Lameter 2065480eccf9SLee Schermerhorn pol = get_vma_policy(priv->task, vma, vma->vm_start); 2066480eccf9SLee Schermerhorn mpol_to_str(buffer, sizeof(buffer), pol); 2067480eccf9SLee Schermerhorn /* 2068480eccf9SLee Schermerhorn * unref shared or other task's mempolicy 2069480eccf9SLee Schermerhorn */ 2070480eccf9SLee Schermerhorn if (pol != &default_policy && pol != current->mempolicy) 2071480eccf9SLee Schermerhorn __mpol_free(pol); 20721a75a6c8SChristoph Lameter 2073397874dfSChristoph Lameter seq_printf(m, "%08lx %s", vma->vm_start, buffer); 2074397874dfSChristoph Lameter 2075397874dfSChristoph Lameter if (file) { 2076397874dfSChristoph Lameter seq_printf(m, " file="); 2077c32c2f63SJan Blunck seq_path(m, &file->f_path, "\n\t= "); 2078397874dfSChristoph Lameter } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 2079397874dfSChristoph Lameter seq_printf(m, " heap"); 2080397874dfSChristoph Lameter } else if (vma->vm_start <= mm->start_stack && 2081397874dfSChristoph Lameter vma->vm_end >= mm->start_stack) { 2082397874dfSChristoph Lameter seq_printf(m, " stack"); 2083397874dfSChristoph Lameter } 2084397874dfSChristoph Lameter 2085397874dfSChristoph Lameter if (is_vm_hugetlb_page(vma)) { 2086397874dfSChristoph Lameter check_huge_range(vma, vma->vm_start, vma->vm_end, md); 2087397874dfSChristoph Lameter seq_printf(m, " huge"); 2088397874dfSChristoph Lameter } else { 2089397874dfSChristoph Lameter check_pgd_range(vma, vma->vm_start, vma->vm_end, 209056bbd65dSChristoph Lameter &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); 2091397874dfSChristoph Lameter } 2092397874dfSChristoph Lameter 2093397874dfSChristoph Lameter if (!md->pages) 2094397874dfSChristoph Lameter goto out; 20951a75a6c8SChristoph Lameter 20961a75a6c8SChristoph Lameter if (md->anon) 20971a75a6c8SChristoph Lameter seq_printf(m," anon=%lu",md->anon); 20981a75a6c8SChristoph Lameter 2099397874dfSChristoph Lameter if (md->dirty) 2100397874dfSChristoph Lameter seq_printf(m," dirty=%lu",md->dirty); 2101397874dfSChristoph Lameter 2102397874dfSChristoph Lameter if (md->pages != md->anon && md->pages != md->dirty) 2103397874dfSChristoph Lameter seq_printf(m, " mapped=%lu", md->pages); 2104397874dfSChristoph Lameter 
2105397874dfSChristoph Lameter if (md->mapcount_max > 1) 2106397874dfSChristoph Lameter seq_printf(m, " mapmax=%lu", md->mapcount_max); 2107397874dfSChristoph Lameter 2108397874dfSChristoph Lameter if (md->swapcache) 2109397874dfSChristoph Lameter seq_printf(m," swapcache=%lu", md->swapcache); 2110397874dfSChristoph Lameter 2111397874dfSChristoph Lameter if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 2112397874dfSChristoph Lameter seq_printf(m," active=%lu", md->active); 2113397874dfSChristoph Lameter 2114397874dfSChristoph Lameter if (md->writeback) 2115397874dfSChristoph Lameter seq_printf(m," writeback=%lu", md->writeback); 2116397874dfSChristoph Lameter 211756bbd65dSChristoph Lameter for_each_node_state(n, N_HIGH_MEMORY) 21181a75a6c8SChristoph Lameter if (md->node[n]) 21191a75a6c8SChristoph Lameter seq_printf(m, " N%d=%lu", n, md->node[n]); 2120397874dfSChristoph Lameter out: 21211a75a6c8SChristoph Lameter seq_putc(m, '\n'); 21221a75a6c8SChristoph Lameter kfree(md); 21231a75a6c8SChristoph Lameter 21241a75a6c8SChristoph Lameter if (m->count < m->size) 212599f89551SEric W. Biederman m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; 21261a75a6c8SChristoph Lameter return 0; 21271a75a6c8SChristoph Lameter } 2128
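
/*
 * Usage sketch (illustrative, not part of the kernel code above): how the
 * sys_set_mempolicy()/sys_mbind() entry points are typically reached from
 * user space.  This assumes a Linux system whose <sys/syscall.h> defines
 * __NR_set_mempolicy and __NR_mbind; the MPOL_* values below mirror the
 * kernel's numbering (MPOL_DEFAULT=0, MPOL_PREFERRED=1, MPOL_BIND=2,
 * MPOL_INTERLEAVE=3) and are redefined here only to keep the example
 * self-contained.  get_nodes() treats 'maxnode' as a bit count, so a single
 * unsigned long mask passed with maxnode = 8 * sizeof(unsigned long) covers
 * nodes 0..63 on a 64-bit system.
 *
 * For reference, show_numa_map() above emits /proc/<pid>/numa_maps lines of
 * the form (values illustrative):
 *   00400000 default file=/bin/cat mapped=7 mapmax=2 N0=7
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define MPOL_BIND	2	/* assumed to match the kernel's value */
#define MPOL_INTERLEAVE	3	/* assumed to match the kernel's value */

int main(void)
{
	unsigned long nodes01 = (1UL << 0) | (1UL << 1);  /* nodes 0 and 1 */
	unsigned long node0 = 1UL << 0;                   /* node 0 only   */
	unsigned long maxnode = 8 * sizeof(unsigned long);
	size_t len = 1UL << 20;
	void *p;

	/* Process policy: interleave future allocations across nodes 0-1. */
	if (syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &nodes01, maxnode))
		perror("set_mempolicy");

	/* VMA policy: restrict one anonymous mapping to node 0. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	if (syscall(__NR_mbind, p, len, MPOL_BIND, &node0, maxnode, 0))
		perror("mbind");

	/* Touch the pages so they are actually allocated under the policy. */
	memset(p, 0, len);
	return 0;
}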