1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory should
9 * be allocated on.
10 *
11 * Support several memory policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * preferred many Try a set of nodes first before normal fallback. This is
35 * similar to preferred without the special case.
36 *
37 * default Allocate on the local node first, or when on a VMA
38 * use the process policy. This is what Linux always did
39 * in a NUMA aware kernel and still does by, ahem, default.
40 *
41 * The process policy is applied for most non-interrupt memory allocations
42 * in that process' context. Interrupts ignore the policies and always
43 * try to allocate on the local CPU. The VMA policy is only applied for memory
44 * allocations for a VMA in the VM.
45 *
46 * Currently there are a few corner cases in swapping where the policy
47 * is not applied, but the majority should be handled. When process policy
48 * is used it is not remembered over swap outs/swap ins.
49 *
50 * Only the highest zone in the zone hierarchy gets policied. Allocations
51 * requesting a lower zone just use default policy. This implies that
52 * on systems with highmem, kernel lowmem allocations don't get policied.
53 * Same with GFP_DMA allocations.
54 *
55 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
56 * all users and remembered even when nobody has memory mapped.
57 */
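
/*
 * Illustrative userspace usage of the policies above (a minimal, hedged
 * sketch assuming the <numaif.h> wrappers from libnuma; "addr"/"len" are
 * assumed to name an existing mapping; error handling omitted):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	  // nodes 0 and 1
 *
 *	// interleave all future allocations of this task over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	// restrict an existing range to node 0, moving misplaced pages
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0), MPOL_MF_MOVE);
 */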
58
59 /* Notebook:
60 fix mmap readahead to honour policy and enable policy for any page cache
61 object
62 statistics for bigpages
63 global policy for page cache? currently it uses process policy. Requires
64 first item above.
65 handle mremap for shared memory (currently ignored for the policy)
66 grows down?
67 make bind policy root only? It can trigger oom much faster and the
68 kernel is not always grateful with that.
69 */
70
71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72
73 #include <linux/mempolicy.h>
74 #include <linux/pagewalk.h>
75 #include <linux/highmem.h>
76 #include <linux/hugetlb.h>
77 #include <linux/kernel.h>
78 #include <linux/sched.h>
79 #include <linux/sched/mm.h>
80 #include <linux/sched/numa_balancing.h>
81 #include <linux/sched/task.h>
82 #include <linux/nodemask.h>
83 #include <linux/cpuset.h>
84 #include <linux/slab.h>
85 #include <linux/string.h>
86 #include <linux/export.h>
87 #include <linux/nsproxy.h>
88 #include <linux/interrupt.h>
89 #include <linux/init.h>
90 #include <linux/compat.h>
91 #include <linux/ptrace.h>
92 #include <linux/swap.h>
93 #include <linux/seq_file.h>
94 #include <linux/proc_fs.h>
95 #include <linux/migrate.h>
96 #include <linux/ksm.h>
97 #include <linux/rmap.h>
98 #include <linux/security.h>
99 #include <linux/syscalls.h>
100 #include <linux/ctype.h>
101 #include <linux/mm_inline.h>
102 #include <linux/mmu_notifier.h>
103 #include <linux/printk.h>
104 #include <linux/swapops.h>
105
106 #include <asm/tlbflush.h>
107 #include <asm/tlb.h>
108 #include <linux/uaccess.h>
109
110 #include "internal.h"
111
112 /* Internal flags */
113 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
114 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
115 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
116
117 static struct kmem_cache *policy_cache;
118 static struct kmem_cache *sn_cache;
119
120 /* Highest zone. A specific allocation for a zone below that is not
121 policied. */
122 enum zone_type policy_zone = 0;
123
124 /*
125 * run-time system-wide default policy => local allocation
126 */
127 static struct mempolicy default_policy = {
128 .refcnt = ATOMIC_INIT(1), /* never free it */
129 .mode = MPOL_LOCAL,
130 };
131
132 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
133
134 /**
135 * numa_nearest_node - Find nearest node by state
136 * @node: Node id to start the search
137 * @state: State to filter the search
138 *
139 * Look up the closest node by distance if @node is not in @state.
140 *
141 * Return: this @node if it is in @state, otherwise the closest node by distance
142 */
143 int numa_nearest_node(int node, unsigned int state)
144 {
145 int min_dist = INT_MAX, dist, n, min_node;
146
147 if (state >= NR_NODE_STATES)
148 return -EINVAL;
149
150 if (node == NUMA_NO_NODE || node_state(node, state))
151 return node;
152
153 min_node = node;
154 for_each_node_state(n, state) {
155 dist = node_distance(node, n);
156 if (dist < min_dist) {
157 min_dist = dist;
158 min_node = n;
159 }
160 }
161
162 return min_node;
163 }
164 EXPORT_SYMBOL_GPL(numa_nearest_node);
165
166 struct mempolicy *get_task_policy(struct task_struct *p)
167 {
168 struct mempolicy *pol = p->mempolicy;
169 int node;
170
171 if (pol)
172 return pol;
173
174 node = numa_node_id();
175 if (node != NUMA_NO_NODE) {
176 pol = &preferred_node_policy[node];
177 /* preferred_node_policy is not initialised early in boot */
178 if (pol->mode)
179 return pol;
180 }
181
182 return &default_policy;
183 }
184
185 static const struct mempolicy_operations {
186 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
187 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
188 } mpol_ops[MPOL_MAX];
189
190 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
191 {
192 return pol->flags & MPOL_MODE_FLAGS;
193 }
194
195 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
196 const nodemask_t *rel)
197 {
198 nodemask_t tmp;
199 nodes_fold(tmp, *orig, nodes_weight(*rel));
200 nodes_onto(*ret, tmp, *rel);
201 }
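
/*
 * A worked example of the relative-nodes remap above (illustrative
 * numbers): with a user nodemask of {0,2} and a cpuset mems_allowed of
 * {4,5,6,7}, nodes_fold() wraps the user mask modulo the cpuset weight
 * (4), leaving {0,2}, and nodes_onto() then maps bit N onto the Nth set
 * bit of the cpuset, yielding {4,6} -- i.e. "the 0th and 2nd nodes I am
 * allowed to use", whichever nodes those happen to be.
 */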
202
203 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
204 {
205 if (nodes_empty(*nodes))
206 return -EINVAL;
207 pol->nodes = *nodes;
208 return 0;
209 }
210
211 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
212 {
213 if (nodes_empty(*nodes))
214 return -EINVAL;
215
216 nodes_clear(pol->nodes);
217 node_set(first_node(*nodes), pol->nodes);
218 return 0;
219 }
220
221 /*
222 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
223 * any, for the new policy. mpol_new() has already validated the nodes
224 * parameter with respect to the policy mode and flags.
225 *
226 * Must be called holding task's alloc_lock to protect task's mems_allowed
227 * and mempolicy. May also be called holding the mmap_lock for write.
228 */
229 static int mpol_set_nodemask(struct mempolicy *pol,
230 const nodemask_t *nodes, struct nodemask_scratch *nsc)
231 {
232 int ret;
233
234 /*
235 * Default (pol==NULL) and local memory policies are not
236 * subject to any remapping. They also do not need any special
237 * constructor.
238 */
239 if (!pol || pol->mode == MPOL_LOCAL)
240 return 0;
241
242 /* Check N_MEMORY */
243 nodes_and(nsc->mask1,
244 cpuset_current_mems_allowed, node_states[N_MEMORY]);
245
246 VM_BUG_ON(!nodes);
247
248 if (pol->flags & MPOL_F_RELATIVE_NODES)
249 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
250 else
251 nodes_and(nsc->mask2, *nodes, nsc->mask1);
252
253 if (mpol_store_user_nodemask(pol))
254 pol->w.user_nodemask = *nodes;
255 else
256 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
257
258 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 return ret;
260 }
261
262 /*
263 * This function just creates a new policy, does some checks and simple
264 * initialization. You must invoke mpol_set_nodemask() to set nodes.
265 */
266 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
267 nodemask_t *nodes)
268 {
269 struct mempolicy *policy;
270
271 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
272 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
273
274 if (mode == MPOL_DEFAULT) {
275 if (nodes && !nodes_empty(*nodes))
276 return ERR_PTR(-EINVAL);
277 return NULL;
278 }
279 VM_BUG_ON(!nodes);
280
281 /*
282 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
283 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
284 * All other modes require a valid pointer to a non-empty nodemask.
285 */
286 if (mode == MPOL_PREFERRED) {
287 if (nodes_empty(*nodes)) {
288 if (((flags & MPOL_F_STATIC_NODES) ||
289 (flags & MPOL_F_RELATIVE_NODES)))
290 return ERR_PTR(-EINVAL);
291
292 mode = MPOL_LOCAL;
293 }
294 } else if (mode == MPOL_LOCAL) {
295 if (!nodes_empty(*nodes) ||
296 (flags & MPOL_F_STATIC_NODES) ||
297 (flags & MPOL_F_RELATIVE_NODES))
298 return ERR_PTR(-EINVAL);
299 } else if (nodes_empty(*nodes))
300 return ERR_PTR(-EINVAL);
301 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
302 if (!policy)
303 return ERR_PTR(-ENOMEM);
304 atomic_set(&policy->refcnt, 1);
305 policy->mode = mode;
306 policy->flags = flags;
307 policy->home_node = NUMA_NO_NODE;
308
309 return policy;
310 }
311
312 /* Slow path of a mpol destructor. */
313 void __mpol_put(struct mempolicy *p)
314 {
315 if (!atomic_dec_and_test(&p->refcnt))
316 return;
317 kmem_cache_free(policy_cache, p);
318 }
319
320 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
321 {
322 }
323
324 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
325 {
326 nodemask_t tmp;
327
328 if (pol->flags & MPOL_F_STATIC_NODES)
329 nodes_and(tmp, pol->w.user_nodemask, *nodes);
330 else if (pol->flags & MPOL_F_RELATIVE_NODES)
331 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 else {
333 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
334 *nodes);
335 pol->w.cpuset_mems_allowed = *nodes;
336 }
337
338 if (nodes_empty(tmp))
339 tmp = *nodes;
340
341 pol->nodes = tmp;
342 }
343
344 static void mpol_rebind_preferred(struct mempolicy *pol,
345 const nodemask_t *nodes)
346 {
347 pol->w.cpuset_mems_allowed = *nodes;
348 }
349
350 /*
351 * mpol_rebind_policy - Migrate a policy to a different set of nodes
352 *
353 * Per-vma policies are protected by mmap_lock. Allocations using per-task
354 * policies are protected by task->mems_allowed_seq to prevent a premature
355 * OOM/allocation failure due to parallel nodemask modification.
356 */
357 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
358 {
359 if (!pol || pol->mode == MPOL_LOCAL)
360 return;
361 if (!mpol_store_user_nodemask(pol) &&
362 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
363 return;
364
365 mpol_ops[pol->mode].rebind(pol, newmask);
366 }
367
368 /*
369 * Wrapper for mpol_rebind_policy() that just requires task
370 * pointer, and updates task mempolicy.
371 *
372 * Called with task's alloc_lock held.
373 */
374
375 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
376 {
377 mpol_rebind_policy(tsk->mempolicy, new);
378 }
379
380 /*
381 * Rebind each vma in mm to new nodemask.
382 *
383 * Call holding a reference to mm. Takes mm->mmap_lock during call.
384 */
385
386 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
387 {
388 struct vm_area_struct *vma;
389 VMA_ITERATOR(vmi, mm, 0);
390
391 mmap_write_lock(mm);
392 for_each_vma(vmi, vma) {
393 vma_start_write(vma);
394 mpol_rebind_policy(vma->vm_policy, new);
395 }
396 mmap_write_unlock(mm);
397 }
398
399 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
400 [MPOL_DEFAULT] = {
401 .rebind = mpol_rebind_default,
402 },
403 [MPOL_INTERLEAVE] = {
404 .create = mpol_new_nodemask,
405 .rebind = mpol_rebind_nodemask,
406 },
407 [MPOL_PREFERRED] = {
408 .create = mpol_new_preferred,
409 .rebind = mpol_rebind_preferred,
410 },
411 [MPOL_BIND] = {
412 .create = mpol_new_nodemask,
413 .rebind = mpol_rebind_nodemask,
414 },
415 [MPOL_LOCAL] = {
416 .rebind = mpol_rebind_default,
417 },
418 [MPOL_PREFERRED_MANY] = {
419 .create = mpol_new_nodemask,
420 .rebind = mpol_rebind_preferred,
421 },
422 };
423
424 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
425 unsigned long flags);
426
427 static bool strictly_unmovable(unsigned long flags)
428 {
429 /*
430 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
431 * if any misplaced page is found.
432 */
433 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
434 MPOL_MF_STRICT;
435 }
436
437 struct queue_pages {
438 struct list_head *pagelist;
439 unsigned long flags;
440 nodemask_t *nmask;
441 unsigned long start;
442 unsigned long end;
443 struct vm_area_struct *first;
444 struct folio *large; /* note last large folio encountered */
445 long nr_failed; /* could not be isolated at this time */
446 };
447
448 /*
449 * Check if the folio's nid is in qp->nmask.
450 *
451 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
452 * in the invert of qp->nmask.
453 */
454 static inline bool queue_folio_required(struct folio *folio,
455 struct queue_pages *qp)
456 {
457 int nid = folio_nid(folio);
458 unsigned long flags = qp->flags;
459
460 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
461 }
462
463 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
464 {
465 struct folio *folio;
466 struct queue_pages *qp = walk->private;
467
468 if (unlikely(is_pmd_migration_entry(*pmd))) {
469 qp->nr_failed++;
470 return;
471 }
472 folio = pfn_folio(pmd_pfn(*pmd));
473 if (is_huge_zero_page(&folio->page)) {
474 walk->action = ACTION_CONTINUE;
475 return;
476 }
477 if (!queue_folio_required(folio, qp))
478 return;
479 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
480 !vma_migratable(walk->vma) ||
481 !migrate_folio_add(folio, qp->pagelist, qp->flags))
482 qp->nr_failed++;
483 }
484
485 /*
486 * Scan through folios, checking if they satisfy the required conditions,
487 * moving them from LRU to local pagelist for migration if they do (or not).
488 *
489 * queue_folios_pte_range() has two possible return values:
490 * 0 - continue walking to scan for more, even if an existing folio on the
491 * wrong node could not be isolated and queued for migration.
492 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
493 * and an existing folio was on a node that does not follow the policy.
494 */
495 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
496 unsigned long end, struct mm_walk *walk)
497 {
498 struct vm_area_struct *vma = walk->vma;
499 struct folio *folio;
500 struct queue_pages *qp = walk->private;
501 unsigned long flags = qp->flags;
502 pte_t *pte, *mapped_pte;
503 pte_t ptent;
504 spinlock_t *ptl;
505
506 ptl = pmd_trans_huge_lock(pmd, vma);
507 if (ptl) {
508 queue_folios_pmd(pmd, walk);
509 spin_unlock(ptl);
510 goto out;
511 }
512
513 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
514 if (!pte) {
515 walk->action = ACTION_AGAIN;
516 return 0;
517 }
518 for (; addr != end; pte++, addr += PAGE_SIZE) {
519 ptent = ptep_get(pte);
520 if (pte_none(ptent))
521 continue;
522 if (!pte_present(ptent)) {
523 if (is_migration_entry(pte_to_swp_entry(ptent)))
524 qp->nr_failed++;
525 continue;
526 }
527 folio = vm_normal_folio(vma, addr, ptent);
528 if (!folio || folio_is_zone_device(folio))
529 continue;
530 /*
531 * vm_normal_folio() filters out zero pages, but there might
532 * still be reserved folios to skip, perhaps in a VDSO.
533 */
534 if (folio_test_reserved(folio))
535 continue;
536 if (!queue_folio_required(folio, qp))
537 continue;
538 if (folio_test_large(folio)) {
539 /*
540 * A large folio can only be isolated from LRU once,
541 * but may be mapped by many PTEs (and Copy-On-Write may
542 * intersperse PTEs of other, order 0, folios). This is
543 * a common case, so don't mistake it for failure (but
544 * there can be other cases of multi-mapped pages which
545 * this quick check does not help to filter out - and a
546 * search of the pagelist might grow to be prohibitive).
547 *
548 * migrate_pages(&pagelist) returns nr_failed folios, so
549 * check "large" now so that queue_pages_range() returns
550 * a comparable nr_failed folios. This does imply that
551 * if folio could not be isolated for some racy reason
552 * at its first PTE, later PTEs will not give it another
553 * chance of isolation; but keeps the accounting simple.
554 */
555 if (folio == qp->large)
556 continue;
557 qp->large = folio;
558 }
559 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
560 !vma_migratable(vma) ||
561 !migrate_folio_add(folio, qp->pagelist, flags)) {
562 qp->nr_failed++;
563 if (strictly_unmovable(flags))
564 break;
565 }
566 }
567 pte_unmap_unlock(mapped_pte, ptl);
568 cond_resched();
569 out:
570 if (qp->nr_failed && strictly_unmovable(flags))
571 return -EIO;
572 return 0;
573 }
574
575 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
576 unsigned long addr, unsigned long end,
577 struct mm_walk *walk)
578 {
579 #ifdef CONFIG_HUGETLB_PAGE
580 struct queue_pages *qp = walk->private;
581 unsigned long flags = qp->flags;
582 struct folio *folio;
583 spinlock_t *ptl;
584 pte_t entry;
585
586 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
587 entry = huge_ptep_get(pte);
588 if (!pte_present(entry)) {
589 if (unlikely(is_hugetlb_entry_migration(entry)))
590 qp->nr_failed++;
591 goto unlock;
592 }
593 folio = pfn_folio(pte_pfn(entry));
594 if (!queue_folio_required(folio, qp))
595 goto unlock;
596 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
597 !vma_migratable(walk->vma)) {
598 qp->nr_failed++;
599 goto unlock;
600 }
601 /*
602 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
603 * Choosing not to migrate a shared folio is not counted as a failure.
604 *
605 * To check if the folio is shared, ideally we want to make sure
606 * every page is mapped to the same process. Doing that is very
607 * expensive, so check the estimated sharers of the folio instead.
608 */
609 if ((flags & MPOL_MF_MOVE_ALL) ||
610 (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
611 if (!isolate_hugetlb(folio, qp->pagelist))
612 qp->nr_failed++;
613 unlock:
614 spin_unlock(ptl);
615 if (qp->nr_failed && strictly_unmovable(flags))
616 return -EIO;
617 #endif
618 return 0;
619 }
620
621 #ifdef CONFIG_NUMA_BALANCING
622 /*
623 * This is used to mark a range of virtual addresses to be inaccessible.
624 * These are later cleared by a NUMA hinting fault. Depending on these
625 * faults, pages may be migrated for better NUMA placement.
626 *
627 * This is assuming that NUMA faults are handled using PROT_NONE. If
628 * an architecture makes a different choice, it will need further
629 * changes to the core.
630 */
631 unsigned long change_prot_numa(struct vm_area_struct *vma,
632 unsigned long addr, unsigned long end)
633 {
634 struct mmu_gather tlb;
635 long nr_updated;
636
637 tlb_gather_mmu(&tlb, vma->vm_mm);
638
639 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
640 if (nr_updated > 0)
641 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
642
643 tlb_finish_mmu(&tlb);
644
645 return nr_updated;
646 }
647 #else
648 static unsigned long change_prot_numa(struct vm_area_struct *vma,
649 unsigned long addr, unsigned long end)
650 {
651 return 0;
652 }
653 #endif /* CONFIG_NUMA_BALANCING */
654
655 static int queue_pages_test_walk(unsigned long start, unsigned long end,
656 struct mm_walk *walk)
657 {
658 struct vm_area_struct *next, *vma = walk->vma;
659 struct queue_pages *qp = walk->private;
660 unsigned long endvma = vma->vm_end;
661 unsigned long flags = qp->flags;
662
663 /* range check first */
664 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
665
666 if (!qp->first) {
667 qp->first = vma;
668 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
669 (qp->start < vma->vm_start))
670 /* hole at head side of range */
671 return -EFAULT;
672 }
673 next = find_vma(vma->vm_mm, vma->vm_end);
674 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
675 ((vma->vm_end < qp->end) &&
676 (!next || vma->vm_end < next->vm_start)))
677 /* hole at middle or tail of range */
678 return -EFAULT;
679
680 /*
681 * MPOL_MF_STRICT must still be checked, so that -EIO can be returned
682 * if needed, regardless of vma_migratable
683 */
684 if (!vma_migratable(vma) &&
685 !(flags & MPOL_MF_STRICT))
686 return 1;
687
688 if (endvma > end)
689 endvma = end;
690
691 if (flags & MPOL_MF_LAZY) {
692 /* Similar to task_numa_work, skip inaccessible VMAs */
693 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
694 !(vma->vm_flags & VM_MIXEDMAP))
695 change_prot_numa(vma, start, endvma);
696 return 1;
697 }
698
699 /*
700 * Check page nodes, and queue pages to move, in the current vma.
701 * But if neither moving nor strict checking is requested, the scan can be skipped.
702 */
703 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
704 return 0;
705 return 1;
706 }
707
708 static const struct mm_walk_ops queue_pages_walk_ops = {
709 .hugetlb_entry = queue_folios_hugetlb,
710 .pmd_entry = queue_folios_pte_range,
711 .test_walk = queue_pages_test_walk,
712 .walk_lock = PGWALK_RDLOCK,
713 };
714
715 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
716 .hugetlb_entry = queue_folios_hugetlb,
717 .pmd_entry = queue_folios_pte_range,
718 .test_walk = queue_pages_test_walk,
719 .walk_lock = PGWALK_WRLOCK,
720 };
721
722 /*
723 * Walk through page tables and collect pages to be migrated.
724 *
725 * If pages found in a given range are not on the required set of @nodes,
726 * and migration is allowed, they are isolated and queued to @pagelist.
727 *
728 * queue_pages_range() may return:
729 * 0 - all pages already on the right node, or successfully queued for moving
730 * (or neither strict checking nor moving requested: only range checking).
731 * >0 - this number of misplaced folios could not be queued for moving
732 * (a hugetlbfs page or a transparent huge page being counted as 1).
733 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
734 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
735 */
736 static long
737 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
738 nodemask_t *nodes, unsigned long flags,
739 struct list_head *pagelist)
740 {
741 int err;
742 struct queue_pages qp = {
743 .pagelist = pagelist,
744 .flags = flags,
745 .nmask = nodes,
746 .start = start,
747 .end = end,
748 .first = NULL,
749 };
750 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
751 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
752
753 err = walk_page_range(mm, start, end, ops, &qp);
754
755 if (!qp.first)
756 /* whole range in hole */
757 err = -EFAULT;
758
759 return err ? : qp.nr_failed;
760 }
761
762 /*
763 * Apply policy to a single VMA
764 * This must be called with the mmap_lock held for writing.
765 */
766 static int vma_replace_policy(struct vm_area_struct *vma,
767 struct mempolicy *pol)
768 {
769 int err;
770 struct mempolicy *old;
771 struct mempolicy *new;
772
773 vma_assert_write_locked(vma);
774
775 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
776 vma->vm_start, vma->vm_end, vma->vm_pgoff,
777 vma->vm_ops, vma->vm_file,
778 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
779
780 new = mpol_dup(pol);
781 if (IS_ERR(new))
782 return PTR_ERR(new);
783
784 if (vma->vm_ops && vma->vm_ops->set_policy) {
785 err = vma->vm_ops->set_policy(vma, new);
786 if (err)
787 goto err_out;
788 }
789
790 old = vma->vm_policy;
791 vma->vm_policy = new; /* protected by mmap_lock */
792 mpol_put(old);
793
794 return 0;
795 err_out:
796 mpol_put(new);
797 return err;
798 }
799
800 /* Split or merge the VMA (if required) and apply the new policy */
801 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
802 struct vm_area_struct **prev, unsigned long start,
803 unsigned long end, struct mempolicy *new_pol)
804 {
805 struct vm_area_struct *merged;
806 unsigned long vmstart, vmend;
807 pgoff_t pgoff;
808 int err;
809
810 vmend = min(end, vma->vm_end);
811 if (start > vma->vm_start) {
812 *prev = vma;
813 vmstart = start;
814 } else {
815 vmstart = vma->vm_start;
816 }
817
818 if (mpol_equal(vma_policy(vma), new_pol)) {
819 *prev = vma;
820 return 0;
821 }
822
823 pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
824 merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
825 vma->anon_vma, vma->vm_file, pgoff, new_pol,
826 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
827 if (merged) {
828 *prev = merged;
829 return vma_replace_policy(merged, new_pol);
830 }
831
832 if (vma->vm_start != vmstart) {
833 err = split_vma(vmi, vma, vmstart, 1);
834 if (err)
835 return err;
836 }
837
838 if (vma->vm_end != vmend) {
839 err = split_vma(vmi, vma, vmend, 0);
840 if (err)
841 return err;
842 }
843
844 *prev = vma;
845 return vma_replace_policy(vma, new_pol);
846 }
847
848 /* Set the process memory policy */
849 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
850 nodemask_t *nodes)
851 {
852 struct mempolicy *new, *old;
853 NODEMASK_SCRATCH(scratch);
854 int ret;
855
856 if (!scratch)
857 return -ENOMEM;
858
859 new = mpol_new(mode, flags, nodes);
860 if (IS_ERR(new)) {
861 ret = PTR_ERR(new);
862 goto out;
863 }
864
865 task_lock(current);
866 ret = mpol_set_nodemask(new, nodes, scratch);
867 if (ret) {
868 task_unlock(current);
869 mpol_put(new);
870 goto out;
871 }
872
873 old = current->mempolicy;
874 current->mempolicy = new;
875 if (new && new->mode == MPOL_INTERLEAVE)
876 current->il_prev = MAX_NUMNODES-1;
877 task_unlock(current);
878 mpol_put(old);
879 ret = 0;
880 out:
881 NODEMASK_SCRATCH_FREE(scratch);
882 return ret;
883 }
884
885 /*
886 * Return nodemask for policy for get_mempolicy() query
887 *
888 * Called with task's alloc_lock held
889 */
890 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
891 {
892 nodes_clear(*nodes);
893 if (p == &default_policy)
894 return;
895
896 switch (p->mode) {
897 case MPOL_BIND:
898 case MPOL_INTERLEAVE:
899 case MPOL_PREFERRED:
900 case MPOL_PREFERRED_MANY:
901 *nodes = p->nodes;
902 break;
903 case MPOL_LOCAL:
904 /* return empty node mask for local allocation */
905 break;
906 default:
907 BUG();
908 }
909 }
910
911 static int lookup_node(struct mm_struct *mm, unsigned long addr)
912 {
913 struct page *p = NULL;
914 int ret;
915
916 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
917 if (ret > 0) {
918 ret = page_to_nid(p);
919 put_page(p);
920 }
921 return ret;
922 }
923
924 /* Retrieve NUMA policy */
925 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
926 unsigned long addr, unsigned long flags)
927 {
928 int err;
929 struct mm_struct *mm = current->mm;
930 struct vm_area_struct *vma = NULL;
931 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
932
933 if (flags &
934 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
935 return -EINVAL;
936
937 if (flags & MPOL_F_MEMS_ALLOWED) {
938 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
939 return -EINVAL;
940 *policy = 0; /* just so it's initialized */
941 task_lock(current);
942 *nmask = cpuset_current_mems_allowed;
943 task_unlock(current);
944 return 0;
945 }
946
947 if (flags & MPOL_F_ADDR) {
948 /*
949 * Do NOT fall back to task policy if the
950 * vma/shared policy at addr is NULL. We
951 * want to return MPOL_DEFAULT in this case.
952 */
953 mmap_read_lock(mm);
954 vma = vma_lookup(mm, addr);
955 if (!vma) {
956 mmap_read_unlock(mm);
957 return -EFAULT;
958 }
959 if (vma->vm_ops && vma->vm_ops->get_policy)
960 pol = vma->vm_ops->get_policy(vma, addr);
961 else
962 pol = vma->vm_policy;
963 } else if (addr)
964 return -EINVAL;
965
966 if (!pol)
967 pol = &default_policy; /* indicates default behavior */
968
969 if (flags & MPOL_F_NODE) {
970 if (flags & MPOL_F_ADDR) {
971 /*
972 * Take a refcount on the mpol, because we are about to
973 * drop the mmap_lock, after which only "pol" remains
974 * valid, "vma" is stale.
975 */
976 pol_refcount = pol;
977 vma = NULL;
978 mpol_get(pol);
979 mmap_read_unlock(mm);
980 err = lookup_node(mm, addr);
981 if (err < 0)
982 goto out;
983 *policy = err;
984 } else if (pol == current->mempolicy &&
985 pol->mode == MPOL_INTERLEAVE) {
986 *policy = next_node_in(current->il_prev, pol->nodes);
987 } else {
988 err = -EINVAL;
989 goto out;
990 }
991 } else {
992 *policy = pol == &default_policy ? MPOL_DEFAULT :
993 pol->mode;
994 /*
995 * Internal mempolicy flags must be masked off before exposing
996 * the policy to userspace.
997 */
998 *policy |= (pol->flags & MPOL_MODE_FLAGS);
999 }
1000
1001 err = 0;
1002 if (nmask) {
1003 if (mpol_store_user_nodemask(pol)) {
1004 *nmask = pol->w.user_nodemask;
1005 } else {
1006 task_lock(current);
1007 get_policy_nodemask(pol, nmask);
1008 task_unlock(current);
1009 }
1010 }
1011
1012 out:
1013 mpol_cond_put(pol);
1014 if (vma)
1015 mmap_read_unlock(mm);
1016 if (pol_refcount)
1017 mpol_put(pol_refcount);
1018 return err;
1019 }
1020
1021 #ifdef CONFIG_MIGRATION
1022 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1023 unsigned long flags)
1024 {
1025 /*
1026 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1027 * Choosing not to migrate a shared folio is not counted as a failure.
1028 *
1029 * To check if the folio is shared, ideally we want to make sure
1030 * every page is mapped to the same process. Doing that is very
1031 * expensive, so check the estimated sharers of the folio instead.
1032 */
1033 if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1034 if (folio_isolate_lru(folio)) {
1035 list_add_tail(&folio->lru, foliolist);
1036 node_stat_mod_folio(folio,
1037 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1038 folio_nr_pages(folio));
1039 } else {
1040 /*
1041 * A non-movable folio may reach here. There may also be
1042 * folios that are temporarily off the LRU, or non-LRU movable folios.
1043 * Treat them as unmovable folios since they can't be
1044 * isolated, so they can't be moved at the moment.
1045 */
1046 return false;
1047 }
1048 }
1049 return true;
1050 }
1051
1052 /*
1053 * Migrate pages from one node to a target node.
1054 * Returns error or the number of pages not migrated.
1055 */
1056 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1057 int flags)
1058 {
1059 nodemask_t nmask;
1060 struct vm_area_struct *vma;
1061 LIST_HEAD(pagelist);
1062 long nr_failed;
1063 long err = 0;
1064 struct migration_target_control mtc = {
1065 .nid = dest,
1066 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1067 };
1068
1069 nodes_clear(nmask);
1070 node_set(source, nmask);
1071
1072 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1073 vma = find_vma(mm, 0);
1074 if (unlikely(!vma)) {
1075 mmap_read_unlock(mm);
1076 return 0;
1077 }
1078
1079 /*
1080 * This does not migrate the range, but isolates all pages that
1081 * need migration. Between passing in the full user address
1082 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1083 * but passes back the count of pages which could not be isolated.
1084 */
1085 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1086 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1087
1088 if (!list_empty(&pagelist)) {
1089 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1090 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1091 if (err)
1092 putback_movable_pages(&pagelist);
1093 }
1094
1095 if (err >= 0)
1096 err += nr_failed;
1097 return err;
1098 }
1099
1100 /*
1101 * Move pages between the two nodesets so as to preserve the physical
1102 * layout as much as possible.
1103 *
1104 * Returns the number of pages that could not be moved.
1105 */
1106 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1107 const nodemask_t *to, int flags)
1108 {
1109 long nr_failed = 0;
1110 long err = 0;
1111 nodemask_t tmp;
1112
1113 lru_cache_disable();
1114
1115 mmap_read_lock(mm);
1116
1117 /*
1118 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1119 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1120 * bit in 'tmp', and return that <source, dest> pair for migration.
1121 * The pair of nodemasks 'to' and 'from' define the map.
1122 *
1123 * If no pair of bits is found that way, fallback to picking some
1124 * pair of 'source' and 'dest' bits that are not the same. If the
1125 * 'source' and 'dest' bits are the same, this represents a node
1126 * that will be migrating to itself, so no pages need move.
1127 *
1128 * If no bits are left in 'tmp', or if all remaining bits left
1129 * in 'tmp' correspond to the same bit in 'to', return false
1130 * (nothing left to migrate).
1131 *
1132 * This lets us pick a pair of nodes to migrate between, such that
1133 * if possible the dest node is not already occupied by some other
1134 * source node, minimizing the risk of overloading the memory on a
1135 * node that would happen if we migrated incoming memory to a node
1136 * before migrating outgoing memory from that same node.
1137 *
1138 * A single scan of tmp is sufficient. As we go, we remember the
1139 * most recent <s, d> pair that moved (s != d). If we find a pair
1140 * that not only moved, but what's better, moved to an empty slot
1141 * (d is not set in tmp), then we break out then, with that pair.
1142 * Otherwise when we finish scanning tmp, we at least have the
1143 * most recent <s, d> pair that moved. If we get all the way through
1144 * the scan of tmp without finding any node that moved, much less
1145 * moved to an empty node, then there is nothing left worth migrating.
1146 */
1147
1148 tmp = *from;
1149 while (!nodes_empty(tmp)) {
1150 int s, d;
1151 int source = NUMA_NO_NODE;
1152 int dest = 0;
1153
1154 for_each_node_mask(s, tmp) {
1155
1156 /*
1157 * do_migrate_pages() tries to maintain the relative
1158 * node relationship of the pages established between
1159 * threads and memory areas.
1160 *
1161 * However if the number of source nodes is not equal to
1162 * the number of destination nodes we can not preserve
1163 * this node relative relationship. In that case, skip
1164 * copying memory from a node that is in the destination
1165 * mask.
1166 *
1167 * Example: [2,3,4] -> [3,4,5] moves everything.
1168 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1169 */
1170
1171 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1172 (node_isset(s, *to)))
1173 continue;
1174
1175 d = node_remap(s, *from, *to);
1176 if (s == d)
1177 continue;
1178
1179 source = s; /* Node moved. Memorize */
1180 dest = d;
1181
1182 /* dest not in remaining from nodes? */
1183 if (!node_isset(dest, tmp))
1184 break;
1185 }
1186 if (source == NUMA_NO_NODE)
1187 break;
1188
1189 node_clear(source, tmp);
1190 err = migrate_to_node(mm, source, dest, flags);
1191 if (err > 0)
1192 nr_failed += err;
1193 if (err < 0)
1194 break;
1195 }
1196 mmap_read_unlock(mm);
1197
1198 lru_cache_enable();
1199 if (err < 0)
1200 return err;
1201 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1202 }
1203
1204 /*
1205 * Allocate a new page for page migration based on vma policy.
1206 * Start by assuming the page is mapped by the same vma that contains @start.
1207 * Search forward from there, if not. N.B., this assumes that the
1208 * list of pages handed to migrate_pages()--which is how we get here--
1209 * is in virtual address order.
1210 */
1211 static struct folio *new_folio(struct folio *src, unsigned long start)
1212 {
1213 struct vm_area_struct *vma;
1214 unsigned long address;
1215 VMA_ITERATOR(vmi, current->mm, start);
1216 gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
1217
1218 for_each_vma(vmi, vma) {
1219 address = page_address_in_vma(&src->page, vma);
1220 if (address != -EFAULT)
1221 break;
1222 }
1223
1224 if (folio_test_hugetlb(src)) {
1225 return alloc_hugetlb_folio_vma(folio_hstate(src),
1226 vma, address);
1227 }
1228
1229 if (folio_test_large(src))
1230 gfp = GFP_TRANSHUGE;
1231
1232 /*
1233 * if !vma, vma_alloc_folio() will use task or system default policy
1234 */
1235 return vma_alloc_folio(gfp, folio_order(src), vma, address,
1236 folio_test_large(src));
1237 }
1238 #else
1239
1240 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1241 unsigned long flags)
1242 {
1243 return false;
1244 }
1245
1246 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1247 const nodemask_t *to, int flags)
1248 {
1249 return -ENOSYS;
1250 }
1251
1252 static struct folio *new_folio(struct folio *src, unsigned long start)
1253 {
1254 return NULL;
1255 }
1256 #endif
1257
1258 static long do_mbind(unsigned long start, unsigned long len,
1259 unsigned short mode, unsigned short mode_flags,
1260 nodemask_t *nmask, unsigned long flags)
1261 {
1262 struct mm_struct *mm = current->mm;
1263 struct vm_area_struct *vma, *prev;
1264 struct vma_iterator vmi;
1265 struct mempolicy *new;
1266 unsigned long end;
1267 long err;
1268 long nr_failed;
1269 LIST_HEAD(pagelist);
1270
1271 if (flags & ~(unsigned long)MPOL_MF_VALID)
1272 return -EINVAL;
1273 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1274 return -EPERM;
1275
1276 if (start & ~PAGE_MASK)
1277 return -EINVAL;
1278
1279 if (mode == MPOL_DEFAULT)
1280 flags &= ~MPOL_MF_STRICT;
1281
1282 len = PAGE_ALIGN(len);
1283 end = start + len;
1284
1285 if (end < start)
1286 return -EINVAL;
1287 if (end == start)
1288 return 0;
1289
1290 new = mpol_new(mode, mode_flags, nmask);
1291 if (IS_ERR(new))
1292 return PTR_ERR(new);
1293
1294 if (flags & MPOL_MF_LAZY)
1295 new->flags |= MPOL_F_MOF;
1296
1297 /*
1298 * If we are using the default policy then operation
1299 * on discontinuous address spaces is okay after all
1300 */
1301 if (!new)
1302 flags |= MPOL_MF_DISCONTIG_OK;
1303
1304 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1305 start, start + len, mode, mode_flags,
1306 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1307
1308 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1309 lru_cache_disable();
1310 {
1311 NODEMASK_SCRATCH(scratch);
1312 if (scratch) {
1313 mmap_write_lock(mm);
1314 err = mpol_set_nodemask(new, nmask, scratch);
1315 if (err)
1316 mmap_write_unlock(mm);
1317 } else
1318 err = -ENOMEM;
1319 NODEMASK_SCRATCH_FREE(scratch);
1320 }
1321 if (err)
1322 goto mpol_out;
1323
1324 /*
1325 * Lock the VMAs before scanning for pages to migrate,
1326 * to ensure we don't miss a concurrently inserted page.
1327 */
1328 nr_failed = queue_pages_range(mm, start, end, nmask,
1329 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1330
1331 if (nr_failed < 0) {
1332 err = nr_failed;
1333 } else {
1334 vma_iter_init(&vmi, mm, start);
1335 prev = vma_prev(&vmi);
1336 for_each_vma_range(vmi, vma, end) {
1337 err = mbind_range(&vmi, vma, &prev, start, end, new);
1338 if (err)
1339 break;
1340 }
1341 }
1342
1343 if (!err) {
1344 if (!list_empty(&pagelist)) {
1345 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1346 nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
1347 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1348 }
1349 if (nr_failed && (flags & MPOL_MF_STRICT))
1350 err = -EIO;
1351 }
1352
1353 if (!list_empty(&pagelist))
1354 putback_movable_pages(&pagelist);
1355
1356 mmap_write_unlock(mm);
1357 mpol_out:
1358 mpol_put(new);
1359 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1360 lru_cache_enable();
1361 return err;
1362 }
1363
1364 /*
1365 * User space interface with variable sized bitmaps for nodelists.
1366 */
1367 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1368 unsigned long maxnode)
1369 {
1370 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1371 int ret;
1372
1373 if (in_compat_syscall())
1374 ret = compat_get_bitmap(mask,
1375 (const compat_ulong_t __user *)nmask,
1376 maxnode);
1377 else
1378 ret = copy_from_user(mask, nmask,
1379 nlongs * sizeof(unsigned long));
1380
1381 if (ret)
1382 return -EFAULT;
1383
1384 if (maxnode % BITS_PER_LONG)
1385 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1386
1387 return 0;
1388 }
1389
1390 /* Copy a node mask from user space. */
1391 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1392 unsigned long maxnode)
1393 {
1394 --maxnode;
1395 nodes_clear(*nodes);
1396 if (maxnode == 0 || !nmask)
1397 return 0;
1398 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1399 return -EINVAL;
1400
1401 /*
1402 * When the user specifies more nodes than supported, just check
1403 * that the unsupported part is all zero, one word at a time,
1404 * starting at the end.
1405 */
1406 while (maxnode > MAX_NUMNODES) {
1407 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1408 unsigned long t;
1409
1410 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1411 return -EFAULT;
1412
1413 if (maxnode - bits >= MAX_NUMNODES) {
1414 maxnode -= bits;
1415 } else {
1416 maxnode = MAX_NUMNODES;
1417 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1418 }
1419 if (t)
1420 return -EINVAL;
1421 }
1422
1423 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1424 }
1425
1426 /* Copy a kernel node mask to user space */
1427 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1428 nodemask_t *nodes)
1429 {
1430 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1431 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1432 bool compat = in_compat_syscall();
1433
1434 if (compat)
1435 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1436
1437 if (copy > nbytes) {
1438 if (copy > PAGE_SIZE)
1439 return -EINVAL;
1440 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1441 return -EFAULT;
1442 copy = nbytes;
1443 maxnode = nr_node_ids;
1444 }
1445
1446 if (compat)
1447 return compat_put_bitmap((compat_ulong_t __user *)mask,
1448 nodes_addr(*nodes), maxnode);
1449
1450 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1451 }
1452
1453 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1454 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1455 {
1456 *flags = *mode & MPOL_MODE_FLAGS;
1457 *mode &= ~MPOL_MODE_FLAGS;
1458
1459 if ((unsigned int)(*mode) >= MPOL_MAX)
1460 return -EINVAL;
1461 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1462 return -EINVAL;
1463 if (*flags & MPOL_F_NUMA_BALANCING) {
1464 if (*mode != MPOL_BIND)
1465 return -EINVAL;
1466 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1467 }
1468 return 0;
1469 }
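
/*
 * For reference, the mode value arriving from userspace packs the optional
 * flags into its upper bits; for example a caller may pass
 * MPOL_BIND | MPOL_F_STATIC_NODES to mbind(2), which the helper above
 * splits into *mode == MPOL_BIND and *flags == MPOL_F_STATIC_NODES
 * (an illustrative combination, not an exhaustive list).
 */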
1470
1471 static long kernel_mbind(unsigned long start, unsigned long len,
1472 unsigned long mode, const unsigned long __user *nmask,
1473 unsigned long maxnode, unsigned int flags)
1474 {
1475 unsigned short mode_flags;
1476 nodemask_t nodes;
1477 int lmode = mode;
1478 int err;
1479
1480 start = untagged_addr(start);
1481 err = sanitize_mpol_flags(&lmode, &mode_flags);
1482 if (err)
1483 return err;
1484
1485 err = get_nodes(&nodes, nmask, maxnode);
1486 if (err)
1487 return err;
1488
1489 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1490 }
1491
1492 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1493 unsigned long, home_node, unsigned long, flags)
1494 {
1495 struct mm_struct *mm = current->mm;
1496 struct vm_area_struct *vma, *prev;
1497 struct mempolicy *new, *old;
1498 unsigned long end;
1499 int err = -ENOENT;
1500 VMA_ITERATOR(vmi, mm, start);
1501
1502 start = untagged_addr(start);
1503 if (start & ~PAGE_MASK)
1504 return -EINVAL;
1505 /*
1506 * flags is reserved for future extension.
1507 */
1508 if (flags != 0)
1509 return -EINVAL;
1510
1511 /*
1512 * Check home_node is online to avoid accessing uninitialized
1513 * NODE_DATA.
1514 */
1515 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1516 return -EINVAL;
1517
1518 len = PAGE_ALIGN(len);
1519 end = start + len;
1520
1521 if (end < start)
1522 return -EINVAL;
1523 if (end == start)
1524 return 0;
1525 mmap_write_lock(mm);
1526 prev = vma_prev(&vmi);
1527 for_each_vma_range(vmi, vma, end) {
1528 /*
1529 * If any vma in the range has a policy other than MPOL_BIND
1530 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1531 * the home node for vmas we have already updated.
1532 */
1533 old = vma_policy(vma);
1534 if (!old) {
1535 prev = vma;
1536 continue;
1537 }
1538 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1539 err = -EOPNOTSUPP;
1540 break;
1541 }
1542 new = mpol_dup(old);
1543 if (IS_ERR(new)) {
1544 err = PTR_ERR(new);
1545 break;
1546 }
1547
1548 vma_start_write(vma);
1549 new->home_node = home_node;
1550 err = mbind_range(&vmi, vma, &prev, start, end, new);
1551 mpol_put(new);
1552 if (err)
1553 break;
1554 }
1555 mmap_write_unlock(mm);
1556 return err;
1557 }
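
/*
 * Illustrative userspace usage (a hedged sketch; the C library may not
 * provide a wrapper, so the raw syscall is shown and error handling is
 * omitted):
 *
 *	syscall(__NR_set_mempolicy_home_node, addr, len, node, 0);
 *
 * applied to a range that was previously given an MPOL_BIND or
 * MPOL_PREFERRED_MANY policy via mbind(2).
 */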
1558
1559 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1560 unsigned long, mode, const unsigned long __user *, nmask,
1561 unsigned long, maxnode, unsigned int, flags)
1562 {
1563 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1564 }
1565
1566 /* Set the process memory policy */
1567 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1568 unsigned long maxnode)
1569 {
1570 unsigned short mode_flags;
1571 nodemask_t nodes;
1572 int lmode = mode;
1573 int err;
1574
1575 err = sanitize_mpol_flags(&lmode, &mode_flags);
1576 if (err)
1577 return err;
1578
1579 err = get_nodes(&nodes, nmask, maxnode);
1580 if (err)
1581 return err;
1582
1583 return do_set_mempolicy(lmode, mode_flags, &nodes);
1584 }
1585
1586 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1587 unsigned long, maxnode)
1588 {
1589 return kernel_set_mempolicy(mode, nmask, maxnode);
1590 }
1591
1592 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1593 const unsigned long __user *old_nodes,
1594 const unsigned long __user *new_nodes)
1595 {
1596 struct mm_struct *mm = NULL;
1597 struct task_struct *task;
1598 nodemask_t task_nodes;
1599 int err;
1600 nodemask_t *old;
1601 nodemask_t *new;
1602 NODEMASK_SCRATCH(scratch);
1603
1604 if (!scratch)
1605 return -ENOMEM;
1606
1607 old = &scratch->mask1;
1608 new = &scratch->mask2;
1609
1610 err = get_nodes(old, old_nodes, maxnode);
1611 if (err)
1612 goto out;
1613
1614 err = get_nodes(new, new_nodes, maxnode);
1615 if (err)
1616 goto out;
1617
1618 /* Find the mm_struct */
1619 rcu_read_lock();
1620 task = pid ? find_task_by_vpid(pid) : current;
1621 if (!task) {
1622 rcu_read_unlock();
1623 err = -ESRCH;
1624 goto out;
1625 }
1626 get_task_struct(task);
1627
1628 err = -EINVAL;
1629
1630 /*
1631 * Check if this process has the right to modify the specified process.
1632 * Use the regular "ptrace_may_access()" checks.
1633 */
1634 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1635 rcu_read_unlock();
1636 err = -EPERM;
1637 goto out_put;
1638 }
1639 rcu_read_unlock();
1640
1641 task_nodes = cpuset_mems_allowed(task);
1642 /* Is the user allowed to access the target nodes? */
1643 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1644 err = -EPERM;
1645 goto out_put;
1646 }
1647
1648 task_nodes = cpuset_mems_allowed(current);
1649 nodes_and(*new, *new, task_nodes);
1650 if (nodes_empty(*new))
1651 goto out_put;
1652
1653 err = security_task_movememory(task);
1654 if (err)
1655 goto out_put;
1656
1657 mm = get_task_mm(task);
1658 put_task_struct(task);
1659
1660 if (!mm) {
1661 err = -EINVAL;
1662 goto out;
1663 }
1664
1665 err = do_migrate_pages(mm, old, new,
1666 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1667
1668 mmput(mm);
1669 out:
1670 NODEMASK_SCRATCH_FREE(scratch);
1671
1672 return err;
1673
1674 out_put:
1675 put_task_struct(task);
1676 goto out;
1677
1678 }
1679
1680 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1681 const unsigned long __user *, old_nodes,
1682 const unsigned long __user *, new_nodes)
1683 {
1684 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1685 }
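
/*
 * Illustrative userspace usage (assuming the <numaif.h> wrapper from
 * libnuma; error handling omitted):
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	migrate_pages(pid, 8 * sizeof(from), &from, &to);
 *
 * which asks the kernel to move the target task's pages from node 0 to
 * node 1, subject to the cpuset and ptrace checks made above.
 */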
1686
1687
1688 /* Retrieve NUMA policy */
1689 static int kernel_get_mempolicy(int __user *policy,
1690 unsigned long __user *nmask,
1691 unsigned long maxnode,
1692 unsigned long addr,
1693 unsigned long flags)
1694 {
1695 int err;
1696 int pval;
1697 nodemask_t nodes;
1698
1699 if (nmask != NULL && maxnode < nr_node_ids)
1700 return -EINVAL;
1701
1702 addr = untagged_addr(addr);
1703
1704 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1705
1706 if (err)
1707 return err;
1708
1709 if (policy && put_user(pval, policy))
1710 return -EFAULT;
1711
1712 if (nmask)
1713 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1714
1715 return err;
1716 }
1717
1718 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1719 unsigned long __user *, nmask, unsigned long, maxnode,
1720 unsigned long, addr, unsigned long, flags)
1721 {
1722 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1723 }
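
/*
 * A common userspace use of this syscall is asking which node currently
 * backs a given address (a hedged sketch assuming <numaif.h>):
 *
 *	int node = -1;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * which flows through kernel_get_mempolicy() and do_get_mempolicy()
 * above and returns the node id of the page at addr in "node".
 */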
1724
1725 bool vma_migratable(struct vm_area_struct *vma)
1726 {
1727 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1728 return false;
1729
1730 /*
1731 * DAX device mappings require predictable access latency, so avoid
1732 * incurring periodic faults.
1733 */
1734 if (vma_is_dax(vma))
1735 return false;
1736
1737 if (is_vm_hugetlb_page(vma) &&
1738 !hugepage_migration_supported(hstate_vma(vma)))
1739 return false;
1740
1741 /*
1742 * Migration allocates pages in the highest zone. If we cannot
1743 * do so then migration (at least from node to node) is not
1744 * possible.
1745 */
1746 if (vma->vm_file &&
1747 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1748 < policy_zone)
1749 return false;
1750 return true;
1751 }
1752
1753 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1754 unsigned long addr)
1755 {
1756 struct mempolicy *pol = NULL;
1757
1758 if (vma) {
1759 if (vma->vm_ops && vma->vm_ops->get_policy) {
1760 pol = vma->vm_ops->get_policy(vma, addr);
1761 } else if (vma->vm_policy) {
1762 pol = vma->vm_policy;
1763
1764 /*
1765 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1766 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1767 * count on these policies which will be dropped by
1768 * mpol_cond_put() later
1769 */
1770 if (mpol_needs_cond_ref(pol))
1771 mpol_get(pol);
1772 }
1773 }
1774
1775 return pol;
1776 }
1777
1778 /*
1779 * get_vma_policy(@vma, @addr)
1780 * @vma: virtual memory area whose policy is sought
1781 * @addr: address in @vma for shared policy lookup
1782 *
1783 * Returns effective policy for a VMA at specified address.
1784 * Falls back to current->mempolicy or system default policy, as necessary.
1785 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1786 * count--added by the get_policy() vm_op, as appropriate--to protect against
1787 * freeing by another task. It is the caller's responsibility to free the
1788 * extra reference for shared policies.
1789 */
1790 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1791 unsigned long addr)
1792 {
1793 struct mempolicy *pol = __get_vma_policy(vma, addr);
1794
1795 if (!pol)
1796 pol = get_task_policy(current);
1797
1798 return pol;
1799 }
1800
1801 bool vma_policy_mof(struct vm_area_struct *vma)
1802 {
1803 struct mempolicy *pol;
1804
1805 if (vma->vm_ops && vma->vm_ops->get_policy) {
1806 bool ret = false;
1807
1808 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1809 if (pol && (pol->flags & MPOL_F_MOF))
1810 ret = true;
1811 mpol_cond_put(pol);
1812
1813 return ret;
1814 }
1815
1816 pol = vma->vm_policy;
1817 if (!pol)
1818 pol = get_task_policy(current);
1819
1820 return pol->flags & MPOL_F_MOF;
1821 }
1822
1823 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1824 {
1825 enum zone_type dynamic_policy_zone = policy_zone;
1826
1827 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1828
1829 /*
1830 * if policy->nodes has movable memory only,
1831 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1832 *
1833 * policy->nodes is intersect with node_states[N_MEMORY].
1834 * so if the following test fails, it implies
1835 * policy->nodes has movable memory only.
1836 */
1837 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1838 dynamic_policy_zone = ZONE_MOVABLE;
1839
1840 return zone >= dynamic_policy_zone;
1841 }
1842
1843 /*
1844 * Return a nodemask representing a mempolicy for filtering nodes for
1845 * page allocation
1846 */
1847 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1848 {
1849 int mode = policy->mode;
1850
1851 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1852 if (unlikely(mode == MPOL_BIND) &&
1853 apply_policy_zone(policy, gfp_zone(gfp)) &&
1854 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1855 return &policy->nodes;
1856
1857 if (mode == MPOL_PREFERRED_MANY)
1858 return &policy->nodes;
1859
1860 return NULL;
1861 }
1862
1863 /*
1864 * Return the preferred node id for 'prefer' mempolicy, and return
1865 * the given id for all other policies.
1866 *
1867 * policy_node() is always coupled with policy_nodemask(), which
1868 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1869 */
1870 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1871 {
1872 if (policy->mode == MPOL_PREFERRED) {
1873 nd = first_node(policy->nodes);
1874 } else {
1875 /*
1876 * __GFP_THISNODE shouldn't even be used with the bind policy
1877 * because we might easily break the expectation to stay on the
1878 * requested node and not break the policy.
1879 */
1880 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1881 }
1882
1883 if ((policy->mode == MPOL_BIND ||
1884 policy->mode == MPOL_PREFERRED_MANY) &&
1885 policy->home_node != NUMA_NO_NODE)
1886 return policy->home_node;
1887
1888 return nd;
1889 }
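/*
 * Sketch of the intended pairing, mirroring alloc_pages() further down in
 * this file: policy_node() supplies the starting node while
 * policy_nodemask() supplies the (possibly NULL) filter mask.
 *
 *	nodemask_t *nmask = policy_nodemask(gfp, pol);
 *	int nid = policy_node(gfp, pol, numa_node_id());
 *
 *	page = __alloc_pages(gfp, order, nid, nmask);
 */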
1890
1891 /* Do dynamic interleaving for a process */
1892 static unsigned interleave_nodes(struct mempolicy *policy)
1893 {
1894 unsigned next;
1895 struct task_struct *me = current;
1896
1897 next = next_node_in(me->il_prev, policy->nodes);
1898 if (next < MAX_NUMNODES)
1899 me->il_prev = next;
1900 return next;
1901 }
1902
1903 /*
1904 * Depending on the memory policy provide a node from which to allocate the
1905 * next slab entry.
1906 */
1907 unsigned int mempolicy_slab_node(void)
1908 {
1909 struct mempolicy *policy;
1910 int node = numa_mem_id();
1911
1912 if (!in_task())
1913 return node;
1914
1915 policy = current->mempolicy;
1916 if (!policy)
1917 return node;
1918
1919 switch (policy->mode) {
1920 case MPOL_PREFERRED:
1921 return first_node(policy->nodes);
1922
1923 case MPOL_INTERLEAVE:
1924 return interleave_nodes(policy);
1925
1926 case MPOL_BIND:
1927 case MPOL_PREFERRED_MANY:
1928 {
1929 struct zoneref *z;
1930
1931 /*
1932 * Follow bind policy behavior and start allocation at the
1933 * first node.
1934 */
1935 struct zonelist *zonelist;
1936 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1937 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1938 z = first_zones_zonelist(zonelist, highest_zoneidx,
1939 &policy->nodes);
1940 return z->zone ? zone_to_nid(z->zone) : node;
1941 }
1942 case MPOL_LOCAL:
1943 return node;
1944
1945 default:
1946 BUG();
1947 }
1948 }
1949
1950 /*
1951 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1952 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1953 * number of present nodes.
1954 */
1955 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1956 {
1957 nodemask_t nodemask = pol->nodes;
1958 unsigned int target, nnodes;
1959 int i;
1960 int nid;
1961 /*
1962 * The barrier will stabilize the nodemask in a register or on
1963 * the stack so that it will stop changing under the code.
1964 *
1965 * Between first_node() and next_node(), pol->nodes could be changed
1966 * by other threads. So we put pol->nodes in a local stack.
1967 */
1968 barrier();
1969
1970 nnodes = nodes_weight(nodemask);
1971 if (!nnodes)
1972 return numa_node_id();
1973 target = (unsigned int)n % nnodes;
1974 nid = first_node(nodemask);
1975 for (i = 0; i < target; i++)
1976 nid = next_node(nid, nodemask);
1977 return nid;
1978 }
1979
1980 /* Determine a node number for interleave */
1981 static inline unsigned interleave_nid(struct mempolicy *pol,
1982 struct vm_area_struct *vma, unsigned long addr, int shift)
1983 {
1984 if (vma) {
1985 unsigned long off;
1986
1987 /*
1988 * for small pages, there is no difference between
1989 * shift and PAGE_SHIFT, so the bit-shift is safe.
1990 * for huge pages, since vm_pgoff is in units of small
1991 * pages, we need to shift off the always 0 bits to get
1992 * a useful offset.
1993 */
1994 BUG_ON(shift < PAGE_SHIFT);
1995 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1996 off += (addr - vma->vm_start) >> shift;
1997 return offset_il_node(pol, off);
1998 } else
1999 return interleave_nodes(pol);
2000 }
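/*
 * Worked example of the static interleave arithmetic above, with made-up
 * numbers: if pol->nodes = { 0, 2, 5 } and the computed offset is 7, then
 * target = 7 % 3 = 1, so offset_il_node() starts at node 0 and advances
 * once, returning node 2. The same offset always maps to the same node,
 * which is what keeps VMA-based interleaving stable across faults.
 */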
2001
2002 #ifdef CONFIG_HUGETLBFS
2003 /*
2004 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2005 * @vma: virtual memory area whose policy is sought
2006 * @addr: address in @vma for shared policy lookup and interleave policy
2007 * @gfp_flags: for requested zone
2008 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2009 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2010 *
2011 * Returns a nid suitable for a huge page allocation and a pointer
2012 * to the struct mempolicy for conditional unref after allocation.
2013 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2014 * to the mempolicy's @nodemask for filtering the zonelist.
2015 *
2016 * Must be protected by read_mems_allowed_begin()
2017 */
2018 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2019 struct mempolicy **mpol, nodemask_t **nodemask)
2020 {
2021 int nid;
2022 int mode;
2023
2024 *mpol = get_vma_policy(vma, addr);
2025 *nodemask = NULL;
2026 mode = (*mpol)->mode;
2027
2028 if (unlikely(mode == MPOL_INTERLEAVE)) {
2029 nid = interleave_nid(*mpol, vma, addr,
2030 huge_page_shift(hstate_vma(vma)));
2031 } else {
2032 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2033 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2034 *nodemask = &(*mpol)->nodes;
2035 }
2036 return nid;
2037 }
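/*
 * Rough sketch of how a hugetlb allocation path consumes huge_node(),
 * per the "Must be protected by read_mems_allowed_begin()" note above.
 * The allocation step itself is elided and hypothetical here:
 *
 *	unsigned int seq;
 *
 *	do {
 *		seq = read_mems_allowed_begin();
 *		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
 *		// ... try to obtain a huge folio from 'nid', filtered by 'nodemask' ...
 *		mpol_cond_put(mpol);
 *	} while (!folio && read_mems_allowed_retry(seq));
 */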
2038
2039 /*
2040 * init_nodemask_of_mempolicy
2041 *
2042 * If the current task's mempolicy is "default" [NULL], return 'false'
2043 * to indicate default policy. Otherwise, extract the policy nodemask
2044 * for 'bind', 'interleave', 'preferred' or 'preferred (many)' policy
2045 * into the argument nodemask, or initialize the argument nodemask to
2046 * contain the single local node for 'local' policy, and return 'true'
2047 * to indicate presence of a non-default mempolicy.
2048 *
2049 * We don't bother with reference counting the mempolicy [mpol_get/put]
2050 * because the current task is examining its own mempolicy and a task's
2051 * mempolicy is only ever changed by the task itself.
2052 *
2053 * N.B., it is the caller's responsibility to free a returned nodemask.
2054 */
2055 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2056 {
2057 struct mempolicy *mempolicy;
2058
2059 if (!(mask && current->mempolicy))
2060 return false;
2061
2062 task_lock(current);
2063 mempolicy = current->mempolicy;
2064 switch (mempolicy->mode) {
2065 case MPOL_PREFERRED:
2066 case MPOL_PREFERRED_MANY:
2067 case MPOL_BIND:
2068 case MPOL_INTERLEAVE:
2069 *mask = mempolicy->nodes;
2070 break;
2071
2072 case MPOL_LOCAL:
2073 init_nodemask_of_node(mask, numa_node_id());
2074 break;
2075
2076 default:
2077 BUG();
2078 }
2079 task_unlock(current);
2080
2081 return true;
2082 }
2083 #endif
2084
2085 /*
2086 * mempolicy_in_oom_domain
2087 *
2088 * If tsk's mempolicy is "bind", check for intersection between mask and
2089 * the policy nodemask. Otherwise, return true for all other policies
2090 * including "interleave", as a tsk with "interleave" policy may have
2091 * memory allocated from any node in the system.
2092 *
2093 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2094 */
2095 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2096 const nodemask_t *mask)
2097 {
2098 struct mempolicy *mempolicy;
2099 bool ret = true;
2100
2101 if (!mask)
2102 return ret;
2103
2104 task_lock(tsk);
2105 mempolicy = tsk->mempolicy;
2106 if (mempolicy && mempolicy->mode == MPOL_BIND)
2107 ret = nodes_intersects(mempolicy->nodes, *mask);
2108 task_unlock(tsk);
2109
2110 return ret;
2111 }
2112
2113 /* Allocate a page under the interleave policy.
2114 Uses its own path because it needs to do special NUMA-hit accounting. */
2115 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2116 unsigned nid)
2117 {
2118 struct page *page;
2119
2120 page = __alloc_pages(gfp, order, nid, NULL);
2121 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2122 if (!static_branch_likely(&vm_numa_stat_key))
2123 return page;
2124 if (page && page_to_nid(page) == nid) {
2125 preempt_disable();
2126 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2127 preempt_enable();
2128 }
2129 return page;
2130 }
2131
2132 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2133 int nid, struct mempolicy *pol)
2134 {
2135 struct page *page;
2136 gfp_t preferred_gfp;
2137
2138 /*
2139 * This is a two pass approach. The first pass will only try the
2140 * preferred nodes but skip the direct reclaim and allow the
2141 * allocation to fail, while the second pass will try all the
2142 * nodes in the system.
2143 */
2144 preferred_gfp = gfp | __GFP_NOWARN;
2145 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2146 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2147 if (!page)
2148 page = __alloc_pages(gfp, order, nid, NULL);
2149
2150 return page;
2151 }
2152
2153 /**
2154 * vma_alloc_folio - Allocate a folio for a VMA.
2155 * @gfp: GFP flags.
2156 * @order: Order of the folio.
2157 * @vma: Pointer to VMA or NULL if not available.
2158 * @addr: Virtual address of the allocation. Must be inside @vma.
2159 * @hugepage: For hugepages try only the preferred node if possible.
2160 *
2161 * Allocate a folio for a specific address in @vma, using the appropriate
2162 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2163 * of the mm_struct of the VMA to prevent it from going away. Should be
2164 * used for all allocations for folios that will be mapped into user space.
2165 *
2166 * Return: The folio on success or NULL if allocation fails.
2167 */
2168 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2169 unsigned long addr, bool hugepage)
2170 {
2171 struct mempolicy *pol;
2172 int node = numa_node_id();
2173 struct folio *folio;
2174 int preferred_nid;
2175 nodemask_t *nmask;
2176
2177 pol = get_vma_policy(vma, addr);
2178
2179 if (pol->mode == MPOL_INTERLEAVE) {
2180 struct page *page;
2181 unsigned nid;
2182
2183 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2184 mpol_cond_put(pol);
2185 gfp |= __GFP_COMP;
2186 page = alloc_page_interleave(gfp, order, nid);
2187 return page_rmappable_folio(page);
2188 }
2189
2190 if (pol->mode == MPOL_PREFERRED_MANY) {
2191 struct page *page;
2192
2193 node = policy_node(gfp, pol, node);
2194 gfp |= __GFP_COMP;
2195 page = alloc_pages_preferred_many(gfp, order, node, pol);
2196 mpol_cond_put(pol);
2197 return page_rmappable_folio(page);
2198 }
2199
2200 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2201 int hpage_node = node;
2202
2203 /*
2204 * For hugepage allocation and non-interleave policy which
2205 * allows the current node (or other explicitly preferred
2206 * node) we only try to allocate from the current/preferred
2207 * node and don't fall back to other nodes, as the cost of
2208 * remote accesses would likely offset THP benefits.
2209 *
2210 * If the policy is interleave or does not allow the current
2211 * node in its nodemask, we allocate the standard way.
2212 */
2213 if (pol->mode == MPOL_PREFERRED)
2214 hpage_node = first_node(pol->nodes);
2215
2216 nmask = policy_nodemask(gfp, pol);
2217 if (!nmask || node_isset(hpage_node, *nmask)) {
2218 mpol_cond_put(pol);
2219 /*
2220 * First, try to allocate THP only on local node, but
2221 * don't reclaim unnecessarily, just compact.
2222 */
2223 folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2224 __GFP_NORETRY, order, hpage_node);
2225
2226 /*
2227 * If hugepage allocations are configured to always
2228 * synchronous compact or the vma has been madvised
2229 * to prefer hugepage backing, retry allowing remote
2230 * memory with both reclaim and compact as well.
2231 */
2232 if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2233 folio = __folio_alloc(gfp, order, hpage_node,
2234 nmask);
2235
2236 goto out;
2237 }
2238 }
2239
2240 nmask = policy_nodemask(gfp, pol);
2241 preferred_nid = policy_node(gfp, pol, node);
2242 folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2243 mpol_cond_put(pol);
2244 out:
2245 return folio;
2246 }
2247 EXPORT_SYMBOL(vma_alloc_folio);
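/*
 * A minimal sketch of a fault-path caller; the choice of
 * GFP_HIGHUSER_MOVABLE and order 0 is only an example:
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 */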
2248
2249 /**
2250 * alloc_pages - Allocate pages.
2251 * @gfp: GFP flags.
2252 * @order: Power of two of number of pages to allocate.
2253 *
2254 * Allocate 1 << @order contiguous pages. The physical address of the
2255 * first page is naturally aligned (eg an order-3 allocation will be aligned
2256 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2257 * process is honoured when in process context.
2258 *
2259 * Context: Can be called from any context, providing the appropriate GFP
2260 * flags are used.
2261 * Return: The page on success or NULL if allocation fails.
2262 */
2263 struct page *alloc_pages(gfp_t gfp, unsigned order)
2264 {
2265 struct mempolicy *pol = &default_policy;
2266 struct page *page;
2267
2268 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2269 pol = get_task_policy(current);
2270
2271 /*
2272 * No reference counting needed for current->mempolicy
2273 * nor system default_policy
2274 */
2275 if (pol->mode == MPOL_INTERLEAVE)
2276 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2277 else if (pol->mode == MPOL_PREFERRED_MANY)
2278 page = alloc_pages_preferred_many(gfp, order,
2279 policy_node(gfp, pol, numa_node_id()), pol);
2280 else
2281 page = __alloc_pages(gfp, order,
2282 policy_node(gfp, pol, numa_node_id()),
2283 policy_nodemask(gfp, pol));
2284
2285 return page;
2286 }
2287 EXPORT_SYMBOL(alloc_pages);
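/*
 * Minimal usage sketch: allocate four contiguous pages under the caller's
 * mempolicy and free them again. Purely illustrative.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (page) {
 *		void *buf = page_address(page);
 *		// ... use the (4 << PAGE_SHIFT)-byte buffer ...
 *		__free_pages(page, 2);
 *	}
 */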
2288
2289 struct folio *folio_alloc(gfp_t gfp, unsigned order)
2290 {
2291 return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
2292 }
2293 EXPORT_SYMBOL(folio_alloc);
2294
2295 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2296 struct mempolicy *pol, unsigned long nr_pages,
2297 struct page **page_array)
2298 {
2299 int nodes;
2300 unsigned long nr_pages_per_node;
2301 int delta;
2302 int i;
2303 unsigned long nr_allocated;
2304 unsigned long total_allocated = 0;
2305
2306 nodes = nodes_weight(pol->nodes);
2307 nr_pages_per_node = nr_pages / nodes;
2308 delta = nr_pages - nodes * nr_pages_per_node;
2309
2310 for (i = 0; i < nodes; i++) {
2311 if (delta) {
2312 nr_allocated = __alloc_pages_bulk(gfp,
2313 interleave_nodes(pol), NULL,
2314 nr_pages_per_node + 1, NULL,
2315 page_array);
2316 delta--;
2317 } else {
2318 nr_allocated = __alloc_pages_bulk(gfp,
2319 interleave_nodes(pol), NULL,
2320 nr_pages_per_node, NULL, page_array);
2321 }
2322
2323 page_array += nr_allocated;
2324 total_allocated += nr_allocated;
2325 }
2326
2327 return total_allocated;
2328 }
2329
2330 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2331 struct mempolicy *pol, unsigned long nr_pages,
2332 struct page **page_array)
2333 {
2334 gfp_t preferred_gfp;
2335 unsigned long nr_allocated = 0;
2336
2337 preferred_gfp = gfp | __GFP_NOWARN;
2338 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2339
2340 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2341 nr_pages, NULL, page_array);
2342
2343 if (nr_allocated < nr_pages)
2344 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2345 nr_pages - nr_allocated, NULL,
2346 page_array + nr_allocated);
2347 return nr_allocated;
2348 }
2349
2350 /* Bulk page allocation and the mempolicy need to be considered
2351 * together in some situations, such as vmalloc.
2352 *
2353 * Allocating in bulk can speed up memory allocation considerably,
2354 * especially for the interleave policy.
2355 */
2356 unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2357 unsigned long nr_pages, struct page **page_array)
2358 {
2359 struct mempolicy *pol = &default_policy;
2360
2361 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2362 pol = get_task_policy(current);
2363
2364 if (pol->mode == MPOL_INTERLEAVE)
2365 return alloc_pages_bulk_array_interleave(gfp, pol,
2366 nr_pages, page_array);
2367
2368 if (pol->mode == MPOL_PREFERRED_MANY)
2369 return alloc_pages_bulk_array_preferred_many(gfp,
2370 numa_node_id(), pol, nr_pages, page_array);
2371
2372 return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2373 policy_nodemask(gfp, pol), nr_pages, NULL,
2374 page_array);
2375 }
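/*
 * Usage sketch, in the spirit of the vmalloc caller mentioned above; the
 * array size of 16 is arbitrary and the array is assumed to start zeroed,
 * since only NULL slots are filled:
 *
 *	struct page *pages[16] = { NULL };
 *	unsigned long filled;
 *
 *	filled = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, 16, pages);
 *	// 'filled' entries are now valid; a short count is not an error.
 */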
2376
2377 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2378 {
2379 struct mempolicy *pol = mpol_dup(vma_policy(src));
2380
2381 if (IS_ERR(pol))
2382 return PTR_ERR(pol);
2383 dst->vm_policy = pol;
2384 return 0;
2385 }
2386
2387 /*
2388 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2389 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2390 * with the mems_allowed returned by cpuset_mems_allowed(). This
2391 * keeps mempolicies cpuset relative after its cpuset moves. See
2392 * further kernel/cpuset.c update_nodemask().
2393 *
2394 * current's mempolicy may be rebound by another task (the task that changes
2395 * the cpuset's mems), so we need not do the rebind work for the current task.
2396 */
2397
2398 /* Slow path of a mempolicy duplicate */
2399 struct mempolicy *__mpol_dup(struct mempolicy *old)
2400 {
2401 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2402
2403 if (!new)
2404 return ERR_PTR(-ENOMEM);
2405
2406 /* task's mempolicy is protected by alloc_lock */
2407 if (old == current->mempolicy) {
2408 task_lock(current);
2409 *new = *old;
2410 task_unlock(current);
2411 } else
2412 *new = *old;
2413
2414 if (current_cpuset_is_being_rebound()) {
2415 nodemask_t mems = cpuset_mems_allowed(current);
2416 mpol_rebind_policy(new, &mems);
2417 }
2418 atomic_set(&new->refcnt, 1);
2419 return new;
2420 }
2421
2422 /* Slow path of a mempolicy comparison */
2423 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2424 {
2425 if (!a || !b)
2426 return false;
2427 if (a->mode != b->mode)
2428 return false;
2429 if (a->flags != b->flags)
2430 return false;
2431 if (a->home_node != b->home_node)
2432 return false;
2433 if (mpol_store_user_nodemask(a))
2434 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2435 return false;
2436
2437 switch (a->mode) {
2438 case MPOL_BIND:
2439 case MPOL_INTERLEAVE:
2440 case MPOL_PREFERRED:
2441 case MPOL_PREFERRED_MANY:
2442 return !!nodes_equal(a->nodes, b->nodes);
2443 case MPOL_LOCAL:
2444 return true;
2445 default:
2446 BUG();
2447 return false;
2448 }
2449 }
2450
2451 /*
2452 * Shared memory backing store policy support.
2453 *
2454 * Remember policies even when nobody has shared memory mapped.
2455 * The policies are kept in Red-Black tree linked from the inode.
2456 * They are protected by the sp->lock rwlock, which should be held
2457 * for any accesses to the tree.
2458 */
2459
2460 /*
2461 * Look up the first element intersecting start-end. Caller holds sp->lock for
2462 * reading or for writing
2463 */
2464 static struct sp_node *
2465 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2466 {
2467 struct rb_node *n = sp->root.rb_node;
2468
2469 while (n) {
2470 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2471
2472 if (start >= p->end)
2473 n = n->rb_right;
2474 else if (end <= p->start)
2475 n = n->rb_left;
2476 else
2477 break;
2478 }
2479 if (!n)
2480 return NULL;
2481 for (;;) {
2482 struct sp_node *w = NULL;
2483 struct rb_node *prev = rb_prev(n);
2484 if (!prev)
2485 break;
2486 w = rb_entry(prev, struct sp_node, nd);
2487 if (w->end <= start)
2488 break;
2489 n = prev;
2490 }
2491 return rb_entry(n, struct sp_node, nd);
2492 }
2493
2494 /*
2495 * Insert a new shared policy into the list. Caller holds sp->lock for
2496 * writing.
2497 */
2498 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2499 {
2500 struct rb_node **p = &sp->root.rb_node;
2501 struct rb_node *parent = NULL;
2502 struct sp_node *nd;
2503
2504 while (*p) {
2505 parent = *p;
2506 nd = rb_entry(parent, struct sp_node, nd);
2507 if (new->start < nd->start)
2508 p = &(*p)->rb_left;
2509 else if (new->end > nd->end)
2510 p = &(*p)->rb_right;
2511 else
2512 BUG();
2513 }
2514 rb_link_node(&new->nd, parent, p);
2515 rb_insert_color(&new->nd, &sp->root);
2516 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2517 new->policy ? new->policy->mode : 0);
2518 }
2519
2520 /* Find shared policy intersecting idx */
2521 struct mempolicy *
2522 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2523 {
2524 struct mempolicy *pol = NULL;
2525 struct sp_node *sn;
2526
2527 if (!sp->root.rb_node)
2528 return NULL;
2529 read_lock(&sp->lock);
2530 sn = sp_lookup(sp, idx, idx+1);
2531 if (sn) {
2532 mpol_get(sn->policy);
2533 pol = sn->policy;
2534 }
2535 read_unlock(&sp->lock);
2536 return pol;
2537 }
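/*
 * Sketch of the per-index lookup done by the shmem fault path (assuming
 * shmem's inode layout, SHMEM_I(inode)->policy, for illustration):
 *
 *	pol = mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
 *	// NULL means no shared policy for this range; fall back to the
 *	// task or system default policy.
 *	...
 *	mpol_cond_put(pol);	// returned policies carry MPOL_F_SHARED
 */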
2538
2539 static void sp_free(struct sp_node *n)
2540 {
2541 mpol_put(n->policy);
2542 kmem_cache_free(sn_cache, n);
2543 }
2544
2545 /**
2546 * mpol_misplaced - check whether current page node is valid in policy
2547 *
2548 * @page: page to be checked
2549 * @vma: vm area where page mapped
2550 * @addr: virtual address where page mapped
2551 *
2552 * Lookup current policy node id for vma,addr and "compare to" page's
2553 * node id. Policy determination "mimics" alloc_page_vma().
2554 * Called from fault path where we know the vma and faulting address.
2555 *
2556 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2557 * policy, or a suitable node ID to allocate a replacement page from.
2558 */
2559 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2560 {
2561 struct mempolicy *pol;
2562 struct zoneref *z;
2563 int curnid = page_to_nid(page);
2564 unsigned long pgoff;
2565 int thiscpu = raw_smp_processor_id();
2566 int thisnid = cpu_to_node(thiscpu);
2567 int polnid = NUMA_NO_NODE;
2568 int ret = NUMA_NO_NODE;
2569
2570 pol = get_vma_policy(vma, addr);
2571 if (!(pol->flags & MPOL_F_MOF))
2572 goto out;
2573
2574 switch (pol->mode) {
2575 case MPOL_INTERLEAVE:
2576 pgoff = vma->vm_pgoff;
2577 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2578 polnid = offset_il_node(pol, pgoff);
2579 break;
2580
2581 case MPOL_PREFERRED:
2582 if (node_isset(curnid, pol->nodes))
2583 goto out;
2584 polnid = first_node(pol->nodes);
2585 break;
2586
2587 case MPOL_LOCAL:
2588 polnid = numa_node_id();
2589 break;
2590
2591 case MPOL_BIND:
2592 /* Optimize placement among multiple nodes via NUMA balancing */
2593 if (pol->flags & MPOL_F_MORON) {
2594 if (node_isset(thisnid, pol->nodes))
2595 break;
2596 goto out;
2597 }
2598 fallthrough;
2599
2600 case MPOL_PREFERRED_MANY:
2601 /*
2602 * use current page if in policy nodemask,
2603 * else select nearest allowed node, if any.
2604 * If no allowed nodes, use current [!misplaced].
2605 */
2606 if (node_isset(curnid, pol->nodes))
2607 goto out;
2608 z = first_zones_zonelist(
2609 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2610 gfp_zone(GFP_HIGHUSER),
2611 &pol->nodes);
2612 polnid = zone_to_nid(z->zone);
2613 break;
2614
2615 default:
2616 BUG();
2617 }
2618
2619 /* Migrate the page towards the node whose CPU is referencing it */
2620 if (pol->flags & MPOL_F_MORON) {
2621 polnid = thisnid;
2622
2623 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2624 goto out;
2625 }
2626
2627 if (curnid != polnid)
2628 ret = polnid;
2629 out:
2630 mpol_cond_put(pol);
2631
2632 return ret;
2633 }
2634
2635 /*
2636 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2637 * dropped after task->mempolicy is set to NULL so that any allocation done as
2638 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2639 * policy.
2640 */
2641 void mpol_put_task_policy(struct task_struct *task)
2642 {
2643 struct mempolicy *pol;
2644
2645 task_lock(task);
2646 pol = task->mempolicy;
2647 task->mempolicy = NULL;
2648 task_unlock(task);
2649 mpol_put(pol);
2650 }
2651
2652 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2653 {
2654 pr_debug("deleting %lx-%lx\n", n->start, n->end);
2655 rb_erase(&n->nd, &sp->root);
2656 sp_free(n);
2657 }
2658
2659 static void sp_node_init(struct sp_node *node, unsigned long start,
2660 unsigned long end, struct mempolicy *pol)
2661 {
2662 node->start = start;
2663 node->end = end;
2664 node->policy = pol;
2665 }
2666
2667 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2668 struct mempolicy *pol)
2669 {
2670 struct sp_node *n;
2671 struct mempolicy *newpol;
2672
2673 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2674 if (!n)
2675 return NULL;
2676
2677 newpol = mpol_dup(pol);
2678 if (IS_ERR(newpol)) {
2679 kmem_cache_free(sn_cache, n);
2680 return NULL;
2681 }
2682 newpol->flags |= MPOL_F_SHARED;
2683 sp_node_init(n, start, end, newpol);
2684
2685 return n;
2686 }
2687
2688 /* Replace a policy range. */
2689 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2690 unsigned long end, struct sp_node *new)
2691 {
2692 struct sp_node *n;
2693 struct sp_node *n_new = NULL;
2694 struct mempolicy *mpol_new = NULL;
2695 int ret = 0;
2696
2697 restart:
2698 write_lock(&sp->lock);
2699 n = sp_lookup(sp, start, end);
2700 /* Take care of old policies in the same range. */
2701 while (n && n->start < end) {
2702 struct rb_node *next = rb_next(&n->nd);
2703 if (n->start >= start) {
2704 if (n->end <= end)
2705 sp_delete(sp, n);
2706 else
2707 n->start = end;
2708 } else {
2709 /* Old policy spanning whole new range. */
2710 if (n->end > end) {
2711 if (!n_new)
2712 goto alloc_new;
2713
2714 *mpol_new = *n->policy;
2715 atomic_set(&mpol_new->refcnt, 1);
2716 sp_node_init(n_new, end, n->end, mpol_new);
2717 n->end = start;
2718 sp_insert(sp, n_new);
2719 n_new = NULL;
2720 mpol_new = NULL;
2721 break;
2722 } else
2723 n->end = start;
2724 }
2725 if (!next)
2726 break;
2727 n = rb_entry(next, struct sp_node, nd);
2728 }
2729 if (new)
2730 sp_insert(sp, new);
2731 write_unlock(&sp->lock);
2732 ret = 0;
2733
2734 err_out:
2735 if (mpol_new)
2736 mpol_put(mpol_new);
2737 if (n_new)
2738 kmem_cache_free(sn_cache, n_new);
2739
2740 return ret;
2741
2742 alloc_new:
2743 write_unlock(&sp->lock);
2744 ret = -ENOMEM;
2745 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2746 if (!n_new)
2747 goto err_out;
2748 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2749 if (!mpol_new)
2750 goto err_out;
2751 atomic_set(&mpol_new->refcnt, 1);
2752 goto restart;
2753 }
2754
2755 /**
2756 * mpol_shared_policy_init - initialize shared policy for inode
2757 * @sp: pointer to inode shared policy
2758 * @mpol: struct mempolicy to install
2759 *
2760 * Install non-NULL @mpol in inode's shared policy rb-tree.
2761 * On entry, the current task has a reference on a non-NULL @mpol.
2762 * This must be released on exit.
2763 * This is called from get_inode(), so we can use GFP_KERNEL.
2764 */
2765 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2766 {
2767 int ret;
2768
2769 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2770 rwlock_init(&sp->lock);
2771
2772 if (mpol) {
2773 struct vm_area_struct pvma;
2774 struct mempolicy *new;
2775 NODEMASK_SCRATCH(scratch);
2776
2777 if (!scratch)
2778 goto put_mpol;
2779 /* contextualize the tmpfs mount point mempolicy */
2780 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2781 if (IS_ERR(new))
2782 goto free_scratch; /* no valid nodemask intersection */
2783
2784 task_lock(current);
2785 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2786 task_unlock(current);
2787 if (ret)
2788 goto put_new;
2789
2790 /* Create pseudo-vma that contains just the policy */
2791 vma_init(&pvma, NULL);
2792 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2793 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2794
2795 put_new:
2796 mpol_put(new); /* drop initial ref */
2797 free_scratch:
2798 NODEMASK_SCRATCH_FREE(scratch);
2799 put_mpol:
2800 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2801 }
2802 }
2803
2804 int mpol_set_shared_policy(struct shared_policy *info,
2805 struct vm_area_struct *vma, struct mempolicy *npol)
2806 {
2807 int err;
2808 struct sp_node *new = NULL;
2809 unsigned long sz = vma_pages(vma);
2810
2811 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2812 vma->vm_pgoff,
2813 sz, npol ? npol->mode : -1,
2814 npol ? npol->flags : -1,
2815 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2816
2817 if (npol) {
2818 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2819 if (!new)
2820 return -ENOMEM;
2821 }
2822 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2823 if (err && new)
2824 sp_free(new);
2825 return err;
2826 }
2827
2828 /* Free a backing policy store on inode delete. */
2829 void mpol_free_shared_policy(struct shared_policy *p)
2830 {
2831 struct sp_node *n;
2832 struct rb_node *next;
2833
2834 if (!p->root.rb_node)
2835 return;
2836 write_lock(&p->lock);
2837 next = rb_first(&p->root);
2838 while (next) {
2839 n = rb_entry(next, struct sp_node, nd);
2840 next = rb_next(&n->nd);
2841 sp_delete(p, n);
2842 }
2843 write_unlock(&p->lock);
2844 }
2845
2846 #ifdef CONFIG_NUMA_BALANCING
2847 static int __initdata numabalancing_override;
2848
2849 static void __init check_numabalancing_enable(void)
2850 {
2851 bool numabalancing_default = false;
2852
2853 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2854 numabalancing_default = true;
2855
2856 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2857 if (numabalancing_override)
2858 set_numabalancing_state(numabalancing_override == 1);
2859
2860 if (num_online_nodes() > 1 && !numabalancing_override) {
2861 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2862 numabalancing_default ? "Enabling" : "Disabling");
2863 set_numabalancing_state(numabalancing_default);
2864 }
2865 }
2866
2867 static int __init setup_numabalancing(char *str)
2868 {
2869 int ret = 0;
2870 if (!str)
2871 goto out;
2872
2873 if (!strcmp(str, "enable")) {
2874 numabalancing_override = 1;
2875 ret = 1;
2876 } else if (!strcmp(str, "disable")) {
2877 numabalancing_override = -1;
2878 ret = 1;
2879 }
2880 out:
2881 if (!ret)
2882 pr_warn("Unable to parse numa_balancing=\n");
2883
2884 return ret;
2885 }
2886 __setup("numa_balancing=", setup_numabalancing);
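/*
 * Example (admin side, illustrative only): booting with
 * "numa_balancing=disable" on the kernel command line forces the feature
 * off regardless of CONFIG_NUMA_BALANCING_DEFAULT_ENABLED, and it can be
 * toggled again at runtime via the kernel.numa_balancing sysctl mentioned
 * in the pr_info() above.
 */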
2887 #else
2888 static inline void __init check_numabalancing_enable(void)
2889 {
2890 }
2891 #endif /* CONFIG_NUMA_BALANCING */
2892
2893 /* assumes fs == KERNEL_DS */
2894 void __init numa_policy_init(void)
2895 {
2896 nodemask_t interleave_nodes;
2897 unsigned long largest = 0;
2898 int nid, prefer = 0;
2899
2900 policy_cache = kmem_cache_create("numa_policy",
2901 sizeof(struct mempolicy),
2902 0, SLAB_PANIC, NULL);
2903
2904 sn_cache = kmem_cache_create("shared_policy_node",
2905 sizeof(struct sp_node),
2906 0, SLAB_PANIC, NULL);
2907
2908 for_each_node(nid) {
2909 preferred_node_policy[nid] = (struct mempolicy) {
2910 .refcnt = ATOMIC_INIT(1),
2911 .mode = MPOL_PREFERRED,
2912 .flags = MPOL_F_MOF | MPOL_F_MORON,
2913 .nodes = nodemask_of_node(nid),
2914 };
2915 }
2916
2917 /*
2918 * Set interleaving policy for system init. Interleaving is only
2919 * enabled across suitably sized nodes (default is >= 16MB), falling
2920 * back to the largest node if they're all smaller.
2921 */
2922 nodes_clear(interleave_nodes);
2923 for_each_node_state(nid, N_MEMORY) {
2924 unsigned long total_pages = node_present_pages(nid);
2925
2926 /* Preserve the largest node */
2927 if (largest < total_pages) {
2928 largest = total_pages;
2929 prefer = nid;
2930 }
2931
2932 /* Interleave this node? */
2933 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2934 node_set(nid, interleave_nodes);
2935 }
2936
2937 /* All too small, use the largest */
2938 if (unlikely(nodes_empty(interleave_nodes)))
2939 node_set(prefer, interleave_nodes);
2940
2941 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2942 pr_err("%s: interleaving failed\n", __func__);
2943
2944 check_numabalancing_enable();
2945 }
2946
2947 /* Reset policy of current process to default */
2948 void numa_default_policy(void)
2949 {
2950 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2951 }
2952
2953 /*
2954 * Parse and format mempolicy from/to strings
2955 */
2956
2957 static const char * const policy_modes[] =
2958 {
2959 [MPOL_DEFAULT] = "default",
2960 [MPOL_PREFERRED] = "prefer",
2961 [MPOL_BIND] = "bind",
2962 [MPOL_INTERLEAVE] = "interleave",
2963 [MPOL_LOCAL] = "local",
2964 [MPOL_PREFERRED_MANY] = "prefer (many)",
2965 };
2966
2967
2968 #ifdef CONFIG_TMPFS
2969 /**
2970 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2971 * @str: string containing mempolicy to parse
2972 * @mpol: pointer to struct mempolicy pointer, returned on success.
2973 *
2974 * Format of input:
2975 * <mode>[=<flags>][:<nodelist>]
2976 *
2977 * Return: %0 on success, else %1
2978 */
2979 int mpol_parse_str(char *str, struct mempolicy **mpol)
2980 {
2981 struct mempolicy *new = NULL;
2982 unsigned short mode_flags;
2983 nodemask_t nodes;
2984 char *nodelist = strchr(str, ':');
2985 char *flags = strchr(str, '=');
2986 int err = 1, mode;
2987
2988 if (flags)
2989 *flags++ = '\0'; /* terminate mode string */
2990
2991 if (nodelist) {
2992 /* NUL-terminate mode or flags string */
2993 *nodelist++ = '\0';
2994 if (nodelist_parse(nodelist, nodes))
2995 goto out;
2996 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2997 goto out;
2998 } else
2999 nodes_clear(nodes);
3000
3001 mode = match_string(policy_modes, MPOL_MAX, str);
3002 if (mode < 0)
3003 goto out;
3004
3005 switch (mode) {
3006 case MPOL_PREFERRED:
3007 /*
3008 * Insist on a nodelist of one node only, although later
3009 * we use first_node(nodes) to grab a single node, so here
3010 * nodelist (or nodes) cannot be empty.
3011 */
3012 if (nodelist) {
3013 char *rest = nodelist;
3014 while (isdigit(*rest))
3015 rest++;
3016 if (*rest)
3017 goto out;
3018 if (nodes_empty(nodes))
3019 goto out;
3020 }
3021 break;
3022 case MPOL_INTERLEAVE:
3023 /*
3024 * Default to online nodes with memory if no nodelist
3025 */
3026 if (!nodelist)
3027 nodes = node_states[N_MEMORY];
3028 break;
3029 case MPOL_LOCAL:
3030 /*
3031 * Don't allow a nodelist; mpol_new() checks flags
3032 */
3033 if (nodelist)
3034 goto out;
3035 break;
3036 case MPOL_DEFAULT:
3037 /*
3038 * Insist on an empty nodelist
3039 */
3040 if (!nodelist)
3041 err = 0;
3042 goto out;
3043 case MPOL_PREFERRED_MANY:
3044 case MPOL_BIND:
3045 /*
3046 * Insist on a nodelist
3047 */
3048 if (!nodelist)
3049 goto out;
3050 }
3051
3052 mode_flags = 0;
3053 if (flags) {
3054 /*
3055 * Currently, we only support two mutually exclusive
3056 * mode flags.
3057 */
3058 if (!strcmp(flags, "static"))
3059 mode_flags |= MPOL_F_STATIC_NODES;
3060 else if (!strcmp(flags, "relative"))
3061 mode_flags |= MPOL_F_RELATIVE_NODES;
3062 else
3063 goto out;
3064 }
3065
3066 new = mpol_new(mode, mode_flags, &nodes);
3067 if (IS_ERR(new))
3068 goto out;
3069
3070 /*
3071 * Save nodes for mpol_to_str() to show the tmpfs mount options
3072 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3073 */
3074 if (mode != MPOL_PREFERRED) {
3075 new->nodes = nodes;
3076 } else if (nodelist) {
3077 nodes_clear(new->nodes);
3078 node_set(first_node(nodes), new->nodes);
3079 } else {
3080 new->mode = MPOL_LOCAL;
3081 }
3082
3083 /*
3084 * Save nodes for contextualization: this will be used to "clone"
3085 * the mempolicy in a specific context [cpuset] at a later time.
3086 */
3087 new->w.user_nodemask = nodes;
3088
3089 err = 0;
3090
3091 out:
3092 /* Restore string for error message */
3093 if (nodelist)
3094 *--nodelist = ':';
3095 if (flags)
3096 *--flags = '=';
3097 if (!err)
3098 *mpol = new;
3099 return err;
3100 }
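/*
 * Example strings this parser accepts, as used for the tmpfs "mpol=" mount
 * option (the node numbers are illustrative):
 *
 *	mpol=interleave:0-3
 *	mpol=bind=static:0,2
 *	mpol=prefer:1
 *	mpol=local
 */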
3101 #endif /* CONFIG_TMPFS */
3102
3103 /**
3104 * mpol_to_str - format a mempolicy structure for printing
3105 * @buffer: to contain formatted mempolicy string
3106 * @maxlen: length of @buffer
3107 * @pol: pointer to mempolicy to be formatted
3108 *
3109 * Convert @pol into a string. If @buffer is too short, truncate the string.
3110 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3111 * interleave", plus the longest flag flags, "relative|balancing", and to
3112 * display at least a few node ids.
3113 */
3114 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3115 {
3116 char *p = buffer;
3117 nodemask_t nodes = NODE_MASK_NONE;
3118 unsigned short mode = MPOL_DEFAULT;
3119 unsigned short flags = 0;
3120
3121 if (pol &&
3122 pol != &default_policy &&
3123 !(pol >= &preferred_node_policy[0] &&
3124 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3125 mode = pol->mode;
3126 flags = pol->flags;
3127 }
3128
3129 switch (mode) {
3130 case MPOL_DEFAULT:
3131 case MPOL_LOCAL:
3132 break;
3133 case MPOL_PREFERRED:
3134 case MPOL_PREFERRED_MANY:
3135 case MPOL_BIND:
3136 case MPOL_INTERLEAVE:
3137 nodes = pol->nodes;
3138 break;
3139 default:
3140 WARN_ON_ONCE(1);
3141 snprintf(p, maxlen, "unknown");
3142 return;
3143 }
3144
3145 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3146
3147 if (flags & MPOL_MODE_FLAGS) {
3148 p += snprintf(p, buffer + maxlen - p, "=");
3149
3150 /*
3151 * Static and relative are mutually exclusive.
3152 */
3153 if (flags & MPOL_F_STATIC_NODES)
3154 p += snprintf(p, buffer + maxlen - p, "static");
3155 else if (flags & MPOL_F_RELATIVE_NODES)
3156 p += snprintf(p, buffer + maxlen - p, "relative");
3157
3158 if (flags & MPOL_F_NUMA_BALANCING) {
3159 if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3160 p += snprintf(p, buffer + maxlen - p, "|");
3161 p += snprintf(p, buffer + maxlen - p, "balancing");
3162 }
3163 }
3164
3165 if (!nodes_empty(nodes))
3166 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3167 nodemask_pr_args(&nodes));
3168 }
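/*
 * Example outputs, following the format produced above (mode, then "=" and
 * any flags, then ":" and the nodelist); the node numbers are illustrative:
 *
 *	"default"
 *	"prefer:3"
 *	"bind=static:0-3"
 *	"interleave:0,2,4"
 */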
3169