xref: /openbmc/linux/mm/mempolicy.c (revision 28949b84b2cb2473507ec2fed06728f995dd7942)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Simple NUMA memory policy for the Linux kernel.
4   *
5   * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6   * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7   *
8   * NUMA policy allows the user to give hints in which node(s) memory should
9   * be allocated.
10   *
11   * Support four policies per VMA and per process:
12   *
13   * The VMA policy has priority over the process policy for a page fault.
14   *
15   * interleave     Allocate memory interleaved over a set of nodes,
16   *                with normal fallback if it fails.
17   *                For VMA based allocations this interleaves based on the
18   *                offset into the backing object or offset into the mapping
19   *                for anonymous memory. For process policy a process counter
20   *                is used.
21   *
22   * bind           Only allocate memory on a specific set of nodes,
23   *                no fallback.
24   *                FIXME: memory is allocated starting with the first node
25   *                to the last. It would be better if bind would truly restrict
26   *                the allocation to memory nodes instead
27   *
28   * preferred      Try a specific node first before normal fallback.
29   *                As a special case NUMA_NO_NODE here means do the allocation
30   *                on the local CPU. This is normally identical to default,
31   *                but useful to set in a VMA when you have a non default
32   *                process policy.
33   *
34   * default        Allocate on the local node first, or when on a VMA
35   *                use the process policy. This is what Linux always did
36   *                in a NUMA aware kernel and still does by, ahem, default.
37   *
38   * The process policy is applied for most non-interrupt memory allocations
39   * in that process' context. Interrupts ignore the policies and always
40   * try to allocate on the local CPU. The VMA policy is only applied for memory
41   * allocations for a VMA in the VM.
42   *
43   * Currently there are a few corner cases in swapping where the policy
44   * is not applied, but the majority should be handled. When process policy
45   * is used it is not remembered over swap outs/swap ins.
46   *
47   * Only the highest zone in the zone hierarchy gets policied. Allocations
48   * requesting a lower zone just use default policy. This implies that
49   * on systems with highmem, kernel lowmem allocations don't get policied.
50   * Same with GFP_DMA allocations.
51   *
52   * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53   * all users and remembered even when nobody has memory mapped.
54   */
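/*
 * Illustrative sketch (not part of this file): a minimal userspace view of
 * the policies described above, using the set_mempolicy(2) and mbind(2)
 * system calls.  The <numaif.h> wrappers and MPOL_* constants come from
 * libnuma/uapi headers; node numbers and sizes are made up and error
 * handling is omitted.
 */
#if 0
#include <numaif.h>		/* set_mempolicy(), mbind(), MPOL_* */
#include <sys/mman.h>

static void mempolicy_example(void)
{
	/* Interleave this process's allocations across nodes 0 and 1. */
	unsigned long interleave_mask = (1UL << 0) | (1UL << 1);
	set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
		      8 * sizeof(interleave_mask));

	/* Bind one anonymous mapping to node 0 only, with no fallback. */
	size_t len = 16UL << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long bind_mask = 1UL << 0;
	mbind(buf, len, MPOL_BIND, &bind_mask, 8 * sizeof(bind_mask), 0);

	/* Return to the default (local) policy for the process. */
	set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
#endif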
55  
56  /* Notebook:
57     fix mmap readahead to honour policy and enable policy for any page cache
58     object
59     statistics for bigpages
60     global policy for page cache? currently it uses process policy. Requires
61     first item above.
62     handle mremap for shared memory (currently ignored for the policy)
63     grows down?
64     make bind policy root only? It can trigger oom much faster and the
65     kernel is not always grateful with that.
66  */
67  
68  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69  
70  #include <linux/mempolicy.h>
71  #include <linux/pagewalk.h>
72  #include <linux/highmem.h>
73  #include <linux/hugetlb.h>
74  #include <linux/kernel.h>
75  #include <linux/sched.h>
76  #include <linux/sched/mm.h>
77  #include <linux/sched/numa_balancing.h>
78  #include <linux/sched/task.h>
79  #include <linux/nodemask.h>
80  #include <linux/cpuset.h>
81  #include <linux/slab.h>
82  #include <linux/string.h>
83  #include <linux/export.h>
84  #include <linux/nsproxy.h>
85  #include <linux/interrupt.h>
86  #include <linux/init.h>
87  #include <linux/compat.h>
88  #include <linux/ptrace.h>
89  #include <linux/swap.h>
90  #include <linux/seq_file.h>
91  #include <linux/proc_fs.h>
92  #include <linux/migrate.h>
93  #include <linux/ksm.h>
94  #include <linux/rmap.h>
95  #include <linux/security.h>
96  #include <linux/syscalls.h>
97  #include <linux/ctype.h>
98  #include <linux/mm_inline.h>
99  #include <linux/mmu_notifier.h>
100  #include <linux/printk.h>
101  #include <linux/swapops.h>
102  
103  #include <asm/tlbflush.h>
104  #include <linux/uaccess.h>
105  
106  #include "internal.h"
107  
108  /* Internal flags */
109  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111  
112  static struct kmem_cache *policy_cache;
113  static struct kmem_cache *sn_cache;
114  
115  /* Highest zone. A specific allocation for a zone below that is not
116     policied. */
117  enum zone_type policy_zone = 0;
118  
119  /*
120   * run-time system-wide default policy => local allocation
121   */
122  static struct mempolicy default_policy = {
123  	.refcnt = ATOMIC_INIT(1), /* never free it */
124  	.mode = MPOL_PREFERRED,
125  	.flags = MPOL_F_LOCAL,
126  };
127  
128  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129  
130  /**
131   * numa_map_to_online_node - Find closest online node
132   * @node: Node id to start the search
133   *
134   * Lookup the next closest node by distance if @node is not online.
135   */
136  int numa_map_to_online_node(int node)
137  {
138  	int min_dist = INT_MAX, dist, n, min_node;
139  
140  	if (node == NUMA_NO_NODE || node_online(node))
141  		return node;
142  
143  	min_node = node;
144  	for_each_online_node(n) {
145  		dist = node_distance(node, n);
146  		if (dist < min_dist) {
147  			min_dist = dist;
148  			min_node = n;
149  		}
150  	}
151  
152  	return min_node;
153  }
154  EXPORT_SYMBOL_GPL(numa_map_to_online_node);
155  
156  struct mempolicy *get_task_policy(struct task_struct *p)
157  {
158  	struct mempolicy *pol = p->mempolicy;
159  	int node;
160  
161  	if (pol)
162  		return pol;
163  
164  	node = numa_node_id();
165  	if (node != NUMA_NO_NODE) {
166  		pol = &preferred_node_policy[node];
167  		/* preferred_node_policy is not initialised early in boot */
168  		if (pol->mode)
169  			return pol;
170  	}
171  
172  	return &default_policy;
173  }
174  
175  static const struct mempolicy_operations {
176  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
177  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
178  } mpol_ops[MPOL_MAX];
179  
180  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
181  {
182  	return pol->flags & MPOL_MODE_FLAGS;
183  }
184  
185  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
186  				   const nodemask_t *rel)
187  {
188  	nodemask_t tmp;
189  	nodes_fold(tmp, *orig, nodes_weight(*rel));
190  	nodes_onto(*ret, tmp, *rel);
191  }
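/*
 * Worked example (illustration only): with MPOL_F_RELATIVE_NODES the user's
 * nodemask is interpreted relative to the allowed set.  For a user mask of
 * {0,1} and an allowed set of {2,3,6} (weight 3), the mask is first folded
 * modulo 3 (still {0,1}) and then mapped onto the allowed nodes, giving
 * {2,3}: relative node 0 is the first allowed node, relative node 1 the
 * second, wrapping around when the relative mask is wider than the set.
 */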
192  
193  static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
194  {
195  	if (nodes_empty(*nodes))
196  		return -EINVAL;
197  	pol->v.nodes = *nodes;
198  	return 0;
199  }
200  
201  static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
202  {
203  	if (!nodes)
204  		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
205  	else if (nodes_empty(*nodes))
206  		return -EINVAL;			/*  no allowed nodes */
207  	else
208  		pol->v.preferred_node = first_node(*nodes);
209  	return 0;
210  }
211  
212  static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
213  {
214  	if (nodes_empty(*nodes))
215  		return -EINVAL;
216  	pol->v.nodes = *nodes;
217  	return 0;
218  }
219  
220  /*
221   * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
222   * any, for the new policy.  mpol_new() has already validated the nodes
223   * parameter with respect to the policy mode and flags.  But, we need to
224   * handle an empty nodemask with MPOL_PREFERRED here.
225   *
226   * Must be called holding task's alloc_lock to protect task's mems_allowed
227   * and mempolicy.  May also be called holding the mmap_semaphore for write.
228   */
229  static int mpol_set_nodemask(struct mempolicy *pol,
230  		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
231  {
232  	int ret;
233  
234  	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
235  	if (pol == NULL)
236  		return 0;
237  	/* Check N_MEMORY */
238  	nodes_and(nsc->mask1,
239  		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
240  
241  	VM_BUG_ON(!nodes);
242  	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
243  		nodes = NULL;	/* explicit local allocation */
244  	else {
245  		if (pol->flags & MPOL_F_RELATIVE_NODES)
246  			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
247  		else
248  			nodes_and(nsc->mask2, *nodes, nsc->mask1);
249  
250  		if (mpol_store_user_nodemask(pol))
251  			pol->w.user_nodemask = *nodes;
252  		else
253  			pol->w.cpuset_mems_allowed =
254  						cpuset_current_mems_allowed;
255  	}
256  
257  	if (nodes)
258  		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259  	else
260  		ret = mpol_ops[pol->mode].create(pol, NULL);
261  	return ret;
262  }
263  
264  /*
265   * This function just creates a new policy, does some checks and simple
266   * initialization. You must invoke mpol_set_nodemask() to set nodes.
267   */
268  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269  				  nodemask_t *nodes)
270  {
271  	struct mempolicy *policy;
272  
273  	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
274  		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
275  
276  	if (mode == MPOL_DEFAULT) {
277  		if (nodes && !nodes_empty(*nodes))
278  			return ERR_PTR(-EINVAL);
279  		return NULL;
280  	}
281  	VM_BUG_ON(!nodes);
282  
283  	/*
284  	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
285  	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
286  	 * All other modes require a valid pointer to a non-empty nodemask.
287  	 */
288  	if (mode == MPOL_PREFERRED) {
289  		if (nodes_empty(*nodes)) {
290  			if (((flags & MPOL_F_STATIC_NODES) ||
291  			     (flags & MPOL_F_RELATIVE_NODES)))
292  				return ERR_PTR(-EINVAL);
293  		}
294  	} else if (mode == MPOL_LOCAL) {
295  		if (!nodes_empty(*nodes) ||
296  		    (flags & MPOL_F_STATIC_NODES) ||
297  		    (flags & MPOL_F_RELATIVE_NODES))
298  			return ERR_PTR(-EINVAL);
299  		mode = MPOL_PREFERRED;
300  	} else if (nodes_empty(*nodes))
301  		return ERR_PTR(-EINVAL);
302  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
303  	if (!policy)
304  		return ERR_PTR(-ENOMEM);
305  	atomic_set(&policy->refcnt, 1);
306  	policy->mode = mode;
307  	policy->flags = flags;
308  
309  	return policy;
310  }
311  
312  /* Slow path of a mpol destructor. */
313  void __mpol_put(struct mempolicy *p)
314  {
315  	if (!atomic_dec_and_test(&p->refcnt))
316  		return;
317  	kmem_cache_free(policy_cache, p);
318  }
319  
320  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
321  {
322  }
323  
324  static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
325  {
326  	nodemask_t tmp;
327  
328  	if (pol->flags & MPOL_F_STATIC_NODES)
329  		nodes_and(tmp, pol->w.user_nodemask, *nodes);
330  	else if (pol->flags & MPOL_F_RELATIVE_NODES)
331  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332  	else {
333  		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
334  								*nodes);
335  		pol->w.cpuset_mems_allowed = *nodes;
336  	}
337  
338  	if (nodes_empty(tmp))
339  		tmp = *nodes;
340  
341  	pol->v.nodes = tmp;
342  }
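/*
 * Illustrative example (not from the original comments): suppose an
 * MPOL_BIND policy was created with nodes {0,1} while the cpuset allowed
 * {0,1,2,3}, and the cpuset is later rebound to {2,3}.  With
 * MPOL_F_STATIC_NODES the intersection of {0,1} and {2,3} is empty, so the
 * policy falls back to the whole new set {2,3}.  With no mode flag,
 * nodes_remap() maps node 0 -> 2 and node 1 -> 3 by position, which also
 * yields {2,3} but preserves the relative layout for larger masks.
 */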
343  
344  static void mpol_rebind_preferred(struct mempolicy *pol,
345  						const nodemask_t *nodes)
346  {
347  	nodemask_t tmp;
348  
349  	if (pol->flags & MPOL_F_STATIC_NODES) {
350  		int node = first_node(pol->w.user_nodemask);
351  
352  		if (node_isset(node, *nodes)) {
353  			pol->v.preferred_node = node;
354  			pol->flags &= ~MPOL_F_LOCAL;
355  		} else
356  			pol->flags |= MPOL_F_LOCAL;
357  	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
358  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
359  		pol->v.preferred_node = first_node(tmp);
360  	} else if (!(pol->flags & MPOL_F_LOCAL)) {
361  		pol->v.preferred_node = node_remap(pol->v.preferred_node,
362  						   pol->w.cpuset_mems_allowed,
363  						   *nodes);
364  		pol->w.cpuset_mems_allowed = *nodes;
365  	}
366  }
367  
368  /*
369   * mpol_rebind_policy - Migrate a policy to a different set of nodes
370   *
371   * Per-vma policies are protected by mmap_sem. Allocations using per-task
372   * policies are protected by task->mems_allowed_seq to prevent a premature
373   * OOM/allocation failure due to parallel nodemask modification.
374   */
375  static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
376  {
377  	if (!pol)
378  		return;
379  	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
380  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
381  		return;
382  
383  	mpol_ops[pol->mode].rebind(pol, newmask);
384  }
385  
386  /*
387   * Wrapper for mpol_rebind_policy() that just requires task
388   * pointer, and updates task mempolicy.
389   *
390   * Called with task's alloc_lock held.
391   */
392  
393  void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
394  {
395  	mpol_rebind_policy(tsk->mempolicy, new);
396  }
397  
398  /*
399   * Rebind each vma in mm to new nodemask.
400   *
401   * Call holding a reference to mm.  Takes mm->mmap_sem during call.
402   */
403  
404  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
405  {
406  	struct vm_area_struct *vma;
407  
408  	down_write(&mm->mmap_sem);
409  	for (vma = mm->mmap; vma; vma = vma->vm_next)
410  		mpol_rebind_policy(vma->vm_policy, new);
411  	up_write(&mm->mmap_sem);
412  }
413  
414  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
415  	[MPOL_DEFAULT] = {
416  		.rebind = mpol_rebind_default,
417  	},
418  	[MPOL_INTERLEAVE] = {
419  		.create = mpol_new_interleave,
420  		.rebind = mpol_rebind_nodemask,
421  	},
422  	[MPOL_PREFERRED] = {
423  		.create = mpol_new_preferred,
424  		.rebind = mpol_rebind_preferred,
425  	},
426  	[MPOL_BIND] = {
427  		.create = mpol_new_bind,
428  		.rebind = mpol_rebind_nodemask,
429  	},
430  };
431  
432  static int migrate_page_add(struct page *page, struct list_head *pagelist,
433  				unsigned long flags);
434  
435  struct queue_pages {
436  	struct list_head *pagelist;
437  	unsigned long flags;
438  	nodemask_t *nmask;
439  	unsigned long start;
440  	unsigned long end;
441  	struct vm_area_struct *first;
442  };
443  
444  /*
445   * Check if the page's nid is in qp->nmask.
446   *
447   * If MPOL_MF_INVERT is set in qp->flags, the check is inverted:
448   * the page qualifies only if its nid is *not* in qp->nmask.
449   */
450  static inline bool queue_pages_required(struct page *page,
451  					struct queue_pages *qp)
452  {
453  	int nid = page_to_nid(page);
454  	unsigned long flags = qp->flags;
455  
456  	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
457  }
458  
459  /*
460   * queue_pages_pmd() has four possible return values:
461   * 0 - pages are placed on the right node or queued successfully.
462   * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
463   *     specified.
464   * 2 - THP was split.
465   * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was
466   *        specified and an existing page was already on a node that
467   *        does not follow the policy.
468   */
469  static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
470  				unsigned long end, struct mm_walk *walk)
471  	__releases(ptl)
472  {
473  	int ret = 0;
474  	struct page *page;
475  	struct queue_pages *qp = walk->private;
476  	unsigned long flags;
477  
478  	if (unlikely(is_pmd_migration_entry(*pmd))) {
479  		ret = -EIO;
480  		goto unlock;
481  	}
482  	page = pmd_page(*pmd);
483  	if (is_huge_zero_page(page)) {
484  		spin_unlock(ptl);
485  		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
486  		ret = 2;
487  		goto out;
488  	}
489  	if (!queue_pages_required(page, qp))
490  		goto unlock;
491  
492  	flags = qp->flags;
493  	/* go to thp migration */
494  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
495  		if (!vma_migratable(walk->vma) ||
496  		    migrate_page_add(page, qp->pagelist, flags)) {
497  			ret = 1;
498  			goto unlock;
499  		}
500  	} else
501  		ret = -EIO;
502  unlock:
503  	spin_unlock(ptl);
504  out:
505  	return ret;
506  }
507  
508  /*
509   * Scan through pages checking if pages follow certain conditions,
510   * and move them to the pagelist if they do.
511   *
512   * queue_pages_pte_range() has three possible return values:
513   * 0 - pages are placed on the right node or queued successfully.
514   * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
515   *     specified.
516   * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
517   *        on a node that does not follow the policy.
518   */
519  static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
520  			unsigned long end, struct mm_walk *walk)
521  {
522  	struct vm_area_struct *vma = walk->vma;
523  	struct page *page;
524  	struct queue_pages *qp = walk->private;
525  	unsigned long flags = qp->flags;
526  	int ret;
527  	bool has_unmovable = false;
528  	pte_t *pte;
529  	spinlock_t *ptl;
530  
531  	ptl = pmd_trans_huge_lock(pmd, vma);
532  	if (ptl) {
533  		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
534  		if (ret != 2)
535  			return ret;
536  	}
537  	/* THP was split, fall through to pte walk */
538  
539  	if (pmd_trans_unstable(pmd))
540  		return 0;
541  
542  	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
543  	for (; addr != end; pte++, addr += PAGE_SIZE) {
544  		if (!pte_present(*pte))
545  			continue;
546  		page = vm_normal_page(vma, addr, *pte);
547  		if (!page)
548  			continue;
549  		/*
550  		 * vm_normal_page() filters out zero pages, but there might
551  		 * still be PageReserved pages to skip, perhaps in a VDSO.
552  		 */
553  		if (PageReserved(page))
554  			continue;
555  		if (!queue_pages_required(page, qp))
556  			continue;
557  		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
558  			/* MPOL_MF_STRICT must be specified if we get here */
559  			if (!vma_migratable(vma)) {
560  				has_unmovable = true;
561  				break;
562  			}
563  
564  			/*
565  			 * Do not abort immediately since there may be
566  			 * temporarily off-LRU pages in the range.  Still
567  			 * need to migrate the other LRU pages.
568  			 */
569  			if (migrate_page_add(page, qp->pagelist, flags))
570  				has_unmovable = true;
571  		} else
572  			break;
573  	}
574  	pte_unmap_unlock(pte - 1, ptl);
575  	cond_resched();
576  
577  	if (has_unmovable)
578  		return 1;
579  
580  	return addr != end ? -EIO : 0;
581  }
582  
583  static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
584  			       unsigned long addr, unsigned long end,
585  			       struct mm_walk *walk)
586  {
587  	int ret = 0;
588  #ifdef CONFIG_HUGETLB_PAGE
589  	struct queue_pages *qp = walk->private;
590  	unsigned long flags = (qp->flags & MPOL_MF_VALID);
591  	struct page *page;
592  	spinlock_t *ptl;
593  	pte_t entry;
594  
595  	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596  	entry = huge_ptep_get(pte);
597  	if (!pte_present(entry))
598  		goto unlock;
599  	page = pte_page(entry);
600  	if (!queue_pages_required(page, qp))
601  		goto unlock;
602  
603  	if (flags == MPOL_MF_STRICT) {
604  		/*
605  		 * STRICT alone means only detecting misplaced pages and no
606  		 * need to further check other vmas.
607  		 */
608  		ret = -EIO;
609  		goto unlock;
610  	}
611  
612  	if (!vma_migratable(walk->vma)) {
613  		/*
614  		 * Must be STRICT with MOVE*, otherwise .test_walk() would have
615  		 * stopped walking the current vma.
616  		 * Detect the misplaced page but allow migrating pages which
617  		 * have been queued.
618  		 */
619  		ret = 1;
620  		goto unlock;
621  	}
622  
623  	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
624  	if (flags & (MPOL_MF_MOVE_ALL) ||
625  	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
626  		if (!isolate_huge_page(page, qp->pagelist) &&
627  			(flags & MPOL_MF_STRICT))
628  			/*
629  			 * Failed to isolate page but allow migrating pages
630  			 * which have been queued.
631  			 */
632  			ret = 1;
633  	}
634  unlock:
635  	spin_unlock(ptl);
636  #else
637  	BUG();
638  #endif
639  	return ret;
640  }
641  
642  #ifdef CONFIG_NUMA_BALANCING
643  /*
644   * This is used to mark a range of virtual addresses to be inaccessible.
645   * These are later cleared by a NUMA hinting fault. Depending on these
646   * faults, pages may be migrated for better NUMA placement.
647   *
648   * This is assuming that NUMA faults are handled using PROT_NONE. If
649   * an architecture makes a different choice, it will need further
650   * changes to the core.
651   */
652  unsigned long change_prot_numa(struct vm_area_struct *vma,
653  			unsigned long addr, unsigned long end)
654  {
655  	int nr_updated;
656  
657  	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
658  	if (nr_updated)
659  		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
660  
661  	return nr_updated;
662  }
663  #else
664  static unsigned long change_prot_numa(struct vm_area_struct *vma,
665  			unsigned long addr, unsigned long end)
666  {
667  	return 0;
668  }
669  #endif /* CONFIG_NUMA_BALANCING */
670  
671  static int queue_pages_test_walk(unsigned long start, unsigned long end,
672  				struct mm_walk *walk)
673  {
674  	struct vm_area_struct *vma = walk->vma;
675  	struct queue_pages *qp = walk->private;
676  	unsigned long endvma = vma->vm_end;
677  	unsigned long flags = qp->flags;
678  
679  	/* range check first */
680  	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
681  
682  	if (!qp->first) {
683  		qp->first = vma;
684  		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
685  			(qp->start < vma->vm_start))
686  			/* hole at head side of range */
687  			return -EFAULT;
688  	}
689  	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
690  		((vma->vm_end < qp->end) &&
691  		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
692  		/* hole at middle or tail of range */
693  		return -EFAULT;
694  
695  	/*
696  	 * Need to check MPOL_MF_STRICT to return -EIO if possible
697  	 * regardless of vma_migratable
698  	 */
699  	if (!vma_migratable(vma) &&
700  	    !(flags & MPOL_MF_STRICT))
701  		return 1;
702  
703  	if (endvma > end)
704  		endvma = end;
705  
706  	if (flags & MPOL_MF_LAZY) {
707  		/* Similar to task_numa_work, skip inaccessible VMAs */
708  		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
709  			!(vma->vm_flags & VM_MIXEDMAP))
710  			change_prot_numa(vma, start, endvma);
711  		return 1;
712  	}
713  
714  	/* queue pages from current vma */
715  	if (flags & MPOL_MF_VALID)
716  		return 0;
717  	return 1;
718  }
719  
720  static const struct mm_walk_ops queue_pages_walk_ops = {
721  	.hugetlb_entry		= queue_pages_hugetlb,
722  	.pmd_entry		= queue_pages_pte_range,
723  	.test_walk		= queue_pages_test_walk,
724  };
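/*
 * Note on the callbacks above: .test_walk returning 0 makes the walker
 * descend into the vma, a positive value skips the vma, and a negative
 * value aborts the walk and is propagated back from walk_page_range().
 */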
725  
726  /*
727   * Walk through page tables and collect pages to be migrated.
728   *
729   * If pages found in a given range are on a set of nodes (determined by
730   * @nodes and @flags), they are isolated and queued to the pagelist
731   * passed via @pagelist.
732   *
733   * queue_pages_range() has three possible return values:
734   * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
735   *     specified.
736   * 0 - queue pages successfully or no misplaced page.
737   * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
738   *         the memory range specified by nodemask and maxnode points
739   *         outside your accessible address space (-EFAULT)
740   */
741  static int
742  queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
743  		nodemask_t *nodes, unsigned long flags,
744  		struct list_head *pagelist)
745  {
746  	int err;
747  	struct queue_pages qp = {
748  		.pagelist = pagelist,
749  		.flags = flags,
750  		.nmask = nodes,
751  		.start = start,
752  		.end = end,
753  		.first = NULL,
754  	};
755  
756  	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
757  
758  	if (!qp.first)
759  		/* whole range in hole */
760  		err = -EFAULT;
761  
762  	return err;
763  }
764  
765  /*
766   * Apply policy to a single VMA
767   * This must be called with the mmap_sem held for writing.
768   */
769  static int vma_replace_policy(struct vm_area_struct *vma,
770  						struct mempolicy *pol)
771  {
772  	int err;
773  	struct mempolicy *old;
774  	struct mempolicy *new;
775  
776  	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
777  		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
778  		 vma->vm_ops, vma->vm_file,
779  		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
780  
781  	new = mpol_dup(pol);
782  	if (IS_ERR(new))
783  		return PTR_ERR(new);
784  
785  	if (vma->vm_ops && vma->vm_ops->set_policy) {
786  		err = vma->vm_ops->set_policy(vma, new);
787  		if (err)
788  			goto err_out;
789  	}
790  
791  	old = vma->vm_policy;
792  	vma->vm_policy = new; /* protected by mmap_sem */
793  	mpol_put(old);
794  
795  	return 0;
796   err_out:
797  	mpol_put(new);
798  	return err;
799  }
800  
801  /* Step 2: apply policy to a range and do splits. */
802  static int mbind_range(struct mm_struct *mm, unsigned long start,
803  		       unsigned long end, struct mempolicy *new_pol)
804  {
805  	struct vm_area_struct *next;
806  	struct vm_area_struct *prev;
807  	struct vm_area_struct *vma;
808  	int err = 0;
809  	pgoff_t pgoff;
810  	unsigned long vmstart;
811  	unsigned long vmend;
812  
813  	vma = find_vma(mm, start);
814  	VM_BUG_ON(!vma);
815  
816  	prev = vma->vm_prev;
817  	if (start > vma->vm_start)
818  		prev = vma;
819  
820  	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
821  		next = vma->vm_next;
822  		vmstart = max(start, vma->vm_start);
823  		vmend   = min(end, vma->vm_end);
824  
825  		if (mpol_equal(vma_policy(vma), new_pol))
826  			continue;
827  
828  		pgoff = vma->vm_pgoff +
829  			((vmstart - vma->vm_start) >> PAGE_SHIFT);
830  		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
831  				 vma->anon_vma, vma->vm_file, pgoff,
832  				 new_pol, vma->vm_userfaultfd_ctx);
833  		if (prev) {
834  			vma = prev;
835  			next = vma->vm_next;
836  			if (mpol_equal(vma_policy(vma), new_pol))
837  				continue;
838  			/* vma_merge() joined vma && vma->next, case 8 */
839  			goto replace;
840  		}
841  		if (vma->vm_start != vmstart) {
842  			err = split_vma(vma->vm_mm, vma, vmstart, 1);
843  			if (err)
844  				goto out;
845  		}
846  		if (vma->vm_end != vmend) {
847  			err = split_vma(vma->vm_mm, vma, vmend, 0);
848  			if (err)
849  				goto out;
850  		}
851   replace:
852  		err = vma_replace_policy(vma, new_pol);
853  		if (err)
854  			goto out;
855  	}
856  
857   out:
858  	return err;
859  }
860  
861  /* Set the process memory policy */
862  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
863  			     nodemask_t *nodes)
864  {
865  	struct mempolicy *new, *old;
866  	NODEMASK_SCRATCH(scratch);
867  	int ret;
868  
869  	if (!scratch)
870  		return -ENOMEM;
871  
872  	new = mpol_new(mode, flags, nodes);
873  	if (IS_ERR(new)) {
874  		ret = PTR_ERR(new);
875  		goto out;
876  	}
877  
878  	task_lock(current);
879  	ret = mpol_set_nodemask(new, nodes, scratch);
880  	if (ret) {
881  		task_unlock(current);
882  		mpol_put(new);
883  		goto out;
884  	}
885  	old = current->mempolicy;
886  	current->mempolicy = new;
887  	if (new && new->mode == MPOL_INTERLEAVE)
888  		current->il_prev = MAX_NUMNODES-1;
889  	task_unlock(current);
890  	mpol_put(old);
891  	ret = 0;
892  out:
893  	NODEMASK_SCRATCH_FREE(scratch);
894  	return ret;
895  }
896  
897  /*
898   * Return nodemask for policy for get_mempolicy() query
899   *
900   * Called with task's alloc_lock held
901   */
902  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
903  {
904  	nodes_clear(*nodes);
905  	if (p == &default_policy)
906  		return;
907  
908  	switch (p->mode) {
909  	case MPOL_BIND:
910  	case MPOL_INTERLEAVE:
911  		*nodes = p->v.nodes;
912  		break;
913  	case MPOL_PREFERRED:
914  		if (!(p->flags & MPOL_F_LOCAL))
915  			node_set(p->v.preferred_node, *nodes);
916  		/* else return empty node mask for local allocation */
917  		break;
918  	default:
919  		BUG();
920  	}
921  }
922  
923  static int lookup_node(struct mm_struct *mm, unsigned long addr)
924  {
925  	struct page *p = NULL;
926  	int err;
927  
928  	int locked = 1;
929  	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
930  	if (err == 0) {
931  		/* E.g. GUP interrupted by fatal signal */
932  		err = -EFAULT;
933  	} else if (err > 0) {
934  		err = page_to_nid(p);
935  		put_page(p);
936  	}
937  	if (locked)
938  		up_read(&mm->mmap_sem);
939  	return err;
940  }
941  
942  /* Retrieve NUMA policy */
943  static long do_get_mempolicy(int *policy, nodemask_t *nmask,
944  			     unsigned long addr, unsigned long flags)
945  {
946  	int err;
947  	struct mm_struct *mm = current->mm;
948  	struct vm_area_struct *vma = NULL;
949  	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
950  
951  	if (flags &
952  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
953  		return -EINVAL;
954  
955  	if (flags & MPOL_F_MEMS_ALLOWED) {
956  		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
957  			return -EINVAL;
958  		*policy = 0;	/* just so it's initialized */
959  		task_lock(current);
960  		*nmask  = cpuset_current_mems_allowed;
961  		task_unlock(current);
962  		return 0;
963  	}
964  
965  	if (flags & MPOL_F_ADDR) {
966  		/*
967  		 * Do NOT fall back to task policy if the
968  		 * vma/shared policy at addr is NULL.  We
969  		 * want to return MPOL_DEFAULT in this case.
970  		 */
971  		down_read(&mm->mmap_sem);
972  		vma = find_vma_intersection(mm, addr, addr+1);
973  		if (!vma) {
974  			up_read(&mm->mmap_sem);
975  			return -EFAULT;
976  		}
977  		if (vma->vm_ops && vma->vm_ops->get_policy)
978  			pol = vma->vm_ops->get_policy(vma, addr);
979  		else
980  			pol = vma->vm_policy;
981  	} else if (addr)
982  		return -EINVAL;
983  
984  	if (!pol)
985  		pol = &default_policy;	/* indicates default behavior */
986  
987  	if (flags & MPOL_F_NODE) {
988  		if (flags & MPOL_F_ADDR) {
989  			/*
990  			 * Take a refcount on the mpol, lookup_node()
991  			 * will drop the mmap_sem, so after calling
992  			 * lookup_node() only "pol" remains valid, "vma"
993  			 * is stale.
994  			 */
995  			pol_refcount = pol;
996  			vma = NULL;
997  			mpol_get(pol);
998  			err = lookup_node(mm, addr);
999  			if (err < 0)
1000  				goto out;
1001  			*policy = err;
1002  		} else if (pol == current->mempolicy &&
1003  				pol->mode == MPOL_INTERLEAVE) {
1004  			*policy = next_node_in(current->il_prev, pol->v.nodes);
1005  		} else {
1006  			err = -EINVAL;
1007  			goto out;
1008  		}
1009  	} else {
1010  		*policy = pol == &default_policy ? MPOL_DEFAULT :
1011  						pol->mode;
1012  		/*
1013  		 * Internal mempolicy flags must be masked off before exposing
1014  		 * the policy to userspace.
1015  		 */
1016  		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1017  	}
1018  
1019  	err = 0;
1020  	if (nmask) {
1021  		if (mpol_store_user_nodemask(pol)) {
1022  			*nmask = pol->w.user_nodemask;
1023  		} else {
1024  			task_lock(current);
1025  			get_policy_nodemask(pol, nmask);
1026  			task_unlock(current);
1027  		}
1028  	}
1029  
1030   out:
1031  	mpol_cond_put(pol);
1032  	if (vma)
1033  		up_read(&mm->mmap_sem);
1034  	if (pol_refcount)
1035  		mpol_put(pol_refcount);
1036  	return err;
1037  }
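/*
 * Illustrative sketch (not part of this file): querying which node backs a
 * given address from userspace with get_mempolicy(2), via the <numaif.h>
 * wrapper.  MPOL_F_NODE | MPOL_F_ADDR asks for the node id of the page at
 * @addr; the lookup faults the page in if needed.
 */
#if 0
#include <numaif.h>

static int node_of_addr(void *addr)
{
	int node = -1;

	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
		return -1;	/* errno describes the failure */
	return node;
}
#endif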
1038  
1039  #ifdef CONFIG_MIGRATION
1040  /*
1041   * page migration, thp tail pages can be passed.
1042   */
1043  static int migrate_page_add(struct page *page, struct list_head *pagelist,
1044  				unsigned long flags)
1045  {
1046  	struct page *head = compound_head(page);
1047  	/*
1048  	 * Avoid migrating a page that is shared with others.
1049  	 */
1050  	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1051  		if (!isolate_lru_page(head)) {
1052  			list_add_tail(&head->lru, pagelist);
1053  			mod_node_page_state(page_pgdat(head),
1054  				NR_ISOLATED_ANON + page_is_file_lru(head),
1055  				hpage_nr_pages(head));
1056  		} else if (flags & MPOL_MF_STRICT) {
1057  			/*
1058  			 * Non-movable page may reach here.  And, there may be
1059  			 * temporary off LRU pages or non-LRU movable pages.
1060  			 * Treat them as unmovable pages since they can't be
1061  			 * isolated, so they can't be moved at the moment.  It
1062  			 * should return -EIO for this case too.
1063  			 */
1064  			return -EIO;
1065  		}
1066  	}
1067  
1068  	return 0;
1069  }
1070  
1071  /* page allocation callback for NUMA node migration */
1072  struct page *alloc_new_node_page(struct page *page, unsigned long node)
1073  {
1074  	if (PageHuge(page))
1075  		return alloc_huge_page_node(page_hstate(compound_head(page)),
1076  					node);
1077  	else if (PageTransHuge(page)) {
1078  		struct page *thp;
1079  
1080  		thp = alloc_pages_node(node,
1081  			(GFP_TRANSHUGE | __GFP_THISNODE),
1082  			HPAGE_PMD_ORDER);
1083  		if (!thp)
1084  			return NULL;
1085  		prep_transhuge_page(thp);
1086  		return thp;
1087  	} else
1088  		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1089  						    __GFP_THISNODE, 0);
1090  }
1091  
1092  /*
1093   * Migrate pages from one node to a target node.
1094   * Returns error or the number of pages not migrated.
1095   */
1096  static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1097  			   int flags)
1098  {
1099  	nodemask_t nmask;
1100  	LIST_HEAD(pagelist);
1101  	int err = 0;
1102  
1103  	nodes_clear(nmask);
1104  	node_set(source, nmask);
1105  
1106  	/*
1107  	 * This does not "check" the range but isolates all pages that
1108  	 * need migration.  Between passing in the full user address
1109  	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1110  	 */
1111  	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1112  	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1113  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1114  
1115  	if (!list_empty(&pagelist)) {
1116  		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1117  					MIGRATE_SYNC, MR_SYSCALL);
1118  		if (err)
1119  			putback_movable_pages(&pagelist);
1120  	}
1121  
1122  	return err;
1123  }
1124  
1125  /*
1126   * Move pages between the two nodesets so as to preserve the physical
1127   * layout as much as possible.
1128   *
1129   * Returns the number of pages that could not be moved.
1130   */
1131  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1132  		     const nodemask_t *to, int flags)
1133  {
1134  	int busy = 0;
1135  	int err;
1136  	nodemask_t tmp;
1137  
1138  	err = migrate_prep();
1139  	if (err)
1140  		return err;
1141  
1142  	down_read(&mm->mmap_sem);
1143  
1144  	/*
1145  	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1146  	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1147  	 * bit in 'tmp', and return that <source, dest> pair for migration.
1148  	 * The pair of nodemasks 'to' and 'from' define the map.
1149  	 *
1150   * If no pair of bits is found that way, fall back to picking some
1151  	 * pair of 'source' and 'dest' bits that are not the same.  If the
1152  	 * 'source' and 'dest' bits are the same, this represents a node
1153  	 * that will be migrating to itself, so no pages need move.
1154  	 *
1155  	 * If no bits are left in 'tmp', or if all remaining bits left
1156  	 * in 'tmp' correspond to the same bit in 'to', return false
1157  	 * (nothing left to migrate).
1158  	 *
1159  	 * This lets us pick a pair of nodes to migrate between, such that
1160  	 * if possible the dest node is not already occupied by some other
1161  	 * source node, minimizing the risk of overloading the memory on a
1162  	 * node that would happen if we migrated incoming memory to a node
1163  	 * before migrating outgoing memory source that same node.
1164  	 *
1165  	 * A single scan of tmp is sufficient.  As we go, we remember the
1166  	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1167  	 * that not only moved, but what's better, moved to an empty slot
1168   * (d is not set in tmp), then we break out with that pair.
1169   * Otherwise when we finish scanning tmp, we at least have the
1170  	 * most recent <s, d> pair that moved.  If we get all the way through
1171  	 * the scan of tmp without finding any node that moved, much less
1172  	 * moved to an empty node, then there is nothing left worth migrating.
1173  	 */
1174  
1175  	tmp = *from;
1176  	while (!nodes_empty(tmp)) {
1177  		int s,d;
1178  		int source = NUMA_NO_NODE;
1179  		int dest = 0;
1180  
1181  		for_each_node_mask(s, tmp) {
1182  
1183  			/*
1184  			 * do_migrate_pages() tries to maintain the relative
1185  			 * node relationship of the pages established between
1186  			 * threads and memory areas.
1187  			 *
1188  			 * However if the number of source nodes is not equal to
1189  			 * the number of destination nodes we cannot preserve
1190  			 * this node relative relationship.  In that case, skip
1191  			 * copying memory from a node that is in the destination
1192  			 * mask.
1193  			 *
1194  			 * Example: [2,3,4] -> [3,4,5] moves everything.
1195  			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1196  			 */
1197  
1198  			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1199  						(node_isset(s, *to)))
1200  				continue;
1201  
1202  			d = node_remap(s, *from, *to);
1203  			if (s == d)
1204  				continue;
1205  
1206  			source = s;	/* Node moved. Memorize */
1207  			dest = d;
1208  
1209  			/* dest not in remaining from nodes? */
1210  			if (!node_isset(dest, tmp))
1211  				break;
1212  		}
1213  		if (source == NUMA_NO_NODE)
1214  			break;
1215  
1216  		node_clear(source, tmp);
1217  		err = migrate_to_node(mm, source, dest, flags);
1218  		if (err > 0)
1219  			busy += err;
1220  		if (err < 0)
1221  			break;
1222  	}
1223  	up_read(&mm->mmap_sem);
1224  	if (err < 0)
1225  		return err;
1226  	return busy;
1227  
1228  }
1229  
1230  /*
1231   * Allocate a new page for page migration based on vma policy.
1232   * Start by assuming the page is mapped by the same vma as contains @start.
1233   * Search forward from there, if not.  N.B., this assumes that the
1234   * list of pages handed to migrate_pages()--which is how we get here--
1235   * is in virtual address order.
1236   */
1237  static struct page *new_page(struct page *page, unsigned long start)
1238  {
1239  	struct vm_area_struct *vma;
1240  	unsigned long uninitialized_var(address);
1241  
1242  	vma = find_vma(current->mm, start);
1243  	while (vma) {
1244  		address = page_address_in_vma(page, vma);
1245  		if (address != -EFAULT)
1246  			break;
1247  		vma = vma->vm_next;
1248  	}
1249  
1250  	if (PageHuge(page)) {
1251  		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1252  				vma, address);
1253  	} else if (PageTransHuge(page)) {
1254  		struct page *thp;
1255  
1256  		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1257  					 HPAGE_PMD_ORDER);
1258  		if (!thp)
1259  			return NULL;
1260  		prep_transhuge_page(thp);
1261  		return thp;
1262  	}
1263  	/*
1264  	 * if !vma, alloc_page_vma() will use task or system default policy
1265  	 */
1266  	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1267  			vma, address);
1268  }
1269  #else
1270  
1271  static int migrate_page_add(struct page *page, struct list_head *pagelist,
1272  				unsigned long flags)
1273  {
1274  	return -EIO;
1275  }
1276  
1277  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1278  		     const nodemask_t *to, int flags)
1279  {
1280  	return -ENOSYS;
1281  }
1282  
1283  static struct page *new_page(struct page *page, unsigned long start)
1284  {
1285  	return NULL;
1286  }
1287  #endif
1288  
1289  static long do_mbind(unsigned long start, unsigned long len,
1290  		     unsigned short mode, unsigned short mode_flags,
1291  		     nodemask_t *nmask, unsigned long flags)
1292  {
1293  	struct mm_struct *mm = current->mm;
1294  	struct mempolicy *new;
1295  	unsigned long end;
1296  	int err;
1297  	int ret;
1298  	LIST_HEAD(pagelist);
1299  
1300  	if (flags & ~(unsigned long)MPOL_MF_VALID)
1301  		return -EINVAL;
1302  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1303  		return -EPERM;
1304  
1305  	if (start & ~PAGE_MASK)
1306  		return -EINVAL;
1307  
1308  	if (mode == MPOL_DEFAULT)
1309  		flags &= ~MPOL_MF_STRICT;
1310  
1311  	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1312  	end = start + len;
1313  
1314  	if (end < start)
1315  		return -EINVAL;
1316  	if (end == start)
1317  		return 0;
1318  
1319  	new = mpol_new(mode, mode_flags, nmask);
1320  	if (IS_ERR(new))
1321  		return PTR_ERR(new);
1322  
1323  	if (flags & MPOL_MF_LAZY)
1324  		new->flags |= MPOL_F_MOF;
1325  
1326  	/*
1327  	 * If we are using the default policy then operation
1328  	 * on discontinuous address spaces is okay after all
1329  	 */
1330  	if (!new)
1331  		flags |= MPOL_MF_DISCONTIG_OK;
1332  
1333  	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1334  		 start, start + len, mode, mode_flags,
1335  		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1336  
1337  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1338  
1339  		err = migrate_prep();
1340  		if (err)
1341  			goto mpol_out;
1342  	}
1343  	{
1344  		NODEMASK_SCRATCH(scratch);
1345  		if (scratch) {
1346  			down_write(&mm->mmap_sem);
1347  			task_lock(current);
1348  			err = mpol_set_nodemask(new, nmask, scratch);
1349  			task_unlock(current);
1350  			if (err)
1351  				up_write(&mm->mmap_sem);
1352  		} else
1353  			err = -ENOMEM;
1354  		NODEMASK_SCRATCH_FREE(scratch);
1355  	}
1356  	if (err)
1357  		goto mpol_out;
1358  
1359  	ret = queue_pages_range(mm, start, end, nmask,
1360  			  flags | MPOL_MF_INVERT, &pagelist);
1361  
1362  	if (ret < 0) {
1363  		err = ret;
1364  		goto up_out;
1365  	}
1366  
1367  	err = mbind_range(mm, start, end, new);
1368  
1369  	if (!err) {
1370  		int nr_failed = 0;
1371  
1372  		if (!list_empty(&pagelist)) {
1373  			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1374  			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1375  				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1376  			if (nr_failed)
1377  				putback_movable_pages(&pagelist);
1378  		}
1379  
1380  		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1381  			err = -EIO;
1382  	} else {
1383  up_out:
1384  		if (!list_empty(&pagelist))
1385  			putback_movable_pages(&pagelist);
1386  	}
1387  
1388  	up_write(&mm->mmap_sem);
1389  mpol_out:
1390  	mpol_put(new);
1391  	return err;
1392  }
1393  
1394  /*
1395   * User space interface with variable sized bitmaps for nodelists.
1396   */
1397  
1398  /* Copy a node mask from user space. */
1399  static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1400  		     unsigned long maxnode)
1401  {
1402  	unsigned long k;
1403  	unsigned long t;
1404  	unsigned long nlongs;
1405  	unsigned long endmask;
1406  
1407  	--maxnode;
1408  	nodes_clear(*nodes);
1409  	if (maxnode == 0 || !nmask)
1410  		return 0;
1411  	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1412  		return -EINVAL;
1413  
1414  	nlongs = BITS_TO_LONGS(maxnode);
1415  	if ((maxnode % BITS_PER_LONG) == 0)
1416  		endmask = ~0UL;
1417  	else
1418  		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1419  
1420  	/*
1421  	 * When the user specified more nodes than supported just check
1422  	 * if the non-supported part is all zero.
1423  	 *
1424  	 * If maxnode has more longs than MAX_NUMNODES, check the bits in
1425  	 * that area first, and then go through to check the remaining
1426  	 * bits which are equal to or bigger than MAX_NUMNODES.
1427  	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1428  	 */
1429  	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1430  		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1431  			if (get_user(t, nmask + k))
1432  				return -EFAULT;
1433  			if (k == nlongs - 1) {
1434  				if (t & endmask)
1435  					return -EINVAL;
1436  			} else if (t)
1437  				return -EINVAL;
1438  		}
1439  		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1440  		endmask = ~0UL;
1441  	}
1442  
1443  	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1444  		unsigned long valid_mask = endmask;
1445  
1446  		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1447  		if (get_user(t, nmask + nlongs - 1))
1448  			return -EFAULT;
1449  		if (t & valid_mask)
1450  			return -EINVAL;
1451  	}
1452  
1453  	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1454  		return -EFAULT;
1455  	nodes_addr(*nodes)[nlongs-1] &= endmask;
1456  	return 0;
1457  }
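/*
 * Worked example (illustration only): a caller passing maxnode == 9
 * describes bits 0..7, so after the --maxnode above nlongs is 1 and
 * endmask is (1UL << 8) - 1 == 0xff.  Stray bits above bit 7 in the copied
 * word are simply masked off, while a mask that describes bits at or above
 * MAX_NUMNODES must have those bits clear or the call fails with -EINVAL.
 */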
1458  
1459  /* Copy a kernel node mask to user space */
1460  static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1461  			      nodemask_t *nodes)
1462  {
1463  	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1464  	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1465  
1466  	if (copy > nbytes) {
1467  		if (copy > PAGE_SIZE)
1468  			return -EINVAL;
1469  		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1470  			return -EFAULT;
1471  		copy = nbytes;
1472  	}
1473  	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1474  }
1475  
1476  static long kernel_mbind(unsigned long start, unsigned long len,
1477  			 unsigned long mode, const unsigned long __user *nmask,
1478  			 unsigned long maxnode, unsigned int flags)
1479  {
1480  	nodemask_t nodes;
1481  	int err;
1482  	unsigned short mode_flags;
1483  
1484  	start = untagged_addr(start);
1485  	mode_flags = mode & MPOL_MODE_FLAGS;
1486  	mode &= ~MPOL_MODE_FLAGS;
1487  	if (mode >= MPOL_MAX)
1488  		return -EINVAL;
1489  	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1490  	    (mode_flags & MPOL_F_RELATIVE_NODES))
1491  		return -EINVAL;
1492  	err = get_nodes(&nodes, nmask, maxnode);
1493  	if (err)
1494  		return err;
1495  	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1496  }
1497  
1498  SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1499  		unsigned long, mode, const unsigned long __user *, nmask,
1500  		unsigned long, maxnode, unsigned int, flags)
1501  {
1502  	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1503  }
1504  
1505  /* Set the process memory policy */
1506  static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1507  				 unsigned long maxnode)
1508  {
1509  	int err;
1510  	nodemask_t nodes;
1511  	unsigned short flags;
1512  
1513  	flags = mode & MPOL_MODE_FLAGS;
1514  	mode &= ~MPOL_MODE_FLAGS;
1515  	if ((unsigned int)mode >= MPOL_MAX)
1516  		return -EINVAL;
1517  	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1518  		return -EINVAL;
1519  	err = get_nodes(&nodes, nmask, maxnode);
1520  	if (err)
1521  		return err;
1522  	return do_set_mempolicy(mode, flags, &nodes);
1523  }
1524  
1525  SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1526  		unsigned long, maxnode)
1527  {
1528  	return kernel_set_mempolicy(mode, nmask, maxnode);
1529  }
1530  
1531  static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1532  				const unsigned long __user *old_nodes,
1533  				const unsigned long __user *new_nodes)
1534  {
1535  	struct mm_struct *mm = NULL;
1536  	struct task_struct *task;
1537  	nodemask_t task_nodes;
1538  	int err;
1539  	nodemask_t *old;
1540  	nodemask_t *new;
1541  	NODEMASK_SCRATCH(scratch);
1542  
1543  	if (!scratch)
1544  		return -ENOMEM;
1545  
1546  	old = &scratch->mask1;
1547  	new = &scratch->mask2;
1548  
1549  	err = get_nodes(old, old_nodes, maxnode);
1550  	if (err)
1551  		goto out;
1552  
1553  	err = get_nodes(new, new_nodes, maxnode);
1554  	if (err)
1555  		goto out;
1556  
1557  	/* Find the mm_struct */
1558  	rcu_read_lock();
1559  	task = pid ? find_task_by_vpid(pid) : current;
1560  	if (!task) {
1561  		rcu_read_unlock();
1562  		err = -ESRCH;
1563  		goto out;
1564  	}
1565  	get_task_struct(task);
1566  
1567  	err = -EINVAL;
1568  
1569  	/*
1570  	 * Check if this process has the right to modify the specified process.
1571  	 * Use the regular "ptrace_may_access()" checks.
1572  	 */
1573  	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1574  		rcu_read_unlock();
1575  		err = -EPERM;
1576  		goto out_put;
1577  	}
1578  	rcu_read_unlock();
1579  
1580  	task_nodes = cpuset_mems_allowed(task);
1581  	/* Is the user allowed to access the target nodes? */
1582  	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1583  		err = -EPERM;
1584  		goto out_put;
1585  	}
1586  
1587  	task_nodes = cpuset_mems_allowed(current);
1588  	nodes_and(*new, *new, task_nodes);
1589  	if (nodes_empty(*new))
1590  		goto out_put;
1591  
1592  	err = security_task_movememory(task);
1593  	if (err)
1594  		goto out_put;
1595  
1596  	mm = get_task_mm(task);
1597  	put_task_struct(task);
1598  
1599  	if (!mm) {
1600  		err = -EINVAL;
1601  		goto out;
1602  	}
1603  
1604  	err = do_migrate_pages(mm, old, new,
1605  		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1606  
1607  	mmput(mm);
1608  out:
1609  	NODEMASK_SCRATCH_FREE(scratch);
1610  
1611  	return err;
1612  
1613  out_put:
1614  	put_task_struct(task);
1615  	goto out;
1616  
1617  }
1618  
1619  SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1620  		const unsigned long __user *, old_nodes,
1621  		const unsigned long __user *, new_nodes)
1622  {
1623  	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1624  }
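/*
 * Illustrative sketch (not part of this file): moving another process's
 * pages from node 0 to node 1 with migrate_pages(2), using the <numaif.h>
 * wrapper.  As enforced above, the caller needs ptrace-level access to the
 * target, and CAP_SYS_NICE to move pages shared with other processes or to
 * target nodes outside the target task's cpuset.
 */
#if 0
#include <numaif.h>

static long move_task_to_node1(int pid)
{
	unsigned long old_nodes = 1UL << 0;	/* migrate from node 0 ... */
	unsigned long new_nodes = 1UL << 1;	/* ... to node 1 */

	/* Returns the number of pages that could not be moved, or -1. */
	return migrate_pages(pid, 8 * sizeof(unsigned long),
			     &old_nodes, &new_nodes);
}
#endif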
1625  
1626  
1627  /* Retrieve NUMA policy */
1628  static int kernel_get_mempolicy(int __user *policy,
1629  				unsigned long __user *nmask,
1630  				unsigned long maxnode,
1631  				unsigned long addr,
1632  				unsigned long flags)
1633  {
1634  	int err;
1635  	int uninitialized_var(pval);
1636  	nodemask_t nodes;
1637  
1638  	addr = untagged_addr(addr);
1639  
1640  	if (nmask != NULL && maxnode < nr_node_ids)
1641  		return -EINVAL;
1642  
1643  	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1644  
1645  	if (err)
1646  		return err;
1647  
1648  	if (policy && put_user(pval, policy))
1649  		return -EFAULT;
1650  
1651  	if (nmask)
1652  		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1653  
1654  	return err;
1655  }
1656  
1657  SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1658  		unsigned long __user *, nmask, unsigned long, maxnode,
1659  		unsigned long, addr, unsigned long, flags)
1660  {
1661  	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1662  }
1663  
1664  #ifdef CONFIG_COMPAT
1665  
1666  COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1667  		       compat_ulong_t __user *, nmask,
1668  		       compat_ulong_t, maxnode,
1669  		       compat_ulong_t, addr, compat_ulong_t, flags)
1670  {
1671  	long err;
1672  	unsigned long __user *nm = NULL;
1673  	unsigned long nr_bits, alloc_size;
1674  	DECLARE_BITMAP(bm, MAX_NUMNODES);
1675  
1676  	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1677  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1678  
1679  	if (nmask)
1680  		nm = compat_alloc_user_space(alloc_size);
1681  
1682  	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1683  
1684  	if (!err && nmask) {
1685  		unsigned long copy_size;
1686  		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1687  		err = copy_from_user(bm, nm, copy_size);
1688  		/* ensure entire bitmap is zeroed */
1689  		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1690  		err |= compat_put_bitmap(nmask, bm, nr_bits);
1691  	}
1692  
1693  	return err;
1694  }
1695  
1696  COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1697  		       compat_ulong_t, maxnode)
1698  {
1699  	unsigned long __user *nm = NULL;
1700  	unsigned long nr_bits, alloc_size;
1701  	DECLARE_BITMAP(bm, MAX_NUMNODES);
1702  
1703  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1704  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1705  
1706  	if (nmask) {
1707  		if (compat_get_bitmap(bm, nmask, nr_bits))
1708  			return -EFAULT;
1709  		nm = compat_alloc_user_space(alloc_size);
1710  		if (copy_to_user(nm, bm, alloc_size))
1711  			return -EFAULT;
1712  	}
1713  
1714  	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1715  }
1716  
1717  COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1718  		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1719  		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1720  {
1721  	unsigned long __user *nm = NULL;
1722  	unsigned long nr_bits, alloc_size;
1723  	nodemask_t bm;
1724  
1725  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1726  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1727  
1728  	if (nmask) {
1729  		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1730  			return -EFAULT;
1731  		nm = compat_alloc_user_space(alloc_size);
1732  		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1733  			return -EFAULT;
1734  	}
1735  
1736  	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1737  }
1738  
1739  COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1740  		       compat_ulong_t, maxnode,
1741  		       const compat_ulong_t __user *, old_nodes,
1742  		       const compat_ulong_t __user *, new_nodes)
1743  {
1744  	unsigned long __user *old = NULL;
1745  	unsigned long __user *new = NULL;
1746  	nodemask_t tmp_mask;
1747  	unsigned long nr_bits;
1748  	unsigned long size;
1749  
1750  	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1751  	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1752  	if (old_nodes) {
1753  		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1754  			return -EFAULT;
1755  		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1756  		if (new_nodes)
1757  			new = old + size / sizeof(unsigned long);
1758  		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1759  			return -EFAULT;
1760  	}
1761  	if (new_nodes) {
1762  		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1763  			return -EFAULT;
1764  		if (new == NULL)
1765  			new = compat_alloc_user_space(size);
1766  		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1767  			return -EFAULT;
1768  	}
1769  	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1770  }
1771  
1772  #endif /* CONFIG_COMPAT */
1773  
1774  bool vma_migratable(struct vm_area_struct *vma)
1775  {
1776  	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1777  		return false;
1778  
1779  	/*
1780  	 * DAX device mappings require predictable access latency, so avoid
1781  	 * incurring periodic faults.
1782  	 */
1783  	if (vma_is_dax(vma))
1784  		return false;
1785  
1786  	if (is_vm_hugetlb_page(vma) &&
1787  		!hugepage_migration_supported(hstate_vma(vma)))
1788  		return false;
1789  
1790  	/*
1791  	 * Migration allocates pages in the highest zone. If we cannot
1792  	 * do so then migration (at least from node to node) is not
1793  	 * possible.
1794  	 */
1795  	if (vma->vm_file &&
1796  		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1797  			< policy_zone)
1798  		return false;
1799  	return true;
1800  }
1801  
1802  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1803  						unsigned long addr)
1804  {
1805  	struct mempolicy *pol = NULL;
1806  
1807  	if (vma) {
1808  		if (vma->vm_ops && vma->vm_ops->get_policy) {
1809  			pol = vma->vm_ops->get_policy(vma, addr);
1810  		} else if (vma->vm_policy) {
1811  			pol = vma->vm_policy;
1812  
1813  			/*
1814  			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1815  			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1816  			 * count on these policies which will be dropped by
1817  			 * mpol_cond_put() later
1818  			 */
1819  			if (mpol_needs_cond_ref(pol))
1820  				mpol_get(pol);
1821  		}
1822  	}
1823  
1824  	return pol;
1825  }
1826  
1827  /*
1828   * get_vma_policy(@vma, @addr)
1829   * @vma: virtual memory area whose policy is sought
1830   * @addr: address in @vma for shared policy lookup
1831   *
1832   * Returns effective policy for a VMA at specified address.
1833   * Falls back to current->mempolicy or system default policy, as necessary.
1834   * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1835   * count--added by the get_policy() vm_op, as appropriate--to protect against
1836   * freeing by another task.  It is the caller's responsibility to free the
1837   * extra reference for shared policies.
1838   */
1839  static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1840  						unsigned long addr)
1841  {
1842  	struct mempolicy *pol = __get_vma_policy(vma, addr);
1843  
1844  	if (!pol)
1845  		pol = get_task_policy(current);
1846  
1847  	return pol;
1848  }
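
/*
 * Typical caller pattern (illustrative sketch): look up the effective
 * policy, consume it, then drop the conditional reference that shared
 * (MPOL_F_SHARED) policies carry.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *	unsigned short mode = pol->mode;
 *	...
 *	mpol_cond_put(pol);	// no-op unless MPOL_F_SHARED is set
 */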
1849  
1850  bool vma_policy_mof(struct vm_area_struct *vma)
1851  {
1852  	struct mempolicy *pol;
1853  
1854  	if (vma->vm_ops && vma->vm_ops->get_policy) {
1855  		bool ret = false;
1856  
1857  		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1858  		if (pol && (pol->flags & MPOL_F_MOF))
1859  			ret = true;
1860  		mpol_cond_put(pol);
1861  
1862  		return ret;
1863  	}
1864  
1865  	pol = vma->vm_policy;
1866  	if (!pol)
1867  		pol = get_task_policy(current);
1868  
1869  	return pol->flags & MPOL_F_MOF;
1870  }
1871  
1872  static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1873  {
1874  	enum zone_type dynamic_policy_zone = policy_zone;
1875  
1876  	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1877  
1878  	/*
1879  	 * If policy->v.nodes has movable memory only,
1880  	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1881  	 *
1882  	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1883  	 * so if the following test fails, it implies that
1884  	 * policy->v.nodes has movable memory only.
1885  	 */
1886  	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1887  		dynamic_policy_zone = ZONE_MOVABLE;
1888  
1889  	return zone >= dynamic_policy_zone;
1890  }
1891  
1892  /*
1893   * Return a nodemask representing a mempolicy for filtering nodes for
1894   * page allocation
1895   */
1896  static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1897  {
1898  	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1899  	if (unlikely(policy->mode == MPOL_BIND) &&
1900  			apply_policy_zone(policy, gfp_zone(gfp)) &&
1901  			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1902  		return &policy->v.nodes;
1903  
1904  	return NULL;
1905  }
1906  
1907  /* Return the node id preferred by the given mempolicy, or the given id */
1908  static int policy_node(gfp_t gfp, struct mempolicy *policy,
1909  								int nd)
1910  {
1911  	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1912  		nd = policy->v.preferred_node;
1913  	else {
1914  		/*
1915  		 * __GFP_THISNODE shouldn't even be used with the bind policy:
1916  		 * the requested node may not be in the policy's nodemask, so
1917  		 * honouring one would break the other.
1918  		 */
1919  		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1920  	}
1921  
1922  	return nd;
1923  }
1924  
1925  /* Do dynamic interleaving for a process */
1926  static unsigned interleave_nodes(struct mempolicy *policy)
1927  {
1928  	unsigned next;
1929  	struct task_struct *me = current;
1930  
1931  	next = next_node_in(me->il_prev, policy->v.nodes);
1932  	if (next < MAX_NUMNODES)
1933  		me->il_prev = next;
1934  	return next;
1935  }
1936  
1937  /*
1938   * Depending on the memory policy provide a node from which to allocate the
1939   * next slab entry.
1940   */
1941  unsigned int mempolicy_slab_node(void)
1942  {
1943  	struct mempolicy *policy;
1944  	int node = numa_mem_id();
1945  
1946  	if (in_interrupt())
1947  		return node;
1948  
1949  	policy = current->mempolicy;
1950  	if (!policy || policy->flags & MPOL_F_LOCAL)
1951  		return node;
1952  
1953  	switch (policy->mode) {
1954  	case MPOL_PREFERRED:
1955  		/*
1956  		 * handled MPOL_F_LOCAL above
1957  		 */
1958  		return policy->v.preferred_node;
1959  
1960  	case MPOL_INTERLEAVE:
1961  		return interleave_nodes(policy);
1962  
1963  	case MPOL_BIND: {
1964  		struct zoneref *z;
1965  
1966  		/*
1967  		 * Follow bind policy behavior and start allocation at the
1968  		 * first node.
1969  		 */
1970  		struct zonelist *zonelist;
1971  		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1972  		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1973  		z = first_zones_zonelist(zonelist, highest_zoneidx,
1974  							&policy->v.nodes);
1975  		return z->zone ? zone_to_nid(z->zone) : node;
1976  	}
1977  
1978  	default:
1979  		BUG();
1980  	}
1981  }
1982  
1983  /*
1984   * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1985   * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1986   * number of present nodes.
1987   */
1988  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1989  {
1990  	unsigned nnodes = nodes_weight(pol->v.nodes);
1991  	unsigned target;
1992  	int i;
1993  	int nid;
1994  
1995  	if (!nnodes)
1996  		return numa_node_id();
1997  	target = (unsigned int)n % nnodes;
1998  	nid = first_node(pol->v.nodes);
1999  	for (i = 0; i < target; i++)
2000  		nid = next_node(nid, pol->v.nodes);
2001  	return nid;
2002  }
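
/*
 * Worked example: with pol->v.nodes = {0,2,5} (nnodes == 3) and n == 7,
 * target = 7 % 3 = 1, so the walk starts at node 0 and advances once,
 * returning node 2.
 */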
2003  
2004  /* Determine a node number for interleave */
2005  static inline unsigned interleave_nid(struct mempolicy *pol,
2006  		 struct vm_area_struct *vma, unsigned long addr, int shift)
2007  {
2008  	if (vma) {
2009  		unsigned long off;
2010  
2011  		/*
2012  		 * for small pages, there is no difference between
2013  		 * shift and PAGE_SHIFT, so the bit-shift is safe.
2014  		 * for huge pages, since vm_pgoff is in units of small
2015  		 * pages, we need to shift off the always 0 bits to get
2016  		 * a useful offset.
2017  		 */
2018  		BUG_ON(shift < PAGE_SHIFT);
2019  		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
2020  		off += (addr - vma->vm_start) >> shift;
2021  		return offset_il_node(pol, off);
2022  	} else
2023  		return interleave_nodes(pol);
2024  }
2025  
2026  #ifdef CONFIG_HUGETLBFS
2027  /*
2028   * huge_node(@vma, @addr, @gfp_flags, @mpol)
2029   * @vma: virtual memory area whose policy is sought
2030   * @addr: address in @vma for shared policy lookup and interleave policy
2031   * @gfp_flags: for requested zone
2032   * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2033   * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2034   *
2035   * Returns a nid suitable for a huge page allocation and a pointer
2036   * to the struct mempolicy for conditional unref after allocation.
2037   * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
2038   * @nodemask for filtering the zonelist.
2039   *
2040   * Must be protected by read_mems_allowed_begin()
2041   */
2042  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2043  				struct mempolicy **mpol, nodemask_t **nodemask)
2044  {
2045  	int nid;
2046  
2047  	*mpol = get_vma_policy(vma, addr);
2048  	*nodemask = NULL;	/* assume !MPOL_BIND */
2049  
2050  	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2051  		nid = interleave_nid(*mpol, vma, addr,
2052  					huge_page_shift(hstate_vma(vma)));
2053  	} else {
2054  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2055  		if ((*mpol)->mode == MPOL_BIND)
2056  			*nodemask = &(*mpol)->v.nodes;
2057  	}
2058  	return nid;
2059  }
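
/*
 * Sketch of the expected caller pattern (illustrative; the surrounding
 * hugetlb allocation helpers and gfp_mask are omitted/assumed):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	// allocate a huge page on nid, filtered by nodemask when non-NULL
 *	mpol_cond_put(mpol);	// drop the reference taken by get_vma_policy()
 */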
2060  
2061  /*
2062   * init_nodemask_of_mempolicy
2063   *
2064   * If the current task's mempolicy is "default" [NULL], return 'false'
2065   * to indicate default policy.  Otherwise, extract the policy nodemask
2066   * for 'bind' or 'interleave' policy into the argument nodemask, or
2067   * initialize the argument nodemask to contain the single node for
2068   * 'preferred' or 'local' policy and return 'true' to indicate presence
2069   * of non-default mempolicy.
2070   *
2071   * We don't bother with reference counting the mempolicy [mpol_get/put]
2072   * because the current task is examining its own mempolicy and a task's
2073   * mempolicy is only ever changed by the task itself.
2074   *
2075   * N.B., it is the caller's responsibility to free a returned nodemask.
2076   */
2077  bool init_nodemask_of_mempolicy(nodemask_t *mask)
2078  {
2079  	struct mempolicy *mempolicy;
2080  	int nid;
2081  
2082  	if (!(mask && current->mempolicy))
2083  		return false;
2084  
2085  	task_lock(current);
2086  	mempolicy = current->mempolicy;
2087  	switch (mempolicy->mode) {
2088  	case MPOL_PREFERRED:
2089  		if (mempolicy->flags & MPOL_F_LOCAL)
2090  			nid = numa_node_id();
2091  		else
2092  			nid = mempolicy->v.preferred_node;
2093  		init_nodemask_of_node(mask, nid);
2094  		break;
2095  
2096  	case MPOL_BIND:
2097  	case MPOL_INTERLEAVE:
2098  		*mask =  mempolicy->v.nodes;
2099  		break;
2100  
2101  	default:
2102  		BUG();
2103  	}
2104  	task_unlock(current);
2105  
2106  	return true;
2107  }
2108  #endif
2109  
2110  /*
2111   * mempolicy_nodemask_intersects
2112   *
2113   * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2114   * policy.  Otherwise, check for intersection between mask and the policy
2115   * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2116   * policy, always return true since it may allocate elsewhere on fallback.
2117   *
2118   * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2119   */
2120  bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2121  					const nodemask_t *mask)
2122  {
2123  	struct mempolicy *mempolicy;
2124  	bool ret = true;
2125  
2126  	if (!mask)
2127  		return ret;
2128  	task_lock(tsk);
2129  	mempolicy = tsk->mempolicy;
2130  	if (!mempolicy)
2131  		goto out;
2132  
2133  	switch (mempolicy->mode) {
2134  	case MPOL_PREFERRED:
2135  		/*
2136  		 * MPOL_PREFERRED and MPOL_F_LOCAL only express a preference for
2137  		 * which nodes to allocate from; the task may fall back to other
2138  		 * nodes under OOM.  Thus, it's possible for tsk to have
2139  		 * allocated memory from nodes in mask.
2140  		 */
2141  		break;
2142  	case MPOL_BIND:
2143  	case MPOL_INTERLEAVE:
2144  		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2145  		break;
2146  	default:
2147  		BUG();
2148  	}
2149  out:
2150  	task_unlock(tsk);
2151  	return ret;
2152  }
2153  
2154  /* Allocate a page in interleaved policy.
2155     Own path because it needs to do special accounting. */
2156  static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2157  					unsigned nid)
2158  {
2159  	struct page *page;
2160  
2161  	page = __alloc_pages(gfp, order, nid);
2162  	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2163  	if (!static_branch_likely(&vm_numa_stat_key))
2164  		return page;
2165  	if (page && page_to_nid(page) == nid) {
2166  		preempt_disable();
2167  		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2168  		preempt_enable();
2169  	}
2170  	return page;
2171  }
2172  
2173  /**
2174   * 	alloc_pages_vma	- Allocate a page for a VMA.
2175   *
2176   * 	@gfp:
2177   *      %GFP_USER    user allocation.
2178   *      %GFP_KERNEL  kernel allocations,
2179   *      %GFP_HIGHMEM highmem/user allocations,
2180   *      %GFP_FS      allocation should not call back into a file system.
2181   *      %GFP_ATOMIC  don't sleep.
2182   *
2183   *	@order: Order of the GFP allocation.
2184   * 	@vma:  Pointer to VMA or NULL if not available.
2185   *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2186   *	@node: Which node to prefer for allocation (modulo policy).
2187   *	@hugepage: for hugepages try only the preferred node if possible
2188   *
2189   * 	This function allocates a page from the kernel page pool and applies
2190   *	a NUMA policy associated with the VMA or the current process.
2191   *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2192   *	mm_struct of the VMA to prevent it from going away. Should be used for
2193   *	all allocations for pages that will be mapped into user space. Returns
2194   *	NULL when no page can be allocated.
2195   */
2196  struct page *
2197  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2198  		unsigned long addr, int node, bool hugepage)
2199  {
2200  	struct mempolicy *pol;
2201  	struct page *page;
2202  	int preferred_nid;
2203  	nodemask_t *nmask;
2204  
2205  	pol = get_vma_policy(vma, addr);
2206  
2207  	if (pol->mode == MPOL_INTERLEAVE) {
2208  		unsigned nid;
2209  
2210  		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2211  		mpol_cond_put(pol);
2212  		page = alloc_page_interleave(gfp, order, nid);
2213  		goto out;
2214  	}
2215  
2216  	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2217  		int hpage_node = node;
2218  
2219  		/*
2220  		 * For hugepage allocation and non-interleave policy which
2221  		 * allows the current node (or other explicitly preferred
2222  		 * node) we only try to allocate from the current/preferred
2223  		 * node and don't fall back to other nodes, as the cost of
2224  		 * remote accesses would likely offset THP benefits.
2225  		 *
2226  		 * If the policy is interleave, or does not allow the current
2227  		 * node in its nodemask, we allocate the standard way.
2228  		 */
2229  		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2230  			hpage_node = pol->v.preferred_node;
2231  
2232  		nmask = policy_nodemask(gfp, pol);
2233  		if (!nmask || node_isset(hpage_node, *nmask)) {
2234  			mpol_cond_put(pol);
2235  			/*
2236  			 * First, try to allocate THP only on local node, but
2237  			 * don't reclaim unnecessarily, just compact.
2238  			 */
2239  			page = __alloc_pages_node(hpage_node,
2240  				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2241  
2242  			/*
2243  			 * If hugepage allocations are configured to always use
2244  			 * synchronous compaction, or the VMA has been madvised
2245  			 * to prefer hugepage backing, retry allowing remote
2246  			 * memory with both reclaim and compaction as well.
2247  			 */
2248  			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2249  				page = __alloc_pages_node(hpage_node,
2250  								gfp, order);
2251  
2252  			goto out;
2253  		}
2254  	}
2255  
2256  	nmask = policy_nodemask(gfp, pol);
2257  	preferred_nid = policy_node(gfp, pol, node);
2258  	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2259  	mpol_cond_put(pol);
2260  out:
2261  	return page;
2262  }
2263  EXPORT_SYMBOL(alloc_pages_vma);
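
/*
 * Usage sketch (illustrative, simplified from a typical anonymous-fault
 * path):
 *
 *	struct page *page;
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */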
2264  
2265  /**
2266   * 	alloc_pages_current - Allocate pages.
2267   *
2268   *	@gfp:
2269   *		%GFP_USER   user allocation,
2270   *      	%GFP_KERNEL kernel allocation,
2271   *      	%GFP_HIGHMEM highmem allocation,
2272   *      	%GFP_FS     don't call back into a file system.
2273   *      	%GFP_ATOMIC don't sleep.
2274   *	@order: Power of two of allocation size in pages. 0 is a single page.
2275   *
2276   *	Allocate a page from the kernel page pool.  When not in
2277   *	interrupt context, apply the current process' NUMA policy.
2278   *	Returns NULL when no page can be allocated.
2279   */
2280  struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2281  {
2282  	struct mempolicy *pol = &default_policy;
2283  	struct page *page;
2284  
2285  	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2286  		pol = get_task_policy(current);
2287  
2288  	/*
2289  	 * No reference counting needed for current->mempolicy
2290  	 * nor system default_policy
2291  	 */
2292  	if (pol->mode == MPOL_INTERLEAVE)
2293  		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2294  	else
2295  		page = __alloc_pages_nodemask(gfp, order,
2296  				policy_node(gfp, pol, numa_node_id()),
2297  				policy_nodemask(gfp, pol));
2298  
2299  	return page;
2300  }
2301  EXPORT_SYMBOL(alloc_pages_current);
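
/*
 * On CONFIG_NUMA kernels the generic alloc_pages() wrapper resolves to
 * alloc_pages_current(), so an ordinary process-context allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * already honours the caller's task mempolicy (interleaving included).
 */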
2302  
2303  int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2304  {
2305  	struct mempolicy *pol = mpol_dup(vma_policy(src));
2306  
2307  	if (IS_ERR(pol))
2308  		return PTR_ERR(pol);
2309  	dst->vm_policy = pol;
2310  	return 0;
2311  }
2312  
2313  /*
2314   * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2315   * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2316   * with the mems_allowed returned by cpuset_mems_allowed().  This
2317   * keeps mempolicies cpuset-relative after its cpuset moves.  See
2318   * further kernel/cpuset.c update_nodemask().
2319   *
2320   * current's mempolicy may be rebound by another task (the task that changes
2321   * the cpuset's mems), so we needn't do the rebind work for the current task.
2322   */
2323  
2324  /* Slow path of a mempolicy duplicate */
2325  struct mempolicy *__mpol_dup(struct mempolicy *old)
2326  {
2327  	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2328  
2329  	if (!new)
2330  		return ERR_PTR(-ENOMEM);
2331  
2332  	/* task's mempolicy is protected by alloc_lock */
2333  	if (old == current->mempolicy) {
2334  		task_lock(current);
2335  		*new = *old;
2336  		task_unlock(current);
2337  	} else
2338  		*new = *old;
2339  
2340  	if (current_cpuset_is_being_rebound()) {
2341  		nodemask_t mems = cpuset_mems_allowed(current);
2342  		mpol_rebind_policy(new, &mems);
2343  	}
2344  	atomic_set(&new->refcnt, 1);
2345  	return new;
2346  }
2347  
2348  /* Slow path of a mempolicy comparison */
2349  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2350  {
2351  	if (!a || !b)
2352  		return false;
2353  	if (a->mode != b->mode)
2354  		return false;
2355  	if (a->flags != b->flags)
2356  		return false;
2357  	if (mpol_store_user_nodemask(a))
2358  		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2359  			return false;
2360  
2361  	switch (a->mode) {
2362  	case MPOL_BIND:
2363  	case MPOL_INTERLEAVE:
2364  		return !!nodes_equal(a->v.nodes, b->v.nodes);
2365  	case MPOL_PREFERRED:
2366  		/* a's ->flags is the same as b's */
2367  		if (a->flags & MPOL_F_LOCAL)
2368  			return true;
2369  		return a->v.preferred_node == b->v.preferred_node;
2370  	default:
2371  		BUG();
2372  		return false;
2373  	}
2374  }
2375  
2376  /*
2377   * Shared memory backing store policy support.
2378   *
2379   * Remember policies even when nobody has shared memory mapped.
2380   * The policies are kept in Red-Black tree linked from the inode.
2381   * They are protected by the sp->lock rwlock, which should be held
2382   * for any accesses to the tree.
2383   */
2384  
2385  /*
2386   * lookup first element intersecting start-end.  Caller holds sp->lock for
2387   * reading or for writing
2388   */
2389  static struct sp_node *
2390  sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2391  {
2392  	struct rb_node *n = sp->root.rb_node;
2393  
2394  	while (n) {
2395  		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2396  
2397  		if (start >= p->end)
2398  			n = n->rb_right;
2399  		else if (end <= p->start)
2400  			n = n->rb_left;
2401  		else
2402  			break;
2403  	}
2404  	if (!n)
2405  		return NULL;
2406  	for (;;) {
2407  		struct sp_node *w = NULL;
2408  		struct rb_node *prev = rb_prev(n);
2409  		if (!prev)
2410  			break;
2411  		w = rb_entry(prev, struct sp_node, nd);
2412  		if (w->end <= start)
2413  			break;
2414  		n = prev;
2415  	}
2416  	return rb_entry(n, struct sp_node, nd);
2417  }
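
/*
 * Worked example: with nodes covering [0,4), [4,10) and [12,16), a lookup
 * for [2,6) first descends to an overlapping node and then walks rb_prev()
 * while the predecessor still overlaps, returning the leftmost intersecting
 * node, i.e. the one covering [0,4).
 */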
2418  
2419  /*
2420   * Insert a new shared policy into the list.  Caller holds sp->lock for
2421   * writing.
2422   */
2423  static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2424  {
2425  	struct rb_node **p = &sp->root.rb_node;
2426  	struct rb_node *parent = NULL;
2427  	struct sp_node *nd;
2428  
2429  	while (*p) {
2430  		parent = *p;
2431  		nd = rb_entry(parent, struct sp_node, nd);
2432  		if (new->start < nd->start)
2433  			p = &(*p)->rb_left;
2434  		else if (new->end > nd->end)
2435  			p = &(*p)->rb_right;
2436  		else
2437  			BUG();
2438  	}
2439  	rb_link_node(&new->nd, parent, p);
2440  	rb_insert_color(&new->nd, &sp->root);
2441  	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2442  		 new->policy ? new->policy->mode : 0);
2443  }
2444  
2445  /* Find shared policy intersecting idx */
2446  struct mempolicy *
2447  mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2448  {
2449  	struct mempolicy *pol = NULL;
2450  	struct sp_node *sn;
2451  
2452  	if (!sp->root.rb_node)
2453  		return NULL;
2454  	read_lock(&sp->lock);
2455  	sn = sp_lookup(sp, idx, idx+1);
2456  	if (sn) {
2457  		mpol_get(sn->policy);
2458  		pol = sn->policy;
2459  	}
2460  	read_unlock(&sp->lock);
2461  	return pol;
2462  }
2463  
2464  static void sp_free(struct sp_node *n)
2465  {
2466  	mpol_put(n->policy);
2467  	kmem_cache_free(sn_cache, n);
2468  }
2469  
2470  /**
2471   * mpol_misplaced - check whether current page node is valid in policy
2472   *
2473   * @page: page to be checked
2474   * @vma: vm area where page mapped
2475   * @addr: virtual address where page mapped
2476   *
2477   * Look up the current policy node id for vma/addr and compare it to the
2478   * page's node id.
2479   *
2480   * Returns:
2481   *	-1	- not misplaced, page is in the right node
2482   *	node	- node id where the page should be
2483   *
2484   * Policy determination "mimics" alloc_page_vma().
2485   * Called from fault path where we know the vma and faulting address.
2486   */
2487  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2488  {
2489  	struct mempolicy *pol;
2490  	struct zoneref *z;
2491  	int curnid = page_to_nid(page);
2492  	unsigned long pgoff;
2493  	int thiscpu = raw_smp_processor_id();
2494  	int thisnid = cpu_to_node(thiscpu);
2495  	int polnid = NUMA_NO_NODE;
2496  	int ret = -1;
2497  
2498  	pol = get_vma_policy(vma, addr);
2499  	if (!(pol->flags & MPOL_F_MOF))
2500  		goto out;
2501  
2502  	switch (pol->mode) {
2503  	case MPOL_INTERLEAVE:
2504  		pgoff = vma->vm_pgoff;
2505  		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2506  		polnid = offset_il_node(pol, pgoff);
2507  		break;
2508  
2509  	case MPOL_PREFERRED:
2510  		if (pol->flags & MPOL_F_LOCAL)
2511  			polnid = numa_node_id();
2512  		else
2513  			polnid = pol->v.preferred_node;
2514  		break;
2515  
2516  	case MPOL_BIND:
2517  
2518  		/*
2519  		 * allows binding to multiple nodes.
2520  		 * use current page if in policy nodemask,
2521  		 * else select nearest allowed node, if any.
2522  		 * If no allowed nodes, use current [!misplaced].
2523  		 */
2524  		if (node_isset(curnid, pol->v.nodes))
2525  			goto out;
2526  		z = first_zones_zonelist(
2527  				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2528  				gfp_zone(GFP_HIGHUSER),
2529  				&pol->v.nodes);
2530  		polnid = zone_to_nid(z->zone);
2531  		break;
2532  
2533  	default:
2534  		BUG();
2535  	}
2536  
2537  	/* Migrate the page towards the node whose CPU is referencing it */
2538  	if (pol->flags & MPOL_F_MORON) {
2539  		polnid = thisnid;
2540  
2541  		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2542  			goto out;
2543  	}
2544  
2545  	if (curnid != polnid)
2546  		ret = polnid;
2547  out:
2548  	mpol_cond_put(pol);
2549  
2550  	return ret;
2551  }
2552  
2553  /*
2554   * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2555   * dropped after task->mempolicy is set to NULL so that any allocation done as
2556   * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2557   * policy.
2558   */
2559  void mpol_put_task_policy(struct task_struct *task)
2560  {
2561  	struct mempolicy *pol;
2562  
2563  	task_lock(task);
2564  	pol = task->mempolicy;
2565  	task->mempolicy = NULL;
2566  	task_unlock(task);
2567  	mpol_put(pol);
2568  }
2569  
2570  static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2571  {
2572  	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2573  	rb_erase(&n->nd, &sp->root);
2574  	sp_free(n);
2575  }
2576  
2577  static void sp_node_init(struct sp_node *node, unsigned long start,
2578  			unsigned long end, struct mempolicy *pol)
2579  {
2580  	node->start = start;
2581  	node->end = end;
2582  	node->policy = pol;
2583  }
2584  
2585  static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2586  				struct mempolicy *pol)
2587  {
2588  	struct sp_node *n;
2589  	struct mempolicy *newpol;
2590  
2591  	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2592  	if (!n)
2593  		return NULL;
2594  
2595  	newpol = mpol_dup(pol);
2596  	if (IS_ERR(newpol)) {
2597  		kmem_cache_free(sn_cache, n);
2598  		return NULL;
2599  	}
2600  	newpol->flags |= MPOL_F_SHARED;
2601  	sp_node_init(n, start, end, newpol);
2602  
2603  	return n;
2604  }
2605  
2606  /* Replace a policy range. */
2607  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2608  				 unsigned long end, struct sp_node *new)
2609  {
2610  	struct sp_node *n;
2611  	struct sp_node *n_new = NULL;
2612  	struct mempolicy *mpol_new = NULL;
2613  	int ret = 0;
2614  
2615  restart:
2616  	write_lock(&sp->lock);
2617  	n = sp_lookup(sp, start, end);
2618  	/* Take care of old policies in the same range. */
2619  	while (n && n->start < end) {
2620  		struct rb_node *next = rb_next(&n->nd);
2621  		if (n->start >= start) {
2622  			if (n->end <= end)
2623  				sp_delete(sp, n);
2624  			else
2625  				n->start = end;
2626  		} else {
2627  			/* Old policy spanning whole new range. */
2628  			if (n->end > end) {
2629  				if (!n_new)
2630  					goto alloc_new;
2631  
2632  				*mpol_new = *n->policy;
2633  				atomic_set(&mpol_new->refcnt, 1);
2634  				sp_node_init(n_new, end, n->end, mpol_new);
2635  				n->end = start;
2636  				sp_insert(sp, n_new);
2637  				n_new = NULL;
2638  				mpol_new = NULL;
2639  				break;
2640  			} else
2641  				n->end = start;
2642  		}
2643  		if (!next)
2644  			break;
2645  		n = rb_entry(next, struct sp_node, nd);
2646  	}
2647  	if (new)
2648  		sp_insert(sp, new);
2649  	write_unlock(&sp->lock);
2650  	ret = 0;
2651  
2652  err_out:
2653  	if (mpol_new)
2654  		mpol_put(mpol_new);
2655  	if (n_new)
2656  		kmem_cache_free(sn_cache, n_new);
2657  
2658  	return ret;
2659  
2660  alloc_new:
2661  	write_unlock(&sp->lock);
2662  	ret = -ENOMEM;
2663  	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2664  	if (!n_new)
2665  		goto err_out;
2666  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2667  	if (!mpol_new)
2668  		goto err_out;
2669  	goto restart;
2670  }
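
/*
 * Worked example: if the tree holds one node covering [0,10) and @new spans
 * [3,5), the old node is trimmed to [0,3), a copy of its policy is inserted
 * for [5,10) using the preallocated n_new/mpol_new pair, and @new is then
 * inserted for [3,5).
 */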
2671  
2672  /**
2673   * mpol_shared_policy_init - initialize shared policy for inode
2674   * @sp: pointer to inode shared policy
2675   * @mpol:  struct mempolicy to install
2676   *
2677   * Install non-NULL @mpol in inode's shared policy rb-tree.
2678   * On entry, the current task has a reference on a non-NULL @mpol.
2679   * This must be released on exit.
2680   * This is called from get_inode(), so GFP_KERNEL can be used.
2681   */
2682  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2683  {
2684  	int ret;
2685  
2686  	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2687  	rwlock_init(&sp->lock);
2688  
2689  	if (mpol) {
2690  		struct vm_area_struct pvma;
2691  		struct mempolicy *new;
2692  		NODEMASK_SCRATCH(scratch);
2693  
2694  		if (!scratch)
2695  			goto put_mpol;
2696  		/* contextualize the tmpfs mount point mempolicy */
2697  		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2698  		if (IS_ERR(new))
2699  			goto free_scratch; /* no valid nodemask intersection */
2700  
2701  		task_lock(current);
2702  		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2703  		task_unlock(current);
2704  		if (ret)
2705  			goto put_new;
2706  
2707  		/* Create pseudo-vma that contains just the policy */
2708  		vma_init(&pvma, NULL);
2709  		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2710  		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2711  
2712  put_new:
2713  		mpol_put(new);			/* drop initial ref */
2714  free_scratch:
2715  		NODEMASK_SCRATCH_FREE(scratch);
2716  put_mpol:
2717  		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2718  	}
2719  }
2720  
2721  int mpol_set_shared_policy(struct shared_policy *info,
2722  			struct vm_area_struct *vma, struct mempolicy *npol)
2723  {
2724  	int err;
2725  	struct sp_node *new = NULL;
2726  	unsigned long sz = vma_pages(vma);
2727  
2728  	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2729  		 vma->vm_pgoff,
2730  		 sz, npol ? npol->mode : -1,
2731  		 npol ? npol->flags : -1,
2732  		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2733  
2734  	if (npol) {
2735  		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2736  		if (!new)
2737  			return -ENOMEM;
2738  	}
2739  	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2740  	if (err && new)
2741  		sp_free(new);
2742  	return err;
2743  }
2744  
2745  /* Free a backing policy store on inode delete. */
2746  void mpol_free_shared_policy(struct shared_policy *p)
2747  {
2748  	struct sp_node *n;
2749  	struct rb_node *next;
2750  
2751  	if (!p->root.rb_node)
2752  		return;
2753  	write_lock(&p->lock);
2754  	next = rb_first(&p->root);
2755  	while (next) {
2756  		n = rb_entry(next, struct sp_node, nd);
2757  		next = rb_next(&n->nd);
2758  		sp_delete(p, n);
2759  	}
2760  	write_unlock(&p->lock);
2761  }
2762  
2763  #ifdef CONFIG_NUMA_BALANCING
2764  static int __initdata numabalancing_override;
2765  
2766  static void __init check_numabalancing_enable(void)
2767  {
2768  	bool numabalancing_default = false;
2769  
2770  	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2771  		numabalancing_default = true;
2772  
2773  	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2774  	if (numabalancing_override)
2775  		set_numabalancing_state(numabalancing_override == 1);
2776  
2777  	if (num_online_nodes() > 1 && !numabalancing_override) {
2778  		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2779  			numabalancing_default ? "Enabling" : "Disabling");
2780  		set_numabalancing_state(numabalancing_default);
2781  	}
2782  }
2783  
2784  static int __init setup_numabalancing(char *str)
2785  {
2786  	int ret = 0;
2787  	if (!str)
2788  		goto out;
2789  
2790  	if (!strcmp(str, "enable")) {
2791  		numabalancing_override = 1;
2792  		ret = 1;
2793  	} else if (!strcmp(str, "disable")) {
2794  		numabalancing_override = -1;
2795  		ret = 1;
2796  	}
2797  out:
2798  	if (!ret)
2799  		pr_warn("Unable to parse numa_balancing=\n");
2800  
2801  	return ret;
2802  }
2803  __setup("numa_balancing=", setup_numabalancing);
2804  #else
2805  static inline void __init check_numabalancing_enable(void)
2806  {
2807  }
2808  #endif /* CONFIG_NUMA_BALANCING */
2809  
2810  /* assumes fs == KERNEL_DS */
2811  void __init numa_policy_init(void)
2812  {
2813  	nodemask_t interleave_nodes;
2814  	unsigned long largest = 0;
2815  	int nid, prefer = 0;
2816  
2817  	policy_cache = kmem_cache_create("numa_policy",
2818  					 sizeof(struct mempolicy),
2819  					 0, SLAB_PANIC, NULL);
2820  
2821  	sn_cache = kmem_cache_create("shared_policy_node",
2822  				     sizeof(struct sp_node),
2823  				     0, SLAB_PANIC, NULL);
2824  
2825  	for_each_node(nid) {
2826  		preferred_node_policy[nid] = (struct mempolicy) {
2827  			.refcnt = ATOMIC_INIT(1),
2828  			.mode = MPOL_PREFERRED,
2829  			.flags = MPOL_F_MOF | MPOL_F_MORON,
2830  			.v = { .preferred_node = nid, },
2831  		};
2832  	}
2833  
2834  	/*
2835  	 * Set interleaving policy for system init. Interleaving is only
2836  	 * enabled across suitably sized nodes (default is >= 16MB), falling
2837  	 * back to the largest node if they're all smaller.
2838  	 */
2839  	nodes_clear(interleave_nodes);
2840  	for_each_node_state(nid, N_MEMORY) {
2841  		unsigned long total_pages = node_present_pages(nid);
2842  
2843  		/* Preserve the largest node */
2844  		if (largest < total_pages) {
2845  			largest = total_pages;
2846  			prefer = nid;
2847  		}
2848  
2849  		/* Interleave this node? */
2850  		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2851  			node_set(nid, interleave_nodes);
2852  	}
2853  
2854  	/* All too small, use the largest */
2855  	if (unlikely(nodes_empty(interleave_nodes)))
2856  		node_set(prefer, interleave_nodes);
2857  
2858  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2859  		pr_err("%s: interleaving failed\n", __func__);
2860  
2861  	check_numabalancing_enable();
2862  }
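
/*
 * Worked example for the threshold above: a node with 2048 present 4KiB
 * pages amounts to 8MiB, which is below 16MiB, so it is excluded from the
 * boot-time interleave set unless every node is that small, in which case
 * the largest node is used on its own.
 */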
2863  
2864  /* Reset policy of current process to default */
2865  void numa_default_policy(void)
2866  {
2867  	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2868  }
2869  
2870  /*
2871   * Parse and format mempolicy from/to strings
2872   */
2873  
2874  /*
2875   * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2876   */
2877  static const char * const policy_modes[] =
2878  {
2879  	[MPOL_DEFAULT]    = "default",
2880  	[MPOL_PREFERRED]  = "prefer",
2881  	[MPOL_BIND]       = "bind",
2882  	[MPOL_INTERLEAVE] = "interleave",
2883  	[MPOL_LOCAL]      = "local",
2884  };
2885  
2886  
2887  #ifdef CONFIG_TMPFS
2888  /**
2889   * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2890   * @str:  string containing mempolicy to parse
2891   * @mpol:  pointer to struct mempolicy pointer, returned on success.
2892   *
2893   * Format of input:
2894   *	<mode>[=<flags>][:<nodelist>]
2895   *
2896   * On success, returns 0, else 1
2897   */
2898  int mpol_parse_str(char *str, struct mempolicy **mpol)
2899  {
2900  	struct mempolicy *new = NULL;
2901  	unsigned short mode_flags;
2902  	nodemask_t nodes;
2903  	char *nodelist = strchr(str, ':');
2904  	char *flags = strchr(str, '=');
2905  	int err = 1, mode;
2906  
2907  	if (flags)
2908  		*flags++ = '\0';	/* terminate mode string */
2909  
2910  	if (nodelist) {
2911  		/* NUL-terminate mode or flags string */
2912  		*nodelist++ = '\0';
2913  		if (nodelist_parse(nodelist, nodes))
2914  			goto out;
2915  		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2916  			goto out;
2917  	} else
2918  		nodes_clear(nodes);
2919  
2920  	mode = match_string(policy_modes, MPOL_MAX, str);
2921  	if (mode < 0)
2922  		goto out;
2923  
2924  	switch (mode) {
2925  	case MPOL_PREFERRED:
2926  		/*
2927  		 * Insist on a nodelist of one node only, although later
2928  		 * we use first_node(nodes) to grab a single node, so here
2929  		 * nodelist (or nodes) cannot be empty.
2930  		 */
2931  		if (nodelist) {
2932  			char *rest = nodelist;
2933  			while (isdigit(*rest))
2934  				rest++;
2935  			if (*rest)
2936  				goto out;
2937  			if (nodes_empty(nodes))
2938  				goto out;
2939  		}
2940  		break;
2941  	case MPOL_INTERLEAVE:
2942  		/*
2943  		 * Default to online nodes with memory if no nodelist
2944  		 */
2945  		if (!nodelist)
2946  			nodes = node_states[N_MEMORY];
2947  		break;
2948  	case MPOL_LOCAL:
2949  		/*
2950  		 * Don't allow a nodelist;  mpol_new() checks flags
2951  		 */
2952  		if (nodelist)
2953  			goto out;
2954  		mode = MPOL_PREFERRED;
2955  		break;
2956  	case MPOL_DEFAULT:
2957  		/*
2958  		 * Insist on an empty nodelist
2959  		 */
2960  		if (!nodelist)
2961  			err = 0;
2962  		goto out;
2963  	case MPOL_BIND:
2964  		/*
2965  		 * Insist on a nodelist
2966  		 */
2967  		if (!nodelist)
2968  			goto out;
2969  	}
2970  
2971  	mode_flags = 0;
2972  	if (flags) {
2973  		/*
2974  		 * Currently, we only support two mutually exclusive
2975  		 * mode flags.
2976  		 */
2977  		if (!strcmp(flags, "static"))
2978  			mode_flags |= MPOL_F_STATIC_NODES;
2979  		else if (!strcmp(flags, "relative"))
2980  			mode_flags |= MPOL_F_RELATIVE_NODES;
2981  		else
2982  			goto out;
2983  	}
2984  
2985  	new = mpol_new(mode, mode_flags, &nodes);
2986  	if (IS_ERR(new))
2987  		goto out;
2988  
2989  	/*
2990  	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2991  	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2992  	 */
2993  	if (mode != MPOL_PREFERRED)
2994  		new->v.nodes = nodes;
2995  	else if (nodelist)
2996  		new->v.preferred_node = first_node(nodes);
2997  	else
2998  		new->flags |= MPOL_F_LOCAL;
2999  
3000  	/*
3001  	 * Save nodes for contextualization: this will be used to "clone"
3002  	 * the mempolicy in a specific context [cpuset] at a later time.
3003  	 */
3004  	new->w.user_nodemask = nodes;
3005  
3006  	err = 0;
3007  
3008  out:
3009  	/* Restore string for error message */
3010  	if (nodelist)
3011  		*--nodelist = ':';
3012  	if (flags)
3013  		*--flags = '=';
3014  	if (!err)
3015  		*mpol = new;
3016  	return err;
3017  }
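
/*
 * Example option strings accepted here (illustrative):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:0,2"	MPOL_BIND with MPOL_F_STATIC_NODES on nodes 0 and 2
 *	"prefer:1"		MPOL_PREFERRED with preferred_node == 1
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *
 * and a minimal in-kernel sketch of a caller (the string is modified in
 * place, so it must be writable; use_policy() is a hypothetical consumer):
 *
 *	struct mempolicy *mpol;
 *	char buf[] = "interleave:0-3";
 *
 *	if (!mpol_parse_str(buf, &mpol))
 *		use_policy(mpol);
 */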
3018  #endif /* CONFIG_TMPFS */
3019  
3020  /**
3021   * mpol_to_str - format a mempolicy structure for printing
3022   * @buffer:  to contain formatted mempolicy string
3023   * @maxlen:  length of @buffer
3024   * @pol:  pointer to mempolicy to be formatted
3025   *
3026   * Convert @pol into a string.  If @buffer is too short, truncate the string.
3027   * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3028   * longest flag, "relative", and to display at least a few node ids.
3029   */
3030  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3031  {
3032  	char *p = buffer;
3033  	nodemask_t nodes = NODE_MASK_NONE;
3034  	unsigned short mode = MPOL_DEFAULT;
3035  	unsigned short flags = 0;
3036  
3037  	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3038  		mode = pol->mode;
3039  		flags = pol->flags;
3040  	}
3041  
3042  	switch (mode) {
3043  	case MPOL_DEFAULT:
3044  		break;
3045  	case MPOL_PREFERRED:
3046  		if (flags & MPOL_F_LOCAL)
3047  			mode = MPOL_LOCAL;
3048  		else
3049  			node_set(pol->v.preferred_node, nodes);
3050  		break;
3051  	case MPOL_BIND:
3052  	case MPOL_INTERLEAVE:
3053  		nodes = pol->v.nodes;
3054  		break;
3055  	default:
3056  		WARN_ON_ONCE(1);
3057  		snprintf(p, maxlen, "unknown");
3058  		return;
3059  	}
3060  
3061  	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3062  
3063  	if (flags & MPOL_MODE_FLAGS) {
3064  		p += snprintf(p, buffer + maxlen - p, "=");
3065  
3066  		/*
3067  		 * Currently, the only defined flags are mutually exclusive
3068  		 */
3069  		if (flags & MPOL_F_STATIC_NODES)
3070  			p += snprintf(p, buffer + maxlen - p, "static");
3071  		else if (flags & MPOL_F_RELATIVE_NODES)
3072  			p += snprintf(p, buffer + maxlen - p, "relative");
3073  	}
3074  
3075  	if (!nodes_empty(nodes))
3076  		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3077  			       nodemask_pr_args(&nodes));
3078  }
3079  }
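
/*
 * Example outputs (illustrative): "default", "local", "prefer:1",
 * "bind:0,2" and "interleave=relative:0-3".  Per the comment above, a
 * 32-byte buffer comfortably holds these:
 *
 *	char buf[32];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 */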