xref: /openbmc/linux/mm/rmap.c (revision 60630924bb5af8751adcecc896e7763c3783ca89)
1  /*
2   * mm/rmap.c - physical to virtual reverse mappings
3   *
4   * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5   * Released under the General Public License (GPL).
6   *
7   * Simple, low overhead reverse mapping scheme.
8   * Please try to keep this thing as modular as possible.
9   *
10   * Provides methods for unmapping each kind of mapped page:
11   * the anon methods track anonymous pages, and
12   * the file methods track pages belonging to an inode.
13   *
14   * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15   * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16   * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17   * Contributions by Hugh Dickins 2003, 2004
18   */
19  
20  /*
21   * Lock ordering in mm:
22   *
23   * inode->i_rwsem	(while writing or truncating, not reading or faulting)
24   *   mm->mmap_lock
25   *     mapping->invalidate_lock (in filemap_fault)
26   *       page->flags PG_locked (lock_page)   * (see hugetlbfs below)
27   *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
28   *           mapping->i_mmap_rwsem
29   *             hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
30   *             anon_vma->rwsem
31   *               mm->page_table_lock or pte_lock
32   *                 swap_lock (in swap_duplicate, swap_info_get)
33   *                   mmlist_lock (in mmput, drain_mmlist and others)
34   *                   mapping->private_lock (in __set_page_dirty_buffers)
35   *                     lock_page_memcg move_lock (in __set_page_dirty_buffers)
36   *                       i_pages lock (widely used)
37   *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
38   *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
39   *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
40   *                     sb_lock (within inode_lock in fs/fs-writeback.c)
41   *                     i_pages lock (widely used, in set_page_dirty,
42   *                               in arch-dependent flush_dcache_mmap_lock,
43   *                               within bdi.wb->list_lock in __sync_single_inode)
44   *
45   * anon_vma->rwsem, mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
46   *   ->tasklist_lock
47   *     pte map lock
48   *
49   * * hugetlbfs PageHuge() pages take locks in this order:
50   *         mapping->i_mmap_rwsem
51   *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
52   *             page->flags PG_locked (lock_page)
53   */
54  
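/*
 * Illustrative sketch only (not part of rmap.c): the ordering above means an
 * anon rmap walk nests the page table lock inside the anon_vma rwsem, e.g.
 *
 *	anon_vma_lock_read(anon_vma);				// anon_vma->rwsem
 *	pte = pte_offset_map_lock(mm, pmd, address, &ptl);	// pte_lock
 *	...
 *	pte_unmap_unlock(pte, ptl);
 *	anon_vma_unlock_read(anon_vma);
 *
 * and never the other way around.
 */
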
55  #include <linux/mm.h>
56  #include <linux/sched/mm.h>
57  #include <linux/sched/task.h>
58  #include <linux/pagemap.h>
59  #include <linux/swap.h>
60  #include <linux/swapops.h>
61  #include <linux/slab.h>
62  #include <linux/init.h>
63  #include <linux/ksm.h>
64  #include <linux/rmap.h>
65  #include <linux/rcupdate.h>
66  #include <linux/export.h>
67  #include <linux/memcontrol.h>
68  #include <linux/mmu_notifier.h>
69  #include <linux/migrate.h>
70  #include <linux/hugetlb.h>
71  #include <linux/huge_mm.h>
72  #include <linux/backing-dev.h>
73  #include <linux/page_idle.h>
74  #include <linux/memremap.h>
75  #include <linux/userfaultfd_k.h>
76  
77  #include <asm/tlbflush.h>
78  
79  #include <trace/events/tlb.h>
80  
81  #include "internal.h"
82  
83  static struct kmem_cache *anon_vma_cachep;
84  static struct kmem_cache *anon_vma_chain_cachep;
85  
86  static inline struct anon_vma *anon_vma_alloc(void)
87  {
88  	struct anon_vma *anon_vma;
89  
90  	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
91  	if (anon_vma) {
92  		atomic_set(&anon_vma->refcount, 1);
93  		anon_vma->degree = 1;	/* Reference for first vma */
94  		anon_vma->parent = anon_vma;
95  		/*
96  		 * Initialise the anon_vma root to point to itself. If called
97  		 * from fork, the root will be reset to the parent's anon_vma.
98  		 */
99  		anon_vma->root = anon_vma;
100  	}
101  
102  	return anon_vma;
103  }
104  
105  static inline void anon_vma_free(struct anon_vma *anon_vma)
106  {
107  	VM_BUG_ON(atomic_read(&anon_vma->refcount));
108  
109  	/*
110  	 * Synchronize against page_lock_anon_vma_read() such that
111  	 * we can safely hold the lock without the anon_vma getting
112  	 * freed.
113  	 *
114  	 * Relies on the full mb implied by the atomic_dec_and_test() from
115  	 * put_anon_vma() against the acquire barrier implied by
116  	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
117  	 *
118  	 * page_lock_anon_vma_read()	VS	put_anon_vma()
119  	 *   down_read_trylock()		  atomic_dec_and_test()
120  	 *   LOCK				  MB
121  	 *   atomic_read()			  rwsem_is_locked()
122  	 *
123  	 * LOCK should suffice since the actual taking of the lock must
124  	 * happen _before_ what follows.
125  	 */
126  	might_sleep();
127  	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
128  		anon_vma_lock_write(anon_vma);
129  		anon_vma_unlock_write(anon_vma);
130  	}
131  
132  	kmem_cache_free(anon_vma_cachep, anon_vma);
133  }
134  
135  static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
136  {
137  	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
138  }
139  
140  static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
141  {
142  	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
143  }
144  
145  static void anon_vma_chain_link(struct vm_area_struct *vma,
146  				struct anon_vma_chain *avc,
147  				struct anon_vma *anon_vma)
148  {
149  	avc->vma = vma;
150  	avc->anon_vma = anon_vma;
151  	list_add(&avc->same_vma, &vma->anon_vma_chain);
152  	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
153  }
154  
155  /**
156   * __anon_vma_prepare - attach an anon_vma to a memory region
157   * @vma: the memory region in question
158   *
159   * This makes sure the memory mapping described by 'vma' has
160   * an 'anon_vma' attached to it, so that we can associate the
161   * anonymous pages mapped into it with that anon_vma.
162   *
163   * The common case will be that we already have one, which
164   * is handled inline by anon_vma_prepare(). But if
165   * not we either need to find an adjacent mapping that we
166   * can re-use the anon_vma from (very common when the only
167   * reason for splitting a vma has been mprotect()), or we
168   * allocate a new one.
169   *
170   * Anon-vma allocations are very subtle, because we may have
171   * optimistically looked up an anon_vma in page_lock_anon_vma_read()
172   * and that may actually touch the rwsem even in the newly
173   * allocated vma (it depends on RCU to make sure that the
174   * anon_vma isn't actually destroyed).
175   *
176   * As a result, we need to do proper anon_vma locking even
177   * for the new allocation. At the same time, we do not want
178   * to do any locking for the common case of already having
179   * an anon_vma.
180   *
181   * This must be called with the mmap_lock held for reading.
182   */
183  int __anon_vma_prepare(struct vm_area_struct *vma)
184  {
185  	struct mm_struct *mm = vma->vm_mm;
186  	struct anon_vma *anon_vma, *allocated;
187  	struct anon_vma_chain *avc;
188  
189  	might_sleep();
190  
191  	avc = anon_vma_chain_alloc(GFP_KERNEL);
192  	if (!avc)
193  		goto out_enomem;
194  
195  	anon_vma = find_mergeable_anon_vma(vma);
196  	allocated = NULL;
197  	if (!anon_vma) {
198  		anon_vma = anon_vma_alloc();
199  		if (unlikely(!anon_vma))
200  			goto out_enomem_free_avc;
201  		allocated = anon_vma;
202  	}
203  
204  	anon_vma_lock_write(anon_vma);
205  	/* page_table_lock to protect against threads */
206  	spin_lock(&mm->page_table_lock);
207  	if (likely(!vma->anon_vma)) {
208  		vma->anon_vma = anon_vma;
209  		anon_vma_chain_link(vma, avc, anon_vma);
210  		/* vma reference or self-parent link for new root */
211  		anon_vma->degree++;
212  		allocated = NULL;
213  		avc = NULL;
214  	}
215  	spin_unlock(&mm->page_table_lock);
216  	anon_vma_unlock_write(anon_vma);
217  
218  	if (unlikely(allocated))
219  		put_anon_vma(allocated);
220  	if (unlikely(avc))
221  		anon_vma_chain_free(avc);
222  
223  	return 0;
224  
225   out_enomem_free_avc:
226  	anon_vma_chain_free(avc);
227   out_enomem:
228  	return -ENOMEM;
229  }
230  
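/*
 * Minimal usage sketch (not part of rmap.c; my_anon_fault() is hypothetical):
 * fault handlers normally go through the inline fast path anon_vma_prepare()
 * in include/linux/rmap.h, which only drops into __anon_vma_prepare() when
 * vma->anon_vma is still NULL, and treat -ENOMEM as an OOM fault:
 *
 *	static vm_fault_t my_anon_fault(struct vm_fault *vmf)
 *	{
 *		if (unlikely(anon_vma_prepare(vmf->vma)))
 *			return VM_FAULT_OOM;
 *		... allocate the page and map it with the pte lock held ...
 *		return 0;
 *	}
 */
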
231  /*
232   * This is a useful helper function for locking the anon_vma root as
233   * we traverse the vma->anon_vma_chain, looping over anon_vma's that
234   * have the same vma.
235   *
236   * Such anon_vma's should have the same root, so you'd expect to see
237   * just a single lock of the root's rwsem for the whole traversal.
238   */
239  static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
240  {
241  	struct anon_vma *new_root = anon_vma->root;
242  	if (new_root != root) {
243  		if (WARN_ON_ONCE(root))
244  			up_write(&root->rwsem);
245  		root = new_root;
246  		down_write(&root->rwsem);
247  	}
248  	return root;
249  }
250  
251  static inline void unlock_anon_vma_root(struct anon_vma *root)
252  {
253  	if (root)
254  		up_write(&root->rwsem);
255  }
256  
257  /*
258   * Attach the anon_vmas from src to dst.
259   * Returns 0 on success, -ENOMEM on failure.
260   *
261   * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
262   * anon_vma_fork(). The first three want an exact copy of src, while the last
263   * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
264   * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
265   * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
266   *
267   * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
268   * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
269   * This prevents degradation of the anon_vma hierarchy into an endless linear
270   * chain in the case of a constantly forking task. On the other hand, an anon_vma
271   * with more than one child isn't reused even if there is no live vma, so the
272   * rmap walker has a good chance of avoiding a scan of the whole hierarchy when
273   * it searches for where a page is mapped.
274   */
275  int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
276  {
277  	struct anon_vma_chain *avc, *pavc;
278  	struct anon_vma *root = NULL;
279  
280  	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
281  		struct anon_vma *anon_vma;
282  
283  		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
284  		if (unlikely(!avc)) {
285  			unlock_anon_vma_root(root);
286  			root = NULL;
287  			avc = anon_vma_chain_alloc(GFP_KERNEL);
288  			if (!avc)
289  				goto enomem_failure;
290  		}
291  		anon_vma = pavc->anon_vma;
292  		root = lock_anon_vma_root(root, anon_vma);
293  		anon_vma_chain_link(dst, avc, anon_vma);
294  
295  		/*
296  		 * Reuse an existing anon_vma if its degree is lower than two,
297  		 * which means it has no vma and only one anon_vma child.
298  		 *
299  		 * Do not choose the parent anon_vma, otherwise the first child
300  		 * will always reuse it. The root anon_vma is never reused:
301  		 * it has a self-parent reference and at least one child.
302  		 */
303  		if (!dst->anon_vma && src->anon_vma &&
304  		    anon_vma != src->anon_vma && anon_vma->degree < 2)
305  			dst->anon_vma = anon_vma;
306  	}
307  	if (dst->anon_vma)
308  		dst->anon_vma->degree++;
309  	unlock_anon_vma_root(root);
310  	return 0;
311  
312   enomem_failure:
313  	/*
314  	 * dst->anon_vma is dropped here; otherwise its degree can be incorrectly
315  	 * decremented in unlink_anon_vmas().
316  	 * We can safely do this because callers of anon_vma_clone() don't care
317  	 * about dst->anon_vma if anon_vma_clone() failed.
318  	 */
319  	dst->anon_vma = NULL;
320  	unlink_anon_vmas(dst);
321  	return -ENOMEM;
322  }
323  
324  /*
325   * Attach vma to its own anon_vma, as well as to the anon_vmas that
326   * the corresponding VMA in the parent process is attached to.
327   * Returns 0 on success, non-zero on failure.
328   */
329  int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
330  {
331  	struct anon_vma_chain *avc;
332  	struct anon_vma *anon_vma;
333  	int error;
334  
335  	/* Don't bother if the parent process has no anon_vma here. */
336  	if (!pvma->anon_vma)
337  		return 0;
338  
339  	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
340  	vma->anon_vma = NULL;
341  
342  	/*
343  	 * First, attach the new VMA to the parent VMA's anon_vmas,
344  	 * so rmap can find non-COWed pages in child processes.
345  	 */
346  	error = anon_vma_clone(vma, pvma);
347  	if (error)
348  		return error;
349  
350  	/* An existing anon_vma has been reused, all done then. */
351  	if (vma->anon_vma)
352  		return 0;
353  
354  	/* Then add our own anon_vma. */
355  	anon_vma = anon_vma_alloc();
356  	if (!anon_vma)
357  		goto out_error;
358  	avc = anon_vma_chain_alloc(GFP_KERNEL);
359  	if (!avc)
360  		goto out_error_free_anon_vma;
361  
362  	/*
363  	 * The root anon_vma's rwsem is the lock actually used when we
364  	 * lock any of the anon_vmas in this anon_vma tree.
365  	 */
366  	anon_vma->root = pvma->anon_vma->root;
367  	anon_vma->parent = pvma->anon_vma;
368  	/*
369  	 * With refcounts, an anon_vma can stay around longer than the
370  	 * process it belongs to. The root anon_vma needs to be pinned until
371  	 * this anon_vma is freed, because the lock lives in the root.
372  	 */
373  	get_anon_vma(anon_vma->root);
374  	/* Mark this anon_vma as the one where our new (COWed) pages go. */
375  	vma->anon_vma = anon_vma;
376  	anon_vma_lock_write(anon_vma);
377  	anon_vma_chain_link(vma, avc, anon_vma);
378  	anon_vma->parent->degree++;
379  	anon_vma_unlock_write(anon_vma);
380  
381  	return 0;
382  
383   out_error_free_anon_vma:
384  	put_anon_vma(anon_vma);
385   out_error:
386  	unlink_anon_vmas(vma);
387  	return -ENOMEM;
388  }
389  
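/*
 * Rough sketch of the fork-side caller (not part of rmap.c): dup_mmap() in
 * kernel/fork.c duplicates each parent vma and then attaches it here,
 * roughly
 *
 *	tmp = vm_area_dup(mpnt);
 *	...
 *	if (anon_vma_fork(tmp, mpnt))
 *		goto fail_nomem_anon_vma_fork;
 *
 * so a failure unwinds the partially built child mm.
 */
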
390  void unlink_anon_vmas(struct vm_area_struct *vma)
391  {
392  	struct anon_vma_chain *avc, *next;
393  	struct anon_vma *root = NULL;
394  
395  	/*
396  	 * Unlink each anon_vma chained to the VMA.  This list is ordered
397  	 * from newest to oldest, ensuring the root anon_vma gets freed last.
398  	 */
399  	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
400  		struct anon_vma *anon_vma = avc->anon_vma;
401  
402  		root = lock_anon_vma_root(root, anon_vma);
403  		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
404  
405  		/*
406  		 * Leave empty anon_vmas on the list - we'll need
407  		 * to free them outside the lock.
408  		 */
409  		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
410  			anon_vma->parent->degree--;
411  			continue;
412  		}
413  
414  		list_del(&avc->same_vma);
415  		anon_vma_chain_free(avc);
416  	}
417  	if (vma->anon_vma) {
418  		vma->anon_vma->degree--;
419  
420  		/*
421  		 * The vma is still needed after unlink, and its anon_vma will be
422  		 * prepared again when a fault is handled.
423  		 */
424  		vma->anon_vma = NULL;
425  	}
426  	unlock_anon_vma_root(root);
427  
428  	/*
429  	 * Iterate the list once more; it now contains only empty and unlinked
430  	 * anon_vmas, so destroy them. This could not be done earlier because
431  	 * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
432  	 */
433  	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
434  		struct anon_vma *anon_vma = avc->anon_vma;
435  
436  		VM_WARN_ON(anon_vma->degree);
437  		put_anon_vma(anon_vma);
438  
439  		list_del(&avc->same_vma);
440  		anon_vma_chain_free(avc);
441  	}
442  }
443  
444  static void anon_vma_ctor(void *data)
445  {
446  	struct anon_vma *anon_vma = data;
447  
448  	init_rwsem(&anon_vma->rwsem);
449  	atomic_set(&anon_vma->refcount, 0);
450  	anon_vma->rb_root = RB_ROOT_CACHED;
451  }
452  
453  void __init anon_vma_init(void)
454  {
455  	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
456  			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
457  			anon_vma_ctor);
458  	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
459  			SLAB_PANIC|SLAB_ACCOUNT);
460  }
461  
462  /*
463   * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
464   *
465   * Since there is no serialization whatsoever against page_remove_rmap(),
466   * the best this function can do is return an anon_vma with an elevated
467   * refcount that might have been relevant to this page.
468   *
469   * The page might have been remapped to a different anon_vma or the anon_vma
470   * returned may already be freed (and even reused).
471   *
472   * In case it was remapped to a different anon_vma, the new anon_vma will be a
473   * child of the old anon_vma, and the anon_vma lifetime rules will therefore
474   * ensure that any anon_vma obtained from the page will still be valid for as
475   * long as we observe page_mapped() [ hence all those page_mapped() tests ].
476   *
477   * All users of this function must be very careful when walking the anon_vma
478   * chain and verify that the page in question is indeed mapped in it
479   * [ something equivalent to page_mapped_in_vma() ].
480   *
481   * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
482   * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
483   * if there is a mapcount, we can dereference the anon_vma after observing
484   * those.
485   */
486  struct anon_vma *page_get_anon_vma(struct page *page)
487  {
488  	struct anon_vma *anon_vma = NULL;
489  	unsigned long anon_mapping;
490  
491  	rcu_read_lock();
492  	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
493  	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
494  		goto out;
495  	if (!page_mapped(page))
496  		goto out;
497  
498  	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
499  	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
500  		anon_vma = NULL;
501  		goto out;
502  	}
503  
504  	/*
505  	 * If this page is still mapped, then its anon_vma cannot have been
506  	 * freed.  But if it has been unmapped, we have no security against the
507  	 * anon_vma structure being freed and reused (for another anon_vma:
508  	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
509  	 * above cannot corrupt).
510  	 */
511  	if (!page_mapped(page)) {
512  		rcu_read_unlock();
513  		put_anon_vma(anon_vma);
514  		return NULL;
515  	}
516  out:
517  	rcu_read_unlock();
518  
519  	return anon_vma;
520  }
521  
522  /*
523   * Similar to page_get_anon_vma() except it locks the anon_vma.
524   *
525   * It's a little more complex, as it tries to keep the fast path to a single
526   * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
527   * reference like with page_get_anon_vma() and then block on the mutex.
528   */
529  struct anon_vma *page_lock_anon_vma_read(struct page *page)
530  {
531  	struct anon_vma *anon_vma = NULL;
532  	struct anon_vma *root_anon_vma;
533  	unsigned long anon_mapping;
534  
535  	rcu_read_lock();
536  	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
537  	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
538  		goto out;
539  	if (!page_mapped(page))
540  		goto out;
541  
542  	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
543  	root_anon_vma = READ_ONCE(anon_vma->root);
544  	if (down_read_trylock(&root_anon_vma->rwsem)) {
545  		/*
546  		 * If the page is still mapped, then this anon_vma is still
547  		 * its anon_vma, and holding the mutex ensures that it will
548  		 * not go away, see anon_vma_free().
549  		 */
550  		if (!page_mapped(page)) {
551  			up_read(&root_anon_vma->rwsem);
552  			anon_vma = NULL;
553  		}
554  		goto out;
555  	}
556  
557  	/* trylock failed, we have to sleep */
558  	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
559  		anon_vma = NULL;
560  		goto out;
561  	}
562  
563  	if (!page_mapped(page)) {
564  		rcu_read_unlock();
565  		put_anon_vma(anon_vma);
566  		return NULL;
567  	}
568  
569  	/* we pinned the anon_vma, it's safe to sleep */
570  	rcu_read_unlock();
571  	anon_vma_lock_read(anon_vma);
572  
573  	if (atomic_dec_and_test(&anon_vma->refcount)) {
574  		/*
575  		 * Oops, we held the last refcount, release the lock
576  		 * and bail -- can't simply use put_anon_vma() because
577  		 * we'll deadlock on the anon_vma_lock_write() recursion.
578  		 */
579  		anon_vma_unlock_read(anon_vma);
580  		__put_anon_vma(anon_vma);
581  		anon_vma = NULL;
582  	}
583  
584  	return anon_vma;
585  
586  out:
587  	rcu_read_unlock();
588  	return anon_vma;
589  }
590  
591  void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
592  {
593  	anon_vma_unlock_read(anon_vma);
594  }
595  
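/*
 * Minimal sketch of the expected calling pattern (not part of rmap.c);
 * this is essentially what the anon side of rmap_walk() does:
 *
 *	anon_vma = page_lock_anon_vma_read(page);
 *	if (!anon_vma)
 *		return;		// no longer mapped anonymously
 *	... walk anon_vma->rb_root under the read lock ...
 *	page_unlock_anon_vma_read(anon_vma);
 */
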
596  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
597  /*
598   * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
599   * was dirty when it was unmapped, it is important that it is flushed before
600   * any IO is initiated on the page, to prevent lost writes. Similarly, it
601   * must be flushed before freeing to prevent data leakage.
602   */
603  void try_to_unmap_flush(void)
604  {
605  	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
606  
607  	if (!tlb_ubc->flush_required)
608  		return;
609  
610  	arch_tlbbatch_flush(&tlb_ubc->arch);
611  	tlb_ubc->flush_required = false;
612  	tlb_ubc->writable = false;
613  }
614  
615  /* Flush iff there are potentially writable TLB entries that can race with IO */
616  void try_to_unmap_flush_dirty(void)
617  {
618  	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
619  
620  	if (tlb_ubc->writable)
621  		try_to_unmap_flush();
622  }
623  
624  static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
625  {
626  	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
627  
628  	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
629  	tlb_ubc->flush_required = true;
630  
631  	/*
632  	 * Ensure compiler does not re-order the setting of tlb_flush_batched
633  	 * before the PTE is cleared.
634  	 */
635  	barrier();
636  	mm->tlb_flush_batched = true;
637  
638  	/*
639  	 * If the PTE was dirty then it's best to assume it's writable. The
640  	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
641  	 * before the page is queued for IO.
642  	 */
643  	if (writable)
644  		tlb_ubc->writable = true;
645  }
646  
647  /*
648   * Returns true if the TLB flush should be deferred to the end of a batch of
649   * unmap operations to reduce IPIs.
650   */
651  static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
652  {
653  	bool should_defer = false;
654  
655  	if (!(flags & TTU_BATCH_FLUSH))
656  		return false;
657  
658  	/* If remote CPUs need to be flushed then defer the flush by batching it */
659  	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
660  		should_defer = true;
661  	put_cpu();
662  
663  	return should_defer;
664  }
665  
666  /*
667   * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
668   * releasing the PTL if TLB flushes are batched. It's possible for a parallel
669   * operation such as mprotect or munmap to race between reclaim unmapping
670   * the page and flushing the page. If this race occurs, it potentially allows
671   * access to data via a stale TLB entry. Tracking all mm's that have TLB
672   * batching in flight would be expensive during reclaim, so instead track
673   * whether TLB batching occurred in the past and if so do a flush here
674   * if required. This costs one additional flush per reclaim cycle, paid
675   * by the first operation at risk such as mprotect or munmap.
676   *
677   * This must be called under the PTL so that an access to tlb_flush_batched
678   * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
679   * via the PTL.
680   */
681  void flush_tlb_batched_pending(struct mm_struct *mm)
682  {
683  	if (data_race(mm->tlb_flush_batched)) {
684  		flush_tlb_mm(mm);
685  
686  		/*
687  		 * Do not allow the compiler to re-order the clearing of
688  		 * tlb_flush_batched before the tlb is flushed.
689  		 */
690  		barrier();
691  		mm->tlb_flush_batched = false;
692  	}
693  }
694  #else
695  static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
696  {
697  }
698  
699  static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
700  {
701  	return false;
702  }
703  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
704  
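/*
 * Sketch of how the batching above is intended to be used (not part of
 * rmap.c): reclaim passes TTU_BATCH_FLUSH while unmapping a batch of pages
 * and issues a single flush before any of those pages is written back or
 * freed, roughly
 *
 *	list_for_each_entry(page, &page_list, lru)
 *		try_to_unmap(page, TTU_BATCH_FLUSH);
 *	try_to_unmap_flush();	// or try_to_unmap_flush_dirty() before IO
 */
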
705  /*
706   * At what user virtual address is the page expected in vma?
707   * Caller should check the page is actually part of the vma.
708   */
709  unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
710  {
711  	if (PageAnon(page)) {
712  		struct anon_vma *page__anon_vma = page_anon_vma(page);
713  		/*
714  		 * Note: swapoff's unuse_vma() is more efficient with this
715  		 * check, and needs it to match anon_vma when KSM is active.
716  		 */
717  		if (!vma->anon_vma || !page__anon_vma ||
718  		    vma->anon_vma->root != page__anon_vma->root)
719  			return -EFAULT;
720  	} else if (!vma->vm_file) {
721  		return -EFAULT;
722  	} else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
723  		return -EFAULT;
724  	}
725  
726  	return vma_address(page, vma);
727  }
728  
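/*
 * For reference (a paraphrase, not part of rmap.c): for a page that really is
 * mapped by a linear vma, vma_address() computes the inverse of
 * linear_page_index(), roughly
 *
 *	pgoff = page_to_pgoff(page);
 *	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 *
 * page_address_in_vma() merely adds the ownership checks above before
 * returning that address.
 */
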
729  pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
730  {
731  	pgd_t *pgd;
732  	p4d_t *p4d;
733  	pud_t *pud;
734  	pmd_t *pmd = NULL;
735  	pmd_t pmde;
736  
737  	pgd = pgd_offset(mm, address);
738  	if (!pgd_present(*pgd))
739  		goto out;
740  
741  	p4d = p4d_offset(pgd, address);
742  	if (!p4d_present(*p4d))
743  		goto out;
744  
745  	pud = pud_offset(p4d, address);
746  	if (!pud_present(*pud))
747  		goto out;
748  
749  	pmd = pmd_offset(pud, address);
750  	/*
751  	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
752  	 * without holding the anon_vma lock for write.  So when looking for a
753  	 * genuine pmde (in which to find the pte), test present and !THP together.
754  	 */
755  	pmde = *pmd;
756  	barrier();
757  	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
758  		pmd = NULL;
759  out:
760  	return pmd;
761  }
762  
763  struct page_referenced_arg {
764  	int mapcount;
765  	int referenced;
766  	unsigned long vm_flags;
767  	struct mem_cgroup *memcg;
768  };
769  /*
770   * arg: page_referenced_arg will be passed
771   */
772  static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
773  			unsigned long address, void *arg)
774  {
775  	struct page_referenced_arg *pra = arg;
776  	struct page_vma_mapped_walk pvmw = {
777  		.page = page,
778  		.vma = vma,
779  		.address = address,
780  	};
781  	int referenced = 0;
782  
783  	while (page_vma_mapped_walk(&pvmw)) {
784  		address = pvmw.address;
785  
786  		if (vma->vm_flags & VM_LOCKED) {
787  			page_vma_mapped_walk_done(&pvmw);
788  			pra->vm_flags |= VM_LOCKED;
789  			return false; /* To break the loop */
790  		}
791  
792  		if (pvmw.pte) {
793  			if (ptep_clear_flush_young_notify(vma, address,
794  						pvmw.pte)) {
795  				/*
796  				 * Don't treat a reference through
797  				 * a sequentially read mapping as such.
798  				 * If the page has been used in another mapping,
799  				 * we will catch it; if this other mapping is
800  				 * already gone, the unmap path will have set
801  				 * PG_referenced or activated the page.
802  				 */
803  				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
804  					referenced++;
805  			}
806  		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
807  			if (pmdp_clear_flush_young_notify(vma, address,
808  						pvmw.pmd))
809  				referenced++;
810  		} else {
811  			/* unexpected pmd-mapped page? */
812  			WARN_ON_ONCE(1);
813  		}
814  
815  		pra->mapcount--;
816  	}
817  
818  	if (referenced)
819  		clear_page_idle(page);
820  	if (test_and_clear_page_young(page))
821  		referenced++;
822  
823  	if (referenced) {
824  		pra->referenced++;
825  		pra->vm_flags |= vma->vm_flags;
826  	}
827  
828  	if (!pra->mapcount)
829  		return false; /* To break the loop */
830  
831  	return true;
832  }
833  
834  static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
835  {
836  	struct page_referenced_arg *pra = arg;
837  	struct mem_cgroup *memcg = pra->memcg;
838  
839  	if (!mm_match_cgroup(vma->vm_mm, memcg))
840  		return true;
841  
842  	return false;
843  }
844  
845  /**
846   * page_referenced - test if the page was referenced
847   * @page: the page to test
848   * @is_locked: caller holds lock on the page
849   * @memcg: target memory cgroup
850   * @vm_flags: collect encountered vma->vm_flags which actually referenced the page
851   *
852   * Quick test_and_clear_referenced for all mappings to a page;
853   * returns the number of ptes which referenced the page.
854   */
855  int page_referenced(struct page *page,
856  		    int is_locked,
857  		    struct mem_cgroup *memcg,
858  		    unsigned long *vm_flags)
859  {
860  	int we_locked = 0;
861  	struct page_referenced_arg pra = {
862  		.mapcount = total_mapcount(page),
863  		.memcg = memcg,
864  	};
865  	struct rmap_walk_control rwc = {
866  		.rmap_one = page_referenced_one,
867  		.arg = (void *)&pra,
868  		.anon_lock = page_lock_anon_vma_read,
869  	};
870  
871  	*vm_flags = 0;
872  	if (!pra.mapcount)
873  		return 0;
874  
875  	if (!page_rmapping(page))
876  		return 0;
877  
878  	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
879  		we_locked = trylock_page(page);
880  		if (!we_locked)
881  			return 1;
882  	}
883  
884  	/*
885  	 * If we are reclaiming on behalf of a cgroup, skip
886  	 * counting on behalf of references from different
887  	 * cgroups
888  	 */
889  	if (memcg) {
890  		rwc.invalid_vma = invalid_page_referenced_vma;
891  	}
892  
893  	rmap_walk(page, &rwc);
894  	*vm_flags = pra.vm_flags;
895  
896  	if (we_locked)
897  		unlock_page(page);
898  
899  	return pra.referenced;
900  }
901  
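/*
 * Usage sketch (not part of rmap.c): reclaim combines the returned count with
 * the collected vm_flags to decide whether a page stays active, roughly
 *
 *	unsigned long vm_flags;
 *	int referenced;
 *
 *	referenced = page_referenced(page, 1, memcg, &vm_flags);
 *	// 'referenced' = number of ptes/pmds seen with the young bit set;
 *	// 'vm_flags' accumulates flags of the vmas that referenced the page,
 *	// e.g. VM_LOCKED or VM_EXEC, which bias the decision.
 */
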
902  static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
903  			    unsigned long address, void *arg)
904  {
905  	struct page_vma_mapped_walk pvmw = {
906  		.page = page,
907  		.vma = vma,
908  		.address = address,
909  		.flags = PVMW_SYNC,
910  	};
911  	struct mmu_notifier_range range;
912  	int *cleaned = arg;
913  
914  	/*
915  	 * We have to assume the worst case, i.e. pmd, for invalidation. Note that
916  	 * the page cannot be freed from within this function.
917  	 */
918  	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
919  				0, vma, vma->vm_mm, address,
920  				vma_address_end(page, vma));
921  	mmu_notifier_invalidate_range_start(&range);
922  
923  	while (page_vma_mapped_walk(&pvmw)) {
924  		int ret = 0;
925  
926  		address = pvmw.address;
927  		if (pvmw.pte) {
928  			pte_t entry;
929  			pte_t *pte = pvmw.pte;
930  
931  			if (!pte_dirty(*pte) && !pte_write(*pte))
932  				continue;
933  
934  			flush_cache_page(vma, address, pte_pfn(*pte));
935  			entry = ptep_clear_flush(vma, address, pte);
936  			entry = pte_wrprotect(entry);
937  			entry = pte_mkclean(entry);
938  			set_pte_at(vma->vm_mm, address, pte, entry);
939  			ret = 1;
940  		} else {
941  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
942  			pmd_t *pmd = pvmw.pmd;
943  			pmd_t entry;
944  
945  			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
946  				continue;
947  
948  			flush_cache_page(vma, address, page_to_pfn(page));
949  			entry = pmdp_invalidate(vma, address, pmd);
950  			entry = pmd_wrprotect(entry);
951  			entry = pmd_mkclean(entry);
952  			set_pmd_at(vma->vm_mm, address, pmd, entry);
953  			ret = 1;
954  #else
955  			/* unexpected pmd-mapped page? */
956  			WARN_ON_ONCE(1);
957  #endif
958  		}
959  
960  		/*
961  		 * No need to call mmu_notifier_invalidate_range() as we are
962  		 * downgrading page table protection, not changing it to point
963  		 * to a new page.
964  		 *
965  		 * See Documentation/vm/mmu_notifier.rst
966  		 */
967  		if (ret)
968  			(*cleaned)++;
969  	}
970  
971  	mmu_notifier_invalidate_range_end(&range);
972  
973  	return true;
974  }
975  
976  static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
977  {
978  	if (vma->vm_flags & VM_SHARED)
979  		return false;
980  
981  	return true;
982  }
983  
984  int folio_mkclean(struct folio *folio)
985  {
986  	int cleaned = 0;
987  	struct address_space *mapping;
988  	struct rmap_walk_control rwc = {
989  		.arg = (void *)&cleaned,
990  		.rmap_one = page_mkclean_one,
991  		.invalid_vma = invalid_mkclean_vma,
992  	};
993  
994  	BUG_ON(!folio_test_locked(folio));
995  
996  	if (!folio_mapped(folio))
997  		return 0;
998  
999  	mapping = folio_mapping(folio);
1000  	if (!mapping)
1001  		return 0;
1002  
1003  	rmap_walk(&folio->page, &rwc);
1004  
1005  	return cleaned;
1006  }
1007  EXPORT_SYMBOL_GPL(folio_mkclean);
1008  
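/*
 * Usage sketch (not part of rmap.c): callers hold the folio lock and use the
 * returned count of write-protected ptes to decide whether the folio must be
 * redirtied, as in folio_clear_dirty_for_io():
 *
 *	if (folio_mkclean(folio))
 *		folio_mark_dirty(folio);
 */
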
1009  /**
1010   * page_move_anon_rmap - move a page to our anon_vma
1011   * @page:	the page to move to our anon_vma
1012   * @vma:	the vma the page belongs to
1013   *
1014   * When a page belongs exclusively to one process after a COW event,
1015   * that page can be moved into the anon_vma that belongs to just that
1016   * process, so the rmap code will not search the parent or sibling
1017   * processes.
1018   */
1019  void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1020  {
1021  	struct anon_vma *anon_vma = vma->anon_vma;
1022  
1023  	page = compound_head(page);
1024  
1025  	VM_BUG_ON_PAGE(!PageLocked(page), page);
1026  	VM_BUG_ON_VMA(!anon_vma, vma);
1027  
1028  	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1029  	/*
1030  	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
1031  	 * simultaneously, so a concurrent reader (e.g. page_referenced()'s
1032  	 * PageAnon()) will not see one without the other.
1033  	 */
1034  	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1035  }
1036  
1037  /**
1038   * __page_set_anon_rmap - set up new anonymous rmap
1039   * @page:	Page or Hugepage to add to rmap
1040   * @vma:	VM area to add page to.
1041   * @address:	User virtual address of the mapping
1042   * @exclusive:	the page is exclusively owned by the current process
1043   */
1044  static void __page_set_anon_rmap(struct page *page,
1045  	struct vm_area_struct *vma, unsigned long address, int exclusive)
1046  {
1047  	struct anon_vma *anon_vma = vma->anon_vma;
1048  
1049  	BUG_ON(!anon_vma);
1050  
1051  	if (PageAnon(page))
1052  		return;
1053  
1054  	/*
1055  	 * If the page isn't exclusively mapped into this vma,
1056  	 * we must use the _oldest_ possible anon_vma for the
1057  	 * page mapping!
1058  	 */
1059  	if (!exclusive)
1060  		anon_vma = anon_vma->root;
1061  
1062  	/*
1063  	 * page_idle does a lockless/optimistic rmap scan on page->mapping.
1064  	 * Make sure the compiler doesn't split the stores of anon_vma and
1065  	 * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
1066  	 * could mistake the mapping for a struct address_space and crash.
1067  	 */
1068  	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1069  	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1070  	page->index = linear_page_index(vma, address);
1071  }
1072  
1073  /**
1074   * __page_check_anon_rmap - sanity check anonymous rmap addition
1075   * @page:	the page to add the mapping to
1076   * @vma:	the vm area in which the mapping is added
1077   * @address:	the user virtual address mapped
1078   */
1079  static void __page_check_anon_rmap(struct page *page,
1080  	struct vm_area_struct *vma, unsigned long address)
1081  {
1082  	/*
1083  	 * The page's anon-rmap details (mapping and index) are guaranteed to
1084  	 * be set up correctly at this point.
1085  	 *
1086  	 * We have exclusion against page_add_anon_rmap because the caller
1087  	 * always holds the page locked.
1088  	 *
1089  	 * We have exclusion against page_add_new_anon_rmap because those pages
1090  	 * are initially only visible via the pagetables, and the pte is locked
1091  	 * over the call to page_add_new_anon_rmap.
1092  	 */
1093  	VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
1094  	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
1095  		       page);
1096  }
1097  
1098  /**
1099   * page_add_anon_rmap - add pte mapping to an anonymous page
1100   * @page:	the page to add the mapping to
1101   * @vma:	the vm area in which the mapping is added
1102   * @address:	the user virtual address mapped
1103   * @compound:	charge the page as compound or small page
1104   *
1105   * The caller needs to hold the pte lock, and the page must be locked in
1106   * the anon_vma case: to serialize mapping,index checking after setting,
1107   * and to ensure that PageAnon is not being upgraded racily to PageKsm
1108   * (but PageKsm is never downgraded to PageAnon).
1109   */
1110  void page_add_anon_rmap(struct page *page,
1111  	struct vm_area_struct *vma, unsigned long address, bool compound)
1112  {
1113  	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1114  }
1115  
1116  /*
1117   * Special version of the above for do_swap_page, which often runs
1118   * into pages that are exclusively owned by the current process.
1119   * Everybody else should continue to use page_add_anon_rmap above.
1120   */
1121  void do_page_add_anon_rmap(struct page *page,
1122  	struct vm_area_struct *vma, unsigned long address, int flags)
1123  {
1124  	bool compound = flags & RMAP_COMPOUND;
1125  	bool first;
1126  
1127  	if (unlikely(PageKsm(page)))
1128  		lock_page_memcg(page);
1129  	else
1130  		VM_BUG_ON_PAGE(!PageLocked(page), page);
1131  
1132  	if (compound) {
1133  		atomic_t *mapcount;
1134  		VM_BUG_ON_PAGE(!PageLocked(page), page);
1135  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1136  		mapcount = compound_mapcount_ptr(page);
1137  		first = atomic_inc_and_test(mapcount);
1138  	} else {
1139  		first = atomic_inc_and_test(&page->_mapcount);
1140  	}
1141  
1142  	if (first) {
1143  		int nr = compound ? thp_nr_pages(page) : 1;
1144  		/*
1145  		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1146  		 * these counters are not modified in interrupt context, and
1147  		 * the pte lock (a spinlock) is held, which implies preemption
1148  		 * is disabled.
1149  		 */
1150  		if (compound)
1151  			__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1152  		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1153  	}
1154  
1155  	if (unlikely(PageKsm(page))) {
1156  		unlock_page_memcg(page);
1157  		return;
1158  	}
1159  
1160  	/* address might be in next vma when migration races vma_adjust */
1161  	if (first)
1162  		__page_set_anon_rmap(page, vma, address,
1163  				flags & RMAP_EXCLUSIVE);
1164  	else
1165  		__page_check_anon_rmap(page, vma, address);
1166  }
1167  
1168  /**
1169   * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1170   * @page:	the page to add the mapping to
1171   * @vma:	the vm area in which the mapping is added
1172   * @address:	the user virtual address mapped
1173   * @compound:	charge the page as compound or small page
1174   *
1175   * Same as page_add_anon_rmap but must only be called on *new* pages.
1176   * This means the inc-and-test can be bypassed.
1177   * Page does not have to be locked.
1178   */
1179  void page_add_new_anon_rmap(struct page *page,
1180  	struct vm_area_struct *vma, unsigned long address, bool compound)
1181  {
1182  	int nr = compound ? thp_nr_pages(page) : 1;
1183  
1184  	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1185  	__SetPageSwapBacked(page);
1186  	if (compound) {
1187  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1188  		/* increment count (starts at -1) */
1189  		atomic_set(compound_mapcount_ptr(page), 0);
1190  		if (hpage_pincount_available(page))
1191  			atomic_set(compound_pincount_ptr(page), 0);
1192  
1193  		__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1194  	} else {
1195  		/* Anon THP always mapped first with PMD */
1196  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
1197  		/* increment count (starts at -1) */
1198  		atomic_set(&page->_mapcount, 0);
1199  	}
1200  	__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1201  	__page_set_anon_rmap(page, vma, address, 1);
1202  }
1203  
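/*
 * Usage sketch (not part of rmap.c): with the pte lock held, an anonymous
 * fault handler typically does
 *
 *	page_add_new_anon_rmap(page, vma, address, false);
 *	lru_cache_add_inactive_or_unevictable(page, vma);
 *	set_pte_at(mm, address, pte, entry);
 *
 * which is the shape of do_anonymous_page() in mm/memory.c; the compound
 * variant (true) is used when mapping a freshly allocated THP with a pmd.
 */
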
1204  /**
1205   * page_add_file_rmap - add pte mapping to a file page
1206   * @page: the page to add the mapping to
1207   * @compound: charge the page as compound or small page
1208   *
1209   * The caller needs to hold the pte lock.
1210   */
1211  void page_add_file_rmap(struct page *page, bool compound)
1212  {
1213  	int i, nr = 1;
1214  
1215  	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1216  	lock_page_memcg(page);
1217  	if (compound && PageTransHuge(page)) {
1218  		int nr_pages = thp_nr_pages(page);
1219  
1220  		for (i = 0, nr = 0; i < nr_pages; i++) {
1221  			if (atomic_inc_and_test(&page[i]._mapcount))
1222  				nr++;
1223  		}
1224  		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1225  			goto out;
1226  		if (PageSwapBacked(page))
1227  			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1228  						nr_pages);
1229  		else
1230  			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1231  						nr_pages);
1232  	} else {
1233  		if (PageTransCompound(page) && page_mapping(page)) {
1234  			struct page *head = compound_head(page);
1235  
1236  			VM_WARN_ON_ONCE(!PageLocked(page));
1237  
1238  			SetPageDoubleMap(head);
1239  			if (PageMlocked(page))
1240  				clear_page_mlock(head);
1241  		}
1242  		if (!atomic_inc_and_test(&page->_mapcount))
1243  			goto out;
1244  	}
1245  	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
1246  out:
1247  	unlock_page_memcg(page);
1248  }
1249  
1250  static void page_remove_file_rmap(struct page *page, bool compound)
1251  {
1252  	int i, nr = 1;
1253  
1254  	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
1255  
1256  	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
1257  	if (unlikely(PageHuge(page))) {
1258  		/* hugetlb pages are always mapped with pmds */
1259  		atomic_dec(compound_mapcount_ptr(page));
1260  		return;
1261  	}
1262  
1263  	/* page still mapped by someone else? */
1264  	if (compound && PageTransHuge(page)) {
1265  		int nr_pages = thp_nr_pages(page);
1266  
1267  		for (i = 0, nr = 0; i < nr_pages; i++) {
1268  			if (atomic_add_negative(-1, &page[i]._mapcount))
1269  				nr++;
1270  		}
1271  		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1272  			return;
1273  		if (PageSwapBacked(page))
1274  			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1275  						-nr_pages);
1276  		else
1277  			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1278  						-nr_pages);
1279  	} else {
1280  		if (!atomic_add_negative(-1, &page->_mapcount))
1281  			return;
1282  	}
1283  
1284  	/*
1285  	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
1286  	 * these counters are not modified in interrupt context, and
1287  	 * the pte lock (a spinlock) is held, which implies preemption is disabled.
1288  	 */
1289  	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
1290  
1291  	if (unlikely(PageMlocked(page)))
1292  		clear_page_mlock(page);
1293  }
1294  
1295  static void page_remove_anon_compound_rmap(struct page *page)
1296  {
1297  	int i, nr;
1298  
1299  	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1300  		return;
1301  
1302  	/* Hugepages are not counted in NR_ANON_PAGES for now. */
1303  	if (unlikely(PageHuge(page)))
1304  		return;
1305  
1306  	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1307  		return;
1308  
1309  	__mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
1310  
1311  	if (TestClearPageDoubleMap(page)) {
1312  		/*
1313  		 * Subpages can be mapped with PTEs too. Check how many of
1314  		 * them are still mapped.
1315  		 */
1316  		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1317  			if (atomic_add_negative(-1, &page[i]._mapcount))
1318  				nr++;
1319  		}
1320  
1321  		/*
1322  		 * Queue the page for deferred split if at least one small
1323  		 * page of the compound page is unmapped, but at least one
1324  		 * small page is still mapped.
1325  		 */
1326  		if (nr && nr < thp_nr_pages(page))
1327  			deferred_split_huge_page(page);
1328  	} else {
1329  		nr = thp_nr_pages(page);
1330  	}
1331  
1332  	if (unlikely(PageMlocked(page)))
1333  		clear_page_mlock(page);
1334  
1335  	if (nr)
1336  		__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
1337  }
1338  
1339  /**
1340   * page_remove_rmap - take down pte mapping from a page
1341   * @page:	page to remove mapping from
1342   * @compound:	uncharge the page as compound or small page
1343   *
1344   * The caller needs to hold the pte lock.
1345   */
1346  void page_remove_rmap(struct page *page, bool compound)
1347  {
1348  	lock_page_memcg(page);
1349  
1350  	if (!PageAnon(page)) {
1351  		page_remove_file_rmap(page, compound);
1352  		goto out;
1353  	}
1354  
1355  	if (compound) {
1356  		page_remove_anon_compound_rmap(page);
1357  		goto out;
1358  	}
1359  
1360  	/* page still mapped by someone else? */
1361  	if (!atomic_add_negative(-1, &page->_mapcount))
1362  		goto out;
1363  
1364  	/*
1365  	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1366  	 * these counters are not modified in interrupt context, and
1367  	 * the pte lock (a spinlock) is held, which implies preemption is disabled.
1368  	 */
1369  	__dec_lruvec_page_state(page, NR_ANON_MAPPED);
1370  
1371  	if (unlikely(PageMlocked(page)))
1372  		clear_page_mlock(page);
1373  
1374  	if (PageTransCompound(page))
1375  		deferred_split_huge_page(compound_head(page));
1376  
1377  	/*
1378  	 * It would be tidy to reset the PageAnon mapping here,
1379  	 * but that might overwrite a racing page_add_anon_rmap
1380  	 * which increments mapcount after us but sets mapping
1381  	 * before us: so leave the reset to free_unref_page,
1382  	 * and remember that it's only reliable while mapped.
1383  	 * Leaving it set also helps swapoff to reinstate ptes
1384  	 * faster for those pages still in swapcache.
1385  	 */
1386  out:
1387  	unlock_page_memcg(page);
1388  }
1389  
1390  /*
1391   * @arg: enum ttu_flags will be passed to this argument
1392   */
1393  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1394  		     unsigned long address, void *arg)
1395  {
1396  	struct mm_struct *mm = vma->vm_mm;
1397  	struct page_vma_mapped_walk pvmw = {
1398  		.page = page,
1399  		.vma = vma,
1400  		.address = address,
1401  	};
1402  	pte_t pteval;
1403  	struct page *subpage;
1404  	bool ret = true;
1405  	struct mmu_notifier_range range;
1406  	enum ttu_flags flags = (enum ttu_flags)(long)arg;
1407  
1408  	/*
1409  	 * When racing against e.g. zap_pte_range() on another cpu,
1410  	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
1411  	 * try_to_unmap() may return before page_mapped() has become false,
1412  	 * if page table locking is skipped: use TTU_SYNC to wait for that.
1413  	 */
1414  	if (flags & TTU_SYNC)
1415  		pvmw.flags = PVMW_SYNC;
1416  
1417  	if (flags & TTU_SPLIT_HUGE_PMD)
1418  		split_huge_pmd_address(vma, address, false, page);
1419  
1420  	/*
1421  	 * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
1422  	 * For hugetlb, it could be much worse if we need to do pud
1423  	 * invalidation in the case of pmd sharing.
1424  	 *
1425  	 * Note that the page cannot be freed in this function, as the caller of
1426  	 * try_to_unmap() must hold a reference on the page.
1427  	 */
1428  	range.end = PageKsm(page) ?
1429  			address + PAGE_SIZE : vma_address_end(page, vma);
1430  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1431  				address, range.end);
1432  	if (PageHuge(page)) {
1433  		/*
1434  		 * If sharing is possible, start and end will be adjusted
1435  		 * accordingly.
1436  		 */
1437  		adjust_range_if_pmd_sharing_possible(vma, &range.start,
1438  						     &range.end);
1439  	}
1440  	mmu_notifier_invalidate_range_start(&range);
1441  
1442  	while (page_vma_mapped_walk(&pvmw)) {
1443  		/*
1444  		 * If the page is mlock()d, we cannot swap it out.
1445  		 */
1446  		if (!(flags & TTU_IGNORE_MLOCK) &&
1447  		    (vma->vm_flags & VM_LOCKED)) {
1448  			/*
1449  			 * PTE-mapped THP are never marked as mlocked: so do
1450  			 * not set it on a DoubleMap THP, nor on an Anon THP
1451  			 * (which may still be PTE-mapped after DoubleMap was
1452  			 * cleared).  But stop unmapping even in those cases.
1453  			 */
1454  			if (!PageTransCompound(page) || (PageHead(page) &&
1455  			     !PageDoubleMap(page) && !PageAnon(page)))
1456  				mlock_vma_page(page);
1457  			page_vma_mapped_walk_done(&pvmw);
1458  			ret = false;
1459  			break;
1460  		}
1461  
1462  		/* Unexpected PMD-mapped THP? */
1463  		VM_BUG_ON_PAGE(!pvmw.pte, page);
1464  
1465  		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
1466  		address = pvmw.address;
1467  
1468  		if (PageHuge(page) && !PageAnon(page)) {
1469  			/*
1470  			 * To call huge_pmd_unshare, i_mmap_rwsem must be
1471  			 * held in write mode.  Caller needs to explicitly
1472  			 * do this outside rmap routines.
1473  			 */
1474  			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
1475  			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
1476  				/*
1477  				 * huge_pmd_unshare unmapped an entire PMD
1478  				 * page.  There is no way of knowing exactly
1479  				 * which PMDs may be cached for this mm, so
1480  				 * we must flush them all.  start/end were
1481  				 * already adjusted above to cover this range.
1482  				 */
1483  				flush_cache_range(vma, range.start, range.end);
1484  				flush_tlb_range(vma, range.start, range.end);
1485  				mmu_notifier_invalidate_range(mm, range.start,
1486  							      range.end);
1487  
1488  				/*
1489  				 * The ref count of the PMD page was dropped
1490  				 * which is part of the way map counting
1491  				 * is done for shared PMDs.  Return 'true'
1492  				 * here.  When there is no other sharing,
1493  				 * huge_pmd_unshare returns false and we will
1494  				 * unmap the actual page and drop map count
1495  				 * to zero.
1496  				 */
1497  				page_vma_mapped_walk_done(&pvmw);
1498  				break;
1499  			}
1500  		}
1501  
1502  		/* Nuke the page table entry. */
1503  		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1504  		if (should_defer_flush(mm, flags)) {
1505  			/*
1506  			 * We clear the PTE but do not flush so potentially
1507  			 * a remote CPU could still be writing to the page.
1508  			 * If the entry was previously clean then the
1509  			 * architecture must guarantee that a clear->dirty
1510  			 * transition on a cached TLB entry is written through
1511  			 * and traps if the PTE is unmapped.
1512  			 */
1513  			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
1514  
1515  			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
1516  		} else {
1517  			pteval = ptep_clear_flush(vma, address, pvmw.pte);
1518  		}
1519  
1520  		/* Move the dirty bit to the page. Now the pte is gone. */
1521  		if (pte_dirty(pteval))
1522  			set_page_dirty(page);
1523  
1524  		/* Update high watermark before we lower rss */
1525  		update_hiwater_rss(mm);
1526  
1527  		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1528  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1529  			if (PageHuge(page)) {
1530  				hugetlb_count_sub(compound_nr(page), mm);
1531  				set_huge_swap_pte_at(mm, address,
1532  						     pvmw.pte, pteval,
1533  						     vma_mmu_pagesize(vma));
1534  			} else {
1535  				dec_mm_counter(mm, mm_counter(page));
1536  				set_pte_at(mm, address, pvmw.pte, pteval);
1537  			}
1538  
1539  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
1540  			/*
1541  			 * The guest indicated that the page content is of no
1542  			 * interest anymore. Simply discard the pte, vmscan
1543  			 * will take care of the rest.
1544  			 * A future reference will then fault in a new zero
1545  			 * page. When userfaultfd is active, we must not drop
1546  			 * this page though, as its main user (postcopy
1547  			 * migration) will not expect userfaults on already
1548  			 * copied pages.
1549  			 */
1550  			dec_mm_counter(mm, mm_counter(page));
1551  			/* We have to invalidate as we cleared the pte */
1552  			mmu_notifier_invalidate_range(mm, address,
1553  						      address + PAGE_SIZE);
1554  		} else if (PageAnon(page)) {
1555  			swp_entry_t entry = { .val = page_private(subpage) };
1556  			pte_t swp_pte;
1557  			/*
1558  			 * Store the swap location in the pte.
1559  			 * See handle_pte_fault() ...
1560  			 */
1561  			if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
1562  				WARN_ON_ONCE(1);
1563  				ret = false;
1564  				/* We have to invalidate as we cleared the pte */
1565  				mmu_notifier_invalidate_range(mm, address,
1566  							address + PAGE_SIZE);
1567  				page_vma_mapped_walk_done(&pvmw);
1568  				break;
1569  			}
1570  
1571  			/* MADV_FREE page check */
1572  			if (!PageSwapBacked(page)) {
1573  				if (!PageDirty(page)) {
1574  					/* Invalidate as we cleared the pte */
1575  					mmu_notifier_invalidate_range(mm,
1576  						address, address + PAGE_SIZE);
1577  					dec_mm_counter(mm, MM_ANONPAGES);
1578  					goto discard;
1579  				}
1580  
1581  				/*
1582  				 * If the page was redirtied, it cannot be
1583  				 * discarded. Remap the page to page table.
1584  				 */
1585  				set_pte_at(mm, address, pvmw.pte, pteval);
1586  				SetPageSwapBacked(page);
1587  				ret = false;
1588  				page_vma_mapped_walk_done(&pvmw);
1589  				break;
1590  			}
1591  
1592  			if (swap_duplicate(entry) < 0) {
1593  				set_pte_at(mm, address, pvmw.pte, pteval);
1594  				ret = false;
1595  				page_vma_mapped_walk_done(&pvmw);
1596  				break;
1597  			}
1598  			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1599  				set_pte_at(mm, address, pvmw.pte, pteval);
1600  				ret = false;
1601  				page_vma_mapped_walk_done(&pvmw);
1602  				break;
1603  			}
1604  			if (list_empty(&mm->mmlist)) {
1605  				spin_lock(&mmlist_lock);
1606  				if (list_empty(&mm->mmlist))
1607  					list_add(&mm->mmlist, &init_mm.mmlist);
1608  				spin_unlock(&mmlist_lock);
1609  			}
1610  			dec_mm_counter(mm, MM_ANONPAGES);
1611  			inc_mm_counter(mm, MM_SWAPENTS);
1612  			swp_pte = swp_entry_to_pte(entry);
1613  			if (pte_soft_dirty(pteval))
1614  				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1615  			if (pte_uffd_wp(pteval))
1616  				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1617  			set_pte_at(mm, address, pvmw.pte, swp_pte);
1618  			/* Invalidate as we cleared the pte */
1619  			mmu_notifier_invalidate_range(mm, address,
1620  						      address + PAGE_SIZE);
1621  		} else {
1622  			/*
1623  			 * This is a locked file-backed page, thus it cannot
1624  			 * be removed from the page cache and replaced by a new
1625  			 * page before mmu_notifier_invalidate_range_end, so no
1626  			 * concurrent thread might update its page table to
1627  			 * point at new page while a device still is using this
1628  			 * page.
1629  			 *
1630  			 * See Documentation/vm/mmu_notifier.rst
1631  			 */
1632  			dec_mm_counter(mm, mm_counter_file(page));
1633  		}
1634  discard:
1635  		/*
1636  		 * No need to call mmu_notifier_invalidate_range(); it has been
1637  		 * done above for all cases requiring it to happen under the page
1638  		 * table lock, before mmu_notifier_invalidate_range_end().
1639  		 *
1640  		 * See Documentation/vm/mmu_notifier.rst
1641  		 */
1642  		page_remove_rmap(subpage, PageHuge(page));
1643  		put_page(page);
1644  	}
1645  
1646  	mmu_notifier_invalidate_range_end(&range);
1647  
1648  	return ret;
1649  }
1650  
1651  static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1652  {
1653  	return vma_is_temporary_stack(vma);
1654  }
1655  
1656  static int page_not_mapped(struct page *page)
1657  {
1658  	return !page_mapped(page);
1659  }
1660  
1661  /**
1662   * try_to_unmap - try to remove all page table mappings to a page
1663   * @page: the page to get unmapped
1664   * @flags: action and flags
1665   *
1666   * Tries to remove all the page table entries which are mapping this
1667   * page, used in the pageout path.  Caller must hold the page lock.
1668   *
1669   * It is the caller's responsibility to check if the page is still
1670   * mapped when needed (use TTU_SYNC to prevent accounting races).
1671   */
1672  void try_to_unmap(struct page *page, enum ttu_flags flags)
1673  {
1674  	struct rmap_walk_control rwc = {
1675  		.rmap_one = try_to_unmap_one,
1676  		.arg = (void *)flags,
1677  		.done = page_not_mapped,
1678  		.anon_lock = page_lock_anon_vma_read,
1679  	};
1680  
1681  	if (flags & TTU_RMAP_LOCKED)
1682  		rmap_walk_locked(page, &rwc);
1683  	else
1684  		rmap_walk(page, &rwc);
1685  }
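
/*
 * Illustrative sketch only, not part of the original file: a hypothetical
 * pageout-path caller (example_unmap_page is a made-up name) might drive
 * try_to_unmap() roughly like this, holding the page lock and rechecking
 * the mapping state itself afterwards, as the kerneldoc above requires:
 *
 *	static bool example_unmap_page(struct page *page)
 *	{
 *		bool unmapped;
 *
 *		if (!trylock_page(page))
 *			return false;
 *		if (page_mapped(page))
 *			try_to_unmap(page, TTU_SYNC);
 *		unmapped = !page_mapped(page);	// caller checks, as documented
 *		unlock_page(page);
 *		return unmapped;
 *	}
 */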
1686  
1687  /*
1688   * @arg: an enum ttu_flags value is passed in this argument.
1689   *
1690   * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
1691   * containing migration entries.
1692   */
1693  static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
1694  		     unsigned long address, void *arg)
1695  {
1696  	struct mm_struct *mm = vma->vm_mm;
1697  	struct page_vma_mapped_walk pvmw = {
1698  		.page = page,
1699  		.vma = vma,
1700  		.address = address,
1701  	};
1702  	pte_t pteval;
1703  	struct page *subpage;
1704  	bool ret = true;
1705  	struct mmu_notifier_range range;
1706  	enum ttu_flags flags = (enum ttu_flags)(long)arg;
1707  
1708  	/*
1709  	 * When racing against e.g. zap_pte_range() on another cpu,
1710  	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
1711  	 * try_to_migrate() may return before page_mapped() has become false,
1712  	 * if page table locking is skipped: use TTU_SYNC to wait for that.
1713  	 */
1714  	if (flags & TTU_SYNC)
1715  		pvmw.flags = PVMW_SYNC;
1716  
1717  	/*
1718  	 * unmap_page() in mm/huge_memory.c is the only user of migration with
1719  	 * TTU_SPLIT_HUGE_PMD and it wants to freeze.
1720  	 */
1721  	if (flags & TTU_SPLIT_HUGE_PMD)
1722  		split_huge_pmd_address(vma, address, true, page);
1723  
1724  	/*
1725  	 * For THP, we have to assume the worst case, ie pmd, for invalidation.
1726  	 * For hugetlb, it could be much worse if we need to do pud
1727  	 * invalidation in the case of pmd sharing.
1728  	 *
1729  	 * Note that the page cannot be freed in this function as the caller
1730  	 * of try_to_migrate() must hold a reference on the page.
1731  	 */
1732  	range.end = PageKsm(page) ?
1733  			address + PAGE_SIZE : vma_address_end(page, vma);
1734  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1735  				address, range.end);
1736  	if (PageHuge(page)) {
1737  		/*
1738  		 * If sharing is possible, start and end will be adjusted
1739  		 * accordingly.
1740  		 */
1741  		adjust_range_if_pmd_sharing_possible(vma, &range.start,
1742  						     &range.end);
1743  	}
1744  	mmu_notifier_invalidate_range_start(&range);
1745  
1746  	while (page_vma_mapped_walk(&pvmw)) {
1747  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1748  		/* PMD-mapped THP migration entry */
1749  		if (!pvmw.pte) {
1750  			VM_BUG_ON_PAGE(PageHuge(page) ||
1751  				       !PageTransCompound(page), page);
1752  
1753  			set_pmd_migration_entry(&pvmw, page);
1754  			continue;
1755  		}
1756  #endif
1757  
1758  		/* Unexpected PMD-mapped THP? */
1759  		VM_BUG_ON_PAGE(!pvmw.pte, page);
1760  
1761  		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
1762  		address = pvmw.address;
1763  
1764  		if (PageHuge(page) && !PageAnon(page)) {
1765  			/*
1766  			 * To call huge_pmd_unshare, i_mmap_rwsem must be
1767  			 * held in write mode.  Caller needs to explicitly
1768  			 * do this outside rmap routines.
1769  			 */
1770  			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
1771  			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
1772  				/*
1773  				 * huge_pmd_unshare unmapped an entire PMD
1774  				 * page.  There is no way of knowing exactly
1775  				 * which PMDs may be cached for this mm, so
1776  				 * we must flush them all.  start/end were
1777  				 * already adjusted above to cover this range.
1778  				 */
1779  				flush_cache_range(vma, range.start, range.end);
1780  				flush_tlb_range(vma, range.start, range.end);
1781  				mmu_notifier_invalidate_range(mm, range.start,
1782  							      range.end);
1783  
1784  				/*
1785  				 * The ref count of the PMD page was dropped
1786  				 * which is part of the way map counting
1787  				 * is done for shared PMDs.  Return 'true'
1788  				 * here.  When there is no other sharing,
1789  				 * huge_pmd_unshare returns false and we will
1790  				 * unmap the actual page and drop map count
1791  				 * to zero.
1792  				 */
1793  				page_vma_mapped_walk_done(&pvmw);
1794  				break;
1795  			}
1796  		}
1797  
1798  		/* Nuke the page table entry. */
1799  		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1800  		pteval = ptep_clear_flush(vma, address, pvmw.pte);
1801  
1802  		/* Move the dirty bit to the page. Now the pte is gone. */
1803  		if (pte_dirty(pteval))
1804  			set_page_dirty(page);
1805  
1806  		/* Update high watermark before we lower rss */
1807  		update_hiwater_rss(mm);
1808  
1809  		if (is_zone_device_page(page)) {
1810  			unsigned long pfn = page_to_pfn(page);
1811  			swp_entry_t entry;
1812  			pte_t swp_pte;
1813  
1814  			/*
1815  			 * Store the pfn of the page in a special migration
1816  			 * pte. do_swap_page() will wait until the migration
1817  			 * pte is removed and then restart fault handling.
1818  			 */
1819  			entry = pte_to_swp_entry(pteval);
1820  			if (is_writable_device_private_entry(entry))
1821  				entry = make_writable_migration_entry(pfn);
1822  			else
1823  				entry = make_readable_migration_entry(pfn);
1824  			swp_pte = swp_entry_to_pte(entry);
1825  
1826  			/*
1827  			 * pteval maps a zone device page and is therefore
1828  			 * a swap pte.
1829  			 */
1830  			if (pte_swp_soft_dirty(pteval))
1831  				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1832  			if (pte_swp_uffd_wp(pteval))
1833  				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1834  			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
1835  			/*
1836  			 * No need to invalidate here, it will be synchronized
1837  			 * against the special swap migration pte.
1838  			 *
1839  			 * The assignment to subpage above was computed from a
1840  			 * swap PTE which results in an invalid pointer.
1841  			 * Since only PAGE_SIZE pages can currently be
1842  			 * migrated, just set it to page. This will need to be
1843  			 * changed when hugepage migrations to device private
1844  			 * memory are supported.
1845  			 */
1846  			subpage = page;
1847  		} else if (PageHWPoison(page)) {
1848  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1849  			if (PageHuge(page)) {
1850  				hugetlb_count_sub(compound_nr(page), mm);
1851  				set_huge_swap_pte_at(mm, address,
1852  						     pvmw.pte, pteval,
1853  						     vma_mmu_pagesize(vma));
1854  			} else {
1855  				dec_mm_counter(mm, mm_counter(page));
1856  				set_pte_at(mm, address, pvmw.pte, pteval);
1857  			}
1858  
1859  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
1860  			/*
1861  			 * The guest indicated that the page content is of no
1862  			 * interest anymore. Simply discard the pte, vmscan
1863  			 * will take care of the rest.
1864  			 * A future reference will then fault in a new zero
1865  			 * page. When userfaultfd is active, we must not drop
1866  			 * this page though, as its main user (postcopy
1867  			 * migration) will not expect userfaults on already
1868  			 * copied pages.
1869  			 */
1870  			dec_mm_counter(mm, mm_counter(page));
1871  			/* We have to invalidate as we cleared the pte */
1872  			mmu_notifier_invalidate_range(mm, address,
1873  						      address + PAGE_SIZE);
1874  		} else {
1875  			swp_entry_t entry;
1876  			pte_t swp_pte;
1877  
1878  			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1879  				set_pte_at(mm, address, pvmw.pte, pteval);
1880  				ret = false;
1881  				page_vma_mapped_walk_done(&pvmw);
1882  				break;
1883  			}
1884  
1885  			/*
1886  			 * Store the pfn of the page in a special migration
1887  			 * pte. do_swap_page() will wait until the migration
1888  			 * pte is removed and then restart fault handling.
1889  			 */
1890  			if (pte_write(pteval))
1891  				entry = make_writable_migration_entry(
1892  							page_to_pfn(subpage));
1893  			else
1894  				entry = make_readable_migration_entry(
1895  							page_to_pfn(subpage));
1896  
1897  			swp_pte = swp_entry_to_pte(entry);
1898  			if (pte_soft_dirty(pteval))
1899  				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1900  			if (pte_uffd_wp(pteval))
1901  				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1902  			set_pte_at(mm, address, pvmw.pte, swp_pte);
1903  			/*
1904  			 * No need to invalidate here, it will be synchronized
1905  			 * against the special swap migration pte.
1906  			 */
1907  		}
1908  
1909  		/*
1910  		 * No need to call mmu_notifier_invalidate_range(); it has been
1911  		 * done above for all cases requiring it to happen under the page
1912  		 * table lock before mmu_notifier_invalidate_range_end().
1913  		 *
1914  		 * See Documentation/vm/mmu_notifier.rst
1915  		 */
1916  		page_remove_rmap(subpage, PageHuge(page));
1917  		put_page(page);
1918  	}
1919  
1920  	mmu_notifier_invalidate_range_end(&range);
1921  
1922  	return ret;
1923  }
1924  
1925  /**
1926   * try_to_migrate - try to replace all page table mappings with swap entries
1927   * @page: the page to replace page table entries for
1928   * @flags: action and flags
1929   *
1930   * Tries to remove all the page table entries which are mapping this page and
1931   * replace them with special swap entries. Caller must hold the page lock.
1932   */
1933  void try_to_migrate(struct page *page, enum ttu_flags flags)
1934  {
1935  	struct rmap_walk_control rwc = {
1936  		.rmap_one = try_to_migrate_one,
1937  		.arg = (void *)flags,
1938  		.done = page_not_mapped,
1939  		.anon_lock = page_lock_anon_vma_read,
1940  	};
1941  
1942  	/*
1943  	 * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
1944  	 * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
1945  	 */
1946  	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
1947  					TTU_SYNC)))
1948  		return;
1949  
1950  	if (is_zone_device_page(page) && !is_device_private_page(page))
1951  		return;
1952  
1953  	/*
1954  	 * During exec, a temporary VMA is set up and later moved.
1955  	 * The VMA is moved under the anon_vma lock but not the
1956  	 * page tables leading to a race where migration cannot
1957  	 * find the migration ptes. Rather than increasing the
1958  	 * locking requirements of exec(), migration skips
1959  	 * temporary VMAs until after exec() completes.
1960  	 */
1961  	if (!PageKsm(page) && PageAnon(page))
1962  		rwc.invalid_vma = invalid_migration_vma;
1963  
1964  	if (flags & TTU_RMAP_LOCKED)
1965  		rmap_walk_locked(page, &rwc);
1966  	else
1967  		rmap_walk(page, &rwc);
1968  }
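
/*
 * Illustrative sketch only, not part of the original file: migration code
 * conceptually replaces the mappings of a locked page with migration entries
 * before copying it; a hypothetical helper (example_freeze_page is a made-up
 * name) could look like:
 *
 *	static bool example_freeze_page(struct page *page)
 *	{
 *		VM_BUG_ON_PAGE(!PageLocked(page), page);
 *		try_to_migrate(page, 0);
 *		return !page_mapped(page);	// fully replaced by swap entries?
 *	}
 */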
1969  
1970  /*
1971   * Walks the VMAs mapping a page and mlocks the page if any locked VMA is found.
1972   * Once one is found, the page is mlocked and the scan can be terminated.
1973   */
1974  static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
1975  				 unsigned long address, void *unused)
1976  {
1977  	struct page_vma_mapped_walk pvmw = {
1978  		.page = page,
1979  		.vma = vma,
1980  		.address = address,
1981  	};
1982  
1983  	/* An unlocked vma doesn't have any pages to mlock, continue the scan */
1984  	if (!(vma->vm_flags & VM_LOCKED))
1985  		return true;
1986  
1987  	while (page_vma_mapped_walk(&pvmw)) {
1988  		/*
1989  		 * Need to recheck under the ptl to serialise with
1990  		 * __munlock_pagevec_fill() after VM_LOCKED is cleared in
1991  		 * munlock_vma_pages_range().
1992  		 */
1993  		if (vma->vm_flags & VM_LOCKED) {
1994  			/*
1995  			 * PTE-mapped THP are never marked as mlocked; but
1996  			 * this function is never called on a DoubleMap THP,
1997  			 * nor on an Anon THP (which may still be PTE-mapped
1998  			 * after DoubleMap was cleared).
1999  			 */
2000  			mlock_vma_page(page);
2001  			/*
2002  			 * No need to scan further once the page is marked
2003  			 * as mlocked.
2004  			 */
2005  			page_vma_mapped_walk_done(&pvmw);
2006  			return false;
2007  		}
2008  	}
2009  
2010  	return true;
2011  }
2012  
2013  /**
2014   * page_mlock - try to mlock a page
2015   * @page: the page to be mlocked
2016   *
2017   * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
2018   * the page if any locked VMA is found. The page will be returned with
2019   * PG_mlocked cleared if it is not mapped by any locked vmas.
2020   */
2021  void page_mlock(struct page *page)
2022  {
2023  	struct rmap_walk_control rwc = {
2024  		.rmap_one = page_mlock_one,
2025  		.done = page_not_mapped,
2026  		.anon_lock = page_lock_anon_vma_read,
2027  
2028  	};
2029  
2030  	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
2031  	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
2032  
2033  	/* Anon THP are only marked as mlocked when singly mapped */
2034  	if (PageTransCompound(page) && PageAnon(page))
2035  		return;
2036  
2037  	rmap_walk(page, &rwc);
2038  }
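
/*
 * Illustrative sketch only, not part of the original file: the munlock path
 * conceptually clears PG_mlocked and then lets page_mlock() re-mlock the page
 * if another VM_LOCKED vma still maps it (example_munlock_page is a made-up
 * name; the page must already be locked and isolated from the LRU, matching
 * the VM_BUG_ONs above):
 *
 *	static void example_munlock_page(struct page *page)
 *	{
 *		if (TestClearPageMlocked(page))
 *			page_mlock(page);	// re-mlock if still needed
 *	}
 */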
2039  
2040  #ifdef CONFIG_DEVICE_PRIVATE
2041  struct make_exclusive_args {
2042  	struct mm_struct *mm;
2043  	unsigned long address;
2044  	void *owner;
2045  	bool valid;
2046  };
2047  
2048  static bool page_make_device_exclusive_one(struct page *page,
2049  		struct vm_area_struct *vma, unsigned long address, void *priv)
2050  {
2051  	struct mm_struct *mm = vma->vm_mm;
2052  	struct page_vma_mapped_walk pvmw = {
2053  		.page = page,
2054  		.vma = vma,
2055  		.address = address,
2056  	};
2057  	struct make_exclusive_args *args = priv;
2058  	pte_t pteval;
2059  	struct page *subpage;
2060  	bool ret = true;
2061  	struct mmu_notifier_range range;
2062  	swp_entry_t entry;
2063  	pte_t swp_pte;
2064  
2065  	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
2066  				      vma->vm_mm, address, min(vma->vm_end,
2067  				      address + page_size(page)), args->owner);
2068  	mmu_notifier_invalidate_range_start(&range);
2069  
2070  	while (page_vma_mapped_walk(&pvmw)) {
2071  		/* Unexpected PMD-mapped THP? */
2072  		VM_BUG_ON_PAGE(!pvmw.pte, page);
2073  
2074  		if (!pte_present(*pvmw.pte)) {
2075  			ret = false;
2076  			page_vma_mapped_walk_done(&pvmw);
2077  			break;
2078  		}
2079  
2080  		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
2081  		address = pvmw.address;
2082  
2083  		/* Nuke the page table entry. */
2084  		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
2085  		pteval = ptep_clear_flush(vma, address, pvmw.pte);
2086  
2087  		/* Move the dirty bit to the page. Now the pte is gone. */
2088  		if (pte_dirty(pteval))
2089  			set_page_dirty(page);
2090  
2091  		/*
2092  		 * Check that our target page is still mapped at the expected
2093  		 * address.
2094  		 */
2095  		if (args->mm == mm && args->address == address &&
2096  		    pte_write(pteval))
2097  			args->valid = true;
2098  
2099  		/*
2100  		 * Store the pfn of the page in a special device exclusive
2101  		 * swap pte. A subsequent CPU fault will revoke the device's
2102  		 * exclusive access and restore the original pte.
2103  		 */
2104  		if (pte_write(pteval))
2105  			entry = make_writable_device_exclusive_entry(
2106  							page_to_pfn(subpage));
2107  		else
2108  			entry = make_readable_device_exclusive_entry(
2109  							page_to_pfn(subpage));
2110  		swp_pte = swp_entry_to_pte(entry);
2111  		if (pte_soft_dirty(pteval))
2112  			swp_pte = pte_swp_mksoft_dirty(swp_pte);
2113  		if (pte_uffd_wp(pteval))
2114  			swp_pte = pte_swp_mkuffd_wp(swp_pte);
2115  
2116  		set_pte_at(mm, address, pvmw.pte, swp_pte);
2117  
2118  		/*
2119  		 * There is a reference on the page for the swap entry which has
2120  		 * been removed, so we shouldn't take another.
2121  		 */
2122  		page_remove_rmap(subpage, false);
2123  	}
2124  
2125  	mmu_notifier_invalidate_range_end(&range);
2126  
2127  	return ret;
2128  }
2129  
2130  /**
2131   * page_make_device_exclusive - mark the page exclusively owned by a device
2132   * @page: the page to replace page table entries for
2133   * @mm: the mm_struct where the page is expected to be mapped
2134   * @address: address where the page is expected to be mapped
2135   * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
2136   *
2137   * Tries to remove all the page table entries which are mapping this page and
2138   * replace them with special device exclusive swap entries to grant a device
2139   * exclusive access to the page. Caller must hold the page lock.
2140   *
2141   * Returns false if the page is still mapped, or if it could not be unmapped
2142   * from the expected address. Otherwise returns true (success).
2143   */
2144  static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
2145  				unsigned long address, void *owner)
2146  {
2147  	struct make_exclusive_args args = {
2148  		.mm = mm,
2149  		.address = address,
2150  		.owner = owner,
2151  		.valid = false,
2152  	};
2153  	struct rmap_walk_control rwc = {
2154  		.rmap_one = page_make_device_exclusive_one,
2155  		.done = page_not_mapped,
2156  		.anon_lock = page_lock_anon_vma_read,
2157  		.arg = &args,
2158  	};
2159  
2160  	/*
2161  	 * Restrict to anonymous pages for now to avoid potential writeback
2162  	 * issues. Also tail pages shouldn't be passed to rmap_walk so skip
2163  	 * those.
2164  	 */
2165  	if (!PageAnon(page) || PageTail(page))
2166  		return false;
2167  
2168  	rmap_walk(page, &rwc);
2169  
2170  	return args.valid && !page_mapcount(page);
2171  }
2172  
2173  /**
2174   * make_device_exclusive_range() - Mark a range for exclusive use by a device
2175   * @mm: mm_struct of the associated target process
2176   * @start: start of the region to mark for exclusive device access
2177   * @end: end address of region
2178   * @pages: returns the pages which were successfully marked for exclusive access
2179   * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
2180   *
2181   * Returns: number of pages found in the range by GUP. A page is marked for
2182   * exclusive access only if its corresponding entry in @pages is non-NULL.
2183   *
2184   * This function finds ptes mapping page(s) to the given address range, locks
2185   * them and replaces mappings with special swap entries preventing userspace CPU
2186   * access. On fault these entries are replaced with the original mapping after
2187   * calling MMU notifiers.
2188   *
2189   * A driver using this to program access from a device must use an mmu notifier
2190   * critical section to hold a device-specific lock during programming. Once
2191   * programming is complete, it should drop the page lock and reference, after
2192   * which point CPU access to the page will revoke the exclusive access.
2193   */
2194  int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
2195  				unsigned long end, struct page **pages,
2196  				void *owner)
2197  {
2198  	long npages = (end - start) >> PAGE_SHIFT;
2199  	long i;
2200  
2201  	npages = get_user_pages_remote(mm, start, npages,
2202  				       FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
2203  				       pages, NULL, NULL);
2204  	if (npages < 0)
2205  		return npages;
2206  
2207  	for (i = 0; i < npages; i++, start += PAGE_SIZE) {
2208  		if (!trylock_page(pages[i])) {
2209  			put_page(pages[i]);
2210  			pages[i] = NULL;
2211  			continue;
2212  		}
2213  
2214  		if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
2215  			unlock_page(pages[i]);
2216  			put_page(pages[i]);
2217  			pages[i] = NULL;
2218  		}
2219  	}
2220  
2221  	return npages;
2222  }
2223  EXPORT_SYMBOL_GPL(make_device_exclusive_range);
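
/*
 * Illustrative sketch only, not part of the original file: a hypothetical
 * driver (example_grab_page is a made-up name) could mark a single page for
 * exclusive device access like this; mmap_lock must be held across the GUP
 * call inside, and error handling is elided:
 *
 *	static int example_grab_page(struct mm_struct *mm, unsigned long addr,
 *				     void *owner)
 *	{
 *		struct page *page = NULL;
 *		int ret;
 *
 *		mmap_read_lock(mm);
 *		ret = make_device_exclusive_range(mm, addr, addr + PAGE_SIZE,
 *						  &page, owner);
 *		mmap_read_unlock(mm);
 *		if (ret != 1 || !page)
 *			return -EBUSY;
 *
 *		// ... program the device here, under its notifier lock ...
 *
 *		unlock_page(page);
 *		put_page(page);
 *		return 0;
 *	}
 */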
2224  #endif
2225  
2226  void __put_anon_vma(struct anon_vma *anon_vma)
2227  {
2228  	struct anon_vma *root = anon_vma->root;
2229  
2230  	anon_vma_free(anon_vma);
2231  	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
2232  		anon_vma_free(root);
2233  }
2234  
2235  static struct anon_vma *rmap_walk_anon_lock(struct page *page,
2236  					struct rmap_walk_control *rwc)
2237  {
2238  	struct anon_vma *anon_vma;
2239  
2240  	if (rwc->anon_lock)
2241  		return rwc->anon_lock(page);
2242  
2243  	/*
2244  	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
2245  	 * because that depends on page_mapped(); but not all of its users
2246  	 * hold mmap_lock. Users without mmap_lock are required to take a
2247  	 * reference count to prevent the anon_vma from disappearing.
2248  	 */
2249  	anon_vma = page_anon_vma(page);
2250  	if (!anon_vma)
2251  		return NULL;
2252  
2253  	anon_vma_lock_read(anon_vma);
2254  	return anon_vma;
2255  }
2256  
2257  /*
2258   * rmap_walk_anon - do something to an anonymous page using the object-based
2259   * rmap method
2260   * @page: the page to be handled
2261   * @rwc: control variable according to each walk type
2262   *
2263   * Find all the mappings of a page using the mapping pointer and the vma chains
2264   * contained in the anon_vma struct it points to.
2265   *
2266   * When called from page_mlock(), the mmap_lock of the mm containing the vma
2267   * where the page was found will be held for write.  So, we won't recheck
2268   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
2269   * LOCKED.
2270   */
2271  static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
2272  		bool locked)
2273  {
2274  	struct anon_vma *anon_vma;
2275  	pgoff_t pgoff_start, pgoff_end;
2276  	struct anon_vma_chain *avc;
2277  
2278  	if (locked) {
2279  		anon_vma = page_anon_vma(page);
2280  		/* Did the anon_vma disappear under us? */
2281  		VM_BUG_ON_PAGE(!anon_vma, page);
2282  	} else {
2283  		anon_vma = rmap_walk_anon_lock(page, rwc);
2284  	}
2285  	if (!anon_vma)
2286  		return;
2287  
2288  	pgoff_start = page_to_pgoff(page);
2289  	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
2290  	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2291  			pgoff_start, pgoff_end) {
2292  		struct vm_area_struct *vma = avc->vma;
2293  		unsigned long address = vma_address(page, vma);
2294  
2295  		VM_BUG_ON_VMA(address == -EFAULT, vma);
2296  		cond_resched();
2297  
2298  		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2299  			continue;
2300  
2301  		if (!rwc->rmap_one(page, vma, address, rwc->arg))
2302  			break;
2303  		if (rwc->done && rwc->done(page))
2304  			break;
2305  	}
2306  
2307  	if (!locked)
2308  		anon_vma_unlock_read(anon_vma);
2309  }
2310  
2311  /*
2312   * rmap_walk_file - do something to file page using the object-based rmap method
2313   * rmap_walk_file - do something to a file page using the object-based rmap method
2314   * @rwc: control variable according to each walk type
2315   *
2316   * Find all the mappings of a page using the mapping pointer and the vma chains
2317   * contained in the address_space struct it points to.
2318   *
2319   * When called from page_mlock(), the mmap_lock of the mm containing the vma
2320   * where the page was found will be held for write.  So, we won't recheck
2321   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
2322   * LOCKED.
2323   */
2324  static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
2325  		bool locked)
2326  {
2327  	struct address_space *mapping = page_mapping(page);
2328  	pgoff_t pgoff_start, pgoff_end;
2329  	struct vm_area_struct *vma;
2330  
2331  	/*
2332  	 * The page lock not only makes sure that page->mapping cannot
2333  	 * suddenly be NULLified by truncation, it makes sure that the
2334  	 * structure at mapping cannot be freed and reused yet,
2335  	 * so we can safely take mapping->i_mmap_rwsem.
2336  	 */
2337  	VM_BUG_ON_PAGE(!PageLocked(page), page);
2338  
2339  	if (!mapping)
2340  		return;
2341  
2342  	pgoff_start = page_to_pgoff(page);
2343  	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
2344  	if (!locked)
2345  		i_mmap_lock_read(mapping);
2346  	vma_interval_tree_foreach(vma, &mapping->i_mmap,
2347  			pgoff_start, pgoff_end) {
2348  		unsigned long address = vma_address(page, vma);
2349  
2350  		VM_BUG_ON_VMA(address == -EFAULT, vma);
2351  		cond_resched();
2352  
2353  		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2354  			continue;
2355  
2356  		if (!rwc->rmap_one(page, vma, address, rwc->arg))
2357  			goto done;
2358  		if (rwc->done && rwc->done(page))
2359  			goto done;
2360  	}
2361  
2362  done:
2363  	if (!locked)
2364  		i_mmap_unlock_read(mapping);
2365  }
2366  
2367  void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
2368  {
2369  	if (unlikely(PageKsm(page)))
2370  		rmap_walk_ksm(page, rwc);
2371  	else if (PageAnon(page))
2372  		rmap_walk_anon(page, rwc, false);
2373  	else
2374  		rmap_walk_file(page, rwc, false);
2375  }
2376  
2377  /* Like rmap_walk, but caller holds relevant rmap lock */
2378  void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
2379  {
2380  	/* no ksm support for now */
2381  	VM_BUG_ON_PAGE(PageKsm(page), page);
2382  	if (PageAnon(page))
2383  		rmap_walk_anon(page, rwc, true);
2384  	else
2385  		rmap_walk_file(page, rwc, true);
2386  }
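
/*
 * Illustrative sketch only, not part of the original file: a minimal
 * rmap_walk() user supplies an rmap_one callback through rmap_walk_control,
 * e.g. to count the VMAs currently mapping a locked page (the names
 * example_count_one and example_count_mappings are made up):
 *
 *	static bool example_count_one(struct page *page,
 *			struct vm_area_struct *vma, unsigned long address,
 *			void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return true;			// keep walking
 *	}
 *
 *	static int example_count_mappings(struct page *page)
 *	{
 *		int count = 0;
 *		struct rmap_walk_control rwc = {
 *			.rmap_one = example_count_one,
 *			.arg = &count,
 *			.anon_lock = page_lock_anon_vma_read,
 *		};
 *
 *		rmap_walk(page, &rwc);		// page must be locked
 *		return count;
 *	}
 */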
2387  
2388  #ifdef CONFIG_HUGETLB_PAGE
2389  /*
2390   * The following two functions are for anonymous (privately mapped) hugepages.
2391   * Unlike common anonymous pages, anonymous hugepages have no accounting code
2392   * and no lru code, because we handle hugepages differently from common pages.
2393   */
2394  void hugepage_add_anon_rmap(struct page *page,
2395  			    struct vm_area_struct *vma, unsigned long address)
2396  {
2397  	struct anon_vma *anon_vma = vma->anon_vma;
2398  	int first;
2399  
2400  	BUG_ON(!PageLocked(page));
2401  	BUG_ON(!anon_vma);
2402  	/* address might be in the next vma when migration races vma_adjust */
2403  	first = atomic_inc_and_test(compound_mapcount_ptr(page));
2404  	if (first)
2405  		__page_set_anon_rmap(page, vma, address, 0);
2406  }
2407  
2408  void hugepage_add_new_anon_rmap(struct page *page,
2409  			struct vm_area_struct *vma, unsigned long address)
2410  {
2411  	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
2412  	atomic_set(compound_mapcount_ptr(page), 0);
2413  	if (hpage_pincount_available(page))
2414  		atomic_set(compound_pincount_ptr(page), 0);
2415  
2416  	__page_set_anon_rmap(page, vma, address, 1);
2417  }
2418  #endif /* CONFIG_HUGETLB_PAGE */
2419