xref: /openbmc/linux/include/linux/rmap.h (revision 7d7ae873b5e0f46d19e5dc818d1a7809e4b7cc81)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma or page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for cleaning up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equals the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making the decision about reusing an
	 * anon_vma instead of forking a new one. See the comments in
	 * anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};
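
/*
 * Illustrative sketch (not part of the upstream header): the two link
 * directions of anon_vma_chain. Walking a VMA's chains uses the
 * vma->anon_vma_chain list head (assumed here, as declared in
 * struct vm_area_struct); walking an anon_vma's VMAs goes through its
 * rb_root interval tree instead. The helper below is hypothetical and
 * only demonstrates the list direction.
 */
#if 0	/* example only, not compiled */
static void example_walk_vma_chains(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	/* Caller is assumed to hold mmap_lock; see the same_vma comment above. */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		pr_info("vma %p is linked to anon_vma %p\n", vma, avc->anon_vma);
}
#endif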

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};
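
/*
 * Illustrative sketch (not part of the upstream header): ttu_flags form a
 * bitmask, so callers combine them when unmapping a folio via try_to_unmap()
 * declared later in this header. The folio and its locking context below are
 * assumptions for the example, not requirements spelled out here.
 */
#if 0	/* example only, not compiled */
static void example_unmap_for_reclaim(struct folio *folio)
{
	/* Folio is assumed locked; batch the TLB flush and ignore mlock. */
	try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
}
#endif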

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}

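/*
 * Illustrative sketch (not part of the upstream header): the refcount and the
 * root rwsem are typically taken together before walking an anon_vma's
 * interval tree, so the anon_vma cannot be freed while it is being walked.
 * The walk body is elided; only the pin/lock/unlock/unpin pattern is shown.
 */
#if 0	/* example only, not compiled */
static void example_walk_anon_vma(struct anon_vma *anon_vma)
{
	get_anon_vma(anon_vma);		/* pin: keeps the anon_vma alive */
	anon_vma_lock_read(anon_vma);	/* readers may walk rb_root concurrently */

	/* ... walk anon_vma->rb_root here ... */

	anon_vma_unlock_read(anon_vma);
	put_anon_vma(anon_vma);		/* may free the anon_vma if last user */
}
#endif
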
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}
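
/*
 * Illustrative sketch (not part of the upstream header): a typical anonymous
 * fault path makes sure the VMA has an anon_vma before installing a new
 * anonymous page. The fault plumbing and the page installation are elided;
 * the helper name and -ENOMEM return are assumptions for the example.
 */
#if 0	/* example only, not compiled */
static int example_prepare_anon_fault(struct vm_area_struct *vma)
{
	/* Cheap if vma->anon_vma already exists; otherwise allocates one. */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;

	/* ... allocate the page, then page_add_new_anon_rmap() and set the PTE ... */
	return 0;
}
#endif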

static inline void anon_vma_merge(struct vm_area_struct *vma,
				  struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: if the page is a subpage of a compound page, it is
 * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * The compound page is not mapped via PTEs, but instead via a single PMD and
 * should be accounted accordingly.
 */
#define RMAP_COMPOUND		((__force rmap_t)BIT(1))
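
/*
 * Illustrative sketch (not part of the upstream header): rmap_t values are
 * or-ed together and passed to the anon rmap functions declared below, for
 * example when a write fault installs a page known to be exclusive to the
 * faulting process. The page/vma/address arguments are assumed.
 */
#if 0	/* example only, not compiled */
static void example_map_exclusive_anon_page(struct page *page,
					    struct vm_area_struct *vma,
					    unsigned long address)
{
	/* PTE-mapped (no RMAP_COMPOUND), and exclusively owned by this mm. */
	page_add_anon_rmap(page, vma, address, RMAP_EXCLUSIVE);
}
#endif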

/*
 * rmap interfaces called when adding or removing pte of page
 */
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
		bool compound);
void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
		struct vm_area_struct *, bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
		bool compound);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

static inline void __page_dup_rmap(struct page *page, bool compound)
{
	if (compound) {
		struct folio *folio = (struct folio *)page;

		VM_BUG_ON_PAGE(compound && !PageHead(page), page);
		atomic_inc(&folio->_entire_mapcount);
	} else {
		atomic_inc(&page->_mapcount);
	}
}

static inline void page_dup_file_rmap(struct page *page, bool compound)
{
	__page_dup_rmap(page, compound);
}

/**
 * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
 *			    anonymous page
 * @page: the page to duplicate the mapping for
 * @compound: the page is mapped as compound or as a small page
 * @vma: the source vma
 *
 * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the page has to be mapped R/O into
 * the parent and the child. It must *not* get mapped writable after this call.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(!PageAnon(page), page);

	/*
	 * No need to check+clear for already shared pages, including KSM
	 * pages.
	 */
	if (!PageAnonExclusive(page))
		goto dup;

	/*
	 * If this page may have been pinned by the parent process,
	 * don't allow duplicating the mapping; instead require that, e.g.,
	 * the page be copied immediately for the child so that we'll always
	 * guarantee the pinned page won't be randomly replaced in the
	 * future on write faults.
	 */
	if (likely(!is_device_private_page(page)) &&
	    unlikely(page_needs_cow_for_dma(vma, page)))
		return -EBUSY;

	ClearPageAnonExclusive(page);
	/*
	 * It's okay to share the anon page between both processes, mapping
	 * the page R/O into both processes.
	 */
dup:
	__page_dup_rmap(page, compound);
	return 0;
}
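
/*
 * Illustrative sketch (not part of the upstream header): during fork(), the
 * PTE copy path tries to duplicate the anon mapping and falls back to giving
 * the child its own copy when that fails (page possibly pinned). The copy
 * helper named below is hypothetical; the -EBUSY fallback is the point.
 */
#if 0	/* example only, not compiled */
static int example_copy_anon_pte(struct page *page, bool compound,
				 struct vm_area_struct *src_vma)
{
	if (unlikely(page_try_dup_anon_rmap(page, compound, src_vma))) {
		/* Page may be pinned: copy it for the child instead. */
		return example_copy_page_for_child(page, src_vma); /* hypothetical */
	}

	/* Shared now: both parent and child must map the page read-only. */
	return 0;
}
#endif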

/**
 * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
 *			      shared to prepare for KSM or temporary unmapping
 * @page: the exclusive anonymous page to try marking possibly shared
 *
 * The caller needs to hold the PT lock and has to have the page table entry
 * cleared/invalidated.
 *
 * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
 * to duplicate a mapping, but instead to prepare for KSM or temporarily
 * unmapping a page (swap, migration) via page_remove_rmap().
 *
 * Marking the page shared can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
 * otherwise.
 */
static inline int page_try_share_anon_rmap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);

	/* device private pages cannot get pinned via GUP. */
	if (unlikely(is_device_private_page(page))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(page_maybe_dma_pinned(page)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}
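
/*
 * Illustrative sketch (not part of the upstream header): a swap/migration
 * unmap path clears the PTE first, then tries to mark the page possibly
 * shared, and restores the PTE if the page turned out to be pinned. The PTE
 * helpers used below stand in for the real page-table plumbing and are only
 * meant to show the (A1)..(A4) ordering described above.
 */
#if 0	/* example only, not compiled */
static bool example_prepare_swap_entry(struct page *page, pte_t *ptep,
				       struct vm_area_struct *vma,
				       unsigned long addr)
{
	pte_t pteval = ptep_get_and_clear(vma->vm_mm, addr, ptep);	/* (A1) */

	if (page_try_share_anon_rmap(page)) {				/* (A2)+(A3) */
		set_pte_at(vma->vm_mm, addr, ptep, pteval);		/* (A4): back off */
		return false;
	}

	/* ... install the swap/migration entry and page_remove_rmap() ... */
	return true;
}
#endif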

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, struct page **pages,
				void *arg);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = page_to_pfn(_page),				\
		.nr_pages = compound_nr(_page),				\
		.pgoff = page_to_pgoff(_page),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* A HugeTLB pte is set to the page table entry without pte_map(), so there is nothing to pte_unmap(). */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
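
/*
 * Illustrative sketch (not part of the upstream header): the usual pattern is
 * to declare the walk state with DEFINE_FOLIO_VMA_WALK() and loop until
 * page_vma_mapped_walk() returns false. Within the loop, either pvmw.pte or
 * pvmw.pmd points to a mapping of the folio in this VMA, with pvmw.ptl held.
 */
#if 0	/* example only, not compiled */
static bool example_folio_mapped_in_vma(struct folio *folio,
					struct vm_area_struct *vma,
					unsigned long address)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);

	while (page_vma_mapped_walk(&pvmw)) {
		/* Found a PTE (or PMD) mapping; stop and release pte/ptl. */
		page_vma_mapped_walk_done(&pvmw);
		return true;
	}
	return false;
}
#endif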

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
					  struct rmap_walk_control *rwc);
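
/*
 * Illustrative sketch (not part of the upstream header): callers fill in a
 * struct rmap_walk_control and hand it to rmap_walk(), which invokes
 * rmap_one() for every VMA that currently maps the folio. The callback and
 * the counting argument below are made up for the example.
 */
#if 0	/* example only, not compiled */
static bool example_count_one(struct folio *folio, struct vm_area_struct *vma,
			      unsigned long addr, void *arg)
{
	unsigned long *mappings = arg;

	(*mappings)++;
	return true;	/* keep walking the remaining VMAs */
}

static unsigned long example_count_mappings(struct folio *folio)
{
	unsigned long mappings = 0;
	struct rmap_walk_control rwc = {
		.arg = &mappings,
		.rmap_one = example_count_one,
	};

	rmap_walk(folio, &rwc);	/* folio is assumed to be locked */
	return mappings;
}
#endif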

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
	return folio_mkclean(page_folio(page));
}
#endif	/* _LINUX_RMAP_H */