/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma or its page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for cleaning up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making the decision about reusing an
	 * anon_vma instead of forking a new one. See the comments in
	 * anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};
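
/*
 * Illustrative sketch (not part of the API): each vma keeps its
 * anon_vma_chains on vma->anon_vma_chain via the "same_vma" member, so a
 * walk over all anon_vmas attached to a vma typically looks like:
 *
 *	struct anon_vma_chain *avc;
 *
 *	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 *		... use avc->anon_vma ...
 *
 * The reverse direction (all vmas attached to an anon_vma) goes through the
 * interval tree rooted at anon_vma->rb_root instead.
 */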

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}
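
/*
 * Illustrative sketch: a caller that cannot rely on the vma staying around,
 * e.g. an rmap walker, pins the anon_vma with a refcount and drops it when
 * done (folio_get_anon_vma() is declared further down in this header):
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		... walk under anon_vma_lock_read()/anon_vma_unlock_read() ...
 *		put_anon_vma(anon_vma);
 *	}
 */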

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}
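
/*
 * Illustrative sketch: all of the helpers above lock the root of the
 * anon_vma tree, so readers and writers on any anon_vma in the same tree
 * serialize against each other. A typical read-side walk looks like:
 *
 *	anon_vma_lock_read(anon_vma);
 *	... walk the interval tree at anon_vma->rb_root ...
 *	anon_vma_unlock_read(anon_vma);
 *
 * while code that links or unlinks anon_vma_chains takes the write side.
 */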


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}
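
/*
 * Illustrative sketch (hypothetical fault-path fragment): a fault handler
 * makes sure the vma has an anon_vma before installing a new anonymous page;
 * the failure case is typically treated as out-of-memory:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	... allocate the page and map it ...
 */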

static inline void anon_vma_merge(struct vm_area_struct *vma,
				  struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: if the page is a subpage of a compound page, it is
 * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * The compound page is not mapped via PTEs, but instead via a single PMD and
 * should be accounted accordingly.
 */
#define RMAP_COMPOUND		((__force rmap_t)BIT(1))

/*
 * rmap interfaces called when adding or removing the pte of a page
 */
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
		bool compound);
void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
		struct vm_area_struct *, bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
		bool compound);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
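
/*
 * Illustrative sketch (hypothetical fragments, not lifted from mm/): a fault
 * installing a freshly allocated anonymous folio and a zap path tearing the
 * mapping down would pair these interfaces roughly as:
 *
 *	// fault path, PTE not yet present
 *	folio_add_new_anon_rmap(folio, vma, addr);
 *	set_pte_at(mm, addr, pte, mk_pte(&folio->page, vma->vm_page_prot));
 *
 *	// zap path, after clearing the PTE
 *	page_remove_rmap(page, vma, false);
 *
 * RMAP_EXCLUSIVE is passed to page_add_anon_rmap() when the caller knows the
 * page is mapped exclusively by this process, e.g. when faulting back in a
 * swapped-out page that turned out not to be shared.
 */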

static inline void __page_dup_rmap(struct page *page, bool compound)
{
	if (compound) {
		struct folio *folio = (struct folio *)page;

		VM_BUG_ON_PAGE(compound && !PageHead(page), page);
		atomic_inc(&folio->_entire_mapcount);
	} else {
		atomic_inc(&page->_mapcount);
	}
}

static inline void page_dup_file_rmap(struct page *page, bool compound)
{
	__page_dup_rmap(page, compound);
}

/**
 * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
 *			    anonymous page
 * @page: the page to duplicate the mapping for
 * @compound: the page is mapped as compound or as a small page
 * @vma: the source vma
 *
 * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail.
 *
 * If duplicating the mapping succeeds, the page has to be mapped R/O into
 * the parent and the child. It must *not* get mapped writable after this call.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(!PageAnon(page), page);

	/*
	 * No need to check+clear for already shared pages, including KSM
	 * pages.
	 */
	if (!PageAnonExclusive(page))
		goto dup;

	/*
	 * If this page may have been pinned by the parent process,
	 * don't allow the mapping to be duplicated; instead require the
	 * caller to e.g. copy the page immediately for the child so that
	 * we'll always guarantee the pinned page won't be randomly replaced
	 * in the future on write faults.
	 */
	if (likely(!is_device_private_page(page)) &&
	    unlikely(page_needs_cow_for_dma(vma, page)))
		return -EBUSY;

	ClearPageAnonExclusive(page);
	/*
	 * It's okay to share the anon page between both processes, mapping
	 * the page R/O into both processes.
	 */
dup:
	__page_dup_rmap(page, compound);
	return 0;
}
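
/*
 * Illustrative sketch (hypothetical fork-path fragment): when copying a
 * present PTE to the child, the copy code tries to share the page and falls
 * back to copying it if that fails because the page may be pinned:
 *
 *	if (page_try_dup_anon_rmap(page, false, src_vma)) {
 *		// page may be pinned: allocate a fresh copy for the child
 *		return copy_page_for_child(...);	// hypothetical helper
 *	}
 *	// shared: both parent and child must map the page read-only
 *	pte = pte_wrprotect(pte);
 */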

/**
 * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
 *			      shared to prepare for KSM or temporary unmapping
 * @page: the exclusive anonymous page to try marking possibly shared
 *
 * The caller needs to hold the PT lock and has to have the page table entry
 * cleared/invalidated.
 *
 * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
 * to duplicate a mapping, but instead to prepare for KSM or temporarily
 * unmapping a page (swap, migration) via page_remove_rmap().
 *
 * Marking the page shared can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail.
 *
 * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
 * otherwise.
 */
static inline int page_try_share_anon_rmap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);

	/* device private pages cannot get pinned via GUP. */
	if (unlikely(is_device_private_page(page))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, that
	 * the page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(page_maybe_dma_pinned(page)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}
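
/*
 * Illustrative sketch (hypothetical unmap fragment): a caller preparing to
 * swap out an exclusive anon page follows the (A1)-(A4) steps documented
 * above:
 *
 *	pteval = ptep_clear_flush(vma, address, pvmw.pte);	// (A1)
 *	if (page_try_share_anon_rmap(page)) {			// (A2)/(A3)
 *		// page may be pinned: restore the PTE and give up	(A4)
 *		set_pte_at(mm, address, pvmw.pte, pteval);
 *		return false;
 *	}
 *	... install the swap entry and call page_remove_rmap() ...
 */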

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, struct page **pages,
				void *arg);
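
/*
 * Illustrative sketch: reclaim typically combines ttu_flags, e.g. batching
 * TLB flushes while unmapping a list of folios:
 *
 *	try_to_unmap(folio, TTU_BATCH_FLUSH);
 *	...
 *	try_to_unmap_flush();	// declared elsewhere; does the deferred flush
 *
 * while migration uses try_to_migrate(), optionally with TTU_SPLIT_HUGE_PMD
 * when a PMD-mapped THP must be split first.
 */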

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = page_to_pfn(_page),				\
		.nr_pages = compound_nr(_page),				\
		.pgoff = page_to_pgoff(_page),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/*
	 * A HugeTLB pte points at the page table entry directly and was not
	 * mapped via pte_offset_map(), so it must not be pte_unmap()ed.
	 */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
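
/*
 * Illustrative sketch: the canonical walk maps/locks one PTE (or PMD) at a
 * time; page_vma_mapped_walk_done() is only needed when bailing out early,
 * since a walk that runs to completion drops its own locks:
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		if (pvmw.pte)
 *			... examine *pvmw.pte under pvmw.ptl ...
 *		else
 *			... PMD-mapped THP: examine *pvmw.pmd ...
 *		if (need_to_stop_early) {	// hypothetical condition
 *			page_vma_mapped_walk_done(&pvmw);
 *			break;
 *		}
 *	}
 */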

/*
 * Used by swapoff to help locate where a page is expected in a vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: To control the rmap traversal for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where the page is mapped
 * done: for checking the traversal termination condition
 * anon_lock: for taking the anon_vma lock in an optimized way rather than
 *	      the default
 * invalid_vma: for skipping uninteresting vmas
 *
 * A usage sketch follows the walker declarations below.
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
					  struct rmap_walk_control *rwc);
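
/*
 * Illustrative sketch (hypothetical callback and state names): a walker
 * fills in a control structure and hands it to rmap_walk(); the traversal
 * stops early once rmap_one() returns false:
 *
 *	static bool my_rmap_one(struct folio *folio, struct vm_area_struct *vma,
 *				unsigned long addr, void *arg)
 *	{
 *		struct my_state *state = arg;	// hypothetical per-walk state
 *		... inspect or modify the mapping at addr ...
 *		return true;			// keep walking
 *	}
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = my_rmap_one,
 *		.arg = &state,
 *	};
 *	rmap_walk(folio, &rwc);
 *
 * rmap_walk_locked() is used instead when the caller already holds the
 * relevant rmap lock, e.g. together with TTU_RMAP_LOCKED.
 */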

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
	return folio_mkclean(page_folio(page));
}
#endif	/* _LINUX_RMAP_H */