linux/mm/filemap.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */
1da177e4SLinus Torvalds
/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
b95f1b31SPaul Gortmaker#include <linux/export.h>
1da177e4SLinus Torvalds#include <linux/compiler.h>
f9fe48beSRoss Zwisler#include <linux/dax.h>
1da177e4SLinus Torvalds#include <linux/fs.h>
3f07c014SIngo Molnar#include <linux/sched/signal.h>
c22ce143SHiro Yoshioka#include <linux/uaccess.h>
c59ede7bSRandy.Dunlap#include <linux/capability.h>
1da177e4SLinus Torvalds#include <linux/kernel_stat.h>
5a0e3ad6STejun Heo#include <linux/gfp.h>
1da177e4SLinus Torvalds#include <linux/mm.h>
1da177e4SLinus Torvalds#include <linux/swap.h>
ffa65753SAlistair Popple#include <linux/swapops.h>
cf264e13SNhat Pham#include <linux/syscalls.h>
1da177e4SLinus Torvalds#include <linux/mman.h>
1da177e4SLinus Torvalds#include <linux/pagemap.h>
1da177e4SLinus Torvalds#include <linux/file.h>
1da177e4SLinus Torvalds#include <linux/uio.h>
cfcbfb13SJosef Bacik#include <linux/error-injection.h>
1da177e4SLinus Torvalds#include <linux/hash.h>
1da177e4SLinus Torvalds#include <linux/writeback.h>
53253383SLinus Torvalds#include <linux/backing-dev.h>
1da177e4SLinus Torvalds#include <linux/pagevec.h>
1da177e4SLinus Torvalds#include <linux/security.h>
44110fe3SPaul Jackson#include <linux/cpuset.h>
00501b53SJohannes Weiner#include <linux/hugetlb.h>
8a9f3ccdSBalbir Singh#include <linux/memcontrol.h>
c7df8ad2SMel Gorman#include <linux/shmem_fs.h>
f1820361SKirill A. Shutemov#include <linux/rmap.h>
b1d29ba8SJohannes Weiner#include <linux/delayacct.h>
eb414681SJohannes Weiner#include <linux/psi.h>
d0e6a582SBen Dooks#include <linux/ramfs.h>
b9306a79SYang Shi#include <linux/page_idle.h>
ffa65753SAlistair Popple#include <linux/migrate.h>
07073eb0SDavid Howells#include <linux/pipe_fs_i.h>
07073eb0SDavid Howells#include <linux/splice.h>
f9ce0be7SKirill A. Shutemov#include <asm/pgalloc.h>
de591a82SWill Deacon#include <asm/tlbflush.h>
0f8053a5SNick Piggin#include "internal.h"
0f8053a5SNick Piggin
fe0bfaafSRobert Jarzmik#define CREATE_TRACE_POINTS
fe0bfaafSRobert Jarzmik#include <trace/events/filemap.h>
fe0bfaafSRobert Jarzmik
1da177e4SLinus Torvalds/*
1da177e4SLinus Torvalds * FIXME: remove all knowledge of the buffer layer from the core VM
1da177e4SLinus Torvalds */
148f948bSJan Kara#include <linux/buffer_head.h> /* for try_to_free_buffers */
1da177e4SLinus Torvalds
1da177e4SLinus Torvalds#include <asm/mman.h>
1da177e4SLinus Torvalds
cf264e13SNhat Pham#include "swap.h"
cf264e13SNhat Pham
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
1da177e4SLinus Torvalds
/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->block_dirty_folio)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock	(page_remove_rmap->folio_memcg_lock)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->block_dirty_folio)
 */
1da177e4SLinus Torvalds
5c024e6aSMatthew Wilcoxstatic void page_cache_delete(struct address_space *mapping,
a548b615SMatthew Wilcox (Oracle)				   struct folio *folio, void *shadow)
91b0abe3SJohannes Weiner{
a548b615SMatthew Wilcox (Oracle)	XA_STATE(xas, &mapping->i_pages, folio->index);
a548b615SMatthew Wilcox (Oracle)	long nr = 1;
c70b647dSKirill A. Shutemov
5c024e6aSMatthew Wilcox	mapping_set_update(&xas, mapping);
5c024e6aSMatthew Wilcox
5c024e6aSMatthew Wilcox	/* hugetlb pages are represented by a single entry in the xarray */
a548b615SMatthew Wilcox (Oracle)	if (!folio_test_hugetlb(folio)) {
a548b615SMatthew Wilcox (Oracle)		xas_set_order(&xas, folio->index, folio_order(folio));
a548b615SMatthew Wilcox (Oracle)		nr = folio_nr_pages(folio);
5c024e6aSMatthew Wilcox	}
91b0abe3SJohannes Weiner
a548b615SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
449dd698SJohannes Weiner
5c024e6aSMatthew Wilcox	xas_store(&xas, shadow);
5c024e6aSMatthew Wilcox	xas_init_marks(&xas);
d3798ae8SJohannes Weiner
a548b615SMatthew Wilcox (Oracle)	folio->mapping = NULL;
2300638bSJan Kara	/* Leave page->index set: truncation lookup relies upon it */
d3798ae8SJohannes Weiner	mapping->nrpages -= nr;
91b0abe3SJohannes Weiner}
91b0abe3SJohannes Weiner
621db488SMatthew Wilcox (Oracle)static void filemap_unaccount_folio(struct address_space *mapping,
621db488SMatthew Wilcox (Oracle)		struct folio *folio)
1da177e4SLinus Torvalds{
621db488SMatthew Wilcox (Oracle)	long nr;
1da177e4SLinus Torvalds
621db488SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
621db488SMatthew Wilcox (Oracle)	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
06b241f3SHugh Dickins		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
621db488SMatthew Wilcox (Oracle)			 current->comm, folio_pfn(folio));
621db488SMatthew Wilcox (Oracle)		dump_page(&folio->page, "still mapped when deleted");
06b241f3SHugh Dickins		dump_stack();
06b241f3SHugh Dickins		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
06b241f3SHugh Dickins
85207ad8SHugh Dickins		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
85207ad8SHugh Dickins			int mapcount = page_mapcount(&folio->page);
85207ad8SHugh Dickins
85207ad8SHugh Dickins			if (folio_ref_count(folio) >= mapcount + 2) {
06b241f3SHugh Dickins				/*
06b241f3SHugh Dickins				 * All vmas have already been torn down, so it's
85207ad8SHugh Dickins				 * a good bet that actually the page is unmapped
85207ad8SHugh Dickins				 * and we'd rather not leak it: if we're wrong,
85207ad8SHugh Dickins				 * another bad page check should catch it later.
06b241f3SHugh Dickins				 */
621db488SMatthew Wilcox (Oracle)				page_mapcount_reset(&folio->page);
621db488SMatthew Wilcox (Oracle)				folio_ref_sub(folio, mapcount);
06b241f3SHugh Dickins			}
06b241f3SHugh Dickins		}
85207ad8SHugh Dickins	}
06b241f3SHugh Dickins
621db488SMatthew Wilcox (Oracle)	/* hugetlb folios do not participate in page cache accounting. */
621db488SMatthew Wilcox (Oracle)	if (folio_test_hugetlb(folio))
5ecc4d85SJan Kara		return;
5ecc4d85SJan Kara
621db488SMatthew Wilcox (Oracle)	nr = folio_nr_pages(folio);
5ecc4d85SJan Kara
621db488SMatthew Wilcox (Oracle)	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
621db488SMatthew Wilcox (Oracle)	if (folio_test_swapbacked(folio)) {
621db488SMatthew Wilcox (Oracle)		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
621db488SMatthew Wilcox (Oracle)		if (folio_test_pmd_mappable(folio))
621db488SMatthew Wilcox (Oracle)			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
621db488SMatthew Wilcox (Oracle)	} else if (folio_test_pmd_mappable(folio)) {
621db488SMatthew Wilcox (Oracle)		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
09d91cdaSSong Liu		filemap_nr_thps_dec(mapping);
800d8c63SKirill A. Shutemov	}
3a692790SLinus Torvalds
3a692790SLinus Torvalds	/*
621db488SMatthew Wilcox (Oracle)	 * At this point folio must be either written or cleaned by
621db488SMatthew Wilcox (Oracle)	 * truncate.  Dirty folio here signals a bug and loss of
566d3362SHugh Dickins	 * unwritten data - on ordinary filesystems.
3a692790SLinus Torvalds	 *
566d3362SHugh Dickins	 * But it's harmless on in-memory filesystems like tmpfs; and can
566d3362SHugh Dickins	 * occur when a driver which did get_user_pages() sets page dirty
566d3362SHugh Dickins	 * before putting it, while the inode is being finally evicted.
566d3362SHugh Dickins	 *
566d3362SHugh Dickins	 * Below fixes dirty accounting after removing the folio entirely
621db488SMatthew Wilcox (Oracle)	 * but leaves the dirty flag set: it has no effect for truncated
621db488SMatthew Wilcox (Oracle)	 * folio and anyway will be cleared before returning folio to
76253fbcSJan Kara	 * buddy allocator.
3a692790SLinus Torvalds	 */
566d3362SHugh Dickins	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
566d3362SHugh Dickins			 mapping_can_writeback(mapping)))
566d3362SHugh Dickins		folio_account_cleaned(folio, inode_to_wb(mapping->host));
76253fbcSJan Kara}
5ecc4d85SJan Kara
5ecc4d85SJan Kara/*
5ecc4d85SJan Kara * Delete a page from the page cache and free it. Caller has to make
5ecc4d85SJan Kara * sure the page is locked and that nobody else uses it - or that usage
b93b0163SMatthew Wilcox * is safe.  The caller must hold the i_pages lock.
5ecc4d85SJan Kara */
452e9e69SMatthew Wilcox (Oracle)void __filemap_remove_folio(struct folio *folio, void *shadow)
5ecc4d85SJan Kara{
452e9e69SMatthew Wilcox (Oracle)	struct address_space *mapping = folio->mapping;
5ecc4d85SJan Kara
a0580c6fSMatthew Wilcox (Oracle)	trace_mm_filemap_delete_from_page_cache(folio);
621db488SMatthew Wilcox (Oracle)	filemap_unaccount_folio(mapping, folio);
a548b615SMatthew Wilcox (Oracle)	page_cache_delete(mapping, folio, shadow);
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
78f42660SMatthew Wilcox (Oracle)void filemap_free_folio(struct address_space *mapping, struct folio *folio)
59c66c5fSJan Kara{
d2329aa0SMatthew Wilcox (Oracle)	void (*free_folio)(struct folio *);
3abb28e2SMatthew Wilcox (Oracle)	int refs = 1;
59c66c5fSJan Kara
d2329aa0SMatthew Wilcox (Oracle)	free_folio = mapping->a_ops->free_folio;
d2329aa0SMatthew Wilcox (Oracle)	if (free_folio)
d2329aa0SMatthew Wilcox (Oracle)		free_folio(folio);
59c66c5fSJan Kara
3abb28e2SMatthew Wilcox (Oracle)	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
3abb28e2SMatthew Wilcox (Oracle)		refs = folio_nr_pages(folio);
3abb28e2SMatthew Wilcox (Oracle)	folio_put_refs(folio, refs);
59c66c5fSJan Kara}
59c66c5fSJan Kara
702cfbf9SMinchan Kim/**
452e9e69SMatthew Wilcox (Oracle) * filemap_remove_folio - Remove folio from page cache.
452e9e69SMatthew Wilcox (Oracle) * @folio: The folio.
702cfbf9SMinchan Kim *
452e9e69SMatthew Wilcox (Oracle) * This must be called only on folios that are locked and have been
452e9e69SMatthew Wilcox (Oracle) * verified to be in the page cache.  It will never put the folio into
452e9e69SMatthew Wilcox (Oracle) * the free list because the caller has a reference on the page.
702cfbf9SMinchan Kim */
452e9e69SMatthew Wilcox (Oracle)void filemap_remove_folio(struct folio *folio)
1da177e4SLinus Torvalds{
452e9e69SMatthew Wilcox (Oracle)	struct address_space *mapping = folio->mapping;
1da177e4SLinus Torvalds
452e9e69SMatthew Wilcox (Oracle)	BUG_ON(!folio_test_locked(folio));
51b8c1feSJohannes Weiner	spin_lock(&mapping->host->i_lock);
30472509SJohannes Weiner	xa_lock_irq(&mapping->i_pages);
452e9e69SMatthew Wilcox (Oracle)	__filemap_remove_folio(folio, NULL);
30472509SJohannes Weiner	xa_unlock_irq(&mapping->i_pages);
51b8c1feSJohannes Weiner	if (mapping_shrinkable(mapping))
51b8c1feSJohannes Weiner		inode_add_lru(mapping->host);
51b8c1feSJohannes Weiner	spin_unlock(&mapping->host->i_lock);
6072d13cSLinus Torvalds
452e9e69SMatthew Wilcox (Oracle)	filemap_free_folio(mapping, folio);
83929372SKirill A. Shutemov}
97cecb5aSMinchan Kim
aa65c29cSJan Kara/*
51dcbdacSMatthew Wilcox (Oracle) * page_cache_delete_batch - delete several folios from page cache
51dcbdacSMatthew Wilcox (Oracle) * @mapping: the mapping to which folios belong
51dcbdacSMatthew Wilcox (Oracle) * @fbatch: batch of folios to delete
aa65c29cSJan Kara *
51dcbdacSMatthew Wilcox (Oracle) * The function walks over mapping->i_pages and removes folios passed in
51dcbdacSMatthew Wilcox (Oracle) * @fbatch from the mapping. The function expects @fbatch to be sorted
51dcbdacSMatthew Wilcox (Oracle) * by page index and is optimised for it to be dense.
51dcbdacSMatthew Wilcox (Oracle) * It tolerates holes in @fbatch (mapping entries at those indices are not
51dcbdacSMatthew Wilcox (Oracle) * modified).
aa65c29cSJan Kara *
b93b0163SMatthew Wilcox * The function expects the i_pages lock to be held.
aa65c29cSJan Kara */
ef8e5717SMatthew Wilcoxstatic void page_cache_delete_batch(struct address_space *mapping,
51dcbdacSMatthew Wilcox (Oracle)			     struct folio_batch *fbatch)
aa65c29cSJan Kara{
51dcbdacSMatthew Wilcox (Oracle)	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
6b24ca4aSMatthew Wilcox (Oracle)	long total_pages = 0;
4101196bSMatthew Wilcox (Oracle)	int i = 0;
1afd7ae5SMatthew Wilcox (Oracle)	struct folio *folio;
aa65c29cSJan Kara
ef8e5717SMatthew Wilcox	mapping_set_update(&xas, mapping);
1afd7ae5SMatthew Wilcox (Oracle)	xas_for_each(&xas, folio, ULONG_MAX) {
51dcbdacSMatthew Wilcox (Oracle)		if (i >= folio_batch_count(fbatch))
aa65c29cSJan Kara			break;
4101196bSMatthew Wilcox (Oracle)
4101196bSMatthew Wilcox (Oracle)		/* A swap/dax/shadow entry got inserted? Skip it. */
1afd7ae5SMatthew Wilcox (Oracle)		if (xa_is_value(folio))
aa65c29cSJan Kara			continue;
aa65c29cSJan Kara		/*
4101196bSMatthew Wilcox (Oracle)		 * A page got inserted in our range? Skip it. We have our
4101196bSMatthew Wilcox (Oracle)		 * pages locked so they are protected from being removed.
4101196bSMatthew Wilcox (Oracle)		 * If we see a page whose index is higher than ours, it
4101196bSMatthew Wilcox (Oracle)		 * means our page has been removed, which shouldn't be
4101196bSMatthew Wilcox (Oracle)		 * possible because we're holding the PageLock.
aa65c29cSJan Kara		 */
51dcbdacSMatthew Wilcox (Oracle)		if (folio != fbatch->folios[i]) {
1afd7ae5SMatthew Wilcox (Oracle)			VM_BUG_ON_FOLIO(folio->index >
51dcbdacSMatthew Wilcox (Oracle)					fbatch->folios[i]->index, folio);
aa65c29cSJan Kara			continue;
ef8e5717SMatthew Wilcox		}
4101196bSMatthew Wilcox (Oracle)
1afd7ae5SMatthew Wilcox (Oracle)		WARN_ON_ONCE(!folio_test_locked(folio));
4101196bSMatthew Wilcox (Oracle)
1afd7ae5SMatthew Wilcox (Oracle)		folio->mapping = NULL;
51dcbdacSMatthew Wilcox (Oracle)		/* Leave folio->index set: truncation lookup relies on it */
4101196bSMatthew Wilcox (Oracle)
aa65c29cSJan Kara		i++;
ef8e5717SMatthew Wilcox		xas_store(&xas, NULL);
6b24ca4aSMatthew Wilcox (Oracle)		total_pages += folio_nr_pages(folio);
aa65c29cSJan Kara	}
aa65c29cSJan Kara	mapping->nrpages -= total_pages;
aa65c29cSJan Kara}
aa65c29cSJan Kara
aa65c29cSJan Karavoid delete_from_page_cache_batch(struct address_space *mapping,
51dcbdacSMatthew Wilcox (Oracle)				  struct folio_batch *fbatch)
aa65c29cSJan Kara{
aa65c29cSJan Kara	int i;
aa65c29cSJan Kara
51dcbdacSMatthew Wilcox (Oracle)	if (!folio_batch_count(fbatch))
aa65c29cSJan Kara		return;
aa65c29cSJan Kara
51b8c1feSJohannes Weiner	spin_lock(&mapping->host->i_lock);
30472509SJohannes Weiner	xa_lock_irq(&mapping->i_pages);
51dcbdacSMatthew Wilcox (Oracle)	for (i = 0; i < folio_batch_count(fbatch); i++) {
51dcbdacSMatthew Wilcox (Oracle)		struct folio *folio = fbatch->folios[i];
aa65c29cSJan Kara
a0580c6fSMatthew Wilcox (Oracle)		trace_mm_filemap_delete_from_page_cache(folio);
a0580c6fSMatthew Wilcox (Oracle)		filemap_unaccount_folio(mapping, folio);
aa65c29cSJan Kara	}
51dcbdacSMatthew Wilcox (Oracle)	page_cache_delete_batch(mapping, fbatch);
30472509SJohannes Weiner	xa_unlock_irq(&mapping->i_pages);
51b8c1feSJohannes Weiner	if (mapping_shrinkable(mapping))
51b8c1feSJohannes Weiner		inode_add_lru(mapping->host);
51b8c1feSJohannes Weiner	spin_unlock(&mapping->host->i_lock);
aa65c29cSJan Kara
51dcbdacSMatthew Wilcox (Oracle)	for (i = 0; i < folio_batch_count(fbatch); i++)
51dcbdacSMatthew Wilcox (Oracle)		filemap_free_folio(mapping, fbatch->folios[i]);
aa65c29cSJan Kara}
aa65c29cSJan Kara
d72d9e2aSMiklos Szerediint filemap_check_errors(struct address_space *mapping)
865ffef3SDmitry Monakhov{
865ffef3SDmitry Monakhov	int ret = 0;
865ffef3SDmitry Monakhov	/* Check for outstanding write errors */
7fcbbaf1SJens Axboe	if (test_bit(AS_ENOSPC, &mapping->flags) &&
7fcbbaf1SJens Axboe	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
865ffef3SDmitry Monakhov		ret = -ENOSPC;
7fcbbaf1SJens Axboe	if (test_bit(AS_EIO, &mapping->flags) &&
7fcbbaf1SJens Axboe	    test_and_clear_bit(AS_EIO, &mapping->flags))
865ffef3SDmitry Monakhov		ret = -EIO;
865ffef3SDmitry Monakhov	return ret;
865ffef3SDmitry Monakhov}
d72d9e2aSMiklos SzerediEXPORT_SYMBOL(filemap_check_errors);
865ffef3SDmitry Monakhov
76341cabSJeff Laytonstatic int filemap_check_and_keep_errors(struct address_space *mapping)
76341cabSJeff Layton{
76341cabSJeff Layton	/* Check for outstanding write errors */
76341cabSJeff Layton	if (test_bit(AS_EIO, &mapping->flags))
76341cabSJeff Layton		return -EIO;
76341cabSJeff Layton	if (test_bit(AS_ENOSPC, &mapping->flags))
76341cabSJeff Layton		return -ENOSPC;
76341cabSJeff Layton	return 0;
76341cabSJeff Layton}
76341cabSJeff Layton
1da177e4SLinus Torvalds/**
5a798493SJosef Bacik * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
5a798493SJosef Bacik * @mapping:	address space structure to write
5a798493SJosef Bacik * @wbc:	the writeback_control controlling the writeout
5a798493SJosef Bacik *
5a798493SJosef Bacik * Call writepages on the mapping using the provided wbc to control the
5a798493SJosef Bacik * writeout.
5a798493SJosef Bacik *
5a798493SJosef Bacik * Return: %0 on success, negative error code otherwise.
5a798493SJosef Bacik */
5a798493SJosef Bacikint filemap_fdatawrite_wbc(struct address_space *mapping,
5a798493SJosef Bacik			   struct writeback_control *wbc)
5a798493SJosef Bacik{
5a798493SJosef Bacik	int ret;
5a798493SJosef Bacik
5a798493SJosef Bacik	if (!mapping_can_writeback(mapping) ||
5a798493SJosef Bacik	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
5a798493SJosef Bacik		return 0;
5a798493SJosef Bacik
5a798493SJosef Bacik	wbc_attach_fdatawrite_inode(wbc, mapping->host);
5a798493SJosef Bacik	ret = do_writepages(mapping, wbc);
5a798493SJosef Bacik	wbc_detach_inode(wbc);
5a798493SJosef Bacik	return ret;
5a798493SJosef Bacik}
5a798493SJosef BacikEXPORT_SYMBOL(filemap_fdatawrite_wbc);
5a798493SJosef Bacik
5a798493SJosef Bacik/**
485bb99bSRandy Dunlap * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
1da177e4SLinus Torvalds * @mapping:	address space structure to write
1da177e4SLinus Torvalds * @start:	offset in bytes where the range starts
469eb4d0SAndrew Morton * @end:	offset in bytes where the range ends (inclusive)
67be2dd1SMartin Waitz * @sync_mode:	enable synchronous operation
1da177e4SLinus Torvalds *
485bb99bSRandy Dunlap * Start writeback against all of a mapping's dirty pages that lie
485bb99bSRandy Dunlap * within the byte offsets <start, end> inclusive.
485bb99bSRandy Dunlap *
1da177e4SLinus Torvalds * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
485bb99bSRandy Dunlap * opposed to a regular memory cleansing writeback.  The difference between
1da177e4SLinus Torvalds * these two operations is that if a dirty page/buffer is encountered, it must
1da177e4SLinus Torvalds * be waited upon, and not just skipped over.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: %0 on success, negative error code otherwise.
1da177e4SLinus Torvalds */
ebcf28e1SAndrew Mortonint __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
ebcf28e1SAndrew Morton				loff_t end, int sync_mode)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	struct writeback_control wbc = {
1da177e4SLinus Torvalds		.sync_mode = sync_mode,
05fe478dSNick Piggin		.nr_to_write = LONG_MAX,
111ebb6eSOGAWA Hirofumi		.range_start = start,
111ebb6eSOGAWA Hirofumi		.range_end = end,
1da177e4SLinus Torvalds	};
1da177e4SLinus Torvalds
5a798493SJosef Bacik	return filemap_fdatawrite_wbc(mapping, &wbc);
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
1da177e4SLinus Torvaldsstatic inline int __filemap_fdatawrite(struct address_space *mapping,
1da177e4SLinus Torvalds	int sync_mode)
1da177e4SLinus Torvalds{
111ebb6eSOGAWA Hirofumi	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
1da177e4SLinus Torvaldsint filemap_fdatawrite(struct address_space *mapping)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
1da177e4SLinus Torvalds}
1da177e4SLinus TorvaldsEXPORT_SYMBOL(filemap_fdatawrite);
1da177e4SLinus Torvalds
f4c0a0fdSJan Karaint filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
ebcf28e1SAndrew Morton				loff_t end)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
1da177e4SLinus Torvalds}
f4c0a0fdSJan KaraEXPORT_SYMBOL(filemap_fdatawrite_range);
1da177e4SLinus Torvalds
485bb99bSRandy Dunlap/**
485bb99bSRandy Dunlap * filemap_flush - mostly a non-blocking flush
485bb99bSRandy Dunlap * @mapping:	target address_space
485bb99bSRandy Dunlap *
1da177e4SLinus Torvalds * This is a mostly non-blocking flush.  Not suitable for data-integrity
1da177e4SLinus Torvalds * purposes - I/O may not be started against all dirty pages.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: %0 on success, negative error code otherwise.
1da177e4SLinus Torvalds */
1da177e4SLinus Torvaldsint filemap_flush(struct address_space *mapping)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
1da177e4SLinus Torvalds}
1da177e4SLinus TorvaldsEXPORT_SYMBOL(filemap_flush);
1da177e4SLinus Torvalds
7fc9e472SGoldwyn Rodrigues/**
7fc9e472SGoldwyn Rodrigues * filemap_range_has_page - check if a page exists in range.
7fc9e472SGoldwyn Rodrigues * @mapping:           address space within which to check
7fc9e472SGoldwyn Rodrigues * @start_byte:        offset in bytes where the range starts
7fc9e472SGoldwyn Rodrigues * @end_byte:          offset in bytes where the range ends (inclusive)
7fc9e472SGoldwyn Rodrigues *
7fc9e472SGoldwyn Rodrigues * Find at least one page in the range supplied, usually used to check if
7fc9e472SGoldwyn Rodrigues * direct writing in this range will trigger a writeback.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: %true if at least one page exists in the specified range,
a862f68aSMike Rapoport * %false otherwise.
7fc9e472SGoldwyn Rodrigues */
7fc9e472SGoldwyn Rodriguesbool filemap_range_has_page(struct address_space *mapping,
7fc9e472SGoldwyn Rodrigues			   loff_t start_byte, loff_t end_byte)
7fc9e472SGoldwyn Rodrigues{
eff3b364SMatthew Wilcox (Oracle)	struct folio *folio;
8fa8e538SMatthew Wilcox	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
8fa8e538SMatthew Wilcox	pgoff_t max = end_byte >> PAGE_SHIFT;
7fc9e472SGoldwyn Rodrigues
7fc9e472SGoldwyn Rodrigues	if (end_byte < start_byte)
7fc9e472SGoldwyn Rodrigues		return false;
7fc9e472SGoldwyn Rodrigues
8fa8e538SMatthew Wilcox	rcu_read_lock();
8fa8e538SMatthew Wilcox	for (;;) {
eff3b364SMatthew Wilcox (Oracle)		folio = xas_find(&xas, max);
eff3b364SMatthew Wilcox (Oracle)		if (xas_retry(&xas, folio))
8fa8e538SMatthew Wilcox			continue;
8fa8e538SMatthew Wilcox		/* Shadow entries don't count */
eff3b364SMatthew Wilcox (Oracle)		if (xa_is_value(folio))
8fa8e538SMatthew Wilcox			continue;
8fa8e538SMatthew Wilcox		/*
8fa8e538SMatthew Wilcox		 * We don't need to try to pin this page; we're about to
8fa8e538SMatthew Wilcox		 * release the RCU lock anyway.  It is enough to know that
8fa8e538SMatthew Wilcox		 * there was a page here recently.
8fa8e538SMatthew Wilcox		 */
8fa8e538SMatthew Wilcox		break;
8fa8e538SMatthew Wilcox	}
8fa8e538SMatthew Wilcox	rcu_read_unlock();
7fc9e472SGoldwyn Rodrigues
eff3b364SMatthew Wilcox (Oracle)	return folio != NULL;
7fc9e472SGoldwyn Rodrigues}
7fc9e472SGoldwyn RodriguesEXPORT_SYMBOL(filemap_range_has_page);
7fc9e472SGoldwyn Rodrigues
5e8fcc1aSJeff Laytonstatic void __filemap_fdatawait_range(struct address_space *mapping,
aa750fd7SJunichi Nomura				     loff_t start_byte, loff_t end_byte)
1da177e4SLinus Torvalds{
09cbfeafSKirill A. Shutemov	pgoff_t index = start_byte >> PAGE_SHIFT;
09cbfeafSKirill A. Shutemov	pgoff_t end = end_byte >> PAGE_SHIFT;
6817ef51SVishal Moola (Oracle)	struct folio_batch fbatch;
6817ef51SVishal Moola (Oracle)	unsigned nr_folios;
1da177e4SLinus Torvalds
6817ef51SVishal Moola (Oracle)	folio_batch_init(&fbatch);
6817ef51SVishal Moola (Oracle)
312e9d2fSJan Kara	while (index <= end) {
1da177e4SLinus Torvalds		unsigned i;
1da177e4SLinus Torvalds
6817ef51SVishal Moola (Oracle)		nr_folios = filemap_get_folios_tag(mapping, &index, end,
6817ef51SVishal Moola (Oracle)				PAGECACHE_TAG_WRITEBACK, &fbatch);
6817ef51SVishal Moola (Oracle)
6817ef51SVishal Moola (Oracle)		if (!nr_folios)
312e9d2fSJan Kara			break;
312e9d2fSJan Kara
6817ef51SVishal Moola (Oracle)		for (i = 0; i < nr_folios; i++) {
6817ef51SVishal Moola (Oracle)			struct folio *folio = fbatch.folios[i];
1da177e4SLinus Torvalds
6817ef51SVishal Moola (Oracle)			folio_wait_writeback(folio);
6817ef51SVishal Moola (Oracle)			folio_clear_error(folio);
1da177e4SLinus Torvalds		}
6817ef51SVishal Moola (Oracle)		folio_batch_release(&fbatch);
1da177e4SLinus Torvalds		cond_resched();
1da177e4SLinus Torvalds	}
aa750fd7SJunichi Nomura}
aa750fd7SJunichi Nomura
aa750fd7SJunichi Nomura/**
aa750fd7SJunichi Nomura * filemap_fdatawait_range - wait for writeback to complete
aa750fd7SJunichi Nomura * @mapping:		address space structure to wait for
aa750fd7SJunichi Nomura * @start_byte:		offset in bytes where the range starts
aa750fd7SJunichi Nomura * @end_byte:		offset in bytes where the range ends (inclusive)
aa750fd7SJunichi Nomura *
aa750fd7SJunichi Nomura * Walk the list of under-writeback pages of the given address space
aa750fd7SJunichi Nomura * in the given range and wait for all of them.  Check error status of
aa750fd7SJunichi Nomura * the address space and return it.
aa750fd7SJunichi Nomura *
aa750fd7SJunichi Nomura * Since the error status of the address space is cleared by this function,
aa750fd7SJunichi Nomura * callers are responsible for checking the return value and handling and/or
aa750fd7SJunichi Nomura * reporting the error.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: error status of the address space.
aa750fd7SJunichi Nomura */
aa750fd7SJunichi Nomuraint filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
aa750fd7SJunichi Nomura			    loff_t end_byte)
aa750fd7SJunichi Nomura{
5e8fcc1aSJeff Layton	__filemap_fdatawait_range(mapping, start_byte, end_byte);
5e8fcc1aSJeff Layton	return filemap_check_errors(mapping);
1da177e4SLinus Torvalds}
d3bccb6fSJan KaraEXPORT_SYMBOL(filemap_fdatawait_range);
d3bccb6fSJan Kara
d3bccb6fSJan Kara/**
aa0bfcd9SRoss Zwisler * filemap_fdatawait_range_keep_errors - wait for writeback to complete
aa0bfcd9SRoss Zwisler * @mapping:		address space structure to wait for
aa0bfcd9SRoss Zwisler * @start_byte:		offset in bytes where the range starts
aa0bfcd9SRoss Zwisler * @end_byte:		offset in bytes where the range ends (inclusive)
aa0bfcd9SRoss Zwisler *
aa0bfcd9SRoss Zwisler * Walk the list of under-writeback pages of the given address space in the
aa0bfcd9SRoss Zwisler * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
aa0bfcd9SRoss Zwisler * this function does not clear error status of the address space.
aa0bfcd9SRoss Zwisler *
aa0bfcd9SRoss Zwisler * Use this function if callers don't handle errors themselves.  Expected
aa0bfcd9SRoss Zwisler * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
aa0bfcd9SRoss Zwisler * fsfreeze(8)
aa0bfcd9SRoss Zwisler */
aa0bfcd9SRoss Zwislerint filemap_fdatawait_range_keep_errors(struct address_space *mapping,
aa0bfcd9SRoss Zwisler		loff_t start_byte, loff_t end_byte)
aa0bfcd9SRoss Zwisler{
aa0bfcd9SRoss Zwisler	__filemap_fdatawait_range(mapping, start_byte, end_byte);
aa0bfcd9SRoss Zwisler	return filemap_check_and_keep_errors(mapping);
aa0bfcd9SRoss Zwisler}
aa0bfcd9SRoss ZwislerEXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
aa0bfcd9SRoss Zwisler
aa0bfcd9SRoss Zwisler/**
a823e458SJeff Layton * file_fdatawait_range - wait for writeback to complete
a823e458SJeff Layton * @file:		file pointing to address space structure to wait for
a823e458SJeff Layton * @start_byte:		offset in bytes where the range starts
a823e458SJeff Layton * @end_byte:		offset in bytes where the range ends (inclusive)
a823e458SJeff Layton *
a823e458SJeff Layton * Walk the list of under-writeback pages of the address space that file
a823e458SJeff Layton * refers to, in the given range and wait for all of them.  Check error
a823e458SJeff Layton * status of the address space vs. the file->f_wb_err cursor and return it.
a823e458SJeff Layton *
a823e458SJeff Layton * Since the error status of the file is advanced by this function,
a823e458SJeff Layton * callers are responsible for checking the return value and handling and/or
a823e458SJeff Layton * reporting the error.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: error status of the address space vs. the file->f_wb_err cursor.
a823e458SJeff Layton */
a823e458SJeff Laytonint file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
a823e458SJeff Layton{
a823e458SJeff Layton	struct address_space *mapping = file->f_mapping;
a823e458SJeff Layton
a823e458SJeff Layton	__filemap_fdatawait_range(mapping, start_byte, end_byte);
a823e458SJeff Layton	return file_check_and_advance_wb_err(file);
a823e458SJeff Layton}
a823e458SJeff LaytonEXPORT_SYMBOL(file_fdatawait_range);
a823e458SJeff Layton
a823e458SJeff Layton/**
aa750fd7SJunichi Nomura * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
aa750fd7SJunichi Nomura * @mapping: address space structure to wait for
aa750fd7SJunichi Nomura *
aa750fd7SJunichi Nomura * Walk the list of under-writeback pages of the given address space
aa750fd7SJunichi Nomura * and wait for all of them.  Unlike filemap_fdatawait(), this function
aa750fd7SJunichi Nomura * does not clear error status of the address space.
aa750fd7SJunichi Nomura *
aa750fd7SJunichi Nomura * Use this function if callers don't handle errors themselves.  Expected
aa750fd7SJunichi Nomura * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
aa750fd7SJunichi Nomura * fsfreeze(8)
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: error status of the address space.
aa750fd7SJunichi Nomura */
76341cabSJeff Laytonint filemap_fdatawait_keep_errors(struct address_space *mapping)
aa750fd7SJunichi Nomura{
ffb959bbSJeff Layton	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
76341cabSJeff Layton	return filemap_check_and_keep_errors(mapping);
aa750fd7SJunichi Nomura}
76341cabSJeff LaytonEXPORT_SYMBOL(filemap_fdatawait_keep_errors);
aa750fd7SJunichi Nomura
875d91b1SKonstantin Khlebnikov/* Returns true if writeback might be needed or already in progress. */
9326c9b2SJeff Laytonstatic bool mapping_needs_writeback(struct address_space *mapping)
1da177e4SLinus Torvalds{
875d91b1SKonstantin Khlebnikov	return mapping->nrpages;
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
4bdcd1ddSJens Axboebool filemap_range_has_writeback(struct address_space *mapping,
f8ee8909SJens Axboe				 loff_t start_byte, loff_t end_byte)
f8ee8909SJens Axboe{
f8ee8909SJens Axboe	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
f8ee8909SJens Axboe	pgoff_t max = end_byte >> PAGE_SHIFT;
b05f41a1SVishal Moola (Oracle)	struct folio *folio;
f8ee8909SJens Axboe
f8ee8909SJens Axboe	if (end_byte < start_byte)
f8ee8909SJens Axboe		return false;
f8ee8909SJens Axboe
f8ee8909SJens Axboe	rcu_read_lock();
b05f41a1SVishal Moola (Oracle)	xas_for_each(&xas, folio, max) {
b05f41a1SVishal Moola (Oracle)		if (xas_retry(&xas, folio))
f8ee8909SJens Axboe			continue;
b05f41a1SVishal Moola (Oracle)		if (xa_is_value(folio))
f8ee8909SJens Axboe			continue;
b05f41a1SVishal Moola (Oracle)		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
b05f41a1SVishal Moola (Oracle)				folio_test_writeback(folio))
f8ee8909SJens Axboe			break;
f8ee8909SJens Axboe	}
f8ee8909SJens Axboe	rcu_read_unlock();
b05f41a1SVishal Moola (Oracle)	return folio != NULL;
f8ee8909SJens Axboe}
4bdcd1ddSJens AxboeEXPORT_SYMBOL_GPL(filemap_range_has_writeback);
63135aa3SJens Axboe
63135aa3SJens Axboe/**
485bb99bSRandy Dunlap * filemap_write_and_wait_range - write out & wait on a file range
485bb99bSRandy Dunlap * @mapping:	the address_space for the pages
485bb99bSRandy Dunlap * @lstart:	offset in bytes where the range starts
485bb99bSRandy Dunlap * @lend:	offset in bytes where the range ends (inclusive)
485bb99bSRandy Dunlap *
469eb4d0SAndrew Morton * Write out and wait upon file offsets lstart->lend, inclusive.
469eb4d0SAndrew Morton *
0e056eb5Smchehab@s-opensource.com * Note that @lend is inclusive (describes the last byte to be written) so
469eb4d0SAndrew Morton * that this function can be used to write to the very end-of-file (end = -1).
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: error status of the address space.
469eb4d0SAndrew Morton */
1da177e4SLinus Torvaldsint filemap_write_and_wait_range(struct address_space *mapping,
1da177e4SLinus Torvalds				 loff_t lstart, loff_t lend)
1da177e4SLinus Torvalds{
ccac11daSMiaohe Lin	int err = 0, err2;
1da177e4SLinus Torvalds
feeb9b26SBrian Foster	if (lend < lstart)
feeb9b26SBrian Foster		return 0;
feeb9b26SBrian Foster
9326c9b2SJeff Layton	if (mapping_needs_writeback(mapping)) {
28fd1298SOGAWA Hirofumi		err = __filemap_fdatawrite_range(mapping, lstart, lend,
1da177e4SLinus Torvalds						 WB_SYNC_ALL);
ddf8f376SIra Weiny		/*
ddf8f376SIra Weiny		 * Even if the above returned error, the pages may be
ddf8f376SIra Weiny		 * written partially (e.g. -ENOSPC), so we wait for it.
ddf8f376SIra Weiny		 * But the -EIO is special case, it may indicate the worst
ddf8f376SIra Weiny		 * thing (e.g. bug) happened, so we avoid waiting for it.
ddf8f376SIra Weiny		 */
ccac11daSMiaohe Lin		if (err != -EIO)
ccac11daSMiaohe Lin			__filemap_fdatawait_range(mapping, lstart, lend);
ccac11daSMiaohe Lin	}
ccac11daSMiaohe Lin	err2 = filemap_check_errors(mapping);
28fd1298SOGAWA Hirofumi	if (!err)
28fd1298SOGAWA Hirofumi		err = err2;
28fd1298SOGAWA Hirofumi	return err;
1da177e4SLinus Torvalds}
f6995585SChris MasonEXPORT_SYMBOL(filemap_write_and_wait_range);
1da177e4SLinus Torvalds
5660e13dSJeff Laytonvoid __filemap_set_wb_err(struct address_space *mapping, int err)
5660e13dSJeff Layton{
3acdfd28SJeff Layton	errseq_t eseq = errseq_set(&mapping->wb_err, err);
5660e13dSJeff Layton
5660e13dSJeff Layton	trace_filemap_set_wb_err(mapping, eseq);
5660e13dSJeff Layton}
5660e13dSJeff LaytonEXPORT_SYMBOL(__filemap_set_wb_err);
5660e13dSJeff Layton
5660e13dSJeff Layton/**
5660e13dSJeff Layton * file_check_and_advance_wb_err - report wb error (if any) that was previously
5660e13dSJeff Layton * 				   and advance wb_err to current one
5660e13dSJeff Layton * @file: struct file on which the error is being reported
5660e13dSJeff Layton *
5660e13dSJeff Layton * When userland calls fsync (or something like nfsd does the equivalent), we
5660e13dSJeff Layton * want to report any writeback errors that occurred since the last fsync (or
5660e13dSJeff Layton * since the file was opened if there haven't been any).
5660e13dSJeff Layton *
5660e13dSJeff Layton * Grab the wb_err from the mapping. If it matches what we have in the file,
5660e13dSJeff Layton * then just quickly return 0. The file is all caught up.
5660e13dSJeff Layton *
5660e13dSJeff Layton * If it doesn't match, then take the mapping value, set the "seen" flag in
5660e13dSJeff Layton * it and try to swap it into place. If it works, or another task beat us
5660e13dSJeff Layton * to it with the new value, then update the f_wb_err and return the error
5660e13dSJeff Layton * portion. The error at this point must be reported via proper channels
5660e13dSJeff Layton * (a'la fsync, or NFS COMMIT operation, etc.).
5660e13dSJeff Layton *
5660e13dSJeff Layton * While we handle mapping->wb_err with atomic operations, the f_wb_err
5660e13dSJeff Layton * value is protected by the f_lock since we must ensure that it reflects
5660e13dSJeff Layton * the latest value swapped in for this file descriptor.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: %0 on success, negative error code otherwise.
5660e13dSJeff Layton */
5660e13dSJeff Laytonint file_check_and_advance_wb_err(struct file *file)
5660e13dSJeff Layton{
5660e13dSJeff Layton	int err = 0;
5660e13dSJeff Layton	errseq_t old = READ_ONCE(file->f_wb_err);
5660e13dSJeff Layton	struct address_space *mapping = file->f_mapping;
5660e13dSJeff Layton
5660e13dSJeff Layton	/* Locklessly handle the common case where nothing has changed */
5660e13dSJeff Layton	if (errseq_check(&mapping->wb_err, old)) {
5660e13dSJeff Layton		/* Something changed, must use slow path */
5660e13dSJeff Layton		spin_lock(&file->f_lock);
5660e13dSJeff Layton		old = file->f_wb_err;
5660e13dSJeff Layton		err = errseq_check_and_advance(&mapping->wb_err,
5660e13dSJeff Layton						&file->f_wb_err);
5660e13dSJeff Layton		trace_file_check_and_advance_wb_err(file, old);
5660e13dSJeff Layton		spin_unlock(&file->f_lock);
5660e13dSJeff Layton	}
f4e222c5SJeff Layton
f4e222c5SJeff Layton	/*
f4e222c5SJeff Layton	 * We're mostly using this function as a drop in replacement for
f4e222c5SJeff Layton	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
f4e222c5SJeff Layton	 * that the legacy code would have had on these flags.
f4e222c5SJeff Layton	 */
f4e222c5SJeff Layton	clear_bit(AS_EIO, &mapping->flags);
f4e222c5SJeff Layton	clear_bit(AS_ENOSPC, &mapping->flags);
5660e13dSJeff Layton	return err;
5660e13dSJeff Layton}
5660e13dSJeff LaytonEXPORT_SYMBOL(file_check_and_advance_wb_err);
5660e13dSJeff Layton
5660e13dSJeff Layton/**
5660e13dSJeff Layton * file_write_and_wait_range - write out & wait on a file range
5660e13dSJeff Layton * @file:	file pointing to address_space with pages
5660e13dSJeff Layton * @lstart:	offset in bytes where the range starts
5660e13dSJeff Layton * @lend:	offset in bytes where the range ends (inclusive)
5660e13dSJeff Layton *
5660e13dSJeff Layton * Write out and wait upon file offsets lstart->lend, inclusive.
5660e13dSJeff Layton *
5660e13dSJeff Layton * Note that @lend is inclusive (describes the last byte to be written) so
5660e13dSJeff Layton * that this function can be used to write to the very end-of-file (end = -1).
5660e13dSJeff Layton *
5660e13dSJeff Layton * After writing out and waiting on the data, we check and advance the
5660e13dSJeff Layton * f_wb_err cursor to the latest value, and return any errors detected there.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: %0 on success, negative error code otherwise.
5660e13dSJeff Layton */
5660e13dSJeff Laytonint file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
5660e13dSJeff Layton{
5660e13dSJeff Layton	int err = 0, err2;
5660e13dSJeff Layton	struct address_space *mapping = file->f_mapping;
5660e13dSJeff Layton
feeb9b26SBrian Foster	if (lend < lstart)
feeb9b26SBrian Foster		return 0;
feeb9b26SBrian Foster
9326c9b2SJeff Layton	if (mapping_needs_writeback(mapping)) {
5660e13dSJeff Layton		err = __filemap_fdatawrite_range(mapping, lstart, lend,
5660e13dSJeff Layton						 WB_SYNC_ALL);
5660e13dSJeff Layton		/* See comment of filemap_write_and_wait() */
5660e13dSJeff Layton		if (err != -EIO)
5660e13dSJeff Layton			__filemap_fdatawait_range(mapping, lstart, lend);
5660e13dSJeff Layton	}
5660e13dSJeff Layton	err2 = file_check_and_advance_wb_err(file);
5660e13dSJeff Layton	if (!err)
5660e13dSJeff Layton		err = err2;
5660e13dSJeff Layton	return err;
5660e13dSJeff Layton}
5660e13dSJeff LaytonEXPORT_SYMBOL(file_write_and_wait_range);
5660e13dSJeff Layton
/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked, and @new must not belong to a mapping yet.  This function
 * does not add the new folio to the LRU, the caller must do that.
 *
 * The new folio takes over @old's mapping and index.  If the mapping
 * provides a ->free_folio() callback it is invoked on @old after the
 * i_pages lock has been dropped.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	/* Fetched up front: old->mapping is cleared before it is called. */
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	/* The pagecache reference for the new folio. */
	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_migrate(old, new);

	/* Swap the xarray slot and fix up the statistics in one locked section. */
	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	/* Drop the pagecache reference the old folio used to hold. */
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);
ef6a3c63SMiklos Szeredi
/*
 * __filemap_add_folio - insert a locked folio into a mapping's page cache
 * @mapping:	address space to insert into
 * @folio:	locked, non-swapbacked folio to insert
 * @index:	page index at which to insert the folio
 * @gfp:	allocation flags for the memcg charge and for xarray nodes
 * @shadowp:	if non-NULL, receives the value (shadow) entry that was
 *		found at @index, when there was one
 *
 * For non-hugetlb folios the folio is charged to the memory cgroup first
 * and the charge is undone on failure.  The folio gains @nr references
 * (1 for hugetlb, folio_nr_pages() otherwise) which are dropped again on
 * failure.  On failure folio->mapping is reset to NULL but folio->index
 * stays set.
 *
 * Return: 0 on success; -EEXIST if a non-value entry already occupies the
 * range; otherwise the memcg charge error or the error recorded in the
 * xarray state (e.g. from a failed allocation).
 */
noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int huge = folio_test_hugetlb(folio);
	/* Entry and order for which split memory was preallocated, if any. */
	void *alloced_shadow = NULL;
	int alloced_order = 0;
	bool charged = false;
	long nr = 1;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	mapping_set_update(&xas, mapping);

	if (!huge) {
		int error = mem_cgroup_charge(folio, NULL, gfp);
		/* A large folio must be naturally aligned in the file. */
		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
		if (error)
			return error;
		charged = true;
		xas_set_order(&xas, index, folio_order(folio));
		nr = folio_nr_pages(folio);
	}

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	/*
	 * Each pass takes the i_pages lock, inspects what currently occupies
	 * the range and tries to store the folio.  The loop repeats after
	 * memory has been allocated outside the lock, either for splitting a
	 * larger existing entry or because xas_nomem() asks for a retry.
	 */
	for (;;) {
		int order = -1, split_order = 0;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			/* Anything but a value entry means the range is taken. */
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		/* entry may have changed before we re-acquire the lock */
		if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
			/* The preallocation no longer fits; discard it. */
			xas_destroy(&xas);
			alloced_order = 0;
		}

		if (old) {
			if (order > 0 && order > folio_order(folio)) {
				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));
				/* No preallocation yet: get it with the lock dropped. */
				if (!alloced_order) {
					split_order = order;
					goto unlock;
				}
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		/* split needed, alloc here and retry. */
		if (split_order) {
			xas_split_alloc(&xas, old, split_order, gfp);
			if (xas_error(&xas))
				goto error;
			/* Remember what the allocation was made for. */
			alloced_shadow = old;
			alloced_order = split_order;
			xas_reset(&xas);
			continue;
		}

		/* Done unless xas_nomem() wants the store retried. */
		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	if (charged)
		mem_cgroup_uncharge(folio);
	folio->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
a528910eSJohannes Weiner
9dd3d069SMatthew Wilcox (Oracle)int filemap_add_folio(struct address_space *mapping, struct folio *folio,
9dd3d069SMatthew Wilcox (Oracle)				pgoff_t index, gfp_t gfp)
1da177e4SLinus Torvalds{
a528910eSJohannes Weiner	void *shadow = NULL;
4f98a2feSRik van Riel	int ret;
4f98a2feSRik van Riel
9dd3d069SMatthew Wilcox (Oracle)	__folio_set_locked(folio);
9dd3d069SMatthew Wilcox (Oracle)	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
a528910eSJohannes Weiner	if (unlikely(ret))
9dd3d069SMatthew Wilcox (Oracle)		__folio_clear_locked(folio);
a528910eSJohannes Weiner	else {
a528910eSJohannes Weiner		/*
9dd3d069SMatthew Wilcox (Oracle)		 * The folio might have been evicted from cache only
a528910eSJohannes Weiner		 * recently, in which case it should be activated like
9dd3d069SMatthew Wilcox (Oracle)		 * any other repeatedly accessed folio.
9dd3d069SMatthew Wilcox (Oracle)		 * The exception is folios getting rewritten; evicting other
f0281a00SRik van Riel		 * data from the working set, only to cache data that will
f0281a00SRik van Riel		 * get overwritten with something else, is a waste of memory.
a528910eSJohannes Weiner		 */
9dd3d069SMatthew Wilcox (Oracle)		WARN_ON_ONCE(folio_test_active(folio));
9dd3d069SMatthew Wilcox (Oracle)		if (!(gfp & __GFP_WRITE) && shadow)
9dd3d069SMatthew Wilcox (Oracle)			workingset_refault(folio, shadow);
9dd3d069SMatthew Wilcox (Oracle)		folio_add_lru(folio);
a528910eSJohannes Weiner	}
1da177e4SLinus Torvalds	return ret;
1da177e4SLinus Torvalds}
9dd3d069SMatthew Wilcox (Oracle)EXPORT_SYMBOL_GPL(filemap_add_folio);
1da177e4SLinus Torvalds
#ifdef CONFIG_NUMA
/*
 * Allocate a folio of the given order for the page cache.  When cpuset
 * page memory spreading is enabled the folio is requested from the node
 * chosen by cpuset_mem_spread_node(); otherwise this is plain
 * folio_alloc().  Returns NULL if the allocation fails.
 */
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
{
	struct folio *folio;
	unsigned int cookie;

	if (!cpuset_do_page_mem_spread())
		return folio_alloc(gfp, order);

	/*
	 * Try again only when the allocation failed and
	 * read_mems_allowed_retry() asks for another attempt.
	 */
	do {
		int nid;

		cookie = read_mems_allowed_begin();
		nid = cpuset_mem_spread_node();
		folio = __folio_alloc_node(gfp, order, nid);
	} while (!folio && read_mems_allowed_retry(cookie));

	return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio);
#endif
44110fe3SPaul Jackson
1da177e4SLinus Torvalds/*
7506ae6aSJan Kara * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
7506ae6aSJan Kara *
7506ae6aSJan Kara * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
7506ae6aSJan Kara *
7506ae6aSJan Kara * @mapping1: the first mapping to lock
7506ae6aSJan Kara * @mapping2: the second mapping to lock
7506ae6aSJan Kara */
7506ae6aSJan Karavoid filemap_invalidate_lock_two(struct address_space *mapping1,
7506ae6aSJan Kara				 struct address_space *mapping2)
7506ae6aSJan Kara{
7506ae6aSJan Kara	if (mapping1 > mapping2)
7506ae6aSJan Kara		swap(mapping1, mapping2);
7506ae6aSJan Kara	if (mapping1)
7506ae6aSJan Kara		down_write(&mapping1->invalidate_lock);
7506ae6aSJan Kara	if (mapping2 && mapping1 != mapping2)
7506ae6aSJan Kara		down_write_nested(&mapping2->invalidate_lock, 1);
7506ae6aSJan Kara}
7506ae6aSJan KaraEXPORT_SYMBOL(filemap_invalidate_lock_two);
7506ae6aSJan Kara
7506ae6aSJan Kara/*
7506ae6aSJan Kara * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
7506ae6aSJan Kara *
7506ae6aSJan Kara * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
7506ae6aSJan Kara *
7506ae6aSJan Kara * @mapping1: the first mapping to unlock
7506ae6aSJan Kara * @mapping2: the second mapping to unlock
7506ae6aSJan Kara */
7506ae6aSJan Karavoid filemap_invalidate_unlock_two(struct address_space *mapping1,
7506ae6aSJan Kara				   struct address_space *mapping2)
7506ae6aSJan Kara{
7506ae6aSJan Kara	if (mapping1)
7506ae6aSJan Kara		up_write(&mapping1->invalidate_lock);
7506ae6aSJan Kara	if (mapping2 && mapping1 != mapping2)
7506ae6aSJan Kara		up_write(&mapping2->invalidate_lock);
7506ae6aSJan Kara}
7506ae6aSJan KaraEXPORT_SYMBOL(filemap_invalidate_unlock_two);
7506ae6aSJan Kara
7506ae6aSJan Kara/*
1da177e4SLinus Torvalds * In order to wait for pages to become available there must be
1da177e4SLinus Torvalds * waitqueues associated with pages. By using a hash table of
1da177e4SLinus Torvalds * waitqueues where the bucket discipline is to maintain all
1da177e4SLinus Torvalds * waiters on the same queue and wake all when any of the pages
1da177e4SLinus Torvalds * become available, and for the woken contexts to check to be
1da177e4SLinus Torvalds * sure the appropriate page became available, this saves space
1da177e4SLinus Torvalds * at a cost of "thundering herd" phenomena during rare hash
1da177e4SLinus Torvalds * collisions.
1da177e4SLinus Torvalds */
62906027SNicholas Piggin#define PAGE_WAIT_TABLE_BITS 8
62906027SNicholas Piggin#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
df4d4f12SMatthew Wilcox (Oracle)static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
62906027SNicholas Piggin
df4d4f12SMatthew Wilcox (Oracle)static wait_queue_head_t *folio_waitqueue(struct folio *folio)
1da177e4SLinus Torvalds{
df4d4f12SMatthew Wilcox (Oracle)	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
1da177e4SLinus Torvalds}
62906027SNicholas Piggin
62906027SNicholas Pigginvoid __init pagecache_init(void)
62906027SNicholas Piggin{
62906027SNicholas Piggin	int i;
62906027SNicholas Piggin
62906027SNicholas Piggin	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
df4d4f12SMatthew Wilcox (Oracle)		init_waitqueue_head(&folio_wait_table[i]);
62906027SNicholas Piggin
62906027SNicholas Piggin	page_writeback_init();
62906027SNicholas Piggin}
62906027SNicholas Piggin
/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 *
 * Return value of the wake function:
 *
 *	 0 - the entry does not match the key, or a non-exclusive waiter
 *	     was woken
 *	 1 - an exclusive waiter was woken
 *	-1 - the bit is (still) set for an exclusive waiter; the waiter is
 *	     left asleep and on the queue
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	/* Not a waiter for this folio/bit: leave it alone. */
	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		/* Bit already taken again: no point waking this waiter. */
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			/* Take the bit on the waiter's behalf. */
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	/* Uses the local copy of flags: *wait must not be touched any more. */
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
62906027SNicholas Piggin
/*
 * Wake the waiters queued for @bit_nr of @folio.
 *
 * The folio's hashed waitqueue is walked under its lock with a key that
 * identifies the folio and the bit.  The walk may stop early and leave a
 * bookmark entry in the queue; in that case the lock is dropped briefly
 * and the walk resumed until no bookmark is left.  Finally the folio's
 * waiters flag is cleared if the queue is empty or no waiter for this
 * folio was seen.
 */
static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	/* An empty bookmark: not on any list, no wake function. */
	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	/* WQ_FLAG_BOOKMARK set means the walk above stopped part-way. */
	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}
74d81bfaSNicholas Piggin
/*
 * Wake waiters on @bit of @folio, skipping the wait queue entirely when the
 * folio's waiters flag is not set.
 */
static void folio_wake(struct folio *folio, int bit)
{
	if (folio_test_waiters(folio))
		folio_wake_bit(folio, bit);
}
62906027SNicholas Piggin
/*
 * The three ways folio_wait_bit_common() can treat the folio reference and
 * the awaited bit:
 *
 * EXCLUSIVE - keep the reference and take the bit once woken, as
 *             __folio_lock() does when waiting on and then setting
 *             PG_locked.
 * SHARED    - keep the reference and only check the bit once woken, as
 *             folio_wait_writeback() does for PG_writeback.
 * DROP      - release the reference before sleeping and do not check the
 *             bit once woken, as folio_put_wait_locked() does for
 *             PG_locked.
 */
enum behavior {
	EXCLUSIVE,
	SHARED,
	DROP,
};
9a1ea439SHugh Dickins
2a9127fcSLinus Torvalds/*
101c0bf6SMatthew Wilcox (Oracle) * Attempt to check (or get) the folio flag, and mark us done
5ef64cc8SLinus Torvalds * if successful.
2a9127fcSLinus Torvalds */
101c0bf6SMatthew Wilcox (Oracle)static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
2a9127fcSLinus Torvalds					struct wait_queue_entry *wait)
2a9127fcSLinus Torvalds{
2a9127fcSLinus Torvalds	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
101c0bf6SMatthew Wilcox (Oracle)		if (test_and_set_bit(bit_nr, &folio->flags))
2a9127fcSLinus Torvalds			return false;
101c0bf6SMatthew Wilcox (Oracle)	} else if (test_bit(bit_nr, &folio->flags))
2a9127fcSLinus Torvalds		return false;
2a9127fcSLinus Torvalds
5ef64cc8SLinus Torvalds	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
2a9127fcSLinus Torvalds	return true;
2a9127fcSLinus Torvalds}
2a9127fcSLinus Torvalds
/*
 * Number of times an exclusive waiter tolerates having the lock stolen from
 * under it before folio_wait_bit_common() flags its entry WQ_FLAG_CUSTOM.
 */
int sysctl_page_lock_unfairness = 5;
5ef64cc8SLinus Torvalds
101c0bf6SMatthew Wilcox (Oracle)static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
101c0bf6SMatthew Wilcox (Oracle)		int state, enum behavior behavior)
62906027SNicholas Piggin{
df4d4f12SMatthew Wilcox (Oracle)	wait_queue_head_t *q = folio_waitqueue(folio);
5ef64cc8SLinus Torvalds	int unfairness = sysctl_page_lock_unfairness;
62906027SNicholas Piggin	struct wait_page_queue wait_page;
ac6424b9SIngo Molnar	wait_queue_entry_t *wait = &wait_page.wait;
b1d29ba8SJohannes Weiner	bool thrashing = false;
eb414681SJohannes Weiner	unsigned long pflags;
aa1cf99bSYang Yang	bool in_thrashing;
62906027SNicholas Piggin
eb414681SJohannes Weiner	if (bit_nr == PG_locked &&
101c0bf6SMatthew Wilcox (Oracle)	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
aa1cf99bSYang Yang		delayacct_thrashing_start(&in_thrashing);
eb414681SJohannes Weiner		psi_memstall_enter(&pflags);
b1d29ba8SJohannes Weiner		thrashing = true;
b1d29ba8SJohannes Weiner	}
b1d29ba8SJohannes Weiner
62906027SNicholas Piggin	init_wait(wait);
62906027SNicholas Piggin	wait->func = wake_page_function;
df4d4f12SMatthew Wilcox (Oracle)	wait_page.folio = folio;
62906027SNicholas Piggin	wait_page.bit_nr = bit_nr;
62906027SNicholas Piggin
5ef64cc8SLinus Torvaldsrepeat:
5ef64cc8SLinus Torvalds	wait->flags = 0;
5ef64cc8SLinus Torvalds	if (behavior == EXCLUSIVE) {
5ef64cc8SLinus Torvalds		wait->flags = WQ_FLAG_EXCLUSIVE;
5ef64cc8SLinus Torvalds		if (--unfairness < 0)
5ef64cc8SLinus Torvalds			wait->flags |= WQ_FLAG_CUSTOM;
5ef64cc8SLinus Torvalds	}
5ef64cc8SLinus Torvalds
2a9127fcSLinus Torvalds	/*
2a9127fcSLinus Torvalds	 * Do one last check whether we can get the
2a9127fcSLinus Torvalds	 * page bit synchronously.
2a9127fcSLinus Torvalds	 *
101c0bf6SMatthew Wilcox (Oracle)	 * Do the folio_set_waiters() marking before that
2a9127fcSLinus Torvalds	 * to let any waker we _just_ missed know they
2a9127fcSLinus Torvalds	 * need to wake us up (otherwise they'll never
2a9127fcSLinus Torvalds	 * even go to the slow case that looks at the
2a9127fcSLinus Torvalds	 * page queue), and add ourselves to the wait
2a9127fcSLinus Torvalds	 * queue if we need to sleep.
2a9127fcSLinus Torvalds	 *
2a9127fcSLinus Torvalds	 * This part needs to be done under the queue
2a9127fcSLinus Torvalds	 * lock to avoid races.
2a9127fcSLinus Torvalds	 */
62906027SNicholas Piggin	spin_lock_irq(&q->lock);
101c0bf6SMatthew Wilcox (Oracle)	folio_set_waiters(folio);
101c0bf6SMatthew Wilcox (Oracle)	if (!folio_trylock_flag(folio, bit_nr, wait))
2a9127fcSLinus Torvalds		__add_wait_queue_entry_tail(q, wait);
62906027SNicholas Piggin	spin_unlock_irq(&q->lock);
62906027SNicholas Piggin
2a9127fcSLinus Torvalds	/*
2a9127fcSLinus Torvalds	 * From now on, all the logic will be based on
5ef64cc8SLinus Torvalds	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
5ef64cc8SLinus Torvalds	 * see whether the page bit testing has already
5ef64cc8SLinus Torvalds	 * been done by the wake function.
2a9127fcSLinus Torvalds	 *
101c0bf6SMatthew Wilcox (Oracle)	 * We can drop our reference to the folio.
2a9127fcSLinus Torvalds	 */
9a1ea439SHugh Dickins	if (behavior == DROP)
101c0bf6SMatthew Wilcox (Oracle)		folio_put(folio);
62906027SNicholas Piggin
5ef64cc8SLinus Torvalds	/*
5ef64cc8SLinus Torvalds	 * Note that until the "finish_wait()", or until
5ef64cc8SLinus Torvalds	 * we see the WQ_FLAG_WOKEN flag, we need to
5ef64cc8SLinus Torvalds	 * be very careful with the 'wait->flags', because
5ef64cc8SLinus Torvalds	 * we may race with a waker that sets them.
5ef64cc8SLinus Torvalds	 */
2a9127fcSLinus Torvalds	for (;;) {
5ef64cc8SLinus Torvalds		unsigned int flags;
5ef64cc8SLinus Torvalds
2a9127fcSLinus Torvalds		set_current_state(state);
2a9127fcSLinus Torvalds
5ef64cc8SLinus Torvalds		/* Loop until we've been woken or interrupted */
5ef64cc8SLinus Torvalds		flags = smp_load_acquire(&wait->flags);
5ef64cc8SLinus Torvalds		if (!(flags & WQ_FLAG_WOKEN)) {
2a9127fcSLinus Torvalds			if (signal_pending_state(state, current))
2a9127fcSLinus Torvalds				break;
2a9127fcSLinus Torvalds
9a1ea439SHugh Dickins			io_schedule();
5ef64cc8SLinus Torvalds			continue;
62906027SNicholas Piggin		}
62906027SNicholas Piggin
5ef64cc8SLinus Torvalds		/* If we were non-exclusive, we're done */
5ef64cc8SLinus Torvalds		if (behavior != EXCLUSIVE)
5ef64cc8SLinus Torvalds			break;
5ef64cc8SLinus Torvalds
5ef64cc8SLinus Torvalds		/* If the waker got the lock for us, we're done */
5ef64cc8SLinus Torvalds		if (flags & WQ_FLAG_DONE)
5ef64cc8SLinus Torvalds			break;
5ef64cc8SLinus Torvalds
5ef64cc8SLinus Torvalds		/*
5ef64cc8SLinus Torvalds		 * Otherwise, if we're getting the lock, we need to
5ef64cc8SLinus Torvalds		 * try to get it ourselves.
5ef64cc8SLinus Torvalds		 *
5ef64cc8SLinus Torvalds		 * And if that fails, we'll have to retry this all.
5ef64cc8SLinus Torvalds		 */
101c0bf6SMatthew Wilcox (Oracle)		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
5ef64cc8SLinus Torvalds			goto repeat;
5ef64cc8SLinus Torvalds
5ef64cc8SLinus Torvalds		wait->flags |= WQ_FLAG_DONE;
5ef64cc8SLinus Torvalds		break;
5ef64cc8SLinus Torvalds	}
5ef64cc8SLinus Torvalds
5ef64cc8SLinus Torvalds	/*
5ef64cc8SLinus Torvalds	 * If a signal happened, this 'finish_wait()' may remove the last
101c0bf6SMatthew Wilcox (Oracle)	 * waiter from the wait-queues, but the folio waiters bit will remain
5ef64cc8SLinus Torvalds	 * set. That's ok. The next wakeup will take care of it, and trying
5ef64cc8SLinus Torvalds	 * to do it here would be difficult and prone to races.
5ef64cc8SLinus Torvalds	 */
62906027SNicholas Piggin	finish_wait(q, wait);
62906027SNicholas Piggin
eb414681SJohannes Weiner	if (thrashing) {
aa1cf99bSYang Yang		delayacct_thrashing_end(&in_thrashing);
eb414681SJohannes Weiner		psi_memstall_leave(&pflags);
eb414681SJohannes Weiner	}
b1d29ba8SJohannes Weiner
62906027SNicholas Piggin	/*
5ef64cc8SLinus Torvalds	 * NOTE! The wait->flags weren't stable until we've done the
5ef64cc8SLinus Torvalds	 * 'finish_wait()', and we could have exited the loop above due
5ef64cc8SLinus Torvalds	 * to a signal, and had a wakeup event happen after the signal
5ef64cc8SLinus Torvalds	 * test but before the 'finish_wait()'.
5ef64cc8SLinus Torvalds	 *
5ef64cc8SLinus Torvalds	 * So only after the finish_wait() can we reliably determine
5ef64cc8SLinus Torvalds	 * if we got woken up or not, so we can now figure out the final
5ef64cc8SLinus Torvalds	 * return value based on that state without races.
5ef64cc8SLinus Torvalds	 *
5ef64cc8SLinus Torvalds	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
5ef64cc8SLinus Torvalds	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
62906027SNicholas Piggin	 */
5ef64cc8SLinus Torvalds	if (behavior == EXCLUSIVE)
5ef64cc8SLinus Torvalds		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
62906027SNicholas Piggin
2a9127fcSLinus Torvalds	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
62906027SNicholas Piggin}
1da177e4SLinus Torvalds
#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait until the migration entry referencing the given page has been
 * removed.  The effect matches folio_put_wait_locked(folio,
 * TASK_UNINTERRUPTIBLE), except that the caller does not need a reference
 * on the page; it must instead hold the ptl of the migration entry that
 * references the page when calling this.
 *
 * Returns after unlocking the ptl.
 *
 * The logic mirrors folio_wait_bit_common(); see the comments there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
	wait_queue_head_t *head = folio_waitqueue(folio);
	struct wait_page_queue waiter;
	wait_queue_entry_t *wait = &waiter.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	waiter.folio = folio;
	waiter.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&head->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(head, wait);
	spin_unlock_irq(&head->lock);

	/*
	 * While a migration entry for the page exists, the migration path
	 * holds a valid reference to the page and has to take the ptl to
	 * remove the entry.  The page therefore stays valid up to the point
	 * where the ptl is released here.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int seen;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Sleep until the wake function has marked us woken. */
		seen = smp_load_acquire(&wait->flags);
		if (seen & WQ_FLAG_WOKEN)
			break;

		if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
			break;

		io_schedule();
	}

	finish_wait(head, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif
ffa65753SAlistair Popple
101c0bf6SMatthew Wilcox (Oracle)void folio_wait_bit(struct folio *folio, int bit_nr)
1da177e4SLinus Torvalds{
101c0bf6SMatthew Wilcox (Oracle)	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
1da177e4SLinus Torvalds}
101c0bf6SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_wait_bit);
1da177e4SLinus Torvalds
101c0bf6SMatthew Wilcox (Oracle)int folio_wait_bit_killable(struct folio *folio, int bit_nr)
f62e00ccSKOSAKI Motohiro{
101c0bf6SMatthew Wilcox (Oracle)	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
f62e00ccSKOSAKI Motohiro}
101c0bf6SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_wait_bit_killable);
f62e00ccSKOSAKI Motohiro
1da177e4SLinus Torvalds/**
9f2b04a2SMatthew Wilcox (Oracle) * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
9f2b04a2SMatthew Wilcox (Oracle) * @folio: The folio to wait for.
48054625SMatthew Wilcox (Oracle) * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
9a1ea439SHugh Dickins *
9f2b04a2SMatthew Wilcox (Oracle) * The caller should hold a reference on @folio.  They expect the page to
9a1ea439SHugh Dickins * become unlocked relatively soon, but do not wish to hold up migration
9f2b04a2SMatthew Wilcox (Oracle) * (for example) by holding the reference while waiting for the folio to
9a1ea439SHugh Dickins * come unlocked.  After this function returns, the caller should not
9f2b04a2SMatthew Wilcox (Oracle) * dereference @folio.
48054625SMatthew Wilcox (Oracle) *
9f2b04a2SMatthew Wilcox (Oracle) * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
9a1ea439SHugh Dickins */
c195c321SKe Sunstatic int folio_put_wait_locked(struct folio *folio, int state)
9a1ea439SHugh Dickins{
9f2b04a2SMatthew Wilcox (Oracle)	return folio_wait_bit_common(folio, PG_locked, state, DROP);
9a1ea439SHugh Dickins}
9a1ea439SHugh Dickins
9a1ea439SHugh Dickins/**
df4d4f12SMatthew Wilcox (Oracle) * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
df4d4f12SMatthew Wilcox (Oracle) * @folio: Folio defining the wait queue of interest
697f619fSRandy Dunlap * @waiter: Waiter to add to the queue
385e1ca5SDavid Howells *
df4d4f12SMatthew Wilcox (Oracle) * Add an arbitrary @waiter to the wait queue for the nominated @folio.
385e1ca5SDavid Howells */
df4d4f12SMatthew Wilcox (Oracle)void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
385e1ca5SDavid Howells{
df4d4f12SMatthew Wilcox (Oracle)	wait_queue_head_t *q = folio_waitqueue(folio);
385e1ca5SDavid Howells	unsigned long flags;
385e1ca5SDavid Howells
385e1ca5SDavid Howells	spin_lock_irqsave(&q->lock, flags);
9c3a815fSLinus Torvalds	__add_wait_queue_entry_tail(q, waiter);
df4d4f12SMatthew Wilcox (Oracle)	folio_set_waiters(folio);
385e1ca5SDavid Howells	spin_unlock_irqrestore(&q->lock, flags);
385e1ca5SDavid Howells}
df4d4f12SMatthew Wilcox (Oracle)EXPORT_SYMBOL_GPL(folio_add_wait_queue);
385e1ca5SDavid Howells
b91e1302SLinus Torvalds#ifndef clear_bit_unlock_is_negative_byte
b91e1302SLinus Torvalds
b91e1302SLinus Torvalds/*
b91e1302SLinus Torvalds * PG_waiters is the high bit in the same byte as PG_lock.
b91e1302SLinus Torvalds *
b91e1302SLinus Torvalds * On x86 (and on many other architectures), we can clear PG_lock and
b91e1302SLinus Torvalds * test the sign bit at the same time. But if the architecture does
b91e1302SLinus Torvalds * not support that special operation, we just do this all by hand
b91e1302SLinus Torvalds * instead.
b91e1302SLinus Torvalds *
b91e1302SLinus Torvalds * The read of PG_waiters has to be after (or concurrently with) PG_locked
ffceeb62SEthon Paul * being cleared, but a memory barrier should be unnecessary since it is
b91e1302SLinus Torvalds * in the same byte as PG_locked.
b91e1302SLinus Torvalds */
b91e1302SLinus Torvaldsstatic inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
b91e1302SLinus Torvalds{
b91e1302SLinus Torvalds	clear_bit_unlock(nr, mem);
b91e1302SLinus Torvalds	/* smp_mb__after_atomic(); */
98473f9fSOlof Johansson	return test_bit(PG_waiters, mem);
b91e1302SLinus Torvalds}
b91e1302SLinus Torvalds
b91e1302SLinus Torvalds#endif
b91e1302SLinus Torvalds
385e1ca5SDavid Howells/**
4e136428SMatthew Wilcox (Oracle) * folio_unlock - Unlock a locked folio.
4e136428SMatthew Wilcox (Oracle) * @folio: The folio.
1da177e4SLinus Torvalds *
4e136428SMatthew Wilcox (Oracle) * Unlocks the folio and wakes up any thread sleeping on the page lock.
1da177e4SLinus Torvalds *
4e136428SMatthew Wilcox (Oracle) * Context: May be called from interrupt or process context.  May not be
4e136428SMatthew Wilcox (Oracle) * called from NMI context.
1da177e4SLinus Torvalds */
4e136428SMatthew Wilcox (Oracle)void folio_unlock(struct folio *folio)
1da177e4SLinus Torvalds{
4e136428SMatthew Wilcox (Oracle)	/* Bit 7 allows x86 to check the byte's sign bit */
b91e1302SLinus Torvalds	BUILD_BUG_ON(PG_waiters != 7);
4e136428SMatthew Wilcox (Oracle)	BUILD_BUG_ON(PG_locked > 7);
4e136428SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
4e136428SMatthew Wilcox (Oracle)	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
6974d7c9SMatthew Wilcox (Oracle)		folio_wake_bit(folio, PG_locked);
1da177e4SLinus Torvalds}
4e136428SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_unlock);
1da177e4SLinus Torvalds
485bb99bSRandy Dunlap/**
b47393f8SMatthew Wilcox (Oracle) * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
b47393f8SMatthew Wilcox (Oracle) * @folio: The folio.
73e10dedSDavid Howells *
b47393f8SMatthew Wilcox (Oracle) * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
b47393f8SMatthew Wilcox (Oracle) * it.  The folio reference held for PG_private_2 being set is released.
73e10dedSDavid Howells *
b47393f8SMatthew Wilcox (Oracle) * This is, for example, used when a netfs folio is being written to a local
b47393f8SMatthew Wilcox (Oracle) * disk cache, thereby allowing writes to the cache for the same folio to be
73e10dedSDavid Howells * serialised.
73e10dedSDavid Howells */
b47393f8SMatthew Wilcox (Oracle)void folio_end_private_2(struct folio *folio)
73e10dedSDavid Howells{
6974d7c9SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
6974d7c9SMatthew Wilcox (Oracle)	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
6974d7c9SMatthew Wilcox (Oracle)	folio_wake_bit(folio, PG_private_2);
6974d7c9SMatthew Wilcox (Oracle)	folio_put(folio);
73e10dedSDavid Howells}
b47393f8SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_end_private_2);
73e10dedSDavid Howells
73e10dedSDavid Howells/**
b47393f8SMatthew Wilcox (Oracle) * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
b47393f8SMatthew Wilcox (Oracle) * @folio: The folio to wait on.
73e10dedSDavid Howells *
b47393f8SMatthew Wilcox (Oracle) * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
73e10dedSDavid Howells */
b47393f8SMatthew Wilcox (Oracle)void folio_wait_private_2(struct folio *folio)
73e10dedSDavid Howells{
101c0bf6SMatthew Wilcox (Oracle)	while (folio_test_private_2(folio))
101c0bf6SMatthew Wilcox (Oracle)		folio_wait_bit(folio, PG_private_2);
73e10dedSDavid Howells}
b47393f8SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_wait_private_2);
73e10dedSDavid Howells
73e10dedSDavid Howells/**
b47393f8SMatthew Wilcox (Oracle) * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
b47393f8SMatthew Wilcox (Oracle) * @folio: The folio to wait on.
73e10dedSDavid Howells *
b47393f8SMatthew Wilcox (Oracle) * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
73e10dedSDavid Howells * fatal signal is received by the calling task.
73e10dedSDavid Howells *
73e10dedSDavid Howells * Return:
73e10dedSDavid Howells * - 0 if successful.
73e10dedSDavid Howells * - -EINTR if a fatal signal was encountered.
73e10dedSDavid Howells */
b47393f8SMatthew Wilcox (Oracle)int folio_wait_private_2_killable(struct folio *folio)
73e10dedSDavid Howells{
73e10dedSDavid Howells	int ret = 0;
73e10dedSDavid Howells
101c0bf6SMatthew Wilcox (Oracle)	while (folio_test_private_2(folio)) {
101c0bf6SMatthew Wilcox (Oracle)		ret = folio_wait_bit_killable(folio, PG_private_2);
73e10dedSDavid Howells		if (ret < 0)
73e10dedSDavid Howells			break;
73e10dedSDavid Howells	}
73e10dedSDavid Howells
73e10dedSDavid Howells	return ret;
73e10dedSDavid Howells}
b47393f8SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_wait_private_2_killable);
73e10dedSDavid Howells
73e10dedSDavid Howells/**
4268b480SMatthew Wilcox (Oracle) * folio_end_writeback - End writeback against a folio.
4268b480SMatthew Wilcox (Oracle) * @folio: The folio.
1da177e4SLinus Torvalds */
4268b480SMatthew Wilcox (Oracle)void folio_end_writeback(struct folio *folio)
1da177e4SLinus Torvalds{
888cf2dbSMel Gorman	/*
4268b480SMatthew Wilcox (Oracle)	 * folio_test_clear_reclaim() could be used here but it is an
4268b480SMatthew Wilcox (Oracle)	 * atomic operation and overkill in this particular case. Failing
4268b480SMatthew Wilcox (Oracle)	 * to shuffle a folio marked for immediate reclaim is too mild
4268b480SMatthew Wilcox (Oracle)	 * a gain to justify taking an atomic operation penalty at the
4268b480SMatthew Wilcox (Oracle)	 * end of every folio writeback.
888cf2dbSMel Gorman	 */
4268b480SMatthew Wilcox (Oracle)	if (folio_test_reclaim(folio)) {
4268b480SMatthew Wilcox (Oracle)		folio_clear_reclaim(folio);
575ced1cSMatthew Wilcox (Oracle)		folio_rotate_reclaimable(folio);
888cf2dbSMel Gorman	}
ac6aadb2SMiklos Szeredi
073861edSHugh Dickins	/*
4268b480SMatthew Wilcox (Oracle)	 * Writeback does not hold a folio reference of its own, relying
073861edSHugh Dickins	 * on truncation to wait for the clearing of PG_writeback.
4268b480SMatthew Wilcox (Oracle)	 * But here we must make sure that the folio is not freed and
4268b480SMatthew Wilcox (Oracle)	 * reused before the folio_wake().
073861edSHugh Dickins	 */
4268b480SMatthew Wilcox (Oracle)	folio_get(folio);
269ccca3SMatthew Wilcox (Oracle)	if (!__folio_end_writeback(folio))
1da177e4SLinus Torvalds		BUG();
ac6aadb2SMiklos Szeredi
4e857c58SPeter Zijlstra	smp_mb__after_atomic();
4268b480SMatthew Wilcox (Oracle)	folio_wake(folio, PG_writeback);
512b7931SLinus Torvalds	acct_reclaim_writeback(folio);
4268b480SMatthew Wilcox (Oracle)	folio_put(folio);
1da177e4SLinus Torvalds}
4268b480SMatthew Wilcox (Oracle)EXPORT_SYMBOL(folio_end_writeback);
1da177e4SLinus Torvalds
485bb99bSRandy Dunlap/**
7c23c782SMatthew Wilcox (Oracle) * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
7c23c782SMatthew Wilcox (Oracle) * @folio: The folio to lock
1da177e4SLinus Torvalds */
7c23c782SMatthew Wilcox (Oracle)void __folio_lock(struct folio *folio)
1da177e4SLinus Torvalds{
101c0bf6SMatthew Wilcox (Oracle)	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
9a1ea439SHugh Dickins				EXCLUSIVE);
1da177e4SLinus Torvalds}
7c23c782SMatthew Wilcox (Oracle)EXPORT_SYMBOL(__folio_lock);
1da177e4SLinus Torvalds
af7f29d9SMatthew Wilcox (Oracle)int __folio_lock_killable(struct folio *folio)
2687a356SMatthew Wilcox{
101c0bf6SMatthew Wilcox (Oracle)	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
9a1ea439SHugh Dickins					EXCLUSIVE);
2687a356SMatthew Wilcox}
af7f29d9SMatthew Wilcox (Oracle)EXPORT_SYMBOL_GPL(__folio_lock_killable);
2687a356SMatthew Wilcox
ffdc8dabSMatthew Wilcox (Oracle)static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
dd3e6d50SJens Axboe{
df4d4f12SMatthew Wilcox (Oracle)	struct wait_queue_head *q = folio_waitqueue(folio);
f32b5dd7SMatthew Wilcox (Oracle)	int ret = 0;
f32b5dd7SMatthew Wilcox (Oracle)
df4d4f12SMatthew Wilcox (Oracle)	wait->folio = folio;
f32b5dd7SMatthew Wilcox (Oracle)	wait->bit_nr = PG_locked;
f32b5dd7SMatthew Wilcox (Oracle)
f32b5dd7SMatthew Wilcox (Oracle)	spin_lock_irq(&q->lock);
f32b5dd7SMatthew Wilcox (Oracle)	__add_wait_queue_entry_tail(q, &wait->wait);
ffdc8dabSMatthew Wilcox (Oracle)	folio_set_waiters(folio);
ffdc8dabSMatthew Wilcox (Oracle)	ret = !folio_trylock(folio);
f32b5dd7SMatthew Wilcox (Oracle)	/*
f32b5dd7SMatthew Wilcox (Oracle)	 * If we were successful now, we know we're still on the
f32b5dd7SMatthew Wilcox (Oracle)	 * waitqueue as we're still under the lock. This means it's
f32b5dd7SMatthew Wilcox (Oracle)	 * safe to remove and return success, we know the callback
f32b5dd7SMatthew Wilcox (Oracle)	 * isn't going to trigger.
f32b5dd7SMatthew Wilcox (Oracle)	 */
f32b5dd7SMatthew Wilcox (Oracle)	if (!ret)
f32b5dd7SMatthew Wilcox (Oracle)		__remove_wait_queue(q, &wait->wait);
f32b5dd7SMatthew Wilcox (Oracle)	else
f32b5dd7SMatthew Wilcox (Oracle)		ret = -EIOCBQUEUED;
f32b5dd7SMatthew Wilcox (Oracle)	spin_unlock_irq(&q->lock);
f32b5dd7SMatthew Wilcox (Oracle)	return ret;
dd3e6d50SJens Axboe}
dd3e6d50SJens Axboe
9a95f3cfSPaul Cassella/*
9a95f3cfSPaul Cassella * Return values:
fdc724d6SSuren Baghdasaryan * 0 - folio is locked.
fdc724d6SSuren Baghdasaryan * non-zero - folio is not locked.
1235ccd0SSuren Baghdasaryan *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
1235ccd0SSuren Baghdasaryan *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
1235ccd0SSuren Baghdasaryan *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
9a95f3cfSPaul Cassella *
fdc724d6SSuren Baghdasaryan * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
1235ccd0SSuren Baghdasaryan * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
9a95f3cfSPaul Cassella */
fdc724d6SSuren Baghdasaryanvm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
d065bd81SMichel Lespinasse{
fdc724d6SSuren Baghdasaryan	unsigned int flags = vmf->flags;
fdc724d6SSuren Baghdasaryan
4064b982SPeter Xu	if (fault_flag_allow_retry_first(flags)) {
37b23e05SKOSAKI Motohiro		/*
1235ccd0SSuren Baghdasaryan		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
1235ccd0SSuren Baghdasaryan		 * released even though returning VM_FAULT_RETRY.
37b23e05SKOSAKI Motohiro		 */
37b23e05SKOSAKI Motohiro		if (flags & FAULT_FLAG_RETRY_NOWAIT)
fdc724d6SSuren Baghdasaryan			return VM_FAULT_RETRY;
37b23e05SKOSAKI Motohiro
1235ccd0SSuren Baghdasaryan		release_fault_lock(vmf);
37b23e05SKOSAKI Motohiro		if (flags & FAULT_FLAG_KILLABLE)
6baa8d60SMatthew Wilcox (Oracle)			folio_wait_locked_killable(folio);
37b23e05SKOSAKI Motohiro		else
6baa8d60SMatthew Wilcox (Oracle)			folio_wait_locked(folio);
fdc724d6SSuren Baghdasaryan		return VM_FAULT_RETRY;
800bca7cSHailong Liu	}
37b23e05SKOSAKI Motohiro	if (flags & FAULT_FLAG_KILLABLE) {
9138e47eSMatthew Wilcox (Oracle)		bool ret;
37b23e05SKOSAKI Motohiro
af7f29d9SMatthew Wilcox (Oracle)		ret = __folio_lock_killable(folio);
37b23e05SKOSAKI Motohiro		if (ret) {
1235ccd0SSuren Baghdasaryan			release_fault_lock(vmf);
fdc724d6SSuren Baghdasaryan			return VM_FAULT_RETRY;
37b23e05SKOSAKI Motohiro		}
800bca7cSHailong Liu	} else {
af7f29d9SMatthew Wilcox (Oracle)		__folio_lock(folio);
d065bd81SMichel Lespinasse	}
800bca7cSHailong Liu
fdc724d6SSuren Baghdasaryan	return 0;
d065bd81SMichel Lespinasse}
d065bd81SMichel Lespinasse
485bb99bSRandy Dunlap/**
0d3f9296SMatthew Wilcox * page_cache_next_miss() - Find the next gap in the page cache.
0d3f9296SMatthew Wilcox * @mapping: Mapping.
0d3f9296SMatthew Wilcox * @index: Index.
0d3f9296SMatthew Wilcox * @max_scan: Maximum range to search.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
0d3f9296SMatthew Wilcox * gap with the lowest index.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * This function may be called under the rcu_read_lock.  However, this will
0d3f9296SMatthew Wilcox * not atomically search a snapshot of the cache at a single point in time.
0d3f9296SMatthew Wilcox * For example, if a gap is created at index 5, then subsequently a gap is
0d3f9296SMatthew Wilcox * created at index 10, page_cache_next_miss covering both indices may
0d3f9296SMatthew Wilcox * return 10 if called under the rcu_read_lock.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * Return: The index of the gap if found, otherwise an index outside the
0d3f9296SMatthew Wilcox * range specified (in which case 'return - index >= max_scan' will be true).
16f8eb3eSMike Kravetz * In the rare case of index wrap-around, 0 will be returned.
e7b563bbSJohannes Weiner */
0d3f9296SMatthew Wilcoxpgoff_t page_cache_next_miss(struct address_space *mapping,
e7b563bbSJohannes Weiner			     pgoff_t index, unsigned long max_scan)
e7b563bbSJohannes Weiner{
0d3f9296SMatthew Wilcox	XA_STATE(xas, &mapping->i_pages, index);
e7b563bbSJohannes Weiner
0d3f9296SMatthew Wilcox	while (max_scan--) {
0d3f9296SMatthew Wilcox		void *entry = xas_next(&xas);
0d3f9296SMatthew Wilcox		if (!entry || xa_is_value(entry))
16f8eb3eSMike Kravetz			break;
16f8eb3eSMike Kravetz		if (xas.xa_index == 0)
16f8eb3eSMike Kravetz			break;
e7b563bbSJohannes Weiner	}
e7b563bbSJohannes Weiner
16f8eb3eSMike Kravetz	return xas.xa_index;
e7b563bbSJohannes Weiner}
0d3f9296SMatthew WilcoxEXPORT_SYMBOL(page_cache_next_miss);
e7b563bbSJohannes Weiner
e7b563bbSJohannes Weiner/**
2346a560SLaurent Dufour * page_cache_prev_miss() - Find the previous gap in the page cache.
0d3f9296SMatthew Wilcox * @mapping: Mapping.
0d3f9296SMatthew Wilcox * @index: Index.
0d3f9296SMatthew Wilcox * @max_scan: Maximum range to search.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * Search the range [max(index - max_scan + 1, 0), index] for the
0d3f9296SMatthew Wilcox * gap with the highest index.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * This function may be called under the rcu_read_lock.  However, this will
0d3f9296SMatthew Wilcox * not atomically search a snapshot of the cache at a single point in time.
0d3f9296SMatthew Wilcox * For example, if a gap is created at index 10, then subsequently a gap is
0d3f9296SMatthew Wilcox * created at index 5, page_cache_prev_miss() covering both indices may
0d3f9296SMatthew Wilcox * return 5 if called under the rcu_read_lock.
e7b563bbSJohannes Weiner *
0d3f9296SMatthew Wilcox * Return: The index of the gap if found, otherwise an index outside the
0d3f9296SMatthew Wilcox * range specified (in which case 'index - return >= max_scan' will be true).
16f8eb3eSMike Kravetz * In the rare case of wrap-around, ULONG_MAX will be returned.
e7b563bbSJohannes Weiner */
0d3f9296SMatthew Wilcoxpgoff_t page_cache_prev_miss(struct address_space *mapping,
e7b563bbSJohannes Weiner			     pgoff_t index, unsigned long max_scan)
e7b563bbSJohannes Weiner{
0d3f9296SMatthew Wilcox	XA_STATE(xas, &mapping->i_pages, index);
e7b563bbSJohannes Weiner
0d3f9296SMatthew Wilcox	while (max_scan--) {
0d3f9296SMatthew Wilcox		void *entry = xas_prev(&xas);
0d3f9296SMatthew Wilcox		if (!entry || xa_is_value(entry))
16f8eb3eSMike Kravetz			break;
16f8eb3eSMike Kravetz		if (xas.xa_index == ULONG_MAX)
16f8eb3eSMike Kravetz			break;
e7b563bbSJohannes Weiner	}
e7b563bbSJohannes Weiner
16f8eb3eSMike Kravetz	return xas.xa_index;
e7b563bbSJohannes Weiner}
0d3f9296SMatthew WilcoxEXPORT_SYMBOL(page_cache_prev_miss);
e7b563bbSJohannes Weiner
44835d20SMatthew Wilcox (Oracle)/*
020853b6SMatthew Wilcox (Oracle) * Lockless page cache protocol:
020853b6SMatthew Wilcox (Oracle) * On the lookup side:
020853b6SMatthew Wilcox (Oracle) * 1. Load the folio from i_pages
020853b6SMatthew Wilcox (Oracle) * 2. Increment the refcount if it's not zero
020853b6SMatthew Wilcox (Oracle) * 3. If the folio is not found by xas_reload(), put the refcount and retry
020853b6SMatthew Wilcox (Oracle) *
020853b6SMatthew Wilcox (Oracle) * On the removal side:
020853b6SMatthew Wilcox (Oracle) * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
020853b6SMatthew Wilcox (Oracle) * B. Remove the page from i_pages
020853b6SMatthew Wilcox (Oracle) * C. Return the page to the page allocator
020853b6SMatthew Wilcox (Oracle) *
020853b6SMatthew Wilcox (Oracle) * This means that any page may have its reference count temporarily
020853b6SMatthew Wilcox (Oracle) * increased by a speculative page cache (or fast GUP) lookup as it can
020853b6SMatthew Wilcox (Oracle) * be allocated by another user before the RCU grace period expires.
020853b6SMatthew Wilcox (Oracle) * Because the refcount temporarily acquired here may end up being the
020853b6SMatthew Wilcox (Oracle) * last refcount on the page, any page allocation must be freeable by
020853b6SMatthew Wilcox (Oracle) * folio_put().
020853b6SMatthew Wilcox (Oracle) */
020853b6SMatthew Wilcox (Oracle)
020853b6SMatthew Wilcox (Oracle)/*
263e721eSChristoph Hellwig * filemap_get_entry - Get a page cache entry.
485bb99bSRandy Dunlap * @mapping: the address_space to search
a6de4b48SMatthew Wilcox (Oracle) * @index: The page cache index.
485bb99bSRandy Dunlap *
bca65eeaSMatthew Wilcox (Oracle) * Looks up the page cache entry at @mapping & @index.  If it is a folio,
bca65eeaSMatthew Wilcox (Oracle) * it is returned with an increased refcount.  If it is a shadow entry
bca65eeaSMatthew Wilcox (Oracle) * of a previously evicted folio, or a swap entry from shmem/tmpfs,
bca65eeaSMatthew Wilcox (Oracle) * it is returned without further action.
0cd6144aSJohannes Weiner *
bca65eeaSMatthew Wilcox (Oracle) * Return: The folio, swap or shadow entry, %NULL if nothing is found.
1da177e4SLinus Torvalds */
263e721eSChristoph Hellwigvoid *filemap_get_entry(struct address_space *mapping, pgoff_t index)
1da177e4SLinus Torvalds{
a6de4b48SMatthew Wilcox (Oracle)	XA_STATE(xas, &mapping->i_pages, index);
bca65eeaSMatthew Wilcox (Oracle)	struct folio *folio;
1da177e4SLinus Torvalds
a60637c8SNick Piggin	rcu_read_lock();
a60637c8SNick Pigginrepeat:
4c7472c0SMatthew Wilcox	xas_reset(&xas);
bca65eeaSMatthew Wilcox (Oracle)	folio = xas_load(&xas);
bca65eeaSMatthew Wilcox (Oracle)	if (xas_retry(&xas, folio))
a60637c8SNick Piggin		goto repeat;
8079b1c8SHugh Dickins	/*
4c7472c0SMatthew Wilcox	 * A shadow entry of a recently evicted page, or a swap entry from
4c7472c0SMatthew Wilcox	 * shmem/tmpfs.  Return it without attempting to raise page count.
8079b1c8SHugh Dickins	 */
bca65eeaSMatthew Wilcox (Oracle)	if (!folio || xa_is_value(folio))
8079b1c8SHugh Dickins		goto out;
83929372SKirill A. Shutemov
16380f52SYang Shi	if (!folio_try_get(folio))
a60637c8SNick Piggin		goto repeat;
a60637c8SNick Piggin
bca65eeaSMatthew Wilcox (Oracle)	if (unlikely(folio != xas_reload(&xas))) {
bca65eeaSMatthew Wilcox (Oracle)		folio_put(folio);
a60637c8SNick Piggin		goto repeat;
a60637c8SNick Piggin	}
27d20fddSNick Pigginout:
a60637c8SNick Piggin	rcu_read_unlock();
a60637c8SNick Piggin
bca65eeaSMatthew Wilcox (Oracle)	return folio;
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
485bb99bSRandy Dunlap/**
3f0c6a07SMatthew Wilcox (Oracle) * __filemap_get_folio - Find and get a reference to a folio.
2294b32eSMatthew Wilcox (Oracle) * @mapping: The address_space to search.
2294b32eSMatthew Wilcox (Oracle) * @index: The page index.
3f0c6a07SMatthew Wilcox (Oracle) * @fgp_flags: %FGP flags modify how the folio is returned.
3f0c6a07SMatthew Wilcox (Oracle) * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
0cd6144aSJohannes Weiner *
2294b32eSMatthew Wilcox (Oracle) * Looks up the page cache entry at @mapping & @index.
0cd6144aSJohannes Weiner *
2294b32eSMatthew Wilcox (Oracle) * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
2294b32eSMatthew Wilcox (Oracle) * if the %GFP flags specified for %FGP_CREAT are atomic.
2457aec6SMel Gorman *
ffc143dbSMatthew Wilcox (Oracle) * If this function returns a folio, it is returned with an increased refcount.
a862f68aSMike Rapoport *
66dabbb6SChristoph Hellwig * Return: The found folio or an ERR_PTR() otherwise.
0cd6144aSJohannes Weiner */
3f0c6a07SMatthew Wilcox (Oracle)struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
ffc143dbSMatthew Wilcox (Oracle)		fgf_t fgp_flags, gfp_t gfp)
1da177e4SLinus Torvalds{
3f0c6a07SMatthew Wilcox (Oracle)	struct folio *folio;
2457aec6SMel Gorman
1da177e4SLinus Torvaldsrepeat:
263e721eSChristoph Hellwig	folio = filemap_get_entry(mapping, index);
48c9d113SChristoph Hellwig	if (xa_is_value(folio))
3f0c6a07SMatthew Wilcox (Oracle)		folio = NULL;
3f0c6a07SMatthew Wilcox (Oracle)	if (!folio)
2457aec6SMel Gorman		goto no_page;
2457aec6SMel Gorman
2457aec6SMel Gorman	if (fgp_flags & FGP_LOCK) {
2457aec6SMel Gorman		if (fgp_flags & FGP_NOWAIT) {
3f0c6a07SMatthew Wilcox (Oracle)			if (!folio_trylock(folio)) {
3f0c6a07SMatthew Wilcox (Oracle)				folio_put(folio);
66dabbb6SChristoph Hellwig				return ERR_PTR(-EAGAIN);
2457aec6SMel Gorman			}
2457aec6SMel Gorman		} else {
3f0c6a07SMatthew Wilcox (Oracle)			folio_lock(folio);
2457aec6SMel Gorman		}
2457aec6SMel Gorman
2457aec6SMel Gorman		/* Has the page been truncated? */
3f0c6a07SMatthew Wilcox (Oracle)		if (unlikely(folio->mapping != mapping)) {
3f0c6a07SMatthew Wilcox (Oracle)			folio_unlock(folio);
3f0c6a07SMatthew Wilcox (Oracle)			folio_put(folio);
2457aec6SMel Gorman			goto repeat;
2457aec6SMel Gorman		}
3f0c6a07SMatthew Wilcox (Oracle)		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
2457aec6SMel Gorman	}
2457aec6SMel Gorman
c16eb000SKirill Tkhai	if (fgp_flags & FGP_ACCESSED)
3f0c6a07SMatthew Wilcox (Oracle)		folio_mark_accessed(folio);
b9306a79SYang Shi	else if (fgp_flags & FGP_WRITE) {
b9306a79SYang Shi		/* Clear idle flag for buffer write */
3f0c6a07SMatthew Wilcox (Oracle)		if (folio_test_idle(folio))
3f0c6a07SMatthew Wilcox (Oracle)			folio_clear_idle(folio);
b9306a79SYang Shi	}
2457aec6SMel Gorman
b27652d9SMatthew Wilcox (Oracle)	if (fgp_flags & FGP_STABLE)
b27652d9SMatthew Wilcox (Oracle)		folio_wait_stable(folio);
2457aec6SMel Gormanno_page:
3f0c6a07SMatthew Wilcox (Oracle)	if (!folio && (fgp_flags & FGP_CREAT)) {
4f661701SMatthew Wilcox (Oracle)		unsigned order = FGF_GET_ORDER(fgp_flags);
2457aec6SMel Gorman		int err;
4f661701SMatthew Wilcox (Oracle)
f56753acSChristoph Hellwig		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
3f0c6a07SMatthew Wilcox (Oracle)			gfp |= __GFP_WRITE;
45f87de5SMichal Hocko		if (fgp_flags & FGP_NOFS)
3f0c6a07SMatthew Wilcox (Oracle)			gfp &= ~__GFP_FS;
0dd316baSJens Axboe		if (fgp_flags & FGP_NOWAIT) {
0dd316baSJens Axboe			gfp &= ~GFP_KERNEL;
0dd316baSJens Axboe			gfp |= GFP_NOWAIT | __GFP_NOWARN;
0dd316baSJens Axboe		}
a75d4c33SJosef Bacik		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
2457aec6SMel Gorman			fgp_flags |= FGP_LOCK;
2457aec6SMel Gorman
4f661701SMatthew Wilcox (Oracle)		if (!mapping_large_folio_support(mapping))
4f661701SMatthew Wilcox (Oracle)			order = 0;
4f661701SMatthew Wilcox (Oracle)		if (order > MAX_PAGECACHE_ORDER)
4f661701SMatthew Wilcox (Oracle)			order = MAX_PAGECACHE_ORDER;
4f661701SMatthew Wilcox (Oracle)		/* If we're not aligned, allocate a smaller folio */
4f661701SMatthew Wilcox (Oracle)		if (index & ((1UL << order) - 1))
4f661701SMatthew Wilcox (Oracle)			order = __ffs(index);
4f661701SMatthew Wilcox (Oracle)
4f661701SMatthew Wilcox (Oracle)		do {
4f661701SMatthew Wilcox (Oracle)			gfp_t alloc_gfp = gfp;
4f661701SMatthew Wilcox (Oracle)
4f661701SMatthew Wilcox (Oracle)			err = -ENOMEM;
4f661701SMatthew Wilcox (Oracle)			if (order > 0)
4f661701SMatthew Wilcox (Oracle)				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
4f661701SMatthew Wilcox (Oracle)			folio = filemap_alloc_folio(alloc_gfp, order);
4f661701SMatthew Wilcox (Oracle)			if (!folio)
4f661701SMatthew Wilcox (Oracle)				continue;
4f661701SMatthew Wilcox (Oracle)
eb39d618SHugh Dickins			/* Init accessed so avoid atomic mark_page_accessed later */
2457aec6SMel Gorman			if (fgp_flags & FGP_ACCESSED)
3f0c6a07SMatthew Wilcox (Oracle)				__folio_set_referenced(folio);
2457aec6SMel Gorman
3f0c6a07SMatthew Wilcox (Oracle)			err = filemap_add_folio(mapping, folio, index, gfp);
4f661701SMatthew Wilcox (Oracle)			if (!err)
4f661701SMatthew Wilcox (Oracle)				break;
3f0c6a07SMatthew Wilcox (Oracle)			folio_put(folio);
3f0c6a07SMatthew Wilcox (Oracle)			folio = NULL;
4f661701SMatthew Wilcox (Oracle)		} while (order-- > 0);
4f661701SMatthew Wilcox (Oracle)
eb2be189SNick Piggin		if (err == -EEXIST)
1da177e4SLinus Torvalds			goto repeat;
4f661701SMatthew Wilcox (Oracle)		if (err)
4f661701SMatthew Wilcox (Oracle)			return ERR_PTR(err);
a75d4c33SJosef Bacik		/*
3f0c6a07SMatthew Wilcox (Oracle)		 * filemap_add_folio locks the page, and for mmap
3f0c6a07SMatthew Wilcox (Oracle)		 * we expect an unlocked page.
a75d4c33SJosef Bacik		 */
3f0c6a07SMatthew Wilcox (Oracle)		if (folio && (fgp_flags & FGP_FOR_MMAP))
3f0c6a07SMatthew Wilcox (Oracle)			folio_unlock(folio);
eb2be189SNick Piggin	}
2457aec6SMel Gorman
66dabbb6SChristoph Hellwig	if (!folio)
66dabbb6SChristoph Hellwig		return ERR_PTR(-ENOENT);
3f0c6a07SMatthew Wilcox (Oracle)	return folio;
1da177e4SLinus Torvalds}
3f0c6a07SMatthew Wilcox (Oracle)EXPORT_SYMBOL(__filemap_get_folio);
1da177e4SLinus Torvalds
f5e6429aSMatthew Wilcox (Oracle)static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
c7bad633SMatthew Wilcox (Oracle)		xa_mark_t mark)
c7bad633SMatthew Wilcox (Oracle){
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
c7bad633SMatthew Wilcox (Oracle)
c7bad633SMatthew Wilcox (Oracle)retry:
c7bad633SMatthew Wilcox (Oracle)	if (mark == XA_PRESENT)
f5e6429aSMatthew Wilcox (Oracle)		folio = xas_find(xas, max);
c7bad633SMatthew Wilcox (Oracle)	else
f5e6429aSMatthew Wilcox (Oracle)		folio = xas_find_marked(xas, max, mark);
c7bad633SMatthew Wilcox (Oracle)
f5e6429aSMatthew Wilcox (Oracle)	if (xas_retry(xas, folio))
c7bad633SMatthew Wilcox (Oracle)		goto retry;
c7bad633SMatthew Wilcox (Oracle)	/*
c7bad633SMatthew Wilcox (Oracle)	 * A shadow entry of a recently evicted page, a swap
c7bad633SMatthew Wilcox (Oracle)	 * entry from shmem/tmpfs or a DAX entry.  Return it
c7bad633SMatthew Wilcox (Oracle)	 * without attempting to raise page count.
c7bad633SMatthew Wilcox (Oracle)	 */
f5e6429aSMatthew Wilcox (Oracle)	if (!folio || xa_is_value(folio))
f5e6429aSMatthew Wilcox (Oracle)		return folio;
c7bad633SMatthew Wilcox (Oracle)
16380f52SYang Shi	if (!folio_try_get(folio))
c7bad633SMatthew Wilcox (Oracle)		goto reset;
c7bad633SMatthew Wilcox (Oracle)
f5e6429aSMatthew Wilcox (Oracle)	if (unlikely(folio != xas_reload(xas))) {
f5e6429aSMatthew Wilcox (Oracle)		folio_put(folio);
c7bad633SMatthew Wilcox (Oracle)		goto reset;
c7bad633SMatthew Wilcox (Oracle)	}
c7bad633SMatthew Wilcox (Oracle)
f5e6429aSMatthew Wilcox (Oracle)	return folio;
c7bad633SMatthew Wilcox (Oracle)reset:
c7bad633SMatthew Wilcox (Oracle)	xas_reset(xas);
c7bad633SMatthew Wilcox (Oracle)	goto retry;
c7bad633SMatthew Wilcox (Oracle)}
c7bad633SMatthew Wilcox (Oracle)
1da177e4SLinus Torvalds/**
0cd6144aSJohannes Weiner * find_get_entries - gang pagecache lookup
0cd6144aSJohannes Weiner * @mapping:	The address_space to search
0cd6144aSJohannes Weiner * @start:	The starting page cache index
ca122fe4SMatthew Wilcox (Oracle) * @end:	The final page index (inclusive).
0e499ed3SMatthew Wilcox (Oracle) * @fbatch:	Where the resulting entries are placed.
0cd6144aSJohannes Weiner * @indices:	The cache indices corresponding to the entries in @entries
0cd6144aSJohannes Weiner *
cf2039afSMatthew Wilcox (Oracle) * find_get_entries() will search for and return a batch of entries in
0e499ed3SMatthew Wilcox (Oracle) * the mapping.  The entries are placed in @fbatch.  find_get_entries()
0e499ed3SMatthew Wilcox (Oracle) * takes a reference on any actual folios it returns.
0cd6144aSJohannes Weiner *
0e499ed3SMatthew Wilcox (Oracle) * The entries have ascending indexes.  The indices may not be consecutive
0e499ed3SMatthew Wilcox (Oracle) * due to not-present entries or large folios.
0cd6144aSJohannes Weiner *
0e499ed3SMatthew Wilcox (Oracle) * Any shadow entries of evicted folios, or swap entries from
139b6a6fSJohannes Weiner * shmem/tmpfs, are included in the returned array.
0cd6144aSJohannes Weiner *
0e499ed3SMatthew Wilcox (Oracle) * Return: The number of entries which were found.
0cd6144aSJohannes Weiner */
9fb6beeaSVishal Moola (Oracle)unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
0e499ed3SMatthew Wilcox (Oracle)		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
0cd6144aSJohannes Weiner{
9fb6beeaSVishal Moola (Oracle)	XA_STATE(xas, &mapping->i_pages, *start);
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
0cd6144aSJohannes Weiner
0cd6144aSJohannes Weiner	rcu_read_lock();
f5e6429aSMatthew Wilcox (Oracle)	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
0e499ed3SMatthew Wilcox (Oracle)		indices[fbatch->nr] = xas.xa_index;
0e499ed3SMatthew Wilcox (Oracle)		if (!folio_batch_add(fbatch, folio))
0cd6144aSJohannes Weiner			break;
0cd6144aSJohannes Weiner	}
0cd6144aSJohannes Weiner	rcu_read_unlock();
cf2039afSMatthew Wilcox (Oracle)
9fb6beeaSVishal Moola (Oracle)	if (folio_batch_count(fbatch)) {
9fb6beeaSVishal Moola (Oracle)		unsigned long nr = 1;
9fb6beeaSVishal Moola (Oracle)		int idx = folio_batch_count(fbatch) - 1;
9fb6beeaSVishal Moola (Oracle)
9fb6beeaSVishal Moola (Oracle)		folio = fbatch->folios[idx];
9fb6beeaSVishal Moola (Oracle)		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
9fb6beeaSVishal Moola (Oracle)			nr = folio_nr_pages(folio);
9fb6beeaSVishal Moola (Oracle)		*start = indices[idx] + nr;
9fb6beeaSVishal Moola (Oracle)	}
0e499ed3SMatthew Wilcox (Oracle)	return folio_batch_count(fbatch);
0cd6144aSJohannes Weiner}
0cd6144aSJohannes Weiner
0cd6144aSJohannes Weiner/**
5c211ba2SMatthew Wilcox (Oracle) * find_lock_entries - Find a batch of pagecache entries.
5c211ba2SMatthew Wilcox (Oracle) * @mapping:	The address_space to search.
5c211ba2SMatthew Wilcox (Oracle) * @start:	The starting page cache index.
5c211ba2SMatthew Wilcox (Oracle) * @end:	The final page index (inclusive).
51dcbdacSMatthew Wilcox (Oracle) * @fbatch:	Where the resulting entries are placed.
51dcbdacSMatthew Wilcox (Oracle) * @indices:	The cache indices of the entries in @fbatch.
5c211ba2SMatthew Wilcox (Oracle) *
5c211ba2SMatthew Wilcox (Oracle) * find_lock_entries() will return a batch of entries from @mapping.
f5e6429aSMatthew Wilcox (Oracle) * Swap, shadow and DAX entries are included.  Folios are returned
f5e6429aSMatthew Wilcox (Oracle) * locked and with an incremented refcount.  Folios which are locked
f5e6429aSMatthew Wilcox (Oracle) * by somebody else or under writeback are skipped.  Folios which are
f5e6429aSMatthew Wilcox (Oracle) * partially outside the range are not returned.
5c211ba2SMatthew Wilcox (Oracle) *
5c211ba2SMatthew Wilcox (Oracle) * The entries have ascending indexes.  The indices may not be consecutive
f5e6429aSMatthew Wilcox (Oracle) * due to not-present entries, large folios, folios which could not be
f5e6429aSMatthew Wilcox (Oracle) * locked or folios under writeback.
5c211ba2SMatthew Wilcox (Oracle) *
5c211ba2SMatthew Wilcox (Oracle) * Return: The number of entries which were found.
5c211ba2SMatthew Wilcox (Oracle) */
3392ca12SVishal Moola (Oracle)unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
51dcbdacSMatthew Wilcox (Oracle)		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
5c211ba2SMatthew Wilcox (Oracle){
3392ca12SVishal Moola (Oracle)	XA_STATE(xas, &mapping->i_pages, *start);
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
5c211ba2SMatthew Wilcox (Oracle)
5c211ba2SMatthew Wilcox (Oracle)	rcu_read_lock();
f5e6429aSMatthew Wilcox (Oracle)	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
f5e6429aSMatthew Wilcox (Oracle)		if (!xa_is_value(folio)) {
3392ca12SVishal Moola (Oracle)			if (folio->index < *start)
5c211ba2SMatthew Wilcox (Oracle)				goto put;
87b11f86SSidhartha Kumar			if (folio_next_index(folio) - 1 > end)
5c211ba2SMatthew Wilcox (Oracle)				goto put;
f5e6429aSMatthew Wilcox (Oracle)			if (!folio_trylock(folio))
5c211ba2SMatthew Wilcox (Oracle)				goto put;
f5e6429aSMatthew Wilcox (Oracle)			if (folio->mapping != mapping ||
f5e6429aSMatthew Wilcox (Oracle)			    folio_test_writeback(folio))
5c211ba2SMatthew Wilcox (Oracle)				goto unlock;
f5e6429aSMatthew Wilcox (Oracle)			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
f5e6429aSMatthew Wilcox (Oracle)					folio);
5c211ba2SMatthew Wilcox (Oracle)		}
51dcbdacSMatthew Wilcox (Oracle)		indices[fbatch->nr] = xas.xa_index;
51dcbdacSMatthew Wilcox (Oracle)		if (!folio_batch_add(fbatch, folio))
5c211ba2SMatthew Wilcox (Oracle)			break;
6b24ca4aSMatthew Wilcox (Oracle)		continue;
5c211ba2SMatthew Wilcox (Oracle)unlock:
f5e6429aSMatthew Wilcox (Oracle)		folio_unlock(folio);
5c211ba2SMatthew Wilcox (Oracle)put:
f5e6429aSMatthew Wilcox (Oracle)		folio_put(folio);
5c211ba2SMatthew Wilcox (Oracle)	}
5c211ba2SMatthew Wilcox (Oracle)	rcu_read_unlock();
5c211ba2SMatthew Wilcox (Oracle)
3392ca12SVishal Moola (Oracle)	if (folio_batch_count(fbatch)) {
3392ca12SVishal Moola (Oracle)		unsigned long nr = 1;
3392ca12SVishal Moola (Oracle)		int idx = folio_batch_count(fbatch) - 1;
3392ca12SVishal Moola (Oracle)
3392ca12SVishal Moola (Oracle)		folio = fbatch->folios[idx];
3392ca12SVishal Moola (Oracle)		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
3392ca12SVishal Moola (Oracle)			nr = folio_nr_pages(folio);
3392ca12SVishal Moola (Oracle)		*start = indices[idx] + nr;
3392ca12SVishal Moola (Oracle)	}
51dcbdacSMatthew Wilcox (Oracle)	return folio_batch_count(fbatch);
5c211ba2SMatthew Wilcox (Oracle)}
5c211ba2SMatthew Wilcox (Oracle)
5c211ba2SMatthew Wilcox (Oracle)/**
be0ced5eSMatthew Wilcox (Oracle) * filemap_get_folios - Get a batch of folios
1da177e4SLinus Torvalds * @mapping:	The address_space to search
1da177e4SLinus Torvalds * @start:	The starting page index
b947cee4SJan Kara * @end:	The final page index (inclusive)
be0ced5eSMatthew Wilcox (Oracle) * @fbatch:	The batch to fill.
1da177e4SLinus Torvalds *
be0ced5eSMatthew Wilcox (Oracle) * Search for and return a batch of folios in the mapping starting at
be0ced5eSMatthew Wilcox (Oracle) * index @start and up to index @end (inclusive).  The folios are returned
be0ced5eSMatthew Wilcox (Oracle) * in @fbatch with an elevated reference count.
1da177e4SLinus Torvalds *
be0ced5eSMatthew Wilcox (Oracle) * The first folio may start before @start; if it does, it will contain
be0ced5eSMatthew Wilcox (Oracle) * @start.  The final folio may extend beyond @end; if it does, it will
be0ced5eSMatthew Wilcox (Oracle) * contain @end.  The folios have ascending indices.  There may be gaps
be0ced5eSMatthew Wilcox (Oracle) * between the folios if there are indices which have no folio in the
be0ced5eSMatthew Wilcox (Oracle) * page cache.  If folios are added to or removed from the page cache
be0ced5eSMatthew Wilcox (Oracle) * while this is running, they may or may not be found by this call.
1da177e4SLinus Torvalds *
be0ced5eSMatthew Wilcox (Oracle) * Return: The number of folios which were found.
be0ced5eSMatthew Wilcox (Oracle) * We also update @start to index the next folio for the traversal.
1da177e4SLinus Torvalds */
be0ced5eSMatthew Wilcox (Oracle)unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
be0ced5eSMatthew Wilcox (Oracle)		pgoff_t end, struct folio_batch *fbatch)
1da177e4SLinus Torvalds{
fd1b3ceeSMatthew Wilcox	XA_STATE(xas, &mapping->i_pages, *start);
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
1da177e4SLinus Torvalds
a60637c8SNick Piggin	rcu_read_lock();
be0ced5eSMatthew Wilcox (Oracle)	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
fd1b3ceeSMatthew Wilcox		/* Skip over shadow, swap and DAX entries */
f5e6429aSMatthew Wilcox (Oracle)		if (xa_is_value(folio))
2cf938aaSMatthew Wilcox			continue;
be0ced5eSMatthew Wilcox (Oracle)		if (!folio_batch_add(fbatch, folio)) {
be0ced5eSMatthew Wilcox (Oracle)			unsigned long nr = folio_nr_pages(folio);
a60637c8SNick Piggin
be0ced5eSMatthew Wilcox (Oracle)			if (folio_test_hugetlb(folio))
be0ced5eSMatthew Wilcox (Oracle)				nr = 1;
be0ced5eSMatthew Wilcox (Oracle)			*start = folio->index + nr;
b947cee4SJan Kara			goto out;
b947cee4SJan Kara		}
a60637c8SNick Piggin	}
5b280c0cSHugh Dickins
b947cee4SJan Kara	/*
b947cee4SJan Kara	 * We come here when there is no page beyond @end. We take care to not
b947cee4SJan Kara	 * overflow the index @start as it confuses some of the callers. This
fd1b3ceeSMatthew Wilcox	 * breaks the iteration when there is a page at index -1 but that is
b947cee4SJan Kara	 * already broken anyway.
b947cee4SJan Kara	 */
b947cee4SJan Kara	if (end == (pgoff_t)-1)
b947cee4SJan Kara		*start = (pgoff_t)-1;
b947cee4SJan Kara	else
b947cee4SJan Kara		*start = end + 1;
b947cee4SJan Karaout:
a60637c8SNick Piggin	rcu_read_unlock();
d72dc8a2SJan Kara
be0ced5eSMatthew Wilcox (Oracle)	return folio_batch_count(fbatch);
be0ced5eSMatthew Wilcox (Oracle)}
be0ced5eSMatthew Wilcox (Oracle)EXPORT_SYMBOL(filemap_get_folios);
be0ced5eSMatthew Wilcox (Oracle)
ebf43500SJens Axboe/**
35b47146SVishal Moola (Oracle) * filemap_get_folios_contig - Get a batch of contiguous folios
ebf43500SJens Axboe * @mapping:	The address_space to search
35b47146SVishal Moola (Oracle) * @start:	The starting page index
35b47146SVishal Moola (Oracle) * @end:	The final page index (inclusive)
35b47146SVishal Moola (Oracle) * @fbatch:	The batch to fill
ebf43500SJens Axboe *
35b47146SVishal Moola (Oracle) * filemap_get_folios_contig() works exactly like filemap_get_folios(),
35b47146SVishal Moola (Oracle) * except the returned folios are guaranteed to be contiguous. This may
35b47146SVishal Moola (Oracle) * not return all contiguous folios if the batch gets filled up.
ebf43500SJens Axboe *
35b47146SVishal Moola (Oracle) * Return: The number of folios found.
35b47146SVishal Moola (Oracle) * Also update @start to be positioned for traversal of the next folio.
ebf43500SJens Axboe */
0fc9d104SKonstantin Khlebnikov
35b47146SVishal Moola (Oracle)unsigned filemap_get_folios_contig(struct address_space *mapping,
35b47146SVishal Moola (Oracle)		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
35b47146SVishal Moola (Oracle){
35b47146SVishal Moola (Oracle)	XA_STATE(xas, &mapping->i_pages, *start);
35b47146SVishal Moola (Oracle)	unsigned long nr;
35b47146SVishal Moola (Oracle)	struct folio *folio;
ebf43500SJens Axboe
a60637c8SNick Piggin	rcu_read_lock();
35b47146SVishal Moola (Oracle)
35b47146SVishal Moola (Oracle)	for (folio = xas_load(&xas); folio && xas.xa_index <= end;
35b47146SVishal Moola (Oracle)			folio = xas_next(&xas)) {
e1c37722SMatthew Wilcox (Oracle)		if (xas_retry(&xas, folio))
2cf938aaSMatthew Wilcox			continue;
8079b1c8SHugh Dickins		/*
3ece58a2SMatthew Wilcox		 * If the entry has been swapped out, we can stop looking.
3ece58a2SMatthew Wilcox		 * No current caller is looking for DAX entries.
8079b1c8SHugh Dickins		 */
e1c37722SMatthew Wilcox (Oracle)		if (xa_is_value(folio))
35b47146SVishal Moola (Oracle)			goto update_start;
a60637c8SNick Piggin
16380f52SYang Shi		if (!folio_try_get(folio))
3ece58a2SMatthew Wilcox			goto retry;
a60637c8SNick Piggin
e1c37722SMatthew Wilcox (Oracle)		if (unlikely(folio != xas_reload(&xas)))
35b47146SVishal Moola (Oracle)			goto put_folio;
a60637c8SNick Piggin
35b47146SVishal Moola (Oracle)		if (!folio_batch_add(fbatch, folio)) {
35b47146SVishal Moola (Oracle)			nr = folio_nr_pages(folio);
35b47146SVishal Moola (Oracle)
35b47146SVishal Moola (Oracle)			if (folio_test_hugetlb(folio))
35b47146SVishal Moola (Oracle)				nr = 1;
35b47146SVishal Moola (Oracle)			*start = folio->index + nr;
35b47146SVishal Moola (Oracle)			goto out;
6b24ca4aSMatthew Wilcox (Oracle)		}
3ece58a2SMatthew Wilcox		continue;
35b47146SVishal Moola (Oracle)put_folio:
e1c37722SMatthew Wilcox (Oracle)		folio_put(folio);
35b47146SVishal Moola (Oracle)
3ece58a2SMatthew Wilcoxretry:
3ece58a2SMatthew Wilcox		xas_reset(&xas);
ebf43500SJens Axboe	}
35b47146SVishal Moola (Oracle)
35b47146SVishal Moola (Oracle)update_start:
35b47146SVishal Moola (Oracle)	nr = folio_batch_count(fbatch);
35b47146SVishal Moola (Oracle)
35b47146SVishal Moola (Oracle)	if (nr) {
35b47146SVishal Moola (Oracle)		folio = fbatch->folios[nr - 1];
35b47146SVishal Moola (Oracle)		if (folio_test_hugetlb(folio))
35b47146SVishal Moola (Oracle)			*start = folio->index + 1;
35b47146SVishal Moola (Oracle)		else
87b11f86SSidhartha Kumar			*start = folio_next_index(folio);
ebf43500SJens Axboe	}
35b47146SVishal Moola (Oracle)out:
35b47146SVishal Moola (Oracle)	rcu_read_unlock();
35b47146SVishal Moola (Oracle)	return folio_batch_count(fbatch);
35b47146SVishal Moola (Oracle)}
35b47146SVishal Moola (Oracle)EXPORT_SYMBOL(filemap_get_folios_contig);
ebf43500SJens Axboe
485bb99bSRandy Dunlap/**
247f9e1fSVishal Moola (Oracle) * filemap_get_folios_tag - Get a batch of folios matching @tag
247f9e1fSVishal Moola (Oracle) * @mapping:    The address_space to search
247f9e1fSVishal Moola (Oracle) * @start:      The starting page index
72b045aeSJan Kara * @end:        The final page index (inclusive)
247f9e1fSVishal Moola (Oracle) * @tag:        The tag index
247f9e1fSVishal Moola (Oracle) * @fbatch:     The batch to fill
485bb99bSRandy Dunlap *
247f9e1fSVishal Moola (Oracle) * Same as filemap_get_folios(), but only returning folios tagged with @tag.
a862f68aSMike Rapoport *
247f9e1fSVishal Moola (Oracle) * Return: The number of folios found.
247f9e1fSVishal Moola (Oracle) * Also update @start to index the next folio for traversal.
1da177e4SLinus Torvalds */
247f9e1fSVishal Moola (Oracle)unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
247f9e1fSVishal Moola (Oracle)			pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
1da177e4SLinus Torvalds{
247f9e1fSVishal Moola (Oracle)	XA_STATE(xas, &mapping->i_pages, *start);
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
1da177e4SLinus Torvalds
a60637c8SNick Piggin	rcu_read_lock();
247f9e1fSVishal Moola (Oracle)	while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
8079b1c8SHugh Dickins		/*
a6906972SMatthew Wilcox		 * Shadow entries should never be tagged, but this iteration
a6906972SMatthew Wilcox		 * is lockless so there is a window for page reclaim to evict
a6906972SMatthew Wilcox		 * a page we saw tagged. Skip over it.
8079b1c8SHugh Dickins		 */
f5e6429aSMatthew Wilcox (Oracle)		if (xa_is_value(folio))
139b6a6fSJohannes Weiner			continue;
247f9e1fSVishal Moola (Oracle)		if (!folio_batch_add(fbatch, folio)) {
247f9e1fSVishal Moola (Oracle)			unsigned long nr = folio_nr_pages(folio);
a60637c8SNick Piggin
247f9e1fSVishal Moola (Oracle)			if (folio_test_hugetlb(folio))
247f9e1fSVishal Moola (Oracle)				nr = 1;
247f9e1fSVishal Moola (Oracle)			*start = folio->index + nr;
72b045aeSJan Kara			goto out;
72b045aeSJan Kara		}
a60637c8SNick Piggin	}
72b045aeSJan Kara	/*
247f9e1fSVishal Moola (Oracle)	 * We come here when there is no page beyond @end. We take care to not
247f9e1fSVishal Moola (Oracle)	 * overflow the index @start as it confuses some of the callers. This
247f9e1fSVishal Moola (Oracle)	 * breaks the iteration when there is a page at index -1 but that is
247f9e1fSVishal Moola (Oracle)	 * already broke anyway.
72b045aeSJan Kara	 */
72b045aeSJan Kara	if (end == (pgoff_t)-1)
247f9e1fSVishal Moola (Oracle)		*start = (pgoff_t)-1;
72b045aeSJan Kara	else
247f9e1fSVishal Moola (Oracle)		*start = end + 1;
72b045aeSJan Karaout:
a60637c8SNick Piggin	rcu_read_unlock();
a60637c8SNick Piggin
247f9e1fSVishal Moola (Oracle)	return folio_batch_count(fbatch);
1da177e4SLinus Torvalds}
247f9e1fSVishal Moola (Oracle)EXPORT_SYMBOL(filemap_get_folios_tag);
1da177e4SLinus Torvalds
76d42bd9SWu Fengguang/*
76d42bd9SWu Fengguang * CD/DVDs are error prone. When a medium error occurs, the driver may fail
76d42bd9SWu Fengguang * a _large_ part of the i/o request. Imagine the worst scenario:
76d42bd9SWu Fengguang *
76d42bd9SWu Fengguang *      ---R__________________________________________B__________
76d42bd9SWu Fengguang *         ^ reading here                             ^ bad block(assume 4k)
76d42bd9SWu Fengguang *
76d42bd9SWu Fengguang * read(R) => miss => readahead(R...B) => media error => frustrating retries
76d42bd9SWu Fengguang * => failing the whole request => read(R) => read(R+1) =>
76d42bd9SWu Fengguang * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
76d42bd9SWu Fengguang * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
76d42bd9SWu Fengguang * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
76d42bd9SWu Fengguang *
76d42bd9SWu Fengguang * It is going insane. Fix it by quickly scaling down the readahead size.
76d42bd9SWu Fengguang */
0f8e2db4SSouptick Joarderstatic void shrink_readahead_size_eio(struct file_ra_state *ra)
76d42bd9SWu Fengguang{
76d42bd9SWu Fengguang	ra->ra_pages /= 4;
76d42bd9SWu Fengguang}
76d42bd9SWu Fengguang
cbd59c48SMatthew Wilcox (Oracle)/*
25d6a23eSMatthew Wilcox (Oracle) * filemap_get_read_batch - Get a batch of folios for read
cbd59c48SMatthew Wilcox (Oracle) *
25d6a23eSMatthew Wilcox (Oracle) * Get a batch of folios which represent a contiguous range of bytes in
25d6a23eSMatthew Wilcox (Oracle) * the file.  No exceptional entries will be returned.  If @index is in
25d6a23eSMatthew Wilcox (Oracle) * the middle of a folio, the entire folio will be returned.  The last
25d6a23eSMatthew Wilcox (Oracle) * folio in the batch may have the readahead flag set or the uptodate flag
25d6a23eSMatthew Wilcox (Oracle) * clear so that the caller can take the appropriate action.
cbd59c48SMatthew Wilcox (Oracle) */
cbd59c48SMatthew Wilcox (Oracle)static void filemap_get_read_batch(struct address_space *mapping,
25d6a23eSMatthew Wilcox (Oracle)		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
cbd59c48SMatthew Wilcox (Oracle){
cbd59c48SMatthew Wilcox (Oracle)	XA_STATE(xas, &mapping->i_pages, index);
bdb72932SMatthew Wilcox (Oracle)	struct folio *folio;
cbd59c48SMatthew Wilcox (Oracle)
cbd59c48SMatthew Wilcox (Oracle)	rcu_read_lock();
bdb72932SMatthew Wilcox (Oracle)	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
bdb72932SMatthew Wilcox (Oracle)		if (xas_retry(&xas, folio))
cbd59c48SMatthew Wilcox (Oracle)			continue;
bdb72932SMatthew Wilcox (Oracle)		if (xas.xa_index > max || xa_is_value(folio))
cbd59c48SMatthew Wilcox (Oracle)			break;
cb995f4eSMatthew Wilcox (Oracle)		if (xa_is_sibling(folio))
cb995f4eSMatthew Wilcox (Oracle)			break;
16380f52SYang Shi		if (!folio_try_get(folio))
cbd59c48SMatthew Wilcox (Oracle)			goto retry;
cbd59c48SMatthew Wilcox (Oracle)
bdb72932SMatthew Wilcox (Oracle)		if (unlikely(folio != xas_reload(&xas)))
25d6a23eSMatthew Wilcox (Oracle)			goto put_folio;
cbd59c48SMatthew Wilcox (Oracle)
25d6a23eSMatthew Wilcox (Oracle)		if (!folio_batch_add(fbatch, folio))
cbd59c48SMatthew Wilcox (Oracle)			break;
bdb72932SMatthew Wilcox (Oracle)		if (!folio_test_uptodate(folio))
cbd59c48SMatthew Wilcox (Oracle)			break;
bdb72932SMatthew Wilcox (Oracle)		if (folio_test_readahead(folio))
cbd59c48SMatthew Wilcox (Oracle)			break;
87b11f86SSidhartha Kumar		xas_advance(&xas, folio_next_index(folio) - 1);
cbd59c48SMatthew Wilcox (Oracle)		continue;
25d6a23eSMatthew Wilcox (Oracle)put_folio:
bdb72932SMatthew Wilcox (Oracle)		folio_put(folio);
cbd59c48SMatthew Wilcox (Oracle)retry:
cbd59c48SMatthew Wilcox (Oracle)		xas_reset(&xas);
cbd59c48SMatthew Wilcox (Oracle)	}
cbd59c48SMatthew Wilcox (Oracle)	rcu_read_unlock();
cbd59c48SMatthew Wilcox (Oracle)}
cbd59c48SMatthew Wilcox (Oracle)
290e1a32SMatthew Wilcox (Oracle)static int filemap_read_folio(struct file *file, filler_t filler,
9d427b4eSMatthew Wilcox (Oracle)		struct folio *folio)
723ef24bSKent Overstreet{
17604240SChristoph Hellwig	bool workingset = folio_test_workingset(folio);
17604240SChristoph Hellwig	unsigned long pflags;
723ef24bSKent Overstreet	int error;
723ef24bSKent Overstreet
723ef24bSKent Overstreet	/*
68430303SMatthew Wilcox (Oracle)	 * A previous I/O error may have been due to temporary failures,
7e0a1265SMatthew Wilcox (Oracle)	 * eg. multipath errors.  PG_error will be set again if read_folio
68430303SMatthew Wilcox (Oracle)	 * fails.
723ef24bSKent Overstreet	 */
9d427b4eSMatthew Wilcox (Oracle)	folio_clear_error(folio);
17604240SChristoph Hellwig
723ef24bSKent Overstreet	/* Start the actual read. The read will unlock the page. */
17604240SChristoph Hellwig	if (unlikely(workingset))
17604240SChristoph Hellwig		psi_memstall_enter(&pflags);
290e1a32SMatthew Wilcox (Oracle)	error = filler(file, folio);
17604240SChristoph Hellwig	if (unlikely(workingset))
17604240SChristoph Hellwig		psi_memstall_leave(&pflags);
68430303SMatthew Wilcox (Oracle)	if (error)
68430303SMatthew Wilcox (Oracle)		return error;
723ef24bSKent Overstreet
9d427b4eSMatthew Wilcox (Oracle)	error = folio_wait_locked_killable(folio);
68430303SMatthew Wilcox (Oracle)	if (error)
68430303SMatthew Wilcox (Oracle)		return error;
9d427b4eSMatthew Wilcox (Oracle)	if (folio_test_uptodate(folio))
aa1ec2f6SMatthew Wilcox (Oracle)		return 0;
290e1a32SMatthew Wilcox (Oracle)	if (file)
68430303SMatthew Wilcox (Oracle)		shrink_readahead_size_eio(&file->f_ra);
aa1ec2f6SMatthew Wilcox (Oracle)	return -EIO;
723ef24bSKent Overstreet}
723ef24bSKent Overstreet
fce70da3SMatthew Wilcox (Oracle)static bool filemap_range_uptodate(struct address_space *mapping,
dd5b9d00SDavid Howells		loff_t pos, size_t count, struct folio *folio,
dd5b9d00SDavid Howells		bool need_uptodate)
fce70da3SMatthew Wilcox (Oracle){
2fa4eeb8SMatthew Wilcox (Oracle)	if (folio_test_uptodate(folio))
fce70da3SMatthew Wilcox (Oracle)		return true;
fce70da3SMatthew Wilcox (Oracle)	/* pipes can't handle partially uptodate pages */
dd5b9d00SDavid Howells	if (need_uptodate)
fce70da3SMatthew Wilcox (Oracle)		return false;
fce70da3SMatthew Wilcox (Oracle)	if (!mapping->a_ops->is_partially_uptodate)
fce70da3SMatthew Wilcox (Oracle)		return false;
2fa4eeb8SMatthew Wilcox (Oracle)	if (mapping->host->i_blkbits >= folio_shift(folio))
fce70da3SMatthew Wilcox (Oracle)		return false;
fce70da3SMatthew Wilcox (Oracle)
2fa4eeb8SMatthew Wilcox (Oracle)	if (folio_pos(folio) > pos) {
2fa4eeb8SMatthew Wilcox (Oracle)		count -= folio_pos(folio) - pos;
fce70da3SMatthew Wilcox (Oracle)		pos = 0;
fce70da3SMatthew Wilcox (Oracle)	} else {
2fa4eeb8SMatthew Wilcox (Oracle)		pos -= folio_pos(folio);
fce70da3SMatthew Wilcox (Oracle)	}
fce70da3SMatthew Wilcox (Oracle)
2e7e80f7SMatthew Wilcox (Oracle)	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
fce70da3SMatthew Wilcox (Oracle)}
fce70da3SMatthew Wilcox (Oracle)
/*
 * filemap_update_page - try to make a cached, not-uptodate folio readable
 * @iocb: supplies ki_flags, ki_pos, ki_filp and (for IOCB_WAITQ) ki_waitq
 * @mapping: address space the folio was looked up in
 * @count: number of bytes wanted starting at iocb->ki_pos
 * @folio: folio to update
 * @need_uptodate: passed to filemap_range_uptodate()
 *
 * Takes the mapping's invalidate lock (shared) and the folio lock, then
 * either finds the wanted range already uptodate or starts a read with
 * filemap_read_folio().  The invalidate lock is released on every return
 * path.  The folio is unlocked here unless the read was started, in which
 * case this function does not call folio_unlock() itself.
 *
 * Return: 0 if the range is uptodate (or the read succeeded); -EAGAIN if a
 * lock could not be taken, or a read would be needed, and the iocb flags
 * forbid waiting or I/O; AOP_TRUNCATED_PAGE if the caller should look the
 * folio up again; otherwise the error from __folio_lock_async() or
 * filemap_read_folio().
 */
4612aeefSMatthew Wilcox (Oracle)static int filemap_update_page(struct kiocb *iocb,
dd5b9d00SDavid Howells		struct address_space *mapping, size_t count,
dd5b9d00SDavid Howells		struct folio *folio, bool need_uptodate)
723ef24bSKent Overstreet{
723ef24bSKent Overstreet	int error;
723ef24bSKent Overstreet
	/* A NOWAIT request only tries the invalidate lock, never sleeps on it. */
730633f0SJan Kara	if (iocb->ki_flags & IOCB_NOWAIT) {
730633f0SJan Kara		if (!filemap_invalidate_trylock_shared(mapping))
87d1d7b6SMatthew Wilcox (Oracle)			return -EAGAIN;
730633f0SJan Kara	} else {
730633f0SJan Kara		filemap_invalidate_lock_shared(mapping);
730633f0SJan Kara	}
730633f0SJan Kara
	/* The folio lock is contended: how we wait depends on the iocb flags. */
ffdc8dabSMatthew Wilcox (Oracle)	if (!folio_trylock(folio)) {
730633f0SJan Kara		error = -EAGAIN;
730633f0SJan Kara		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
730633f0SJan Kara			goto unlock_mapping;
87d1d7b6SMatthew Wilcox (Oracle)		if (!(iocb->ki_flags & IOCB_WAITQ)) {
730633f0SJan Kara			filemap_invalidate_unlock_shared(mapping);
9f2b04a2SMatthew Wilcox (Oracle)			/*
9f2b04a2SMatthew Wilcox (Oracle)			 * This is where we usually end up waiting for a
9f2b04a2SMatthew Wilcox (Oracle)			 * previously submitted readahead to finish.
9f2b04a2SMatthew Wilcox (Oracle)			 */
			/*
			 * NOTE(review): judging by its name,
			 * folio_put_wait_locked() also drops our folio
			 * reference, which would match the other
			 * AOP_TRUNCATED_PAGE returns below - confirm.
			 */
9f2b04a2SMatthew Wilcox (Oracle)			folio_put_wait_locked(folio, TASK_KILLABLE);
4612aeefSMatthew Wilcox (Oracle)			return AOP_TRUNCATED_PAGE;
bd8a1f36SMatthew Wilcox (Oracle)		}
		/* IOCB_WAITQ: take the lock through the iocb's wait queue. */
ffdc8dabSMatthew Wilcox (Oracle)		error = __folio_lock_async(folio, iocb->ki_waitq);
87d1d7b6SMatthew Wilcox (Oracle)		if (error)
730633f0SJan Kara			goto unlock_mapping;
bd8a1f36SMatthew Wilcox (Oracle)	}
723ef24bSKent Overstreet
	/*
	 * The folio no longer belongs to a mapping (presumably it was
	 * truncated or invalidated meanwhile): have the caller retry.
	 */
730633f0SJan Kara	error = AOP_TRUNCATED_PAGE;
ffdc8dabSMatthew Wilcox (Oracle)	if (!folio->mapping)
730633f0SJan Kara		goto unlock;
723ef24bSKent Overstreet
	/* Nothing to do if the wanted range is already uptodate. */
fce70da3SMatthew Wilcox (Oracle)	error = 0;
dd5b9d00SDavid Howells	if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
dd5b9d00SDavid Howells				   need_uptodate))
fce70da3SMatthew Wilcox (Oracle)		goto unlock;
fce70da3SMatthew Wilcox (Oracle)
	/* A read is needed, but these flags do not allow doing it here. */
fce70da3SMatthew Wilcox (Oracle)	error = -EAGAIN;
fce70da3SMatthew Wilcox (Oracle)	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
fce70da3SMatthew Wilcox (Oracle)		goto unlock;
fce70da3SMatthew Wilcox (Oracle)
290e1a32SMatthew Wilcox (Oracle)	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
290e1a32SMatthew Wilcox (Oracle)			folio);
	/* Skip folio_unlock(): filemap_read_folio() notes the read unlocks it. */
730633f0SJan Kara	goto unlock_mapping;
fce70da3SMatthew Wilcox (Oracle)unlock:
ffdc8dabSMatthew Wilcox (Oracle)	folio_unlock(folio);
730633f0SJan Karaunlock_mapping:
730633f0SJan Kara	filemap_invalidate_unlock_shared(mapping);
	/* On AOP_TRUNCATED_PAGE the folio reference is dropped here. */
730633f0SJan Kara	if (error == AOP_TRUNCATED_PAGE)
ffdc8dabSMatthew Wilcox (Oracle)		folio_put(folio);
fce70da3SMatthew Wilcox (Oracle)	return error;
723ef24bSKent Overstreet}
723ef24bSKent Overstreet
/*
 * filemap_create_folio - allocate a folio, add it to the page cache, read it
 * @file: file passed on to filemap_read_folio()
 * @mapping: address space to add the folio to
 * @index: page index at which the folio is inserted
 * @fbatch: batch that receives the folio on success
 *
 * Return: 0 with the new folio appended to @fbatch; -ENOMEM if no folio
 * could be allocated; AOP_TRUNCATED_PAGE if filemap_add_folio() reported
 * -EEXIST; otherwise the error from filemap_add_folio() or
 * filemap_read_folio().  On any failure after allocation the folio
 * reference is dropped.
 */
static int filemap_create_folio(struct file *file,
		struct address_space *mapping, pgoff_t index,
		struct folio_batch *fbatch)
{
	struct folio *new_folio;
	int ret;

	new_folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
	if (!new_folio)
		return -ENOMEM;

	/*
	 * Hold invalidate_lock (shared) against truncate and hole punch, so
	 * that no new page cache folio can be instantiated and brought
	 * uptodate after truncate has evicted the page cache but before the
	 * blocks are actually freed.  The lock could be released once the
	 * folio is in the page cache, because the locked folio would then
	 * be enough to synchronize with hole punching.  However, other
	 * paths - filemap_update_page() filling in partially uptodate
	 * pages, ->readahead() - need invalidate_lock while mapping blocks
	 * for IO, so hold it here as well to keep the locking rules simple.
	 */
	filemap_invalidate_lock_shared(mapping);
	ret = filemap_add_folio(mapping, new_folio, index,
			mapping_gfp_constraint(mapping, GFP_KERNEL));
	if (ret == -EEXIST)
		ret = AOP_TRUNCATED_PAGE;
	if (!ret)
		ret = filemap_read_folio(file, mapping->a_ops->read_folio,
				new_folio);
	filemap_invalidate_unlock_shared(mapping);

	if (ret) {
		folio_put(new_folio);
		return ret;
	}

	folio_batch_add(fbatch, new_folio);
	return 0;
}
723ef24bSKent Overstreet
5963fe03SMatthew Wilcox (Oracle)static int filemap_readahead(struct kiocb *iocb, struct file *file,
65bca53bSMatthew Wilcox (Oracle)		struct address_space *mapping, struct folio *folio,
5963fe03SMatthew Wilcox (Oracle)		pgoff_t last_index)
5963fe03SMatthew Wilcox (Oracle){
65bca53bSMatthew Wilcox (Oracle)	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
65bca53bSMatthew Wilcox (Oracle)
5963fe03SMatthew Wilcox (Oracle)	if (iocb->ki_flags & IOCB_NOIO)
5963fe03SMatthew Wilcox (Oracle)		return -EAGAIN;
65bca53bSMatthew Wilcox (Oracle)	page_cache_async_ra(&ractl, folio, last_index - folio->index);
5963fe03SMatthew Wilcox (Oracle)	return 0;
5963fe03SMatthew Wilcox (Oracle)}
5963fe03SMatthew Wilcox (Oracle)
/*
 * filemap_get_pages - fill @fbatch with folios for the start of a read
 * @iocb: supplies the file, the read position (ki_pos) and ki_flags
 * @count: number of bytes wanted starting at iocb->ki_pos
 * @fbatch: batch to fill
 * @need_uptodate: passed through to filemap_update_page()
 *
 * Looks the range up in the page cache.  If nothing is found, synchronous
 * readahead is tried and the lookup repeated; if there is still nothing, a
 * single folio is created and read with filemap_create_folio().  If the
 * last folio of a non-empty batch has the readahead flag set, asynchronous
 * readahead is started; if it is not uptodate, filemap_update_page() is
 * called on it.
 *
 * Return: 0 with at least one folio in @fbatch.  Otherwise -EINTR if a
 * fatal signal is pending, -EAGAIN if the iocb flags forbid the I/O or
 * waiting that would be needed, or the error from the helpers when no
 * folio can be returned.
 */
dd5b9d00SDavid Howellsstatic int filemap_get_pages(struct kiocb *iocb, size_t count,
dd5b9d00SDavid Howells		struct folio_batch *fbatch, bool need_uptodate)
06c04442SKent Overstreet{
06c04442SKent Overstreet	struct file *filp = iocb->ki_filp;
06c04442SKent Overstreet	struct address_space *mapping = filp->f_mapping;
06c04442SKent Overstreet	struct file_ra_state *ra = &filp->f_ra;
06c04442SKent Overstreet	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
cbd59c48SMatthew Wilcox (Oracle)	pgoff_t last_index;
65bca53bSMatthew Wilcox (Oracle)	struct folio *folio;
cbd59c48SMatthew Wilcox (Oracle)	int err = 0;
06c04442SKent Overstreet
5956592cSQian Yingjin	/* "last_index" is the index of the page beyond the end of the read */
dd5b9d00SDavid Howells	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
2642fca6SMatthew Wilcox (Oracle)retry:
06c04442SKent Overstreet	if (fatal_signal_pending(current))
06c04442SKent Overstreet		return -EINTR;
06c04442SKent Overstreet
	/* The lookup takes an inclusive upper bound, hence last_index - 1. */
5956592cSQian Yingjin	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
25d6a23eSMatthew Wilcox (Oracle)	if (!folio_batch_count(fbatch)) {
06c04442SKent Overstreet		if (iocb->ki_flags & IOCB_NOIO)
06c04442SKent Overstreet			return -EAGAIN;
		/* Nothing cached: try synchronous readahead and look again. */
2642fca6SMatthew Wilcox (Oracle)		page_cache_sync_readahead(mapping, ra, filp, index,
2642fca6SMatthew Wilcox (Oracle)				last_index - index);
5956592cSQian Yingjin		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2642fca6SMatthew Wilcox (Oracle)	}
25d6a23eSMatthew Wilcox (Oracle)	if (!folio_batch_count(fbatch)) {
f253e185SMatthew Wilcox (Oracle)		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
f253e185SMatthew Wilcox (Oracle)			return -EAGAIN;
		/* Still nothing: create and read one folio ourselves. */
a5d4ad09SMatthew Wilcox (Oracle)		err = filemap_create_folio(filp, mapping,
25d6a23eSMatthew Wilcox (Oracle)				iocb->ki_pos >> PAGE_SHIFT, fbatch);
f253e185SMatthew Wilcox (Oracle)		if (err == AOP_TRUNCATED_PAGE)
2642fca6SMatthew Wilcox (Oracle)			goto retry;
f253e185SMatthew Wilcox (Oracle)		return err;
f253e185SMatthew Wilcox (Oracle)	}
06c04442SKent Overstreet
	/*
	 * Only the last folio of the batch can have the readahead flag set
	 * or be !uptodate (see filemap_get_read_batch()).
	 */
25d6a23eSMatthew Wilcox (Oracle)	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
65bca53bSMatthew Wilcox (Oracle)	if (folio_test_readahead(folio)) {
65bca53bSMatthew Wilcox (Oracle)		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
2642fca6SMatthew Wilcox (Oracle)		if (err)
cbd59c48SMatthew Wilcox (Oracle)			goto err;
06c04442SKent Overstreet	}
65bca53bSMatthew Wilcox (Oracle)	if (!folio_test_uptodate(folio)) {
		/*
		 * With earlier folios already in the batch, set NOWAIT so
		 * that filemap_update_page() returns -EAGAIN instead of
		 * waiting on the last folio.
		 */
25d6a23eSMatthew Wilcox (Oracle)		if ((iocb->ki_flags & IOCB_WAITQ) &&
25d6a23eSMatthew Wilcox (Oracle)		    folio_batch_count(fbatch) > 1)
87d1d7b6SMatthew Wilcox (Oracle)			iocb->ki_flags |= IOCB_NOWAIT;
dd5b9d00SDavid Howells		err = filemap_update_page(iocb, mapping, count, folio,
dd5b9d00SDavid Howells					  need_uptodate);
2642fca6SMatthew Wilcox (Oracle)		if (err)
2642fca6SMatthew Wilcox (Oracle)			goto err;
06c04442SKent Overstreet	}
06c04442SKent Overstreet
2642fca6SMatthew Wilcox (Oracle)	return 0;
cbd59c48SMatthew Wilcox (Oracle)err:
	/*
	 * The last folio is removed from the batch.  For a negative error
	 * we still hold its reference and drop it here; for
	 * AOP_TRUNCATED_PAGE, filemap_update_page() already dropped it.
	 */
2642fca6SMatthew Wilcox (Oracle)	if (err < 0)
65bca53bSMatthew Wilcox (Oracle)		folio_put(folio);
	/* If earlier folios remain, return them and hide the error. */
25d6a23eSMatthew Wilcox (Oracle)	if (likely(--fbatch->nr))
ff993ba1SMatthew Wilcox (Oracle)		return 0;
4612aeefSMatthew Wilcox (Oracle)	if (err == AOP_TRUNCATED_PAGE)
2642fca6SMatthew Wilcox (Oracle)		goto retry;
06c04442SKent Overstreet	return err;
06c04442SKent Overstreet}
06c04442SKent Overstreet
5ccc944dSMatthew Wilcox (Oracle)static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
5ccc944dSMatthew Wilcox (Oracle){
5ccc944dSMatthew Wilcox (Oracle)	unsigned int shift = folio_shift(folio);
5ccc944dSMatthew Wilcox (Oracle)
5ccc944dSMatthew Wilcox (Oracle)	return (pos1 >> shift == pos2 >> shift);
5ccc944dSMatthew Wilcox (Oracle)}
5ccc944dSMatthew Wilcox (Oracle)
485bb99bSRandy Dunlap/**
87fa0f3eSChristoph Hellwig * filemap_read - Read data from the page cache.
87fa0f3eSChristoph Hellwig * @iocb: The iocb to read.
87fa0f3eSChristoph Hellwig * @iter: Destination for the data.
87fa0f3eSChristoph Hellwig * @already_read: Number of bytes already read by the caller.
485bb99bSRandy Dunlap *
87fa0f3eSChristoph Hellwig * Copies data from the page cache.  If the data is not currently present,
7e0a1265SMatthew Wilcox (Oracle) * uses the readahead and read_folio address_space operations to fetch it.
1da177e4SLinus Torvalds *
87fa0f3eSChristoph Hellwig * Return: Total number of bytes copied, including those already read by
87fa0f3eSChristoph Hellwig * the caller.  If an error happens before any bytes are copied, returns
87fa0f3eSChristoph Hellwig * a negative error number.
1da177e4SLinus Torvalds */
87fa0f3eSChristoph Hellwigssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
87fa0f3eSChristoph Hellwig		ssize_t already_read)
1da177e4SLinus Torvalds{
47c27bc4SChristoph Hellwig	struct file *filp = iocb->ki_filp;
06c04442SKent Overstreet	struct file_ra_state *ra = &filp->f_ra;
36e78914SChristoph Hellwig	struct address_space *mapping = filp->f_mapping;
1da177e4SLinus Torvalds	struct inode *inode = mapping->host;
25d6a23eSMatthew Wilcox (Oracle)	struct folio_batch fbatch;
ff993ba1SMatthew Wilcox (Oracle)	int i, error = 0;
06c04442SKent Overstreet	bool writably_mapped;
06c04442SKent Overstreet	loff_t isize, end_offset;
	/* Tracked locally during the read and stored back to ra->prev_pos at the end. */
f04d16eeSHaibo Li	loff_t last_pos = ra->prev_pos;
1da177e4SLinus Torvalds
	/* Nothing to read at or beyond s_maxbytes, or into an empty iterator. */
723ef24bSKent Overstreet	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
d05c5f7bSLinus Torvalds		return 0;
3644e2d2SKent Overstreet	if (unlikely(!iov_iter_count(iter)))
3644e2d2SKent Overstreet		return 0;
3644e2d2SKent Overstreet
	/* Clamp the request so that it cannot extend past s_maxbytes. */
a2746ab3STrond Myklebust	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
25d6a23eSMatthew Wilcox (Oracle)	folio_batch_init(&fbatch);
c2a9737fSWei Fang
	/* Each iteration fetches one batch of folios and copies from it. */
06c04442SKent Overstreet	do {
06c04442SKent Overstreet		cond_resched();
1da177e4SLinus Torvalds
13bd6914SJens Axboe		/*
13bd6914SJens Axboe		 * If we've already successfully copied some data, then we
13bd6914SJens Axboe		 * can no longer safely return -EIOCBQUEUED. Hence mark
13bd6914SJens Axboe		 * an async read NOWAIT at that point.
13bd6914SJens Axboe		 */
87fa0f3eSChristoph Hellwig		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
13bd6914SJens Axboe			iocb->ki_flags |= IOCB_NOWAIT;
13bd6914SJens Axboe
		/* Already at or past EOF: do not fetch any folios. */
8c8387eeSDavid Howells		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
8c8387eeSDavid Howells			break;
8c8387eeSDavid Howells
		/* need_uptodate is false here, unlike in filemap_splice_read(). */
3fc40265SDavid Howells		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
ff993ba1SMatthew Wilcox (Oracle)		if (error < 0)
06c04442SKent Overstreet			break;
5abf186aSMichal Hocko
723ef24bSKent Overstreet		/*
06c04442SKent Overstreet		 * i_size must be checked after we know the pages are Uptodate.
06c04442SKent Overstreet		 *
06c04442SKent Overstreet		 * Checking i_size after the check allows us to calculate
06c04442SKent Overstreet		 * the correct value for "nr", which means the zero-filled
06c04442SKent Overstreet		 * part of the page is not copied back to userspace (unless
06c04442SKent Overstreet		 * another truncate extends the file - this is desired though).
723ef24bSKent Overstreet		 */
06c04442SKent Overstreet		isize = i_size_read(inode);
06c04442SKent Overstreet		if (unlikely(iocb->ki_pos >= isize))
25d6a23eSMatthew Wilcox (Oracle)			goto put_folios;
		/* File position at which copying stops: EOF or end of request. */
06c04442SKent Overstreet		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
c8d317aaSHao Xu
06c04442SKent Overstreet		/*
d16eb52cSBaokun Li		 * Pairs with a barrier in
d16eb52cSBaokun Li		 * block_write_end()->mark_buffer_dirty() or other page
d16eb52cSBaokun Li		 * dirtying routines like iomap_write_end() to ensure
d16eb52cSBaokun Li		 * changes to page contents are visible before we see
d16eb52cSBaokun Li		 * increased inode size.
d16eb52cSBaokun Li		 */
d16eb52cSBaokun Li		smp_rmb();
d16eb52cSBaokun Li
d16eb52cSBaokun Li		/*
06c04442SKent Overstreet		 * Once we start copying data, we don't want to be touching any
06c04442SKent Overstreet		 * cachelines that might be contended:
06c04442SKent Overstreet		 */
06c04442SKent Overstreet		writably_mapped = mapping_writably_mapped(mapping);
06c04442SKent Overstreet
06c04442SKent Overstreet		/*
5ccc944dSMatthew Wilcox (Oracle)		 * When a read accesses the same folio several times, only
06c04442SKent Overstreet		 * mark it as accessed the first time.
06c04442SKent Overstreet		 */
f04d16eeSHaibo Li		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
5ccc944dSMatthew Wilcox (Oracle)				    fbatch.folios[0]))
25d6a23eSMatthew Wilcox (Oracle)			folio_mark_accessed(fbatch.folios[0]);
06c04442SKent Overstreet
25d6a23eSMatthew Wilcox (Oracle)		for (i = 0; i < folio_batch_count(&fbatch); i++) {
25d6a23eSMatthew Wilcox (Oracle)			struct folio *folio = fbatch.folios[i];
d996fc7fSMatthew Wilcox (Oracle)			size_t fsize = folio_size(folio);
			/*
			 * Offset of ki_pos inside this folio; the mask form
			 * assumes fsize is a power of two - TODO confirm.
			 */
d996fc7fSMatthew Wilcox (Oracle)			size_t offset = iocb->ki_pos & (fsize - 1);
			/* Copy up to end_offset or the end of this folio. */
cbd59c48SMatthew Wilcox (Oracle)			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
d996fc7fSMatthew Wilcox (Oracle)					     fsize - offset);
cbd59c48SMatthew Wilcox (Oracle)			size_t copied;
06c04442SKent Overstreet
			/* This folio starts beyond the data we may copy. */
d996fc7fSMatthew Wilcox (Oracle)			if (end_offset < folio_pos(folio))
cbd59c48SMatthew Wilcox (Oracle)				break;
			/* folios[0] was handled before the loop. */
cbd59c48SMatthew Wilcox (Oracle)			if (i > 0)
d996fc7fSMatthew Wilcox (Oracle)				folio_mark_accessed(folio);
06c04442SKent Overstreet			/*
d996fc7fSMatthew Wilcox (Oracle)			 * If users can be writing to this folio using arbitrary
d996fc7fSMatthew Wilcox (Oracle)			 * virtual addresses, take care of potential aliasing
d996fc7fSMatthew Wilcox (Oracle)			 * before reading the folio on the kernel side.
06c04442SKent Overstreet			 */
d996fc7fSMatthew Wilcox (Oracle)			if (writably_mapped)
d996fc7fSMatthew Wilcox (Oracle)				flush_dcache_folio(folio);
06c04442SKent Overstreet
d996fc7fSMatthew Wilcox (Oracle)			copied = copy_folio_to_iter(folio, offset, bytes, iter);
06c04442SKent Overstreet
87fa0f3eSChristoph Hellwig			already_read += copied;
06c04442SKent Overstreet			iocb->ki_pos += copied;
f04d16eeSHaibo Li			last_pos = iocb->ki_pos;
06c04442SKent Overstreet
			/*
			 * A short copy ends the read with -EFAULT; bytes
			 * already copied are still counted.
			 */
06c04442SKent Overstreet			if (copied < bytes) {
06c04442SKent Overstreet				error = -EFAULT;
06c04442SKent Overstreet				break;
1da177e4SLinus Torvalds			}
1da177e4SLinus Torvalds		}
25d6a23eSMatthew Wilcox (Oracle)put_folios:
		/* Drop the batch's folio references and empty it for reuse. */
25d6a23eSMatthew Wilcox (Oracle)		for (i = 0; i < folio_batch_count(&fbatch); i++)
25d6a23eSMatthew Wilcox (Oracle)			folio_put(fbatch.folios[i]);
25d6a23eSMatthew Wilcox (Oracle)		folio_batch_init(&fbatch);
06c04442SKent Overstreet	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
1da177e4SLinus Torvalds
1da177e4SLinus Torvalds	file_accessed(filp);
f04d16eeSHaibo Li	ra->prev_pos = last_pos;
	/* The error is only reported when no bytes at all were read. */
87fa0f3eSChristoph Hellwig	return already_read ? already_read : error;
1da177e4SLinus Torvalds}
87fa0f3eSChristoph HellwigEXPORT_SYMBOL_GPL(filemap_read);
1da177e4SLinus Torvalds
3c435a0fSChristoph Hellwigint kiocb_write_and_wait(struct kiocb *iocb, size_t count)
3c435a0fSChristoph Hellwig{
3c435a0fSChristoph Hellwig	struct address_space *mapping = iocb->ki_filp->f_mapping;
3c435a0fSChristoph Hellwig	loff_t pos = iocb->ki_pos;
3c435a0fSChristoph Hellwig	loff_t end = pos + count - 1;
3c435a0fSChristoph Hellwig
3c435a0fSChristoph Hellwig	if (iocb->ki_flags & IOCB_NOWAIT) {
3c435a0fSChristoph Hellwig		if (filemap_range_needs_writeback(mapping, pos, end))
3c435a0fSChristoph Hellwig			return -EAGAIN;
3c435a0fSChristoph Hellwig		return 0;
3c435a0fSChristoph Hellwig	}
3c435a0fSChristoph Hellwig
3c435a0fSChristoph Hellwig	return filemap_write_and_wait_range(mapping, pos, end);
3c435a0fSChristoph Hellwig}
3c435a0fSChristoph Hellwig
e003f74aSChristoph Hellwigint kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
e003f74aSChristoph Hellwig{
e003f74aSChristoph Hellwig	struct address_space *mapping = iocb->ki_filp->f_mapping;
e003f74aSChristoph Hellwig	loff_t pos = iocb->ki_pos;
e003f74aSChristoph Hellwig	loff_t end = pos + count - 1;
e003f74aSChristoph Hellwig	int ret;
e003f74aSChristoph Hellwig
e003f74aSChristoph Hellwig	if (iocb->ki_flags & IOCB_NOWAIT) {
e003f74aSChristoph Hellwig		/* we could block if there are any pages in the range */
e003f74aSChristoph Hellwig		if (filemap_range_has_page(mapping, pos, end))
e003f74aSChristoph Hellwig			return -EAGAIN;
e003f74aSChristoph Hellwig	} else {
e003f74aSChristoph Hellwig		ret = filemap_write_and_wait_range(mapping, pos, end);
e003f74aSChristoph Hellwig		if (ret)
e003f74aSChristoph Hellwig			return ret;
e003f74aSChristoph Hellwig	}
e003f74aSChristoph Hellwig
e003f74aSChristoph Hellwig	/*
e003f74aSChristoph Hellwig	 * After a write we want buffered reads to be sure to go to disk to get
e003f74aSChristoph Hellwig	 * the new data.  We invalidate clean cached page from the region we're
e003f74aSChristoph Hellwig	 * about to write.  We do this *before* the write so that we can return
e003f74aSChristoph Hellwig	 * without clobbering -EIOCBQUEUED from ->direct_IO().
e003f74aSChristoph Hellwig	 */
e003f74aSChristoph Hellwig	return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
e003f74aSChristoph Hellwig					     end >> PAGE_SHIFT);
e003f74aSChristoph Hellwig}
e003f74aSChristoph Hellwig
485bb99bSRandy Dunlap/**
6abd2322SAl Viro * generic_file_read_iter - generic filesystem read routine
485bb99bSRandy Dunlap * @iocb:	kernel I/O control block
6abd2322SAl Viro * @iter:	destination for the data read
485bb99bSRandy Dunlap *
6abd2322SAl Viro * This is the "read_iter()" routine for all filesystems
1da177e4SLinus Torvalds * that can use the page cache directly.
41da51bcSAndreas Gruenbacher *
41da51bcSAndreas Gruenbacher * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
41da51bcSAndreas Gruenbacher * be returned when no data can be read without waiting for I/O requests
41da51bcSAndreas Gruenbacher * to complete; it doesn't prevent readahead.
41da51bcSAndreas Gruenbacher *
41da51bcSAndreas Gruenbacher * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
41da51bcSAndreas Gruenbacher * requests shall be made for the read or for readahead.  When no data
41da51bcSAndreas Gruenbacher * can be read, -EAGAIN shall be returned.  When readahead would be
41da51bcSAndreas Gruenbacher * triggered, a partial, possibly empty read shall be returned.
41da51bcSAndreas Gruenbacher *
a862f68aSMike Rapoport * Return:
a862f68aSMike Rapoport * * number of bytes copied, even for partial reads
41da51bcSAndreas Gruenbacher * * negative error code (or 0 if IOCB_NOIO) if nothing was read
1da177e4SLinus Torvalds */
1da177e4SLinus Torvaldsssize_t
ed978a81SAl Virogeneric_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1da177e4SLinus Torvalds{
e7080a43SNicolai Stange	size_t count = iov_iter_count(iter);
47c27bc4SChristoph Hellwig	ssize_t retval = 0;
e7080a43SNicolai Stange
e7080a43SNicolai Stange	if (!count)
826ea860SChristoph Hellwig		return 0; /* skip atime */
1da177e4SLinus Torvalds
2ba48ce5SAl Viro	if (iocb->ki_flags & IOCB_DIRECT) {
47c27bc4SChristoph Hellwig		struct file *file = iocb->ki_filp;
ed978a81SAl Viro		struct address_space *mapping = file->f_mapping;
ed978a81SAl Viro		struct inode *inode = mapping->host;
1da177e4SLinus Torvalds
3c435a0fSChristoph Hellwig		retval = kiocb_write_and_wait(iocb, count);
0d5b0cf2SChristoph Hellwig		if (retval < 0)
826ea860SChristoph Hellwig			return retval;
0d5b0cf2SChristoph Hellwig		file_accessed(file);
0d5b0cf2SChristoph Hellwig
5ecda137SAl Viro		retval = mapping->a_ops->direct_IO(iocb, iter);
c3a69024SAl Viro		if (retval >= 0) {
c64fb5c7SChristoph Hellwig			iocb->ki_pos += retval;
5ecda137SAl Viro			count -= retval;
66f998f6SJosef Bacik		}
ab2125dfSPavel Begunkov		if (retval != -EIOCBQUEUED)
5b47d59aSAl Viro			iov_iter_revert(iter, count - iov_iter_count(iter));
66f998f6SJosef Bacik
66f998f6SJosef Bacik		/*
66f998f6SJosef Bacik		 * Btrfs can have a short DIO read if we encounter
66f998f6SJosef Bacik		 * compressed extents, so if there was an error, or if
66f998f6SJosef Bacik		 * we've already read everything we wanted to, or if
66f998f6SJosef Bacik		 * there was a short read because we hit EOF, go ahead
66f998f6SJosef Bacik		 * and return.  Otherwise fallthrough to buffered io for
fbbbad4bSMatthew Wilcox		 * the rest of the read.  Buffered reads will not work for
fbbbad4bSMatthew Wilcox		 * DAX files, so don't bother trying.
66f998f6SJosef Bacik		 */
61d0017eSJens Axboe		if (retval < 0 || !count || IS_DAX(inode))
61d0017eSJens Axboe			return retval;
61d0017eSJens Axboe		if (iocb->ki_pos >= i_size_read(inode))
826ea860SChristoph Hellwig			return retval;
1da177e4SLinus Torvalds	}
1da177e4SLinus Torvalds
826ea860SChristoph Hellwig	return filemap_read(iocb, iter, retval);
1da177e4SLinus Torvalds}
ed978a81SAl ViroEXPORT_SYMBOL(generic_file_read_iter);
1da177e4SLinus Torvalds
07073eb0SDavid Howells/*
07073eb0SDavid Howells * Splice subpages from a folio into a pipe.
07073eb0SDavid Howells */
07073eb0SDavid Howellssize_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
07073eb0SDavid Howells			      struct folio *folio, loff_t fpos, size_t size)
07073eb0SDavid Howells{
07073eb0SDavid Howells	struct page *page;
07073eb0SDavid Howells	size_t spliced = 0, offset = offset_in_folio(folio, fpos);
07073eb0SDavid Howells
07073eb0SDavid Howells	page = folio_page(folio, offset / PAGE_SIZE);
07073eb0SDavid Howells	size = min(size, folio_size(folio) - offset);
07073eb0SDavid Howells	offset %= PAGE_SIZE;
07073eb0SDavid Howells
07073eb0SDavid Howells	while (spliced < size &&
07073eb0SDavid Howells	       !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
07073eb0SDavid Howells		struct pipe_buffer *buf = pipe_head_buf(pipe);
07073eb0SDavid Howells		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
07073eb0SDavid Howells
07073eb0SDavid Howells		*buf = (struct pipe_buffer) {
07073eb0SDavid Howells			.ops	= &page_cache_pipe_buf_ops,
07073eb0SDavid Howells			.page	= page,
07073eb0SDavid Howells			.offset	= offset,
07073eb0SDavid Howells			.len	= part,
07073eb0SDavid Howells		};
07073eb0SDavid Howells		folio_get(folio);
07073eb0SDavid Howells		pipe->head++;
07073eb0SDavid Howells		page++;
07073eb0SDavid Howells		spliced += part;
07073eb0SDavid Howells		offset = 0;
07073eb0SDavid Howells	}
07073eb0SDavid Howells
07073eb0SDavid Howells	return spliced;
07073eb0SDavid Howells}
07073eb0SDavid Howells
9eee8bd8SDavid Howells/**
9eee8bd8SDavid Howells * filemap_splice_read -  Splice data from a file's pagecache into a pipe
9eee8bd8SDavid Howells * @in: The file to read from
9eee8bd8SDavid Howells * @ppos: Pointer to the file position to read from
9eee8bd8SDavid Howells * @pipe: The pipe to splice into
9eee8bd8SDavid Howells * @len: The amount to splice
9eee8bd8SDavid Howells * @flags: The SPLICE_F_* flags
9eee8bd8SDavid Howells *
9eee8bd8SDavid Howells * This function gets folios from a file's pagecache and splices them into the
9eee8bd8SDavid Howells * pipe.  Readahead will be called as necessary to fill more folios.  This may
9eee8bd8SDavid Howells * be used for blockdevs also.
9eee8bd8SDavid Howells *
9eee8bd8SDavid Howells * Return: On success, the number of bytes read will be returned and *@ppos
9eee8bd8SDavid Howells * will be updated if appropriate; 0 will be returned if there is no more data
9eee8bd8SDavid Howells * to be read; -EAGAIN will be returned if the pipe had no space, and some
9eee8bd8SDavid Howells * other negative error code will be returned on error.  A short read may occur
9eee8bd8SDavid Howells * if the pipe has insufficient space, we reach the end of the data or we hit a
9eee8bd8SDavid Howells * hole.
07073eb0SDavid Howells */
07073eb0SDavid Howellsssize_t filemap_splice_read(struct file *in, loff_t *ppos,
07073eb0SDavid Howells			    struct pipe_inode_info *pipe,
07073eb0SDavid Howells			    size_t len, unsigned int flags)
07073eb0SDavid Howells{
07073eb0SDavid Howells	struct folio_batch fbatch;
07073eb0SDavid Howells	struct kiocb iocb;
07073eb0SDavid Howells	size_t total_spliced = 0, used, npages;
07073eb0SDavid Howells	loff_t isize, end_offset;
07073eb0SDavid Howells	bool writably_mapped;
07073eb0SDavid Howells	int i, error = 0;
07073eb0SDavid Howells
83aeff88SDavid Howells	if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
83aeff88SDavid Howells		return 0;
83aeff88SDavid Howells
07073eb0SDavid Howells	init_sync_kiocb(&iocb, in);
07073eb0SDavid Howells	iocb.ki_pos = *ppos;
07073eb0SDavid Howells
07073eb0SDavid Howells	/* Work out how much data we can actually add into the pipe */
07073eb0SDavid Howells	used = pipe_occupancy(pipe->head, pipe->tail);
07073eb0SDavid Howells	npages = max_t(ssize_t, pipe->max_usage - used, 0);
07073eb0SDavid Howells	len = min_t(size_t, len, npages * PAGE_SIZE);
07073eb0SDavid Howells
07073eb0SDavid Howells	folio_batch_init(&fbatch);
07073eb0SDavid Howells
07073eb0SDavid Howells	do {
07073eb0SDavid Howells		cond_resched();
07073eb0SDavid Howells
c3722208SDavid Howells		if (*ppos >= i_size_read(in->f_mapping->host))
07073eb0SDavid Howells			break;
07073eb0SDavid Howells
07073eb0SDavid Howells		iocb.ki_pos = *ppos;
07073eb0SDavid Howells		error = filemap_get_pages(&iocb, len, &fbatch, true);
07073eb0SDavid Howells		if (error < 0)
07073eb0SDavid Howells			break;
07073eb0SDavid Howells
07073eb0SDavid Howells		/*
07073eb0SDavid Howells		 * i_size must be checked after we know the pages are Uptodate.
07073eb0SDavid Howells		 *
07073eb0SDavid Howells		 * Checking i_size after the check allows us to calculate
07073eb0SDavid Howells		 * the correct value for "nr", which means the zero-filled
07073eb0SDavid Howells		 * part of the page is not copied back to userspace (unless
07073eb0SDavid Howells		 * another truncate extends the file - this is desired though).
07073eb0SDavid Howells		 */
c3722208SDavid Howells		isize = i_size_read(in->f_mapping->host);
07073eb0SDavid Howells		if (unlikely(*ppos >= isize))
07073eb0SDavid Howells			break;
07073eb0SDavid Howells		end_offset = min_t(loff_t, isize, *ppos + len);
07073eb0SDavid Howells
07073eb0SDavid Howells		/*
07073eb0SDavid Howells		 * Once we start copying data, we don't want to be touching any
07073eb0SDavid Howells		 * cachelines that might be contended:
07073eb0SDavid Howells		 */
07073eb0SDavid Howells		writably_mapped = mapping_writably_mapped(in->f_mapping);
07073eb0SDavid Howells
07073eb0SDavid Howells		for (i = 0; i < folio_batch_count(&fbatch); i++) {
07073eb0SDavid Howells			struct folio *folio = fbatch.folios[i];
07073eb0SDavid Howells			size_t n;
07073eb0SDavid Howells
07073eb0SDavid Howells			if (folio_pos(folio) >= end_offset)
07073eb0SDavid Howells				goto out;
07073eb0SDavid Howells			folio_mark_accessed(folio);
07073eb0SDavid Howells
07073eb0SDavid Howells			/*
07073eb0SDavid Howells			 * If users can be writing to this folio using arbitrary
07073eb0SDavid Howells			 * virtual addresses, take care of potential aliasing
07073eb0SDavid Howells			 * before reading the folio on the kernel side.
07073eb0SDavid Howells			 */
07073eb0SDavid Howells			if (writably_mapped)
07073eb0SDavid Howells				flush_dcache_folio(folio);
07073eb0SDavid Howells
07073eb0SDavid Howells			n = min_t(loff_t, len, isize - *ppos);
07073eb0SDavid Howells			n = splice_folio_into_pipe(pipe, folio, *ppos, n);
07073eb0SDavid Howells			if (!n)
07073eb0SDavid Howells				goto out;
07073eb0SDavid Howells			len -= n;
07073eb0SDavid Howells			total_spliced += n;
07073eb0SDavid Howells			*ppos += n;
07073eb0SDavid Howells			in->f_ra.prev_pos = *ppos;
07073eb0SDavid Howells			if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
07073eb0SDavid Howells				goto out;
07073eb0SDavid Howells		}
07073eb0SDavid Howells
07073eb0SDavid Howells		folio_batch_release(&fbatch);
07073eb0SDavid Howells	} while (len);
07073eb0SDavid Howells
07073eb0SDavid Howellsout:
07073eb0SDavid Howells	folio_batch_release(&fbatch);
07073eb0SDavid Howells	file_accessed(in);
07073eb0SDavid Howells
07073eb0SDavid Howells	return total_spliced ? total_spliced : error;
07073eb0SDavid Howells}
7c8e01ebSDavid HowellsEXPORT_SYMBOL(filemap_splice_read);
07073eb0SDavid Howells
/*
 * Refine a SEEK_DATA / SEEK_HOLE search within a single page cache entry.
 *
 * @xas: iteration state; paused before the RCU read lock is dropped.
 * @mapping: address space being searched.
 * @folio: the entry found (a folio or an xarray value entry).
 * @start: first byte of interest.
 * @end: value handed back when the whole entry does not match.
 * @seek_data: true for SEEK_DATA, false for SEEK_HOLE.
 *
 * Value entries and fully uptodate folios are treated as data.  A folio
 * that is not uptodate is treated as a hole unless the address space
 * supplies ->is_partially_uptodate(); in that case the folio is locked and
 * inspected one filesystem block at a time.
 *
 * Entered and left with the RCU read lock held; on the per-block path the
 * lock is released around the folio lock and re-acquired before returning.
 *
 * Return: @start or @end for the whole-entry cases; otherwise @start,
 * advanced one block boundary for every block that did not match.
 */
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
		struct address_space *mapping, struct folio *folio,
		loff_t start, loff_t end, bool seek_data)
{
	const struct address_space_operations *aops = mapping->a_ops;
	size_t blksize = i_blocksize(mapping->host);
	size_t off;

	if (xa_is_value(folio) || folio_test_uptodate(folio))
		return seek_data ? start : end;
	if (!aops->is_partially_uptodate)
		return seek_data ? end : start;

	/* We are about to sleep on the folio lock: leave the RCU section. */
	xas_pause(xas);
	rcu_read_unlock();
	folio_lock(folio);

	/* Only walk the blocks if the folio still belongs to this mapping. */
	if (likely(folio->mapping == mapping)) {
		off = offset_in_folio(folio, start) & ~(blksize - 1);

		for (;;) {
			if (aops->is_partially_uptodate(folio, off, blksize) ==
								seek_data)
				break;
			start = (start + blksize) & ~((u64)blksize - 1);
			off += blksize;
			if (off >= folio_size(folio))
				break;
		}
	}

	folio_unlock(folio);
	rcu_read_lock();
	return start;
}
41139aa4SMatthew Wilcox (Oracle)
f5e6429aSMatthew Wilcox (Oracle)static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
41139aa4SMatthew Wilcox (Oracle){
f5e6429aSMatthew Wilcox (Oracle)	if (xa_is_value(folio))
41139aa4SMatthew Wilcox (Oracle)		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
f5e6429aSMatthew Wilcox (Oracle)	return folio_size(folio);
41139aa4SMatthew Wilcox (Oracle)}
41139aa4SMatthew Wilcox (Oracle)
41139aa4SMatthew Wilcox (Oracle)/**
41139aa4SMatthew Wilcox (Oracle) * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
41139aa4SMatthew Wilcox (Oracle) * @mapping: Address space to search.
41139aa4SMatthew Wilcox (Oracle) * @start: First byte to consider.
41139aa4SMatthew Wilcox (Oracle) * @end: Limit of search (exclusive).
41139aa4SMatthew Wilcox (Oracle) * @whence: Either SEEK_HOLE or SEEK_DATA.
41139aa4SMatthew Wilcox (Oracle) *
41139aa4SMatthew Wilcox (Oracle) * If the page cache knows which blocks contain holes and which blocks
41139aa4SMatthew Wilcox (Oracle) * contain data, your filesystem can use this function to implement
41139aa4SMatthew Wilcox (Oracle) * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
41139aa4SMatthew Wilcox (Oracle) * entirely memory-based such as tmpfs, and filesystems which support
41139aa4SMatthew Wilcox (Oracle) * unwritten extents.
41139aa4SMatthew Wilcox (Oracle) *
f0953a1bSIngo Molnar * Return: The requested offset on success, or -ENXIO if @whence specifies
41139aa4SMatthew Wilcox (Oracle) * SEEK_DATA and there is no data after @start.  There is an implicit hole
41139aa4SMatthew Wilcox (Oracle) * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
41139aa4SMatthew Wilcox (Oracle) * and @end contain data.
41139aa4SMatthew Wilcox (Oracle) */
41139aa4SMatthew Wilcox (Oracle)loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
41139aa4SMatthew Wilcox (Oracle)		loff_t end, int whence)
41139aa4SMatthew Wilcox (Oracle){
41139aa4SMatthew Wilcox (Oracle)	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
ed98b015SHugh Dickins	pgoff_t max = (end - 1) >> PAGE_SHIFT;
41139aa4SMatthew Wilcox (Oracle)	bool seek_data = (whence == SEEK_DATA);
f5e6429aSMatthew Wilcox (Oracle)	struct folio *folio;
41139aa4SMatthew Wilcox (Oracle)
	/* Empty or inverted range: there is nothing to search. */
41139aa4SMatthew Wilcox (Oracle)	if (end <= start)
41139aa4SMatthew Wilcox (Oracle)		return -ENXIO;
41139aa4SMatthew Wilcox (Oracle)
41139aa4SMatthew Wilcox (Oracle)	rcu_read_lock();
f5e6429aSMatthew Wilcox (Oracle)	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
		/* Byte offset of the index the iterator stopped on. */
ed98b015SHugh Dickins		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
f5e6429aSMatthew Wilcox (Oracle)		size_t seek_size;
41139aa4SMatthew Wilcox (Oracle)
		/*
		 * The entry begins after start: SEEK_HOLE stops at start,
		 * SEEK_DATA skips ahead to the beginning of this entry.
		 */
41139aa4SMatthew Wilcox (Oracle)		if (start < pos) {
41139aa4SMatthew Wilcox (Oracle)			if (!seek_data)
41139aa4SMatthew Wilcox (Oracle)				goto unlock;
41139aa4SMatthew Wilcox (Oracle)			start = pos;
41139aa4SMatthew Wilcox (Oracle)		}
41139aa4SMatthew Wilcox (Oracle)
f5e6429aSMatthew Wilcox (Oracle)		seek_size = seek_folio_size(&xas, folio);
		/* From here on pos is the next seek_size boundary above the entry's offset. */
f5e6429aSMatthew Wilcox (Oracle)		pos = round_up((u64)pos + 1, seek_size);
f5e6429aSMatthew Wilcox (Oracle)		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
54fa39acSMatthew Wilcox (Oracle)				seek_data);
		/* A result short of the entry's end means a match was found. */
54fa39acSMatthew Wilcox (Oracle)		if (start < pos)
41139aa4SMatthew Wilcox (Oracle)			goto unlock;
		/*
		 * Reached the search limit.  Note the folio reference is
		 * still held here; it is dropped after the unlock label.
		 */
ed98b015SHugh Dickins		if (start >= end)
ed98b015SHugh Dickins			break;
		/* For an entry larger than one page, restart the walk at pos. */
ed98b015SHugh Dickins		if (seek_size > PAGE_SIZE)
ed98b015SHugh Dickins			xas_set(&xas, pos >> PAGE_SHIFT);
f5e6429aSMatthew Wilcox (Oracle)		if (!xa_is_value(folio))
f5e6429aSMatthew Wilcox (Oracle)			folio_put(folio);
41139aa4SMatthew Wilcox (Oracle)	}
	/*
	 * No match inside the loop: SEEK_DATA fails with -ENXIO, while
	 * SEEK_HOLE reports the current start (clamped to end below).
	 */
41139aa4SMatthew Wilcox (Oracle)	if (seek_data)
ed98b015SHugh Dickins		start = -ENXIO;
41139aa4SMatthew Wilcox (Oracle)unlock:
41139aa4SMatthew Wilcox (Oracle)	rcu_read_unlock();
	/* folio is NULL when the walk ran out of entries. */
f5e6429aSMatthew Wilcox (Oracle)	if (folio && !xa_is_value(folio))
f5e6429aSMatthew Wilcox (Oracle)		folio_put(folio);
	/* Never report an offset beyond the implicit hole at end. */
41139aa4SMatthew Wilcox (Oracle)	if (start > end)
41139aa4SMatthew Wilcox (Oracle)		return end;
41139aa4SMatthew Wilcox (Oracle)	return start;
41139aa4SMatthew Wilcox (Oracle)}
41139aa4SMatthew Wilcox (Oracle)
1da177e4SLinus Torvalds#ifdef CONFIG_MMU
/*
 * Miss threshold for mmap read-around, compared against ra->mmap_miss in
 * do_sync_mmap_readahead(): read-around is skipped once the counter exceeds
 * this value, and the counter stops being incremented at ten times this
 * value.
 */
1da177e4SLinus Torvalds#define MMAP_LOTSAMISS  (100)
6b4c9f44SJosef Bacik/*
e292e6d6SMatthew Wilcox (Oracle) * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
6b4c9f44SJosef Bacik * @vmf - the vm_fault for this fault.
e292e6d6SMatthew Wilcox (Oracle) * @folio - the folio to lock.
6b4c9f44SJosef Bacik * @fpin - the pointer to the file we may pin (or is already pinned).
6b4c9f44SJosef Bacik *
e292e6d6SMatthew Wilcox (Oracle) * This works similar to lock_folio_or_retry in that it can drop the
e292e6d6SMatthew Wilcox (Oracle) * mmap_lock.  It differs in that it actually returns the folio locked
e292e6d6SMatthew Wilcox (Oracle) * if it returns 1 and 0 if it couldn't lock the folio.  If we did have
e292e6d6SMatthew Wilcox (Oracle) * to drop the mmap_lock then fpin will point to the pinned file and
e292e6d6SMatthew Wilcox (Oracle) * needs to be fput()'ed at a later point.
6b4c9f44SJosef Bacik */
e292e6d6SMatthew Wilcox (Oracle)static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
6b4c9f44SJosef Bacik				     struct file **fpin)
6b4c9f44SJosef Bacik{
7c23c782SMatthew Wilcox (Oracle)	if (folio_trylock(folio))
6b4c9f44SJosef Bacik		return 1;
6b4c9f44SJosef Bacik
8b0f9fa2SLinus Torvalds	/*
8b0f9fa2SLinus Torvalds	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
c1e8d7c6SMichel Lespinasse	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
8b0f9fa2SLinus Torvalds	 * is supposed to work. We have way too many special cases..
8b0f9fa2SLinus Torvalds	 */
6b4c9f44SJosef Bacik	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
6b4c9f44SJosef Bacik		return 0;
6b4c9f44SJosef Bacik
6b4c9f44SJosef Bacik	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
6b4c9f44SJosef Bacik	if (vmf->flags & FAULT_FLAG_KILLABLE) {
af7f29d9SMatthew Wilcox (Oracle)		if (__folio_lock_killable(folio)) {
6b4c9f44SJosef Bacik			/*
c1e8d7c6SMichel Lespinasse			 * We didn't have the right flags to drop the mmap_lock,
6b4c9f44SJosef Bacik			 * but all fault_handlers only check for fatal signals
6b4c9f44SJosef Bacik			 * if we return VM_FAULT_RETRY, so we need to drop the
c1e8d7c6SMichel Lespinasse			 * mmap_lock here and return 0 if we don't have a fpin.
6b4c9f44SJosef Bacik			 */
6b4c9f44SJosef Bacik			if (*fpin == NULL)
d8ed45c5SMichel Lespinasse				mmap_read_unlock(vmf->vma->vm_mm);
6b4c9f44SJosef Bacik			return 0;
6b4c9f44SJosef Bacik		}
6b4c9f44SJosef Bacik	} else
7c23c782SMatthew Wilcox (Oracle)		__folio_lock(folio);
7c23c782SMatthew Wilcox (Oracle)
6b4c9f44SJosef Bacik	return 1;
6b4c9f44SJosef Bacik}
6b4c9f44SJosef Bacik
6b4c9f44SJosef Bacik/*
6b4c9f44SJosef Bacik * Synchronous readahead happens when we don't even find a page in the page
6b4c9f44SJosef Bacik * cache at all.  We don't want to perform IO under the mmap sem, so if we have
6b4c9f44SJosef Bacik * to drop the mmap sem we return the file that was pinned in order for us to do
6b4c9f44SJosef Bacik * that.  If we didn't pin a file then we return NULL.  The file that is
6b4c9f44SJosef Bacik * returned needs to be fput()'ed when we're done with it.
6b4c9f44SJosef Bacik */
6b4c9f44SJosef Bacikstatic struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
ef00e08eSLinus Torvalds{
2a1180f1SJosef Bacik	struct file *file = vmf->vma->vm_file;
2a1180f1SJosef Bacik	struct file_ra_state *ra = &file->f_ra;
ef00e08eSLinus Torvalds	struct address_space *mapping = file->f_mapping;
fcd9ae4fSMatthew Wilcox (Oracle)	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
6b4c9f44SJosef Bacik	struct file *fpin = NULL;
dcfa24baSMatthew Wilcox (Oracle)	unsigned long vm_flags = vmf->vma->vm_flags;
e630bfacSKirill A. Shutemov	unsigned int mmap_miss;
ef00e08eSLinus Torvalds
4687fdbbSMatthew Wilcox (Oracle)#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4687fdbbSMatthew Wilcox (Oracle)	/* Use the readahead code, even if readahead is disabled */
06b5a69cSGavin Shan	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
4687fdbbSMatthew Wilcox (Oracle)		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
4687fdbbSMatthew Wilcox (Oracle)		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
4687fdbbSMatthew Wilcox (Oracle)		ra->size = HPAGE_PMD_NR;
4687fdbbSMatthew Wilcox (Oracle)		/*
4687fdbbSMatthew Wilcox (Oracle)		 * Fetch two PMD folios, so we get the chance to actually
4687fdbbSMatthew Wilcox (Oracle)		 * readahead, unless we've been told not to.
4687fdbbSMatthew Wilcox (Oracle)		 */
dcfa24baSMatthew Wilcox (Oracle)		if (!(vm_flags & VM_RAND_READ))
4687fdbbSMatthew Wilcox (Oracle)			ra->size *= 2;
4687fdbbSMatthew Wilcox (Oracle)		ra->async_size = HPAGE_PMD_NR;
4687fdbbSMatthew Wilcox (Oracle)		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
4687fdbbSMatthew Wilcox (Oracle)		return fpin;
4687fdbbSMatthew Wilcox (Oracle)	}
4687fdbbSMatthew Wilcox (Oracle)#endif
4687fdbbSMatthew Wilcox (Oracle)
ef00e08eSLinus Torvalds	/* If we don't want any read-ahead, don't bother */
dcfa24baSMatthew Wilcox (Oracle)	if (vm_flags & VM_RAND_READ)
6b4c9f44SJosef Bacik		return fpin;
275b12bfSWu Fengguang	if (!ra->ra_pages)
6b4c9f44SJosef Bacik		return fpin;
ef00e08eSLinus Torvalds
dcfa24baSMatthew Wilcox (Oracle)	if (vm_flags & VM_SEQ_READ) {
6b4c9f44SJosef Bacik		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
fcd9ae4fSMatthew Wilcox (Oracle)		page_cache_sync_ra(&ractl, ra->ra_pages);
6b4c9f44SJosef Bacik		return fpin;
ef00e08eSLinus Torvalds	}
ef00e08eSLinus Torvalds
207d04baSAndi Kleen	/* Avoid banging the cache line if not needed */
e630bfacSKirill A. Shutemov	mmap_miss = READ_ONCE(ra->mmap_miss);
e630bfacSKirill A. Shutemov	if (mmap_miss < MMAP_LOTSAMISS * 10)
e630bfacSKirill A. Shutemov		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
ef00e08eSLinus Torvalds
ef00e08eSLinus Torvalds	/*
ef00e08eSLinus Torvalds	 * Do we miss much more than hit in this file? If so,
ef00e08eSLinus Torvalds	 * stop bothering with read-ahead. It will only hurt.
ef00e08eSLinus Torvalds	 */
e630bfacSKirill A. Shutemov	if (mmap_miss > MMAP_LOTSAMISS)
6b4c9f44SJosef Bacik		return fpin;
ef00e08eSLinus Torvalds
d30a1100SWu Fengguang	/*
d30a1100SWu Fengguang	 * mmap read-around
d30a1100SWu Fengguang	 */
6b4c9f44SJosef Bacik	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
db660d46SDavid Howells	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
600e19afSRoman Gushchin	ra->size = ra->ra_pages;
600e19afSRoman Gushchin	ra->async_size = ra->ra_pages / 4;
db660d46SDavid Howells	ractl._index = ra->start;
56a4d67cSMatthew Wilcox (Oracle)	page_cache_ra_order(&ractl, ra, 0);
6b4c9f44SJosef Bacik	return fpin;
ef00e08eSLinus Torvalds}
ef00e08eSLinus Torvalds
ef00e08eSLinus Torvalds/*
ef00e08eSLinus Torvalds * Asynchronous readahead happens when we find the page and PG_readahead,
6b4c9f44SJosef Bacik * so we want to possibly extend the readahead further.  We return the file that
c1e8d7c6SMichel Lespinasse * was pinned if we have to drop the mmap_lock in order to do IO.
ef00e08eSLinus Torvalds */
6b4c9f44SJosef Bacikstatic struct file *do_async_mmap_readahead(struct vm_fault *vmf,
79598cedSMatthew Wilcox (Oracle)					    struct folio *folio)
ef00e08eSLinus Torvalds{
2a1180f1SJosef Bacik	struct file *file = vmf->vma->vm_file;
2a1180f1SJosef Bacik	struct file_ra_state *ra = &file->f_ra;
79598cedSMatthew Wilcox (Oracle)	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
6b4c9f44SJosef Bacik	struct file *fpin = NULL;
e630bfacSKirill A. Shutemov	unsigned int mmap_miss;
ef00e08eSLinus Torvalds
ef00e08eSLinus Torvalds	/* If we don't want any read-ahead, don't bother */
5c72feeeSJan Kara	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
6b4c9f44SJosef Bacik		return fpin;
79598cedSMatthew Wilcox (Oracle)
e630bfacSKirill A. Shutemov	mmap_miss = READ_ONCE(ra->mmap_miss);
e630bfacSKirill A. Shutemov	if (mmap_miss)
e630bfacSKirill A. Shutemov		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
79598cedSMatthew Wilcox (Oracle)
79598cedSMatthew Wilcox (Oracle)	if (folio_test_readahead(folio)) {
6b4c9f44SJosef Bacik		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
79598cedSMatthew Wilcox (Oracle)		page_cache_async_ra(&ractl, folio, ra->ra_pages);
ef00e08eSLinus Torvalds	}
6b4c9f44SJosef Bacik	return fpin;
6b4c9f44SJosef Bacik}
ef00e08eSLinus Torvalds
485bb99bSRandy Dunlap/**
54cb8821SNick Piggin * filemap_fault - read in file data for page fault handling
d0217ac0SNick Piggin * @vmf:	struct vm_fault containing details of the fault
485bb99bSRandy Dunlap *
54cb8821SNick Piggin * filemap_fault() is invoked via the vma operations vector for a
1da177e4SLinus Torvalds * mapped memory region to read in file data during a page fault.
1da177e4SLinus Torvalds *
1da177e4SLinus Torvalds * The goto's are kind of ugly, but this streamlines the normal case of having
1da177e4SLinus Torvalds * it in the page cache, and handles the special cases reasonably without
1da177e4SLinus Torvalds * having a lot of duplicated code.
9a95f3cfSPaul Cassella *
c1e8d7c6SMichel Lespinasse * vma->vm_mm->mmap_lock must be held on entry.
9a95f3cfSPaul Cassella *
c1e8d7c6SMichel Lespinasse * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
e292e6d6SMatthew Wilcox (Oracle) * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
9a95f3cfSPaul Cassella *
c1e8d7c6SMichel Lespinasse * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
9a95f3cfSPaul Cassella * has not been released.
9a95f3cfSPaul Cassella *
9a95f3cfSPaul Cassella * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return: bitwise-OR of %VM_FAULT_ codes.
1da177e4SLinus Torvalds */
2bcd6454SSouptick Joardervm_fault_t filemap_fault(struct vm_fault *vmf)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	int error;
11bac800SDave Jiang	struct file *file = vmf->vma->vm_file;
	/* Non-NULL once the mmap_lock has been dropped; must be fput() on exit. */
6b4c9f44SJosef Bacik	struct file *fpin = NULL;
1da177e4SLinus Torvalds	struct address_space *mapping = file->f_mapping;
1da177e4SLinus Torvalds	struct inode *inode = mapping->host;
e292e6d6SMatthew Wilcox (Oracle)	pgoff_t max_idx, index = vmf->pgoff;
e292e6d6SMatthew Wilcox (Oracle)	struct folio *folio;
2bcd6454SSouptick Joarder	vm_fault_t ret = 0;
	/* Tracks whether we hold the mapping's invalidate lock (shared). */
730633f0SJan Kara	bool mapping_locked = false;
1da177e4SLinus Torvalds
	/* A fault at or beyond the last page of the file is a SIGBUS. */
e292e6d6SMatthew Wilcox (Oracle)	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
e292e6d6SMatthew Wilcox (Oracle)	if (unlikely(index >= max_idx))
5307cc1aSLinus Torvalds		return VM_FAULT_SIGBUS;
1da177e4SLinus Torvalds
1da177e4SLinus Torvalds	/*
49426420SJohannes Weiner	 * Do we have something in the page cache already?
1da177e4SLinus Torvalds	 */
e292e6d6SMatthew Wilcox (Oracle)	folio = filemap_get_folio(mapping, index);
66dabbb6SChristoph Hellwig	if (likely(!IS_ERR(folio))) {
3ea89ee8SFengguang Wu		/*
730633f0SJan Kara		 * We found the page, so try async readahead before waiting for
730633f0SJan Kara		 * the lock.
3ea89ee8SFengguang Wu		 */
730633f0SJan Kara		if (!(vmf->flags & FAULT_FLAG_TRIED))
79598cedSMatthew Wilcox (Oracle)			fpin = do_async_mmap_readahead(vmf, folio);
		/*
		 * Not uptodate: take the invalidate lock now, since the
		 * page_not_uptodate path below is only entered with it held.
		 */
e292e6d6SMatthew Wilcox (Oracle)		if (unlikely(!folio_test_uptodate(folio))) {
730633f0SJan Kara			filemap_invalidate_lock_shared(mapping);
730633f0SJan Kara			mapping_locked = true;
730633f0SJan Kara		}
730633f0SJan Kara	} else {
ef00e08eSLinus Torvalds		/* No page in the page cache at all */
f8891e5eSChristoph Lameter		count_vm_event(PGMAJFAULT);
2262185cSRoman Gushchin		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ef00e08eSLinus Torvalds		ret = VM_FAULT_MAJOR;
6b4c9f44SJosef Bacik		fpin = do_sync_mmap_readahead(vmf);
ef00e08eSLinus Torvaldsretry_find:
730633f0SJan Kara		/*
e292e6d6SMatthew Wilcox (Oracle)		 * See comment in filemap_create_folio() why we need
730633f0SJan Kara		 * invalidate_lock
730633f0SJan Kara		 */
730633f0SJan Kara		if (!mapping_locked) {
730633f0SJan Kara			filemap_invalidate_lock_shared(mapping);
730633f0SJan Kara			mapping_locked = true;
730633f0SJan Kara		}
e292e6d6SMatthew Wilcox (Oracle)		folio = __filemap_get_folio(mapping, index,
a75d4c33SJosef Bacik					  FGP_CREAT|FGP_FOR_MMAP,
a75d4c33SJosef Bacik					  vmf->gfp_mask);
66dabbb6SChristoph Hellwig		if (IS_ERR(folio)) {
			/*
			 * With a pinned file the mmap_lock is already gone,
			 * so the only valid outcome is a retry; out_retry
			 * copes with folio being an error pointer.
			 */
6b4c9f44SJosef Bacik			if (fpin)
6b4c9f44SJosef Bacik				goto out_retry;
730633f0SJan Kara			filemap_invalidate_unlock_shared(mapping);
e520e932SMatthew Wilcox (Oracle)			return VM_FAULT_OOM;
1da177e4SLinus Torvalds		}
d88c0922SMichel Lespinasse	}
b522c94dSMichel Lespinasse
	/* A zero return means the folio is not locked and we must retry. */
e292e6d6SMatthew Wilcox (Oracle)	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
6b4c9f44SJosef Bacik		goto out_retry;
6b4c9f44SJosef Bacik
b522c94dSMichel Lespinasse	/* Did it get truncated? */
e292e6d6SMatthew Wilcox (Oracle)	if (unlikely(folio->mapping != mapping)) {
e292e6d6SMatthew Wilcox (Oracle)		folio_unlock(folio);
e292e6d6SMatthew Wilcox (Oracle)		folio_put(folio);
b522c94dSMichel Lespinasse		goto retry_find;
b522c94dSMichel Lespinasse	}
e292e6d6SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
b522c94dSMichel Lespinasse
1da177e4SLinus Torvalds	/*
d00806b1SNick Piggin	 * We have a locked page in the page cache, now we need to check
d00806b1SNick Piggin	 * that it's up-to-date. If not, it is going to be due to an error.
1da177e4SLinus Torvalds	 */
e292e6d6SMatthew Wilcox (Oracle)	if (unlikely(!folio_test_uptodate(folio))) {
730633f0SJan Kara		/*
730633f0SJan Kara		 * The page was in cache and uptodate and now it is not.
730633f0SJan Kara		 * Strange but possible since we didn't hold the page lock all
730633f0SJan Kara		 * the time. Let's drop everything get the invalidate lock and
730633f0SJan Kara		 * try again.
730633f0SJan Kara		 */
730633f0SJan Kara		if (!mapping_locked) {
e292e6d6SMatthew Wilcox (Oracle)			folio_unlock(folio);
e292e6d6SMatthew Wilcox (Oracle)			folio_put(folio);
730633f0SJan Kara			goto retry_find;
730633f0SJan Kara		}
1da177e4SLinus Torvalds		goto page_not_uptodate;
730633f0SJan Kara	}
1da177e4SLinus Torvalds
ef00e08eSLinus Torvalds	/*
c1e8d7c6SMichel Lespinasse	 * We've made it this far and we had to drop our mmap_lock, now is the
6b4c9f44SJosef Bacik	 * time to return to the upper layer and have it re-find the vma and
6b4c9f44SJosef Bacik	 * redo the fault.
6b4c9f44SJosef Bacik	 */
6b4c9f44SJosef Bacik	if (fpin) {
e292e6d6SMatthew Wilcox (Oracle)		folio_unlock(folio);
6b4c9f44SJosef Bacik		goto out_retry;
6b4c9f44SJosef Bacik	}
730633f0SJan Kara	if (mapping_locked)
730633f0SJan Kara		filemap_invalidate_unlock_shared(mapping);
6b4c9f44SJosef Bacik
6b4c9f44SJosef Bacik	/*
ef00e08eSLinus Torvalds	 * Found the page and have a reference on it.
ef00e08eSLinus Torvalds	 * We must recheck i_size under page lock.
ef00e08eSLinus Torvalds	 */
e292e6d6SMatthew Wilcox (Oracle)	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
e292e6d6SMatthew Wilcox (Oracle)	if (unlikely(index >= max_idx)) {
e292e6d6SMatthew Wilcox (Oracle)		folio_unlock(folio);
e292e6d6SMatthew Wilcox (Oracle)		folio_put(folio);
5307cc1aSLinus Torvalds		return VM_FAULT_SIGBUS;
d00806b1SNick Piggin	}
d00806b1SNick Piggin
	/* Success: hand back the page, still locked and with our reference. */
e292e6d6SMatthew Wilcox (Oracle)	vmf->page = folio_file_page(folio, index);
83c54070SNick Piggin	return ret | VM_FAULT_LOCKED;
1da177e4SLinus Torvalds
1da177e4SLinus Torvaldspage_not_uptodate:
1da177e4SLinus Torvalds	/*
1da177e4SLinus Torvalds	 * Umm, take care of errors if the page isn't up-to-date.
1da177e4SLinus Torvalds	 * Try to re-read it _once_. We do this synchronously,
1da177e4SLinus Torvalds	 * because there really aren't any performance issues here
1da177e4SLinus Torvalds	 * and we need to check for errors.
1da177e4SLinus Torvalds	 */
6b4c9f44SJosef Bacik	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
290e1a32SMatthew Wilcox (Oracle)	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
	/* mmap_lock was dropped for the read: out_retry puts the folio. */
6b4c9f44SJosef Bacik	if (fpin)
6b4c9f44SJosef Bacik		goto out_retry;
	/* Drop our reference; retry_find looks the folio up again. */
e292e6d6SMatthew Wilcox (Oracle)	folio_put(folio);
1da177e4SLinus Torvalds
d00806b1SNick Piggin	if (!error || error == AOP_TRUNCATED_PAGE)
d00806b1SNick Piggin		goto retry_find;
730633f0SJan Kara	filemap_invalidate_unlock_shared(mapping);
d00806b1SNick Piggin
d0217ac0SNick Piggin	return VM_FAULT_SIGBUS;
6b4c9f44SJosef Bacik
6b4c9f44SJosef Bacikout_retry:
6b4c9f44SJosef Bacik	/*
c1e8d7c6SMichel Lespinasse	 * We dropped the mmap_lock, we need to return to the fault handler to
6b4c9f44SJosef Bacik	 * re-find the vma and come back and find our hopefully still populated
6b4c9f44SJosef Bacik	 * page.
6b4c9f44SJosef Bacik	 */
	/* folio is an error pointer when we came from the allocation failure. */
38a55db9SMatthew Wilcox	if (!IS_ERR(folio))
e292e6d6SMatthew Wilcox (Oracle)		folio_put(folio);
730633f0SJan Kara	if (mapping_locked)
730633f0SJan Kara		filemap_invalidate_unlock_shared(mapping);
6b4c9f44SJosef Bacik	if (fpin)
6b4c9f44SJosef Bacik		fput(fpin);
6b4c9f44SJosef Bacik	return ret | VM_FAULT_RETRY;
54cb8821SNick Piggin}
54cb8821SNick PigginEXPORT_SYMBOL(filemap_fault);
54cb8821SNick Piggin
8808ecabSMatthew Wilcox (Oracle)static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
8808ecabSMatthew Wilcox (Oracle)		pgoff_t start)
f9ce0be7SKirill A. Shutemov{
f9ce0be7SKirill A. Shutemov	struct mm_struct *mm = vmf->vma->vm_mm;
f9ce0be7SKirill A. Shutemov
f9ce0be7SKirill A. Shutemov	/* Huge page is mapped? No need to proceed. */
f9ce0be7SKirill A. Shutemov	if (pmd_trans_huge(*vmf->pmd)) {
8808ecabSMatthew Wilcox (Oracle)		folio_unlock(folio);
8808ecabSMatthew Wilcox (Oracle)		folio_put(folio);
f9ce0be7SKirill A. Shutemov		return true;
f9ce0be7SKirill A. Shutemov	}
f9ce0be7SKirill A. Shutemov
8808ecabSMatthew Wilcox (Oracle)	if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
8808ecabSMatthew Wilcox (Oracle)		struct page *page = folio_file_page(folio, start);
f9ce0be7SKirill A. Shutemov		vm_fault_t ret = do_set_pmd(vmf, page);
f9ce0be7SKirill A. Shutemov		if (!ret) {
f9ce0be7SKirill A. Shutemov			/* The page is mapped successfully, reference consumed. */
8808ecabSMatthew Wilcox (Oracle)			folio_unlock(folio);
f9ce0be7SKirill A. Shutemov			return true;
f9ce0be7SKirill A. Shutemov		}
f9ce0be7SKirill A. Shutemov	}
f9ce0be7SKirill A. Shutemov
799f90c3SHugh Dickins	if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
03c4f204SQi Zheng		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
f9ce0be7SKirill A. Shutemov
f9ce0be7SKirill A. Shutemov	return false;
f9ce0be7SKirill A. Shutemov}
f9ce0be7SKirill A. Shutemov
/*
 * Advance @xas to the next entry, up to @end_pgoff, that is a folio which
 * can be mapped right now: it must be unlocked, uptodate, without the
 * readahead flag, still attached to @mapping, and its index must lie within
 * i_size.  Entries that fail any check are skipped.
 *
 * Returns the folio locked and with a reference taken, or NULL when the
 * range holds no further candidate.
 */
static struct folio *next_uptodate_folio(struct xa_state *xas,
		struct address_space *mapping, pgoff_t end_pgoff)
{
	struct folio *folio;

	for (folio = xas_next_entry(xas, end_pgoff); folio;
	     folio = xas_next_entry(xas, end_pgoff)) {
		unsigned long max_idx;

		/* Retry markers and value entries are not folios. */
		if (xas_retry(xas, folio) || xa_is_value(folio))
			continue;
		/* Skip a folio that is locked or whose reference we cannot take. */
		if (folio_test_locked(folio) || !folio_try_get(folio))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(folio != xas_reload(xas)))
			goto put;
		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
			goto put;
		if (!folio_trylock(folio))
			goto put;

		/* Re-validate now that the folio lock is held. */
		if (folio->mapping == mapping && folio_test_uptodate(folio)) {
			max_idx = DIV_ROUND_UP(i_size_read(mapping->host),
					       PAGE_SIZE);
			if (xas->xa_index < max_idx)
				return folio;
		}

		folio_unlock(folio);
put:
		folio_put(folio);
	}

	return NULL;
}
f9ce0be7SKirill A. Shutemov
de74976eSYin Fengwei/*
de74976eSYin Fengwei * Map page range [start_page, start_page + nr_pages) of folio.
de74976eSYin Fengwei * start_page is gotten from start by folio_page(folio, start)
de74976eSYin Fengwei */
de74976eSYin Fengweistatic vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
de74976eSYin Fengwei			struct folio *folio, unsigned long start,
c8be0380SYin Fengwei			unsigned long addr, unsigned int nr_pages,
c8be0380SYin Fengwei			unsigned int *mmap_miss)
f9ce0be7SKirill A. Shutemov{
de74976eSYin Fengwei	vm_fault_t ret = 0;
de74976eSYin Fengwei	struct page *page = folio_page(folio, start);
617c28ecSYin Fengwei	unsigned int count = 0;
617c28ecSYin Fengwei	pte_t *old_ptep = vmf->pte;
f9ce0be7SKirill A. Shutemov
de74976eSYin Fengwei	do {
617c28ecSYin Fengwei		if (PageHWPoison(page + count))
617c28ecSYin Fengwei			goto skip;
de74976eSYin Fengwei
c8be0380SYin Fengwei		(*mmap_miss)++;
de74976eSYin Fengwei
de74976eSYin Fengwei		/*
de74976eSYin Fengwei		 * NOTE: If there're PTE markers, we'll leave them to be
de74976eSYin Fengwei		 * handled in the specific fault path, and it'll prohibit the
de74976eSYin Fengwei		 * fault-around logic.
de74976eSYin Fengwei		 */
617c28ecSYin Fengwei		if (!pte_none(vmf->pte[count]))
617c28ecSYin Fengwei			goto skip;
617c28ecSYin Fengwei
617c28ecSYin Fengwei		count++;
de74976eSYin Fengwei		continue;
617c28ecSYin Fengweiskip:
617c28ecSYin Fengwei		if (count) {
617c28ecSYin Fengwei			set_pte_range(vmf, folio, page, count, addr);
617c28ecSYin Fengwei			folio_ref_add(folio, count);
a501a070SMatthew Wilcox (Oracle)			if (in_range(vmf->address, addr, count * PAGE_SIZE))
de74976eSYin Fengwei				ret = VM_FAULT_NOPAGE;
f9ce0be7SKirill A. Shutemov		}
f9ce0be7SKirill A. Shutemov
617c28ecSYin Fengwei		count++;
617c28ecSYin Fengwei		page += count;
617c28ecSYin Fengwei		vmf->pte += count;
617c28ecSYin Fengwei		addr += count * PAGE_SIZE;
617c28ecSYin Fengwei		count = 0;
617c28ecSYin Fengwei	} while (--nr_pages > 0);
de74976eSYin Fengwei
617c28ecSYin Fengwei	if (count) {
617c28ecSYin Fengwei		set_pte_range(vmf, folio, page, count, addr);
617c28ecSYin Fengwei		folio_ref_add(folio, count);
a501a070SMatthew Wilcox (Oracle)		if (in_range(vmf->address, addr, count * PAGE_SIZE))
617c28ecSYin Fengwei			ret = VM_FAULT_NOPAGE;
617c28ecSYin Fengwei	}
de74976eSYin Fengwei
617c28ecSYin Fengwei	vmf->pte = old_ptep;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	return ret;
c8be0380SYin Fengwei}
c8be0380SYin Fengwei
c8be0380SYin Fengweistatic vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
c8be0380SYin Fengwei		struct folio *folio, unsigned long addr,
c8be0380SYin Fengwei		unsigned int *mmap_miss)
c8be0380SYin Fengwei{
c8be0380SYin Fengwei	vm_fault_t ret = 0;
c8be0380SYin Fengwei	struct page *page = &folio->page;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	if (PageHWPoison(page))
c8be0380SYin Fengwei		return ret;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	(*mmap_miss)++;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	/*
c8be0380SYin Fengwei	 * NOTE: If there're PTE markers, we'll leave them to be
c8be0380SYin Fengwei	 * handled in the specific fault path, and it'll prohibit
c8be0380SYin Fengwei	 * the fault-around logic.
c8be0380SYin Fengwei	 */
c8be0380SYin Fengwei	if (!pte_none(ptep_get(vmf->pte)))
c8be0380SYin Fengwei		return ret;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	if (vmf->address == addr)
c8be0380SYin Fengwei		ret = VM_FAULT_NOPAGE;
c8be0380SYin Fengwei
c8be0380SYin Fengwei	set_pte_range(vmf, folio, page, 1, addr);
c8be0380SYin Fengwei	folio_ref_inc(folio);
de74976eSYin Fengwei
de74976eSYin Fengwei	return ret;
f9ce0be7SKirill A. Shutemov}
f9ce0be7SKirill A. Shutemov
/*
 * filemap_map_pages - install PTEs for page cache folios around a fault.
 * @vmf:         fault descriptor; vmf->pte and vmf->ptl are set up and used
 *               here while the PTE lock is held.
 * @start_pgoff: file page offset at which the walk starts.
 * @end_pgoff:   upper bound handed to next_uptodate_folio() and used to
 *               clamp the number of pages mapped from each folio.
 *
 * Walks the folios returned by next_uptodate_folio() under
 * rcu_read_lock() and maps each one through filemap_map_order0_folio()
 * or filemap_map_folio_range().  Afterwards the pages counted in the
 * local mmap_miss are subtracted from file->f_ra.mmap_miss.
 *
 * Return: the OR of the helpers' results, VM_FAULT_NOPAGE if
 * filemap_map_pmd() returned non-zero, or 0 if nothing was mapped.
 */
f9ce0be7SKirill A. Shutemovvm_fault_t filemap_map_pages(struct vm_fault *vmf,
bae473a4SKirill A. Shutemov			     pgoff_t start_pgoff, pgoff_t end_pgoff)
f1820361SKirill A. Shutemov{
f9ce0be7SKirill A. Shutemov	struct vm_area_struct *vma = vmf->vma;
f9ce0be7SKirill A. Shutemov	struct file *file = vma->vm_file;
f1820361SKirill A. Shutemov	struct address_space *mapping = file->f_mapping;
bae473a4SKirill A. Shutemov	pgoff_t last_pgoff = start_pgoff;
9d3af4b4SWill Deacon	unsigned long addr;
070e807cSMatthew Wilcox	XA_STATE(xas, &mapping->i_pages, start_pgoff);
820b05e9SMatthew Wilcox (Oracle)	struct folio *folio;
f9ce0be7SKirill A. Shutemov	vm_fault_t ret = 0;
c8be0380SYin Fengwei	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved;
f1820361SKirill A. Shutemov
f1820361SKirill A. Shutemov	rcu_read_lock();
de74976eSYin Fengwei	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
820b05e9SMatthew Wilcox (Oracle)	if (!folio)
f9ce0be7SKirill A. Shutemov		goto out;
f1820361SKirill A. Shutemov
	/* A non-zero return from filemap_map_pmd() ends the fault here. */
8808ecabSMatthew Wilcox (Oracle)	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
f9ce0be7SKirill A. Shutemov		ret = VM_FAULT_NOPAGE;
f9ce0be7SKirill A. Shutemov		goto out;
f9ce0be7SKirill A. Shutemov	}
f1820361SKirill A. Shutemov
	/* User address within this VMA that corresponds to start_pgoff. */
9d3af4b4SWill Deacon	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
9d3af4b4SWill Deacon	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	/*
	 * No PTE could be mapped and locked: drop the lock and reference
	 * that are released on every folio in the loop below, and give up.
	 */
65747aafSHugh Dickins	if (!vmf->pte) {
65747aafSHugh Dickins		folio_unlock(folio);
65747aafSHugh Dickins		folio_put(folio);
65747aafSHugh Dickins		goto out;
65747aafSHugh Dickins	}
f9ce0be7SKirill A. Shutemov	do {
de74976eSYin Fengwei		unsigned long end;
7267ec00SKirill A. Shutemov
		/*
		 * Move the address and the PTE cursor forward by however far
		 * the xarray cursor advanced since the previous folio.
		 */
9d3af4b4SWill Deacon		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
070e807cSMatthew Wilcox		vmf->pte += xas.xa_index - last_pgoff;
070e807cSMatthew Wilcox		last_pgoff = xas.xa_index;
		/*
		 * Pages of this folio from the current index up to its last
		 * page, clamped to end_pgoff (both bounds inclusive).
		 */
de74976eSYin Fengwei		end = folio->index + folio_nr_pages(folio) - 1;
de74976eSYin Fengwei		nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
f9ce0be7SKirill A. Shutemov
c8be0380SYin Fengwei		if (!folio_test_large(folio))
c8be0380SYin Fengwei			ret |= filemap_map_order0_folio(vmf,
c8be0380SYin Fengwei					folio, addr, &mmap_miss);
c8be0380SYin Fengwei		else
de74976eSYin Fengwei			ret |= filemap_map_folio_range(vmf, folio,
c8be0380SYin Fengwei					xas.xa_index - folio->index, addr,
c8be0380SYin Fengwei					nr_pages, &mmap_miss);
46bdb427SWill Deacon
820b05e9SMatthew Wilcox (Oracle)		folio_unlock(folio);
820b05e9SMatthew Wilcox (Oracle)		folio_put(folio);
c8be0380SYin Fengwei	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
f9ce0be7SKirill A. Shutemov	pte_unmap_unlock(vmf->pte, vmf->ptl);
f9ce0be7SKirill A. Shutemovout:
f1820361SKirill A. Shutemov	rcu_read_unlock();
c8be0380SYin Fengwei
	/*
	 * Subtract the pages counted above from the file's mmap_miss
	 * counter, clamping at zero so the unsigned value cannot wrap.
	 */
c8be0380SYin Fengwei	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
c8be0380SYin Fengwei	if (mmap_miss >= mmap_miss_saved)
c8be0380SYin Fengwei		WRITE_ONCE(file->f_ra.mmap_miss, 0);
c8be0380SYin Fengwei	else
c8be0380SYin Fengwei		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
c8be0380SYin Fengwei
f9ce0be7SKirill A. Shutemov	return ret;
f1820361SKirill A. Shutemov}
f1820361SKirill A. ShutemovEXPORT_SYMBOL(filemap_map_pages);
f1820361SKirill A. Shutemov
2bcd6454SSouptick Joardervm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
4fcf1c62SJan Kara{
5df1a672SChristoph Hellwig	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
960ea971SMatthew Wilcox (Oracle)	struct folio *folio = page_folio(vmf->page);
2bcd6454SSouptick Joarder	vm_fault_t ret = VM_FAULT_LOCKED;
4fcf1c62SJan Kara
5df1a672SChristoph Hellwig	sb_start_pagefault(mapping->host->i_sb);
11bac800SDave Jiang	file_update_time(vmf->vma->vm_file);
960ea971SMatthew Wilcox (Oracle)	folio_lock(folio);
960ea971SMatthew Wilcox (Oracle)	if (folio->mapping != mapping) {
960ea971SMatthew Wilcox (Oracle)		folio_unlock(folio);
4fcf1c62SJan Kara		ret = VM_FAULT_NOPAGE;
4fcf1c62SJan Kara		goto out;
4fcf1c62SJan Kara	}
14da9200SJan Kara	/*
960ea971SMatthew Wilcox (Oracle)	 * We mark the folio dirty already here so that when freeze is in
14da9200SJan Kara	 * progress, we are guaranteed that writeback during freezing will
960ea971SMatthew Wilcox (Oracle)	 * see the dirty folio and writeprotect it again.
14da9200SJan Kara	 */
960ea971SMatthew Wilcox (Oracle)	folio_mark_dirty(folio);
960ea971SMatthew Wilcox (Oracle)	folio_wait_stable(folio);
4fcf1c62SJan Karaout:
5df1a672SChristoph Hellwig	sb_end_pagefault(mapping->host->i_sb);
4fcf1c62SJan Kara	return ret;
4fcf1c62SJan Kara}
4fcf1c62SJan Kara
/*
 * VMA operations installed by generic_file_mmap(): every hook is served
 * by the corresponding filemap_*() helper from this file.
 */
f0f37e2fSAlexey Dobriyanconst struct vm_operations_struct generic_file_vm_ops = {
54cb8821SNick Piggin	.fault		= filemap_fault,
f1820361SKirill A. Shutemov	.map_pages	= filemap_map_pages,
4fcf1c62SJan Kara	.page_mkwrite	= filemap_page_mkwrite,
1da177e4SLinus Torvalds};
1da177e4SLinus Torvalds
1da177e4SLinus Torvalds/* This is used for a general mmap of a disk file */
1da177e4SLinus Torvalds
1da177e4SLinus Torvaldsint generic_file_mmap(struct file *file, struct vm_area_struct *vma)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	struct address_space *mapping = file->f_mapping;
1da177e4SLinus Torvalds
7e0a1265SMatthew Wilcox (Oracle)	if (!mapping->a_ops->read_folio)
1da177e4SLinus Torvalds		return -ENOEXEC;
1da177e4SLinus Torvalds	file_accessed(file);
1da177e4SLinus Torvalds	vma->vm_ops = &generic_file_vm_ops;
1da177e4SLinus Torvalds	return 0;
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds
1da177e4SLinus Torvalds/*
1da177e4SLinus Torvalds * This is for filesystems which do not implement ->writepage.
1da177e4SLinus Torvalds */
1da177e4SLinus Torvaldsint generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1da177e4SLinus Torvalds		return -EINVAL;
1da177e4SLinus Torvalds	return generic_file_mmap(file, vma);
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds#else
/*
 * Variant built in the #else branch of the CONFIG_MMU conditional:
 * always fails the fault with VM_FAULT_SIGBUS.
 */
4b96a37dSSouptick Joardervm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
45397228SArnd Bergmann{
4b96a37dSSouptick Joarder	return VM_FAULT_SIGBUS;
45397228SArnd Bergmann}
/*
 * Variant built in the #else branch of the CONFIG_MMU conditional:
 * the request is rejected with -ENOSYS and the VMA is left untouched.
 */
1da177e4SLinus Torvaldsint generic_file_mmap(struct file *file, struct vm_area_struct *vma)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	return -ENOSYS;
1da177e4SLinus Torvalds}
/*
 * Variant built in the #else branch of the CONFIG_MMU conditional:
 * the request is rejected with -ENOSYS without looking at the VMA flags.
 */
1da177e4SLinus Torvaldsint generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	return -ENOSYS;
1da177e4SLinus Torvalds}
1da177e4SLinus Torvalds#endif /* CONFIG_MMU */
1da177e4SLinus Torvalds
/*
 * These exports sit after the "#endif" of the CONFIG_MMU conditional, so
 * they apply to whichever variant of each function was compiled.
 */
45397228SArnd BergmannEXPORT_SYMBOL(filemap_page_mkwrite);
1da177e4SLinus TorvaldsEXPORT_SYMBOL(generic_file_mmap);
1da177e4SLinus TorvaldsEXPORT_SYMBOL(generic_file_readonly_mmap);
1da177e4SLinus Torvalds
539a3322SMatthew Wilcox (Oracle)static struct folio *do_read_cache_folio(struct address_space *mapping,
e9b5b23eSMatthew Wilcox (Oracle)		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
67f9fd91SSasha Levin{
539a3322SMatthew Wilcox (Oracle)	struct folio *folio;
1da177e4SLinus Torvalds	int err;
07950008SMatthew Wilcox (Oracle)
07950008SMatthew Wilcox (Oracle)	if (!filler)
07950008SMatthew Wilcox (Oracle)		filler = mapping->a_ops->read_folio;
1da177e4SLinus Torvaldsrepeat:
539a3322SMatthew Wilcox (Oracle)	folio = filemap_get_folio(mapping, index);
66dabbb6SChristoph Hellwig	if (IS_ERR(folio)) {
539a3322SMatthew Wilcox (Oracle)		folio = filemap_alloc_folio(gfp, 0);
539a3322SMatthew Wilcox (Oracle)		if (!folio)
1da177e4SLinus Torvalds			return ERR_PTR(-ENOMEM);
539a3322SMatthew Wilcox (Oracle)		err = filemap_add_folio(mapping, folio, index, gfp);
eb2be189SNick Piggin		if (unlikely(err)) {
539a3322SMatthew Wilcox (Oracle)			folio_put(folio);
1da177e4SLinus Torvalds			if (err == -EEXIST)
1da177e4SLinus Torvalds				goto repeat;
22ecdb4fSMatthew Wilcox			/* Presumably ENOMEM for xarray node */
1da177e4SLinus Torvalds			return ERR_PTR(err);
1da177e4SLinus Torvalds		}
32b63529SMel Gorman
9bc3e869SMatthew Wilcox (Oracle)		goto filler;
32b63529SMel Gorman	}
539a3322SMatthew Wilcox (Oracle)	if (folio_test_uptodate(folio))
1da177e4SLinus Torvalds		goto out;
1da177e4SLinus Torvalds
81f4c03bSMatthew Wilcox (Oracle)	if (!folio_trylock(folio)) {
81f4c03bSMatthew Wilcox (Oracle)		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
81f4c03bSMatthew Wilcox (Oracle)		goto repeat;
81f4c03bSMatthew Wilcox (Oracle)	}
ebded027SMel Gorman
81f4c03bSMatthew Wilcox (Oracle)	/* Folio was truncated from mapping */
539a3322SMatthew Wilcox (Oracle)	if (!folio->mapping) {
539a3322SMatthew Wilcox (Oracle)		folio_unlock(folio);
539a3322SMatthew Wilcox (Oracle)		folio_put(folio);
32b63529SMel Gorman		goto repeat;
1da177e4SLinus Torvalds	}
ebded027SMel Gorman
ebded027SMel Gorman	/* Someone else locked and filled the page in a very small window */
539a3322SMatthew Wilcox (Oracle)	if (folio_test_uptodate(folio)) {
539a3322SMatthew Wilcox (Oracle)		folio_unlock(folio);
1da177e4SLinus Torvalds		goto out;
1da177e4SLinus Torvalds	}
faffdfa0SXianting Tian
9bc3e869SMatthew Wilcox (Oracle)filler:
290e1a32SMatthew Wilcox (Oracle)	err = filemap_read_folio(file, filler, folio);
1dfa24a4SMatthew Wilcox (Oracle)	if (err) {
9bc3e869SMatthew Wilcox (Oracle)		folio_put(folio);
1dfa24a4SMatthew Wilcox (Oracle)		if (err == AOP_TRUNCATED_PAGE)
1dfa24a4SMatthew Wilcox (Oracle)			goto repeat;
9bc3e869SMatthew Wilcox (Oracle)		return ERR_PTR(err);
9bc3e869SMatthew Wilcox (Oracle)	}
32b63529SMel Gorman
1da177e4SLinus Torvaldsout:
539a3322SMatthew Wilcox (Oracle)	folio_mark_accessed(folio);
539a3322SMatthew Wilcox (Oracle)	return folio;
6fe6900eSNick Piggin}
0531b2aaSLinus Torvalds
0531b2aaSLinus Torvalds/**
e9b5b23eSMatthew Wilcox (Oracle) * read_cache_folio - Read into page cache, fill it if needed.
e9b5b23eSMatthew Wilcox (Oracle) * @mapping: The address_space to read from.
e9b5b23eSMatthew Wilcox (Oracle) * @index: The index to read.
e9b5b23eSMatthew Wilcox (Oracle) * @filler: Function to perform the read, or NULL to use aops->read_folio().
e9b5b23eSMatthew Wilcox (Oracle) * @file: Passed to filler function, may be NULL if not required.
0531b2aaSLinus Torvalds *
e9b5b23eSMatthew Wilcox (Oracle) * Read one page into the page cache.  If it succeeds, the folio returned
e9b5b23eSMatthew Wilcox (Oracle) * will contain @index, but it may not be the first page of the folio.
0531b2aaSLinus Torvalds *
e9b5b23eSMatthew Wilcox (Oracle) * If the filler function returns an error, it will be returned to the
e9b5b23eSMatthew Wilcox (Oracle) * caller.
a862f68aSMike Rapoport *
e9b5b23eSMatthew Wilcox (Oracle) * Context: May sleep.  Expects mapping->invalidate_lock to be held.
e9b5b23eSMatthew Wilcox (Oracle) * Return: An uptodate folio on success, ERR_PTR() on failure.
0531b2aaSLinus Torvalds */
539a3322SMatthew Wilcox (Oracle)struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
e9b5b23eSMatthew Wilcox (Oracle)		filler_t filler, struct file *file)
539a3322SMatthew Wilcox (Oracle){
e9b5b23eSMatthew Wilcox (Oracle)	return do_read_cache_folio(mapping, index, filler, file,
539a3322SMatthew Wilcox (Oracle)			mapping_gfp_mask(mapping));
539a3322SMatthew Wilcox (Oracle)}
539a3322SMatthew Wilcox (Oracle)EXPORT_SYMBOL(read_cache_folio);
539a3322SMatthew Wilcox (Oracle)
3e629597SMatthew Wilcox (Oracle)/**
3e629597SMatthew Wilcox (Oracle) * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
3e629597SMatthew Wilcox (Oracle) * @mapping:	The address_space for the folio.
3e629597SMatthew Wilcox (Oracle) * @index:	The index that the allocated folio will contain.
3e629597SMatthew Wilcox (Oracle) * @gfp:	The page allocator flags to use if allocating.
3e629597SMatthew Wilcox (Oracle) *
3e629597SMatthew Wilcox (Oracle) * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
3e629597SMatthew Wilcox (Oracle) * any new memory allocations done using the specified allocation flags.
3e629597SMatthew Wilcox (Oracle) *
3e629597SMatthew Wilcox (Oracle) * The most likely error from this function is EIO, but ENOMEM is
3e629597SMatthew Wilcox (Oracle) * possible and so is EINTR.  If ->read_folio returns another error,
3e629597SMatthew Wilcox (Oracle) * that will be returned to the caller.
3e629597SMatthew Wilcox (Oracle) *
3e629597SMatthew Wilcox (Oracle) * The function expects mapping->invalidate_lock to be already held.
3e629597SMatthew Wilcox (Oracle) *
3e629597SMatthew Wilcox (Oracle) * Return: Uptodate folio on success, ERR_PTR() on failure.
3e629597SMatthew Wilcox (Oracle) */
3e629597SMatthew Wilcox (Oracle)struct folio *mapping_read_folio_gfp(struct address_space *mapping,
3e629597SMatthew Wilcox (Oracle)		pgoff_t index, gfp_t gfp)
3e629597SMatthew Wilcox (Oracle){
3e629597SMatthew Wilcox (Oracle)	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
3e629597SMatthew Wilcox (Oracle)}
3e629597SMatthew Wilcox (Oracle)EXPORT_SYMBOL(mapping_read_folio_gfp);
3e629597SMatthew Wilcox (Oracle)
539a3322SMatthew Wilcox (Oracle)static struct page *do_read_cache_page(struct address_space *mapping,
e9b5b23eSMatthew Wilcox (Oracle)		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
539a3322SMatthew Wilcox (Oracle){
539a3322SMatthew Wilcox (Oracle)	struct folio *folio;
539a3322SMatthew Wilcox (Oracle)
e9b5b23eSMatthew Wilcox (Oracle)	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
539a3322SMatthew Wilcox (Oracle)	if (IS_ERR(folio))
539a3322SMatthew Wilcox (Oracle)		return &folio->page;
539a3322SMatthew Wilcox (Oracle)	return folio_file_page(folio, index);
539a3322SMatthew Wilcox (Oracle)}
539a3322SMatthew Wilcox (Oracle)
67f9fd91SSasha Levinstruct page *read_cache_page(struct address_space *mapping,
e9b5b23eSMatthew Wilcox (Oracle)			pgoff_t index, filler_t *filler, struct file *file)
0531b2aaSLinus Torvalds{
e9b5b23eSMatthew Wilcox (Oracle)	return do_read_cache_page(mapping, index, filler, file,
d322a8e5SChristoph Hellwig			mapping_gfp_mask(mapping));
0531b2aaSLinus Torvalds}
67f9fd91SSasha LevinEXPORT_SYMBOL(read_cache_page);
0531b2aaSLinus Torvalds
0531b2aaSLinus Torvalds/**
0531b2aaSLinus Torvalds * read_cache_page_gfp - read into page cache, using specified page allocation flags.
0531b2aaSLinus Torvalds * @mapping:	the page's address_space
0531b2aaSLinus Torvalds * @index:	the page index
0531b2aaSLinus Torvalds * @gfp:	the page allocator flags to use if allocating
0531b2aaSLinus Torvalds *
0531b2aaSLinus Torvalds * This is the same as "read_mapping_page(mapping, index, NULL)", but with
e6f67b8cSDave Kleikamp * any new page allocations done using the specified allocation flags.
0531b2aaSLinus Torvalds *
0531b2aaSLinus Torvalds * If the page does not get brought uptodate, return -EIO.
a862f68aSMike Rapoport *
730633f0SJan Kara * The function expects mapping->invalidate_lock to be already held.
730633f0SJan Kara *
a862f68aSMike Rapoport * Return: up to date page on success, ERR_PTR() on failure.
0531b2aaSLinus Torvalds */
0531b2aaSLinus Torvaldsstruct page *read_cache_page_gfp(struct address_space *mapping,
0531b2aaSLinus Torvalds				pgoff_t index,
0531b2aaSLinus Torvalds				gfp_t gfp)
0531b2aaSLinus Torvalds{
6c45b454SChristoph Hellwig	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
0531b2aaSLinus Torvalds}
0531b2aaSLinus TorvaldsEXPORT_SYMBOL(read_cache_page_gfp);
0531b2aaSLinus Torvalds
a92853b6SKonstantin Khlebnikov/*
a92853b6SKonstantin Khlebnikov * Warn about a page cache invalidation failure during a direct I/O write.
a92853b6SKonstantin Khlebnikov */
c402a9a9SChristoph Hellwigstatic void dio_warn_stale_pagecache(struct file *filp)
a92853b6SKonstantin Khlebnikov{
a92853b6SKonstantin Khlebnikov	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
a92853b6SKonstantin Khlebnikov	char pathname[128];
a92853b6SKonstantin Khlebnikov	char *path;
a92853b6SKonstantin Khlebnikov
5df1a672SChristoph Hellwig	errseq_set(&filp->f_mapping->wb_err, -EIO);
a92853b6SKonstantin Khlebnikov	if (__ratelimit(&_rs)) {
a92853b6SKonstantin Khlebnikov		path = file_path(filp, pathname, sizeof(pathname));
a92853b6SKonstantin Khlebnikov		if (IS_ERR(path))
a92853b6SKonstantin Khlebnikov			path = "(unknown)";
a92853b6SKonstantin Khlebnikov		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
a92853b6SKonstantin Khlebnikov		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
a92853b6SKonstantin Khlebnikov			current->comm);
a92853b6SKonstantin Khlebnikov	}
a92853b6SKonstantin Khlebnikov}
a92853b6SKonstantin Khlebnikov
c402a9a9SChristoph Hellwigvoid kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
c402a9a9SChristoph Hellwig{
c402a9a9SChristoph Hellwig	struct address_space *mapping = iocb->ki_filp->f_mapping;
c402a9a9SChristoph Hellwig
c402a9a9SChristoph Hellwig	if (mapping->nrpages &&
c402a9a9SChristoph Hellwig	    invalidate_inode_pages2_range(mapping,
c402a9a9SChristoph Hellwig			iocb->ki_pos >> PAGE_SHIFT,
c402a9a9SChristoph Hellwig			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
c402a9a9SChristoph Hellwig		dio_warn_stale_pagecache(iocb->ki_filp);
c402a9a9SChristoph Hellwig}
c402a9a9SChristoph Hellwig
1da177e4SLinus Torvaldsssize_t
1af5bb49SChristoph Hellwiggeneric_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
1da177e4SLinus Torvalds{
c402a9a9SChristoph Hellwig	struct address_space *mapping = iocb->ki_filp->f_mapping;
c402a9a9SChristoph Hellwig	size_t write_len = iov_iter_count(from);
1da177e4SLinus Torvalds	ssize_t written;
1da177e4SLinus Torvalds
6ccfa806SHisashi Hifumi	/*
6ccfa806SHisashi Hifumi	 * If a page can not be invalidated, return 0 to fall back
6ccfa806SHisashi Hifumi	 * to buffered write.
6ccfa806SHisashi Hifumi	 */
e003f74aSChristoph Hellwig	written = kiocb_invalidate_pages(iocb, write_len);
6ccfa806SHisashi Hifumi	if (written) {
6ccfa806SHisashi Hifumi		if (written == -EBUSY)
6ccfa806SHisashi Hifumi			return 0;
c402a9a9SChristoph Hellwig		return written;
a969e903SChristoph Hellwig	}
a969e903SChristoph Hellwig
639a93a5SAl Viro	written = mapping->a_ops->direct_IO(iocb, from);
a969e903SChristoph Hellwig
a969e903SChristoph Hellwig	/*
a969e903SChristoph Hellwig	 * Finally, try again to invalidate clean pages which might have been
a969e903SChristoph Hellwig	 * cached by non-direct readahead, or faulted in by get_user_pages()
a969e903SChristoph Hellwig	 * if the source of the write was an mmap'ed region of the file
a969e903SChristoph Hellwig	 * we're writing.  Either one is a pretty crazy thing to do,
a969e903SChristoph Hellwig	 * so we don't support it 100%.  If this invalidation
a969e903SChristoph Hellwig	 * fails, tough, the write still worked...
332391a9SLukas Czerner	 *
332391a9SLukas Czerner	 * Most of the time we do not need this since dio_complete() will do
332391a9SLukas Czerner	 * the invalidation for us. However there are some file systems that
332391a9SLukas Czerner	 * do not end up with dio_complete() being called, so let's not break
80c1fe90SKonstantin Khlebnikov	 * them by removing it completely.
80c1fe90SKonstantin Khlebnikov	 *
9266a140SKonstantin Khlebnikov	 * Noticeable example is a blkdev_direct_IO().
9266a140SKonstantin Khlebnikov	 *
80c1fe90SKonstantin Khlebnikov	 * Skip invalidation for async writes or if mapping has no pages.
a969e903SChristoph Hellwig	 */
1da177e4SLinus Torvalds	if (written > 0) {
c402a9a9SChristoph Hellwig		struct inode *inode = mapping->host;
c402a9a9SChristoph Hellwig		loff_t pos = iocb->ki_pos;
c402a9a9SChristoph Hellwig
c402a9a9SChristoph Hellwig		kiocb_invalidate_post_direct_write(iocb, written);
0116651cSNamhyung Kim		pos += written;
639a93a5SAl Viro		write_len -= written;
0116651cSNamhyung Kim		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
0116651cSNamhyung Kim			i_size_write(inode, pos);
1da177e4SLinus Torvalds			mark_inode_dirty(inode);
1da177e4SLinus Torvalds		}
5cb6c6c7SAl Viro		iocb->ki_pos = pos;
1da177e4SLinus Torvalds	}
ab2125dfSPavel Begunkov	if (written != -EIOCBQUEUED)
639a93a5SAl Viro		iov_iter_revert(from, write_len - iov_iter_count(from));
1da177e4SLinus Torvalds	return written;
1da177e4SLinus Torvalds}
1da177e4SLinus TorvaldsEXPORT_SYMBOL(generic_file_direct_write);
1da177e4SLinus Torvalds
/*
 * generic_perform_write - buffered write of an iov_iter into the page cache.
 * @iocb: supplies the file (ki_filp) and the starting position (ki_pos).
 * @i:    source of the data; advanced as data is written.
 *
 * Works in chunks of at most one page.  Each pass faults in the source,
 * calls ->write_begin(), copies the data with
 * copy_page_from_iter_atomic() and finishes with ->write_end().  When
 * ->write_end() reports that nothing was written the same chunk is
 * retried, shortened to the number of bytes that were actually copied.
 *
 * Return: the number of bytes written, in which case iocb->ki_pos has
 * been advanced by that amount; if nothing was written, the last status
 * (0 or a negative error code).
 */
800ba295SMatthew Wilcox (Oracle)ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
afddba49SNick Piggin{
800ba295SMatthew Wilcox (Oracle)	struct file *file = iocb->ki_filp;
800ba295SMatthew Wilcox (Oracle)	loff_t pos = iocb->ki_pos;
afddba49SNick Piggin	struct address_space *mapping = file->f_mapping;
afddba49SNick Piggin	const struct address_space_operations *a_ops = mapping->a_ops;
afddba49SNick Piggin	long status = 0;
afddba49SNick Piggin	ssize_t written = 0;
674b892eSNick Piggin
afddba49SNick Piggin	do {
afddba49SNick Piggin		struct page *page;
afddba49SNick Piggin		unsigned long offset;	/* Offset into pagecache page */
afddba49SNick Piggin		unsigned long bytes;	/* Bytes to write to page */
afddba49SNick Piggin		size_t copied;		/* Bytes copied from user */
1468c6f4SAlexander Potapenko		void *fsdata = NULL;
afddba49SNick Piggin
		/* Never cross a page boundary, never exceed what @i holds. */
09cbfeafSKirill A. Shutemov		offset = (pos & (PAGE_SIZE - 1));
09cbfeafSKirill A. Shutemov		bytes = min_t(unsigned long, PAGE_SIZE - offset,
afddba49SNick Piggin						iov_iter_count(i));
afddba49SNick Piggin
afddba49SNick Pigginagain:
00a3d660SLinus Torvalds		/*
00a3d660SLinus Torvalds		 * Bring in the user page that we will copy from _first_.
00a3d660SLinus Torvalds		 * Otherwise there's a nasty deadlock on copying from the
00a3d660SLinus Torvalds		 * same page as we're writing to, without it being marked
00a3d660SLinus Torvalds		 * up-to-date.
00a3d660SLinus Torvalds		 */
631f871fSAndreas Gruenbacher		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
00a3d660SLinus Torvalds			status = -EFAULT;
00a3d660SLinus Torvalds			break;
00a3d660SLinus Torvalds		}
00a3d660SLinus Torvalds
296291cdSJan Kara		if (fatal_signal_pending(current)) {
296291cdSJan Kara			status = -EINTR;
296291cdSJan Kara			break;
296291cdSJan Kara		}
296291cdSJan Kara
		/*
		 * ->write_begin() hands back the target page in @page and an
		 * opaque @fsdata cookie that is passed on to ->write_end().
		 */
9d6b0cd7SMatthew Wilcox (Oracle)		status = a_ops->write_begin(file, mapping, pos, bytes,
afddba49SNick Piggin						&page, &fsdata);
2457aec6SMel Gorman		if (unlikely(status < 0))
afddba49SNick Piggin			break;
afddba49SNick Piggin
931e80e4Sanfei zhou		if (mapping_writably_mapped(mapping))
931e80e4Sanfei zhou			flush_dcache_page(page);
00a3d660SLinus Torvalds
f0b65f39SAl Viro		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
afddba49SNick Piggin		flush_dcache_page(page);
afddba49SNick Piggin
afddba49SNick Piggin		status = a_ops->write_end(file, mapping, pos, bytes, copied,
afddba49SNick Piggin						page, fsdata);
		/*
		 * ->write_end() did not report exactly the copied amount:
		 * rewind @i by the difference between what was copied and
		 * what was reported (a negative status counts as zero).
		 */
f0b65f39SAl Viro		if (unlikely(status != copied)) {
f0b65f39SAl Viro			iov_iter_revert(i, copied - max(status, 0L));
afddba49SNick Piggin			if (unlikely(status < 0))
afddba49SNick Piggin				break;
f0b65f39SAl Viro		}
afddba49SNick Piggin		cond_resched();
afddba49SNick Piggin
bc1bb416SAl Viro		if (unlikely(status == 0)) {
afddba49SNick Piggin			/*
bc1bb416SAl Viro			 * A short copy made ->write_end() reject the
bc1bb416SAl Viro			 * thing entirely.  Might be memory poisoning
bc1bb416SAl Viro			 * halfway through, might be a race with munmap,
bc1bb416SAl Viro			 * might be severe memory pressure.
afddba49SNick Piggin			 */
bc1bb416SAl Viro			if (copied)
bc1bb416SAl Viro				bytes = copied;
afddba49SNick Piggin			goto again;
afddba49SNick Piggin		}
f0b65f39SAl Viro		pos += status;
f0b65f39SAl Viro		written += status;
afddba49SNick Piggin
afddba49SNick Piggin		balance_dirty_pages_ratelimited(mapping);
afddba49SNick Piggin	} while (iov_iter_count(i));
afddba49SNick Piggin
	/* Nothing written at all: report the last status instead. */
182c25e9SChristoph Hellwig	if (!written)
182c25e9SChristoph Hellwig		return status;
182c25e9SChristoph Hellwig	iocb->ki_pos += written;
182c25e9SChristoph Hellwig	return written;
afddba49SNick Piggin}
3b93f911SAl ViroEXPORT_SYMBOL(generic_perform_write);
1da177e4SLinus Torvalds
e4dd9de3SJan Kara/**
8174202bSAl Viro * __generic_file_write_iter - write data to a file
e4dd9de3SJan Kara * @iocb:	IO state structure (file, offset, etc.)
8174202bSAl Viro * @from:	iov_iter with data to write
e4dd9de3SJan Kara *
e4dd9de3SJan Kara * This function does all the work needed for actually writing data to a
e4dd9de3SJan Kara * file. It does all basic checks, removes SUID from the file, updates
e4dd9de3SJan Kara * modification times and calls proper subroutines depending on whether we
e4dd9de3SJan Kara * do direct IO or a standard buffered write.
e4dd9de3SJan Kara *
9608703eSJan Kara * It expects i_rwsem to be grabbed unless we work on a block device or similar
e4dd9de3SJan Kara * object which does not need locking at all.
e4dd9de3SJan Kara *
e4dd9de3SJan Kara * This function does *not* take care of syncing data in case of O_SYNC write.
e4dd9de3SJan Kara * A caller has to handle it. This is mainly due to the fact that we want to
9608703eSJan Kara * avoid syncing under i_rwsem.
a862f68aSMike Rapoport *
a862f68aSMike Rapoport * Return:
a862f68aSMike Rapoport * * number of bytes written, even for truncated writes
a862f68aSMike Rapoport * * negative error code if no data has been written at all
e4dd9de3SJan Kara */
8174202bSAl Virossize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	struct file *file = iocb->ki_filp;
fb5527e6SJeff Moyer	struct address_space *mapping = file->f_mapping;
1da177e4SLinus Torvalds	struct inode *inode = mapping->host;
44fff0faSChristoph Hellwig	ssize_t ret;
1da177e4SLinus Torvalds
44fff0faSChristoph Hellwig	ret = file_remove_privs(file);
44fff0faSChristoph Hellwig	if (ret)
44fff0faSChristoph Hellwig		return ret;
1da177e4SLinus Torvalds
44fff0faSChristoph Hellwig	ret = file_update_time(file);
44fff0faSChristoph Hellwig	if (ret)
44fff0faSChristoph Hellwig		return ret;
1da177e4SLinus Torvalds
2ba48ce5SAl Viro	if (iocb->ki_flags & IOCB_DIRECT) {
44fff0faSChristoph Hellwig		ret = generic_file_direct_write(iocb, from);
fbbbad4bSMatthew Wilcox		/*
fbbbad4bSMatthew Wilcox		 * If the write stopped short of completing, fall back to
fbbbad4bSMatthew Wilcox		 * buffered writes.  Some filesystems do this for writes to
fbbbad4bSMatthew Wilcox		 * holes, for example.  For DAX files, a buffered write will
fbbbad4bSMatthew Wilcox		 * not succeed (even if it did, DAX does not handle dirty
fbbbad4bSMatthew Wilcox		 * page-cache pages correctly).
fbbbad4bSMatthew Wilcox		 */
44fff0faSChristoph Hellwig		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
44fff0faSChristoph Hellwig			return ret;
44fff0faSChristoph Hellwig		return direct_write_fallback(iocb, from, ret,
44fff0faSChristoph Hellwig				generic_perform_write(iocb, from));
44fff0faSChristoph Hellwig	}
3b93f911SAl Viro
44fff0faSChristoph Hellwig	return generic_perform_write(iocb, from);
1da177e4SLinus Torvalds}
8174202bSAl ViroEXPORT_SYMBOL(__generic_file_write_iter);
1da177e4SLinus Torvalds
e4dd9de3SJan Kara/**
8174202bSAl Viro * generic_file_write_iter - write data to a file
e4dd9de3SJan Kara * @iocb:	IO state structure
8174202bSAl Viro * @from:	iov_iter with data to write
e4dd9de3SJan Kara *
8174202bSAl Viro * This is a wrapper around __generic_file_write_iter() to be used by most
e4dd9de3SJan Kara * filesystems. It takes care of syncing the file in case of O_SYNC file
9608703eSJan Kara * and acquires i_rwsem as needed.
a862f68aSMike Rapoport * Return:
a862f68aSMike Rapoport * * negative error code if no data has been written at all of
a862f68aSMike Rapoport *   vfs_fsync_range() failed for a synchronous write
a862f68aSMike Rapoport * * number of bytes written, even for truncated writes
e4dd9de3SJan Kara */
8174202bSAl Virossize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4SLinus Torvalds{
1da177e4SLinus Torvalds	struct file *file = iocb->ki_filp;
148f948bSJan Kara	struct inode *inode = file->f_mapping->host;
1da177e4SLinus Torvalds	ssize_t ret;
1da177e4SLinus Torvalds
5955102cSAl Viro	inode_lock(inode);
3309dd04SAl Viro	ret = generic_write_checks(iocb, from);
3309dd04SAl Viro	if (ret > 0)
8174202bSAl Viro		ret = __generic_file_write_iter(iocb, from);
5955102cSAl Viro	inode_unlock(inode);
1da177e4SLinus Torvalds
e2592217SChristoph Hellwig	if (ret > 0)
e2592217SChristoph Hellwig		ret = generic_write_sync(iocb, ret);
1da177e4SLinus Torvalds	return ret;
1da177e4SLinus Torvalds}
8174202bSAl ViroEXPORT_SYMBOL(generic_file_write_iter);
1da177e4SLinus Torvalds
cf9a2ae8SDavid Howells/**
82c50f8bSMatthew Wilcox (Oracle) * filemap_release_folio() - Release fs-specific metadata on a folio.
82c50f8bSMatthew Wilcox (Oracle) * @folio: The folio which the kernel is trying to free.
82c50f8bSMatthew Wilcox (Oracle) * @gfp: Memory allocation flags (and I/O mode).
cf9a2ae8SDavid Howells *
82c50f8bSMatthew Wilcox (Oracle) * The address_space is trying to release any data attached to a folio
82c50f8bSMatthew Wilcox (Oracle) * (presumably at folio->private).
cf9a2ae8SDavid Howells *
82c50f8bSMatthew Wilcox (Oracle) * This will also be called if the private_2 flag is set on a page,
82c50f8bSMatthew Wilcox (Oracle) * indicating that the folio has other metadata associated with it.
cf9a2ae8SDavid Howells *
82c50f8bSMatthew Wilcox (Oracle) * The @gfp argument specifies whether I/O may be performed to release
82c50f8bSMatthew Wilcox (Oracle) * this page (__GFP_IO), and whether the call may block
82c50f8bSMatthew Wilcox (Oracle) * (__GFP_RECLAIM & __GFP_FS).
266cf658SDavid Howells *
82c50f8bSMatthew Wilcox (Oracle) * Return: %true if the release was successful, otherwise %false.
cf9a2ae8SDavid Howells */
82c50f8bSMatthew Wilcox (Oracle)bool filemap_release_folio(struct folio *folio, gfp_t gfp)
cf9a2ae8SDavid Howells{
82c50f8bSMatthew Wilcox (Oracle)	struct address_space * const mapping = folio->mapping;
cf9a2ae8SDavid Howells
82c50f8bSMatthew Wilcox (Oracle)	BUG_ON(!folio_test_locked(folio));
0201ebf2SDavid Howells	if (!folio_needs_release(folio))
0201ebf2SDavid Howells		return true;
82c50f8bSMatthew Wilcox (Oracle)	if (folio_test_writeback(folio))
82c50f8bSMatthew Wilcox (Oracle)		return false;
cf9a2ae8SDavid Howells
fa29000bSMatthew Wilcox (Oracle)	if (mapping && mapping->a_ops->release_folio)
fa29000bSMatthew Wilcox (Oracle)		return mapping->a_ops->release_folio(folio, gfp);
68189fefSMatthew Wilcox (Oracle)	return try_to_free_buffers(folio);
cf9a2ae8SDavid Howells}
82c50f8bSMatthew Wilcox (Oracle)EXPORT_SYMBOL(filemap_release_folio);
cf264e13SNhat Pham
cf264e13SNhat Pham#ifdef CONFIG_CACHESTAT_SYSCALL
cf264e13SNhat Pham/**
cf264e13SNhat Pham * filemap_cachestat() - compute the page cache statistics of a mapping
cf264e13SNhat Pham * @mapping:	The mapping to compute the statistics for.
cf264e13SNhat Pham * @first_index:	The starting page cache index.
cf264e13SNhat Pham * @last_index:	The final page index (inclusive).
cf264e13SNhat Pham * @cs:	the cachestat struct to write the result to.
cf264e13SNhat Pham *
cf264e13SNhat Pham * This will query the page cache statistics of a mapping in the
cf264e13SNhat Pham * page range of [first_index, last_index] (inclusive). The statistics
cf264e13SNhat Pham * queried include: number of dirty pages, number of pages marked for
cf264e13SNhat Pham * writeback, and the number of (recently) evicted pages.
cf264e13SNhat Pham */
cf264e13SNhat Phamstatic void filemap_cachestat(struct address_space *mapping,
cf264e13SNhat Pham		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
cf264e13SNhat Pham{
cf264e13SNhat Pham	XA_STATE(xas, &mapping->i_pages, first_index);
cf264e13SNhat Pham	struct folio *folio;
cf264e13SNhat Pham
cf264e13SNhat Pham	rcu_read_lock();
cf264e13SNhat Pham	xas_for_each(&xas, folio, last_index) {
ba60fdf7SNhat Pham		int order;
cf264e13SNhat Pham		unsigned long nr_pages;
cf264e13SNhat Pham		pgoff_t folio_first_index, folio_last_index;
cf264e13SNhat Pham
ba60fdf7SNhat Pham		/*
ba60fdf7SNhat Pham		 * Don't deref the folio. It is not pinned, and might
ba60fdf7SNhat Pham		 * get freed (and reused) underneath us.
ba60fdf7SNhat Pham		 *
ba60fdf7SNhat Pham		 * We *could* pin it, but that would be expensive for
ba60fdf7SNhat Pham		 * what should be a fast and lightweight syscall.
ba60fdf7SNhat Pham		 *
ba60fdf7SNhat Pham		 * Instead, derive all information of interest from
ba60fdf7SNhat Pham		 * the rcu-protected xarray.
ba60fdf7SNhat Pham		 */
ba60fdf7SNhat Pham
cf264e13SNhat Pham		if (xas_retry(&xas, folio))
cf264e13SNhat Pham			continue;
cf264e13SNhat Pham
ba60fdf7SNhat Pham		order = xa_get_order(xas.xa, xas.xa_index);
cf264e13SNhat Pham		nr_pages = 1 << order;
cf264e13SNhat Pham		folio_first_index = round_down(xas.xa_index, 1 << order);
cf264e13SNhat Pham		folio_last_index = folio_first_index + nr_pages - 1;
cf264e13SNhat Pham
cf264e13SNhat Pham		/* Folios might straddle the range boundaries, only count covered pages */
cf264e13SNhat Pham		if (folio_first_index < first_index)
cf264e13SNhat Pham			nr_pages -= first_index - folio_first_index;
cf264e13SNhat Pham
cf264e13SNhat Pham		if (folio_last_index > last_index)
cf264e13SNhat Pham			nr_pages -= folio_last_index - last_index;
cf264e13SNhat Pham
ba60fdf7SNhat Pham		if (xa_is_value(folio)) {
ba60fdf7SNhat Pham			/* page is evicted */
ba60fdf7SNhat Pham			void *shadow = (void *)folio;
ba60fdf7SNhat Pham			bool workingset; /* not used */
ba60fdf7SNhat Pham
cf264e13SNhat Pham			cs->nr_evicted += nr_pages;
cf264e13SNhat Pham
cf264e13SNhat Pham#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
cf264e13SNhat Pham			if (shmem_mapping(mapping)) {
cf264e13SNhat Pham				/* shmem file - in swap cache */
cf264e13SNhat Pham				swp_entry_t swp = radix_to_swp_entry(folio);
cf264e13SNhat Pham
b79f9e1fSJohannes Weiner				/* swapin error results in poisoned entry */
b79f9e1fSJohannes Weiner				if (non_swap_entry(swp))
b79f9e1fSJohannes Weiner					goto resched;
b79f9e1fSJohannes Weiner
b79f9e1fSJohannes Weiner				/*
b79f9e1fSJohannes Weiner				 * Getting a swap entry from the shmem
b79f9e1fSJohannes Weiner				 * inode means we beat
b79f9e1fSJohannes Weiner				 * shmem_unuse(). rcu_read_lock()
b79f9e1fSJohannes Weiner				 * ensures swapoff waits for us before
b79f9e1fSJohannes Weiner				 * freeing the swapper space. However,
b79f9e1fSJohannes Weiner				 * we can race with swapping and
b79f9e1fSJohannes Weiner				 * invalidation, so there might not be
b79f9e1fSJohannes Weiner				 * a shadow in the swapcache (yet).
b79f9e1fSJohannes Weiner				 */
cf264e13SNhat Pham				shadow = get_shadow_from_swap_cache(swp);
b79f9e1fSJohannes Weiner				if (!shadow)
b79f9e1fSJohannes Weiner					goto resched;
cf264e13SNhat Pham			}
cf264e13SNhat Pham#endif
cf264e13SNhat Pham			if (workingset_test_recent(shadow, true, &workingset))
cf264e13SNhat Pham				cs->nr_recently_evicted += nr_pages;
cf264e13SNhat Pham
cf264e13SNhat Pham			goto resched;
cf264e13SNhat Pham		}
cf264e13SNhat Pham
cf264e13SNhat Pham		/* page is in cache */
cf264e13SNhat Pham		cs->nr_cache += nr_pages;
cf264e13SNhat Pham
ba60fdf7SNhat Pham		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cf264e13SNhat Pham			cs->nr_dirty += nr_pages;
cf264e13SNhat Pham
ba60fdf7SNhat Pham		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cf264e13SNhat Pham			cs->nr_writeback += nr_pages;
cf264e13SNhat Pham
cf264e13SNhat Phamresched:
cf264e13SNhat Pham		if (need_resched()) {
cf264e13SNhat Pham			xas_pause(&xas);
cf264e13SNhat Pham			cond_resched_rcu();
cf264e13SNhat Pham		}
cf264e13SNhat Pham	}
cf264e13SNhat Pham	rcu_read_unlock();
cf264e13SNhat Pham}
cf264e13SNhat Pham
cf264e13SNhat Pham/*
*7d6405c1SLinus Torvalds * See mincore: reveal pagecache information only for files
*7d6405c1SLinus Torvalds * that the calling process has write access to, or could (if
*7d6405c1SLinus Torvalds * tried) open for writing.
*7d6405c1SLinus Torvalds */
*7d6405c1SLinus Torvaldsstatic inline bool can_do_cachestat(struct file *f)
*7d6405c1SLinus Torvalds{
*7d6405c1SLinus Torvalds	if (f->f_mode & FMODE_WRITE)
*7d6405c1SLinus Torvalds		return true;
*7d6405c1SLinus Torvalds	if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
*7d6405c1SLinus Torvalds		return true;
*7d6405c1SLinus Torvalds	return file_permission(f, MAY_WRITE) == 0;
*7d6405c1SLinus Torvalds}
*7d6405c1SLinus Torvalds
*7d6405c1SLinus Torvalds/*
cf264e13SNhat Pham * The cachestat(2) system call.
cf264e13SNhat Pham *
cf264e13SNhat Pham * cachestat() returns the page cache statistics of a file in the
cf264e13SNhat Pham * bytes range specified by `off` and `len`: number of cached pages,
cf264e13SNhat Pham * number of dirty pages, number of pages marked for writeback,
cf264e13SNhat Pham * number of evicted pages, and number of recently evicted pages.
cf264e13SNhat Pham *
cf264e13SNhat Pham * An evicted page is a page that is previously in the page cache
cf264e13SNhat Pham * but has been evicted since. A page is recently evicted if its last
cf264e13SNhat Pham * eviction was recent enough that its reentry to the cache would
cf264e13SNhat Pham * indicate that it is actively being used by the system, and that
cf264e13SNhat Pham * there is memory pressure on the system.
cf264e13SNhat Pham *
cf264e13SNhat Pham * `off` and `len` must be non-negative integers. If `len` > 0,
cf264e13SNhat Pham * the queried range is [`off`, `off` + `len`]. If `len` == 0,
cf264e13SNhat Pham * we will query in the range from `off` to the end of the file.
cf264e13SNhat Pham *
cf264e13SNhat Pham * The `flags` argument is unused for now, but is included for future
cf264e13SNhat Pham * extensibility. User should pass 0 (i.e no flag specified).
cf264e13SNhat Pham *
cf264e13SNhat Pham * Currently, hugetlbfs is not supported.
cf264e13SNhat Pham *
cf264e13SNhat Pham * Because the status of a page can change after cachestat() checks it
cf264e13SNhat Pham * but before it returns to the application, the returned values may
cf264e13SNhat Pham * contain stale information.
cf264e13SNhat Pham *
cf264e13SNhat Pham * return values:
cf264e13SNhat Pham *  zero        - success
cf264e13SNhat Pham *  -EFAULT     - cstat or cstat_range points to an illegal address
cf264e13SNhat Pham *  -EINVAL     - invalid flags
cf264e13SNhat Pham *  -EBADF      - invalid file descriptor
cf264e13SNhat Pham *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
cf264e13SNhat Pham */
cf264e13SNhat PhamSYSCALL_DEFINE4(cachestat, unsigned int, fd,
cf264e13SNhat Pham		struct cachestat_range __user *, cstat_range,
cf264e13SNhat Pham		struct cachestat __user *, cstat, unsigned int, flags)
cf264e13SNhat Pham{
cf264e13SNhat Pham	struct fd f = fdget(fd);
cf264e13SNhat Pham	struct address_space *mapping;
cf264e13SNhat Pham	struct cachestat_range csr;
cf264e13SNhat Pham	struct cachestat cs;
cf264e13SNhat Pham	pgoff_t first_index, last_index;
cf264e13SNhat Pham
cf264e13SNhat Pham	if (!f.file)
cf264e13SNhat Pham		return -EBADF;
cf264e13SNhat Pham
cf264e13SNhat Pham	if (copy_from_user(&csr, cstat_range,
cf264e13SNhat Pham			sizeof(struct cachestat_range))) {
cf264e13SNhat Pham		fdput(f);
cf264e13SNhat Pham		return -EFAULT;
cf264e13SNhat Pham	}
cf264e13SNhat Pham
cf264e13SNhat Pham	/* hugetlbfs is not supported */
cf264e13SNhat Pham	if (is_file_hugepages(f.file)) {
cf264e13SNhat Pham		fdput(f);
cf264e13SNhat Pham		return -EOPNOTSUPP;
cf264e13SNhat Pham	}
cf264e13SNhat Pham
*7d6405c1SLinus Torvalds	if (!can_do_cachestat(f.file)) {
*7d6405c1SLinus Torvalds		fdput(f);
*7d6405c1SLinus Torvalds		return -EPERM;
*7d6405c1SLinus Torvalds	}
*7d6405c1SLinus Torvalds
cf264e13SNhat Pham	if (flags != 0) {
cf264e13SNhat Pham		fdput(f);
cf264e13SNhat Pham		return -EINVAL;
cf264e13SNhat Pham	}
cf264e13SNhat Pham
cf264e13SNhat Pham	first_index = csr.off >> PAGE_SHIFT;
cf264e13SNhat Pham	last_index =
cf264e13SNhat Pham		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
cf264e13SNhat Pham	memset(&cs, 0, sizeof(struct cachestat));
cf264e13SNhat Pham	mapping = f.file->f_mapping;
cf264e13SNhat Pham	filemap_cachestat(mapping, first_index, last_index, &cs);
cf264e13SNhat Pham	fdput(f);
cf264e13SNhat Pham
cf264e13SNhat Pham	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
cf264e13SNhat Pham		return -EFAULT;
cf264e13SNhat Pham
cf264e13SNhat Pham	return 0;
cf264e13SNhat Pham}
cf264e13SNhat Pham#endif /* CONFIG_CACHESTAT_SYSCALL */