// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build BIOs of just the right size
 */
151da177e4SLinus Torvalds
161da177e4SLinus Torvalds #include <linux/kernel.h>
17630d9c47SPaul Gortmaker #include <linux/export.h>
181da177e4SLinus Torvalds #include <linux/mm.h>
191da177e4SLinus Torvalds #include <linux/kdev_t.h>
205a0e3ad6STejun Heo #include <linux/gfp.h>
211da177e4SLinus Torvalds #include <linux/bio.h>
221da177e4SLinus Torvalds #include <linux/fs.h>
231da177e4SLinus Torvalds #include <linux/buffer_head.h>
241da177e4SLinus Torvalds #include <linux/blkdev.h>
251da177e4SLinus Torvalds #include <linux/highmem.h>
261da177e4SLinus Torvalds #include <linux/prefetch.h>
271da177e4SLinus Torvalds #include <linux/mpage.h>
2802c43638SAndrew Morton #include <linux/mm_inline.h>
291da177e4SLinus Torvalds #include <linux/writeback.h>
301da177e4SLinus Torvalds #include <linux/backing-dev.h>
311da177e4SLinus Torvalds #include <linux/pagevec.h>
324db96b71SAkinobu Mita #include "internal.h"
331da177e4SLinus Torvalds
341da177e4SLinus Torvalds /*
351da177e4SLinus Torvalds * I/O completion handler for multipage BIOs.
361da177e4SLinus Torvalds *
371da177e4SLinus Torvalds * The mpage code never puts partial pages into a BIO (except for end-of-file).
381da177e4SLinus Torvalds * If a page does not map to a contiguous run of blocks then it simply falls
392c69e205SMatthew Wilcox (Oracle) * back to block_read_full_folio().
401da177e4SLinus Torvalds *
411da177e4SLinus Torvalds * Why is this? If a page's completion depends on a number of different BIOs
421da177e4SLinus Torvalds * which can complete in any order (or at the same time) then determining the
431da177e4SLinus Torvalds * status of that page is hard. See end_buffer_async_read() for the details.
441da177e4SLinus Torvalds * There is no point in duplicating all that complexity.
451da177e4SLinus Torvalds */
mpage_read_end_io(struct bio * bio)46f0d6ca46SPankaj Raghav static void mpage_read_end_io(struct bio *bio)
471da177e4SLinus Torvalds {
48*09a607c9SPankaj Raghav struct folio_iter fi;
49*09a607c9SPankaj Raghav int err = blk_status_to_errno(bio->bi_status);
501da177e4SLinus Torvalds
51*09a607c9SPankaj Raghav bio_for_each_folio_all(fi, bio) {
52*09a607c9SPankaj Raghav if (err)
53*09a607c9SPankaj Raghav folio_set_error(fi.folio);
54*09a607c9SPankaj Raghav else
55*09a607c9SPankaj Raghav folio_mark_uptodate(fi.folio);
56*09a607c9SPankaj Raghav folio_unlock(fi.folio);
57*09a607c9SPankaj Raghav }
582c30c71bSKent Overstreet
591da177e4SLinus Torvalds bio_put(bio);
601da177e4SLinus Torvalds }
611da177e4SLinus Torvalds
mpage_write_end_io(struct bio * bio)62f0d6ca46SPankaj Raghav static void mpage_write_end_io(struct bio *bio)
631da177e4SLinus Torvalds {
64*09a607c9SPankaj Raghav struct folio_iter fi;
65*09a607c9SPankaj Raghav int err = blk_status_to_errno(bio->bi_status);
66f0d6ca46SPankaj Raghav
67*09a607c9SPankaj Raghav bio_for_each_folio_all(fi, bio) {
68*09a607c9SPankaj Raghav if (err) {
69*09a607c9SPankaj Raghav folio_set_error(fi.folio);
70*09a607c9SPankaj Raghav mapping_set_error(fi.folio->mapping, err);
71*09a607c9SPankaj Raghav }
72*09a607c9SPankaj Raghav folio_end_writeback(fi.folio);
73*09a607c9SPankaj Raghav }
74f0d6ca46SPankaj Raghav
75f0d6ca46SPankaj Raghav bio_put(bio);
76f0d6ca46SPankaj Raghav }
77f0d6ca46SPankaj Raghav
mpage_bio_submit_read(struct bio * bio)78f0d6ca46SPankaj Raghav static struct bio *mpage_bio_submit_read(struct bio *bio)
79f0d6ca46SPankaj Raghav {
80f0d6ca46SPankaj Raghav bio->bi_end_io = mpage_read_end_io;
81f0d6ca46SPankaj Raghav guard_bio_eod(bio);
82f0d6ca46SPankaj Raghav submit_bio(bio);
83f0d6ca46SPankaj Raghav return NULL;
84f0d6ca46SPankaj Raghav }
85f0d6ca46SPankaj Raghav
mpage_bio_submit_write(struct bio * bio)86f0d6ca46SPankaj Raghav static struct bio *mpage_bio_submit_write(struct bio *bio)
87f0d6ca46SPankaj Raghav {
88f0d6ca46SPankaj Raghav bio->bi_end_io = mpage_write_end_io;
8983c9c547SMing Lei guard_bio_eod(bio);
904e49ea4aSMike Christie submit_bio(bio);
911da177e4SLinus Torvalds return NULL;
921da177e4SLinus Torvalds }
931da177e4SLinus Torvalds
941da177e4SLinus Torvalds /*
95d4388340SMatthew Wilcox (Oracle) * support function for mpage_readahead. The fs supplied get_block might
961da177e4SLinus Torvalds * return an up to date buffer. This is used to map that buffer into
972c69e205SMatthew Wilcox (Oracle) * the page, which allows read_folio to avoid triggering a duplicate call
981da177e4SLinus Torvalds * to get_block.
991da177e4SLinus Torvalds *
1001da177e4SLinus Torvalds * The idea is to avoid adding buffers to pages that don't already have
1011da177e4SLinus Torvalds * them. So when the buffer is up to date and the page size == block size,
1021da177e4SLinus Torvalds * this marks the page up to date instead of adding new buffers.
1031da177e4SLinus Torvalds */
map_buffer_to_folio(struct folio * folio,struct buffer_head * bh,int page_block)104211d0444SMatthew Wilcox (Oracle) static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
105211d0444SMatthew Wilcox (Oracle) int page_block)
1061da177e4SLinus Torvalds {
107211d0444SMatthew Wilcox (Oracle) struct inode *inode = folio->mapping->host;
1081da177e4SLinus Torvalds struct buffer_head *page_bh, *head;
1091da177e4SLinus Torvalds int block = 0;
1101da177e4SLinus Torvalds
111211d0444SMatthew Wilcox (Oracle) head = folio_buffers(folio);
112211d0444SMatthew Wilcox (Oracle) if (!head) {
1131da177e4SLinus Torvalds /*
1141da177e4SLinus Torvalds * don't make any buffers if there is only one buffer on
115211d0444SMatthew Wilcox (Oracle) * the folio and the folio just needs to be set up to date
1161da177e4SLinus Torvalds */
11709cbfeafSKirill A. Shutemov if (inode->i_blkbits == PAGE_SHIFT &&
1181da177e4SLinus Torvalds buffer_uptodate(bh)) {
119211d0444SMatthew Wilcox (Oracle) folio_mark_uptodate(folio);
1201da177e4SLinus Torvalds return;
1211da177e4SLinus Torvalds }
122211d0444SMatthew Wilcox (Oracle) create_empty_buffers(&folio->page, i_blocksize(inode), 0);
123211d0444SMatthew Wilcox (Oracle) head = folio_buffers(folio);
1241da177e4SLinus Torvalds }
125211d0444SMatthew Wilcox (Oracle)
1261da177e4SLinus Torvalds page_bh = head;
1271da177e4SLinus Torvalds do {
1281da177e4SLinus Torvalds if (block == page_block) {
1291da177e4SLinus Torvalds page_bh->b_state = bh->b_state;
1301da177e4SLinus Torvalds page_bh->b_bdev = bh->b_bdev;
1311da177e4SLinus Torvalds page_bh->b_blocknr = bh->b_blocknr;
1321da177e4SLinus Torvalds break;
1331da177e4SLinus Torvalds }
1341da177e4SLinus Torvalds page_bh = page_bh->b_this_page;
1351da177e4SLinus Torvalds block++;
1361da177e4SLinus Torvalds } while (page_bh != head);
1371da177e4SLinus Torvalds }
1381da177e4SLinus Torvalds
139357c1206SJens Axboe struct mpage_readpage_args {
140357c1206SJens Axboe struct bio *bio;
141211d0444SMatthew Wilcox (Oracle) struct folio *folio;
142357c1206SJens Axboe unsigned int nr_pages;
14374c8164eSJens Axboe bool is_readahead;
144357c1206SJens Axboe sector_t last_block_in_bio;
145357c1206SJens Axboe struct buffer_head map_bh;
146357c1206SJens Axboe unsigned long first_logical_block;
147357c1206SJens Axboe get_block_t *get_block;
148357c1206SJens Axboe };
149357c1206SJens Axboe
150fa30bd05SBadari Pulavarty /*
151fa30bd05SBadari Pulavarty * This is the worker routine which does all the work of mapping the disk
152fa30bd05SBadari Pulavarty * blocks and constructs largest possible bios, submits them for IO if the
153fa30bd05SBadari Pulavarty * blocks are not contiguous on the disk.
154fa30bd05SBadari Pulavarty *
155fa30bd05SBadari Pulavarty * We pass a buffer_head back and forth and use its buffer_mapped() flag to
156fa30bd05SBadari Pulavarty * represent the validity of its disk mapping and to decide when to do the next
157fa30bd05SBadari Pulavarty * get_block() call.
158fa30bd05SBadari Pulavarty */
do_mpage_readpage(struct mpage_readpage_args * args)159357c1206SJens Axboe static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
1601da177e4SLinus Torvalds {
161211d0444SMatthew Wilcox (Oracle) struct folio *folio = args->folio;
162211d0444SMatthew Wilcox (Oracle) struct inode *inode = folio->mapping->host;
1631da177e4SLinus Torvalds const unsigned blkbits = inode->i_blkbits;
16409cbfeafSKirill A. Shutemov const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
1651da177e4SLinus Torvalds const unsigned blocksize = 1 << blkbits;
166357c1206SJens Axboe struct buffer_head *map_bh = &args->map_bh;
1671da177e4SLinus Torvalds sector_t block_in_file;
1681da177e4SLinus Torvalds sector_t last_block;
169fa30bd05SBadari Pulavarty sector_t last_block_in_file;
1701da177e4SLinus Torvalds sector_t blocks[MAX_BUF_PER_PAGE];
1711da177e4SLinus Torvalds unsigned page_block;
1721da177e4SLinus Torvalds unsigned first_hole = blocks_per_page;
1731da177e4SLinus Torvalds struct block_device *bdev = NULL;
1741da177e4SLinus Torvalds int length;
1751da177e4SLinus Torvalds int fully_mapped = 1;
176f84c94afSBart Van Assche blk_opf_t opf = REQ_OP_READ;
177fa30bd05SBadari Pulavarty unsigned nblocks;
178fa30bd05SBadari Pulavarty unsigned relative_block;
179211d0444SMatthew Wilcox (Oracle) gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
180211d0444SMatthew Wilcox (Oracle)
181211d0444SMatthew Wilcox (Oracle) /* MAX_BUF_PER_PAGE, for example */
182211d0444SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
18374c8164eSJens Axboe
18474c8164eSJens Axboe if (args->is_readahead) {
185f84c94afSBart Van Assche opf |= REQ_RAHEAD;
18661285ff7SChristoph Hellwig gfp |= __GFP_NORETRY | __GFP_NOWARN;
18774c8164eSJens Axboe }
1881da177e4SLinus Torvalds
189211d0444SMatthew Wilcox (Oracle) if (folio_buffers(folio))
1901da177e4SLinus Torvalds goto confused;
1911da177e4SLinus Torvalds
192211d0444SMatthew Wilcox (Oracle) block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
193357c1206SJens Axboe last_block = block_in_file + args->nr_pages * blocks_per_page;
194fa30bd05SBadari Pulavarty last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
195fa30bd05SBadari Pulavarty if (last_block > last_block_in_file)
196fa30bd05SBadari Pulavarty last_block = last_block_in_file;
197fa30bd05SBadari Pulavarty page_block = 0;
1981da177e4SLinus Torvalds
199fa30bd05SBadari Pulavarty /*
200fa30bd05SBadari Pulavarty * Map blocks using the result from the previous get_blocks call first.
201fa30bd05SBadari Pulavarty */
202fa30bd05SBadari Pulavarty nblocks = map_bh->b_size >> blkbits;
203357c1206SJens Axboe if (buffer_mapped(map_bh) &&
204357c1206SJens Axboe block_in_file > args->first_logical_block &&
205357c1206SJens Axboe block_in_file < (args->first_logical_block + nblocks)) {
206357c1206SJens Axboe unsigned map_offset = block_in_file - args->first_logical_block;
207fa30bd05SBadari Pulavarty unsigned last = nblocks - map_offset;
208fa30bd05SBadari Pulavarty
209fa30bd05SBadari Pulavarty for (relative_block = 0; ; relative_block++) {
210fa30bd05SBadari Pulavarty if (relative_block == last) {
211fa30bd05SBadari Pulavarty clear_buffer_mapped(map_bh);
212fa30bd05SBadari Pulavarty break;
213fa30bd05SBadari Pulavarty }
214fa30bd05SBadari Pulavarty if (page_block == blocks_per_page)
215fa30bd05SBadari Pulavarty break;
216fa30bd05SBadari Pulavarty blocks[page_block] = map_bh->b_blocknr + map_offset +
217fa30bd05SBadari Pulavarty relative_block;
218fa30bd05SBadari Pulavarty page_block++;
219fa30bd05SBadari Pulavarty block_in_file++;
220fa30bd05SBadari Pulavarty }
221fa30bd05SBadari Pulavarty bdev = map_bh->b_bdev;
2221da177e4SLinus Torvalds }
2231da177e4SLinus Torvalds
224fa30bd05SBadari Pulavarty /*
225211d0444SMatthew Wilcox (Oracle) * Then do more get_blocks calls until we are done with this folio.
226fa30bd05SBadari Pulavarty */
227a5fd8390SMatthew Wilcox (Oracle) map_bh->b_folio = folio;
228fa30bd05SBadari Pulavarty while (page_block < blocks_per_page) {
229fa30bd05SBadari Pulavarty map_bh->b_state = 0;
230fa30bd05SBadari Pulavarty map_bh->b_size = 0;
231fa30bd05SBadari Pulavarty
232fa30bd05SBadari Pulavarty if (block_in_file < last_block) {
233fa30bd05SBadari Pulavarty map_bh->b_size = (last_block-block_in_file) << blkbits;
234357c1206SJens Axboe if (args->get_block(inode, block_in_file, map_bh, 0))
235fa30bd05SBadari Pulavarty goto confused;
236357c1206SJens Axboe args->first_logical_block = block_in_file;
237fa30bd05SBadari Pulavarty }
238fa30bd05SBadari Pulavarty
239fa30bd05SBadari Pulavarty if (!buffer_mapped(map_bh)) {
2401da177e4SLinus Torvalds fully_mapped = 0;
2411da177e4SLinus Torvalds if (first_hole == blocks_per_page)
2421da177e4SLinus Torvalds first_hole = page_block;
243fa30bd05SBadari Pulavarty page_block++;
244fa30bd05SBadari Pulavarty block_in_file++;
2451da177e4SLinus Torvalds continue;
2461da177e4SLinus Torvalds }
2471da177e4SLinus Torvalds
2481da177e4SLinus Torvalds /* some filesystems will copy data into the page during
2491da177e4SLinus Torvalds * the get_block call, in which case we don't want to
250211d0444SMatthew Wilcox (Oracle) * read it again. map_buffer_to_folio copies the data
251211d0444SMatthew Wilcox (Oracle) * we just collected from get_block into the folio's buffers
252211d0444SMatthew Wilcox (Oracle) * so read_folio doesn't have to repeat the get_block call
2531da177e4SLinus Torvalds */
254fa30bd05SBadari Pulavarty if (buffer_uptodate(map_bh)) {
255211d0444SMatthew Wilcox (Oracle) map_buffer_to_folio(folio, map_bh, page_block);
2561da177e4SLinus Torvalds goto confused;
2571da177e4SLinus Torvalds }
2581da177e4SLinus Torvalds
2591da177e4SLinus Torvalds if (first_hole != blocks_per_page)
2601da177e4SLinus Torvalds goto confused; /* hole -> non-hole */
2611da177e4SLinus Torvalds
2621da177e4SLinus Torvalds /* Contiguous blocks? */
263fa30bd05SBadari Pulavarty if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
2641da177e4SLinus Torvalds goto confused;
265fa30bd05SBadari Pulavarty nblocks = map_bh->b_size >> blkbits;
266fa30bd05SBadari Pulavarty for (relative_block = 0; ; relative_block++) {
267fa30bd05SBadari Pulavarty if (relative_block == nblocks) {
268fa30bd05SBadari Pulavarty clear_buffer_mapped(map_bh);
269fa30bd05SBadari Pulavarty break;
270fa30bd05SBadari Pulavarty } else if (page_block == blocks_per_page)
271fa30bd05SBadari Pulavarty break;
272fa30bd05SBadari Pulavarty blocks[page_block] = map_bh->b_blocknr+relative_block;
273fa30bd05SBadari Pulavarty page_block++;
274fa30bd05SBadari Pulavarty block_in_file++;
275fa30bd05SBadari Pulavarty }
276fa30bd05SBadari Pulavarty bdev = map_bh->b_bdev;
2771da177e4SLinus Torvalds }
2781da177e4SLinus Torvalds
2791da177e4SLinus Torvalds if (first_hole != blocks_per_page) {
280211d0444SMatthew Wilcox (Oracle) folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
2811da177e4SLinus Torvalds if (first_hole == 0) {
282211d0444SMatthew Wilcox (Oracle) folio_mark_uptodate(folio);
283211d0444SMatthew Wilcox (Oracle) folio_unlock(folio);
2841da177e4SLinus Torvalds goto out;
2851da177e4SLinus Torvalds }
2861da177e4SLinus Torvalds } else if (fully_mapped) {
287211d0444SMatthew Wilcox (Oracle) folio_set_mappedtodisk(folio);
2881da177e4SLinus Torvalds }
2891da177e4SLinus Torvalds
2901da177e4SLinus Torvalds /*
291211d0444SMatthew Wilcox (Oracle) * This folio will go to BIO. Do we need to send this BIO off first?
2921da177e4SLinus Torvalds */
293357c1206SJens Axboe if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
294f0d6ca46SPankaj Raghav args->bio = mpage_bio_submit_read(args->bio);
2951da177e4SLinus Torvalds
2961da177e4SLinus Torvalds alloc_new:
297357c1206SJens Axboe if (args->bio == NULL) {
298f84c94afSBart Van Assche args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
29907888c66SChristoph Hellwig gfp);
300357c1206SJens Axboe if (args->bio == NULL)
3011da177e4SLinus Torvalds goto confused;
302d5f68a42SChristoph Hellwig args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
3031da177e4SLinus Torvalds }
3041da177e4SLinus Torvalds
3051da177e4SLinus Torvalds length = first_hole << blkbits;
306211d0444SMatthew Wilcox (Oracle) if (!bio_add_folio(args->bio, folio, length, 0)) {
307f0d6ca46SPankaj Raghav args->bio = mpage_bio_submit_read(args->bio);
3081da177e4SLinus Torvalds goto alloc_new;
3091da177e4SLinus Torvalds }
3101da177e4SLinus Torvalds
311357c1206SJens Axboe relative_block = block_in_file - args->first_logical_block;
31238c8e618SMiquel van Smoorenburg nblocks = map_bh->b_size >> blkbits;
31338c8e618SMiquel van Smoorenburg if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
31438c8e618SMiquel van Smoorenburg (first_hole != blocks_per_page))
315f0d6ca46SPankaj Raghav args->bio = mpage_bio_submit_read(args->bio);
3161da177e4SLinus Torvalds else
317357c1206SJens Axboe args->last_block_in_bio = blocks[blocks_per_page - 1];
3181da177e4SLinus Torvalds out:
319357c1206SJens Axboe return args->bio;
3201da177e4SLinus Torvalds
3211da177e4SLinus Torvalds confused:
322357c1206SJens Axboe if (args->bio)
323f0d6ca46SPankaj Raghav args->bio = mpage_bio_submit_read(args->bio);
324211d0444SMatthew Wilcox (Oracle) if (!folio_test_uptodate(folio))
325211d0444SMatthew Wilcox (Oracle) block_read_full_folio(folio, args->get_block);
3261da177e4SLinus Torvalds else
327211d0444SMatthew Wilcox (Oracle) folio_unlock(folio);
3281da177e4SLinus Torvalds goto out;
3291da177e4SLinus Torvalds }
3301da177e4SLinus Torvalds
33167be2dd1SMartin Waitz /**
332d4388340SMatthew Wilcox (Oracle) * mpage_readahead - start reads against pages
333d4388340SMatthew Wilcox (Oracle) * @rac: Describes which pages to read.
33467be2dd1SMartin Waitz * @get_block: The filesystem's block mapper function.
33567be2dd1SMartin Waitz *
33667be2dd1SMartin Waitz * This function walks the pages and the blocks within each page, building and
33767be2dd1SMartin Waitz * emitting large BIOs.
33867be2dd1SMartin Waitz *
33967be2dd1SMartin Waitz * If anything unusual happens, such as:
34067be2dd1SMartin Waitz *
34167be2dd1SMartin Waitz * - encountering a page which has buffers
34267be2dd1SMartin Waitz * - encountering a page which has a non-hole after a hole
34367be2dd1SMartin Waitz * - encountering a page with non-contiguous blocks
34467be2dd1SMartin Waitz *
34567be2dd1SMartin Waitz * then this code just gives up and calls the buffer_head-based read function.
34667be2dd1SMartin Waitz * It does handle a page which has holes at the end - that is a common case:
347ea1754a0SKirill A. Shutemov * the end-of-file on blocksize < PAGE_SIZE setups.
34867be2dd1SMartin Waitz *
34967be2dd1SMartin Waitz * BH_Boundary explanation:
35067be2dd1SMartin Waitz *
35167be2dd1SMartin Waitz * There is a problem. The mpage read code assembles several pages, gets all
35267be2dd1SMartin Waitz * their disk mappings, and then submits them all. That's fine, but obtaining
35367be2dd1SMartin Waitz * the disk mappings may require I/O. Reads of indirect blocks, for example.
35467be2dd1SMartin Waitz *
35567be2dd1SMartin Waitz * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
35667be2dd1SMartin Waitz * submitted in the following order:
3570117d427SMauro Carvalho Chehab *
35867be2dd1SMartin Waitz * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
35978a4a50aSRandy Dunlap *
36067be2dd1SMartin Waitz * because the indirect block has to be read to get the mappings of blocks
36167be2dd1SMartin Waitz * 13,14,15,16. Obviously, this impacts performance.
36267be2dd1SMartin Waitz *
36367be2dd1SMartin Waitz * So what we do it to allow the filesystem's get_block() function to set
36467be2dd1SMartin Waitz * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
36567be2dd1SMartin Waitz * after this one will require I/O against a block which is probably close to
36667be2dd1SMartin Waitz * this one. So you should push what I/O you have currently accumulated.
36767be2dd1SMartin Waitz *
36867be2dd1SMartin Waitz * This all causes the disk requests to be issued in the correct order.
36967be2dd1SMartin Waitz */
mpage_readahead(struct readahead_control * rac,get_block_t get_block)370d4388340SMatthew Wilcox (Oracle) void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
3711da177e4SLinus Torvalds {
372211d0444SMatthew Wilcox (Oracle) struct folio *folio;
373357c1206SJens Axboe struct mpage_readpage_args args = {
374357c1206SJens Axboe .get_block = get_block,
37574c8164eSJens Axboe .is_readahead = true,
376357c1206SJens Axboe };
3771da177e4SLinus Torvalds
378211d0444SMatthew Wilcox (Oracle) while ((folio = readahead_folio(rac))) {
379211d0444SMatthew Wilcox (Oracle) prefetchw(&folio->flags);
380211d0444SMatthew Wilcox (Oracle) args.folio = folio;
381d4388340SMatthew Wilcox (Oracle) args.nr_pages = readahead_count(rac);
382357c1206SJens Axboe args.bio = do_mpage_readpage(&args);
3831da177e4SLinus Torvalds }
384357c1206SJens Axboe if (args.bio)
385f0d6ca46SPankaj Raghav mpage_bio_submit_read(args.bio);
3861da177e4SLinus Torvalds }
387d4388340SMatthew Wilcox (Oracle) EXPORT_SYMBOL(mpage_readahead);
3881da177e4SLinus Torvalds
3891da177e4SLinus Torvalds /*
3901da177e4SLinus Torvalds * This isn't called much at all
3911da177e4SLinus Torvalds */
mpage_read_folio(struct folio * folio,get_block_t get_block)392f132ab7dSMatthew Wilcox (Oracle) int mpage_read_folio(struct folio *folio, get_block_t get_block)
3931da177e4SLinus Torvalds {
394357c1206SJens Axboe struct mpage_readpage_args args = {
395211d0444SMatthew Wilcox (Oracle) .folio = folio,
396357c1206SJens Axboe .nr_pages = 1,
397357c1206SJens Axboe .get_block = get_block,
398357c1206SJens Axboe };
3991da177e4SLinus Torvalds
400357c1206SJens Axboe args.bio = do_mpage_readpage(&args);
401357c1206SJens Axboe if (args.bio)
402f0d6ca46SPankaj Raghav mpage_bio_submit_read(args.bio);
4031da177e4SLinus Torvalds return 0;
4041da177e4SLinus Torvalds }
405f132ab7dSMatthew Wilcox (Oracle) EXPORT_SYMBOL(mpage_read_folio);
4061da177e4SLinus Torvalds
/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to the mapping's writepage().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (16-page) BIOs.
 */
4230ea97180SMiklos Szeredi
424ced117c7SDmitri Vorobiev struct mpage_data {
425ced117c7SDmitri Vorobiev struct bio *bio;
426ced117c7SDmitri Vorobiev sector_t last_block_in_bio;
427ced117c7SDmitri Vorobiev get_block_t *get_block;
428ced117c7SDmitri Vorobiev };
429ced117c7SDmitri Vorobiev
43090768eeeSMatthew Wilcox /*
43190768eeeSMatthew Wilcox * We have our BIO, so we can now mark the buffers clean. Make
43290768eeeSMatthew Wilcox * sure to only clean buffers which we know we'll be writing.
43390768eeeSMatthew Wilcox */
clean_buffers(struct page * page,unsigned first_unmapped)43490768eeeSMatthew Wilcox static void clean_buffers(struct page *page, unsigned first_unmapped)
43590768eeeSMatthew Wilcox {
43690768eeeSMatthew Wilcox unsigned buffer_counter = 0;
43790768eeeSMatthew Wilcox struct buffer_head *bh, *head;
43890768eeeSMatthew Wilcox if (!page_has_buffers(page))
43990768eeeSMatthew Wilcox return;
44090768eeeSMatthew Wilcox head = page_buffers(page);
44190768eeeSMatthew Wilcox bh = head;
44290768eeeSMatthew Wilcox
44390768eeeSMatthew Wilcox do {
44490768eeeSMatthew Wilcox if (buffer_counter++ == first_unmapped)
44590768eeeSMatthew Wilcox break;
44690768eeeSMatthew Wilcox clear_buffer_dirty(bh);
44790768eeeSMatthew Wilcox bh = bh->b_this_page;
44890768eeeSMatthew Wilcox } while (bh != head);
44990768eeeSMatthew Wilcox
45090768eeeSMatthew Wilcox /*
45190768eeeSMatthew Wilcox * we cannot drop the bh if the page is not uptodate or a concurrent
4522c69e205SMatthew Wilcox (Oracle) * read_folio would fail to serialize with the bh and it would read from
45390768eeeSMatthew Wilcox * disk before we reach the platter.
45490768eeeSMatthew Wilcox */
45590768eeeSMatthew Wilcox if (buffer_heads_over_limit && PageUptodate(page))
45668189fefSMatthew Wilcox (Oracle) try_to_free_buffers(page_folio(page));
45790768eeeSMatthew Wilcox }
45890768eeeSMatthew Wilcox
/*
 * Clean every buffer attached to a page.
 *
 * There is no need to count the attached buffers first: passing a limit
 * larger than any possible number of buffers makes clean_buffers() walk the
 * whole list.
 */
void clean_page_buffers(struct page *page)
{
	clean_buffers(page, ~0U);
}
468f892760aSMatthew Wilcox
__mpage_writepage(struct folio * folio,struct writeback_control * wbc,void * data)469d585bdbeSMatthew Wilcox (Oracle) static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
4700ea97180SMiklos Szeredi void *data)
4711da177e4SLinus Torvalds {
4720ea97180SMiklos Szeredi struct mpage_data *mpd = data;
4730ea97180SMiklos Szeredi struct bio *bio = mpd->bio;
4749160cffdSMatthew Wilcox (Oracle) struct address_space *mapping = folio->mapping;
4759160cffdSMatthew Wilcox (Oracle) struct inode *inode = mapping->host;
4761da177e4SLinus Torvalds const unsigned blkbits = inode->i_blkbits;
47709cbfeafSKirill A. Shutemov const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
4781da177e4SLinus Torvalds sector_t last_block;
4791da177e4SLinus Torvalds sector_t block_in_file;
4801da177e4SLinus Torvalds sector_t blocks[MAX_BUF_PER_PAGE];
4811da177e4SLinus Torvalds unsigned page_block;
4821da177e4SLinus Torvalds unsigned first_unmapped = blocks_per_page;
4831da177e4SLinus Torvalds struct block_device *bdev = NULL;
4841da177e4SLinus Torvalds int boundary = 0;
4851da177e4SLinus Torvalds sector_t boundary_block = 0;
4861da177e4SLinus Torvalds struct block_device *boundary_bdev = NULL;
4879160cffdSMatthew Wilcox (Oracle) size_t length;
4881da177e4SLinus Torvalds struct buffer_head map_bh;
4891da177e4SLinus Torvalds loff_t i_size = i_size_read(inode);
4900ea97180SMiklos Szeredi int ret = 0;
4919160cffdSMatthew Wilcox (Oracle) struct buffer_head *head = folio_buffers(folio);
4921da177e4SLinus Torvalds
4939160cffdSMatthew Wilcox (Oracle) if (head) {
4941da177e4SLinus Torvalds struct buffer_head *bh = head;
4951da177e4SLinus Torvalds
4961da177e4SLinus Torvalds /* If they're all mapped and dirty, do it */
4971da177e4SLinus Torvalds page_block = 0;
4981da177e4SLinus Torvalds do {
4991da177e4SLinus Torvalds BUG_ON(buffer_locked(bh));
5001da177e4SLinus Torvalds if (!buffer_mapped(bh)) {
5011da177e4SLinus Torvalds /*
5021da177e4SLinus Torvalds * unmapped dirty buffers are created by
503e621900aSMatthew Wilcox (Oracle) * block_dirty_folio -> mmapped data
5041da177e4SLinus Torvalds */
5051da177e4SLinus Torvalds if (buffer_dirty(bh))
5061da177e4SLinus Torvalds goto confused;
5071da177e4SLinus Torvalds if (first_unmapped == blocks_per_page)
5081da177e4SLinus Torvalds first_unmapped = page_block;
5091da177e4SLinus Torvalds continue;
5101da177e4SLinus Torvalds }
5111da177e4SLinus Torvalds
5121da177e4SLinus Torvalds if (first_unmapped != blocks_per_page)
5131da177e4SLinus Torvalds goto confused; /* hole -> non-hole */
5141da177e4SLinus Torvalds
5151da177e4SLinus Torvalds if (!buffer_dirty(bh) || !buffer_uptodate(bh))
5161da177e4SLinus Torvalds goto confused;
5171da177e4SLinus Torvalds if (page_block) {
5181da177e4SLinus Torvalds if (bh->b_blocknr != blocks[page_block-1] + 1)
5191da177e4SLinus Torvalds goto confused;
5201da177e4SLinus Torvalds }
5211da177e4SLinus Torvalds blocks[page_block++] = bh->b_blocknr;
5221da177e4SLinus Torvalds boundary = buffer_boundary(bh);
5231da177e4SLinus Torvalds if (boundary) {
5241da177e4SLinus Torvalds boundary_block = bh->b_blocknr;
5251da177e4SLinus Torvalds boundary_bdev = bh->b_bdev;
5261da177e4SLinus Torvalds }
5271da177e4SLinus Torvalds bdev = bh->b_bdev;
5281da177e4SLinus Torvalds } while ((bh = bh->b_this_page) != head);
5291da177e4SLinus Torvalds
5301da177e4SLinus Torvalds if (first_unmapped)
5311da177e4SLinus Torvalds goto page_is_mapped;
5321da177e4SLinus Torvalds
5331da177e4SLinus Torvalds /*
5341da177e4SLinus Torvalds * Page has buffers, but they are all unmapped. The page was
5351da177e4SLinus Torvalds * created by pagein or read over a hole which was handled by
5362c69e205SMatthew Wilcox (Oracle) * block_read_full_folio(). If this address_space is also
537d4388340SMatthew Wilcox (Oracle) * using mpage_readahead then this can rarely happen.
5381da177e4SLinus Torvalds */
5391da177e4SLinus Torvalds goto confused;
5401da177e4SLinus Torvalds }
5411da177e4SLinus Torvalds
5421da177e4SLinus Torvalds /*
5431da177e4SLinus Torvalds * The page has no buffers: map it to disk
5441da177e4SLinus Torvalds */
5459160cffdSMatthew Wilcox (Oracle) BUG_ON(!folio_test_uptodate(folio));
5469160cffdSMatthew Wilcox (Oracle) block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
5474b89a37dSJan Kara /*
5484b89a37dSJan Kara * Whole page beyond EOF? Skip allocating blocks to avoid leaking
5494b89a37dSJan Kara * space.
5504b89a37dSJan Kara */
5514b89a37dSJan Kara if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
5524b89a37dSJan Kara goto page_is_mapped;
5531da177e4SLinus Torvalds last_block = (i_size - 1) >> blkbits;
5549160cffdSMatthew Wilcox (Oracle) map_bh.b_folio = folio;
5551da177e4SLinus Torvalds for (page_block = 0; page_block < blocks_per_page; ) {
5561da177e4SLinus Torvalds
5571da177e4SLinus Torvalds map_bh.b_state = 0;
558b0cf2321SBadari Pulavarty map_bh.b_size = 1 << blkbits;
5590ea97180SMiklos Szeredi if (mpd->get_block(inode, block_in_file, &map_bh, 1))
5601da177e4SLinus Torvalds goto confused;
5617010839cSJan Kara if (!buffer_mapped(&map_bh))
5627010839cSJan Kara goto confused;
5631da177e4SLinus Torvalds if (buffer_new(&map_bh))
564e64855c6SJan Kara clean_bdev_bh_alias(&map_bh);
5651da177e4SLinus Torvalds if (buffer_boundary(&map_bh)) {
5661da177e4SLinus Torvalds boundary_block = map_bh.b_blocknr;
5671da177e4SLinus Torvalds boundary_bdev = map_bh.b_bdev;
5681da177e4SLinus Torvalds }
5691da177e4SLinus Torvalds if (page_block) {
5701da177e4SLinus Torvalds if (map_bh.b_blocknr != blocks[page_block-1] + 1)
5711da177e4SLinus Torvalds goto confused;
5721da177e4SLinus Torvalds }
5731da177e4SLinus Torvalds blocks[page_block++] = map_bh.b_blocknr;
5741da177e4SLinus Torvalds boundary = buffer_boundary(&map_bh);
5751da177e4SLinus Torvalds bdev = map_bh.b_bdev;
5761da177e4SLinus Torvalds if (block_in_file == last_block)
5771da177e4SLinus Torvalds break;
5781da177e4SLinus Torvalds block_in_file++;
5791da177e4SLinus Torvalds }
5801da177e4SLinus Torvalds BUG_ON(page_block == 0);
5811da177e4SLinus Torvalds
5821da177e4SLinus Torvalds first_unmapped = page_block;
5831da177e4SLinus Torvalds
5841da177e4SLinus Torvalds page_is_mapped:
5859160cffdSMatthew Wilcox (Oracle) /* Don't bother writing beyond EOF, truncate will discard the folio */
5869160cffdSMatthew Wilcox (Oracle) if (folio_pos(folio) >= i_size)
5879160cffdSMatthew Wilcox (Oracle) goto confused;
5889160cffdSMatthew Wilcox (Oracle) length = folio_size(folio);
5899160cffdSMatthew Wilcox (Oracle) if (folio_pos(folio) + length > i_size) {
5901da177e4SLinus Torvalds /*
5911da177e4SLinus Torvalds * The page straddles i_size. It must be zeroed out on each
5922a61aa40SAdam Buchbinder * and every writepage invocation because it may be mmapped.
5931da177e4SLinus Torvalds * "A file is mapped in multiples of the page size. For a file
5941da177e4SLinus Torvalds * that is not a multiple of the page size, the remaining memory
5951da177e4SLinus Torvalds * is zeroed when mapped, and writes to that region are not
5961da177e4SLinus Torvalds * written out to the file."
5971da177e4SLinus Torvalds */
5989160cffdSMatthew Wilcox (Oracle) length = i_size - folio_pos(folio);
5999160cffdSMatthew Wilcox (Oracle) folio_zero_segment(folio, length, folio_size(folio));
6001da177e4SLinus Torvalds }
6011da177e4SLinus Torvalds
6021da177e4SLinus Torvalds /*
6031da177e4SLinus Torvalds * This page will go to BIO. Do we need to send this BIO off first?
6041da177e4SLinus Torvalds */
6050ea97180SMiklos Szeredi if (bio && mpd->last_block_in_bio != blocks[0] - 1)
606f0d6ca46SPankaj Raghav bio = mpage_bio_submit_write(bio);
6071da177e4SLinus Torvalds
6081da177e4SLinus Torvalds alloc_new:
6091da177e4SLinus Torvalds if (bio == NULL) {
61077c436deSChristoph Hellwig bio = bio_alloc(bdev, BIO_MAX_VECS,
61177c436deSChristoph Hellwig REQ_OP_WRITE | wbc_to_write_flags(wbc),
61277c436deSChristoph Hellwig GFP_NOFS);
613d5f68a42SChristoph Hellwig bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
614b16b1debSTejun Heo wbc_init_bio(wbc, bio);
6151da177e4SLinus Torvalds }
6161da177e4SLinus Torvalds
6171da177e4SLinus Torvalds /*
6181da177e4SLinus Torvalds * Must try to add the page before marking the buffer clean or
6191da177e4SLinus Torvalds * the confused fail path above (OOM) will be very confused when
6201da177e4SLinus Torvalds * it finds all bh marked clean (i.e. it will not write anything)
6211da177e4SLinus Torvalds */
6229160cffdSMatthew Wilcox (Oracle) wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
6231da177e4SLinus Torvalds length = first_unmapped << blkbits;
6249160cffdSMatthew Wilcox (Oracle) if (!bio_add_folio(bio, folio, length, 0)) {
625f0d6ca46SPankaj Raghav bio = mpage_bio_submit_write(bio);
6261da177e4SLinus Torvalds goto alloc_new;
6271da177e4SLinus Torvalds }
6281da177e4SLinus Torvalds
6299160cffdSMatthew Wilcox (Oracle) clean_buffers(&folio->page, first_unmapped);
6301da177e4SLinus Torvalds
6319160cffdSMatthew Wilcox (Oracle) BUG_ON(folio_test_writeback(folio));
6329160cffdSMatthew Wilcox (Oracle) folio_start_writeback(folio);
6339160cffdSMatthew Wilcox (Oracle) folio_unlock(folio);
6341da177e4SLinus Torvalds if (boundary || (first_unmapped != blocks_per_page)) {
635f0d6ca46SPankaj Raghav bio = mpage_bio_submit_write(bio);
6361da177e4SLinus Torvalds if (boundary_block) {
6371da177e4SLinus Torvalds write_boundary_block(boundary_bdev,
6381da177e4SLinus Torvalds boundary_block, 1 << blkbits);
6391da177e4SLinus Torvalds }
6401da177e4SLinus Torvalds } else {
6410ea97180SMiklos Szeredi mpd->last_block_in_bio = blocks[blocks_per_page - 1];
6421da177e4SLinus Torvalds }
6431da177e4SLinus Torvalds goto out;
6441da177e4SLinus Torvalds
6451da177e4SLinus Torvalds confused:
6461da177e4SLinus Torvalds if (bio)
647f0d6ca46SPankaj Raghav bio = mpage_bio_submit_write(bio);
6481da177e4SLinus Torvalds
6491da177e4SLinus Torvalds /*
6501da177e4SLinus Torvalds * The caller has a ref on the inode, so *mapping is stable
6511da177e4SLinus Torvalds */
6529160cffdSMatthew Wilcox (Oracle) ret = block_write_full_page(&folio->page, mpd->get_block, wbc);
6530ea97180SMiklos Szeredi mapping_set_error(mapping, ret);
6541da177e4SLinus Torvalds out:
6550ea97180SMiklos Szeredi mpd->bio = bio;
6560ea97180SMiklos Szeredi return ret;
6571da177e4SLinus Torvalds }
6581da177e4SLinus Torvalds
6591da177e4SLinus Torvalds /**
66078a4a50aSRandy Dunlap * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
6611da177e4SLinus Torvalds * @mapping: address space structure to write
6621da177e4SLinus Torvalds * @wbc: subtract the number of written pages from *@wbc->nr_to_write
6631da177e4SLinus Torvalds * @get_block: the filesystem's block mapper function.
6641da177e4SLinus Torvalds *
6651da177e4SLinus Torvalds * This is a library function, which implements the writepages()
6661da177e4SLinus Torvalds * address_space_operation.
6671da177e4SLinus Torvalds */
6681da177e4SLinus Torvalds int
mpage_writepages(struct address_space * mapping,struct writeback_control * wbc,get_block_t get_block)6691da177e4SLinus Torvalds mpage_writepages(struct address_space *mapping,
6701da177e4SLinus Torvalds struct writeback_control *wbc, get_block_t get_block)
6711da177e4SLinus Torvalds {
672cf5e7a65SChristoph Hellwig struct mpage_data mpd = {
673cf5e7a65SChristoph Hellwig .get_block = get_block,
674cf5e7a65SChristoph Hellwig };
6752ed1a6bcSJens Axboe struct blk_plug plug;
6760ea97180SMiklos Szeredi int ret;
6771da177e4SLinus Torvalds
6782ed1a6bcSJens Axboe blk_start_plug(&plug);
6790ea97180SMiklos Szeredi ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
68077c436deSChristoph Hellwig if (mpd.bio)
681f0d6ca46SPankaj Raghav mpage_bio_submit_write(mpd.bio);
6822ed1a6bcSJens Axboe blk_finish_plug(&plug);
6831da177e4SLinus Torvalds return ret;
6841da177e4SLinus Torvalds }
6851da177e4SLinus Torvalds EXPORT_SYMBOL(mpage_writepages);
686