// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_read_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err)
			folio_set_error(fi.folio);
		else
			folio_mark_uptodate(fi.folio);
		folio_unlock(fi.folio);
	}

	bio_put(bio);
}

static void mpage_write_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err) {
			folio_set_error(fi.folio);
			mapping_set_error(fi.folio->mapping, err);
		}
		folio_end_writeback(fi.folio);
	}

	bio_put(bio);
}

static struct bio *mpage_bio_submit_read(struct bio *bio)
{
	bio->bi_end_io = mpage_read_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

static struct bio *mpage_bio_submit_write(struct bio *bio)
{
	bio->bi_end_io = mpage_write_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

/*
 * Support function for mpage_readahead.  The fs-supplied get_block might
 * return an up-to-date buffer.  This is used to map that buffer into
 * the page, which allows read_folio to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
		int page_block)
{
	struct inode *inode = folio->mapping->host;
	struct buffer_head *page_bh, *head;
	int block = 0;

	head = folio_buffers(folio);
	if (!head) {
		/*
		 * don't make any buffers if there is only one buffer on
		 * the folio and the folio just needs to be set up to date
		 */
		if (inode->i_blkbits == PAGE_SHIFT &&
		    buffer_uptodate(bh)) {
			folio_mark_uptodate(folio);
			return;
		}
		create_empty_buffers(&folio->page, i_blocksize(inode), 0);
		head = folio_buffers(folio);
	}

	page_bh = head;
	do {
		if (block == page_block) {
			page_bh->b_state = bh->b_state;
			page_bh->b_bdev = bh->b_bdev;
			page_bh->b_blocknr = bh->b_blocknr;
			break;
		}
		page_bh = page_bh->b_this_page;
		block++;
	} while (page_bh != head);
}

struct mpage_readpage_args {
	struct bio *bio;
	struct folio *folio;
	unsigned int nr_pages;
	bool is_readahead;
	sector_t last_block_in_bio;
	struct buffer_head map_bh;
	unsigned long first_logical_block;
	get_block_t *get_block;
};
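/*
 * Editorial aside, not part of the original file: do_mpage_readpage() below
 * drives the fs-supplied get_block_t through the map_bh in the args struct
 * above.  A minimal sketch of that contract, assuming a hypothetical
 * filesystem whose blocks sit 1:1 on the backing device (the name
 * example_get_block is made up for illustration):
 */
static int __maybe_unused example_get_block(struct inode *inode,
		sector_t iblock, struct buffer_head *bh_result, int create)
{
	/*
	 * The caller primes bh_result: b_state is zeroed and b_size is set to
	 * the number of bytes it would like mapped.  On success we set
	 * BH_Mapped, b_bdev and b_blocknr; map_bh() does all three and resets
	 * b_size to one block, which a filesystem that found a longer
	 * contiguous extent would then grow (up to the caller's request).
	 */
	map_bh(bh_result, inode->i_sb, iblock);
	return 0;
}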
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructing the largest possible BIOs, submitting a BIO
 * whenever the blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
	struct folio *folio = args->folio;
	struct inode *inode = folio->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *map_bh = &args->map_bh;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	blk_opf_t opf = REQ_OP_READ;
	unsigned nblocks;
	unsigned relative_block;
	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

	/* MAX_BUF_PER_PAGE, for example */
	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

	if (args->is_readahead) {
		opf |= REQ_RAHEAD;
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	}

	if (folio_buffers(folio))
		goto confused;

	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	last_block = block_in_file + args->nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) &&
	    block_in_file > args->first_logical_block &&
	    block_in_file < (args->first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - args->first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
				relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this folio.
	 */
	map_bh->b_folio = folio;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (args->get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			args->first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_folio copies the data
		 * we just collected from get_block into the folio's buffers
		 * so read_folio doesn't have to repeat the get_block call
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_folio(folio, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
		if (first_hole == 0) {
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			goto out;
		}
	} else if (fully_mapped) {
		folio_set_mappedtodisk(folio);
	}

	/*
	 * This folio will go to BIO.  Do we need to send this BIO off first?
	 */
	if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
		args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
	if (args->bio == NULL) {
		args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
				      gfp);
		if (args->bio == NULL)
			goto confused;
		args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
	}

	length = first_hole << blkbits;
	if (!bio_add_folio(args->bio, folio, length, 0)) {
		args->bio = mpage_bio_submit_read(args->bio);
		goto alloc_new;
	}

	relative_block = block_in_file - args->first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_page))
		args->bio = mpage_bio_submit_read(args->bio);
	else
		args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
	return args->bio;

confused:
	if (args->bio)
		args->bio = mpage_bio_submit_read(args->bio);
	if (!folio_test_uptodate(folio))
		block_read_full_folio(folio, args->get_block);
	else
		folio_unlock(folio);
	goto out;
}
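/*
 * Editorial aside, not part of the original file: a sketch of the
 * BH_Boundary convention explained in the mpage_readahead() kernel-doc
 * below, for a hypothetical ext2-like layout in which 12 direct blocks
 * precede the indirect block (the function name and layout are made up):
 */
static int __maybe_unused example_boundary_get_block(struct inode *inode,
		sector_t iblock, struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, iblock);	/* illustrative 1:1 mapping */
	/*
	 * Mapping the block after number 11 means reading the indirect
	 * block, which sits on disk right after the direct blocks.  Setting
	 * BH_Boundary here makes the mpage code submit its accumulated BIO
	 * before that read, so requests reach the disk in ascending order.
	 */
	if (iblock == 11)
		set_buffer_boundary(bh_result);
	return 0;
}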
/**
 * mpage_readahead - start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
	struct folio *folio;
	struct mpage_readpage_args args = {
		.get_block = get_block,
		.is_readahead = true,
	};

	while ((folio = readahead_folio(rac))) {
		prefetchw(&folio->flags);
		args.folio = folio;
		args.nr_pages = readahead_count(rac);
		args.bio = do_mpage_readpage(&args);
	}
	if (args.bio)
		mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);

/*
 * This isn't called much at all
 */
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
	struct mpage_readpage_args args = {
		.folio = folio,
		.nr_pages = 1,
		.get_block = get_block,
	};

	args.bio = do_mpage_readpage(&args);
	if (args.bio)
		mpage_bio_submit_read(args.bio);
	return 0;
}
EXPORT_SYMBOL(mpage_read_folio);
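/*
 * Editorial aside, not part of the original file: a filesystem typically
 * wires the two read entry points above into its address_space_operations
 * like this (all names hypothetical; example_get_block is the illustrative
 * mapper sketched earlier):
 */
static int example_read_folio(struct file *file, struct folio *folio)
{
	return mpage_read_folio(folio, example_get_block);
}

static void example_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, example_get_block);
}

static const struct address_space_operations example_aops __maybe_unused = {
	.read_folio	= example_read_folio,
	.readahead	= example_readahead,
};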
/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to the mapping's writepage().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (16-page) BIOs.
 */

struct mpage_data {
	struct bio *bio;
	sector_t last_block_in_bio;
	get_block_t *get_block;
};

/*
 * We have our BIO, so we can now mark the buffers clean.  Make
 * sure to only clean buffers which we know we'll be writing.
 */
static void clean_buffers(struct page *page, unsigned first_unmapped)
{
	unsigned buffer_counter = 0;
	struct buffer_head *bh, *head;
	if (!page_has_buffers(page))
		return;
	head = page_buffers(page);
	bh = head;

	do {
		if (buffer_counter++ == first_unmapped)
			break;
		clear_buffer_dirty(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * we cannot drop the bh if the page is not uptodate or a concurrent
	 * read_folio would fail to serialize with the bh and it would read
	 * from disk before we reach the platter.
	 */
	if (buffer_heads_over_limit && PageUptodate(page))
		try_to_free_buffers(page_folio(page));
}

/*
 * For situations where we want to clean all buffers attached to a page.
 * We don't need to calculate how many buffers are attached to the page,
 * we just need to specify a number larger than the maximum number of buffers.
 */
void clean_page_buffers(struct page *page)
{
	clean_buffers(page, ~0U);
}

static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
		      void *data)
{
	struct mpage_data *mpd = data;
	struct bio *bio = mpd->bio;
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_unmapped = blocks_per_page;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	size_t length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;
	struct buffer_head *head = folio_buffers(folio);

	if (head) {
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * block_dirty_folio -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_page)
					first_unmapped = page_block;
				continue;
			}

			if (first_unmapped != blocks_per_page)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			if (page_block) {
				if (bh->b_blocknr != blocks[page_block-1] + 1)
					goto confused;
			}
			blocks[page_block++] = bh->b_blocknr;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_folio().  If this address_space is also
		 * using mpage_readahead then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!folio_test_uptodate(folio));
	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	/*
	 * Whole page beyond EOF?  Skip allocating blocks to avoid leaking
	 * space.
	 */
	if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
		goto page_is_mapped;
	last_block = (i_size - 1) >> blkbits;
	map_bh.b_folio = folio;
	for (page_block = 0; page_block < blocks_per_page; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (!buffer_mapped(&map_bh))
			goto confused;
		if (buffer_new(&map_bh))
			clean_bdev_bh_alias(&map_bh);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != blocks[page_block-1] + 1)
				goto confused;
		}
		blocks[page_block++] = map_bh.b_blocknr;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	/* Don't bother writing beyond EOF, truncate will discard the folio */
	if (folio_pos(folio) >= i_size)
		goto confused;
	length = folio_size(folio);
	if (folio_pos(folio) + length > i_size) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		length = i_size - folio_pos(folio);
		folio_zero_segment(folio, length, folio_size(folio));
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
		bio = mpage_bio_submit_write(bio);

alloc_new:
	if (bio == NULL) {
		bio = bio_alloc(bdev, BIO_MAX_VECS,
				REQ_OP_WRITE | wbc_to_write_flags(wbc),
				GFP_NOFS);
		bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
		wbc_init_bio(wbc, bio);
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
	length = first_unmapped << blkbits;
	if (!bio_add_folio(bio, folio, length, 0)) {
		bio = mpage_bio_submit_write(bio);
		goto alloc_new;
	}

	clean_buffers(&folio->page, first_unmapped);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	if (boundary || (first_unmapped != blocks_per_page)) {
		bio = mpage_bio_submit_write(bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = blocks[blocks_per_page - 1];
	}
	goto out;

confused:
	if (bio)
		bio = mpage_bio_submit_write(bio);

	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	ret = block_write_full_page(&folio->page, mpd->get_block, wbc);
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int
mpage_writepages(struct address_space *mapping,
		struct writeback_control *wbc, get_block_t get_block)
{
	struct mpage_data mpd = {
		.get_block = get_block,
	};
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit_write(mpd.bio);
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL(mpage_writepages);
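/*
 * Editorial aside, not part of the original file: the write side is wired up
 * the same way.  A minimal sketch reusing the hypothetical mapper from
 * earlier; this would sit in example_aops as .writepages:
 */
static int __maybe_unused example_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, example_get_block);
}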