// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2019 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"

#include "../internal.h"

/*
 * Structure allocated for each page when block size < PAGE_SIZE to track
 * sub-page uptodate status and I/O completions.
 */
struct iomap_page {
	atomic_t		read_count;
	atomic_t		write_count;
	spinlock_t		uptodate_lock;
	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
};

static inline struct iomap_page *to_iomap_page(struct page *page)
{
	if (page_has_private(page))
		return (struct iomap_page *)page_private(page);
	return NULL;
}

static struct bio_set iomap_ioend_bioset;

static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
	struct iomap_page *iop = to_iomap_page(page);

	if (iop || i_blocksize(inode) == PAGE_SIZE)
		return iop;

	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
	atomic_set(&iop->read_count, 0);
	atomic_set(&iop->write_count, 0);
	spin_lock_init(&iop->uptodate_lock);
	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);

	/*
	 * migrate_page_move_mapping() assumes that pages with private data have
	 * their count elevated by 1.
	 */
	attach_page_private(page, iop);
	return iop;
}

static void
iomap_page_release(struct page *page)
{
	struct iomap_page *iop = detach_page_private(page);

	if (!iop)
		return;
	WARN_ON_ONCE(atomic_read(&iop->read_count));
	WARN_ON_ONCE(atomic_read(&iop->write_count));
	kfree(iop);
}

/*
 * Calculate the range inside the page that we actually need to read.
 */
static void
iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
{
	loff_t orig_pos = *pos;
	loff_t isize = i_size_read(inode);
	unsigned block_bits = inode->i_blkbits;
	unsigned block_size = (1 << block_bits);
	unsigned poff = offset_in_page(*pos);
	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
	unsigned first = poff >> block_bits;
	unsigned last = (poff + plen - 1) >> block_bits;

	/*
	 * If the block size is smaller than the page size we need to check the
	 * per-block uptodate status and adjust the offset and length if needed
	 * to avoid reading in already uptodate ranges.
	 */
	if (iop) {
		unsigned int i;

		/* move forward for each leading block marked uptodate */
		for (i = first; i <= last; i++) {
			if (!test_bit(i, iop->uptodate))
				break;
			*pos += block_size;
			poff += block_size;
			plen -= block_size;
			first++;
		}

		/* truncate len if we find any trailing uptodate block(s) */
		for ( ; i <= last; i++) {
			if (test_bit(i, iop->uptodate)) {
				plen -= (last - i + 1) * block_size;
				last = i - 1;
				break;
			}
		}
	}

	/*
	 * If the extent spans the block that contains the i_size we need to
	 * handle both halves separately so that we properly zero data in the
	 * page cache for blocks that are entirely outside of i_size.
	 */
	if (orig_pos <= isize && orig_pos + length > isize) {
		unsigned end = offset_in_page(isize - 1) >> block_bits;

		if (first <= end && last > end)
			plen -= (last - end) * block_size;
	}

	*offp = poff;
	*lenp = plen;
}
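
/*
 * Mark the blocks covered by [off, off + len) uptodate in the per-block
 * bitmap.  Once every block in the page is uptodate the page itself is
 * marked uptodate, so future reads can skip it entirely.
 */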
static void
iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
{
	struct iomap_page *iop = to_iomap_page(page);
	struct inode *inode = page->mapping->host;
	unsigned first = off >> inode->i_blkbits;
	unsigned last = (off + len - 1) >> inode->i_blkbits;
	bool uptodate = true;
	unsigned long flags;
	unsigned int i;

	spin_lock_irqsave(&iop->uptodate_lock, flags);
	for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
		if (i >= first && i <= last)
			set_bit(i, iop->uptodate);
		else if (!test_bit(i, iop->uptodate))
			uptodate = false;
	}

	if (uptodate)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}

static void
iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
{
	if (PageError(page))
		return;

	if (page_has_private(page))
		iomap_iop_set_range_uptodate(page, off, len);
	else
		SetPageUptodate(page);
}

static void
iomap_read_finish(struct iomap_page *iop, struct page *page)
{
	if (!iop || atomic_dec_and_test(&iop->read_count))
		unlock_page(page);
}

static void
iomap_read_page_end_io(struct bio_vec *bvec, int error)
{
	struct page *page = bvec->bv_page;
	struct iomap_page *iop = to_iomap_page(page);

	if (unlikely(error)) {
		ClearPageUptodate(page);
		SetPageError(page);
	} else {
		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
	}

	iomap_read_finish(iop, page);
}

static void
iomap_read_end_io(struct bio *bio)
{
	int error = blk_status_to_errno(bio->bi_status);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all)
		iomap_read_page_end_io(bvec, error);
	bio_put(bio);
}

struct iomap_readpage_ctx {
	struct page		*cur_page;
	bool			cur_page_in_bio;
	struct bio		*bio;
	struct readahead_control *rac;
};

static void
iomap_read_inline_data(struct inode *inode, struct page *page,
		struct iomap *iomap)
{
	size_t size = i_size_read(inode);
	void *addr;

	if (PageUptodate(page))
		return;

	BUG_ON(page->index);
	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));

	addr = kmap_atomic(page);
	memcpy(addr, iomap->inline_data, size);
	memset(addr + size, 0, PAGE_SIZE - size);
	kunmap_atomic(addr);
	SetPageUptodate(page);
}

static inline bool iomap_block_needs_zeroing(struct inode *inode,
		struct iomap *iomap, loff_t pos)
{
	return iomap->type != IOMAP_MAPPED ||
		(iomap->flags & IOMAP_F_NEW) ||
		pos >= i_size_read(inode);
}

static loff_t
iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct iomap_readpage_ctx *ctx = data;
	struct page *page = ctx->cur_page;
	struct iomap_page *iop = iomap_page_create(inode, page);
	bool same_page = false, is_contig = false;
	loff_t orig_pos = pos;
	unsigned poff, plen;
	sector_t sector;

	if (iomap->type == IOMAP_INLINE) {
		WARN_ON_ONCE(pos);
		iomap_read_inline_data(inode, page, iomap);
		return PAGE_SIZE;
	}

	/* zero post-eof blocks as the page may be mapped */
	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
	if (plen == 0)
		goto done;

	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
		zero_user(page, poff, plen);
		iomap_set_range_uptodate(page, poff, plen);
		goto done;
	}

	ctx->cur_page_in_bio = true;

	/*
	 * Try to merge into a previous segment if we can.
	 */
	sector = iomap_sector(iomap, pos);
	if (ctx->bio && bio_end_sector(ctx->bio) == sector)
		is_contig = true;

	if (is_contig &&
	    __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
		if (!same_page && iop)
			atomic_inc(&iop->read_count);
		goto done;
	}

	/*
	 * If we start a new segment we need to increase the read count, and we
	 * need to do so before submitting any previous full bio to make sure
	 * that we don't prematurely unlock the page.
	 */
	if (iop)
		atomic_inc(&iop->read_count);

	if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
		gfp_t orig_gfp = gfp;
		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;

		if (ctx->bio)
			submit_bio(ctx->bio);

		if (ctx->rac) /* same as readahead_gfp_mask */
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
		/*
		 * If the bio_alloc fails, try it again for a single page to
		 * avoid having to deal with partial page reads.  This emulates
		 * what do_mpage_readpage does.
		 */
		if (!ctx->bio)
			ctx->bio = bio_alloc(orig_gfp, 1);
		ctx->bio->bi_opf = REQ_OP_READ;
		if (ctx->rac)
			ctx->bio->bi_opf |= REQ_RAHEAD;
		ctx->bio->bi_iter.bi_sector = sector;
		bio_set_dev(ctx->bio, iomap->bdev);
		ctx->bio->bi_end_io = iomap_read_end_io;
	}

	bio_add_page(ctx->bio, page, plen, poff);
done:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
	 * For that we have to include any leading non-uptodate ranges, but
	 * we can skip trailing ones as they will be handled in the next
	 * iteration.
	 */
	return pos - orig_pos + plen;
}

int
iomap_readpage(struct page *page, const struct iomap_ops *ops)
{
	struct iomap_readpage_ctx ctx = { .cur_page = page };
	struct inode *inode = page->mapping->host;
	unsigned poff;
	loff_t ret;

	trace_iomap_readpage(page->mapping->host, 1);

	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
		ret = iomap_apply(inode, page_offset(page) + poff,
				PAGE_SIZE - poff, 0, ops, &ctx,
				iomap_readpage_actor);
		if (ret <= 0) {
			WARN_ON_ONCE(ret == 0);
			SetPageError(page);
			break;
		}
	}

	if (ctx.bio) {
		submit_bio(ctx.bio);
		WARN_ON_ONCE(!ctx.cur_page_in_bio);
	} else {
		WARN_ON_ONCE(ctx.cur_page_in_bio);
		unlock_page(page);
	}

	/*
	 * Just like mpage_readahead and block_read_full_page we always
	 * return 0 and just mark the page as PageError on errors.  This
	 * should be cleaned up all through the stack eventually.
	 */
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_readpage);
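
/*
 * Walk the byte range handed to us by the readahead code, switching
 * ctx->cur_page to the next page from the readahead_control whenever we
 * cross a page boundary, and feed each page to iomap_readpage_actor().
 */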
static loff_t
iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	struct iomap_readpage_ctx *ctx = data;
	loff_t done, ret;

	for (done = 0; done < length; done += ret) {
		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
			if (!ctx->cur_page_in_bio)
				unlock_page(ctx->cur_page);
			put_page(ctx->cur_page);
			ctx->cur_page = NULL;
		}
		if (!ctx->cur_page) {
			ctx->cur_page = readahead_page(ctx->rac);
			ctx->cur_page_in_bio = false;
		}
		ret = iomap_readpage_actor(inode, pos + done, length - done,
				ctx, iomap, srcmap);
	}

	return done;
}

/**
 * iomap_readahead - Attempt to read pages from a file.
 * @rac: Describes the pages to be read.
 * @ops: The operations vector for the filesystem.
 *
 * This function is for filesystems to call to implement their readahead
 * address_space operation.
 *
 * Context: The @ops callbacks may submit I/O (eg to read the addresses of
 * blocks from disc), and may wait for it.  The caller may be trying to
 * access a different page, and so sleeping excessively should be avoided.
 * It may allocate memory, but should avoid costly allocations.  This
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
	struct inode *inode = rac->mapping->host;
	loff_t pos = readahead_pos(rac);
	loff_t length = readahead_length(rac);
	struct iomap_readpage_ctx ctx = {
		.rac	= rac,
	};

	trace_iomap_readahead(inode, readahead_count(rac));

	while (length > 0) {
		loff_t ret = iomap_apply(inode, pos, length, 0, ops,
				&ctx, iomap_readahead_actor);
		if (ret <= 0) {
			WARN_ON_ONCE(ret == 0);
			break;
		}
		pos += ret;
		length -= ret;
	}

	if (ctx.bio)
		submit_bio(ctx.bio);
	if (ctx.cur_page) {
		if (!ctx.cur_page_in_bio)
			unlock_page(ctx.cur_page);
		put_page(ctx.cur_page);
	}
}
EXPORT_SYMBOL_GPL(iomap_readahead);

/*
 * iomap_is_partially_uptodate checks whether blocks within a page are
 * uptodate or not.
 *
 * Returns true if all blocks which correspond to a file portion
 * we want to read within the page are uptodate.
 */
int
iomap_is_partially_uptodate(struct page *page, unsigned long from,
		unsigned long count)
{
	struct iomap_page *iop = to_iomap_page(page);
	struct inode *inode = page->mapping->host;
	unsigned len, first, last;
	unsigned i;

	/* Limit range to one page */
	len = min_t(unsigned, PAGE_SIZE - from, count);

	/* First and last blocks in range within page */
	first = from >> inode->i_blkbits;
	last = (from + len - 1) >> inode->i_blkbits;

	if (iop) {
		for (i = first; i <= last; i++)
			if (!test_bit(i, iop->uptodate))
				return 0;
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
	trace_iomap_releasepage(page->mapping->host, page_offset(page),
			PAGE_SIZE);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(), skip those here.
	 */
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	iomap_page_release(page);
	return 1;
}
EXPORT_SYMBOL_GPL(iomap_releasepage);

void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
	trace_iomap_invalidatepage(page->mapping->host, offset, len);

	/*
	 * If we are invalidating the entire page, clear the dirty state from it
	 * and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == PAGE_SIZE) {
		WARN_ON_ONCE(PageWriteback(page));
		cancel_dirty_page(page);
		iomap_page_release(page);
	}
}
EXPORT_SYMBOL_GPL(iomap_invalidatepage);

#ifdef CONFIG_MIGRATION
int
iomap_migrate_page(struct address_space *mapping, struct page *newpage,
		struct page *page, enum migrate_mode mode)
{
	int ret;

	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (page_has_private(page))
		attach_page_private(newpage, detach_page_private(page));

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */

enum {
	IOMAP_WRITE_F_UNSHARE		= (1 << 0),
};

static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}

static int
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
		unsigned plen, struct iomap *iomap)
{
	struct bio_vec bvec;
	struct bio bio;

	bio_init(&bio, &bvec, 1);
	bio.bi_opf = REQ_OP_READ;
	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_set_dev(&bio, iomap->bdev);
	__bio_add_page(&bio, page, plen, poff);
	return submit_bio_wait(&bio);
}

static int
__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
		struct page *page, struct iomap *srcmap)
{
	struct iomap_page *iop = iomap_page_create(inode, page);
	loff_t block_size = i_blocksize(inode);
	loff_t block_start = pos & ~(block_size - 1);
	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
	unsigned from = offset_in_page(pos), to = from + len, poff, plen;
	int status;

	if (PageUptodate(page))
		return 0;

	do {
		iomap_adjust_read_range(inode, iop, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
		    (from <= poff || from >= poff + plen) &&
		    (to <= poff || to >= poff + plen))
			continue;

		if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
			if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
				return -EIO;
			zero_user_segments(page, poff, from, to, poff + plen);
			iomap_set_range_uptodate(page, poff, plen);
			continue;
		}

		status = iomap_read_page_sync(block_start, page, poff, plen,
				srcmap);
		if (status)
			return status;
	} while ((block_start += plen) < block_end);

	return 0;
}
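
/*
 * Prepare a page for a buffered write: call the filesystem's optional
 * ->page_prepare hook, find and lock the page in the page cache, and make
 * sure any blocks that are not going to be overwritten in full are uptodate,
 * either by zeroing them or by reading them in from @srcmap.
 */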
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
{
	const struct iomap_page_ops *page_ops = iomap->page_ops;
	struct page *page;
	int status = 0;

	BUG_ON(pos + len > iomap->offset + iomap->length);
	if (srcmap != iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	if (page_ops && page_ops->page_prepare) {
		status = page_ops->page_prepare(inode, pos, len, iomap);
		if (status)
			return status;
	}

	page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
			AOP_FLAG_NOFS);
	if (!page) {
		status = -ENOMEM;
		goto out_no_page;
	}

	if (srcmap->type == IOMAP_INLINE)
		iomap_read_inline_data(inode, page, srcmap);
	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(inode, pos, len, flags, page,
				srcmap);

	if (unlikely(status))
		goto out_unlock;

	*pagep = page;
	return 0;

out_unlock:
	unlock_page(page);
	put_page(page);
	iomap_write_failed(inode, pos, len);

out_no_page:
	if (page_ops && page_ops->page_done)
		page_ops->page_done(inode, pos, 0, NULL, iomap);
	return status;
}

int
iomap_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	if (newly_dirty)
		__set_page_dirty(page, mapping, 0);
	unlock_page_memcg(page);

	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}
EXPORT_SYMBOL_GPL(iomap_set_page_dirty);

static int
__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	flush_dcache_page(page);

	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a readpage reading them and overwriting a
	 * partial write.  However if we have encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * readpage might come in and destroy our partial write.
	 *
	 * Do the simplest thing, and just treat any short write to a non
	 * uptodate page as a zero-length write, and force the caller to redo
	 * the whole thing.
	 */
	if (unlikely(copied < len && !PageUptodate(page)))
		return 0;
	iomap_set_range_uptodate(page, offset_in_page(pos), len);
	iomap_set_page_dirty(page);
	return copied;
}
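
/*
 * For inline mappings the data lives inside the iomap itself, so copy the
 * bytes that were just written in the page cache back into the inline area
 * and mark the inode dirty so the filesystem writes them out together with
 * the inode.
 */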
static int
iomap_write_end_inline(struct inode *inode, struct page *page,
		struct iomap *iomap, loff_t pos, unsigned copied)
{
	void *addr;

	WARN_ON_ONCE(!PageUptodate(page));
	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));

	addr = kmap_atomic(page);
	memcpy(iomap->inline_data + pos, addr + pos, copied);
	kunmap_atomic(addr);

	mark_inode_dirty(inode);
	return copied;
}

static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
		struct page *page, struct iomap *iomap, struct iomap *srcmap)
{
	const struct iomap_page_ops *page_ops = iomap->page_ops;
	loff_t old_size = inode->i_size;
	int ret;

	if (srcmap->type == IOMAP_INLINE) {
		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
				page, NULL);
	} else {
		ret = __iomap_write_end(inode, pos, len, copied, page);
	}

	/*
	 * Update the in-memory inode size after copying the data into the page
	 * cache.  It's up to the file system to write the updated size to disk,
	 * preferably after I/O completion so that no stale data is exposed.
	 */
	if (pos + ret > old_size) {
		i_size_write(inode, pos + ret);
		iomap->flags |= IOMAP_F_SIZE_CHANGED;
	}
	unlock_page(page);

	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	if (page_ops && page_ops->page_done)
		page_ops->page_done(inode, pos, ret, page, iomap);
	put_page(page);

	if (ret < len)
		iomap_write_failed(inode, pos, len);
	return ret;
}

static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		offset = offset_in_page(pos);
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
				srcmap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
				srcmap);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	return written ? written : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, written = 0;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;
	}

	return written ? written : ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);

static loff_t
iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap, struct iomap *srcmap)
{
	long status = 0;
	ssize_t written = 0;

	/* don't bother with blocks that are not shared to start with */
	if (!(iomap->flags & IOMAP_F_SHARED))
		return length;
	/* don't bother with holes or unwritten extents */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	do {
		unsigned long offset = offset_in_page(pos);
		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
		struct page *page;

		status = iomap_write_begin(inode, pos, bytes,
				IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
		if (unlikely(status))
			return status;

		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
				srcmap);
		if (unlikely(status <= 0)) {
			if (WARN_ON_ONCE(status == 0))
				return -EIO;
			return status;
		}

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (length);

	return written;
}

int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len) {
		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
				iomap_unshare_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
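
/*
 * Zero a sub-page range through the page cache: iomap_write_begin() brings
 * the affected blocks uptodate, the range is zeroed in the page, and
 * iomap_write_end() marks it dirty for writeback.
 */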
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
{
	struct page *page;
	int status;

	status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
	if (status)
		return status;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}

static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed? we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = offset_in_page(pos);
		bytes = min_t(loff_t, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = dax_iomap_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap,
					srcmap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
				ops, did_zero, iomap_zero_range_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
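
/*
 * Zero from @pos to the end of the block that contains it, typically called
 * when truncating a file so that stale data beyond the new EOF is not left
 * in the page cache.
 */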
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	struct page *page = data;
	int ret;

	if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
		if (ret)
			return ret;
		block_commit_write(page, 0, length);
	} else {
		WARN_ON_ONCE(!PageUptodate(page));
		iomap_page_create(inode, page);
		set_page_dirty(page);
	}

	return length;
}

vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	unsigned long length;
	loff_t offset;
	ssize_t ret;

	lock_page(page);
	ret = page_mkwrite_check_truncate(page, inode);
	if (ret < 0)
		goto out_unlock;
	length = ret;

	offset = page_offset(page);
	while (length > 0) {
		ret = iomap_apply(inode, offset, length,
				IOMAP_WRITE | IOMAP_FAULT, ops, page,
				iomap_page_mkwrite_actor);
		if (unlikely(ret <= 0))
			goto out_unlock;
		offset += ret;
		length -= ret;
	}

	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
out_unlock:
	unlock_page(page);
	return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

static void
iomap_finish_page_writeback(struct inode *inode, struct page *page,
		int error)
{
	struct iomap_page *iop = to_iomap_page(page);

	if (error) {
		SetPageError(page);
		mapping_set_error(inode->i_mapping, -EIO);
	}

	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
	WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);

	if (!iop || atomic_dec_and_test(&iop->write_count))
		end_page_writeback(page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
static void
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
	struct inode *inode = ioend->io_inode;
	struct bio *bio = &ioend->io_inline_bio;
	struct bio *last = ioend->io_bio, *next;
	u64 start = bio->bi_iter.bi_sector;
	loff_t offset = ioend->io_offset;
	bool quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec *bv;
		struct bvec_iter_all iter_all;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bv, bio, iter_all)
			iomap_finish_page_writeback(inode, bv->bv_page, error);
		bio_put(bio);
	}
	/* The ioend has been freed by bio_put() */

	if (unlikely(error && !quiet)) {
		printk_ratelimited(KERN_ERR
"%s: writeback error on inode %lu, offset %lld, sector %llu",
			inode->i_sb->s_id, inode->i_ino, offset, start);
	}
}

void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;

	list_replace_init(&ioend->io_list, &tmp);
	iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
		return false;
	if ((ioend->io_flags & IOMAP_F_SHARED) ^
	    (next->io_flags & IOMAP_F_SHARED))
		return false;
	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
	    (next->io_type == IOMAP_UNWRITTEN))
		return false;
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;
	return true;
}

void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
		void (*merge_private)(struct iomap_ioend *ioend,
				struct iomap_ioend *next))
{
	struct iomap_ioend *next;

	INIT_LIST_HEAD(&ioend->io_list);

	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
			io_list))) {
		if (!iomap_ioend_can_merge(ioend, next))
			break;
		list_move_tail(&next->io_list, &ioend->io_list);
		ioend->io_size += next->io_size;
		if (next->io_private && merge_private)
			merge_private(ioend, next);
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

void
iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);

static void iomap_writepage_end_bio(struct bio *bio)
{
	struct iomap_ioend *ioend = bio->bi_private;

	iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

/*
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio instead of
 * submitting it. This typically only happens on a filesystem shutdown.
 */
static int
iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
		int error)
{
	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = iomap_writepage_end_bio;

	if (wpc->ops->prepare_ioend)
		error = wpc->ops->prepare_ioend(ioend, error);
	if (error) {
		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		ioend->io_bio->bi_status = errno_to_blk_status(error);
		bio_endio(ioend->io_bio);
		return error;
	}

	submit_bio(ioend->io_bio);
	return 0;
}

static struct iomap_ioend *
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
		loff_t offset, sector_t sector, struct writeback_control *wbc)
{
	struct iomap_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
	bio_set_dev(bio, wpc->iomap.bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	bio->bi_write_hint = inode->i_write_hint;
	wbc_init_bio(wbc, bio);

	ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = wpc->iomap.type;
	ioend->io_flags = wpc->iomap.flags;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	ioend->io_private = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in iomap_finish_ioend().
 */
static struct bio *
iomap_chain_bio(struct bio *prev)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	bio_copy_dev(new, prev);/* also copies over blkcg information */
	new->bi_iter.bi_sector = bio_end_sector(prev);
	new->bi_opf = prev->bi_opf;
	new->bi_write_hint = prev->bi_write_hint;

	bio_chain(prev, new);
	bio_get(prev);		/* for iomap_finish_ioend */
	submit_bio(prev);
	return new;
}

static bool
iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
		sector_t sector)
{
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
		return false;
	if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	if (sector != bio_end_sector(wpc->ioend->io_bio))
		return false;
	return true;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
static void
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
		struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct list_head *iolist)
{
	sector_t sector = iomap_sector(&wpc->iomap, offset);
	unsigned len = i_blocksize(inode);
	unsigned poff = offset & (PAGE_SIZE - 1);
	bool merged, same_page = false;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
	}

	merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
			&same_page);
	if (iop && !same_page)
		atomic_inc(&iop->write_count);

	if (!merged) {
		if (bio_full(wpc->ioend->io_bio, len)) {
			wpc->ioend->io_bio =
				iomap_chain_bio(wpc->ioend->io_bio);
		}
		bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

	wpc->ioend->io_size += len;
	wbc_account_cgroup_owner(wbc, page, len);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding blocks to is cached on the writepage context, and if the new block
 * does not append to the cached ioend it will create a new ioend and cache
 * that instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has
 * been detected. While ioends are submitted immediately after they are
 * completed, batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on
 * the writepage context that the caller will need to submit.
 */
static int
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode,
		struct page *page, u64 end_offset)
{
	struct iomap_page *iop = to_iomap_page(page);
	struct iomap_ioend *ioend, *next;
	unsigned len = i_blocksize(inode);
	u64 file_offset; /* file offset of page */
	int error = 0, count = 0, i;
	LIST_HEAD(submit_list);

	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
	WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);

	/*
	 * Walk through the page to find areas to write back. If we run off the
	 * end of the current map or find the current map invalid, grab a new
	 * one.
	 */
	for (i = 0, file_offset = page_offset(page);
	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
	     i++, file_offset += len) {
		if (iop && !test_bit(i, iop->uptodate))
			continue;

		error = wpc->ops->map_blocks(wpc, inode, file_offset);
		if (error)
			break;
		if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
			continue;
		if (wpc->iomap.type == IOMAP_HOLE)
			continue;
		iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
				&submit_list);
		count++;
	}

	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
	WARN_ON_ONCE(!PageLocked(page));
	WARN_ON_ONCE(PageWriteback(page));

	/*
	 * We cannot cancel the ioend directly here on error. We may have
	 * already set other pages under writeback and hence we have to run I/O
	 * completion to mark the error state of the pages under writeback
	 * appropriately.
	 */
	if (unlikely(error)) {
		if (!count) {
			/*
			 * If the current page hasn't been added to ioend, it
			 * won't be affected by I/O completions and we must
			 * discard and unlock it right here.
			 */
			if (wpc->ops->discard_page)
				wpc->ops->discard_page(page);
			ClearPageUptodate(page);
			unlock_page(page);
			goto done;
		}

		/*
		 * If the page was not fully cleaned, we need to ensure that the
		 * higher layers come back to it correctly. That means we need
		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
		 * so another attempt to write this page in this writeback sweep
		 * will be made.
		 */
		set_page_writeback_keepwrite(page);
	} else {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	}

	unlock_page(page);

	/*
	 * Preserve the original error if there was one, otherwise catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		int error2;

		list_del_init(&ioend->io_list);
		error2 = iomap_submit_ioend(wpc, ioend, error);
		if (error2 && !error)
			error = error2;
	}

	/*
	 * We can end up here with no error and nothing to write only if we race
	 * with a partial page truncate on a sub-page block sized filesystem.
	 */
	if (!count)
		end_page_writeback(page);
done:
	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 */
static int
iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
{
	struct iomap_writepage_ctx *wpc = data;
	struct inode *inode = page->mapping->host;
	pgoff_t end_index;
	u64 end_offset;
	loff_t offset;

	trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim. We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called in a recursive filesystem reclaim context.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |                file mapping            | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    | see else |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |                file mapping              | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |                                |  Straddles  |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * iomap_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and be written out repeatedly which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return iomap_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

int
iomap_writepage(struct page *page, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	wpc->ops = ops;
	ret = iomap_do_writepage(page, wbc, wpc);
	if (!wpc->ioend)
		return ret;
	return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepage);

int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	wpc->ops = ops;
	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
	if (!wpc->ioend)
		return ret;
	return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);

static int __init iomap_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			   offsetof(struct iomap_ioend, io_inline_bio),
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);