// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2019 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"

#include "../internal.h"

/*
 * Structure allocated for each page or THP when block size < page size
 * to track sub-page uptodate status and I/O completions.
 */
struct iomap_page {
	atomic_t		read_bytes_pending;
	atomic_t		write_bytes_pending;
	spinlock_t		uptodate_lock;
	unsigned long		uptodate[];
};

static inline struct iomap_page *to_iomap_page(struct page *page)
{
	/*
	 * per-block data is stored in the head page.  Callers should
	 * not be dealing with tail pages, and if they are, they can
	 * call thp_head() first.
	 */
	VM_BUG_ON_PGFLAGS(PageTail(page), page);

	if (page_has_private(page))
		return (struct iomap_page *)page_private(page);
	return NULL;
}

static struct bio_set iomap_ioend_bioset;

static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
	struct iomap_page *iop = to_iomap_page(page);
	unsigned int nr_blocks = i_blocks_per_page(inode, page);

	if (iop || nr_blocks <= 1)
		return iop;

	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
			GFP_NOFS | __GFP_NOFAIL);
	spin_lock_init(&iop->uptodate_lock);
	if (PageUptodate(page))
		bitmap_fill(iop->uptodate, nr_blocks);
	attach_page_private(page, iop);
	return iop;
}

static void
iomap_page_release(struct page *page)
{
	struct iomap_page *iop = detach_page_private(page);
	unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);

	if (!iop)
		return;
	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
			PageUptodate(page));
	kfree(iop);
}

/*
 * Calculate the range inside the page that we actually need to read.
 */
static void
iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
{
	loff_t orig_pos = *pos;
	loff_t isize = i_size_read(inode);
	unsigned block_bits = inode->i_blkbits;
	unsigned block_size = (1 << block_bits);
	unsigned poff = offset_in_page(*pos);
	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
	unsigned first = poff >> block_bits;
	unsigned last = (poff + plen - 1) >> block_bits;

	/*
	 * If the block size is smaller than the page size, we need to check the
	 * per-block uptodate status and adjust the offset and length if needed
	 * to avoid reading in already uptodate ranges.
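	 * Leading uptodate blocks move the start of the range forward, while
	 * trailing uptodate blocks only shorten the returned length.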
	 */
	if (iop) {
		unsigned int i;

		/* move forward for each leading block marked uptodate */
		for (i = first; i <= last; i++) {
			if (!test_bit(i, iop->uptodate))
				break;
			*pos += block_size;
			poff += block_size;
			plen -= block_size;
			first++;
		}

		/* truncate len if we find any trailing uptodate block(s) */
		for ( ; i <= last; i++) {
			if (test_bit(i, iop->uptodate)) {
				plen -= (last - i + 1) * block_size;
				last = i - 1;
				break;
			}
		}
	}

	/*
	 * If the extent spans the block that contains the i_size, we need to
	 * handle both halves separately so that we properly zero data in the
	 * page cache for blocks that are entirely outside of i_size.
	 */
	if (orig_pos <= isize && orig_pos + length > isize) {
		unsigned end = offset_in_page(isize - 1) >> block_bits;

		if (first <= end && last > end)
			plen -= (last - end) * block_size;
	}

	*offp = poff;
	*lenp = plen;
}

static void
iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
{
	struct iomap_page *iop = to_iomap_page(page);
	struct inode *inode = page->mapping->host;
	unsigned first = off >> inode->i_blkbits;
	unsigned last = (off + len - 1) >> inode->i_blkbits;
	unsigned long flags;

	spin_lock_irqsave(&iop->uptodate_lock, flags);
	bitmap_set(iop->uptodate, first, last - first + 1);
	if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
		SetPageUptodate(page);
	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}

static void
iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
{
	if (PageError(page))
		return;

	if (page_has_private(page))
		iomap_iop_set_range_uptodate(page, off, len);
	else
		SetPageUptodate(page);
}

static void
iomap_read_page_end_io(struct bio_vec *bvec, int error)
{
	struct page *page = bvec->bv_page;
	struct iomap_page *iop = to_iomap_page(page);

	if (unlikely(error)) {
		ClearPageUptodate(page);
		SetPageError(page);
	} else {
		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
	}

	if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending))
		unlock_page(page);
}

static void
iomap_read_end_io(struct bio *bio)
{
	int error = blk_status_to_errno(bio->bi_status);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all)
		iomap_read_page_end_io(bvec, error);
	bio_put(bio);
}

struct iomap_readpage_ctx {
	struct page		*cur_page;
	bool			cur_page_in_bio;
	struct bio		*bio;
	struct readahead_control *rac;
};

/**
 * iomap_read_inline_data - copy inline data into the page cache
 * @iter: iteration structure
 * @page: page to copy to
 *
 * Copy the inline data in @iter into @page and zero out the rest of the page.
 * Only a single IOMAP_INLINE extent is allowed at the end of each file.
 * Returns zero for success to complete the read, or the usual negative errno.
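 * If the inline range does not start at offset zero within the page,
 * per-block uptodate state is attached so that only the copied and zeroed
 * tail of the page is marked uptodate.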
 */
static int iomap_read_inline_data(const struct iomap_iter *iter,
		struct page *page)
{
	const struct iomap *iomap = iomap_iter_srcmap(iter);
	size_t size = i_size_read(iter->inode) - iomap->offset;
	size_t poff = offset_in_page(iomap->offset);
	void *addr;

	if (PageUptodate(page))
		return 0;

	if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
		return -EIO;
	if (WARN_ON_ONCE(size > PAGE_SIZE -
			 offset_in_page(iomap->inline_data)))
		return -EIO;
	if (WARN_ON_ONCE(size > iomap->length))
		return -EIO;
	if (poff > 0)
		iomap_page_create(iter->inode, page);

	addr = kmap_local_page(page) + poff;
	memcpy(addr, iomap->inline_data, size);
	memset(addr + size, 0, PAGE_SIZE - poff - size);
	kunmap_local(addr);
	iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
	return 0;
}

static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
		loff_t pos)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);

	return srcmap->type != IOMAP_MAPPED ||
		(srcmap->flags & IOMAP_F_NEW) ||
		pos >= i_size_read(iter->inode);
}

static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx, loff_t offset)
{
	const struct iomap *iomap = &iter->iomap;
	loff_t pos = iter->pos + offset;
	loff_t length = iomap_length(iter) - offset;
	struct page *page = ctx->cur_page;
	struct iomap_page *iop;
	loff_t orig_pos = pos;
	unsigned poff, plen;
	sector_t sector;

	if (iomap->type == IOMAP_INLINE)
		return iomap_read_inline_data(iter, page);

	/* zero post-eof blocks as the page may be mapped */
	iop = iomap_page_create(iter->inode, page);
	iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen);
	if (plen == 0)
		goto done;

	if (iomap_block_needs_zeroing(iter, pos)) {
		zero_user(page, poff, plen);
		iomap_set_range_uptodate(page, poff, plen);
		goto done;
	}

	ctx->cur_page_in_bio = true;
	if (iop)
		atomic_add(plen, &iop->read_bytes_pending);

	sector = iomap_sector(iomap, pos);
	if (!ctx->bio ||
	    bio_end_sector(ctx->bio) != sector ||
	    bio_add_page(ctx->bio, page, plen, poff) != plen) {
		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
		gfp_t orig_gfp = gfp;
		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);

		if (ctx->bio)
			submit_bio(ctx->bio);

		if (ctx->rac) /* same as readahead_gfp_mask */
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
		/*
		 * If the bio_alloc fails, try it again for a single page to
		 * avoid having to deal with partial page reads.  This emulates
		 * what do_mpage_readpage does.
		 */
		if (!ctx->bio)
			ctx->bio = bio_alloc(orig_gfp, 1);
		ctx->bio->bi_opf = REQ_OP_READ;
		if (ctx->rac)
			ctx->bio->bi_opf |= REQ_RAHEAD;
		ctx->bio->bi_iter.bi_sector = sector;
		bio_set_dev(ctx->bio, iomap->bdev);
		ctx->bio->bi_end_io = iomap_read_end_io;
		__bio_add_page(ctx->bio, page, plen, poff);
	}
done:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
	 * For that, we have to include any leading non-uptodate ranges, but
	 * we can skip trailing ones as they will be handled in the next
	 * iteration.
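	 * In other words, the byte count returned here runs from the original
	 * position through the end of the range handled above.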
	 */
	return pos - orig_pos + plen;
}

int
iomap_readpage(struct page *page, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= page->mapping->host,
		.pos		= page_offset(page),
		.len		= PAGE_SIZE,
	};
	struct iomap_readpage_ctx ctx = {
		.cur_page	= page,
	};
	int ret;

	trace_iomap_readpage(page->mapping->host, 1);

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_readpage_iter(&iter, &ctx, 0);

	if (ret < 0)
		SetPageError(page);

	if (ctx.bio) {
		submit_bio(ctx.bio);
		WARN_ON_ONCE(!ctx.cur_page_in_bio);
	} else {
		WARN_ON_ONCE(ctx.cur_page_in_bio);
		unlock_page(page);
	}

	/*
	 * Just like mpage_readahead and block_read_full_page, we always
	 * return 0 and just mark the page as PageError on errors.  This
	 * should be cleaned up throughout the stack eventually.
	 */
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_readpage);

static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	loff_t length = iomap_length(iter);
	loff_t done, ret;

	for (done = 0; done < length; done += ret) {
		if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) {
			if (!ctx->cur_page_in_bio)
				unlock_page(ctx->cur_page);
			put_page(ctx->cur_page);
			ctx->cur_page = NULL;
		}
		if (!ctx->cur_page) {
			ctx->cur_page = readahead_page(ctx->rac);
			ctx->cur_page_in_bio = false;
		}
		ret = iomap_readpage_iter(iter, ctx, done);
		if (ret <= 0)
			return ret;
	}

	return done;
}

/**
 * iomap_readahead - Attempt to read pages from a file.
 * @rac: Describes the pages to be read.
 * @ops: The operations vector for the filesystem.
 *
 * This function is for filesystems to call to implement their readahead
 * address_space operation.
 *
 * Context: The @ops callbacks may submit I/O (eg to read the addresses of
 * blocks from disc), and may wait for it.  The caller may be trying to
 * access a different page, and so sleeping excessively should be avoided.
 * It may allocate memory, but should avoid costly allocations.  This
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= rac->mapping->host,
		.pos	= readahead_pos(rac),
		.len	= readahead_length(rac),
	};
	struct iomap_readpage_ctx ctx = {
		.rac	= rac,
	};

	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));

	while (iomap_iter(&iter, ops) > 0)
		iter.processed = iomap_readahead_iter(&iter, &ctx);

	if (ctx.bio)
		submit_bio(ctx.bio);
	if (ctx.cur_page) {
		if (!ctx.cur_page_in_bio)
			unlock_page(ctx.cur_page);
		put_page(ctx.cur_page);
	}
}
EXPORT_SYMBOL_GPL(iomap_readahead);

/*
 * iomap_is_partially_uptodate checks whether blocks within a page are
 * uptodate or not.
 *
 * Returns true if all blocks which correspond to a file portion
 * we want to read within the page are uptodate.
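 * Without an attached iomap_page there is no per-block state to consult,
 * so in that case we conservatively report the range as not uptodate.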
 */
int
iomap_is_partially_uptodate(struct page *page, unsigned long from,
		unsigned long count)
{
	struct iomap_page *iop = to_iomap_page(page);
	struct inode *inode = page->mapping->host;
	unsigned len, first, last;
	unsigned i;

	/* Limit range to one page */
	len = min_t(unsigned, PAGE_SIZE - from, count);

	/* First and last blocks in range within page */
	first = from >> inode->i_blkbits;
	last = (from + len - 1) >> inode->i_blkbits;

	if (iop) {
		for (i = first; i <= last; i++)
			if (!test_bit(i, iop->uptodate))
				return 0;
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
	trace_iomap_releasepage(page->mapping->host, page_offset(page),
			PAGE_SIZE);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have
	 * had the dirty bit cleared.  Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(); skip those here.
	 */
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	iomap_page_release(page);
	return 1;
}
EXPORT_SYMBOL_GPL(iomap_releasepage);

void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
	trace_iomap_invalidatepage(page->mapping->host, offset, len);

	/*
	 * If we're invalidating the entire page, clear the dirty state from it
	 * and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == PAGE_SIZE) {
		WARN_ON_ONCE(PageWriteback(page));
		cancel_dirty_page(page);
		iomap_page_release(page);
	}
}
EXPORT_SYMBOL_GPL(iomap_invalidatepage);

#ifdef CONFIG_MIGRATION
int
iomap_migrate_page(struct address_space *mapping, struct page *newpage,
		struct page *page, enum migrate_mode mode)
{
	int ret;

	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (page_has_private(page))
		attach_page_private(newpage, detach_page_private(page));

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */

static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
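	 * Page cache below the old size is left untouched.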
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}

static int
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
		unsigned plen, const struct iomap *iomap)
{
	struct bio_vec bvec;
	struct bio bio;

	bio_init(&bio, &bvec, 1);
	bio.bi_opf = REQ_OP_READ;
	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_set_dev(&bio, iomap->bdev);
	__bio_add_page(&bio, page, plen, poff);
	return submit_bio_wait(&bio);
}

static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		unsigned len, struct page *page)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct iomap_page *iop = iomap_page_create(iter->inode, page);
	loff_t block_size = i_blocksize(iter->inode);
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned from = offset_in_page(pos), to = from + len, poff, plen;

	if (PageUptodate(page))
		return 0;
	ClearPageError(page);

	do {
		iomap_adjust_read_range(iter->inode, iop, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		if (!(iter->flags & IOMAP_UNSHARE) &&
		    (from <= poff || from >= poff + plen) &&
		    (to <= poff || to >= poff + plen))
			continue;

		if (iomap_block_needs_zeroing(iter, block_start)) {
			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
				return -EIO;
			zero_user_segments(page, poff, from, to, poff + plen);
		} else {
			int status = iomap_read_page_sync(block_start, page,
					poff, plen, srcmap);
			if (status)
				return status;
		}
		iomap_set_range_uptodate(page, poff, plen);
	} while ((block_start += plen) < block_end);

	return 0;
}

static int iomap_write_begin_inline(const struct iomap_iter *iter,
		struct page *page)
{
	/* needs more work for the tailpacking case; disable for now */
	if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
		return -EIO;
	return iomap_read_inline_data(iter, page);
}

static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		unsigned len, struct page **pagep)
{
	const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct page *page;
	int status = 0;

	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	if (page_ops && page_ops->page_prepare) {
		status = page_ops->page_prepare(iter->inode, pos, len);
		if (status)
			return status;
	}

	page = grab_cache_page_write_begin(iter->inode->i_mapping,
				pos >> PAGE_SHIFT, AOP_FLAG_NOFS);
	if (!page) {
		status = -ENOMEM;
		goto out_no_page;
	}

	if (srcmap->type == IOMAP_INLINE)
		status = iomap_write_begin_inline(iter, page);
	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(iter, pos, len, page);

	if (unlikely(status))
		goto out_unlock;

	*pagep = page;
	return 0;

out_unlock:
	unlock_page(page);
	put_page(page);
	iomap_write_failed(iter->inode, pos, len);

out_no_page:
	if (page_ops && page_ops->page_done)
		page_ops->page_done(iter->inode, pos, 0, NULL);
	return status;
}

static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct page *page)
{
	flush_dcache_page(page);

	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a readpage reading them and overwriting a
	 * partial write.  However, if we've encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * readpage might come in and destroy our partial write.
	 *
	 * Do the simplest thing and just treat any short write to a
	 * non-uptodate page as a zero-length write, and force the caller to
	 * redo the whole thing.
	 */
	if (unlikely(copied < len && !PageUptodate(page)))
		return 0;
	iomap_set_range_uptodate(page, offset_in_page(pos), len);
	__set_page_dirty_nobuffers(page);
	return copied;
}

static size_t iomap_write_end_inline(const struct iomap_iter *iter,
		struct page *page, loff_t pos, size_t copied)
{
	const struct iomap *iomap = &iter->iomap;
	void *addr;

	WARN_ON_ONCE(!PageUptodate(page));
	BUG_ON(!iomap_inline_data_valid(iomap));

	flush_dcache_page(page);
	addr = kmap_local_page(page) + pos;
	memcpy(iomap_inline_data(iomap, pos), addr, copied);
	kunmap_local(addr);

	mark_inode_dirty(iter->inode);
	return copied;
}

/* Returns the number of bytes copied.  May be 0.  Cannot be an errno. */
static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
		size_t copied, struct page *page)
{
	const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t old_size = iter->inode->i_size;
	size_t ret;

	if (srcmap->type == IOMAP_INLINE) {
		ret = iomap_write_end_inline(iter, page, pos, copied);
	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
				copied, page, NULL);
	} else {
		ret = __iomap_write_end(iter->inode, pos, len, copied, page);
	}

	/*
	 * Update the in-memory inode size after copying the data into the page
	 * cache.  It's up to the file system to write the updated size to disk,
	 * preferably after I/O completion so that no stale data is exposed.
	 */
	if (pos + ret > old_size) {
		i_size_write(iter->inode, pos + ret);
		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
	}
	unlock_page(page);

	if (old_size < pos)
		pagecache_isize_extended(iter->inode, old_size, pos);
	if (page_ops && page_ops->page_done)
		page_ops->page_done(iter->inode, pos, ret, page);
	put_page(page);

	if (ret < len)
		iomap_write_failed(iter->inode, pos, len);
	return ret;
}

static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
	loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	ssize_t written = 0;
	long status = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		offset = offset_in_page(pos);
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we'll copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(iter, pos, bytes, &page);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(iter->inode->i_mapping))
			flush_dcache_page(page);

		copied = copy_page_from_iter_atomic(page, offset, bytes, i);

		status = iomap_write_end(iter, pos, bytes, copied, page);

		if (unlikely(copied != status))
			iov_iter_revert(i, copied - status);

		cond_resched();
		if (unlikely(status == 0)) {
			/*
			 * A short copy made iomap_write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (copied)
				bytes = copied;
			goto again;
		}
		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
	} while (iov_iter_count(i) && length);

	return written ? written : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(i),
		.flags		= IOMAP_WRITE,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_write_iter(&iter, i);
	if (iter.pos == iocb->ki_pos)
		return ret;
	return iter.pos - iocb->ki_pos;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);

static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	long status = 0;
	loff_t written = 0;

	/* don't bother with blocks that are not shared to start with */
	if (!(iomap->flags & IOMAP_F_SHARED))
		return length;
	/* don't bother with holes or unwritten extents */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	do {
		unsigned long offset = offset_in_page(pos);
		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
		struct page *page;

		status = iomap_write_begin(iter, pos, bytes, &page);
		if (unlikely(status))
			return status;

		status = iomap_write_end(iter, pos, bytes, bytes, page);
		if (WARN_ON_ONCE(status == 0))
			return -EIO;

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
	} while (length);

	return written;
}

int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);

static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
{
	struct page *page;
	int status;
	unsigned offset = offset_in_page(pos);
	unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);

	status = iomap_write_begin(iter, pos, bytes, &page);
	if (status)
		return status;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(iter, pos, bytes, bytes, page);
}

static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	loff_t written = 0;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	do {
		s64 bytes;

		if (IS_DAX(iter->inode))
			bytes = dax_iomap_zero(pos, length, iomap);
		else
			bytes = __iomap_zero_iter(iter, pos, length);
		if (bytes < 0)
			return bytes;

		pos += bytes;
		length -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (length > 0);

	return written;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_ZERO,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_zero_iter(&iter, did_zero);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter,
		struct page *page)
{
	loff_t length = iomap_length(iter);
	int ret;

	if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
		ret = __block_write_begin_int(page, iter->pos, length, NULL,
					      &iter->iomap);
		if (ret)
			return ret;
		block_commit_write(page, 0, length);
	} else {
		WARN_ON_ONCE(!PageUptodate(page));
		set_page_dirty(page);
	}

	return length;
}

vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= file_inode(vmf->vma->vm_file),
		.flags		= IOMAP_WRITE | IOMAP_FAULT,
	};
	struct page *page = vmf->page;
	ssize_t ret;

	lock_page(page);
	ret = page_mkwrite_check_truncate(page, iter.inode);
	if (ret < 0)
		goto out_unlock;
	iter.pos = page_offset(page);
	iter.len = ret;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_page_mkwrite_iter(&iter, page);

	if (ret < 0)
		goto out_unlock;
	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
out_unlock:
	unlock_page(page);
	return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

static void
iomap_finish_page_writeback(struct inode *inode, struct page *page,
		int error, unsigned int len)
{
	struct iomap_page *iop = to_iomap_page(page);

	if (error) {
		SetPageError(page);
		mapping_set_error(inode->i_mapping, error);
	}

	WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
	WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);

	if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
		end_page_writeback(page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
static void
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
	struct inode *inode = ioend->io_inode;
	struct bio *bio = &ioend->io_inline_bio;
	struct bio *last = ioend->io_bio, *next;
	u64 start = bio->bi_iter.bi_sector;
	loff_t offset = ioend->io_offset;
	bool quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec *bv;
		struct bvec_iter_all iter_all;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bv, bio, iter_all)
			iomap_finish_page_writeback(inode, bv->bv_page, error,
					bv->bv_len);
		bio_put(bio);
	}
	/* The ioend has been freed by bio_put() */

	if (unlikely(error && !quiet)) {
		printk_ratelimited(KERN_ERR
"%s: writeback error on inode %lu, offset %lld, sector %llu",
			inode->i_sb->s_id, inode->i_ino, offset, start);
	}
}

void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;

	list_replace_init(&ioend->io_list, &tmp);
	iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
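 * That means the same bio completion status, the same shared and unwritten
 * flags, and file ranges that line up back to back.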
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
		return false;
	if ((ioend->io_flags & IOMAP_F_SHARED) ^
	    (next->io_flags & IOMAP_F_SHARED))
		return false;
	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
	    (next->io_type == IOMAP_UNWRITTEN))
		return false;
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;
	return true;
}

void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
	struct iomap_ioend *next;

	INIT_LIST_HEAD(&ioend->io_list);

	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
			io_list))) {
		if (!iomap_ioend_can_merge(ioend, next))
			break;
		list_move_tail(&next->io_list, &ioend->io_list);
		ioend->io_size += next->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

void
iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);

static void iomap_writepage_end_bio(struct bio *bio)
{
	struct iomap_ioend *ioend = bio->bi_private;

	iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

/*
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback
 * and unlocked them.  In this situation, we need to fail the bio instead of
 * submitting it.  This typically only happens on a filesystem shutdown.
 */
static int
iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
		int error)
{
	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = iomap_writepage_end_bio;

	if (wpc->ops->prepare_ioend)
		error = wpc->ops->prepare_ioend(ioend, error);
	if (error) {
		/*
		 * If we're failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion
		 * immediately as there is only one reference to the ioend at
		 * this point in time.
		 */
		ioend->io_bio->bi_status = errno_to_blk_status(error);
		bio_endio(ioend->io_bio);
		return error;
	}

	submit_bio(ioend->io_bio);
	return 0;
}

static struct iomap_ioend *
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
		loff_t offset, sector_t sector, struct writeback_control *wbc)
{
	struct iomap_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
	bio_set_dev(bio, wpc->iomap.bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	bio->bi_write_hint = inode->i_write_hint;
	wbc_init_bio(wbc, bio);

	ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = wpc->iomap.type;
	ioend->io_flags = wpc->iomap.flags;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in iomap_finish_ioend().
 */
static struct bio *
iomap_chain_bio(struct bio *prev)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
	bio_copy_dev(new, prev);/* also copies over blkcg information */
	new->bi_iter.bi_sector = bio_end_sector(prev);
	new->bi_opf = prev->bi_opf;
	new->bi_write_hint = prev->bi_write_hint;

	bio_chain(prev, new);
	bio_get(prev);		/* for iomap_finish_ioend */
	submit_bio(prev);
	return new;
}

static bool
iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
		sector_t sector)
{
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
		return false;
	if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	if (sector != bio_end_sector(wpc->ioend->io_bio))
		return false;
	return true;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first; otherwise finish off the current ioend and start another.
 */
static void
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
		struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct list_head *iolist)
{
	sector_t sector = iomap_sector(&wpc->iomap, offset);
	unsigned len = i_blocksize(inode);
	unsigned poff = offset & (PAGE_SIZE - 1);

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
	}

	if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) {
		wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

	if (iop)
		atomic_add(len, &iop->write_bytes_pending);
	wpc->ioend->io_size += len;
	wbc_account_cgroup_owner(wbc, page, len);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * the forward progress guarantees we need to provide. The current ioend we're
 * adding blocks to is cached in the writepage context, and if the new block
 * doesn't append to the cached ioend, it will create a new ioend and cache
 * that instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has
 * been detected. While ioends are submitted immediately after they are
 * completed, batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on
 * the writepage context that the caller will need to submit.
 */
static int
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode,
		struct page *page, u64 end_offset)
{
	struct iomap_page *iop = iomap_page_create(inode, page);
	struct iomap_ioend *ioend, *next;
	unsigned len = i_blocksize(inode);
	u64 file_offset; /* file offset of page */
	int error = 0, count = 0, i;
	LIST_HEAD(submit_list);

	WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);

	/*
	 * Walk through the page to find areas to write back. If we run off the
	 * end of the current map or find the current map invalid, grab a new
	 * one.
	 */
	for (i = 0, file_offset = page_offset(page);
	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
	     i++, file_offset += len) {
		if (iop && !test_bit(i, iop->uptodate))
			continue;

		error = wpc->ops->map_blocks(wpc, inode, file_offset);
		if (error)
			break;
		if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
			continue;
		if (wpc->iomap.type == IOMAP_HOLE)
			continue;
		iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
				 &submit_list);
		count++;
	}

	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
	WARN_ON_ONCE(!PageLocked(page));
	WARN_ON_ONCE(PageWriteback(page));
	WARN_ON_ONCE(PageDirty(page));

	/*
	 * We cannot cancel the ioend directly here on error.  We may have
	 * already set other pages under writeback and hence we have to run I/O
	 * completion to mark the error state of the pages under writeback
	 * appropriately.
	 */
	if (unlikely(error)) {
		/*
		 * Let the filesystem know what portion of the current page
		 * failed to map. If the page hasn't been added to ioend, it
		 * won't be affected by I/O completion and we must unlock it
		 * now.
		 */
		if (wpc->ops->discard_page)
			wpc->ops->discard_page(page, file_offset);
		if (!count) {
			ClearPageUptodate(page);
			unlock_page(page);
			goto done;
		}
	}

	set_page_writeback(page);
	unlock_page(page);

	/*
	 * Preserve the original error if there was one; catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		int error2;

		list_del_init(&ioend->io_list);
		error2 = iomap_submit_ioend(wpc, ioend, error);
		if (error2 && !error)
			error = error2;
	}

	/*
	 * We can end up here with no error and nothing to write only if we race
	 * with a partial page truncate on a sub-page block sized filesystem.
	 */
	if (!count)
		end_page_writeback(page);
done:
	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page, we need to allocate space and flush it.
 * For unwritten space on the page, we need to start the conversion to
 * regular allocated space.
 */
static int
iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
{
	struct iomap_writepage_ctx *wpc = data;
	struct inode *inode = page->mapping->host;
	pgoff_t end_index;
	u64 end_offset;
	loff_t offset;

	trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);

	/*
	 * Refuse to write the page out if we're called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        |  <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    | Straddles  |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it's fully outside i_size, e.g. due to a
		 * truncate operation that's in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * iomap_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long.  If the given
		 * offset is greater than 16TB on a 32-bit system then if we
		 * checked if the page is fully outside i_size with
		 * "if (page->index >= end_index + 1)", "end_index + 1" would
		 * overflow and evaluate to 0.  Hence this page would be
		 * redirtied and written out repeatedly, which would result in
		 * an infinite loop; the user program performing this operation
		 * would hang.  Instead, we can detect this situation by
		 * checking if the page is totally beyond i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return iomap_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

int
iomap_writepage(struct page *page, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	wpc->ops = ops;
	ret = iomap_do_writepage(page, wbc, wpc);
	if (!wpc->ioend)
		return ret;
	return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepage);

int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	wpc->ops = ops;
	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
	if (!wpc->ioend)
		return ret;
	return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);

static int __init iomap_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			   offsetof(struct iomap_ioend, io_inline_bio),
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
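
/*
 * Example usage (illustrative sketch only, not part of this file): a
 * filesystem built on these helpers typically wires them into its
 * address_space operations roughly as follows.  The example_* names and the
 * example_iomap_ops / example_writeback_ops structures are hypothetical
 * stand-ins for the filesystem's own iomap_ops and iomap_writeback_ops
 * implementations.
 *
 *	static int example_readpage(struct file *unused, struct page *page)
 *	{
 *		return iomap_readpage(page, &example_iomap_ops);
 *	}
 *
 *	static void example_readahead(struct readahead_control *rac)
 *	{
 *		iomap_readahead(rac, &example_iomap_ops);
 *	}
 *
 *	static int example_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		struct iomap_writepage_ctx wpc = { };
 *
 *		return iomap_writepages(mapping, wbc, &wpc,
 *				&example_writeback_ops);
 *	}
 *
 *	const struct address_space_operations example_aops = {
 *		.readpage		= example_readpage,
 *		.readahead		= example_readahead,
 *		.writepages		= example_writepages,
 *		.releasepage		= iomap_releasepage,
 *		.invalidatepage		= iomap_invalidatepage,
 *		.is_partially_uptodate	= iomap_is_partially_uptodate,
 *	};
 */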