1 /* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_shared.h" 20 #include "xfs_format.h" 21 #include "xfs_log_format.h" 22 #include "xfs_trans_resv.h" 23 #include "xfs_mount.h" 24 #include "xfs_inode.h" 25 #include "xfs_trans.h" 26 #include "xfs_inode_item.h" 27 #include "xfs_alloc.h" 28 #include "xfs_error.h" 29 #include "xfs_iomap.h" 30 #include "xfs_trace.h" 31 #include "xfs_bmap.h" 32 #include "xfs_bmap_util.h" 33 #include "xfs_bmap_btree.h" 34 #include <linux/gfp.h> 35 #include <linux/mpage.h> 36 #include <linux/pagevec.h> 37 #include <linux/writeback.h> 38 39 void 40 xfs_count_page_state( 41 struct page *page, 42 int *delalloc, 43 int *unwritten) 44 { 45 struct buffer_head *bh, *head; 46 47 *delalloc = *unwritten = 0; 48 49 bh = head = page_buffers(page); 50 do { 51 if (buffer_unwritten(bh)) 52 (*unwritten) = 1; 53 else if (buffer_delay(bh)) 54 (*delalloc) = 1; 55 } while ((bh = bh->b_this_page) != head); 56 } 57 58 STATIC struct block_device * 59 xfs_find_bdev_for_inode( 60 struct inode *inode) 61 { 62 struct xfs_inode *ip = XFS_I(inode); 63 struct xfs_mount *mp = ip->i_mount; 64 65 if (XFS_IS_REALTIME_INODE(ip)) 66 return mp->m_rtdev_targp->bt_bdev; 67 else 68 return mp->m_ddev_targp->bt_bdev; 69 } 70 71 /* 72 * We're now finished for good with this ioend structure. 73 * Update the page state via the associated buffer_heads, 74 * release holds on the inode and bio, and finally free 75 * up memory. Do not use the ioend after this. 76 */ 77 STATIC void 78 xfs_destroy_ioend( 79 xfs_ioend_t *ioend) 80 { 81 struct buffer_head *bh, *next; 82 83 for (bh = ioend->io_buffer_head; bh; bh = next) { 84 next = bh->b_private; 85 bh->b_end_io(bh, !ioend->io_error); 86 } 87 88 mempool_free(ioend, xfs_ioend_pool); 89 } 90 91 /* 92 * Fast and loose check if this write could update the on-disk inode size. 93 */ 94 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) 95 { 96 return ioend->io_offset + ioend->io_size > 97 XFS_I(ioend->io_inode)->i_d.di_size; 98 } 99 100 STATIC int 101 xfs_setfilesize_trans_alloc( 102 struct xfs_ioend *ioend) 103 { 104 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 105 struct xfs_trans *tp; 106 int error; 107 108 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 111 if (error) { 112 xfs_trans_cancel(tp); 113 return error; 114 } 115 116 ioend->io_append_trans = tp; 117 118 /* 119 * We may pass freeze protection with a transaction. So tell lockdep 120 * we released it. 121 */ 122 __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); 123 /* 124 * We hand off the transaction to the completion thread now, so 125 * clear the flag here. 
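 * The completion side, xfs_setfilesize_ioend(), re-takes both: it sets
 * PF_FSTRANS again via current_set_flags_nested() and re-acquires freeze
 * protection with __sb_writers_acquired() before the transaction is
 * committed (or cancelled) in xfs_setfilesize().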
126 */ 127 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 128 return 0; 129 } 130 131 /* 132 * Update on-disk file size now that data has been written to disk. 133 */ 134 STATIC int 135 xfs_setfilesize( 136 struct xfs_inode *ip, 137 struct xfs_trans *tp, 138 xfs_off_t offset, 139 size_t size) 140 { 141 xfs_fsize_t isize; 142 143 xfs_ilock(ip, XFS_ILOCK_EXCL); 144 isize = xfs_new_eof(ip, offset + size); 145 if (!isize) { 146 xfs_iunlock(ip, XFS_ILOCK_EXCL); 147 xfs_trans_cancel(tp); 148 return 0; 149 } 150 151 trace_xfs_setfilesize(ip, offset, size); 152 153 ip->i_d.di_size = isize; 154 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 155 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 156 157 return xfs_trans_commit(tp); 158 } 159 160 STATIC int 161 xfs_setfilesize_ioend( 162 struct xfs_ioend *ioend) 163 { 164 struct xfs_inode *ip = XFS_I(ioend->io_inode); 165 struct xfs_trans *tp = ioend->io_append_trans; 166 167 /* 168 * The transaction may have been allocated in the I/O submission thread, 169 * thus we need to mark ourselves as being in a transaction manually. 170 * Similarly for freeze protection. 171 */ 172 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 173 __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); 174 175 return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); 176 } 177 178 /* 179 * Schedule IO completion handling on the final put of an ioend. 180 * 181 * If there is no work to do we might as well call it a day and free the 182 * ioend right now. 183 */ 184 STATIC void 185 xfs_finish_ioend( 186 struct xfs_ioend *ioend) 187 { 188 if (atomic_dec_and_test(&ioend->io_remaining)) { 189 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 190 191 if (ioend->io_type == XFS_IO_UNWRITTEN) 192 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 193 else if (ioend->io_append_trans) 194 queue_work(mp->m_data_workqueue, &ioend->io_work); 195 else 196 xfs_destroy_ioend(ioend); 197 } 198 } 199 200 /* 201 * IO write completion. 202 */ 203 STATIC void 204 xfs_end_io( 205 struct work_struct *work) 206 { 207 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode); 209 int error = 0; 210 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 212 ioend->io_error = -EIO; 213 goto done; 214 } 215 if (ioend->io_error) 216 goto done; 217 218 /* 219 * For unwritten extents we need to issue transactions to convert a 220 * range to normal written extens after the data I/O has finished. 221 */ 222 if (ioend->io_type == XFS_IO_UNWRITTEN) { 223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 224 ioend->io_size); 225 } else if (ioend->io_append_trans) { 226 error = xfs_setfilesize_ioend(ioend); 227 } else { 228 ASSERT(!xfs_ioend_is_append(ioend)); 229 } 230 231 done: 232 if (error) 233 ioend->io_error = error; 234 xfs_destroy_ioend(ioend); 235 } 236 237 /* 238 * Allocate and initialise an IO completion structure. 239 * We need to track unwritten extent write completion here initially. 240 * We'll need to extend this for updating the ondisk inode size later 241 * (vs. incore size). 242 */ 243 STATIC xfs_ioend_t * 244 xfs_alloc_ioend( 245 struct inode *inode, 246 unsigned int type) 247 { 248 xfs_ioend_t *ioend; 249 250 ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); 251 252 /* 253 * Set the count to 1 initially, which will prevent an I/O 254 * completion callback from happening before we have started 255 * all the I/O from calling the completion routine too early. 
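 * The submission path drops this initial reference in xfs_finish_ioend(),
 * which xfs_submit_ioend() calls once every bio for the ioend has been
 * issued.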
256 */ 257 atomic_set(&ioend->io_remaining, 1); 258 ioend->io_error = 0; 259 ioend->io_list = NULL; 260 ioend->io_type = type; 261 ioend->io_inode = inode; 262 ioend->io_buffer_head = NULL; 263 ioend->io_buffer_tail = NULL; 264 ioend->io_offset = 0; 265 ioend->io_size = 0; 266 ioend->io_append_trans = NULL; 267 268 INIT_WORK(&ioend->io_work, xfs_end_io); 269 return ioend; 270 } 271 272 STATIC int 273 xfs_map_blocks( 274 struct inode *inode, 275 loff_t offset, 276 struct xfs_bmbt_irec *imap, 277 int type, 278 int nonblocking) 279 { 280 struct xfs_inode *ip = XFS_I(inode); 281 struct xfs_mount *mp = ip->i_mount; 282 ssize_t count = 1 << inode->i_blkbits; 283 xfs_fileoff_t offset_fsb, end_fsb; 284 int error = 0; 285 int bmapi_flags = XFS_BMAPI_ENTIRE; 286 int nimaps = 1; 287 288 if (XFS_FORCED_SHUTDOWN(mp)) 289 return -EIO; 290 291 if (type == XFS_IO_UNWRITTEN) 292 bmapi_flags |= XFS_BMAPI_IGSTATE; 293 294 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 295 if (nonblocking) 296 return -EAGAIN; 297 xfs_ilock(ip, XFS_ILOCK_SHARED); 298 } 299 300 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 301 (ip->i_df.if_flags & XFS_IFEXTENTS)); 302 ASSERT(offset <= mp->m_super->s_maxbytes); 303 304 if (offset + count > mp->m_super->s_maxbytes) 305 count = mp->m_super->s_maxbytes - offset; 306 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 307 offset_fsb = XFS_B_TO_FSBT(mp, offset); 308 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 309 imap, &nimaps, bmapi_flags); 310 xfs_iunlock(ip, XFS_ILOCK_SHARED); 311 312 if (error) 313 return error; 314 315 if (type == XFS_IO_DELALLOC && 316 (!nimaps || isnullstartblock(imap->br_startblock))) { 317 error = xfs_iomap_write_allocate(ip, offset, imap); 318 if (!error) 319 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); 320 return error; 321 } 322 323 #ifdef DEBUG 324 if (type == XFS_IO_UNWRITTEN) { 325 ASSERT(nimaps); 326 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 327 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 328 } 329 #endif 330 if (nimaps) 331 trace_xfs_map_blocks_found(ip, offset, count, type, imap); 332 return 0; 333 } 334 335 STATIC int 336 xfs_imap_valid( 337 struct inode *inode, 338 struct xfs_bmbt_irec *imap, 339 xfs_off_t offset) 340 { 341 offset >>= inode->i_blkbits; 342 343 return offset >= imap->br_startoff && 344 offset < imap->br_startoff + imap->br_blockcount; 345 } 346 347 /* 348 * BIO completion handler for buffered IO. 349 */ 350 STATIC void 351 xfs_end_bio( 352 struct bio *bio) 353 { 354 xfs_ioend_t *ioend = bio->bi_private; 355 356 if (!ioend->io_error) 357 ioend->io_error = bio->bi_error; 358 359 /* Toss bio and pass work off to an xfsdatad thread */ 360 bio->bi_private = NULL; 361 bio->bi_end_io = NULL; 362 bio_put(bio); 363 364 xfs_finish_ioend(ioend); 365 } 366 367 STATIC void 368 xfs_submit_ioend_bio( 369 struct writeback_control *wbc, 370 xfs_ioend_t *ioend, 371 struct bio *bio) 372 { 373 atomic_inc(&ioend->io_remaining); 374 bio->bi_private = ioend; 375 bio->bi_end_io = xfs_end_bio; 376 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
		   WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, then we can end up with a page that only has some of its
 * buffers marked async write, and I/O completion on those can occur before we
 * mark the remaining buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
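 * The @fail error code is stashed in ioend->io_error and the ioend is then
 * completed through the normal xfs_finish_ioend() path without any bios
 * being built or submitted.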
460 */ 461 STATIC void 462 xfs_submit_ioend( 463 struct writeback_control *wbc, 464 xfs_ioend_t *ioend, 465 int fail) 466 { 467 xfs_ioend_t *head = ioend; 468 xfs_ioend_t *next; 469 struct buffer_head *bh; 470 struct bio *bio; 471 sector_t lastblock = 0; 472 473 /* Pass 1 - start writeback */ 474 do { 475 next = ioend->io_list; 476 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 477 xfs_start_buffer_writeback(bh); 478 } while ((ioend = next) != NULL); 479 480 /* Pass 2 - submit I/O */ 481 ioend = head; 482 do { 483 next = ioend->io_list; 484 bio = NULL; 485 486 /* 487 * If we are failing the IO now, just mark the ioend with an 488 * error and finish it. This will run IO completion immediately 489 * as there is only one reference to the ioend at this point in 490 * time. 491 */ 492 if (fail) { 493 ioend->io_error = fail; 494 xfs_finish_ioend(ioend); 495 continue; 496 } 497 498 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 499 500 if (!bio) { 501 retry: 502 bio = xfs_alloc_ioend_bio(bh); 503 } else if (bh->b_blocknr != lastblock + 1) { 504 xfs_submit_ioend_bio(wbc, ioend, bio); 505 goto retry; 506 } 507 508 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { 509 xfs_submit_ioend_bio(wbc, ioend, bio); 510 goto retry; 511 } 512 513 lastblock = bh->b_blocknr; 514 } 515 if (bio) 516 xfs_submit_ioend_bio(wbc, ioend, bio); 517 xfs_finish_ioend(ioend); 518 } while ((ioend = next) != NULL); 519 } 520 521 /* 522 * Cancel submission of all buffer_heads so far in this endio. 523 * Toss the endio too. Only ever called for the initial page 524 * in a writepage request, so only ever one page. 525 */ 526 STATIC void 527 xfs_cancel_ioend( 528 xfs_ioend_t *ioend) 529 { 530 xfs_ioend_t *next; 531 struct buffer_head *bh, *next_bh; 532 533 do { 534 next = ioend->io_list; 535 bh = ioend->io_buffer_head; 536 do { 537 next_bh = bh->b_private; 538 clear_buffer_async_write(bh); 539 /* 540 * The unwritten flag is cleared when added to the 541 * ioend. We're not submitting for I/O so mark the 542 * buffer unwritten again for next time around. 543 */ 544 if (ioend->io_type == XFS_IO_UNWRITTEN) 545 set_buffer_unwritten(bh); 546 unlock_buffer(bh); 547 } while ((bh = next_bh) != NULL); 548 549 mempool_free(ioend, xfs_ioend_pool); 550 } while ((ioend = next) != NULL); 551 } 552 553 /* 554 * Test to see if we've been building up a completion structure for 555 * earlier buffers -- if so, we try to append to this ioend if we 556 * can, otherwise we finish off any current ioend and start another. 557 * Return true if we've finished the given ioend. 
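 * The current ioend is passed in and out via @result; when a new ioend is
 * needed it is chained onto the previous one through io_list rather than
 * returned, and the buffer is always accounted into io_size.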
558 */ 559 STATIC void 560 xfs_add_to_ioend( 561 struct inode *inode, 562 struct buffer_head *bh, 563 xfs_off_t offset, 564 unsigned int type, 565 xfs_ioend_t **result, 566 int need_ioend) 567 { 568 xfs_ioend_t *ioend = *result; 569 570 if (!ioend || need_ioend || type != ioend->io_type) { 571 xfs_ioend_t *previous = *result; 572 573 ioend = xfs_alloc_ioend(inode, type); 574 ioend->io_offset = offset; 575 ioend->io_buffer_head = bh; 576 ioend->io_buffer_tail = bh; 577 if (previous) 578 previous->io_list = ioend; 579 *result = ioend; 580 } else { 581 ioend->io_buffer_tail->b_private = bh; 582 ioend->io_buffer_tail = bh; 583 } 584 585 bh->b_private = NULL; 586 ioend->io_size += bh->b_size; 587 } 588 589 STATIC void 590 xfs_map_buffer( 591 struct inode *inode, 592 struct buffer_head *bh, 593 struct xfs_bmbt_irec *imap, 594 xfs_off_t offset) 595 { 596 sector_t bn; 597 struct xfs_mount *m = XFS_I(inode)->i_mount; 598 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); 599 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); 600 601 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 602 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 603 604 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + 605 ((offset - iomap_offset) >> inode->i_blkbits); 606 607 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); 608 609 bh->b_blocknr = bn; 610 set_buffer_mapped(bh); 611 } 612 613 STATIC void 614 xfs_map_at_offset( 615 struct inode *inode, 616 struct buffer_head *bh, 617 struct xfs_bmbt_irec *imap, 618 xfs_off_t offset) 619 { 620 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 621 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 622 623 xfs_map_buffer(inode, bh, imap, offset); 624 set_buffer_mapped(bh); 625 clear_buffer_delay(bh); 626 clear_buffer_unwritten(bh); 627 } 628 629 /* 630 * Test if a given page contains at least one buffer of a given @type. 631 * If @check_all_buffers is true, then we walk all the buffers in the page to 632 * try to find one of the type passed in. If it is not set, then the caller only 633 * needs to check the first buffer on the page for a match. 634 */ 635 STATIC bool 636 xfs_check_page_type( 637 struct page *page, 638 unsigned int type, 639 bool check_all_buffers) 640 { 641 struct buffer_head *bh; 642 struct buffer_head *head; 643 644 if (PageWriteback(page)) 645 return false; 646 if (!page->mapping) 647 return false; 648 if (!page_has_buffers(page)) 649 return false; 650 651 bh = head = page_buffers(page); 652 do { 653 if (buffer_unwritten(bh)) { 654 if (type == XFS_IO_UNWRITTEN) 655 return true; 656 } else if (buffer_delay(bh)) { 657 if (type == XFS_IO_DELALLOC) 658 return true; 659 } else if (buffer_dirty(bh) && buffer_mapped(bh)) { 660 if (type == XFS_IO_OVERWRITE) 661 return true; 662 } 663 664 /* If we are only checking the first buffer, we are done now. */ 665 if (!check_all_buffers) 666 break; 667 } while ((bh = bh->b_this_page) != head); 668 669 return false; 670 } 671 672 /* 673 * Allocate & map buffers for page given the extent map. Write it out. 674 * except for the original page of a writepage, this is called on 675 * delalloc/unwritten pages only, for the original page it is possible 676 * that the page has no mapping at all. 
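 * Returns nonzero when clustering should stop - either because this page
 * could not be handled here or because a non-blocking writeback has used up
 * its nr_to_write budget - which terminates the loop in xfs_cluster_write().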
677 */ 678 STATIC int 679 xfs_convert_page( 680 struct inode *inode, 681 struct page *page, 682 loff_t tindex, 683 struct xfs_bmbt_irec *imap, 684 xfs_ioend_t **ioendp, 685 struct writeback_control *wbc) 686 { 687 struct buffer_head *bh, *head; 688 xfs_off_t end_offset; 689 unsigned long p_offset; 690 unsigned int type; 691 int len, page_dirty; 692 int count = 0, done = 0, uptodate = 1; 693 xfs_off_t offset = page_offset(page); 694 695 if (page->index != tindex) 696 goto fail; 697 if (!trylock_page(page)) 698 goto fail; 699 if (PageWriteback(page)) 700 goto fail_unlock_page; 701 if (page->mapping != inode->i_mapping) 702 goto fail_unlock_page; 703 if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) 704 goto fail_unlock_page; 705 706 /* 707 * page_dirty is initially a count of buffers on the page before 708 * EOF and is decremented as we move each into a cleanable state. 709 * 710 * Derivation: 711 * 712 * End offset is the highest offset that this page should represent. 713 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) 714 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and 715 * hence give us the correct page_dirty count. On any other page, 716 * it will be zero and in that case we need page_dirty to be the 717 * count of buffers on the page. 718 */ 719 end_offset = min_t(unsigned long long, 720 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, 721 i_size_read(inode)); 722 723 /* 724 * If the current map does not span the entire page we are about to try 725 * to write, then give up. The only way we can write a page that spans 726 * multiple mappings in a single writeback iteration is via the 727 * xfs_vm_writepage() function. Data integrity writeback requires the 728 * entire page to be written in a single attempt, otherwise the part of 729 * the page we don't write here doesn't get written as part of the data 730 * integrity sync. 731 * 732 * For normal writeback, we also don't attempt to write partial pages 733 * here as it simply means that write_cache_pages() will see it under 734 * writeback and ignore the page until some point in the future, at 735 * which time this will be the only page in the file that needs 736 * writeback. Hence for more optimal IO patterns, we should always 737 * avoid partial page writeback due to multiple mappings on a page here. 738 */ 739 if (!xfs_imap_valid(inode, imap, end_offset)) 740 goto fail_unlock_page; 741 742 len = 1 << inode->i_blkbits; 743 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 744 PAGE_CACHE_SIZE); 745 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 746 page_dirty = p_offset / len; 747 748 /* 749 * The moment we find a buffer that doesn't match our current type 750 * specification or can't be written, abort the loop and start 751 * writeback. As per the above xfs_imap_valid() check, only 752 * xfs_vm_writepage() can handle partial page writeback fully - we are 753 * limited here to the buffers that are contiguous with the current 754 * ioend, and hence a buffer we can't write breaks that contiguity and 755 * we have to defer the rest of the IO to xfs_vm_writepage(). 
756 */ 757 bh = head = page_buffers(page); 758 do { 759 if (offset >= end_offset) 760 break; 761 if (!buffer_uptodate(bh)) 762 uptodate = 0; 763 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 764 done = 1; 765 break; 766 } 767 768 if (buffer_unwritten(bh) || buffer_delay(bh) || 769 buffer_mapped(bh)) { 770 if (buffer_unwritten(bh)) 771 type = XFS_IO_UNWRITTEN; 772 else if (buffer_delay(bh)) 773 type = XFS_IO_DELALLOC; 774 else 775 type = XFS_IO_OVERWRITE; 776 777 /* 778 * imap should always be valid because of the above 779 * partial page end_offset check on the imap. 780 */ 781 ASSERT(xfs_imap_valid(inode, imap, offset)); 782 783 lock_buffer(bh); 784 if (type != XFS_IO_OVERWRITE) 785 xfs_map_at_offset(inode, bh, imap, offset); 786 xfs_add_to_ioend(inode, bh, offset, type, 787 ioendp, done); 788 789 page_dirty--; 790 count++; 791 } else { 792 done = 1; 793 break; 794 } 795 } while (offset += len, (bh = bh->b_this_page) != head); 796 797 if (uptodate && bh == head) 798 SetPageUptodate(page); 799 800 if (count) { 801 if (--wbc->nr_to_write <= 0 && 802 wbc->sync_mode == WB_SYNC_NONE) 803 done = 1; 804 } 805 xfs_start_page_writeback(page, !page_dirty, count); 806 807 return done; 808 fail_unlock_page: 809 unlock_page(page); 810 fail: 811 return 1; 812 } 813 814 /* 815 * Convert & write out a cluster of pages in the same extent as defined 816 * by mp and following the start page. 817 */ 818 STATIC void 819 xfs_cluster_write( 820 struct inode *inode, 821 pgoff_t tindex, 822 struct xfs_bmbt_irec *imap, 823 xfs_ioend_t **ioendp, 824 struct writeback_control *wbc, 825 pgoff_t tlast) 826 { 827 struct pagevec pvec; 828 int done = 0, i; 829 830 pagevec_init(&pvec, 0); 831 while (!done && tindex <= tlast) { 832 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); 833 834 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) 835 break; 836 837 for (i = 0; i < pagevec_count(&pvec); i++) { 838 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 839 imap, ioendp, wbc); 840 if (done) 841 break; 842 } 843 844 pagevec_release(&pvec); 845 cond_resched(); 846 } 847 } 848 849 STATIC void 850 xfs_vm_invalidatepage( 851 struct page *page, 852 unsigned int offset, 853 unsigned int length) 854 { 855 trace_xfs_invalidatepage(page->mapping->host, page, offset, 856 length); 857 block_invalidatepage(page, offset, length); 858 } 859 860 /* 861 * If the page has delalloc buffers on it, we need to punch them out before we 862 * invalidate the page. If we don't, we leave a stale delalloc mapping on the 863 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read 864 * is done on that same region - the delalloc extent is returned when none is 865 * supposed to be there. 866 * 867 * We prevent this by truncating away the delalloc regions on the page before 868 * invalidating it. Because they are delalloc, we can do this without needing a 869 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this 870 * truncation without a transaction as there is no space left for block 871 * reservation (typically why we see a ENOSPC in writeback). 872 * 873 * This is not a performance critical path, so for now just do the punching a 874 * buffer head at a time. 
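 * Each delalloc buffer covers a single filesystem block here, so the loop
 * below punches one block at a time via xfs_bmap_punch_delalloc_range()
 * while walking the buffers under XFS_ILOCK_EXCL.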
875 */ 876 STATIC void 877 xfs_aops_discard_page( 878 struct page *page) 879 { 880 struct inode *inode = page->mapping->host; 881 struct xfs_inode *ip = XFS_I(inode); 882 struct buffer_head *bh, *head; 883 loff_t offset = page_offset(page); 884 885 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) 886 goto out_invalidate; 887 888 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 889 goto out_invalidate; 890 891 xfs_alert(ip->i_mount, 892 "page discard on page %p, inode 0x%llx, offset %llu.", 893 page, ip->i_ino, offset); 894 895 xfs_ilock(ip, XFS_ILOCK_EXCL); 896 bh = head = page_buffers(page); 897 do { 898 int error; 899 xfs_fileoff_t start_fsb; 900 901 if (!buffer_delay(bh)) 902 goto next_buffer; 903 904 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 905 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); 906 if (error) { 907 /* something screwed, just bail */ 908 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 909 xfs_alert(ip->i_mount, 910 "page discard unable to remove delalloc mapping."); 911 } 912 break; 913 } 914 next_buffer: 915 offset += 1 << inode->i_blkbits; 916 917 } while ((bh = bh->b_this_page) != head); 918 919 xfs_iunlock(ip, XFS_ILOCK_EXCL); 920 out_invalidate: 921 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); 922 return; 923 } 924 925 /* 926 * Write out a dirty page. 927 * 928 * For delalloc space on the page we need to allocate space and flush it. 929 * For unwritten space on the page we need to start the conversion to 930 * regular allocated space. 931 * For any other dirty buffer heads on the page we should flush them. 932 */ 933 STATIC int 934 xfs_vm_writepage( 935 struct page *page, 936 struct writeback_control *wbc) 937 { 938 struct inode *inode = page->mapping->host; 939 struct buffer_head *bh, *head; 940 struct xfs_bmbt_irec imap; 941 xfs_ioend_t *ioend = NULL, *iohead = NULL; 942 loff_t offset; 943 unsigned int type; 944 __uint64_t end_offset; 945 pgoff_t end_index, last_index; 946 ssize_t len; 947 int err, imap_valid = 0, uptodate = 1; 948 int count = 0; 949 int nonblocking = 0; 950 951 trace_xfs_writepage(inode, page, 0, 0); 952 953 ASSERT(page_has_buffers(page)); 954 955 /* 956 * Refuse to write the page out if we are called from reclaim context. 957 * 958 * This avoids stack overflows when called from deeply used stacks in 959 * random callers for direct reclaim or memcg reclaim. We explicitly 960 * allow reclaim from kswapd as the stack usage there is relatively low. 961 * 962 * This should never happen except in the case of a VM regression so 963 * warn about it. 964 */ 965 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == 966 PF_MEMALLOC)) 967 goto redirty; 968 969 /* 970 * Given that we do not allow direct reclaim to call us, we should 971 * never be called while in a filesystem transaction. 972 */ 973 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 974 goto redirty; 975 976 /* Is this page beyond the end of the file? */ 977 offset = i_size_read(inode); 978 end_index = offset >> PAGE_CACHE_SHIFT; 979 last_index = (offset - 1) >> PAGE_CACHE_SHIFT; 980 981 /* 982 * The page index is less than the end_index, adjust the end_offset 983 * to the highest offset that this page should represent. 984 * ----------------------------------------------------- 985 * | file mapping | <EOF> | 986 * ----------------------------------------------------- 987 * | Page ... 
| Page N-2 | Page N-1 | Page N | | 988 * ^--------------------------------^----------|-------- 989 * | desired writeback range | see else | 990 * ---------------------------------^------------------| 991 */ 992 if (page->index < end_index) 993 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 994 else { 995 /* 996 * Check whether the page to write out is beyond or straddles 997 * i_size or not. 998 * ------------------------------------------------------- 999 * | file mapping | <EOF> | 1000 * ------------------------------------------------------- 1001 * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | 1002 * ^--------------------------------^-----------|--------- 1003 * | | Straddles | 1004 * ---------------------------------^-----------|--------| 1005 */ 1006 unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); 1007 1008 /* 1009 * Skip the page if it is fully outside i_size, e.g. due to a 1010 * truncate operation that is in progress. We must redirty the 1011 * page so that reclaim stops reclaiming it. Otherwise 1012 * xfs_vm_releasepage() is called on it and gets confused. 1013 * 1014 * Note that the end_index is unsigned long, it would overflow 1015 * if the given offset is greater than 16TB on 32-bit system 1016 * and if we do check the page is fully outside i_size or not 1017 * via "if (page->index >= end_index + 1)" as "end_index + 1" 1018 * will be evaluated to 0. Hence this page will be redirtied 1019 * and be written out repeatedly which would result in an 1020 * infinite loop, the user program that perform this operation 1021 * will hang. Instead, we can verify this situation by checking 1022 * if the page to write is totally beyond the i_size or if it's 1023 * offset is just equal to the EOF. 1024 */ 1025 if (page->index > end_index || 1026 (page->index == end_index && offset_into_page == 0)) 1027 goto redirty; 1028 1029 /* 1030 * The page straddles i_size. It must be zeroed out on each 1031 * and every writepage invocation because it may be mmapped. 1032 * "A file is mapped in multiples of the page size. For a file 1033 * that is not a multiple of the page size, the remaining 1034 * memory is zeroed when mapped, and writes to that region are 1035 * not written out to the file." 1036 */ 1037 zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); 1038 1039 /* Adjust the end_offset to the end of file */ 1040 end_offset = offset; 1041 } 1042 1043 len = 1 << inode->i_blkbits; 1044 1045 bh = head = page_buffers(page); 1046 offset = page_offset(page); 1047 type = XFS_IO_OVERWRITE; 1048 1049 if (wbc->sync_mode == WB_SYNC_NONE) 1050 nonblocking = 1; 1051 1052 do { 1053 int new_ioend = 0; 1054 1055 if (offset >= end_offset) 1056 break; 1057 if (!buffer_uptodate(bh)) 1058 uptodate = 0; 1059 1060 /* 1061 * set_page_dirty dirties all buffers in a page, independent 1062 * of their state. The dirty state however is entirely 1063 * meaningless for holes (!mapped && uptodate), so skip 1064 * buffers covering holes here. 
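 * Skipping a hole buffer also invalidates the cached mapping, so the next
 * writeable buffer triggers a fresh block lookup and starts a new ioend.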
1065 */ 1066 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 1067 imap_valid = 0; 1068 continue; 1069 } 1070 1071 if (buffer_unwritten(bh)) { 1072 if (type != XFS_IO_UNWRITTEN) { 1073 type = XFS_IO_UNWRITTEN; 1074 imap_valid = 0; 1075 } 1076 } else if (buffer_delay(bh)) { 1077 if (type != XFS_IO_DELALLOC) { 1078 type = XFS_IO_DELALLOC; 1079 imap_valid = 0; 1080 } 1081 } else if (buffer_uptodate(bh)) { 1082 if (type != XFS_IO_OVERWRITE) { 1083 type = XFS_IO_OVERWRITE; 1084 imap_valid = 0; 1085 } 1086 } else { 1087 if (PageUptodate(page)) 1088 ASSERT(buffer_mapped(bh)); 1089 /* 1090 * This buffer is not uptodate and will not be 1091 * written to disk. Ensure that we will put any 1092 * subsequent writeable buffers into a new 1093 * ioend. 1094 */ 1095 imap_valid = 0; 1096 continue; 1097 } 1098 1099 if (imap_valid) 1100 imap_valid = xfs_imap_valid(inode, &imap, offset); 1101 if (!imap_valid) { 1102 /* 1103 * If we didn't have a valid mapping then we need to 1104 * put the new mapping into a separate ioend structure. 1105 * This ensures non-contiguous extents always have 1106 * separate ioends, which is particularly important 1107 * for unwritten extent conversion at I/O completion 1108 * time. 1109 */ 1110 new_ioend = 1; 1111 err = xfs_map_blocks(inode, offset, &imap, type, 1112 nonblocking); 1113 if (err) 1114 goto error; 1115 imap_valid = xfs_imap_valid(inode, &imap, offset); 1116 } 1117 if (imap_valid) { 1118 lock_buffer(bh); 1119 if (type != XFS_IO_OVERWRITE) 1120 xfs_map_at_offset(inode, bh, &imap, offset); 1121 xfs_add_to_ioend(inode, bh, offset, type, &ioend, 1122 new_ioend); 1123 count++; 1124 } 1125 1126 if (!iohead) 1127 iohead = ioend; 1128 1129 } while (offset += len, ((bh = bh->b_this_page) != head)); 1130 1131 if (uptodate && bh == head) 1132 SetPageUptodate(page); 1133 1134 xfs_start_page_writeback(page, 1, count); 1135 1136 /* if there is no IO to be submitted for this page, we are done */ 1137 if (!ioend) 1138 return 0; 1139 1140 ASSERT(iohead); 1141 1142 /* 1143 * Any errors from this point onwards need tobe reported through the IO 1144 * completion path as we have marked the initial page as under writeback 1145 * and unlocked it. 1146 */ 1147 if (imap_valid) { 1148 xfs_off_t end_index; 1149 1150 end_index = imap.br_startoff + imap.br_blockcount; 1151 1152 /* to bytes */ 1153 end_index <<= inode->i_blkbits; 1154 1155 /* to pages */ 1156 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; 1157 1158 /* check against file size */ 1159 if (end_index > last_index) 1160 end_index = last_index; 1161 1162 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1163 wbc, end_index); 1164 } 1165 1166 1167 /* 1168 * Reserve log space if we might write beyond the on-disk inode size. 
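 * Unwritten extent conversion runs its own transaction at completion time
 * (xfs_iomap_write_unwritten()) and can update the on-disk size there, so
 * it is excluded from this reservation.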
1169 */ 1170 err = 0; 1171 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) 1172 err = xfs_setfilesize_trans_alloc(ioend); 1173 1174 xfs_submit_ioend(wbc, iohead, err); 1175 1176 return 0; 1177 1178 error: 1179 if (iohead) 1180 xfs_cancel_ioend(iohead); 1181 1182 if (err == -EAGAIN) 1183 goto redirty; 1184 1185 xfs_aops_discard_page(page); 1186 ClearPageUptodate(page); 1187 unlock_page(page); 1188 return err; 1189 1190 redirty: 1191 redirty_page_for_writepage(wbc, page); 1192 unlock_page(page); 1193 return 0; 1194 } 1195 1196 STATIC int 1197 xfs_vm_writepages( 1198 struct address_space *mapping, 1199 struct writeback_control *wbc) 1200 { 1201 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1202 return generic_writepages(mapping, wbc); 1203 } 1204 1205 /* 1206 * Called to move a page into cleanable state - and from there 1207 * to be released. The page should already be clean. We always 1208 * have buffer heads in this call. 1209 * 1210 * Returns 1 if the page is ok to release, 0 otherwise. 1211 */ 1212 STATIC int 1213 xfs_vm_releasepage( 1214 struct page *page, 1215 gfp_t gfp_mask) 1216 { 1217 int delalloc, unwritten; 1218 1219 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1220 1221 xfs_count_page_state(page, &delalloc, &unwritten); 1222 1223 if (WARN_ON_ONCE(delalloc)) 1224 return 0; 1225 if (WARN_ON_ONCE(unwritten)) 1226 return 0; 1227 1228 return try_to_free_buffers(page); 1229 } 1230 1231 /* 1232 * When we map a DIO buffer, we may need to attach an ioend that describes the 1233 * type of write IO we are doing. This passes to the completion function the 1234 * operations it needs to perform. If the mapping is for an overwrite wholly 1235 * within the EOF then we don't need an ioend and so we don't allocate one. 1236 * This avoids the unnecessary overhead of allocating and freeing ioends for 1237 * workloads that don't require transactions on IO completion. 1238 * 1239 * If we get multiple mappings in a single IO, we might be mapping different 1240 * types. But because the direct IO can only have a single private pointer, we 1241 * need to ensure that: 1242 * 1243 * a) i) the ioend spans the entire region of unwritten mappings; or 1244 * ii) the ioend spans all the mappings that cross or are beyond EOF; and 1245 * b) if it contains unwritten extents, it is *permanently* marked as such 1246 * 1247 * We could do this by chaining ioends like buffered IO does, but we only 1248 * actually get one IO completion callback from the direct IO, and that spans 1249 * the entire IO regardless of how many mappings and IOs are needed to complete 1250 * the DIO. There is only going to be one reference to the ioend and its life 1251 * cycle is constrained by the DIO completion code. hence we don't need 1252 * reference counting here. 
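 * Both rules fall out of the code below: an ioend already attached to
 * bh_result->b_private is only ever extended to cover the new mapping, and
 * once io_type has been set to XFS_IO_UNWRITTEN it is never cleared.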
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	struct xfs_ioend	*ioend;
	xfs_off_t		size = bh_result->b_size;
	int			type;

	if (ISUNWRITTEN(imap))
		type = XFS_IO_UNWRITTEN;
	else
		type = XFS_IO_OVERWRITE;

	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);

	if (bh_result->b_private) {
		ioend = bh_result->b_private;
		ASSERT(ioend->io_size > 0);
		ASSERT(offset >= ioend->io_offset);
		if (offset + size > ioend->io_offset + ioend->io_size)
			ioend->io_size = offset - ioend->io_offset + size;

		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
			ioend->io_type = XFS_IO_UNWRITTEN;

		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
					      ioend->io_size, ioend->io_type,
					      imap);
	} else if (type == XFS_IO_UNWRITTEN ||
		   offset + size > i_size_read(inode)) {
		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_size = size;

		bh_result->b_private = ioend;
		set_buffer_defer_completion(bh_result);

		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
					   imap);
	} else {
		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
					    imap);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
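 * xfs_map_trim_size() below implements the EOF half of this: a mapping that
 * crosses EOF is cut back to end at the block that contains EOF, and the
 * caller comes back later for the purely post-EOF range.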
1312 */ 1313 static void 1314 xfs_map_trim_size( 1315 struct inode *inode, 1316 sector_t iblock, 1317 struct buffer_head *bh_result, 1318 struct xfs_bmbt_irec *imap, 1319 xfs_off_t offset, 1320 ssize_t size) 1321 { 1322 xfs_off_t mapping_size; 1323 1324 mapping_size = imap->br_startoff + imap->br_blockcount - iblock; 1325 mapping_size <<= inode->i_blkbits; 1326 1327 ASSERT(mapping_size > 0); 1328 if (mapping_size > size) 1329 mapping_size = size; 1330 if (offset < i_size_read(inode) && 1331 offset + mapping_size >= i_size_read(inode)) { 1332 /* limit mapping to block that spans EOF */ 1333 mapping_size = roundup_64(i_size_read(inode) - offset, 1334 1 << inode->i_blkbits); 1335 } 1336 if (mapping_size > LONG_MAX) 1337 mapping_size = LONG_MAX; 1338 1339 bh_result->b_size = mapping_size; 1340 } 1341 1342 STATIC int 1343 __xfs_get_blocks( 1344 struct inode *inode, 1345 sector_t iblock, 1346 struct buffer_head *bh_result, 1347 int create, 1348 bool direct) 1349 { 1350 struct xfs_inode *ip = XFS_I(inode); 1351 struct xfs_mount *mp = ip->i_mount; 1352 xfs_fileoff_t offset_fsb, end_fsb; 1353 int error = 0; 1354 int lockmode = 0; 1355 struct xfs_bmbt_irec imap; 1356 int nimaps = 1; 1357 xfs_off_t offset; 1358 ssize_t size; 1359 int new = 0; 1360 1361 if (XFS_FORCED_SHUTDOWN(mp)) 1362 return -EIO; 1363 1364 offset = (xfs_off_t)iblock << inode->i_blkbits; 1365 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1366 size = bh_result->b_size; 1367 1368 if (!create && direct && offset >= i_size_read(inode)) 1369 return 0; 1370 1371 /* 1372 * Direct I/O is usually done on preallocated files, so try getting 1373 * a block mapping without an exclusive lock first. For buffered 1374 * writes we already have the exclusive iolock anyway, so avoiding 1375 * a lock roundtrip here by taking the ilock exclusive from the 1376 * beginning is a useful micro optimization. 1377 */ 1378 if (create && !direct) { 1379 lockmode = XFS_ILOCK_EXCL; 1380 xfs_ilock(ip, lockmode); 1381 } else { 1382 lockmode = xfs_ilock_data_map_shared(ip); 1383 } 1384 1385 ASSERT(offset <= mp->m_super->s_maxbytes); 1386 if (offset + size > mp->m_super->s_maxbytes) 1387 size = mp->m_super->s_maxbytes - offset; 1388 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1389 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1390 1391 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 1392 &imap, &nimaps, XFS_BMAPI_ENTIRE); 1393 if (error) 1394 goto out_unlock; 1395 1396 if (create && 1397 (!nimaps || 1398 (imap.br_startblock == HOLESTARTBLOCK || 1399 imap.br_startblock == DELAYSTARTBLOCK))) { 1400 if (direct || xfs_get_extsz_hint(ip)) { 1401 /* 1402 * Drop the ilock in preparation for starting the block 1403 * allocation transaction. It will be retaken 1404 * exclusively inside xfs_iomap_write_direct for the 1405 * actual allocation. 1406 */ 1407 xfs_iunlock(ip, lockmode); 1408 error = xfs_iomap_write_direct(ip, offset, size, 1409 &imap, nimaps); 1410 if (error) 1411 return error; 1412 new = 1; 1413 1414 } else { 1415 /* 1416 * Delalloc reservations do not require a transaction, 1417 * we can go on without dropping the lock here. If we 1418 * are allocating a new delalloc block, make sure that 1419 * we set the new flag so that we mark the buffer new so 1420 * that we know that it is newly allocated if the write 1421 * fails. 
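 * xfs_vm_write_failed() relies on exactly this: it only punches out delalloc
 * buffers that are either marked new or lie at or beyond EOF.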
1422 */ 1423 if (nimaps && imap.br_startblock == HOLESTARTBLOCK) 1424 new = 1; 1425 error = xfs_iomap_write_delay(ip, offset, size, &imap); 1426 if (error) 1427 goto out_unlock; 1428 1429 xfs_iunlock(ip, lockmode); 1430 } 1431 trace_xfs_get_blocks_alloc(ip, offset, size, 1432 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1433 : XFS_IO_DELALLOC, &imap); 1434 } else if (nimaps) { 1435 trace_xfs_get_blocks_found(ip, offset, size, 1436 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1437 : XFS_IO_OVERWRITE, &imap); 1438 xfs_iunlock(ip, lockmode); 1439 } else { 1440 trace_xfs_get_blocks_notfound(ip, offset, size); 1441 goto out_unlock; 1442 } 1443 1444 /* trim mapping down to size requested */ 1445 if (direct || size > (1 << inode->i_blkbits)) 1446 xfs_map_trim_size(inode, iblock, bh_result, 1447 &imap, offset, size); 1448 1449 /* 1450 * For unwritten extents do not report a disk address in the buffered 1451 * read case (treat as if we're reading into a hole). 1452 */ 1453 if (imap.br_startblock != HOLESTARTBLOCK && 1454 imap.br_startblock != DELAYSTARTBLOCK && 1455 (create || !ISUNWRITTEN(&imap))) { 1456 xfs_map_buffer(inode, bh_result, &imap, offset); 1457 if (ISUNWRITTEN(&imap)) 1458 set_buffer_unwritten(bh_result); 1459 /* direct IO needs special help */ 1460 if (create && direct) 1461 xfs_map_direct(inode, bh_result, &imap, offset); 1462 } 1463 1464 /* 1465 * If this is a realtime file, data may be on a different device. 1466 * to that pointed to from the buffer_head b_bdev currently. 1467 */ 1468 bh_result->b_bdev = xfs_find_bdev_for_inode(inode); 1469 1470 /* 1471 * If we previously allocated a block out beyond eof and we are now 1472 * coming back to use it then we will need to flag it as new even if it 1473 * has a disk address. 1474 * 1475 * With sub-block writes into unwritten extents we also need to mark 1476 * the buffer as new so that the unwritten parts of the buffer gets 1477 * correctly zeroed. 1478 */ 1479 if (create && 1480 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1481 (offset >= i_size_read(inode)) || 1482 (new || ISUNWRITTEN(&imap)))) 1483 set_buffer_new(bh_result); 1484 1485 if (imap.br_startblock == DELAYSTARTBLOCK) { 1486 BUG_ON(direct); 1487 if (create) { 1488 set_buffer_uptodate(bh_result); 1489 set_buffer_mapped(bh_result); 1490 set_buffer_delay(bh_result); 1491 } 1492 } 1493 1494 return 0; 1495 1496 out_unlock: 1497 xfs_iunlock(ip, lockmode); 1498 return error; 1499 } 1500 1501 int 1502 xfs_get_blocks( 1503 struct inode *inode, 1504 sector_t iblock, 1505 struct buffer_head *bh_result, 1506 int create) 1507 { 1508 return __xfs_get_blocks(inode, iblock, bh_result, create, false); 1509 } 1510 1511 int 1512 xfs_get_blocks_direct( 1513 struct inode *inode, 1514 sector_t iblock, 1515 struct buffer_head *bh_result, 1516 int create) 1517 { 1518 return __xfs_get_blocks(inode, iblock, bh_result, create, true); 1519 } 1520 1521 static void 1522 __xfs_end_io_direct_write( 1523 struct inode *inode, 1524 struct xfs_ioend *ioend, 1525 loff_t offset, 1526 ssize_t size) 1527 { 1528 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1529 1530 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1531 goto out_end_io; 1532 1533 /* 1534 * dio completion end_io functions are only called on writes if more 1535 * than 0 bytes was written. 1536 */ 1537 ASSERT(size > 0); 1538 1539 /* 1540 * The ioend only maps whole blocks, while the IO may be sector aligned. 1541 * Hence the ioend offset/size may not match the IO offset/size exactly. 
1542 * Because we don't map overwrites within EOF into the ioend, the offset 1543 * may not match, but only if the endio spans EOF. Either way, write 1544 * the IO sizes into the ioend so that completion processing does the 1545 * right thing. 1546 */ 1547 ASSERT(offset + size <= ioend->io_offset + ioend->io_size); 1548 ioend->io_size = size; 1549 ioend->io_offset = offset; 1550 1551 /* 1552 * The ioend tells us whether we are doing unwritten extent conversion 1553 * or an append transaction that updates the on-disk file size. These 1554 * cases are the only cases where we should *potentially* be needing 1555 * to update the VFS inode size. 1556 * 1557 * We need to update the in-core inode size here so that we don't end up 1558 * with the on-disk inode size being outside the in-core inode size. We 1559 * have no other method of updating EOF for AIO, so always do it here 1560 * if necessary. 1561 * 1562 * We need to lock the test/set EOF update as we can be racing with 1563 * other IO completions here to update the EOF. Failing to serialise 1564 * here can result in EOF moving backwards and Bad Things Happen when 1565 * that occurs. 1566 */ 1567 spin_lock(&XFS_I(inode)->i_flags_lock); 1568 if (offset + size > i_size_read(inode)) 1569 i_size_write(inode, offset + size); 1570 spin_unlock(&XFS_I(inode)->i_flags_lock); 1571 1572 /* 1573 * If we are doing an append IO that needs to update the EOF on disk, 1574 * do the transaction reserve now so we can use common end io 1575 * processing. Stashing the error (if there is one) in the ioend will 1576 * result in the ioend processing passing on the error if it is 1577 * possible as we can't return it from here. 1578 */ 1579 if (ioend->io_type == XFS_IO_OVERWRITE) 1580 ioend->io_error = xfs_setfilesize_trans_alloc(ioend); 1581 1582 out_end_io: 1583 xfs_end_io(&ioend->io_work); 1584 return; 1585 } 1586 1587 /* 1588 * Complete a direct I/O write request. 1589 * 1590 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1591 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1592 * wholly within the EOF and so there is nothing for us to do. Note that in this 1593 * case the completion can be called in interrupt context, whereas if we have an 1594 * ioend we will always be called in task context (i.e. from a workqueue). 1595 */ 1596 STATIC void 1597 xfs_end_io_direct_write( 1598 struct kiocb *iocb, 1599 loff_t offset, 1600 ssize_t size, 1601 void *private) 1602 { 1603 struct inode *inode = file_inode(iocb->ki_filp); 1604 struct xfs_ioend *ioend = private; 1605 1606 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1607 ioend ? ioend->io_type : 0, NULL); 1608 1609 if (!ioend) { 1610 ASSERT(offset + size <= i_size_read(inode)); 1611 return; 1612 } 1613 1614 __xfs_end_io_direct_write(inode, ioend, offset, size); 1615 } 1616 1617 /* 1618 * For DAX we need a mapping buffer callback for unwritten extent conversion 1619 * when page faults allocate blocks and then zero them. Note that in this 1620 * case the mapping indicated by the ioend may extend beyond EOF. We most 1621 * definitely do not want to extend EOF here, so we trim back the ioend size to 1622 * EOF. 
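 * The trim is done under the inode's i_flags_lock so it cannot race with a
 * concurrent i_size update from another completion.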
1623 */ 1624 #ifdef CONFIG_FS_DAX 1625 void 1626 xfs_end_io_dax_write( 1627 struct buffer_head *bh, 1628 int uptodate) 1629 { 1630 struct xfs_ioend *ioend = bh->b_private; 1631 struct inode *inode = ioend->io_inode; 1632 ssize_t size = ioend->io_size; 1633 1634 ASSERT(IS_DAX(ioend->io_inode)); 1635 1636 /* if there was an error zeroing, then don't convert it */ 1637 if (!uptodate) 1638 ioend->io_error = -EIO; 1639 1640 /* 1641 * Trim update to EOF, so we don't extend EOF during unwritten extent 1642 * conversion of partial EOF blocks. 1643 */ 1644 spin_lock(&XFS_I(inode)->i_flags_lock); 1645 if (ioend->io_offset + size > i_size_read(inode)) 1646 size = i_size_read(inode) - ioend->io_offset; 1647 spin_unlock(&XFS_I(inode)->i_flags_lock); 1648 1649 __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); 1650 1651 } 1652 #else 1653 void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } 1654 #endif 1655 1656 static inline ssize_t 1657 xfs_vm_do_dio( 1658 struct inode *inode, 1659 struct kiocb *iocb, 1660 struct iov_iter *iter, 1661 loff_t offset, 1662 void (*endio)(struct kiocb *iocb, 1663 loff_t offset, 1664 ssize_t size, 1665 void *private), 1666 int flags) 1667 { 1668 struct block_device *bdev; 1669 1670 if (IS_DAX(inode)) 1671 return dax_do_io(iocb, inode, iter, offset, 1672 xfs_get_blocks_direct, endio, 0); 1673 1674 bdev = xfs_find_bdev_for_inode(inode); 1675 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1676 xfs_get_blocks_direct, endio, NULL, flags); 1677 } 1678 1679 STATIC ssize_t 1680 xfs_vm_direct_IO( 1681 struct kiocb *iocb, 1682 struct iov_iter *iter, 1683 loff_t offset) 1684 { 1685 struct inode *inode = iocb->ki_filp->f_mapping->host; 1686 1687 if (iov_iter_rw(iter) == WRITE) 1688 return xfs_vm_do_dio(inode, iocb, iter, offset, 1689 xfs_end_io_direct_write, DIO_ASYNC_EXTEND); 1690 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); 1691 } 1692 1693 /* 1694 * Punch out the delalloc blocks we have already allocated. 1695 * 1696 * Don't bother with xfs_setattr given that nothing can have made it to disk yet 1697 * as the page is still locked at this point. 1698 */ 1699 STATIC void 1700 xfs_vm_kill_delalloc_range( 1701 struct inode *inode, 1702 loff_t start, 1703 loff_t end) 1704 { 1705 struct xfs_inode *ip = XFS_I(inode); 1706 xfs_fileoff_t start_fsb; 1707 xfs_fileoff_t end_fsb; 1708 int error; 1709 1710 start_fsb = XFS_B_TO_FSB(ip->i_mount, start); 1711 end_fsb = XFS_B_TO_FSB(ip->i_mount, end); 1712 if (end_fsb <= start_fsb) 1713 return; 1714 1715 xfs_ilock(ip, XFS_ILOCK_EXCL); 1716 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1717 end_fsb - start_fsb); 1718 if (error) { 1719 /* something screwed, just bail */ 1720 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1721 xfs_alert(ip->i_mount, 1722 "xfs_vm_write_failed: unable to clean up ino %lld", 1723 ip->i_ino); 1724 } 1725 } 1726 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1727 } 1728 1729 STATIC void 1730 xfs_vm_write_failed( 1731 struct inode *inode, 1732 struct page *page, 1733 loff_t pos, 1734 unsigned len) 1735 { 1736 loff_t block_offset; 1737 loff_t block_start; 1738 loff_t block_end; 1739 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1740 loff_t to = from + len; 1741 struct buffer_head *bh, *head; 1742 1743 /* 1744 * The request pos offset might be 32 or 64 bit, this is all fine 1745 * on 64-bit platform. 
However, for 64-bit pos request on 32-bit 1746 * platform, the high 32-bit will be masked off if we evaluate the 1747 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is 1748 * 0xfffff000 as an unsigned long, hence the result is incorrect 1749 * which could cause the following ASSERT failed in most cases. 1750 * In order to avoid this, we can evaluate the block_offset of the 1751 * start of the page by using shifts rather than masks the mismatch 1752 * problem. 1753 */ 1754 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; 1755 1756 ASSERT(block_offset + from == pos); 1757 1758 head = page_buffers(page); 1759 block_start = 0; 1760 for (bh = head; bh != head || !block_start; 1761 bh = bh->b_this_page, block_start = block_end, 1762 block_offset += bh->b_size) { 1763 block_end = block_start + bh->b_size; 1764 1765 /* skip buffers before the write */ 1766 if (block_end <= from) 1767 continue; 1768 1769 /* if the buffer is after the write, we're done */ 1770 if (block_start >= to) 1771 break; 1772 1773 if (!buffer_delay(bh)) 1774 continue; 1775 1776 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1777 continue; 1778 1779 xfs_vm_kill_delalloc_range(inode, block_offset, 1780 block_offset + bh->b_size); 1781 1782 /* 1783 * This buffer does not contain data anymore. make sure anyone 1784 * who finds it knows that for certain. 1785 */ 1786 clear_buffer_delay(bh); 1787 clear_buffer_uptodate(bh); 1788 clear_buffer_mapped(bh); 1789 clear_buffer_new(bh); 1790 clear_buffer_dirty(bh); 1791 } 1792 1793 } 1794 1795 /* 1796 * This used to call block_write_begin(), but it unlocks and releases the page 1797 * on error, and we need that page to be able to punch stale delalloc blocks out 1798 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at 1799 * the appropriate point. 1800 */ 1801 STATIC int 1802 xfs_vm_write_begin( 1803 struct file *file, 1804 struct address_space *mapping, 1805 loff_t pos, 1806 unsigned len, 1807 unsigned flags, 1808 struct page **pagep, 1809 void **fsdata) 1810 { 1811 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1812 struct page *page; 1813 int status; 1814 1815 ASSERT(len <= PAGE_CACHE_SIZE); 1816 1817 page = grab_cache_page_write_begin(mapping, index, flags); 1818 if (!page) 1819 return -ENOMEM; 1820 1821 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1822 if (unlikely(status)) { 1823 struct inode *inode = mapping->host; 1824 size_t isize = i_size_read(inode); 1825 1826 xfs_vm_write_failed(inode, page, pos, len); 1827 unlock_page(page); 1828 1829 /* 1830 * If the write is beyond EOF, we only want to kill blocks 1831 * allocated in this write, not blocks that were previously 1832 * written successfully. 1833 */ 1834 if (pos + len > isize) { 1835 ssize_t start = max_t(ssize_t, pos, isize); 1836 1837 truncate_pagecache_range(inode, start, pos + len); 1838 } 1839 1840 page_cache_release(page); 1841 page = NULL; 1842 } 1843 1844 *pagep = page; 1845 return status; 1846 } 1847 1848 /* 1849 * On failure, we only need to kill delalloc blocks beyond EOF in the range of 1850 * this specific write because they will never be written. Previous writes 1851 * beyond EOF where block allocation succeeded do not need to be trashed, so 1852 * only new blocks from this write should be trashed. For blocks within 1853 * EOF, generic_write_end() zeros them so they are safe to leave alone and be 1854 * written with all the other valid data. 
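 * A short copy shows up below as ret < len; in that case anything in
 * [pos, pos + len) beyond the old EOF is punched out again and the
 * corresponding page cache range is truncated.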
1855 */ 1856 STATIC int 1857 xfs_vm_write_end( 1858 struct file *file, 1859 struct address_space *mapping, 1860 loff_t pos, 1861 unsigned len, 1862 unsigned copied, 1863 struct page *page, 1864 void *fsdata) 1865 { 1866 int ret; 1867 1868 ASSERT(len <= PAGE_CACHE_SIZE); 1869 1870 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 1871 if (unlikely(ret < len)) { 1872 struct inode *inode = mapping->host; 1873 size_t isize = i_size_read(inode); 1874 loff_t to = pos + len; 1875 1876 if (to > isize) { 1877 /* only kill blocks in this write beyond EOF */ 1878 if (pos > isize) 1879 isize = pos; 1880 xfs_vm_kill_delalloc_range(inode, isize, to); 1881 truncate_pagecache_range(inode, isize, to); 1882 } 1883 } 1884 return ret; 1885 } 1886 1887 STATIC sector_t 1888 xfs_vm_bmap( 1889 struct address_space *mapping, 1890 sector_t block) 1891 { 1892 struct inode *inode = (struct inode *)mapping->host; 1893 struct xfs_inode *ip = XFS_I(inode); 1894 1895 trace_xfs_vm_bmap(XFS_I(inode)); 1896 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1897 filemap_write_and_wait(mapping); 1898 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1899 return generic_block_bmap(mapping, block, xfs_get_blocks); 1900 } 1901 1902 STATIC int 1903 xfs_vm_readpage( 1904 struct file *unused, 1905 struct page *page) 1906 { 1907 return mpage_readpage(page, xfs_get_blocks); 1908 } 1909 1910 STATIC int 1911 xfs_vm_readpages( 1912 struct file *unused, 1913 struct address_space *mapping, 1914 struct list_head *pages, 1915 unsigned nr_pages) 1916 { 1917 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1918 } 1919 1920 /* 1921 * This is basically a copy of __set_page_dirty_buffers() with one 1922 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them 1923 * dirty, we'll never be able to clean them because we don't write buffers 1924 * beyond EOF, and that means we can't invalidate pages that span EOF 1925 * that have been marked dirty. Further, the dirty state can leak into 1926 * the file interior if the file is extended, resulting in all sorts of 1927 * bad things happening as the state does not match the underlying data. 1928 * 1929 * XXX: this really indicates that bufferheads in XFS need to die. Warts like 1930 * this only exist because of bufferheads and how the generic code manages them. 1931 */ 1932 STATIC int 1933 xfs_vm_set_page_dirty( 1934 struct page *page) 1935 { 1936 struct address_space *mapping = page->mapping; 1937 struct inode *inode = mapping->host; 1938 loff_t end_offset; 1939 loff_t offset; 1940 int newly_dirty; 1941 struct mem_cgroup *memcg; 1942 1943 if (unlikely(!mapping)) 1944 return !TestSetPageDirty(page); 1945 1946 end_offset = i_size_read(inode); 1947 offset = page_offset(page); 1948 1949 spin_lock(&mapping->private_lock); 1950 if (page_has_buffers(page)) { 1951 struct buffer_head *head = page_buffers(page); 1952 struct buffer_head *bh = head; 1953 1954 do { 1955 if (offset < end_offset) 1956 set_buffer_dirty(bh); 1957 bh = bh->b_this_page; 1958 offset += 1 << inode->i_blkbits; 1959 } while (bh != head); 1960 } 1961 /* 1962 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1963 * per-memcg dirty page counters. 
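 * (That is, mem_cgroup_begin_page_stat(); the matching
 * mem_cgroup_end_page_stat() call is made once the radix tree tag has been
 * set and the tree_lock dropped.)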
1964 */ 1965 memcg = mem_cgroup_begin_page_stat(page); 1966 newly_dirty = !TestSetPageDirty(page); 1967 spin_unlock(&mapping->private_lock); 1968 1969 if (newly_dirty) { 1970 /* sigh - __set_page_dirty() is static, so copy it here, too */ 1971 unsigned long flags; 1972 1973 spin_lock_irqsave(&mapping->tree_lock, flags); 1974 if (page->mapping) { /* Race with truncate? */ 1975 WARN_ON_ONCE(!PageUptodate(page)); 1976 account_page_dirtied(page, mapping, memcg); 1977 radix_tree_tag_set(&mapping->page_tree, 1978 page_index(page), PAGECACHE_TAG_DIRTY); 1979 } 1980 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1981 } 1982 mem_cgroup_end_page_stat(memcg); 1983 if (newly_dirty) 1984 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1985 return newly_dirty; 1986 } 1987 1988 const struct address_space_operations xfs_address_space_operations = { 1989 .readpage = xfs_vm_readpage, 1990 .readpages = xfs_vm_readpages, 1991 .writepage = xfs_vm_writepage, 1992 .writepages = xfs_vm_writepages, 1993 .set_page_dirty = xfs_vm_set_page_dirty, 1994 .releasepage = xfs_vm_releasepage, 1995 .invalidatepage = xfs_vm_invalidatepage, 1996 .write_begin = xfs_vm_write_begin, 1997 .write_end = xfs_vm_write_end, 1998 .bmap = xfs_vm_bmap, 1999 .direct_IO = xfs_vm_direct_IO, 2000 .migratepage = buffer_migrate_page, 2001 .is_partially_uptodate = block_is_partially_uptodate, 2002 .error_remove_page = generic_error_remove_page, 2003 }; 2004
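/*
 * A minimal sketch of how this table is expected to be wired up (assuming
 * the usual VFS pattern; in XFS this happens when the VFS inode is set up):
 *
 *	inode->i_mapping->a_ops = &xfs_address_space_operations;
 *
 * after which the page cache, writeback and direct I/O paths call back into
 * the xfs_vm_* routines above.
 */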