/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory.  Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t		*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

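/*
 * Allocate and reserve a transaction up front for updating the on-disk inode
 * size once this I/O completes.  The reservation is taken in the submission
 * context and handed over to the completion worker, which is why the freeze
 * protection and PF_FSTRANS state are juggled below.
 */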
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission
	 * thread, thus we need to mark ourselves as being in a transaction
	 * manually.  Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (ioend->io_error) {
		xfs_trans_cancel(tp);
		return ioend->io_error;
	}

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int		error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 * Detecting and handling completion IO errors is done individually
	 * for each case as different cleanup operations need to be performed
	 * on error.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		if (ioend->io_error)
			goto done;
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O completion
	 * callback from happening before we have started all the I/O; the
	 * final reference is only dropped once submission is complete, so the
	 * completion routine cannot run too early.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

/*
 * Find the extent that maps @offset for writeback.  For delalloc writeback
 * this also allocates real blocks to back the delayed allocation.
 */
STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -EAGAIN;
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

/* Does @imap cover the block containing @offset? */
STATIC int
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	xfs_ioend_t		*ioend = bio->bi_private;

	if (!ioend->io_error)
		ioend->io_error = bio->bi_error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

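/*
 * Submit a single bio belonging to @ioend.  An extra io_remaining reference
 * is taken for the bio so that the ioend cannot be completed until the bio's
 * end_io handler (xfs_end_bio) has run and dropped it again.
 */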
STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

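/*
 * Mark the page as being under writeback and unlock it.  If none of the
 * page's buffers were added to an ioend (@buffers == 0) there is no I/O to
 * wait for, so writeback is ended here immediately.
 */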
STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the
	 * higher layers come back to it correctly.  That means we need to
	 * keep the page dirty, and for WB_SYNC_ALL writeback we need to
	 * ensure the PAGECACHE_TAG_TOWRITE index mark is not removed so
	 * another attempt to write this page in this writeback sweep will be
	 * made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, then we can end up with a page that only has buffers
 * marked async write, and I/O completion on it can occur before we mark the
 * other buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	int			fail)
{
	xfs_ioend_t		*head = ioend;
	xfs_ioend_t		*next;
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it. This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		if (fail) {
			ioend->io_error = fail;
			xfs_finish_ioend(ioend);
			continue;
		}

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this endio.
 * Toss the endio too.  Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t		*ioend)
{
	xfs_ioend_t		*next;
	struct buffer_head	*bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			/*
			 * The unwritten flag is cleared when added to the
			 * ioend. We're not submitting for I/O so mark the
			 * buffer unwritten again for next time around.
			 */
			if (ioend->io_type == XFS_IO_UNWRITTEN)
				set_buffer_unwritten(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * The ioend we are building is returned via *result.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	unsigned int		type,
	xfs_ioend_t		**result,
	int			need_ioend)
{
	xfs_ioend_t		*ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t	*previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

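/*
 * Illustrative example for the b_blocknr calculation above (sizes assumed
 * purely for the sake of the example): with 4096 byte blocks (i_blkbits == 12)
 * and 512 byte basic blocks (BBSHIFT == 9), iomap_bn >> 3 converts the daddr
 * of the start of the mapping into filesystem block units, and
 * (offset - iomap_offset) >> 12 adds the block index of @offset within the
 * mapping.  b_blocknr therefore ends up expressed in units of b_size, which
 * is what buffer heads expect.
 */
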
STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then the caller only
 * needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

/*
 * Allocate & map buffers for a page given the extent map and write it out.
 * Except for the original page of a writepage, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC int
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	loff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head	*bh, *head;
	xfs_off_t		end_offset;
	unsigned long		p_offset;
	unsigned int		type;
	int			len, page_dirty;
	int			count = 0, done = 0, uptodate = 1;
	xfs_off_t		offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count. On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	/*
	 * If the current map does not span the entire page we are about to try
	 * to write, then give up. The only way we can write a page that spans
	 * multiple mappings in a single writeback iteration is via the
	 * xfs_vm_writepage() function. Data integrity writeback requires the
	 * entire page to be written in a single attempt, otherwise the part of
	 * the page we don't write here doesn't get written as part of the data
	 * integrity sync.
	 *
	 * For normal writeback, we also don't attempt to write partial pages
	 * here as it simply means that write_cache_pages() will see it under
	 * writeback and ignore the page until some point in the future, at
	 * which time this will be the only page in the file that needs
	 * writeback.  Hence for more optimal IO patterns, we should always
	 * avoid partial page writeback due to multiple mappings on a page here.
	 */
	if (!xfs_imap_valid(inode, imap, end_offset))
		goto fail_unlock_page;

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;

	/*
	 * The moment we find a buffer that doesn't match our current type
	 * specification or can't be written, abort the loop and start
	 * writeback. As per the above xfs_imap_valid() check, only
	 * xfs_vm_writepage() can handle partial page writeback fully - we are
	 * limited here to the buffers that are contiguous with the current
	 * ioend, and hence a buffer we can't write breaks that contiguity and
	 * we have to defer the rest of the IO to xfs_vm_writepage().
	 */
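	/*
	 * Walk the buffers on the page and add every delalloc, unwritten or
	 * mapped overwrite buffer up to end_offset to the current ioend,
	 * mapping it first where needed.  Any buffer that cannot be written
	 * terminates the walk and the remainder is left to xfs_vm_writepage().
	 */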
	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			break;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = XFS_IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = XFS_IO_DELALLOC;
			else
				type = XFS_IO_OVERWRITE;

			/*
			 * imap should always be valid because of the above
			 * partial page end_offset check on the imap.
			 */
			ASSERT(xfs_imap_valid(inode, imap, offset));

			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
			break;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by @imap and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
						imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

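/*
 * Invalidate part or all of a page.  Nothing XFS specific to do here beyond
 * tracing; the generic buffer head based implementation does the real work.
 */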
STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see a ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct inode		*inode = page->mapping->host;
	struct buffer_head	*bh, *head;
	struct xfs_bmbt_irec	imap;
	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
	loff_t			offset;
	unsigned int		type;
	__uint64_t		end_offset;
	pgoff_t			end_index, last_index;
	ssize_t			len;
	int			err, imap_valid = 0, uptodate = 1;
	int			count = 0;
	int			nonblocking = 0;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;

	/*
	 * If the page index is less than end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    | Straddles |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that end_index is an unsigned long; it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we check whether the page is fully outside i_size via
		 * "if (page->index >= end_index + 1)", as "end_index + 1"
		 * would be evaluated to 0.  This page would then be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * would hang.  Instead, we can detect this situation by
		 * checking if the page to write is totally beyond i_size or if
		 * its offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = XFS_IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
	 */
	err = 0;
	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
		err = xfs_setfilesize_trans_alloc(ioend);

	xfs_submit_ioend(wbc, iohead, err);

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

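/*
 * Write back dirty pages of the mapping.  DAX mappings carry no struct pages
 * or buffer heads, so they are handed straight to the DAX writeback helper;
 * everything else goes through generic_writepages() and xfs_vm_writepage().
 */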
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				xfs_find_bdev_for_inode(mapping->host), wbc);

	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to attach an ioend that describes the
 * type of write IO we are doing. This passes to the completion function the
 * operations it needs to perform. If the mapping is for an overwrite wholly
 * within the EOF then we don't need an ioend and so we don't allocate one.
 * This avoids the unnecessary overhead of allocating and freeing ioends for
 * workloads that don't require transactions on IO completion.
 *
 * If we get multiple mappings in a single IO, we might be mapping different
 * types. But because the direct IO can only have a single private pointer, we
 * need to ensure that:
 *
 * a) i) the ioend spans the entire region of unwritten mappings; or
 *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
 * b) if it contains unwritten extents, it is *permanently* marked as such
 *
 * We could do this by chaining ioends like buffered IO does, but we only
 * actually get one IO completion callback from the direct IO, and that spans
 * the entire IO regardless of how many mappings and IOs are needed to complete
 * the DIO. There is only going to be one reference to the ioend and its life
 * cycle is constrained by the DIO completion code, hence we don't need
 * reference counting here.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO is
 * extending the file size. We won't know for sure until IO completion is run
 * and the actual max write offset is communicated to the IO completion
 * routine.
 *
 * For DAX page faults, we are preparing to never see unwritten extents here,
 * nor should we ever extend the inode size. Hence we will soon have nothing to
 * do here for this case, ensuring we don't have to provide an IO completion
 * callback to free an ioend that we don't actually need for a fault into the
 * page at offset (2^63 - 1FSB) bytes.
 */

static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	bool			dax_fault)
{
	struct xfs_ioend	*ioend;
	xfs_off_t		size = bh_result->b_size;
	int			type;

	if (ISUNWRITTEN(imap))
		type = XFS_IO_UNWRITTEN;
	else
		type = XFS_IO_OVERWRITE;

	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);

	if (dax_fault) {
		ASSERT(type == XFS_IO_OVERWRITE);
		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
					    imap);
		return;
	}

	if (bh_result->b_private) {
		ioend = bh_result->b_private;
		ASSERT(ioend->io_size > 0);
		ASSERT(offset >= ioend->io_offset);
		if (offset + size > ioend->io_offset + ioend->io_size)
			ioend->io_size = offset - ioend->io_offset + size;

		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
			ioend->io_type = XFS_IO_UNWRITTEN;

		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
					      ioend->io_size, ioend->io_type,
					      imap);
	} else if (type == XFS_IO_UNWRITTEN ||
		   offset + size > i_size_read(inode) ||
		   offset + size < 0) {
		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_size = size;

		bh_result->b_private = ioend;
		set_buffer_defer_completion(bh_result);

		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
					   imap);
	} else {
		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
					    imap);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the
 * mapping for blocks beyond EOF must be marked new so that sub block regions
 * can be correctly zeroed. We can't do this for mappings within EOF unless
 * the mapping was just allocated or is unwritten, otherwise the callers would
 * overwrite existing data with zeros. Hence we have to split the mapping into
 * a range up to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

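/*
 * Common get_blocks implementation backing buffered I/O, direct I/O and DAX
 * faults.  Looks up (and for write mappings, if necessary allocates) the
 * extent covering @iblock and translates it into @bh_result.
 */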
STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct,
	bool			dax_fault)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.  For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_data_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * xfs_iomap_write_direct() expects the shared lock. It
			 * is unlocked on return.
			 */
			if (lockmode == XFS_ILOCK_EXCL)
				xfs_ilock_demote(ip, lockmode);

			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return error;
			new = 1;

		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * we can go on without dropping the lock here. If we
			 * are allocating a new delalloc block, set the new
			 * flag so the buffer is marked new and we know that it
			 * is newly allocated if the write fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}
		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	if (direct || size > (1 << inode->i_blkbits))
		xfs_map_trim_size(inode, iblock, bh_result,
				  &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create && direct)
			xfs_map_direct(inode, bh_result, &imap, offset,
				       dax_fault);
	}

	/*
	 * If this is a realtime file, data may be on a different device to
	 * that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}

int
xfs_get_blocks_dax_fault(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}

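/*
 * Do the bulk of the direct write completion work: update the in-core inode
 * size if the write extended the file, then let the common xfs_end_io()
 * processing convert unwritten extents or log the on-disk size update.
 */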
static void
__xfs_end_io_direct_write(
	struct inode		*inode,
	struct xfs_ioend	*ioend,
	loff_t			offset,
	ssize_t			size)
{
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
		goto out_end_io;

	/*
	 * dio completion end_io functions are only called on writes if more
	 * than 0 bytes were written.
	 */
	ASSERT(size > 0);

	/*
	 * The ioend only maps whole blocks, while the IO may be sector
	 * aligned.  Hence the ioend offset/size may not match the IO
	 * offset/size exactly.  Because we don't map overwrites within EOF
	 * into the ioend, the offset may not match, but only if the endio
	 * spans EOF.  Either way, write the IO sizes into the ioend so that
	 * completion processing does the right thing.
	 */
	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
	ioend->io_size = size;
	ioend->io_offset = offset;

	/*
	 * The ioend tells us whether we are doing unwritten extent conversion
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 *
	 * We need to update the in-core inode size here so that we don't end
	 * up with the on-disk inode size being outside the in-core inode size.
	 * We have no other method of updating EOF for AIO, so always do it
	 * here if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&XFS_I(inode)->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&XFS_I(inode)->i_flags_lock);

	/*
	 * If we are doing an append IO that needs to update the EOF on disk,
	 * do the transaction reserve now so we can use common end io
	 * processing. Stashing the error (if there is one) in the ioend will
	 * result in the ioend processing passing on the error if it is
	 * possible as we can't return it from here.
	 */
	if (ioend->io_type == XFS_IO_OVERWRITE)
		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);

out_end_io:
	xfs_end_io(&ioend->io_work);
	return;
}

/*
 * Complete a direct I/O write request.
 *
 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
 * wholly within the EOF and so there is nothing for us to do. Note that in this
 * case the completion can be called in interrupt context, whereas if we have an
 * ioend we will always be called in task context (i.e. from a workqueue).
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_ioend	*ioend = private;

	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
				     ioend ? ioend->io_type : 0, NULL);

	if (!ioend) {
		ASSERT(offset + size <= i_size_read(inode));
		return;
	}

	__xfs_end_io_direct_write(inode, ioend, offset, size);
}

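/*
 * Issue a direct I/O request, routing DAX inodes through dax_do_io() and
 * everything else through the generic blockdev direct I/O path.
 */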
static inline ssize_t
xfs_vm_do_dio(
	struct inode		*inode,
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset,
	void			(*endio)(struct kiocb	*iocb,
					 loff_t		offset,
					 ssize_t	size,
					 void		*private),
	int			flags)
{
	struct block_device	*bdev;

	if (IS_DAX(inode))
		return dax_do_io(iocb, inode, iter, offset,
				 xfs_get_blocks_direct, endio, 0);

	bdev = xfs_find_bdev_for_inode(inode);
	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
				    xfs_get_blocks_direct, endio, NULL, flags);
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;

	if (iov_iter_rw(iter) == WRITE)
		return xfs_vm_do_dio(inode, iocb, iter, offset,
				     xfs_end_io_direct_write,
				     DIO_ASYNC_EXTEND);
	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk
 * yet as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		start_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
						end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
					ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

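/*
 * Clean up a page after a failed buffered write: any delalloc blocks that
 * were allocated for the failed part of the write are punched out again and
 * the corresponding buffers are returned to a clean, unmapped state.
 */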
STATIC void
xfs_vm_write_failed(
	struct inode		*inode,
	struct page		*page,
	loff_t			pos,
	unsigned		len)
{
	loff_t			block_offset;
	loff_t			block_start;
	loff_t			block_end;
	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
	loff_t			to = from + len;
	struct buffer_head	*bh, *head;

	/*
	 * The request pos offset might be 32 or 64 bit, this is all fine
	 * on 64-bit platforms.  However, for a 64-bit pos request on a
	 * 32-bit platform, the high 32 bits will be masked off if we
	 * evaluate the block_offset via (pos & PAGE_MASK) because the
	 * PAGE_MASK is 0xfffff000 as an unsigned long, hence the result is
	 * incorrect, which could cause the following ASSERT to fail in most
	 * cases.  In order to avoid this, we evaluate the block_offset of
	 * the start of the page by using shifts rather than masks to avoid
	 * the mismatch problem.
	 */
	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
				   block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		if (!buffer_delay(bh))
			continue;

		if (!buffer_new(bh) && block_offset < i_size_read(inode))
			continue;

		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);

		/*
		 * This buffer does not contain data anymore. make sure anyone
		 * who finds it knows that for certain.
		 */
		clear_buffer_delay(bh);
		clear_buffer_uptodate(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_dirty(bh);
	}

}

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks
 * out on failure.  Hence we copy and waste it here and call
 * xfs_vm_write_failed() at the appropriate point.
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
	struct page		*page;
	int			status;

	ASSERT(len <= PAGE_CACHE_SIZE);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		/*
		 * If the write is beyond EOF, we only want to kill blocks
		 * allocated in this write, not blocks that were previously
		 * written successfully.
		 */
		if (pos + len > isize) {
			ssize_t start = max_t(ssize_t, pos, isize);

			truncate_pagecache_range(inode, start, pos + len);
		}

		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written. Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed. For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

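/*
 * Map a file block to a disk block for bmap() callers such as the FIBMAP
 * ioctl.  Flush dirty data first so that delalloc extents are converted and
 * the mapping returned reflects what is actually on disk.
 */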
STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;
	struct mem_cgroup	*memcg;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized
	 * with per-memcg dirty page counters.
	 */
	memcg = mem_cgroup_begin_page_stat(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping, memcg);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	mem_cgroup_end_page_stat(memcg);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};