/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

STATIC struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory.  Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t		*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
		      1, _THIS_IP_);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp, 0);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission
	 * thread, thus we need to mark ourselves as being in a transaction
	 * manually.  Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			   0, 1, _THIS_IP_);

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct	*work)
{
	xfs_ioend_t		*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}
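
/*
 * Overview of the write I/O completion path implemented above and below:
 *
 *	bio completion
 *	  -> xfs_end_bio()		(bi_end_io handler)
 *	    -> xfs_finish_ioend()	(drops one io_remaining reference)
 *	      -> work queued on m_unwritten_workqueue or m_data_workqueue,
 *	         or xfs_destroy_ioend() directly if there is nothing to do
 *	-> xfs_end_io()			(workqueue context)
 *	  -> unwritten extent conversion and/or on-disk file size update
 *	  -> xfs_destroy_ioend()	(runs b_end_io on each buffer)
 *
 * The submission side (xfs_submit_ioend() and friends below) holds the
 * initial io_remaining reference until all bios for the ioend have been
 * issued.
 */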

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent the I/O
	 * completion callback from being run before we have submitted
	 * all of the I/O for this ioend.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -EAGAIN;
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

STATIC int
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio		*bio,
	int			error)
{
	xfs_ioend_t		*ioend = bio->bi_private;

	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
		   WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * If the page was not fully cleaned, we need to ensure that the
	 * higher layers come back to it correctly.  That means we need to
	 * keep the page dirty, and for WB_SYNC_ALL writeback we need to
	 * ensure the PAGECACHE_TAG_TOWRITE index mark is not removed so
	 * another attempt to write this page in this writeback sweep will
	 * be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O.  If we mark the
 * buffers as we go, then we can end up with a page that only has some buffers
 * marked async write, and I/O completion on those buffers can occur before we
 * mark the remaining buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	int			fail)
{
	xfs_ioend_t		*head = ioend;
	xfs_ioend_t		*next;
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		if (fail) {
			ioend->io_error = fail;
			xfs_finish_ioend(ioend);
			continue;
		}

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this ioend.
 * Toss the ioend too.  Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t		*ioend)
{
	xfs_ioend_t		*next;
	struct buffer_head	*bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			/*
			 * The unwritten flag is cleared when added to the
			 * ioend. We're not submitting for I/O so mark the
			 * buffer unwritten again for next time around.
			 */
			if (ioend->io_type == XFS_IO_UNWRITTEN)
				set_buffer_unwritten(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}
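
/*
 * A note on how the ioends built below are chained together: each ioend
 * covers one run of buffers of a single I/O type, with the buffers linked
 * through b_private and terminated by NULL; the ioends themselves are
 * linked through io_list.  Roughly:
 *
 *	ioend --io_list--> ioend --io_list--> NULL
 *	  |                  |
 *	io_buffer_head     io_buffer_head
 *	  |                  |
 *	  bh -> bh -> NULL   bh -> bh -> NULL	(linked via b_private)
 *
 * xfs_add_to_ioend() below appends to the current ioend when it can and
 * starts a new one otherwise; xfs_submit_ioend() above walks this chain.
 */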

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * A new ioend is linked after the previous one via io_list.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	unsigned int		type,
	xfs_ioend_t		**result,
	int			need_ioend)
{
	xfs_ioend_t		*ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t	*previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then the caller only
 * needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

/*
 * Allocate and map buffers for a page given the extent map, then write it out.
 * Except for the original page of a writepage call, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC int
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	loff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head	*bh, *head;
	xfs_off_t		end_offset;
	unsigned long		p_offset;
	unsigned int		type;
	int			len, page_dirty;
	int			count = 0, done = 0, uptodate = 1;
	xfs_off_t		offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count. On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	/*
	 * If the current map does not span the entire page we are about to try
	 * to write, then give up. The only way we can write a page that spans
	 * multiple mappings in a single writeback iteration is via the
	 * xfs_vm_writepage() function. Data integrity writeback requires the
	 * entire page to be written in a single attempt, otherwise the part of
	 * the page we don't write here doesn't get written as part of the data
	 * integrity sync.
	 *
	 * For normal writeback, we also don't attempt to write partial pages
	 * here as it simply means that write_cache_pages() will see it under
	 * writeback and ignore the page until some point in the future, at
	 * which time this will be the only page in the file that needs
	 * writeback.  Hence for more optimal IO patterns, we should always
	 * avoid partial page writeback due to multiple mappings on a page here.
	 */
	if (!xfs_imap_valid(inode, imap, end_offset))
		goto fail_unlock_page;

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;

	/*
	 * The moment we find a buffer that doesn't match our current type
	 * specification or can't be written, abort the loop and start
	 * writeback. As per the above xfs_imap_valid() check, only
	 * xfs_vm_writepage() can handle partial page writeback fully - we are
	 * limited here to the buffers that are contiguous with the current
	 * ioend, and hence a buffer we can't write breaks that contiguity and
	 * we have to defer the rest of the IO to xfs_vm_writepage().
	 */
	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			break;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = XFS_IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = XFS_IO_DELALLOC;
			else
				type = XFS_IO_OVERWRITE;

			/*
			 * imap should always be valid because of the above
			 * partial page end_offset check on the imap.
			 */
			ASSERT(xfs_imap_valid(inode, imap, offset));

			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
			break;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert and write out a cluster of pages in the same extent as defined
 * by imap and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
						imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct inode		*inode = page->mapping->host;
	struct buffer_head	*bh, *head;
	struct xfs_bmbt_irec	imap;
	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
	loff_t			offset;
	unsigned int		type;
	__uint64_t		end_offset;
	pgoff_t			end_index, last_index;
	ssize_t			len;
	int			err, imap_valid = 0, uptodate = 1;
	int			count = 0;
	int			nonblocking = 0;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;

	/*
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |            file mapping              |   <EOF>    |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |        |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |            file mapping               |    <EOF>     |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N  |  Beyond  |
		 * ^--------------------------------^-----------|---------
		 * |                                |  Straddles |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that end_index is an unsigned long; it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and we checked whether the page is fully outside i_size via
		 * "if (page->index >= end_index + 1)", as "end_index + 1"
		 * would evaluate to 0. This page would then be redirtied and
		 * written out repeatedly, which would result in an infinite
		 * loop; the user program performing this operation would
		 * hang. Instead, we can verify this situation by checking
		 * whether the page to write is totally beyond i_size or
		 * whether its offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size. It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size. For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = XFS_IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}


	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
	 */
	err = 0;
	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
		err = xfs_setfilesize_trans_alloc(ioend);

	xfs_submit_ioend(wbc, iohead, err);

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	int			direct)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.  For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_data_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       &imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * Drop the ilock in preparation for starting the block
			 * allocation transaction.  It will be retaken
			 * exclusively inside xfs_iomap_write_direct for the
			 * actual allocation.
			 */
			xfs_iunlock(ip, lockmode);
			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return error;
			new = 1;
		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * so we can go on without dropping the lock here. If
			 * we are allocating a new delalloc block, set the new
			 * flag so that we know the buffer is newly allocated
			 * if the write fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}

		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK) {
		/*
		 * For unwritten extents do not report a disk address on
		 * the read case (treat as if we're reading into a hole).
		 */
		if (create || !ISUNWRITTEN(&imap))
			xfs_map_buffer(inode, bh_result, &imap, offset);
		if (create && ISUNWRITTEN(&imap)) {
			if (direct) {
				bh_result->b_private = inode;
				set_buffer_defer_completion(bh_result);
			}
			set_buffer_unwritten(bh_result);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	/*
	 * If this is O_DIRECT or the mpage code calling us, tell them how
	 * large the mapping is, so that we can avoid repeated get_blocks
	 * calls.
	 *
	 * If the mapping spans EOF, then we have to break the mapping up as the
	 * mapping for blocks beyond EOF must be marked new so that sub block
	 * regions can be correctly zeroed. We can't do this for mappings within
	 * EOF unless the mapping was just allocated or is unwritten, otherwise
	 * the callers would overwrite existing data with zeros. Hence we have
	 * to split the mapping into a range up to and including EOF, and a
	 * second mapping for beyond EOF.
	 */
	if (direct || size > (1 << inode->i_blkbits)) {
		xfs_off_t		mapping_size;

		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
		mapping_size <<= inode->i_blkbits;

		ASSERT(mapping_size > 0);
		if (mapping_size > size)
			mapping_size = size;
		if (offset < i_size_read(inode) &&
		    offset + mapping_size >= i_size_read(inode)) {
			/* limit mapping to block that spans EOF */
			mapping_size = roundup_64(i_size_read(inode) - offset,
						  1 << inode->i_blkbits);
		}
		if (mapping_size > LONG_MAX)
			mapping_size = LONG_MAX;

		bh_result->b_size = mapping_size;
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}

STATIC int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}

/*
 * Complete a direct I/O write request.
 *
 * If the private argument is non-NULL, __xfs_get_blocks signals us that we
 * need to issue a transaction to convert the range from unwritten to written
 * extents.
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return;

	/*
	 * While the generic direct I/O code updates the inode size, it does
	 * so only after the end_io handler is called, which means our
	 * end_io handler thinks the on-disk size is outside the in-core
	 * size.  To prevent this just update it a little bit earlier here.
	 */
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);

	/*
	 * For direct I/O we do not know if we need to allocate blocks or not,
	 * so we can't preallocate an append transaction, as that results in
	 * nested reservations and log space deadlocks. Hence allocate the
	 * transaction here. While this is sub-optimal and can block IO
	 * completion for some time, we're stuck with doing it this way until
	 * we can pass the ioend to the direct IO allocation callbacks and
	 * avoid nesting that way.
	 */
	if (private && size > 0) {
		xfs_iomap_write_unwritten(ip, offset, size);
	} else if (offset + size > ip->i_d.di_size) {
		struct xfs_trans	*tp;
		int			error;

		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
		if (error) {
			xfs_trans_cancel(tp, 0);
			return;
		}

		xfs_setfilesize(ip, tp, offset, size);
	}
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);

	if (iov_iter_rw(iter) == WRITE) {
		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
					    xfs_get_blocks_direct,
					    xfs_end_io_direct_write, NULL,
					    DIO_ASYNC_EXTEND);
	}
	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
				    xfs_get_blocks_direct, NULL, NULL, 0);
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
 * as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		start_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
					      end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
					ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

STATIC void
xfs_vm_write_failed(
	struct inode		*inode,
	struct page		*page,
	loff_t			pos,
	unsigned		len)
{
	loff_t			block_offset;
	loff_t			block_start;
	loff_t			block_end;
	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
	loff_t			to = from + len;
	struct buffer_head	*bh, *head;

	/*
	 * The request pos offset might be 32 or 64 bit; this is all fine on a
	 * 64-bit platform.  However, for a 64-bit pos request on a 32-bit
	 * platform, the high 32 bits will be masked off if we evaluate the
	 * block_offset via (pos & PAGE_MASK) because PAGE_MASK is 0xfffff000
	 * as an unsigned long, hence the result is incorrect and could cause
	 * the following ASSERT to fail in most cases.  To avoid this, we
	 * evaluate the block_offset of the start of the page by using shifts
	 * rather than masks, which sidesteps the mismatch problem.
	 */
	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
				   block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		if (!buffer_delay(bh))
			continue;

		if (!buffer_new(bh) && block_offset < i_size_read(inode))
			continue;

		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);

		/*
		 * This buffer does not contain data anymore.  Make sure anyone
		 * who finds it knows that for certain.
		 */
		clear_buffer_delay(bh);
		clear_buffer_uptodate(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_dirty(bh);
	}
}

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks
 * out on failure.  Hence we copy-n-waste it here and call xfs_vm_write_failed()
 * at the appropriate point.
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
	struct page		*page;
	int			status;

	ASSERT(len <= PAGE_CACHE_SIZE);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		/*
		 * If the write is beyond EOF, we only want to kill blocks
		 * allocated in this write, not blocks that were previously
		 * written successfully.
		 */
		if (pos + len > isize) {
			ssize_t start = max_t(ssize_t, pos, isize);

			truncate_pagecache_range(inode, start, pos + len);
		}

		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written. Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed. For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};
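
/*
 * The table above only takes effect once it is installed as the a_ops of an
 * inode's mapping; that hookup happens during inode setup outside this file.
 * A minimal sketch of what the installation has to do (the function name is
 * illustrative only, not something defined here):
 *
 *	static void example_install_aops(struct inode *inode)
 *	{
 *		inode->i_mapping->a_ops = &xfs_address_space_operations;
 *	}
 */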