/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

STATIC struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory.  Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t		*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	if (ioend->io_iocb) {
		if (ioend->io_isasync) {
			aio_complete(ioend->io_iocb, ioend->io_error ?
					ioend->io_error : ioend->io_result, 0);
		}
		inode_dio_done(ioend->io_inode);
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We will pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
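	 *
	 * The freeze reference is reacquired (rwsem_acquire_read()) by
	 * whoever takes ownership of this transaction later, normally the
	 * I/O completion side in xfs_end_io(), before it is used or
	 * cancelled.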
	 */
	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
		      1, _THIS_IP_);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;
	xfs_fsize_t		isize;

	/*
	 * The transaction was allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction
	 * manually.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		return 0;
	}

	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp, 0);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int		error = 0;

	if (ioend->io_append_trans) {
		/*
		 * We've got freeze protection passed with the transaction.
		 * Tell lockdep about it.
		 */
		rwsem_acquire_read(
			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			0, 1, _THIS_IP_);
	}
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		/*
		 * For buffered I/O we never preallocate a transaction when
		 * doing the unwritten extent conversion, but for direct I/O
		 * we do not know if we are converting an unwritten extent
		 * or not at the point where we preallocate the transaction.
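		 *
		 * A transaction preallocated for the direct I/O case is still
		 * clean at this point, so it can simply be cancelled; the
		 * unwritten extent conversion allocates its own transactions.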
		 */
		if (ioend->io_append_trans) {
			ASSERT(ioend->io_isdirect);

			current_set_flags_nested(
				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
			xfs_trans_cancel(ioend->io_append_trans, 0);
		}

		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
		if (error) {
			ioend->io_error = -error;
			goto done;
		}
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize(ioend);
		if (error)
			ioend->io_error = -error;
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	xfs_destroy_ioend(ioend);
}

/*
 * Call IO completion handling in caller context on the final put of an ioend.
 */
STATIC void
xfs_finish_ioend_sync(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining))
		xfs_end_io(&ioend->io_work);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O completion
	 * callback that happens before we have started all the I/O from
	 * calling the completion routine too early.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_isasync = 0;
	ioend->io_isdirect = 0;
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_iocb = NULL;
	ioend->io_result = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -XFS_ERROR(EAGAIN);
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return -XFS_ERROR(error);

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, count, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return -XFS_ERROR(error);
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
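		/*
		 * An unwritten extent must be backed by a real allocation by
		 * the time we get here, so we should never see a hole or a
		 * delalloc block in this case.
		 */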
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

STATIC int
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio		*bio,
	int			error)
{
	xfs_ioend_t		*ioend = bio->bi_private;

	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));
	if (clear_dirty)
		clear_page_dirty_for_io(page);
	set_page_writeback(page);
	unlock_page(page);
	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O.  If we mark the
 * buffers as we get them, then we can end up with a page that only has some
 * buffers marked async write, and I/O completion can occur on them before we
 * mark the other buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
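 *
 * In the second pass physically contiguous buffers are merged into as few
 * bios as possible: a new bio is started whenever the next buffer does not
 * immediately follow the last block added or no longer fits in the current
 * bio.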
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them.  In this situation, we need to fail the ioend chain rather
 * than submit it to IO.  This typically only happens on a filesystem shutdown.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	int			fail)
{
	xfs_ioend_t		*head = ioend;
	xfs_ioend_t		*next;
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		if (fail) {
			ioend->io_error = -fail;
			xfs_finish_ioend(ioend);
			continue;
		}

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
 retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this ioend.
 * Toss the ioend too.  Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t		*ioend)
{
	xfs_ioend_t		*next;
	struct buffer_head	*bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * The resulting ioend is returned to the caller via *result.
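 *
 * New ioends are chained to the previous one through io_list so that
 * xfs_submit_ioend() can later walk and submit the whole chain in order.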
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	unsigned int		type,
	xfs_ioend_t		**result,
	int			need_ioend)
{
	xfs_ioend_t		*ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t	*previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page is suitable for writing as part of an unwritten
 * or delayed allocate extent.
 */
STATIC int
xfs_check_page_type(
	struct page		*page,
	unsigned int		type)
{
	if (PageWriteback(page))
		return 0;

	if (page->mapping && page_has_buffers(page)) {
		struct buffer_head	*bh, *head;
		int			acceptable = 0;

		bh = head = page_buffers(page);
		do {
			if (buffer_unwritten(bh))
				acceptable += (type == XFS_IO_UNWRITTEN);
			else if (buffer_delay(bh))
				acceptable += (type == XFS_IO_DELALLOC);
			else if (buffer_dirty(bh) && buffer_mapped(bh))
				acceptable += (type == XFS_IO_OVERWRITE);
			else
				break;
		} while ((bh = bh->b_this_page) != head);

		if (acceptable)
			return 1;
	}

	return 0;
}

/*
 * Allocate & map buffers for page given the extent map.  Write it out.
 * Except for the original page of a writepage, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
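 *
 * Returns non-zero once the caller should stop looking at further pages,
 * either because this page could not be handled or because the writeback
 * limit has been reached.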
 */
STATIC int
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	loff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head	*bh, *head;
	xfs_off_t		end_offset;
	unsigned long		p_offset;
	unsigned int		type;
	int			len, page_dirty;
	int			count = 0, done = 0, uptodate = 1;
	xfs_off_t		offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_check_page_type(page, (*ioendp)->io_type))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count.  On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;

	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			continue;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = XFS_IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = XFS_IO_DELALLOC;
			else
				type = XFS_IO_OVERWRITE;

			if (!xfs_imap_valid(inode, imap, offset)) {
				done = 1;
				continue;
			}

			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by imap and following the start page.
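 *
 * The walk stops early if xfs_convert_page() reports that it is done, so a
 * page that cannot be added to the current ioend terminates the cluster.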
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
						imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned long		offset)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset);
	block_invalidatepage(page, offset);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it.  Because they are delalloc, we can do this without needing a
 * transaction.  Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
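 *
 * The buffer walk below chains the buffers into one or more ioends which are
 * submitted once the whole page (and any cluster pages following it) has been
 * processed.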
 */
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct inode		*inode = page->mapping->host;
	struct buffer_head	*bh, *head;
	struct xfs_bmbt_irec	imap;
	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
	loff_t			offset;
	unsigned int		type;
	__uint64_t		end_offset;
	pgoff_t			end_index, last_index;
	ssize_t			len;
	int			err, imap_valid = 0, uptodate = 1;
	int			count = 0;
	int			nonblocking = 0;

	trace_xfs_writepage(inode, page, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
	if (page->index >= end_index) {
		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);

		/*
		 * Just skip the page if it is fully outside i_size, e.g. due
		 * to a truncate operation that is in progress.
		 */
		if (page->index >= end_index + 1 || offset_into_page == 0) {
			unlock_page(page);
			return 0;
		}

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
	}

	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			offset);
	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = XFS_IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
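		 *
		 * Skipping such a buffer also invalidates the current mapping
		 * so that any following writeable buffer starts a new ioend.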
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
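	 *
	 * Unwritten extent conversion takes care of the on-disk size update
	 * itself at I/O completion, so the transaction is only needed for
	 * appending writes to already written blocks.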
	 */
	err = 0;
	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
		err = xfs_setfilesize_trans_alloc(ioend);

	xfs_submit_ioend(wbc, iohead, err);

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released.  The page should already be clean.  We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON(delalloc))
		return 0;
	if (WARN_ON(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	int			direct)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.  For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * Drop the ilock in preparation for starting the block
			 * allocation transaction.  It will be retaken
			 * exclusively inside xfs_iomap_write_direct for the
			 * actual allocation.
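			 *
			 * The delalloc case below does not need a transaction
			 * and hence keeps the ilock across the reservation.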
			 */
			xfs_iunlock(ip, lockmode);
			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return -error;
			new = 1;
		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * we can go on without dropping the lock here.  If we
			 * are allocating a new delalloc block, make sure that
			 * we set the new flag so that the buffer is marked new
			 * and we know that it is newly allocated if the write
			 * fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}

		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK) {
		/*
		 * For unwritten extents do not report a disk address on
		 * the read case (treat as if we're reading into a hole).
		 */
		if (create || !ISUNWRITTEN(&imap))
			xfs_map_buffer(inode, bh_result, &imap, offset);
		if (create && ISUNWRITTEN(&imap)) {
			if (direct)
				bh_result->b_private = inode;
			set_buffer_unwritten(bh_result);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to by the buffer_head's b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	/*
	 * If this is O_DIRECT or the mpage code calling, tell them how large
	 * the mapping is, so that we can avoid repeated get_blocks calls.
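	 *
	 * The reported size is clamped to the size of the original request
	 * and to LONG_MAX before it is stored in b_size.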
	 */
	if (direct || size > (1 << inode->i_blkbits)) {
		xfs_off_t		mapping_size;

		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
		mapping_size <<= inode->i_blkbits;

		ASSERT(mapping_size > 0);
		if (mapping_size > size)
			mapping_size = size;
		if (mapping_size > LONG_MAX)
			mapping_size = LONG_MAX;

		bh_result->b_size = mapping_size;
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return -error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}

STATIC int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}

/*
 * Complete a direct I/O write request.
 *
 * If the private argument is non-NULL __xfs_get_blocks signals us that we
 * need to issue a transaction to convert the range from unwritten to written
 * extents.  In case this is regular synchronous I/O we just call xfs_end_io
 * to do this and we are done.  But in case this was a successful AIO
 * request this handler is called from interrupt context, from which we
 * can't start transactions.  In that case offload the I/O completion to
 * the workqueues we also use for buffered I/O completion.
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private,
	int			ret,
	bool			is_async)
{
	struct xfs_ioend	*ioend = iocb->private;

	/*
	 * While the generic direct I/O code updates the inode size, it does
	 * so only after the end_io handler is called, which means our
	 * end_io handler thinks the on-disk size is outside the in-core
	 * size.  To prevent this just update it a little bit earlier here.
	 */
	if (offset + size > i_size_read(ioend->io_inode))
		i_size_write(ioend->io_inode, offset + size);

	/*
	 * blockdev_direct_IO can return an error even after the I/O
	 * completion handler was called.  Thus we need to protect
	 * against double-freeing.
	 */
	iocb->private = NULL;

	ioend->io_offset = offset;
	ioend->io_size = size;
	ioend->io_iocb = iocb;
	ioend->io_result = ret;
	if (private && size > 0)
		ioend->io_type = XFS_IO_UNWRITTEN;

	if (is_async) {
		ioend->io_isasync = 1;
		xfs_finish_ioend(ioend);
	} else {
		xfs_finish_ioend_sync(ioend);
	}
}

STATIC ssize_t
xfs_vm_direct_IO(
	int			rw,
	struct kiocb		*iocb,
	const struct iovec	*iov,
	loff_t			offset,
	unsigned long		nr_segs)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	struct xfs_ioend	*ioend = NULL;
	ssize_t			ret;

	if (rw & WRITE) {
		size_t size = iov_length(iov, nr_segs);

		/*
		 * We need to preallocate a transaction for a size update
		 * here.  In the case that this write both updates the size
		 * and converts at least one unwritten extent we will cancel
		 * the still clean transaction after the I/O has finished.
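		 *
		 * The ioend is handed to the completion handler through
		 * iocb->private; xfs_end_io_direct_write() clears that
		 * pointer again to protect against double completion.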
		 */
		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
		if (offset + size > XFS_I(inode)->i_d.di_size) {
			ret = xfs_setfilesize_trans_alloc(ioend);
			if (ret)
				goto out_destroy_ioend;
			ioend->io_isdirect = 1;
		}

		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					   offset, nr_segs,
					   xfs_get_blocks_direct,
					   xfs_end_io_direct_write, NULL, 0);
		if (ret != -EIOCBQUEUED && iocb->private)
			goto out_trans_cancel;
	} else {
		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					   offset, nr_segs,
					   xfs_get_blocks_direct,
					   NULL, NULL, 0);
	}

	return ret;

out_trans_cancel:
	if (ioend->io_append_trans) {
		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
					 PF_FSTRANS);
		rwsem_acquire_read(
			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			0, 1, _THIS_IP_);
		xfs_trans_cancel(ioend->io_append_trans, 0);
	}
out_destroy_ioend:
	xfs_destroy_ioend(ioend);
	return ret;
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
 * as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		start_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
						end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
				ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

STATIC void
xfs_vm_write_failed(
	struct inode		*inode,
	struct page		*page,
	loff_t			pos,
	unsigned		len)
{
	loff_t			block_offset = pos & PAGE_MASK;
	loff_t			block_start;
	loff_t			block_end;
	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
	loff_t			to = from + len;
	struct buffer_head	*bh, *head;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
	     block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		if (!buffer_delay(bh))
			continue;

		if (!buffer_new(bh) && block_offset < i_size_read(inode))
			continue;

		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);
	}
}

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks out
 * on failure.  Hence we copy-n-waste it here and call xfs_vm_write_failed() at
 * the appropriate point.
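 *
 * On failure we also truncate the page cache back to the old EOF so that no
 * stale pages beyond the old size are left behind by the failed write.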
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
	struct page		*page;
	int			status;

	ASSERT(len <= PAGE_CACHE_SIZE);

	page = grab_cache_page_write_begin(mapping, index,
					   flags | AOP_FLAG_NOFS);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		if (pos + len > i_size_read(inode))
			truncate_pagecache(inode, pos + len, i_size_read(inode));

		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF because they
 * will never be written.  For blocks within EOF, generic_write_end() zeros them
 * so they are safe to leave alone and be written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			truncate_pagecache(inode, to, isize);
			xfs_vm_kill_delalloc_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};