// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec	imap;
	unsigned int		io_type;
	unsigned int		cow_seq;
	struct xfs_ioend	*ioend;
};

struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

struct dax_device *
xfs_find_daxdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_daxdev;
	else
		return mp->m_ddev_targp->bt_daxdev;
}
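
/*
 * Finish writeback for a single bio_vec.  On block size < page size
 * filesystems the iomap_page structure tracks how many writeback I/Os are
 * outstanding against the page (iop->write_count), and writeback on the
 * page is only ended once the last of them completes.  On an I/O error the
 * page is flagged and the error is recorded against the address space so
 * that it can be reported to userspace later (e.g. by fsync).
 */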
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);

	if (error) {
		SetPageError(bvec->bv_page);
		mapping_set_error(inode->i_mapping, -EIO);
	}

	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
	ASSERT(!iop || atomic_read(&iop->write_count) > 0);

	if (!iop || atomic_dec_and_test(&iop->write_count))
		end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*bio = &ioend->io_inline_bio;
	struct bio		*last = ioend->io_bio, *next;
	u64			start = bio->bi_iter.bi_sector;
	bool			quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);
		bio_put(bio);
	}

	if (unlikely(error && !quiet)) {
		xfs_err_ratelimited(XFS_I(inode)->i_mount,
			"writeback error on sector %llu", start);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}
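
/*
 * Allocate and reserve a transaction that will be used to update the
 * on-disk inode size at I/O completion.  This is done at submission time;
 * the transaction (and the freeze protection it holds) is handed to the
 * completion worker through ioend->io_append_trans.
 */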
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
				XFS_TRANS_NOFS, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}
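
/*
 * Update the on-disk inode size at ioend completion, using the transaction
 * that was reserved when the ioend was submitted.  If the I/O failed, the
 * transaction is cancelled and the error is passed back so the ioend is
 * destroyed with the correct error state.
 */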
STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	int			error;

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up any COW blocks on an I/O error.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		switch (ioend->io_type) {
		case XFS_IO_COW:
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			break;
		}

		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	switch (ioend->io_type) {
	case XFS_IO_COW:
		error = xfs_reflink_end_cow(ip, offset, size);
		break;
	case XFS_IO_UNWRITTEN:
		/* writeback should never update isize */
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		break;
	default:
		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
		break;
	}

done:
	if (ioend->io_append_trans)
		error = xfs_setfilesize_ioend(ioend, error);
	xfs_destroy_ioend(ioend, error);
}
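
/*
 * Bio end_io handler for writeback.  Completions that still need
 * transactional work (unwritten extent conversion, COW remapping, or an
 * on-disk size update) are deferred to a workqueue; everything else is
 * torn down directly in bio completion context.
 */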
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
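
/*
 * Look up the extent backing @offset for writeback and cache it in
 * wpc->imap so that subsequent blocks covered by the same mapping can
 * reuse it.  COW fork extents take precedence over data fork extents, and
 * delayed allocations (including COW reservations) are converted to real
 * extents here.  On return wpc->io_type describes the kind of I/O needed
 * for this block, or XFS_IO_HOLE if nothing needs to be written.
 */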
STATIC int
xfs_map_blocks(
	struct xfs_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
	struct xfs_bmbt_irec	imap;
	int			whichfork = XFS_DATA_FORK;
	struct xfs_iext_cursor	icur;
	bool			imap_valid;
	int			error = 0;

	/*
	 * We have to make sure the cached mapping is within EOF to protect
	 * against eofblocks trimming on file release leaving us with a stale
	 * mapping.  Otherwise, a page for a subsequent file extending buffered
	 * write could get picked up by this writeback cycle and written to the
	 * wrong blocks.
	 *
	 * Note that what we really want here is a generic mapping invalidation
	 * mechanism to protect us from arbitrary extent modifying contexts, not
	 * just eofblocks.
	 */
	xfs_trim_extent_eof(&wpc->imap, ip);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared.  COW I/O always takes precedence, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK because
	 * we've indirectly protected against concurrent updates: writeback has
	 * the page locked, which prevents concurrent invalidations by reflink
	 * and direct I/O and prevents concurrent buffered writes to the same
	 * page.  Changes to if_seq always happen under i_lock, which protects
	 * against concurrent updates and provides a memory barrier on the way
	 * out that ensures that we always see the current value.
	 */
	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
	if (imap_valid &&
	    (!xfs_inode_has_cow_data(ip) ||
	     wpc->io_type == XFS_IO_COW ||
	     wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
		return 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.  If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset > mp->m_super->s_maxbytes - count)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

	/*
	 * Check if this offset is covered by a COW extent, and if so use it
	 * directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		/*
		 * Truncate can race with writeback since writeback doesn't
		 * take the iolock and truncate decreases the file size before
		 * it starts truncating the pages between new_size and old_size.
		 * Therefore, we can end up in the situation where writeback
		 * gets a CoW fork mapping but the truncate makes the mapping
		 * invalid and we end up in here trying to get a new mapping.
		 * Bail out here so that we simply never get a valid mapping
		 * and so we drop the write altogether.  The page truncation
		 * will kill the contents anyway.
		 */
		if (offset > i_size_read(inode)) {
			wpc->io_type = XFS_IO_HOLE;
			return 0;
		}
		whichfork = XFS_COW_FORK;
		wpc->io_type = XFS_IO_COW;
		goto allocate_blocks;
	}

	/*
	 * Map valid and no COW extent in the way?  We're done.
	 */
	if (imap_valid) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (imap.br_startoff > offset_fsb) {
		/* landed in a hole or beyond EOF */
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		wpc->io_type = XFS_IO_HOLE;
	} else {
		/*
		 * Truncate to the next COW extent if there is one.  This is the
		 * only opportunity to do this because we can skip COW fork
		 * lookups for the subsequent blocks in the mapping; however,
		 * the requirement to treat the COW range separately remains.
		 */
		if (cow_fsb != NULLFILEOFF &&
		    cow_fsb < imap.br_startoff + imap.br_blockcount)
			imap.br_blockcount = cow_fsb - imap.br_startoff;

		if (isnullstartblock(imap.br_startblock)) {
			/* got a delalloc extent */
			wpc->io_type = XFS_IO_DELALLOC;
			goto allocate_blocks;
		}

		if (imap.br_state == XFS_EXT_UNWRITTEN)
			wpc->io_type = XFS_IO_UNWRITTEN;
		else
			wpc->io_type = XFS_IO_OVERWRITE;
	}

	wpc->imap = imap;
	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
	return 0;
allocate_blocks:
	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
			&wpc->cow_seq);
	if (error)
		return error;
	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
	wpc->imap = imap;
	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
	return 0;
}

/*
 * Submit the bio for an ioend.  We are passed an ioend with a bio attached to
 * it, and we submit that bio.  The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once.  In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them.  In this situation, we need to fail the bio and ioend
 * rather than submit it to IO.  This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Convert CoW extents to regular */
	if (!status && ioend->io_type == XFS_IO_COW) {
		/*
		 * Yuk. This can do memory allocation, but is not a
		 * transactional operation so everything is done in GFP_KERNEL
		 * context. That can deadlock, because we hold pages in
		 * writeback state and GFP_KERNEL allocations can block on them.
		 * Hence we must operate in nofs conditions here.
		 */
		unsigned nofs_flag;

		nofs_flag = memalloc_nofs_save();
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it.  This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_status = errno_to_blk_status(status);
		bio_endio(ioend->io_bio);
		return status;
	}

	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	return 0;
}
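
/*
 * Allocate a new ioend describing a contiguous range of writeback.  The
 * ioend is embedded ahead of its first, inline bio (allocated from
 * xfs_ioend_bioset), so both are released together by the final bio_put()
 * in xfs_destroy_ioend().
 */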
static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	bio_set_dev(new, bdev);
	new->bi_iter.bi_sector = sector;
	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	xfs_off_t		offset,
	struct page		*page,
	struct iomap_page	*iop,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	unsigned		len = i_blocksize(inode);
	unsigned		poff = offset & (PAGE_SIZE - 1);
	sector_t		sector;

	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);

	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    sector != bio_end_sector(wpc->ioend->io_bio) ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
				bdev, sector);
	}

	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
		if (iop)
			atomic_inc(&iop->write_count);
		if (bio_full(wpc->ioend->io_bio))
			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

	wpc->ioend->io_size += len;
}
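
/*
 * ->invalidatepage is handled by the generic iomap code; this wrapper only
 * adds a tracepoint.
 */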
STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
	iomap_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction.  Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why we
 * see an ENOSPC in writeback).
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			offset = page_offset(page);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
	int			error;

	if (XFS_FORCED_SHUTDOWN(mp))
		goto out_invalidate;

	xfs_alert(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			PAGE_SIZE / i_blocksize(inode));
	if (error && !XFS_FORCED_SHUTDOWN(mp))
		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide.  The current ioend we are
 * adding blocks to is cached on the writepage context, and if the new block
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected.  While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct iomap_page	*iop = to_iomap_page(page);
	unsigned		len = i_blocksize(inode);
	struct xfs_ioend	*ioend, *next;
	uint64_t		file_offset;	/* file offset of page */
	int			error = 0, count = 0, i;

	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
	ASSERT(!iop || atomic_read(&iop->write_count) == 0);

	/*
	 * Walk through the page to find areas to write back.  If we run off the
	 * end of the current map or find the current map invalid, grab a new
	 * one.
	 */
	for (i = 0, file_offset = page_offset(page);
	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
	     i++, file_offset += len) {
		if (iop && !test_bit(i, iop->uptodate))
			continue;

		error = xfs_map_blocks(wpc, inode, file_offset);
		if (error)
			break;
		if (wpc->io_type == XFS_IO_HOLE)
			continue;
		xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
				 &submit_list);
		count++;
	}

	ASSERT(wpc->ioend || list_empty(&submit_list));
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * On error, we have to fail the ioend here because we may have set
	 * pages under writeback; we have to make sure we run IO completion to
	 * mark the error state of the IO appropriately, so we can't cancel the
	 * ioend directly here.  That means we have to mark this page as under
	 * writeback if we included any blocks from it in the ioend chain so
	 * that completion treats it correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * now.  The caller will still need to trigger submission of outstanding
	 * ioends on the writepage context so they are treated correctly on
	 * error.
	 */
	if (unlikely(error)) {
		if (!count) {
			xfs_aops_discard_page(page);
			ClearPageUptodate(page);
			unlock_page(page);
			goto done;
		}

		/*
		 * If the page was not fully cleaned, we need to ensure that the
		 * higher layers come back to it correctly.  That means we need
		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
		 * so another attempt to write this page in this writeback sweep
		 * will be made.
		 */
		set_page_writeback_keepwrite(page);
	} else {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	}

	unlock_page(page);

	/*
	 * Preserve the original error if there was one, otherwise catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		int error2;

		list_del_init(&ioend->io_list);
		error2 = xfs_submit_ioend(wbc, ioend, error);
		if (error2 && !error)
			error = error2;
	}

	/*
	 * We can end up here with no error and nothing to write only if we race
	 * with a partial page truncate on a sub-page block sized filesystem.
	 */
	if (!count)
		end_page_writeback(page);
done:
	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    |      Straddles     |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress.  We must redirty the
		 * page so that reclaim stops reclaiming it.  Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that end_index is an unsigned long; it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}
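
/*
 * ->writepage and ->writepages entry points.  Both run the writeback
 * machinery above and must then submit any ioend still cached on the
 * writepage context, as xfs_writepage_map() leaves the final ioend for
 * the caller to submit.
 */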
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}
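
/*
 * Writeback for DAX mappings: there is no pagecache involved, so this only
 * needs to flush the dirty DAX ranges via dax_writeback_mapping_range().
 */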
STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_find_bdev_for_inode(mapping->host), wbc);
}

STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
	return iomap_releasepage(page, gfp_mask);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O.  We really can't allow
	 * that on reflink inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
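
/*
 * The read paths are handled entirely by the generic iomap code; these
 * wrappers only add tracepoints.
 */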
STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return iomap_readpage(page, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
}

static int
xfs_iomap_swapfile_activate(
	struct swap_info_struct	*sis,
	struct file		*swap_file,
	sector_t		*span)
{
	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= iomap_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= noop_direct_IO,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.swap_activate		= xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.set_page_dirty		= noop_set_page_dirty,
	.invalidatepage		= noop_invalidatepage,
	.swap_activate		= xfs_iomap_swapfile_activate,
};