// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks. This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism. At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible. This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite). If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock. (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.) Delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope that nearby,
 * out-of-order CoW writes land in a single extent on disk, thereby
 * reducing fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar. The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded. IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork. This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents. This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written. Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd. For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork. Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out -- we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written. This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type. This is required for direct io
 * because we only have ioend for the whole dio, and we have to be able to
 * remember the presence of unwritten blocks and CoW blocks with a single
 * ioend structure. Better yet, the more ground we can cover with one
 * ioend, the better.
 */
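
/*
 * A rough sketch of the delalloc startblock encoding described above, using
 * the startblock helpers this file already relies on:
 *
 *	if (isnullstartblock(irec.br_startblock))
 *		indlen = startblockval(irec.br_startblock);
 *
 * isnullstartblock() tests the "this is delalloc" flag, and startblockval()
 * extracts the worst-case count of blocks needed to put the mapping into
 * the BMBT; "indlen" here is a hypothetical local for that estimate.
 */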

/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen. If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks. If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_buf		*agbp;
	struct xfs_btree_cur	*cur;
	int			error;

	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
	if (error)
		return error;

	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag);

	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	xfs_btree_del_cursor(cur, error);

	xfs_trans_brelse(tp, agbp);
	return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status. More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping. If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent. If
 * a shared region begins partway through the mapping, trim the mapping
 * to the start of the shared extent. If there are no shared regions
 * that overlap, just return the original extent.
 */
int
xfs_reflink_trim_around_shared(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	bool			*shared)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	/* Holes, unwritten, and delalloc extents cannot be shared */
	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
		*shared = false;
		return 0;
	}

	trace_xfs_reflink_trim_around_shared(ip, irec);

	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
	aglen = irec->br_blockcount;

	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
			aglen, &fbno, &flen, true);
	if (error)
		return error;

	*shared = false;
	if (fbno == NULLAGBLOCK) {
		/* No shared blocks at all. */
		return 0;
	} else if (fbno == agbno) {
		/*
		 * The start of this extent is shared. Truncate the
		 * mapping at the end of the shared region so that a
		 * subsequent iteration starts at the start of the
		 * unshared region.
		 */
		irec->br_blockcount = flen;
		*shared = true;
		return 0;
	} else {
		/*
		 * There's a shared extent midway through this extent.
		 * Truncate the mapping at the start of the shared
		 * extent so that a subsequent iteration starts at the
		 * start of the shared region.
		 */
		irec->br_blockcount = fbno - agbno;
		return 0;
	}
}
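
/*
 * A rough sketch of the two trimming cases handled above, where 's' marks
 * blocks the refcount btree reports as shared and '-' marks unshared blocks:
 *
 *	mapping:   |==============|          |==============|
 *	shared:     ssssss--------            ------ssssss--
 *	trimmed:   |====|                    |====|
 *	           (ends with shared run)    (ends before shared run)
 *
 * Either way the caller can process the file range in runs that are
 * uniformly shared or unshared.
 */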

int
xfs_bmap_trim_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			*shared)
{
	/* We can't update any real extents in always COW mode. */
	if (xfs_is_always_cow_inode(ip) &&
	    !isnullstartblock(imap->br_startblock)) {
		*shared = true;
		return 0;
	}

	/* Trim the mapping to the nearest shared extent boundary. */
	return xfs_reflink_trim_around_shared(ip, imap, shared);
}

static int
xfs_reflink_convert_cow_locked(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_btree_cur	*dummy_cur = NULL;
	int			dummy_logflags;
	int			error = 0;

	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
		return 0;

	do {
		if (got.br_startoff >= offset_fsb + count_fsb)
			break;
		if (got.br_state == XFS_EXT_NORM)
			continue;
		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
			return -EIO;

		xfs_trim_extent(&got, offset_fsb, count_fsb);
		if (!got.br_blockcount)
			continue;

		got.br_state = XFS_EXT_NORM;
		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
				XFS_COW_FORK, &icur, &dummy_cur, &got,
				&dummy_logflags);
		if (error)
			return error;
	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));

	return error;
}

/* Convert all of the unwritten CoW extents in a file's range to real ones. */
int
xfs_reflink_convert_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
	int			error;

	ASSERT(count != 0);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
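
/*
 * This pair of functions implements the "just prior to submitting the actual
 * disk write requests" step described in the big comment at the top of this
 * file: only the CoW staging extents that cover the range actually being
 * written are flipped from unwritten to real, and any extra cowextsize
 * preallocation around them is left alone.
 */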

/*
 * Find the extent that maps the given range in the COW fork. Even if the
 * extent is not shared we might have a preallocation for it in the COW fork.
 * If so, use it rather than trigger a new allocation.
 */
static int
xfs_find_trim_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	bool			*found)
{
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_iext_cursor	icur;

	*found = false;

	/*
	 * If we don't find an overlapping extent, trim the range we need to
	 * allocate to fit the hole we found.
	 */
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
		cmap->br_startoff = offset_fsb + count_fsb;
	if (cmap->br_startoff > offset_fsb) {
		xfs_trim_extent(imap, imap->br_startoff,
				cmap->br_startoff - imap->br_startoff);
		return xfs_bmap_trim_cow(ip, imap, shared);
	}

	*shared = true;
	if (isnullstartblock(cmap->br_startblock)) {
		xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
		return 0;
	}

	/* real extent found - no need to allocate */
	xfs_trim_extent(cmap, offset_fsb, count_fsb);
	*found = true;
	return 0;
}

/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_trans	*tp;
	int			nimaps, error = 0;
	bool			found;
	xfs_filblks_t		resaligned;
	xfs_extlen_t		resblks = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	if (!ip->i_cowfp) {
		ASSERT(!xfs_is_reflink_inode(ip));
		xfs_ifork_init_cow(ip);
	}

	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error || !*shared)
		return error;
	if (found)
		goto convert;

	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

	xfs_iunlock(ip, *lockmode);
	*lockmode = 0;

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
			false, &tp);
	if (error)
		return error;

	*lockmode = XFS_ILOCK_EXCL;

	/*
	 * Check for an overlapping extent again now that we dropped the ilock.
	 */
	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error || !*shared)
		goto out_trans_cancel;
	if (found) {
		xfs_trans_cancel(tp);
		goto convert;
	}

	/* Allocate the entire reservation as unwritten blocks. */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
			&nimaps);
	if (error)
		goto out_trans_cancel;

	xfs_inode_set_cowblocks_tag(ip);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/*
	 * Allocation succeeded but the requested range was not even partially
	 * satisfied? Bail out!
	 */
	if (nimaps == 0)
		return -ENOSPC;
convert:
	xfs_trim_extent(cmap, offset_fsb, count_fsb);
	/*
	 * COW fork extents are supposed to remain unwritten until we're ready
	 * to initiate a disk write. For direct I/O we are going to write the
	 * data and need the conversion, but for buffered writes we're done.
	 */
	if (!convert_now || cmap->br_state == XFS_EXT_NORM)
		return 0;
	trace_xfs_reflink_convert_cow(ip, cmap);
	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
	if (!error)
		cmap->br_state = XFS_EXT_NORM;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
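
/*
 * A note on the locking pattern above: the ILOCK has to be dropped while the
 * transaction is allocated, so the COW fork is probed once before and once
 * after the lock cycle. If another thread raced in and created a usable
 * reservation in the meantime, the freshly allocated transaction is
 * cancelled and the existing extent is used as-is.
 */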

/*
 * Cancel CoW reservations for some block range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * Caller must have already joined the inode to the current transaction. The
 * inode will be joined to the transaction returned to the caller.
 */
int
xfs_reflink_cancel_cow_blocks(
	struct xfs_inode		*ip,
	struct xfs_trans		**tpp,
	xfs_fileoff_t			offset_fsb,
	xfs_fileoff_t			end_fsb,
	bool				cancel_real)
{
	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec		got, del;
	struct xfs_iext_cursor		icur;
	int				error = 0;

	if (!xfs_inode_has_cow_data(ip))
		return 0;
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		return 0;

	/* Walk backwards until we're out of the I/O range... */
	while (got.br_startoff + got.br_blockcount > offset_fsb) {
		del = got;
		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

		/* Extent delete may have bumped ext forward */
		if (!del.br_blockcount) {
			xfs_iext_prev(ifp, &icur);
			goto next_extent;
		}

		trace_xfs_reflink_cancel_cow(ip, &del);

		if (isnullstartblock(del.br_startblock)) {
			error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
					&icur, &got, &del);
			if (error)
				break;
		} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
			ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);

			/* Free the CoW orphan record. */
			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
					del.br_blockcount);

			xfs_free_extent_later(*tpp, del.br_startblock,
					del.br_blockcount, NULL);

			/* Roll the transaction */
			error = xfs_defer_finish(tpp);
			if (error)
				break;

			/* Remove the mapping from the CoW fork. */
			xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

			/* Remove the quota reservation */
			error = xfs_quota_unreserve_blkres(ip,
					del.br_blockcount);
			if (error)
				break;
		} else {
			/* Didn't do anything, push cursor back. */
			xfs_iext_prev(ifp, &icur);
		}
next_extent:
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	/* clear tag if cow fork is emptied */
	if (!ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);
	return error;
}
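
/*
 * The cowblocks tag that xfs_reflink_allocate_cow() sets and the code above
 * clears once the CoW fork empties is what allows the speculative
 * preallocation garbage collector to find inodes with leftover CoW staging
 * extents and clean them up.
 */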

/*
 * Cancel CoW reservations for some byte range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 */
int
xfs_reflink_cancel_cow_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	bool			cancel_real)
{
	struct xfs_trans	*tp;
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
	ASSERT(ip->i_cowfp);

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	if (count == NULLFILEOFF)
		end_fsb = NULLFILEOFF;
	else
		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
			0, 0, 0, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* Scrape out the old CoW reservations */
	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
			cancel_real);
	if (error)
		goto out_cancel;

	error = xfs_trans_commit(tp);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Remap part of the CoW fork into the data fork.
 *
 * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
 * into the data fork; this function will remap what it can (at the end of the
 * range) and update @end_fsb appropriately. Each remap gets its own
 * transaction because we can end up merging and splitting bmbt blocks for
 * every remap operation and we'd like to keep the block reservation
 * requirements as low as possible.
 */
STATIC int
xfs_reflink_end_cow_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		*end_fsb)
{
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	xfs_filblks_t		rlen;
	unsigned int		resblks;
	int			error;

	/* No COW extents? That's easy! */
	if (ifp->if_bytes == 0) {
		*end_fsb = offset_fsb;
		return 0;
	}

	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
			XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	/*
	 * Lock the inode. We have to ijoin without automatic unlock because
	 * the lead transaction is the refcountbt record deletion; the data
	 * fork update follows as a deferred log item.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
			XFS_IEXT_REFLINK_END_COW_CNT);
	if (error)
		goto out_cancel;

	/*
	 * In the case of racing, overlapping AIO writes, no COW extents
	 * might be left by the time I/O completes for the loser of the race.
	 * In that case we are done.
	 */
	if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
	    got.br_startoff + got.br_blockcount <= offset_fsb) {
		*end_fsb = offset_fsb;
		goto out_cancel;
	}

	/*
	 * Structure copy @got into @del, then trim @del to the range that we
	 * were asked to remap. We preserve @got for the eventual CoW fork
	 * deletion; from now on @del represents the mapping that we're
	 * actually remapping.
	 */
	del = got;
	xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);

	ASSERT(del.br_blockcount > 0);

	/*
	 * Only remap real extents that contain data. With AIO, speculative
	 * preallocations can leak into the range we are asked to remap, and
	 * we need to skip them.
	 */
	if (!xfs_bmap_is_written_extent(&got)) {
		*end_fsb = del.br_startoff;
		goto out_cancel;
	}

	/* Unmap the old blocks in the data fork. */
	rlen = del.br_blockcount;
	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
	if (error)
		goto out_cancel;

	/* Trim the extent to whatever got unmapped. */
	xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
	trace_xfs_reflink_cow_remap(ip, &del);

	/* Free the CoW orphan record. */
	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);

	/* Map the new blocks into the data fork. */
	xfs_bmap_map_extent(tp, ip, &del);

	/* Charge this new data fork mapping to the on-disk quota. */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
			(long)del.br_blockcount);

	/* Remove the mapping from the CoW fork. */
	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/* Update the caller about how much progress we made. */
	*end_fsb = del.br_startoff;
	return 0;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
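
/*
 * To summarize, each successful trip through xfs_reflink_end_cow_extent()
 * above performs one remap in five steps under a single rolling transaction:
 * unmap the old data fork blocks, free the CoW staging extent's refcount
 * btree record, map the new blocks into the data fork, charge them to the
 * on-disk quota, and delete the mapping from the CoW fork.
 */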

/*
 * Remap parts of a file's data fork after a successful CoW.
 */
int
xfs_reflink_end_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	int			error = 0;

	trace_xfs_reflink_end_cow(ip, offset, count);

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/*
	 * Walk backwards until we're out of the I/O range. The loop function
	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
	 * extent.
	 *
	 * If we're being called by writeback then the pages will still
	 * have PageWriteback set, which prevents races with reflink remapping
	 * and truncate. Reflink remapping prevents races with writeback by
	 * taking the iolock and mmaplock before flushing the pages and
	 * remapping, which means there won't be any further writeback or page
	 * cache dirtying until the reflink completes.
	 *
	 * We should never have two threads issuing writeback for the same file
	 * region. There are also post-eof checks in the writeback
	 * preparation code so that we don't bother writing out pages that are
	 * about to be truncated.
	 *
	 * If we're being called as part of directio write completion, the dio
	 * count is still elevated, which reflink and truncate will wait for.
	 * Reflink remapping takes the iolock and mmaplock and waits for
	 * pending dio to finish, which should prevent any directio until the
	 * remap completes. Multiple concurrent directio writes to the same
	 * region are handled by end_cow processing only occurring for the
	 * threads which succeed; the outcome of multiple overlapping direct
	 * writes is not well defined anyway.
	 *
	 * It's possible that a buffered write and a direct write could collide
	 * here (the buffered write stumbles in after the dio flushes and
	 * invalidates the page cache and immediately queues writeback), but we
	 * have never supported this 100%. If either disk write succeeds the
	 * blocks will be remapped.
	 */
	while (end_fsb > offset_fsb && !error)
		error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);

	if (error)
		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Free all CoW staging blocks that are still referenced by the ondisk refcount
 * metadata. The ondisk metadata does not track which inode created the
 * staging extent, so callers must ensure that there are no cached inodes with
 * live CoW staging extents.
 */
int
xfs_reflink_recover_cow(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	int			error = 0;

	if (!xfs_has_reflink(mp))
		return 0;

	for_each_perag(mp, agno, pag) {
		error = xfs_refcount_recover_cow_leftovers(mp, pag);
		if (error) {
			xfs_perag_put(pag);
			break;
		}
	}

	return error;
}
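
/*
 * This recovery pass is typically run from the mount path, after log
 * recovery; at that point no cached inodes can be holding live CoW staging
 * extents, which satisfies the precondition described above.
 */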

/*
 * Reflinking (Block) Ranges of Two Files Together
 *
 * First, ensure that the reflink flag is set on both inodes. The flag is an
 * optimization to avoid unnecessary refcount btree lookups in the write path.
 *
 * Now we can iteratively remap the range of extents (and holes) in src to the
 * corresponding ranges in dest. Let drange and srange denote the ranges of
 * logical blocks in dest and src touched by the reflink operation.
 *
 * While the length of drange is greater than zero,
 *    - Read src's bmbt at the start of srange ("imap")
 *    - If imap doesn't exist, make imap appear to start at the end of srange
 *      with zero length.
 *    - If imap starts before srange, advance imap to start at srange.
 *    - If imap goes beyond srange, truncate imap to end at the end of srange.
 *    - Punch (imap start - srange start + imap len) blocks from dest at
 *      offset (drange start).
 *    - If imap points to a real range of pblks,
 *         > Increase the refcount of the imap's pblks
 *         > Map imap's pblks into dest at the offset
 *           (drange start + imap start - srange start)
 *    - Advance drange and srange by (imap start - srange start + imap len)
 *
 * Finally, if the reflink made dest longer, update both the in-core and
 * on-disk file sizes.
 *
 * ASCII Art Demonstration:
 *
 * Let's say we want to reflink this source file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS (src file)
 *   <-------------------->
 *
 * into this destination file:
 *
 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
 *    <-------------------->
 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
 * Observe that the range has different logical offsets in either file.
 *
 * Consider that the first extent in the source file doesn't line up with our
 * reflink range. Unmapping and remapping are separate operations, so we can
 * unmap more blocks from the destination file than we remap.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD---------DDDDD--DDD
 *    <------->
 *
 * Now remap the source extent into the destination file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD--SSSSSSSDDDDD--DDD
 *    <------->
 *
 * Do likewise with the second hole and extent in our range. Holes in the
 * unmap range don't affect our operation.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *            <---->
 * --DDDDD--SSSSSSS-SSSSS-DDD
 *                 <---->
 *
 * Finally, unmap and remap part of the third extent. This will increase the
 * size of the destination file.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *                  <----->
 * --DDDDD--SSSSSSS-SSSSS----SSS
 *                       <----->
 *
 * Once we update the destination file's i_size, we're done.
 */
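
/*
 * The loop sketched above is implemented by xfs_reflink_remap_blocks(),
 * which reads one source mapping at a time and hands it to
 * xfs_reflink_remap_extent() to do the punch-and-map in its own transaction.
 * The helpers that follow first set the reflink flag on both inodes and
 * finally update the destination's size and cowextsize hint.
 */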

/*
 * Ensure the reflink bit is set in both inodes.
 */
STATIC int
xfs_reflink_set_inode_flag(
	struct xfs_inode	*src,
	struct xfs_inode	*dest)
{
	struct xfs_mount	*mp = src->i_mount;
	int			error;
	struct xfs_trans	*tp;

	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	/* Lock both files against IO */
	if (src->i_ino == dest->i_ino)
		xfs_ilock(src, XFS_ILOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);

	if (!xfs_is_reflink_inode(src)) {
		trace_xfs_reflink_set_inode_flag(src);
		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
		src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
		xfs_ifork_init_cow(src);
	} else
		xfs_iunlock(src, XFS_ILOCK_EXCL);

	if (src->i_ino == dest->i_ino)
		goto commit_flags;

	if (!xfs_is_reflink_inode(dest)) {
		trace_xfs_reflink_set_inode_flag(dest);
		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
		dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
		xfs_ifork_init_cow(dest);
	} else
		xfs_iunlock(dest, XFS_ILOCK_EXCL);

commit_flags:
	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
	return error;
}

/*
 * Update destination inode size & cowextsize hint, if necessary.
 */
int
xfs_reflink_update_dest(
	struct xfs_inode	*dest,
	xfs_off_t		newlen,
	xfs_extlen_t		cowextsize,
	unsigned int		remap_flags)
{
	struct xfs_mount	*mp = dest->i_mount;
	struct xfs_trans	*tp;
	int			error;

	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	xfs_ilock(dest, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);

	if (newlen > i_size_read(VFS_I(dest))) {
		trace_xfs_reflink_update_inode_size(dest, newlen);
		i_size_write(VFS_I(dest), newlen);
		dest->i_disk_size = newlen;
	}

	if (cowextsize) {
		dest->i_cowextsize = cowextsize;
		dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
	}

	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
	return error;
}

/*
 * Do we have enough reserve in this AG to handle a reflink? The refcount
 * btree already reserved all the space it needs, but the rmap btree can grow
 * infinitely, so we won't allow more reflinks when the AG is down to the
 * btree reserves.
 */
static int
xfs_reflink_ag_has_free_space(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno)
{
	struct xfs_perag	*pag;
	int			error = 0;

	if (!xfs_has_rmapbt(mp))
		return 0;

	pag = xfs_perag_get(mp, agno);
	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
		error = -ENOSPC;
	xfs_perag_put(pag);
	return error;
}

/*
 * Remap the given extent into the file. The dmap blockcount will be set to
 * the number of blocks that were actually remapped.
 */
STATIC int
xfs_reflink_remap_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*dmap,
	xfs_off_t		new_isize)
{
	struct xfs_bmbt_irec	smap;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_off_t		newlen;
	int64_t			qdelta = 0;
	unsigned int		resblks;
	bool			quota_reserved = true;
	bool			smap_real;
	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
	int			iext_delta = 0;
	int			nimaps;
	int			error;

	/*
	 * Start a rolling transaction to switch the mappings.
	 *
	 * Adding a written extent to the extent map can cause a bmbt split,
	 * and removing a mapped extent from the extent map can cause a bmbt
	 * split. The two operations cannot both cause a split since they
	 * operate on the same index in the bmap btree, so we only need a
	 * reservation for one bmbt split if either thing is happening.
	 * However, we haven't locked the inode yet, so we reserve assuming
	 * this is the case.
	 *
	 * The first allocation call tries to reserve enough space to handle
	 * mapping dmap into a sparse part of the file plus the bmbt split. We
	 * haven't locked the inode or read the existing mapping yet, so we do
	 * not know for sure that we need the space. This should succeed most
	 * of the time.
	 *
	 * If the first attempt fails, try again but reserving only enough
	 * space to handle a bmbt split. This is the hard minimum requirement,
	 * and we revisit quota reservations later when we know more about what
	 * we're remapping.
	 */
	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
			resblks + dmap->br_blockcount, 0, false, &tp);
	if (error == -EDQUOT || error == -ENOSPC) {
		quota_reserved = false;
		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				resblks, 0, false, &tp);
	}
	if (error)
		goto out;

	/*
	 * Read what's currently mapped in the destination file into smap.
	 * If smap isn't a hole, we will have to remove it before we can add
	 * dmap to the destination file.
	 */
	nimaps = 1;
	error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
			&smap, &nimaps, 0);
	if (error)
		goto out_cancel;
	ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
	smap_real = xfs_bmap_is_real_extent(&smap);

	/*
	 * We can only remap as many blocks as the smaller of the two extent
	 * maps, because we can only remap one extent at a time.
	 */
	dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
	ASSERT(dmap->br_blockcount == smap.br_blockcount);

	trace_xfs_reflink_remap_extent_dest(ip, &smap);

	/*
	 * Two extents mapped to the same physical block must not have
	 * different states; that's filesystem corruption. Move on to the next
	 * extent if they're both holes or both the same physical extent.
	 */
	if (dmap->br_startblock == smap.br_startblock) {
		if (dmap->br_state != smap.br_state)
			error = -EFSCORRUPTED;
		goto out_cancel;
	}

	/* If both extents are unwritten, leave them alone. */
	if (dmap->br_state == XFS_EXT_UNWRITTEN &&
	    smap.br_state == XFS_EXT_UNWRITTEN)
		goto out_cancel;

	/* No reflinking if the AG of the dest mapping is low on space. */
	if (dmap_written) {
		error = xfs_reflink_ag_has_free_space(mp,
				XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
		if (error)
			goto out_cancel;
	}

	/*
	 * Increase quota reservation if we think the quota block counter for
	 * this file could increase.
	 *
	 * If we are mapping a written extent into the file, we need to have
	 * enough quota block count reservation to handle the blocks in that
	 * extent. We log only the delta to the quota block counts, so if the
	 * extent we're unmapping also has blocks allocated to it, we don't
	 * need a quota reservation for the extent itself.
	 *
	 * Note that if we're replacing a delalloc reservation with a written
	 * extent, we have to take the full quota reservation because removing
	 * the delalloc reservation gives the block count back to the quota
	 * count. This is suboptimal, but the VFS flushed the dest range
	 * before we started. That should have removed all the delalloc
	 * reservations, but we code defensively.
	 *
	 * xfs_trans_alloc_inode above already tried to grab an even larger
	 * quota reservation, and kicked off a blockgc scan if it couldn't.
	 * If we can't get a potentially smaller quota reservation now, we're
	 * done.
	 */
	if (!quota_reserved && !smap_real && dmap_written) {
		error = xfs_trans_reserve_quota_nblks(tp, ip,
				dmap->br_blockcount, 0, false);
		if (error)
			goto out_cancel;
	}

	if (smap_real)
		++iext_delta;

	if (dmap_written)
		++iext_delta;

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
	if (error)
		goto out_cancel;

	if (smap_real) {
		/*
		 * If the extent we're unmapping is backed by storage (written
		 * or not), unmap the extent and drop its refcount.
		 */
		xfs_bmap_unmap_extent(tp, ip, &smap);
		xfs_refcount_decrease_extent(tp, &smap);
		qdelta -= smap.br_blockcount;
	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
		xfs_filblks_t	len = smap.br_blockcount;

		/*
		 * If the extent we're unmapping is a delalloc reservation,
		 * we can use the regular bunmapi function to release the
		 * incore state. Dropping the delalloc reservation takes care
		 * of the quota reservation for us.
		 */
		error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
		if (error)
			goto out_cancel;
		ASSERT(len == 0);
	}

	/*
	 * If the extent we're sharing is backed by written storage, increase
	 * its refcount and map it into the file.
	 */
	if (dmap_written) {
		xfs_refcount_increase_extent(tp, dmap);
		xfs_bmap_map_extent(tp, ip, dmap);
		qdelta += dmap->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);

	/* Update dest isize if needed. */
	newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
	newlen = min_t(xfs_off_t, newlen, new_isize);
	if (newlen > i_size_read(VFS_I(ip))) {
		trace_xfs_reflink_update_inode_size(ip, newlen);
		i_size_write(VFS_I(ip), newlen);
		ip->i_disk_size = newlen;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	}

	/* Commit everything and unlock. */
	error = xfs_trans_commit(tp);
	goto out_unlock;

out_cancel:
	xfs_trans_cancel(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	if (error)
		trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
	return error;
}

/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
	struct xfs_inode	*src,
	loff_t			pos_in,
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			remap_len,
	loff_t			*remapped)
{
	struct xfs_bmbt_irec	imap;
	struct xfs_mount	*mp = src->i_mount;
	xfs_fileoff_t		srcoff = XFS_B_TO_FSBT(mp, pos_in);
	xfs_fileoff_t		destoff = XFS_B_TO_FSBT(mp, pos_out);
	xfs_filblks_t		len;
	xfs_filblks_t		remapped_len = 0;
	xfs_off_t		new_isize = pos_out + remap_len;
	int			nimaps;
	int			error = 0;

	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
			XFS_MAX_FILEOFF);

	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);

	while (len > 0) {
		unsigned int	lock_mode;

		/* Read extent from the source file */
		nimaps = 1;
		lock_mode = xfs_ilock_data_map_shared(src);
		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
		xfs_iunlock(src, lock_mode);
		if (error)
			break;
		/*
		 * The caller supposedly flushed all dirty pages in the source
		 * file range, which means that writeback should have allocated
		 * or deleted all delalloc reservations in that range. If we
		 * find one, that's a good sign that something is seriously
		 * wrong here.
		 */
		ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
		if (imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			error = -EFSCORRUPTED;
			break;
		}

		trace_xfs_reflink_remap_extent_src(src, &imap);

		/* Remap into the destination file at the given offset. */
		imap.br_startoff = destoff;
		error = xfs_reflink_remap_extent(dest, &imap, new_isize);
		if (error)
			break;

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Advance drange/srange */
		srcoff += imap.br_blockcount;
		destoff += imap.br_blockcount;
		len -= imap.br_blockcount;
		remapped_len += imap.br_blockcount;
	}

	if (error)
		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	*remapped = min_t(loff_t, remap_len,
			XFS_FSB_TO_B(src->i_mount, remapped_len));
	return error;
}

/*
 * If we're reflinking to a point past the destination file's EOF, we must
 * zero any speculative post-EOF preallocations that sit between the old EOF
 * and the destination file offset.
 */
static int
xfs_reflink_zero_posteof(
	struct xfs_inode	*ip,
	loff_t			pos)
{
	loff_t			isize = i_size_read(VFS_I(ip));

	if (pos <= isize)
		return 0;

	trace_xfs_zero_eof(ip, isize, pos - isize);
	return xfs_zero_range(ip, isize, pos - isize, NULL);
}

/*
 * Prepare two files for range cloning. Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken. This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it
 * hasn't checked that the bytes beyond EOF physically match. Hence we cannot
 * use the EOF block in the source dedupe range because it's not a complete
 * block match, and so it can introduce a corruption into the file that has
 * its block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to
 * be "block aligned" for the purposes of cloning entire files. However, if
 * the source file range includes the EOF block and it lands within the
 * existing EOF of the destination file, then we can expose stale data from
 * beyond the source file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have to
 * check these cases ourselves. For dedupe, we can simply round the length to
 * dedupe down to the previous whole block and ignore the partial EOF block.
 * While this means we can't dedupe the last block of a file, this is an
 * acceptable tradeoff for implementation simplicity.
 *
 * For cloning, we want to share the partial EOF block if it is also the new
 * EOF block of the destination file. If the partial EOF block lies inside
 * the existing destination EOF, then we have to abort the clone to avoid
 * exposing stale data in the destination file. Hence we reject these clone
 * attempts with -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	int			ret;

	/* Lock both files against IO */
	ret = xfs_ilock2_io_mmap(src, dest);
	if (ret)
		return ret;

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data for now. */
	if (IS_DAX(inode_in) || IS_DAX(inode_out))
		goto out_unlock;

	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
			len, remap_flags);
	if (ret || *len == 0)
		goto out_unlock;

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out. In that case, we need to extend the flush and unmap to
	 * cover from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	return 0;
out_unlock:
	xfs_iunlock2_io_mmap(src, dest);
	return ret;
}

/* Does this inode need the reflink flag? */
int
xfs_reflink_inode_has_shared_extents(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	bool			*has_shared)
{
	struct xfs_bmbt_irec	got;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp;
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		rbno;
	xfs_extlen_t		rlen;
	struct xfs_iext_cursor	icur;
	bool			found;
	int			error;

	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
	if (error)
		return error;

	*has_shared = false;
	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
	while (found) {
		if (isnullstartblock(got.br_startblock) ||
		    got.br_state != XFS_EXT_NORM)
			goto next;
		agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
		aglen = got.br_blockcount;

		error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				&rbno, &rlen, false);
		if (error)
			return error;
		/* Is there still a shared block here? */
		if (rbno != NULLAGBLOCK) {
			*has_shared = true;
			return 0;
		}
next:
		found = xfs_iext_next_extent(ifp, &icur, &got);
	}

	return 0;
}

/*
 * Clear the inode reflink flag if there are no shared extents.
 *
 * The caller is responsible for joining the inode to the transaction passed
 * in. The inode will be joined to the transaction that is returned to the
 * caller.
 */
int
xfs_reflink_clear_inode_flag(
	struct xfs_inode	*ip,
	struct xfs_trans	**tpp)
{
	bool			needs_flag;
	int			error = 0;

	ASSERT(xfs_is_reflink_inode(ip));

	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
	if (error || needs_flag)
		return error;

	/*
	 * We didn't find any shared blocks so turn off the reflink flag.
	 * First, get rid of any leftover CoW mappings.
	 */
	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
			true);
	if (error)
		return error;

	/* Clear the inode flag. */
	trace_xfs_reflink_unset_inode_flag(ip);
	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_inode_clear_cowblocks_tag(ip);
	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

	return error;
}

/*
 * Clear the inode reflink flag if there are no shared extents and the size
 * hasn't changed.
 */
STATIC int
xfs_reflink_try_clear_inode_flag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error = 0;

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_reflink_clear_inode_flag(ip, &tp);
	if (error)
		goto cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
cancel:
	xfs_trans_cancel(tp);
out:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Pre-COW all shared blocks within a given byte range of a file and turn off
 * the reflink flag if we unshare all of the file's blocks.
 */
int
xfs_reflink_unshare(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct inode		*inode = VFS_I(ip);
	int			error;

	if (!xfs_is_reflink_inode(ip))
		return 0;

	trace_xfs_reflink_unshare(ip, offset, len);

	inode_dio_wait(inode);

	error = iomap_file_unshare(inode, offset, len,
			&xfs_buffered_write_iomap_ops);
	if (error)
		goto out;

	error = filemap_write_and_wait_range(inode->i_mapping, offset,
			offset + len - 1);
	if (error)
		goto out;

	/* Turn off the reflink flag if possible. */
	error = xfs_reflink_try_clear_inode_flag(ip);
	if (error)
		goto out;
	return 0;

out:
	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
	return error;
}