// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota_defs.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_rmap_btree.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks.  This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism.  At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible.  This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock.  (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)  delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope of landing out of
 * order nearby CoW writes in a single extent on disk, thereby reducing
 * fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.
 * The first two steps (creating the reservation and allocating the blocks)
 * are exactly the same as delalloc, except that the mappings must be stored
 * in a separate CoW fork because we do not want to disturb the mapping in
 * the data fork until we're sure that the write succeeded.  IO completion
 * in this case is the process of removing the old mapping from the data
 * fork and moving the new mapping from the CoW fork to the data fork.
 * This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents.  This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork.  Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out -- we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written.  This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type.  This is required for direct io
 * because we only have a single ioend for the whole dio, and we have to
 * be able to remember the presence of unwritten blocks and CoW blocks
 * with a single ioend structure.  Better yet, the more ground we can
 * cover with one ioend, the better.
 */

/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen.  If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.  If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
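 *
 * Callers that are not running inside a transaction (for example,
 * xfs_reflink_trim_around_shared() below) pass a NULL tp.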
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_buf		*agbp;
	struct xfs_btree_cur	*cur;
	int			error;

	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
	if (error)
		return error;
	if (!agbp)
		return -ENOMEM;

	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);

	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	xfs_btree_del_cursor(cur, error);

	xfs_trans_brelse(tp, agbp);
	return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * a shared region begins partway through the mapping, trim the mapping
 * to end at the start of the shared extent.  If there are no shared
 * regions that overlap, just return the original extent.
 */
int
xfs_reflink_trim_around_shared(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	bool			*shared)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	/* Holes, unwritten, and delalloc extents cannot be shared */
	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
		*shared = false;
		return 0;
	}

	trace_xfs_reflink_trim_around_shared(ip, irec);

	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
	aglen = irec->br_blockcount;

	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
			aglen, &fbno, &flen, true);
	if (error)
		return error;

	*shared = false;
	if (fbno == NULLAGBLOCK) {
		/* No shared blocks at all. */
		return 0;
	} else if (fbno == agbno) {
		/*
		 * The start of this extent is shared.  Truncate the
		 * mapping at the end of the shared region so that a
		 * subsequent iteration starts at the start of the
		 * unshared region.
		 */
		irec->br_blockcount = flen;
		*shared = true;
		return 0;
	} else {
		/*
		 * There's a shared extent midway through this extent.
		 * Truncate the mapping at the start of the shared
		 * extent so that a subsequent iteration starts at the
		 * start of the shared region.
		 */
		irec->br_blockcount = fbno - agbno;
		return 0;
	}
}

/*
 * Trim the passed in imap to the next shared/unshared extent boundary, and
 * if imap->br_startoff points to a shared extent reserve space for it in the
 * COW fork.
 *
 * Note that imap will always contain the block numbers for the existing blocks
 * in the data fork, as the upper layers need them for read-modify-write
 * operations.
 */
int
xfs_reflink_reserve_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec	got;
	int			error = 0;
	bool			eof = false;
	struct xfs_iext_cursor	icur;
	bool			shared;

	/*
	 * Search the COW fork extent list first.
	 * This serves two purposes: first, it implements the speculative
	 * preallocation using cowextsize, so that we also unshare blocks
	 * adjacent to shared blocks instead of just the shared blocks
	 * themselves.  Second, the lookup in the extent list is generally
	 * faster than going out to the shared extent tree.
	 */

	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
		eof = true;
	if (!eof && got.br_startoff <= imap->br_startoff) {
		trace_xfs_reflink_cow_found(ip, imap);
		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
		return 0;
	}

	/* Trim the mapping to the nearest shared extent boundary. */
	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
	if (error)
		return error;

	/* Not shared?  Just report the (potentially capped) extent. */
	if (!shared)
		return 0;

	/*
	 * Fork all the shared blocks from our write offset until the end of
	 * the extent.
	 */
	error = xfs_qm_dqattach_locked(ip, false);
	if (error)
		return error;

	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
			imap->br_blockcount, 0, &got, &icur, eof);
	if (error == -ENOSPC || error == -EDQUOT)
		trace_xfs_reflink_cow_enospc(ip, imap);
	if (error)
		return error;

	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
	trace_xfs_reflink_cow_alloc(ip, &got);
	return 0;
}

/* Convert part of an unwritten CoW extent to a real one. */
STATIC int
xfs_reflink_convert_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb)
{
	int			nimaps = 1;

	if (imap->br_state == XFS_EXT_NORM)
		return 0;

	xfs_trim_extent(imap, offset_fsb, count_fsb);
	trace_xfs_reflink_convert_cow(ip, imap);
	if (imap->br_blockcount == 0)
		return 0;
	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
			&nimaps);
}

/* Convert all of the unwritten CoW extents in a file's range to real ones. */
int
xfs_reflink_convert_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1, error = 0;

	ASSERT(count != 0);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
			XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Find the extent that maps the given range in the COW fork.  Even if the
 * extent is not shared we might have a preallocation for it in the COW fork.
 * If so, use that rather than triggering a new allocation.
 */
static int
xfs_find_trim_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			*shared,
	bool			*found)
{
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;

	*found = false;

	/*
	 * If we don't find an overlapping extent, trim the range we need to
	 * allocate to fit the hole we found.
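	 * (If the lookup finds nothing at all, got.br_startoff is pushed
	 * past the end of the range so that the hole-trimming branch below
	 * still runs.)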
	 */
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
		got.br_startoff = offset_fsb + count_fsb;
	if (got.br_startoff > offset_fsb) {
		xfs_trim_extent(imap, imap->br_startoff,
				got.br_startoff - imap->br_startoff);
		return xfs_reflink_trim_around_shared(ip, imap, shared);
	}

	*shared = true;
	if (isnullstartblock(got.br_startblock)) {
		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
		return 0;
	}

	/* real extent found - no need to allocate */
	xfs_trim_extent(&got, offset_fsb, count_fsb);
	*imap = got;
	*found = true;
	return 0;
}

/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			*shared,
	uint			*lockmode)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_trans	*tp;
	int			nimaps, error = 0;
	bool			found;
	xfs_filblks_t		resaligned;
	xfs_extlen_t		resblks = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(xfs_is_reflink_inode(ip));

	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	if (error || !*shared)
		return error;
	if (found)
		goto convert;

	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

	xfs_iunlock(ip, *lockmode);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	*lockmode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, *lockmode);

	if (error)
		return error;

	error = xfs_qm_dqattach_locked(ip, false);
	if (error)
		goto out_trans_cancel;

	/*
	 * Check for an overlapping extent again now that we dropped the ilock.
	 */
	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	if (error || !*shared)
		goto out_trans_cancel;
	if (found) {
		xfs_trans_cancel(tp);
		goto convert;
	}

	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
			XFS_QMOPT_RES_REGBLKS);
	if (error)
		goto out_trans_cancel;

	xfs_trans_ijoin(tp, ip, 0);

	/* Allocate the entire reservation as unwritten blocks. */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
			resblks, imap, &nimaps);
	if (error)
		goto out_unreserve;

	xfs_inode_set_cowblocks_tag(ip);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/*
	 * Allocation succeeded but the requested range was not even partially
	 * satisfied?  Bail out!
	 */
	if (nimaps == 0)
		return -ENOSPC;
convert:
	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);

out_unreserve:
	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
			XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

/*
 * Cancel CoW reservations for some block range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * Caller must have already joined the inode to the current transaction.  The
 * inode will be joined to the transaction returned to the caller.
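 *
 * Note that freeing the CoW staging extents rolls the transaction (via
 * xfs_defer_finish()), which is why the transaction is passed in and out
 * through tpp.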
 */
int
xfs_reflink_cancel_cow_blocks(
	struct xfs_inode		*ip,
	struct xfs_trans		**tpp,
	xfs_fileoff_t			offset_fsb,
	xfs_fileoff_t			end_fsb,
	bool				cancel_real)
{
	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec		got, del;
	struct xfs_iext_cursor		icur;
	int				error = 0;

	if (!xfs_inode_has_cow_data(ip))
		return 0;
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		return 0;

	/* Walk backwards until we're out of the I/O range... */
	while (got.br_startoff + got.br_blockcount > offset_fsb) {
		del = got;
		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

		/* Extent delete may have bumped ext forward */
		if (!del.br_blockcount) {
			xfs_iext_prev(ifp, &icur);
			goto next_extent;
		}

		trace_xfs_reflink_cancel_cow(ip, &del);

		if (isnullstartblock(del.br_startblock)) {
			error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
					&icur, &got, &del);
			if (error)
				break;
		} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
			ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);

			/* Free the CoW orphan record. */
			error = xfs_refcount_free_cow_extent(*tpp,
					del.br_startblock, del.br_blockcount);
			if (error)
				break;

			xfs_bmap_add_free(*tpp, del.br_startblock,
					del.br_blockcount, NULL);

			/* Roll the transaction */
			error = xfs_defer_finish(tpp);
			if (error)
				break;

			/* Remove the mapping from the CoW fork. */
			xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

			/* Remove the quota reservation */
			error = xfs_trans_reserve_quota_nblks(NULL, ip,
					-(long)del.br_blockcount, 0,
					XFS_QMOPT_RES_REGBLKS);
			if (error)
				break;
		} else {
			/* Didn't do anything, push cursor back. */
			xfs_iext_prev(ifp, &icur);
		}
next_extent:
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	/* clear tag if cow fork is emptied */
	if (!ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);
	return error;
}

/*
 * Cancel CoW reservations for some byte range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
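 *
 * Passing NULLFILEOFF for count cancels the reservations all the way to
 * the end of the file.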
 */
int
xfs_reflink_cancel_cow_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	bool			cancel_real)
{
	struct xfs_trans	*tp;
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
	ASSERT(xfs_is_reflink_inode(ip));

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	if (count == NULLFILEOFF)
		end_fsb = NULLFILEOFF;
	else
		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
			0, 0, XFS_TRANS_NOFS, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* Scrape out the old CoW reservations */
	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
			cancel_real);
	if (error)
		goto out_cancel;

	error = xfs_trans_commit(tp);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Remap parts of a file's data fork after a successful CoW.
 */
int
xfs_reflink_end_cow(
	struct xfs_inode		*ip,
	xfs_off_t			offset,
	xfs_off_t			count)
{
	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec		got, del;
	struct xfs_trans		*tp;
	xfs_fileoff_t			offset_fsb;
	xfs_fileoff_t			end_fsb;
	int				error;
	unsigned int			resblks;
	xfs_filblks_t			rlen;
	struct xfs_iext_cursor		icur;

	trace_xfs_reflink_end_cow(ip, offset, count);

	/* No COW extents?  That's easy! */
	if (ifp->if_bytes == 0)
		return 0;

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/*
	 * Start a rolling transaction to switch the mappings.  We're
	 * unlikely ever to have to remap 16T worth of single-block
	 * extents, so just cap the worst case extent count to 2^32-1.
	 * Stick a warning in just in case, and avoid 64-bit division.
	 */
	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
	if (end_fsb - offset_fsb > UINT_MAX) {
		error = -EFSCORRUPTED;
		xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(0);
		goto out;
	}
	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
			(unsigned int)(end_fsb - offset_fsb),
			XFS_DATA_FORK);
	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
			resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * In case of racing, overlapping AIO writes, no COW extents might be
	 * left by the time I/O completes for the loser of the race.  In that
	 * case we are done.
	 */
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		goto out_cancel;

	/* Walk backwards until we're out of the I/O range... */
	while (got.br_startoff + got.br_blockcount > offset_fsb) {
		del = got;
		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

		/* Extent delete may have bumped ext forward */
		if (!del.br_blockcount)
			goto prev_extent;

		/*
		 * Only remap real extents that contain data.
		 * With AIO, speculative preallocations can leak into the
		 * range we are called upon, and we need to skip them.
		 */
		if (!xfs_bmap_is_real_extent(&got))
			goto prev_extent;

		/* Unmap the old blocks in the data fork. */
		ASSERT(tp->t_firstblock == NULLFSBLOCK);
		rlen = del.br_blockcount;
		error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
		if (error)
			goto out_cancel;

		/* Trim the extent to whatever got unmapped. */
		if (rlen) {
			xfs_trim_extent(&del, del.br_startoff + rlen,
				del.br_blockcount - rlen);
		}
		trace_xfs_reflink_cow_remap(ip, &del);

		/* Free the CoW orphan record. */
		error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
				del.br_blockcount);
		if (error)
			goto out_cancel;

		/* Map the new blocks into the data fork. */
		error = xfs_bmap_map_extent(tp, ip, &del);
		if (error)
			goto out_cancel;

		/* Charge this new data fork mapping to the on-disk quota. */
		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
				(long)del.br_blockcount);

		/* Remove the mapping from the CoW fork. */
		xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

		error = xfs_defer_finish(&tp);
		if (error)
			goto out_cancel;
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
		continue;
prev_extent:
		if (!xfs_iext_prev_extent(ifp, &icur, &got))
			break;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		goto out;
	return 0;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Free leftover CoW reservations that didn't get cleaned out.
 */
int
xfs_reflink_recover_cow(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;
	int			error = 0;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_refcount_recover_cow_leftovers(mp, agno);
		if (error)
			break;
	}

	return error;
}

/*
 * Reflinking (Block) Ranges of Two Files Together
 *
 * First, ensure that the reflink flag is set on both inodes.  The flag is an
 * optimization to avoid unnecessary refcount btree lookups in the write path.
 *
 * Now we can iteratively remap the range of extents (and holes) in src to the
 * corresponding ranges in dest.  Let drange and srange denote the ranges of
 * logical blocks in dest and src touched by the reflink operation.
 *
 * While the length of drange is greater than zero,
 *    - Read src's bmbt at the start of srange ("imap")
 *    - If imap doesn't exist, make imap appear to start at the end of srange
 *      with zero length.
 *    - If imap starts before srange, advance imap to start at srange.
 *    - If imap goes beyond srange, truncate imap to end at the end of srange.
 *    - Punch (imap start - srange start + imap len) blocks from dest at
 *      offset (drange start).
 *    - If imap points to a real range of pblks,
 *         > Increase the refcount of the imap's pblks
 *         > Map imap's pblks into dest at the offset
 *           (drange start + imap start - srange start)
 *    - Advance drange and srange by (imap start - srange start + imap len)
 *
 * Finally, if the reflink made dest longer, update both the in-core and
 * on-disk file sizes.
 *
 * ASCII Art Demonstration:
 *
 * Let's say we want to reflink this source file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS (src file)
 *   <-------------------->
 *
 * into this destination file:
 *
 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
 *        <-------------------->
 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
 * Observe that the range has different logical offsets in either file.
 *
 * Consider that the first extent in the source file doesn't line up with our
 * reflink range.  Unmapping and remapping are separate operations, so we can
 * unmap more blocks from the destination file than we remap.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD---------DDDDD--DDD
 *        <------->
 *
 * Now remap the source extent into the destination file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD--SSSSSSSDDDDD--DDD
 *        <------->
 *
 * Do likewise with the second hole and extent in our range.  Holes in the
 * unmap range don't affect our operation.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *            <---->
 * --DDDDD--SSSSSSS-SSSSS-DDD
 *                 <---->
 *
 * Finally, unmap and remap part of the third extent.  This will increase the
 * size of the destination file.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *                  <----->
 * --DDDDD--SSSSSSS-SSSSS----SSS
 *                       <----->
 *
 * Once we update the destination file's i_size, we're done.
 */

/*
 * Ensure the reflink bit is set in both inodes.
 */
STATIC int
xfs_reflink_set_inode_flag(
	struct xfs_inode	*src,
	struct xfs_inode	*dest)
{
	struct xfs_mount	*mp = src->i_mount;
	int			error;
	struct xfs_trans	*tp;

	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	/* Lock both files against IO */
	if (src->i_ino == dest->i_ino)
		xfs_ilock(src, XFS_ILOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);

	if (!xfs_is_reflink_inode(src)) {
		trace_xfs_reflink_set_inode_flag(src);
		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
		xfs_ifork_init_cow(src);
	} else
		xfs_iunlock(src, XFS_ILOCK_EXCL);

	if (src->i_ino == dest->i_ino)
		goto commit_flags;

	if (!xfs_is_reflink_inode(dest)) {
		trace_xfs_reflink_set_inode_flag(dest);
		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
		xfs_ifork_init_cow(dest);
	} else
		xfs_iunlock(dest, XFS_ILOCK_EXCL);

commit_flags:
	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
	return error;
}

/*
 * Update destination inode size & cowextsize hint, if necessary.
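 * A nonzero cowextsize is copied into the destination inode and the
 * XFS_DIFLAG2_COWEXTSIZE flag is set; a newlen beyond the current size
 * updates both the in-core and on-disk sizes.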
 */
int
xfs_reflink_update_dest(
	struct xfs_inode	*dest,
	xfs_off_t		newlen,
	xfs_extlen_t		cowextsize,
	unsigned int		remap_flags)
{
	struct xfs_mount	*mp = dest->i_mount;
	struct xfs_trans	*tp;
	int			error;

	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	xfs_ilock(dest, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);

	if (newlen > i_size_read(VFS_I(dest))) {
		trace_xfs_reflink_update_inode_size(dest, newlen);
		i_size_write(VFS_I(dest), newlen);
		dest->i_d.di_size = newlen;
	}

	if (cowextsize) {
		dest->i_d.di_cowextsize = cowextsize;
		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
	}

	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
	return error;
}

/*
 * Do we have enough reserve in this AG to handle a reflink?  The refcount
 * btree already reserved all the space it needs, but the rmap btree can grow
 * infinitely, so we won't allow more reflinks when the AG is down to the
 * btree reserves.
 */
static int
xfs_reflink_ag_has_free_space(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno)
{
	struct xfs_perag	*pag;
	int			error = 0;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return 0;

	pag = xfs_perag_get(mp, agno);
	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
		error = -ENOSPC;
	xfs_perag_put(pag);
	return error;
}

/*
 * Unmap a range of blocks from a file, then map other blocks into the hole.
 * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
 * The extent irec is mapped into dest at irec->br_startoff.
 */
STATIC int
xfs_reflink_remap_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	xfs_fileoff_t		destoff,
	xfs_off_t		new_isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			real_extent = xfs_bmap_is_real_extent(irec);
	struct xfs_trans	*tp;
	unsigned int		resblks;
	struct xfs_bmbt_irec	uirec;
	xfs_filblks_t		rlen;
	xfs_filblks_t		unmap_len;
	xfs_off_t		newlen;
	int			error;

	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);

	/* No reflinking if we're low on space */
	if (real_extent) {
		error = xfs_reflink_ag_has_free_space(mp,
				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
		if (error)
			goto out;
	}

	/* Start a rolling transaction to switch the mappings */
	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* If we're not just clearing space, then do we have enough quota? */
	if (real_extent) {
		error = xfs_trans_reserve_quota_nblks(tp, ip,
				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto out_cancel;
	}

	trace_xfs_reflink_remap(ip, irec->br_startoff,
			irec->br_blockcount, irec->br_startblock);

	/*
	 * Unmap the old blocks in the data fork.
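	 * __xfs_bunmapi() works backwards, so each pass through the loop
	 * below unmaps the tail of the remaining range and then remaps
	 * just that piece.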
	 */
	rlen = unmap_len;
	while (rlen) {
		ASSERT(tp->t_firstblock == NULLFSBLOCK);
		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
		if (error)
			goto out_cancel;

		/*
		 * Trim the extent to whatever got unmapped.
		 * Remember, bunmapi works backwards.
		 */
		uirec.br_startblock = irec->br_startblock + rlen;
		uirec.br_startoff = irec->br_startoff + rlen;
		uirec.br_blockcount = unmap_len - rlen;
		unmap_len = rlen;

		/* If this isn't a real mapping, we're done. */
		if (!real_extent || uirec.br_blockcount == 0)
			goto next_extent;

		trace_xfs_reflink_remap(ip, uirec.br_startoff,
				uirec.br_blockcount, uirec.br_startblock);

		/* Update the refcount tree */
		error = xfs_refcount_increase_extent(tp, &uirec);
		if (error)
			goto out_cancel;

		/* Map the new blocks into the data fork. */
		error = xfs_bmap_map_extent(tp, ip, &uirec);
		if (error)
			goto out_cancel;

		/* Update quota accounting. */
		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
				uirec.br_blockcount);

		/* Update dest isize if needed. */
		newlen = XFS_FSB_TO_B(mp,
				uirec.br_startoff + uirec.br_blockcount);
		newlen = min_t(xfs_off_t, newlen, new_isize);
		if (newlen > i_size_read(VFS_I(ip))) {
			trace_xfs_reflink_update_inode_size(ip, newlen);
			i_size_write(VFS_I(ip), newlen);
			ip->i_d.di_size = newlen;
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

next_extent:
		/* Process all the deferred stuff. */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		goto out;
	return 0;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Iteratively remap one file's extents (and holes) to another's.
 */
int
xfs_reflink_remap_blocks(
	struct xfs_inode	*src,
	loff_t			pos_in,
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			remap_len,
	loff_t			*remapped)
{
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		srcoff;
	xfs_fileoff_t		destoff;
	xfs_filblks_t		len;
	xfs_filblks_t		range_len;
	xfs_filblks_t		remapped_len = 0;
	xfs_off_t		new_isize = pos_out + remap_len;
	int			nimaps;
	int			error = 0;

	destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
	srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
	len = XFS_B_TO_FSB(src->i_mount, remap_len);

	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
	while (len) {
		uint		lock_mode;

		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
				dest, destoff);

		/* Read extent from the source file */
		nimaps = 1;
		lock_mode = xfs_ilock_data_map_shared(src);
		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
		xfs_iunlock(src, lock_mode);
		if (error)
			break;
		ASSERT(nimaps == 1);

		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
				&imap);

		/* Translate imap into the destination file. */
		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
		imap.br_startoff += destoff - srcoff;

		/*
		 * Clear dest from destoff to the end of imap and map it in.
		 */
		error = xfs_reflink_remap_extent(dest, &imap, destoff,
				new_isize);
		if (error)
			break;

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Advance drange/srange */
		srcoff += range_len;
		destoff += range_len;
		len -= range_len;
		remapped_len += range_len;
	}

	if (error)
		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	*remapped = min_t(loff_t, remap_len,
			XFS_FSB_TO_B(src->i_mount, remapped_len));
	return error;
}

/*
 * Grab the exclusive iolock for a data copy from src to dest, making
 * sure to abide by the vfs locking order (lowest pointer value goes first)
 * and breaking the pnfs layout leases on dest before proceeding.  The loop
 * is needed because we cannot call the blocking break_layout() with the
 * src iolock held, and therefore have to back out both locks.
 */
static int
xfs_iolock_two_inodes_and_break_layout(
	struct inode		*src,
	struct inode		*dest)
{
	int			error;

retry:
	if (src < dest) {
		inode_lock_shared(src);
		inode_lock_nested(dest, I_MUTEX_NONDIR2);
	} else {
		/* src >= dest */
		inode_lock(dest);
	}

	error = break_layout(dest, false);
	if (error == -EWOULDBLOCK) {
		inode_unlock(dest);
		if (src < dest)
			inode_unlock_shared(src);
		error = break_layout(dest, true);
		if (error)
			return error;
		goto retry;
	}
	if (error) {
		inode_unlock(dest);
		if (src < dest)
			inode_unlock_shared(src);
		return error;
	}
	if (src > dest)
		inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
	return 0;
}

/* Unlock both inodes after they've been prepped for a range clone. */
void
xfs_reflink_remap_unlock(
	struct file		*file_in,
	struct file		*file_out)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	bool			same_inode = (inode_in == inode_out);

	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
	if (!same_inode)
		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
	inode_unlock(inode_out);
	if (!same_inode)
		inode_unlock_shared(inode_in);
}

/*
 * If we're reflinking to a point past the destination file's EOF, we must
 * zero any speculative post-EOF preallocations that sit between the old EOF
 * and the destination file offset.
 */
static int
xfs_reflink_zero_posteof(
	struct xfs_inode	*ip,
	loff_t			pos)
{
	loff_t			isize = i_size_read(VFS_I(ip));

	if (pos <= isize)
		return 0;

	trace_xfs_zero_eof(ip, isize, pos - isize);
	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
			&xfs_iomap_ops);
}

/*
 * Prepare two files for range cloning.  Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken.  This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it
 * hasn't checked that the bytes beyond EOF physically match.
 * Hence we cannot use the EOF block in the source dedupe range because it's
 * not a complete block match, and can therefore introduce corruption into
 * the file that has its block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to
 * be "block aligned" for the purposes of cloning entire files.  However, if
 * the source file range includes the EOF block and it lands within the
 * existing EOF of the destination file, then we can expose stale data from
 * beyond the source file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have to
 * check these cases ourselves.  For dedupe, we can simply round the length
 * to dedupe down to the previous whole block and ignore the partial EOF
 * block.  While this means we can't dedupe the last block of a file, this is
 * an acceptable tradeoff for simplicity of implementation.
 *
 * For cloning, we want to share the partial EOF block if it is also the new
 * EOF block of the destination file.  If the partial EOF block lies inside
 * the existing destination EOF, then we have to abort the clone to avoid
 * exposing stale data in the destination file.  Hence we reject these clone
 * attempts with -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	bool			same_inode = (inode_in == inode_out);
	ssize_t			ret;

	/* Lock both files against IO */
	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
	if (ret)
		return ret;
	if (same_inode)
		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
				XFS_MMAPLOCK_EXCL);

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data for now. */
	if (IS_DAX(inode_in) || IS_DAX(inode_out))
		goto out_unlock;

	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
			len, remap_flags);
	if (ret < 0 || *len == 0)
		goto out_unlock;

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out.  In that case, we need to extend the flush and unmap to
	 * cover from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	return 1;
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	return ret;
}

/*
 * The user wants to preemptively CoW all shared blocks in this file,
 * which enables us to turn off the reflink flag.  Iterate all
 * extents which are not prealloc/delalloc to see which ranges are
 * mentioned in the refcount tree, then read those blocks into the
 * pagecache, dirty them, fsync them back out, and then we can update
 * the inode flag.  What happens if we run out of memory? :)
 */
STATIC int
xfs_reflink_dirty_extents(
	struct xfs_inode	*ip,
	xfs_fileoff_t		fbno,
	xfs_filblks_t		end,
	xfs_off_t		isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		rbno;
	xfs_extlen_t		rlen;
	xfs_off_t		fpos;
	xfs_off_t		flen;
	struct xfs_bmbt_irec	map[2];
	int			nmaps;
	int			error = 0;

	while (end - fbno > 0) {
		nmaps = 1;
		/*
		 * Look for extents in the file.  Skip holes, delalloc, or
		 * unwritten extents; they can't be reflinked.
		 */
		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
		if (error)
			goto out;
		if (nmaps == 0)
			break;
		if (!xfs_bmap_is_real_extent(&map[0]))
			goto next;

		map[1] = map[0];
		while (map[1].br_blockcount) {
			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
			aglen = map[1].br_blockcount;

			error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
					aglen, &rbno, &rlen, true);
			if (error)
				goto out;
			if (rbno == NULLAGBLOCK)
				break;

			/* Dirty the pages */
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
					(rbno - agbno));
			flen = XFS_FSB_TO_B(mp, rlen);
			if (fpos + flen > isize)
				flen = isize - fpos;
			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
					&xfs_iomap_ops);
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			if (error)
				goto out;

			map[1].br_blockcount -= (rbno - agbno + rlen);
			map[1].br_startoff += (rbno - agbno + rlen);
			map[1].br_startblock += (rbno - agbno + rlen);
		}

next:
		fbno = map[0].br_startoff + map[0].br_blockcount;
	}
out:
	return error;
}

/*
 * Does this inode need the reflink flag?
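 *
 * Walk the data fork mappings and probe the refcount btree for each real
 * written extent; *has_shared is set if any of them are still shared.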
 */
int
xfs_reflink_inode_has_shared_extents(
	struct xfs_trans		*tp,
	struct xfs_inode		*ip,
	bool				*has_shared)
{
	struct xfs_bmbt_irec		got;
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_ifork		*ifp;
	xfs_agnumber_t			agno;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agblock_t			rbno;
	xfs_extlen_t			rlen;
	struct xfs_iext_cursor		icur;
	bool				found;
	int				error;

	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
		if (error)
			return error;
	}

	*has_shared = false;
	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
	while (found) {
		if (isnullstartblock(got.br_startblock) ||
		    got.br_state != XFS_EXT_NORM)
			goto next;
		agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
		aglen = got.br_blockcount;

		error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				&rbno, &rlen, false);
		if (error)
			return error;
		/* Is there still a shared block here? */
		if (rbno != NULLAGBLOCK) {
			*has_shared = true;
			return 0;
		}
next:
		found = xfs_iext_next_extent(ifp, &icur, &got);
	}

	return 0;
}

/*
 * Clear the inode reflink flag if there are no shared extents.
 *
 * The caller is responsible for joining the inode to the transaction passed
 * in.  The inode will be joined to the transaction that is returned to the
 * caller.
 */
int
xfs_reflink_clear_inode_flag(
	struct xfs_inode	*ip,
	struct xfs_trans	**tpp)
{
	bool			needs_flag;
	int			error = 0;

	ASSERT(xfs_is_reflink_inode(ip));

	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
	if (error || needs_flag)
		return error;

	/*
	 * We didn't find any shared blocks so turn off the reflink flag.
	 * First, get rid of any leftover CoW mappings.
	 */
	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
	if (error)
		return error;

	/* Clear the inode flag. */
	trace_xfs_reflink_unset_inode_flag(ip);
	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_inode_clear_cowblocks_tag(ip);
	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

	return error;
}

/*
 * Clear the inode reflink flag if there are no shared extents and the size
 * hasn't changed.
 */
STATIC int
xfs_reflink_try_clear_inode_flag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error = 0;

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_reflink_clear_inode_flag(ip, &tp);
	if (error)
		goto cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
cancel:
	xfs_trans_cancel(tp);
out:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Pre-COW all shared blocks within a given byte range of a file and turn off
 * the reflink flag if we unshare all of the file's blocks.
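 *
 * This works by dirtying the page cache for each shared extent (see
 * xfs_reflink_dirty_extents() above) and flushing the dirty data back out
 * before trying to clear the flag.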
 */
int
xfs_reflink_unshare(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		fbno;
	xfs_filblks_t		end;
	xfs_off_t		isize;
	int			error;

	if (!xfs_is_reflink_inode(ip))
		return 0;

	trace_xfs_reflink_unshare(ip, offset, len);

	inode_dio_wait(VFS_I(ip));

	/* Try to CoW the selected ranges */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	fbno = XFS_B_TO_FSBT(mp, offset);
	isize = i_size_read(VFS_I(ip));
	end = XFS_B_TO_FSB(mp, offset + len);
	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
	if (error)
		goto out_unlock;
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/* Wait for the IO to finish */
	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	if (error)
		goto out;

	/* Turn off the reflink flag if possible. */
	error = xfs_reflink_try_clear_inode_flag(ip);
	if (error)
		goto out;

	return 0;

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
	return error;
}