1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_da_format.h" 15 #include "xfs_da_btree.h" 16 #include "xfs_inode.h" 17 #include "xfs_trans.h" 18 #include "xfs_inode_item.h" 19 #include "xfs_bmap.h" 20 #include "xfs_bmap_util.h" 21 #include "xfs_error.h" 22 #include "xfs_dir2.h" 23 #include "xfs_dir2_priv.h" 24 #include "xfs_ioctl.h" 25 #include "xfs_trace.h" 26 #include "xfs_log.h" 27 #include "xfs_icache.h" 28 #include "xfs_pnfs.h" 29 #include "xfs_btree.h" 30 #include "xfs_refcount_btree.h" 31 #include "xfs_refcount.h" 32 #include "xfs_bmap_btree.h" 33 #include "xfs_trans_space.h" 34 #include "xfs_bit.h" 35 #include "xfs_alloc.h" 36 #include "xfs_quota_defs.h" 37 #include "xfs_quota.h" 38 #include "xfs_reflink.h" 39 #include "xfs_iomap.h" 40 #include "xfs_rmap_btree.h" 41 #include "xfs_sb.h" 42 #include "xfs_ag_resv.h" 43 44 /* 45 * Copy on Write of Shared Blocks 46 * 47 * XFS must preserve "the usual" file semantics even when two files share 48 * the same physical blocks. This means that a write to one file must not 49 * alter the blocks in a different file; the way that we'll do that is 50 * through the use of a copy-on-write mechanism. At a high level, that 51 * means that when we want to write to a shared block, we allocate a new 52 * block, write the data to the new block, and if that succeeds we map the 53 * new block into the file. 54 * 55 * XFS provides a "delayed allocation" mechanism that defers the allocation 56 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as 57 * possible. This reduces fragmentation by enabling the filesystem to ask 58 * for bigger chunks less often, which is exactly what we want for CoW. 
59 * 60 * The delalloc mechanism begins when the kernel wants to make a block 61 * writable (write_begin or page_mkwrite). If the offset is not mapped, we 62 * create a delalloc mapping, which is a regular in-core extent, but without 63 * a real startblock. (For delalloc mappings, the startblock encodes both 64 * a flag that this is a delalloc mapping, and a worst-case estimate of how 65 * many blocks might be required to put the mapping into the BMBT.) delalloc 66 * mappings are a reservation against the free space in the filesystem; 67 * adjacent mappings can also be combined into fewer larger mappings. 68 * 69 * As an optimization, the CoW extent size hint (cowextsz) creates 70 * outsized aligned delalloc reservations in the hope of landing out of 71 * order nearby CoW writes in a single extent on disk, thereby reducing 72 * fragmentation and improving future performance. 73 * 74 * D: --RRRRRRSSSRRRRRRRR--- (data fork) 75 * C: ------DDDDDDD--------- (CoW fork) 76 * 77 * When dirty pages are being written out (typically in writepage), the 78 * delalloc reservations are converted into unwritten mappings by 79 * allocating blocks and replacing the delalloc mapping with real ones. 80 * A delalloc mapping can be replaced by several unwritten ones if the 81 * free space is fragmented. 82 * 83 * D: --RRRRRRSSSRRRRRRRR--- 84 * C: ------UUUUUUU--------- 85 * 86 * We want to adapt the delalloc mechanism for copy-on-write, since the 87 * write paths are similar. The first two steps (creating the reservation 88 * and allocating the blocks) are exactly the same as delalloc except that 89 * the mappings must be stored in a separate CoW fork because we do not want 90 * to disturb the mapping in the data fork until we're sure that the write 91 * succeeded. IO completion in this case is the process of removing the old 92 * mapping from the data fork and moving the new mapping from the CoW fork to 93 * the data fork. This will be discussed shortly. 
94 * 95 * For now, unaligned directio writes will be bounced back to the page cache. 96 * Block-aligned directio writes will use the same mechanism as buffered 97 * writes. 98 * 99 * Just prior to submitting the actual disk write requests, we convert 100 * the extents representing the range of the file actually being written 101 * (as opposed to extra pieces created for the cowextsize hint) to real 102 * extents. This will become important in the next step: 103 * 104 * D: --RRRRRRSSSRRRRRRRR--- 105 * C: ------UUrrUUU--------- 106 * 107 * CoW remapping must be done after the data block write completes, 108 * because we don't want to destroy the old data fork map until we're sure 109 * the new block has been written. Since the new mappings are kept in a 110 * separate fork, we can simply iterate these mappings to find the ones 111 * that cover the file blocks that we just CoW'd. For each extent, simply 112 * unmap the corresponding range in the data fork, map the new range into 113 * the data fork, and remove the extent from the CoW fork. Because of 114 * the presence of the cowextsize hint, however, we must be careful 115 * only to remap the blocks that we've actually written out -- we must 116 * never remap delalloc reservations nor CoW staging blocks that have 117 * yet to be written. This corresponds exactly to the real extents in 118 * the CoW fork: 119 * 120 * D: --RRRRRRrrSRRRRRRRR--- 121 * C: ------UU--UUU--------- 122 * 123 * Since the remapping operation can be applied to an arbitrary file 124 * range, we record the need for the remap step as a flag in the ioend 125 * instead of declaring a new IO type. This is required for direct io 126 * because we only have ioend for the whole dio, and we have to be able to 127 * remember the presence of unwritten blocks and CoW blocks with a single 128 * ioend structure. Better yet, the more ground we can cover with one 129 * ioend, the better. 
130 */ 131 132 /* 133 * Given an AG extent, find the lowest-numbered run of shared blocks 134 * within that range and return the range in fbno/flen. If 135 * find_end_of_shared is true, return the longest contiguous extent of 136 * shared blocks. If there are no shared extents, fbno and flen will 137 * be set to NULLAGBLOCK and 0, respectively. 138 */ 139 int 140 xfs_reflink_find_shared( 141 struct xfs_mount *mp, 142 struct xfs_trans *tp, 143 xfs_agnumber_t agno, 144 xfs_agblock_t agbno, 145 xfs_extlen_t aglen, 146 xfs_agblock_t *fbno, 147 xfs_extlen_t *flen, 148 bool find_end_of_shared) 149 { 150 struct xfs_buf *agbp; 151 struct xfs_btree_cur *cur; 152 int error; 153 154 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); 155 if (error) 156 return error; 157 if (!agbp) 158 return -ENOMEM; 159 160 cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); 161 162 error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, 163 find_end_of_shared); 164 165 xfs_btree_del_cursor(cur, error); 166 167 xfs_trans_brelse(tp, agbp); 168 return error; 169 } 170 171 /* 172 * Trim the mapping to the next block where there's a change in the 173 * shared/unshared status. More specifically, this means that we 174 * find the lowest-numbered extent of shared blocks that coincides with 175 * the given block mapping. If the shared extent overlaps the start of 176 * the mapping, trim the mapping to the end of the shared extent. If 177 * the shared region intersects the mapping, trim the mapping to the 178 * start of the shared extent. If there are no shared regions that 179 * overlap, just return the original extent. 
180 */ 181 int 182 xfs_reflink_trim_around_shared( 183 struct xfs_inode *ip, 184 struct xfs_bmbt_irec *irec, 185 bool *shared) 186 { 187 xfs_agnumber_t agno; 188 xfs_agblock_t agbno; 189 xfs_extlen_t aglen; 190 xfs_agblock_t fbno; 191 xfs_extlen_t flen; 192 int error = 0; 193 194 /* Holes, unwritten, and delalloc extents cannot be shared */ 195 if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { 196 *shared = false; 197 return 0; 198 } 199 200 trace_xfs_reflink_trim_around_shared(ip, irec); 201 202 agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); 203 agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); 204 aglen = irec->br_blockcount; 205 206 error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, 207 aglen, &fbno, &flen, true); 208 if (error) 209 return error; 210 211 *shared = false; 212 if (fbno == NULLAGBLOCK) { 213 /* No shared blocks at all. */ 214 return 0; 215 } else if (fbno == agbno) { 216 /* 217 * The start of this extent is shared. Truncate the 218 * mapping at the end of the shared region so that a 219 * subsequent iteration starts at the start of the 220 * unshared region. 221 */ 222 irec->br_blockcount = flen; 223 *shared = true; 224 return 0; 225 } else { 226 /* 227 * There's a shared extent midway through this extent. 228 * Truncate the mapping at the start of the shared 229 * extent so that a subsequent iteration starts at the 230 * start of the shared region. 231 */ 232 irec->br_blockcount = fbno - agbno; 233 return 0; 234 } 235 } 236 237 /* 238 * Trim the passed in imap to the next shared/unshared extent boundary, and 239 * if imap->br_startoff points to a shared extent reserve space for it in the 240 * COW fork. 241 * 242 * Note that imap will always contain the block numbers for the existing blocks 243 * in the data fork, as the upper layers need them for read-modify-write 244 * operations. 
245 */ 246 int 247 xfs_reflink_reserve_cow( 248 struct xfs_inode *ip, 249 struct xfs_bmbt_irec *imap) 250 { 251 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 252 struct xfs_bmbt_irec got; 253 int error = 0; 254 bool eof = false; 255 struct xfs_iext_cursor icur; 256 bool shared; 257 258 /* 259 * Search the COW fork extent list first. This serves two purposes: 260 * first this implement the speculative preallocation using cowextisze, 261 * so that we also unshared block adjacent to shared blocks instead 262 * of just the shared blocks themselves. Second the lookup in the 263 * extent list is generally faster than going out to the shared extent 264 * tree. 265 */ 266 267 if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) 268 eof = true; 269 if (!eof && got.br_startoff <= imap->br_startoff) { 270 trace_xfs_reflink_cow_found(ip, imap); 271 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); 272 return 0; 273 } 274 275 /* Trim the mapping to the nearest shared extent boundary. */ 276 error = xfs_reflink_trim_around_shared(ip, imap, &shared); 277 if (error) 278 return error; 279 280 /* Not shared? Just report the (potentially capped) extent. */ 281 if (!shared) 282 return 0; 283 284 /* 285 * Fork all the shared blocks from our write offset until the end of 286 * the extent. 287 */ 288 error = xfs_qm_dqattach_locked(ip, false); 289 if (error) 290 return error; 291 292 error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, 293 imap->br_blockcount, 0, &got, &icur, eof); 294 if (error == -ENOSPC || error == -EDQUOT) 295 trace_xfs_reflink_cow_enospc(ip, imap); 296 if (error) 297 return error; 298 299 trace_xfs_reflink_cow_alloc(ip, &got); 300 return 0; 301 } 302 303 /* Convert part of an unwritten CoW extent to a real one. 
*/ 304 STATIC int 305 xfs_reflink_convert_cow_extent( 306 struct xfs_inode *ip, 307 struct xfs_bmbt_irec *imap, 308 xfs_fileoff_t offset_fsb, 309 xfs_filblks_t count_fsb) 310 { 311 int nimaps = 1; 312 313 if (imap->br_state == XFS_EXT_NORM) 314 return 0; 315 316 xfs_trim_extent(imap, offset_fsb, count_fsb); 317 trace_xfs_reflink_convert_cow(ip, imap); 318 if (imap->br_blockcount == 0) 319 return 0; 320 return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, 321 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, 322 &nimaps); 323 } 324 325 /* Convert all of the unwritten CoW extents in a file's range to real ones. */ 326 int 327 xfs_reflink_convert_cow( 328 struct xfs_inode *ip, 329 xfs_off_t offset, 330 xfs_off_t count) 331 { 332 struct xfs_mount *mp = ip->i_mount; 333 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 334 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 335 xfs_filblks_t count_fsb = end_fsb - offset_fsb; 336 struct xfs_bmbt_irec imap; 337 int nimaps = 1, error = 0; 338 339 ASSERT(count != 0); 340 341 xfs_ilock(ip, XFS_ILOCK_EXCL); 342 error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, 343 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | 344 XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); 345 xfs_iunlock(ip, XFS_ILOCK_EXCL); 346 return error; 347 } 348 349 /* 350 * Find the extent that maps the given range in the COW fork. Even if the extent 351 * is not shared we might have a preallocation for it in the COW fork. If so we 352 * use it that rather than trigger a new allocation. 
353 */ 354 static int 355 xfs_find_trim_cow_extent( 356 struct xfs_inode *ip, 357 struct xfs_bmbt_irec *imap, 358 bool *shared, 359 bool *found) 360 { 361 xfs_fileoff_t offset_fsb = imap->br_startoff; 362 xfs_filblks_t count_fsb = imap->br_blockcount; 363 struct xfs_iext_cursor icur; 364 struct xfs_bmbt_irec got; 365 366 *found = false; 367 368 /* 369 * If we don't find an overlapping extent, trim the range we need to 370 * allocate to fit the hole we found. 371 */ 372 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) 373 got.br_startoff = offset_fsb + count_fsb; 374 if (got.br_startoff > offset_fsb) { 375 xfs_trim_extent(imap, imap->br_startoff, 376 got.br_startoff - imap->br_startoff); 377 return xfs_reflink_trim_around_shared(ip, imap, shared); 378 } 379 380 *shared = true; 381 if (isnullstartblock(got.br_startblock)) { 382 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); 383 return 0; 384 } 385 386 /* real extent found - no need to allocate */ 387 xfs_trim_extent(&got, offset_fsb, count_fsb); 388 *imap = got; 389 *found = true; 390 return 0; 391 } 392 393 /* Allocate all CoW reservations covering a range of blocks in a file. 
*/ 394 int 395 xfs_reflink_allocate_cow( 396 struct xfs_inode *ip, 397 struct xfs_bmbt_irec *imap, 398 bool *shared, 399 uint *lockmode) 400 { 401 struct xfs_mount *mp = ip->i_mount; 402 xfs_fileoff_t offset_fsb = imap->br_startoff; 403 xfs_filblks_t count_fsb = imap->br_blockcount; 404 struct xfs_trans *tp; 405 int nimaps, error = 0; 406 bool found; 407 xfs_filblks_t resaligned; 408 xfs_extlen_t resblks = 0; 409 410 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 411 ASSERT(xfs_is_reflink_inode(ip)); 412 413 error = xfs_find_trim_cow_extent(ip, imap, shared, &found); 414 if (error || !*shared) 415 return error; 416 if (found) 417 goto convert; 418 419 resaligned = xfs_aligned_fsb_count(imap->br_startoff, 420 imap->br_blockcount, xfs_get_cowextsz_hint(ip)); 421 resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 422 423 xfs_iunlock(ip, *lockmode); 424 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); 425 *lockmode = XFS_ILOCK_EXCL; 426 xfs_ilock(ip, *lockmode); 427 428 if (error) 429 return error; 430 431 error = xfs_qm_dqattach_locked(ip, false); 432 if (error) 433 goto out_trans_cancel; 434 435 /* 436 * Check for an overlapping extent again now that we dropped the ilock. 437 */ 438 error = xfs_find_trim_cow_extent(ip, imap, shared, &found); 439 if (error || !*shared) 440 goto out_trans_cancel; 441 if (found) { 442 xfs_trans_cancel(tp); 443 goto convert; 444 } 445 446 error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, 447 XFS_QMOPT_RES_REGBLKS); 448 if (error) 449 goto out_trans_cancel; 450 451 xfs_trans_ijoin(tp, ip, 0); 452 453 /* Allocate the entire reservation as unwritten blocks. 
*/ 454 nimaps = 1; 455 error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, 456 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 457 resblks, imap, &nimaps); 458 if (error) 459 goto out_unreserve; 460 461 xfs_inode_set_cowblocks_tag(ip); 462 error = xfs_trans_commit(tp); 463 if (error) 464 return error; 465 466 /* 467 * Allocation succeeded but the requested range was not even partially 468 * satisfied? Bail out! 469 */ 470 if (nimaps == 0) 471 return -ENOSPC; 472 convert: 473 return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); 474 475 out_unreserve: 476 xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, 477 XFS_QMOPT_RES_REGBLKS); 478 out_trans_cancel: 479 xfs_trans_cancel(tp); 480 return error; 481 } 482 483 /* 484 * Cancel CoW reservations for some block range of an inode. 485 * 486 * If cancel_real is true this function cancels all COW fork extents for the 487 * inode; if cancel_real is false, real extents are not cleared. 488 * 489 * Caller must have already joined the inode to the current transaction. The 490 * inode will be joined to the transaction returned to the caller. 491 */ 492 int 493 xfs_reflink_cancel_cow_blocks( 494 struct xfs_inode *ip, 495 struct xfs_trans **tpp, 496 xfs_fileoff_t offset_fsb, 497 xfs_fileoff_t end_fsb, 498 bool cancel_real) 499 { 500 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 501 struct xfs_bmbt_irec got, del; 502 struct xfs_iext_cursor icur; 503 int error = 0; 504 505 if (!xfs_inode_has_cow_data(ip)) 506 return 0; 507 if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) 508 return 0; 509 510 /* Walk backwards until we're out of the I/O range... 
*/ 511 while (got.br_startoff + got.br_blockcount > offset_fsb) { 512 del = got; 513 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); 514 515 /* Extent delete may have bumped ext forward */ 516 if (!del.br_blockcount) { 517 xfs_iext_prev(ifp, &icur); 518 goto next_extent; 519 } 520 521 trace_xfs_reflink_cancel_cow(ip, &del); 522 523 if (isnullstartblock(del.br_startblock)) { 524 error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, 525 &icur, &got, &del); 526 if (error) 527 break; 528 } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { 529 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 530 531 /* Free the CoW orphan record. */ 532 error = xfs_refcount_free_cow_extent(*tpp, 533 del.br_startblock, del.br_blockcount); 534 if (error) 535 break; 536 537 xfs_bmap_add_free(*tpp, del.br_startblock, 538 del.br_blockcount, NULL); 539 540 /* Roll the transaction */ 541 error = xfs_defer_finish(tpp); 542 if (error) 543 break; 544 545 /* Remove the mapping from the CoW fork. */ 546 xfs_bmap_del_extent_cow(ip, &icur, &got, &del); 547 548 /* Remove the quota reservation */ 549 error = xfs_trans_reserve_quota_nblks(NULL, ip, 550 -(long)del.br_blockcount, 0, 551 XFS_QMOPT_RES_REGBLKS); 552 if (error) 553 break; 554 } else { 555 /* Didn't do anything, push cursor back. */ 556 xfs_iext_prev(ifp, &icur); 557 } 558 next_extent: 559 if (!xfs_iext_get_extent(ifp, &icur, &got)) 560 break; 561 } 562 563 /* clear tag if cow fork is emptied */ 564 if (!ifp->if_bytes) 565 xfs_inode_clear_cowblocks_tag(ip); 566 return error; 567 } 568 569 /* 570 * Cancel CoW reservations for some byte range of an inode. 571 * 572 * If cancel_real is true this function cancels all COW fork extents for the 573 * inode; if cancel_real is false, real extents are not cleared. 
574 */ 575 int 576 xfs_reflink_cancel_cow_range( 577 struct xfs_inode *ip, 578 xfs_off_t offset, 579 xfs_off_t count, 580 bool cancel_real) 581 { 582 struct xfs_trans *tp; 583 xfs_fileoff_t offset_fsb; 584 xfs_fileoff_t end_fsb; 585 int error; 586 587 trace_xfs_reflink_cancel_cow_range(ip, offset, count); 588 ASSERT(xfs_is_reflink_inode(ip)); 589 590 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 591 if (count == NULLFILEOFF) 592 end_fsb = NULLFILEOFF; 593 else 594 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 595 596 /* Start a rolling transaction to remove the mappings */ 597 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 598 0, 0, XFS_TRANS_NOFS, &tp); 599 if (error) 600 goto out; 601 602 xfs_ilock(ip, XFS_ILOCK_EXCL); 603 xfs_trans_ijoin(tp, ip, 0); 604 605 /* Scrape out the old CoW reservations */ 606 error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, 607 cancel_real); 608 if (error) 609 goto out_cancel; 610 611 error = xfs_trans_commit(tp); 612 613 xfs_iunlock(ip, XFS_ILOCK_EXCL); 614 return error; 615 616 out_cancel: 617 xfs_trans_cancel(tp); 618 xfs_iunlock(ip, XFS_ILOCK_EXCL); 619 out: 620 trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); 621 return error; 622 } 623 624 /* 625 * Remap parts of a file's data fork after a successful CoW. 626 */ 627 int 628 xfs_reflink_end_cow( 629 struct xfs_inode *ip, 630 xfs_off_t offset, 631 xfs_off_t count) 632 { 633 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 634 struct xfs_bmbt_irec got, del; 635 struct xfs_trans *tp; 636 xfs_fileoff_t offset_fsb; 637 xfs_fileoff_t end_fsb; 638 int error; 639 unsigned int resblks; 640 xfs_filblks_t rlen; 641 struct xfs_iext_cursor icur; 642 643 trace_xfs_reflink_end_cow(ip, offset, count); 644 645 /* No COW extents? That's easy! 
*/ 646 if (ifp->if_bytes == 0) 647 return 0; 648 649 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 650 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 651 652 /* 653 * Start a rolling transaction to switch the mappings. We're 654 * unlikely ever to have to remap 16T worth of single-block 655 * extents, so just cap the worst case extent count to 2^32-1. 656 * Stick a warning in just in case, and avoid 64-bit division. 657 */ 658 BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); 659 if (end_fsb - offset_fsb > UINT_MAX) { 660 error = -EFSCORRUPTED; 661 xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); 662 ASSERT(0); 663 goto out; 664 } 665 resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, 666 (unsigned int)(end_fsb - offset_fsb), 667 XFS_DATA_FORK); 668 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 669 resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); 670 if (error) 671 goto out; 672 673 xfs_ilock(ip, XFS_ILOCK_EXCL); 674 xfs_trans_ijoin(tp, ip, 0); 675 676 /* 677 * In case of racing, overlapping AIO writes no COW extents might be 678 * left by the time I/O completes for the loser of the race. In that 679 * case we are done. 680 */ 681 if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) 682 goto out_cancel; 683 684 /* Walk backwards until we're out of the I/O range... */ 685 while (got.br_startoff + got.br_blockcount > offset_fsb) { 686 del = got; 687 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); 688 689 /* Extent delete may have bumped ext forward */ 690 if (!del.br_blockcount) 691 goto prev_extent; 692 693 /* 694 * Only remap real extent that contain data. With AIO 695 * speculatively preallocations can leak into the range we 696 * are called upon, and we need to skip them. 697 */ 698 if (!xfs_bmap_is_real_extent(&got)) 699 goto prev_extent; 700 701 /* Unmap the old blocks in the data fork. 
*/ 702 ASSERT(tp->t_firstblock == NULLFSBLOCK); 703 rlen = del.br_blockcount; 704 error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); 705 if (error) 706 goto out_cancel; 707 708 /* Trim the extent to whatever got unmapped. */ 709 if (rlen) { 710 xfs_trim_extent(&del, del.br_startoff + rlen, 711 del.br_blockcount - rlen); 712 } 713 trace_xfs_reflink_cow_remap(ip, &del); 714 715 /* Free the CoW orphan record. */ 716 error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 717 del.br_blockcount); 718 if (error) 719 goto out_cancel; 720 721 /* Map the new blocks into the data fork. */ 722 error = xfs_bmap_map_extent(tp, ip, &del); 723 if (error) 724 goto out_cancel; 725 726 /* Charge this new data fork mapping to the on-disk quota. */ 727 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 728 (long)del.br_blockcount); 729 730 /* Remove the mapping from the CoW fork. */ 731 xfs_bmap_del_extent_cow(ip, &icur, &got, &del); 732 733 error = xfs_defer_finish(&tp); 734 if (error) 735 goto out_cancel; 736 if (!xfs_iext_get_extent(ifp, &icur, &got)) 737 break; 738 continue; 739 prev_extent: 740 if (!xfs_iext_prev_extent(ifp, &icur, &got)) 741 break; 742 } 743 744 error = xfs_trans_commit(tp); 745 xfs_iunlock(ip, XFS_ILOCK_EXCL); 746 if (error) 747 goto out; 748 return 0; 749 750 out_cancel: 751 xfs_trans_cancel(tp); 752 xfs_iunlock(ip, XFS_ILOCK_EXCL); 753 out: 754 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); 755 return error; 756 } 757 758 /* 759 * Free leftover CoW reservations that didn't get cleaned out. 
760 */ 761 int 762 xfs_reflink_recover_cow( 763 struct xfs_mount *mp) 764 { 765 xfs_agnumber_t agno; 766 int error = 0; 767 768 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 769 return 0; 770 771 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 772 error = xfs_refcount_recover_cow_leftovers(mp, agno); 773 if (error) 774 break; 775 } 776 777 return error; 778 } 779 780 /* 781 * Reflinking (Block) Ranges of Two Files Together 782 * 783 * First, ensure that the reflink flag is set on both inodes. The flag is an 784 * optimization to avoid unnecessary refcount btree lookups in the write path. 785 * 786 * Now we can iteratively remap the range of extents (and holes) in src to the 787 * corresponding ranges in dest. Let drange and srange denote the ranges of 788 * logical blocks in dest and src touched by the reflink operation. 789 * 790 * While the length of drange is greater than zero, 791 * - Read src's bmbt at the start of srange ("imap") 792 * - If imap doesn't exist, make imap appear to start at the end of srange 793 * with zero length. 794 * - If imap starts before srange, advance imap to start at srange. 795 * - If imap goes beyond srange, truncate imap to end at the end of srange. 796 * - Punch (imap start - srange start + imap len) blocks from dest at 797 * offset (drange start). 798 * - If imap points to a real range of pblks, 799 * > Increase the refcount of the imap's pblks 800 * > Map imap's pblks into dest at the offset 801 * (drange start + imap start - srange start) 802 * - Advance drange and srange by (imap start - srange start + imap len) 803 * 804 * Finally, if the reflink made dest longer, update both the in-core and 805 * on-disk file sizes. 
806 * 807 * ASCII Art Demonstration: 808 * 809 * Let's say we want to reflink this source file: 810 * 811 * ----SSSSSSS-SSSSS----SSSSSS (src file) 812 * <--------------------> 813 * 814 * into this destination file: 815 * 816 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) 817 * <--------------------> 818 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. 819 * Observe that the range has different logical offsets in either file. 820 * 821 * Consider that the first extent in the source file doesn't line up with our 822 * reflink range. Unmapping and remapping are separate operations, so we can 823 * unmap more blocks from the destination file than we remap. 824 * 825 * ----SSSSSSS-SSSSS----SSSSSS 826 * <-------> 827 * --DDDDD---------DDDDD--DDD 828 * <-------> 829 * 830 * Now remap the source extent into the destination file: 831 * 832 * ----SSSSSSS-SSSSS----SSSSSS 833 * <-------> 834 * --DDDDD--SSSSSSSDDDDD--DDD 835 * <-------> 836 * 837 * Do likewise with the second hole and extent in our range. Holes in the 838 * unmap range don't affect our operation. 839 * 840 * ----SSSSSSS-SSSSS----SSSSSS 841 * <----> 842 * --DDDDD--SSSSSSS-SSSSS-DDD 843 * <----> 844 * 845 * Finally, unmap and remap part of the third extent. This will increase the 846 * size of the destination file. 847 * 848 * ----SSSSSSS-SSSSS----SSSSSS 849 * <-----> 850 * --DDDDD--SSSSSSS-SSSSS----SSS 851 * <-----> 852 * 853 * Once we update the destination file's i_size, we're done. 854 */ 855 856 /* 857 * Ensure the reflink bit is set in both inodes. 
858 */ 859 STATIC int 860 xfs_reflink_set_inode_flag( 861 struct xfs_inode *src, 862 struct xfs_inode *dest) 863 { 864 struct xfs_mount *mp = src->i_mount; 865 int error; 866 struct xfs_trans *tp; 867 868 if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) 869 return 0; 870 871 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 872 if (error) 873 goto out_error; 874 875 /* Lock both files against IO */ 876 if (src->i_ino == dest->i_ino) 877 xfs_ilock(src, XFS_ILOCK_EXCL); 878 else 879 xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL); 880 881 if (!xfs_is_reflink_inode(src)) { 882 trace_xfs_reflink_set_inode_flag(src); 883 xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); 884 src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 885 xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); 886 xfs_ifork_init_cow(src); 887 } else 888 xfs_iunlock(src, XFS_ILOCK_EXCL); 889 890 if (src->i_ino == dest->i_ino) 891 goto commit_flags; 892 893 if (!xfs_is_reflink_inode(dest)) { 894 trace_xfs_reflink_set_inode_flag(dest); 895 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 896 dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 897 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 898 xfs_ifork_init_cow(dest); 899 } else 900 xfs_iunlock(dest, XFS_ILOCK_EXCL); 901 902 commit_flags: 903 error = xfs_trans_commit(tp); 904 if (error) 905 goto out_error; 906 return error; 907 908 out_error: 909 trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); 910 return error; 911 } 912 913 /* 914 * Update destination inode size & cowextsize hint, if necessary. 
915 */ 916 int 917 xfs_reflink_update_dest( 918 struct xfs_inode *dest, 919 xfs_off_t newlen, 920 xfs_extlen_t cowextsize, 921 unsigned int remap_flags) 922 { 923 struct xfs_mount *mp = dest->i_mount; 924 struct xfs_trans *tp; 925 int error; 926 927 if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) 928 return 0; 929 930 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 931 if (error) 932 goto out_error; 933 934 xfs_ilock(dest, XFS_ILOCK_EXCL); 935 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 936 937 if (newlen > i_size_read(VFS_I(dest))) { 938 trace_xfs_reflink_update_inode_size(dest, newlen); 939 i_size_write(VFS_I(dest), newlen); 940 dest->i_d.di_size = newlen; 941 } 942 943 if (cowextsize) { 944 dest->i_d.di_cowextsize = cowextsize; 945 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; 946 } 947 948 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 949 950 error = xfs_trans_commit(tp); 951 if (error) 952 goto out_error; 953 return error; 954 955 out_error: 956 trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); 957 return error; 958 } 959 960 /* 961 * Do we have enough reserve in this AG to handle a reflink? The refcount 962 * btree already reserved all the space it needs, but the rmap btree can grow 963 * infinitely, so we won't allow more reflinks when the AG is down to the 964 * btree reserves. 965 */ 966 static int 967 xfs_reflink_ag_has_free_space( 968 struct xfs_mount *mp, 969 xfs_agnumber_t agno) 970 { 971 struct xfs_perag *pag; 972 int error = 0; 973 974 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 975 return 0; 976 977 pag = xfs_perag_get(mp, agno); 978 if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || 979 xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) 980 error = -ENOSPC; 981 xfs_perag_put(pag); 982 return error; 983 } 984 985 /* 986 * Unmap a range of blocks from a file, then map other blocks into the hole. 987 * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). 
988 * The extent irec is mapped into dest at irec->br_startoff. 989 */ 990 STATIC int 991 xfs_reflink_remap_extent( 992 struct xfs_inode *ip, 993 struct xfs_bmbt_irec *irec, 994 xfs_fileoff_t destoff, 995 xfs_off_t new_isize) 996 { 997 struct xfs_mount *mp = ip->i_mount; 998 bool real_extent = xfs_bmap_is_real_extent(irec); 999 struct xfs_trans *tp; 1000 unsigned int resblks; 1001 struct xfs_bmbt_irec uirec; 1002 xfs_filblks_t rlen; 1003 xfs_filblks_t unmap_len; 1004 xfs_off_t newlen; 1005 int error; 1006 1007 unmap_len = irec->br_startoff + irec->br_blockcount - destoff; 1008 trace_xfs_reflink_punch_range(ip, destoff, unmap_len); 1009 1010 /* No reflinking if we're low on space */ 1011 if (real_extent) { 1012 error = xfs_reflink_ag_has_free_space(mp, 1013 XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 1014 if (error) 1015 goto out; 1016 } 1017 1018 /* Start a rolling transaction to switch the mappings */ 1019 resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); 1020 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); 1021 if (error) 1022 goto out; 1023 1024 xfs_ilock(ip, XFS_ILOCK_EXCL); 1025 xfs_trans_ijoin(tp, ip, 0); 1026 1027 /* If we're not just clearing space, then do we have enough quota? */ 1028 if (real_extent) { 1029 error = xfs_trans_reserve_quota_nblks(tp, ip, 1030 irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); 1031 if (error) 1032 goto out_cancel; 1033 } 1034 1035 trace_xfs_reflink_remap(ip, irec->br_startoff, 1036 irec->br_blockcount, irec->br_startblock); 1037 1038 /* Unmap the old blocks in the data fork. */ 1039 rlen = unmap_len; 1040 while (rlen) { 1041 ASSERT(tp->t_firstblock == NULLFSBLOCK); 1042 error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); 1043 if (error) 1044 goto out_cancel; 1045 1046 /* 1047 * Trim the extent to whatever got unmapped. 1048 * Remember, bunmapi works backwards. 
1049 */ 1050 uirec.br_startblock = irec->br_startblock + rlen; 1051 uirec.br_startoff = irec->br_startoff + rlen; 1052 uirec.br_blockcount = unmap_len - rlen; 1053 unmap_len = rlen; 1054 1055 /* If this isn't a real mapping, we're done. */ 1056 if (!real_extent || uirec.br_blockcount == 0) 1057 goto next_extent; 1058 1059 trace_xfs_reflink_remap(ip, uirec.br_startoff, 1060 uirec.br_blockcount, uirec.br_startblock); 1061 1062 /* Update the refcount tree */ 1063 error = xfs_refcount_increase_extent(tp, &uirec); 1064 if (error) 1065 goto out_cancel; 1066 1067 /* Map the new blocks into the data fork. */ 1068 error = xfs_bmap_map_extent(tp, ip, &uirec); 1069 if (error) 1070 goto out_cancel; 1071 1072 /* Update quota accounting. */ 1073 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1074 uirec.br_blockcount); 1075 1076 /* Update dest isize if needed. */ 1077 newlen = XFS_FSB_TO_B(mp, 1078 uirec.br_startoff + uirec.br_blockcount); 1079 newlen = min_t(xfs_off_t, newlen, new_isize); 1080 if (newlen > i_size_read(VFS_I(ip))) { 1081 trace_xfs_reflink_update_inode_size(ip, newlen); 1082 i_size_write(VFS_I(ip), newlen); 1083 ip->i_d.di_size = newlen; 1084 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1085 } 1086 1087 next_extent: 1088 /* Process all the deferred stuff. */ 1089 error = xfs_defer_finish(&tp); 1090 if (error) 1091 goto out_cancel; 1092 } 1093 1094 error = xfs_trans_commit(tp); 1095 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1096 if (error) 1097 goto out; 1098 return 0; 1099 1100 out_cancel: 1101 xfs_trans_cancel(tp); 1102 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1103 out: 1104 trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); 1105 return error; 1106 } 1107 1108 /* 1109 * Iteratively remap one file's extents (and holes) to another's. 
1110 */ 1111 int 1112 xfs_reflink_remap_blocks( 1113 struct xfs_inode *src, 1114 loff_t pos_in, 1115 struct xfs_inode *dest, 1116 loff_t pos_out, 1117 loff_t remap_len, 1118 loff_t *remapped) 1119 { 1120 struct xfs_bmbt_irec imap; 1121 xfs_fileoff_t srcoff; 1122 xfs_fileoff_t destoff; 1123 xfs_filblks_t len; 1124 xfs_filblks_t range_len; 1125 xfs_filblks_t remapped_len = 0; 1126 xfs_off_t new_isize = pos_out + remap_len; 1127 int nimaps; 1128 int error = 0; 1129 1130 destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); 1131 srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); 1132 len = XFS_B_TO_FSB(src->i_mount, remap_len); 1133 1134 /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ 1135 while (len) { 1136 uint lock_mode; 1137 1138 trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, 1139 dest, destoff); 1140 1141 /* Read extent from the source file */ 1142 nimaps = 1; 1143 lock_mode = xfs_ilock_data_map_shared(src); 1144 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); 1145 xfs_iunlock(src, lock_mode); 1146 if (error) 1147 break; 1148 ASSERT(nimaps == 1); 1149 1150 trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, 1151 &imap); 1152 1153 /* Translate imap into the destination file. */ 1154 range_len = imap.br_startoff + imap.br_blockcount - srcoff; 1155 imap.br_startoff += destoff - srcoff; 1156 1157 /* Clear dest from destoff to the end of imap and map it in. 
*/ 1158 error = xfs_reflink_remap_extent(dest, &imap, destoff, 1159 new_isize); 1160 if (error) 1161 break; 1162 1163 if (fatal_signal_pending(current)) { 1164 error = -EINTR; 1165 break; 1166 } 1167 1168 /* Advance drange/srange */ 1169 srcoff += range_len; 1170 destoff += range_len; 1171 len -= range_len; 1172 remapped_len += range_len; 1173 } 1174 1175 if (error) 1176 trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); 1177 *remapped = min_t(loff_t, remap_len, 1178 XFS_FSB_TO_B(src->i_mount, remapped_len)); 1179 return error; 1180 } 1181 1182 /* 1183 * Grab the exclusive iolock for a data copy from src to dest, making 1184 * sure to abide vfs locking order (lowest pointer value goes first) and 1185 * breaking the pnfs layout leases on dest before proceeding. The loop 1186 * is needed because we cannot call the blocking break_layout() with the 1187 * src iolock held, and therefore have to back out both locks. 1188 */ 1189 static int 1190 xfs_iolock_two_inodes_and_break_layout( 1191 struct inode *src, 1192 struct inode *dest) 1193 { 1194 int error; 1195 1196 retry: 1197 if (src < dest) { 1198 inode_lock_shared(src); 1199 inode_lock_nested(dest, I_MUTEX_NONDIR2); 1200 } else { 1201 /* src >= dest */ 1202 inode_lock(dest); 1203 } 1204 1205 error = break_layout(dest, false); 1206 if (error == -EWOULDBLOCK) { 1207 inode_unlock(dest); 1208 if (src < dest) 1209 inode_unlock_shared(src); 1210 error = break_layout(dest, true); 1211 if (error) 1212 return error; 1213 goto retry; 1214 } 1215 if (error) { 1216 inode_unlock(dest); 1217 if (src < dest) 1218 inode_unlock_shared(src); 1219 return error; 1220 } 1221 if (src > dest) 1222 inode_lock_shared_nested(src, I_MUTEX_NONDIR2); 1223 return 0; 1224 } 1225 1226 /* Unlock both inodes after they've been prepped for a range clone. 
*/ 1227 void 1228 xfs_reflink_remap_unlock( 1229 struct file *file_in, 1230 struct file *file_out) 1231 { 1232 struct inode *inode_in = file_inode(file_in); 1233 struct xfs_inode *src = XFS_I(inode_in); 1234 struct inode *inode_out = file_inode(file_out); 1235 struct xfs_inode *dest = XFS_I(inode_out); 1236 bool same_inode = (inode_in == inode_out); 1237 1238 xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); 1239 if (!same_inode) 1240 xfs_iunlock(src, XFS_MMAPLOCK_SHARED); 1241 inode_unlock(inode_out); 1242 if (!same_inode) 1243 inode_unlock_shared(inode_in); 1244 } 1245 1246 /* 1247 * If we're reflinking to a point past the destination file's EOF, we must 1248 * zero any speculative post-EOF preallocations that sit between the old EOF 1249 * and the destination file offset. 1250 */ 1251 static int 1252 xfs_reflink_zero_posteof( 1253 struct xfs_inode *ip, 1254 loff_t pos) 1255 { 1256 loff_t isize = i_size_read(VFS_I(ip)); 1257 1258 if (pos <= isize) 1259 return 0; 1260 1261 trace_xfs_zero_eof(ip, isize, pos - isize); 1262 return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, 1263 &xfs_iomap_ops); 1264 } 1265 1266 /* 1267 * Prepare two files for range cloning. Upon a successful return both inodes 1268 * will have the iolock and mmaplock held, the page cache of the out file will 1269 * be truncated, and any leases on the out file will have been broken. This 1270 * function borrows heavily from xfs_file_aio_write_checks. 1271 * 1272 * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't 1273 * checked that the bytes beyond EOF physically match. Hence we cannot use the 1274 * EOF block in the source dedupe range because it's not a complete block match, 1275 * hence can introduce a corruption into the file that has it's block replaced. 1276 * 1277 * In similar fashion, the VFS file cloning also allows partial EOF blocks to be 1278 * "block aligned" for the purposes of cloning entire files. 
However, if the 1279 * source file range includes the EOF block and it lands within the existing EOF 1280 * of the destination file, then we can expose stale data from beyond the source 1281 * file EOF in the destination file. 1282 * 1283 * XFS doesn't support partial block sharing, so in both cases we have check 1284 * these cases ourselves. For dedupe, we can simply round the length to dedupe 1285 * down to the previous whole block and ignore the partial EOF block. While this 1286 * means we can't dedupe the last block of a file, this is an acceptible 1287 * tradeoff for simplicity on implementation. 1288 * 1289 * For cloning, we want to share the partial EOF block if it is also the new EOF 1290 * block of the destination file. If the partial EOF block lies inside the 1291 * existing destination EOF, then we have to abort the clone to avoid exposing 1292 * stale data in the destination file. Hence we reject these clone attempts with 1293 * -EINVAL in this case. 1294 */ 1295 int 1296 xfs_reflink_remap_prep( 1297 struct file *file_in, 1298 loff_t pos_in, 1299 struct file *file_out, 1300 loff_t pos_out, 1301 loff_t *len, 1302 unsigned int remap_flags) 1303 { 1304 struct inode *inode_in = file_inode(file_in); 1305 struct xfs_inode *src = XFS_I(inode_in); 1306 struct inode *inode_out = file_inode(file_out); 1307 struct xfs_inode *dest = XFS_I(inode_out); 1308 bool same_inode = (inode_in == inode_out); 1309 ssize_t ret; 1310 1311 /* Lock both files against IO */ 1312 ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); 1313 if (ret) 1314 return ret; 1315 if (same_inode) 1316 xfs_ilock(src, XFS_MMAPLOCK_EXCL); 1317 else 1318 xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, 1319 XFS_MMAPLOCK_EXCL); 1320 1321 /* Check file eligibility and prepare for block sharing. 
*/ 1322 ret = -EINVAL; 1323 /* Don't reflink realtime inodes */ 1324 if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) 1325 goto out_unlock; 1326 1327 /* Don't share DAX file data for now. */ 1328 if (IS_DAX(inode_in) || IS_DAX(inode_out)) 1329 goto out_unlock; 1330 1331 ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, 1332 len, remap_flags); 1333 if (ret < 0 || *len == 0) 1334 goto out_unlock; 1335 1336 /* Attach dquots to dest inode before changing block map */ 1337 ret = xfs_qm_dqattach(dest); 1338 if (ret) 1339 goto out_unlock; 1340 1341 /* 1342 * Zero existing post-eof speculative preallocations in the destination 1343 * file. 1344 */ 1345 ret = xfs_reflink_zero_posteof(dest, pos_out); 1346 if (ret) 1347 goto out_unlock; 1348 1349 /* Set flags and remap blocks. */ 1350 ret = xfs_reflink_set_inode_flag(src, dest); 1351 if (ret) 1352 goto out_unlock; 1353 1354 /* Zap any page cache for the destination file's range. */ 1355 truncate_inode_pages_range(&inode_out->i_data, 1356 round_down(pos_out, PAGE_SIZE), 1357 round_up(pos_out + *len, PAGE_SIZE) - 1); 1358 1359 return 1; 1360 out_unlock: 1361 xfs_reflink_remap_unlock(file_in, file_out); 1362 return ret; 1363 } 1364 1365 /* 1366 * The user wants to preemptively CoW all shared blocks in this file, 1367 * which enables us to turn off the reflink flag. Iterate all 1368 * extents which are not prealloc/delalloc to see which ranges are 1369 * mentioned in the refcount tree, then read those blocks into the 1370 * pagecache, dirty them, fsync them back out, and then we can update 1371 * the inode flag. What happens if we run out of memory? 
:)
 *
 * The ILOCK is dropped around each call that dirties pages and retaken
 * afterwards, so it is held again whenever this function returns.
 */
STATIC int
xfs_reflink_dirty_extents(
	struct xfs_inode	*ip,
	xfs_fileoff_t		fbno,
	xfs_filblks_t		end,
	xfs_off_t		isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	got;	/* mapping found at fbno */
	struct xfs_bmbt_irec	rem;	/* tail of got not yet examined */
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		rbno;
	xfs_extlen_t		rlen;
	xfs_extlen_t		skip;
	xfs_off_t		fpos;
	xfs_off_t		flen;
	int			nmaps;
	int			error = 0;

	while (end - fbno > 0) {
		/*
		 * Look for extents in the file.  Skip holes, delalloc, or
		 * unwritten extents; they can't be reflinked.
		 */
		nmaps = 1;
		error = xfs_bmapi_read(ip, fbno, end - fbno, &got, &nmaps, 0);
		if (error)
			return error;
		if (nmaps == 0)
			break;

		if (xfs_bmap_is_real_extent(&got)) {
			/* Walk the mapping one shared region at a time. */
			for (rem = got; rem.br_blockcount != 0; ) {
				agno = XFS_FSB_TO_AGNO(mp, rem.br_startblock);
				agbno = XFS_FSB_TO_AGBNO(mp, rem.br_startblock);
				aglen = rem.br_blockcount;

				error = xfs_reflink_find_shared(mp, NULL, agno,
						agbno, aglen, &rbno, &rlen,
						true);
				if (error)
					return error;
				if (rbno == NULLAGBLOCK)
					break;

				/* Dirty the pages */
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				fpos = XFS_FSB_TO_B(mp, rem.br_startoff +
						(rbno - agbno));
				flen = XFS_FSB_TO_B(mp, rlen);
				/* Don't dirty anything past the given size. */
				if (fpos + flen > isize)
					flen = isize - fpos;
				error = iomap_file_dirty(VFS_I(ip), fpos, flen,
						&xfs_iomap_ops);
				xfs_ilock(ip, XFS_ILOCK_EXCL);
				if (error)
					return error;

				/* Step past the shared region just handled. */
				skip = rbno - agbno + rlen;
				rem.br_blockcount -= skip;
				rem.br_startoff += skip;
				rem.br_startblock += skip;
			}
		}

		fbno = got.br_startoff + got.br_blockcount;
	}

	return error;
}

/* Does this inode need the reflink flag?
*/ 1445 int 1446 xfs_reflink_inode_has_shared_extents( 1447 struct xfs_trans *tp, 1448 struct xfs_inode *ip, 1449 bool *has_shared) 1450 { 1451 struct xfs_bmbt_irec got; 1452 struct xfs_mount *mp = ip->i_mount; 1453 struct xfs_ifork *ifp; 1454 xfs_agnumber_t agno; 1455 xfs_agblock_t agbno; 1456 xfs_extlen_t aglen; 1457 xfs_agblock_t rbno; 1458 xfs_extlen_t rlen; 1459 struct xfs_iext_cursor icur; 1460 bool found; 1461 int error; 1462 1463 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1464 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 1465 error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); 1466 if (error) 1467 return error; 1468 } 1469 1470 *has_shared = false; 1471 found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); 1472 while (found) { 1473 if (isnullstartblock(got.br_startblock) || 1474 got.br_state != XFS_EXT_NORM) 1475 goto next; 1476 agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); 1477 agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); 1478 aglen = got.br_blockcount; 1479 1480 error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, 1481 &rbno, &rlen, false); 1482 if (error) 1483 return error; 1484 /* Is there still a shared block here? */ 1485 if (rbno != NULLAGBLOCK) { 1486 *has_shared = true; 1487 return 0; 1488 } 1489 next: 1490 found = xfs_iext_next_extent(ifp, &icur, &got); 1491 } 1492 1493 return 0; 1494 } 1495 1496 /* 1497 * Clear the inode reflink flag if there are no shared extents. 1498 * 1499 * The caller is responsible for joining the inode to the transaction passed in. 1500 * The inode will be joined to the transaction that is returned to the caller. 1501 */ 1502 int 1503 xfs_reflink_clear_inode_flag( 1504 struct xfs_inode *ip, 1505 struct xfs_trans **tpp) 1506 { 1507 bool needs_flag; 1508 int error = 0; 1509 1510 ASSERT(xfs_is_reflink_inode(ip)); 1511 1512 error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); 1513 if (error || needs_flag) 1514 return error; 1515 1516 /* 1517 * We didn't find any shared blocks so turn off the reflink flag. 
1518 * First, get rid of any leftover CoW mappings. 1519 */ 1520 error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); 1521 if (error) 1522 return error; 1523 1524 /* Clear the inode flag. */ 1525 trace_xfs_reflink_unset_inode_flag(ip); 1526 ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; 1527 xfs_inode_clear_cowblocks_tag(ip); 1528 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); 1529 1530 return error; 1531 } 1532 1533 /* 1534 * Clear the inode reflink flag if there are no shared extents and the size 1535 * hasn't changed. 1536 */ 1537 STATIC int 1538 xfs_reflink_try_clear_inode_flag( 1539 struct xfs_inode *ip) 1540 { 1541 struct xfs_mount *mp = ip->i_mount; 1542 struct xfs_trans *tp; 1543 int error = 0; 1544 1545 /* Start a rolling transaction to remove the mappings */ 1546 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); 1547 if (error) 1548 return error; 1549 1550 xfs_ilock(ip, XFS_ILOCK_EXCL); 1551 xfs_trans_ijoin(tp, ip, 0); 1552 1553 error = xfs_reflink_clear_inode_flag(ip, &tp); 1554 if (error) 1555 goto cancel; 1556 1557 error = xfs_trans_commit(tp); 1558 if (error) 1559 goto out; 1560 1561 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1562 return 0; 1563 cancel: 1564 xfs_trans_cancel(tp); 1565 out: 1566 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1567 return error; 1568 } 1569 1570 /* 1571 * Pre-COW all shared blocks within a given byte range of a file and turn off 1572 * the reflink flag if we unshare all of the file's blocks. 
1573 */ 1574 int 1575 xfs_reflink_unshare( 1576 struct xfs_inode *ip, 1577 xfs_off_t offset, 1578 xfs_off_t len) 1579 { 1580 struct xfs_mount *mp = ip->i_mount; 1581 xfs_fileoff_t fbno; 1582 xfs_filblks_t end; 1583 xfs_off_t isize; 1584 int error; 1585 1586 if (!xfs_is_reflink_inode(ip)) 1587 return 0; 1588 1589 trace_xfs_reflink_unshare(ip, offset, len); 1590 1591 inode_dio_wait(VFS_I(ip)); 1592 1593 /* Try to CoW the selected ranges */ 1594 xfs_ilock(ip, XFS_ILOCK_EXCL); 1595 fbno = XFS_B_TO_FSBT(mp, offset); 1596 isize = i_size_read(VFS_I(ip)); 1597 end = XFS_B_TO_FSB(mp, offset + len); 1598 error = xfs_reflink_dirty_extents(ip, fbno, end, isize); 1599 if (error) 1600 goto out_unlock; 1601 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1602 1603 /* Wait for the IO to finish */ 1604 error = filemap_write_and_wait(VFS_I(ip)->i_mapping); 1605 if (error) 1606 goto out; 1607 1608 /* Turn off the reflink flag if possible. */ 1609 error = xfs_reflink_try_clear_inode_flag(ip); 1610 if (error) 1611 goto out; 1612 1613 return 0; 1614 1615 out_unlock: 1616 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1617 out: 1618 trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); 1619 return error; 1620 } 1621