// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota_defs.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_rmap_btree.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks.  This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism.  At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible.  This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock.  (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)  Delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope of landing out of
 * order nearby CoW writes in a single extent on disk, thereby reducing
 * fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.  The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded.  IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork.  This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents.  This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork.  Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out -- we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written.  This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type.  This is required for direct io
 * because we only have ioend for the whole dio, and we have to be able to
 * remember the presence of unwritten blocks and CoW blocks with a single
 * ioend structure.  Better yet, the more ground we can cover with one
 * ioend, the better.
 */
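/*
 * Illustrative sketch only, not a real caller: the CoW lifecycle
 * described above, reduced to the entry points in this file.  The iomap
 * plumbing, locking, and error handling that the real write and
 * writeback paths supply are omitted, so read this as a map of the
 * phases rather than as usable code:
 *
 *	xfs_reflink_reserve_cow(ip, &imap);	(delalloc in the CoW fork)
 *	...					(dirty the page cache)
 *	xfs_reflink_convert_cow(ip, offset, count);	(unwritten -> real)
 *	...					(disk write completes)
 *	xfs_reflink_end_cow(ip, offset, count);	(remap into the data fork)
 *
 * If the write fails, or the staging blocks are reclaimed instead, the
 * reservation is backed out with
 * xfs_reflink_cancel_cow_range(ip, offset, count, false).
 */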
/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen.  If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.  If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_buf		*agbp;
	struct xfs_btree_cur	*cur;
	int			error;

	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
	if (error)
		return error;
	if (!agbp)
		return -ENOMEM;

	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);

	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	xfs_btree_del_cursor(cur, error);

	xfs_trans_brelse(tp, agbp);
	return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
 */
int
xfs_reflink_trim_around_shared(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	bool			*shared)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	/* Holes, unwritten, and delalloc extents cannot be shared */
	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
		*shared = false;
		return 0;
	}

	trace_xfs_reflink_trim_around_shared(ip, irec);

	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
	aglen = irec->br_blockcount;

	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
			aglen, &fbno, &flen, true);
	if (error)
		return error;

	*shared = false;
	if (fbno == NULLAGBLOCK) {
		/* No shared blocks at all. */
		return 0;
	} else if (fbno == agbno) {
		/*
		 * The start of this extent is shared.  Truncate the
		 * mapping at the end of the shared region so that a
		 * subsequent iteration starts at the start of the
		 * unshared region.
		 */
		irec->br_blockcount = flen;
		*shared = true;
		return 0;
	} else {
		/*
		 * There's a shared extent midway through this extent.
		 * Truncate the mapping at the start of the shared
		 * extent so that a subsequent iteration starts at the
		 * start of the shared region.
		 */
		irec->br_blockcount = fbno - agbno;
		return 0;
	}
}
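/*
 * Usage sketch (hypothetical caller, for illustration): because the
 * function above shortens @irec to end at the next shared/unshared
 * transition, a caller can carve a file range into runs that are
 * uniformly shared or uniformly unshared by looping on it:
 *
 *	while (offset_fsb < end_fsb) {
 *		nimaps = 1;
 *		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 *				&irec, &nimaps, 0);
 *		if (error || !nimaps)
 *			break;
 *		error = xfs_reflink_trim_around_shared(ip, &irec, &shared);
 *		if (error)
 *			break;
 *		(irec is now entirely shared or entirely unshared)
 *		offset_fsb = irec.br_startoff + irec.br_blockcount;
 *	}
 */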
/*
 * Trim the passed in imap to the next shared/unshared extent boundary, and
 * if imap->br_startoff points to a shared extent reserve space for it in the
 * COW fork.
 *
 * Note that imap will always contain the block numbers for the existing blocks
 * in the data fork, as the upper layers need them for read-modify-write
 * operations.
 */
int
xfs_reflink_reserve_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec	got;
	int			error = 0;
	bool			eof = false;
	struct xfs_iext_cursor	icur;
	bool			shared;

	/*
	 * Search the COW fork extent list first.  This serves two purposes:
	 * first, this implements speculative preallocation using the
	 * cowextsize hint, so that we also unshare blocks adjacent to shared
	 * blocks instead of just the shared blocks themselves.  Second, the
	 * lookup in the extent list is generally faster than going out to
	 * the shared extent tree.
	 */

	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
		eof = true;
	if (!eof && got.br_startoff <= imap->br_startoff) {
		trace_xfs_reflink_cow_found(ip, imap);
		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
		return 0;
	}

	/* Trim the mapping to the nearest shared extent boundary. */
	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
	if (error)
		return error;

	/* Not shared?  Just report the (potentially capped) extent. */
	if (!shared)
		return 0;

	/*
	 * Fork all the shared blocks from our write offset until the end of
	 * the extent.
	 */
	error = xfs_qm_dqattach_locked(ip, false);
	if (error)
		return error;

	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
			imap->br_blockcount, 0, &got, &icur, eof);
	if (error == -ENOSPC || error == -EDQUOT)
		trace_xfs_reflink_cow_enospc(ip, imap);
	if (error)
		return error;

	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
	trace_xfs_reflink_cow_alloc(ip, &got);
	return 0;
}
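/*
 * Worked example of the lookup-first behaviour above, with made-up
 * block numbers: suppose a write maps data fork blocks 10-19 while an
 * earlier cowextsize reservation already covers CoW fork blocks 8-15.
 * The extent list lookup finds that reservation (8 <= 10), so no
 * refcount btree lookup happens at all; @imap is trimmed to the
 * overlap, blocks 10-15, and the caller comes back for blocks 16-19
 * in a later iteration.  @imap keeps its data fork startblock
 * throughout, which is what the read-modify-write path needs.
 */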
/* Convert part of an unwritten CoW extent to a real one. */
STATIC int
xfs_reflink_convert_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb)
{
	int			nimaps = 1;

	if (imap->br_state == XFS_EXT_NORM)
		return 0;

	xfs_trim_extent(imap, offset_fsb, count_fsb);
	trace_xfs_reflink_convert_cow(ip, imap);
	if (imap->br_blockcount == 0)
		return 0;
	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
			&nimaps);
}

/* Convert all of the unwritten CoW extents in a file's range to real ones. */
int
xfs_reflink_convert_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1, error = 0;

	ASSERT(count != 0);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
			XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Find the extent that maps the given range in the COW fork.  Even if the
 * extent is not shared we might have a preallocation for it in the COW fork.
 * If so, use it rather than trigger a new allocation.
 */
static int
xfs_find_trim_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			*shared,
	bool			*found)
{
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;

	*found = false;

	/*
	 * If we don't find an overlapping extent, trim the range we need to
	 * allocate to fit the hole we found.
	 */
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
		got.br_startoff = offset_fsb + count_fsb;
	if (got.br_startoff > offset_fsb) {
		xfs_trim_extent(imap, imap->br_startoff,
				got.br_startoff - imap->br_startoff);
		return xfs_reflink_trim_around_shared(ip, imap, shared);
	}

	*shared = true;
	if (isnullstartblock(got.br_startblock)) {
		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
		return 0;
	}

	/* real extent found - no need to allocate */
	xfs_trim_extent(&got, offset_fsb, count_fsb);
	*imap = got;
	*found = true;
	return 0;
}
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			*shared,
	uint			*lockmode)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_trans	*tp;
	int			nimaps, error = 0;
	bool			found;
	xfs_filblks_t		resaligned;
	xfs_extlen_t		resblks = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(xfs_is_reflink_inode(ip));

	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	if (error || !*shared)
		return error;
	if (found)
		goto convert;

	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

	xfs_iunlock(ip, *lockmode);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	*lockmode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, *lockmode);

	if (error)
		return error;

	error = xfs_qm_dqattach_locked(ip, false);
	if (error)
		goto out_trans_cancel;

	/*
	 * Check for an overlapping extent again now that we dropped the ilock.
	 */
	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	if (error || !*shared)
		goto out_trans_cancel;
	if (found) {
		xfs_trans_cancel(tp);
		goto convert;
	}

	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
			XFS_QMOPT_RES_REGBLKS);
	if (error)
		goto out_trans_cancel;

	xfs_trans_ijoin(tp, ip, 0);

	/* Allocate the entire reservation as unwritten blocks. */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
			resblks, imap, &nimaps);
	if (error)
		goto out_unreserve;

	xfs_inode_set_cowblocks_tag(ip);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/*
	 * Allocation succeeded but the requested range was not even partially
	 * satisfied?  Bail out!
	 */
	if (nimaps == 0)
		return -ENOSPC;
convert:
	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);

out_unreserve:
	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
			XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
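/*
 * The unlock/alloc/relock dance above is a common XFS pattern, shown
 * here in isolation (illustrative only): transaction allocation can
 * sleep waiting for log space, so it must not happen with the ILOCK
 * held, and anything learned under the old lock hold has to be
 * revalidated once the lock is retaken:
 *
 *	xfs_iunlock(ip, *lockmode);
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
 *	*lockmode = XFS_ILOCK_EXCL;
 *	xfs_ilock(ip, *lockmode);
 *	...
 *	(re-run xfs_find_trim_cow_extent() before trusting the old answer)
 */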
/*
 * Cancel CoW reservations for some block range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * Caller must have already joined the inode to the current transaction.  The
 * inode will be joined to the transaction returned to the caller.
 */
int
xfs_reflink_cancel_cow_blocks(
	struct xfs_inode		*ip,
	struct xfs_trans		**tpp,
	xfs_fileoff_t			offset_fsb,
	xfs_fileoff_t			end_fsb,
	bool				cancel_real)
{
	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec		got, del;
	struct xfs_iext_cursor		icur;
	int				error = 0;

	if (!xfs_inode_has_cow_data(ip))
		return 0;
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		return 0;

	/* Walk backwards until we're out of the I/O range... */
	while (got.br_startoff + got.br_blockcount > offset_fsb) {
		del = got;
		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

		/* Extent delete may have bumped ext forward */
		if (!del.br_blockcount) {
			xfs_iext_prev(ifp, &icur);
			goto next_extent;
		}

		trace_xfs_reflink_cancel_cow(ip, &del);

		if (isnullstartblock(del.br_startblock)) {
			error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
					&icur, &got, &del);
			if (error)
				break;
		} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
			ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);

			/* Free the CoW orphan record. */
			error = xfs_refcount_free_cow_extent(*tpp,
					del.br_startblock, del.br_blockcount);
			if (error)
				break;

			xfs_bmap_add_free(*tpp, del.br_startblock,
					del.br_blockcount, NULL);

			/* Roll the transaction */
			error = xfs_defer_finish(tpp);
			if (error)
				break;

			/* Remove the mapping from the CoW fork. */
			xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

			/* Remove the quota reservation */
			error = xfs_trans_reserve_quota_nblks(NULL, ip,
					-(long)del.br_blockcount, 0,
					XFS_QMOPT_RES_REGBLKS);
			if (error)
				break;
		} else {
			/* Didn't do anything, push cursor back. */
			xfs_iext_prev(ifp, &icur);
		}
next_extent:
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	/* clear tag if cow fork is emptied */
	if (!ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);
	return error;
}
/*
 * Cancel CoW reservations for some byte range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 */
int
xfs_reflink_cancel_cow_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	bool			cancel_real)
{
	struct xfs_trans	*tp;
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
	ASSERT(xfs_is_reflink_inode(ip));

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	if (count == NULLFILEOFF)
		end_fsb = NULLFILEOFF;
	else
		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
			0, 0, XFS_TRANS_NOFS, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* Scrape out the old CoW reservations */
	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
			cancel_real);
	if (error)
		goto out_cancel;

	error = xfs_trans_commit(tp);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
	return error;
}

/*
 * Remap part of the CoW fork into the data fork.
 *
 * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
 * into the data fork; this function will remap what it can (at the end of the
 * range) and update @end_fsb appropriately.  Each remap gets its own
 * transaction because we can end up merging and splitting bmbt blocks for
 * every remap operation and we'd like to keep the block reservation
 * requirements as low as possible.
 */
STATIC int
xfs_reflink_end_cow_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		*end_fsb)
{
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	xfs_filblks_t		rlen;
	unsigned int		resblks;
	int			error;

	/* No COW extents?  That's easy! */
	if (ifp->if_bytes == 0) {
		*end_fsb = offset_fsb;
		return 0;
	}

	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
			XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
	if (error)
		return error;

	/*
	 * Lock the inode.  We have to ijoin without automatic unlock because
	 * the lead transaction is the refcountbt record deletion; the data
	 * fork update follows as a deferred log item.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * In the case of racing, overlapping AIO writes, no COW extents
	 * might be left by the time I/O completes for the loser of the race.
	 * In that case we are done.
	 */
	if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
	    got.br_startoff + got.br_blockcount <= offset_fsb) {
		*end_fsb = offset_fsb;
		goto out_cancel;
	}

	/*
	 * Structure copy @got into @del, then trim @del to the range that we
	 * were asked to remap.  We preserve @got for the eventual CoW fork
	 * deletion; from now on @del represents the mapping that we're
	 * actually remapping.
	 */
	del = got;
	xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);

	ASSERT(del.br_blockcount > 0);
	/*
	 * Only remap real extents that contain data.  With AIO, speculative
	 * preallocations can leak into the range we are asked to remap, and
	 * we need to skip them.
	 */
	if (!xfs_bmap_is_real_extent(&got)) {
		*end_fsb = del.br_startoff;
		goto out_cancel;
	}

	/* Unmap the old blocks in the data fork. */
	rlen = del.br_blockcount;
	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
	if (error)
		goto out_cancel;

	/* Trim the extent to whatever got unmapped. */
	xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
	trace_xfs_reflink_cow_remap(ip, &del);

	/* Free the CoW orphan record. */
	error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
			del.br_blockcount);
	if (error)
		goto out_cancel;

	/* Map the new blocks into the data fork. */
	error = xfs_bmap_map_extent(tp, ip, &del);
	if (error)
		goto out_cancel;

	/* Charge this new data fork mapping to the on-disk quota. */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
			(long)del.br_blockcount);

	/* Remove the mapping from the CoW fork. */
	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/* Update the caller about how much progress we made. */
	*end_fsb = del.br_startoff;
	return 0;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Remap parts of a file's data fork after a successful CoW.
 */
int
xfs_reflink_end_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	int			error = 0;

	trace_xfs_reflink_end_cow(ip, offset, count);

	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/*
	 * Walk backwards until we're out of the I/O range.  The loop function
	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
	 * extent.
	 *
	 * If we're being called by writeback then the pages will still
	 * have PageWriteback set, which prevents races with reflink remapping
	 * and truncate.  Reflink remapping prevents races with writeback by
	 * taking the iolock and mmaplock before flushing the pages and
	 * remapping, which means there won't be any further writeback or page
	 * cache dirtying until the reflink completes.
	 *
	 * We should never have two threads issuing writeback for the same file
	 * region.  There are also post-eof checks in the writeback
	 * preparation code so that we don't bother writing out pages that are
	 * about to be truncated.
	 *
	 * If we're being called as part of directio write completion, the dio
	 * count is still elevated, which reflink and truncate will wait for.
	 * Reflink remapping takes the iolock and mmaplock and waits for
	 * pending dio to finish, which should prevent any directio until the
	 * remap completes.  Multiple concurrent directio writes to the same
	 * region are handled by end_cow processing only occurring for the
	 * threads which succeed; the outcome of multiple overlapping direct
	 * writes is not well defined anyway.
	 *
	 * It's possible that a buffered write and a direct write could collide
	 * here (the buffered write stumbles in after the dio flushes and
	 * invalidates the page cache and immediately queues writeback), but we
	 * have never supported this 100%.  If either disk write succeeds the
	 * blocks will be remapped.
	 */
	while (end_fsb > offset_fsb && !error)
		error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);

	if (error)
		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
	return error;
}
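/*
 * Shape of the loop above (illustrative): each call to
 * xfs_reflink_end_cow_extent() remaps the last extent in the range and
 * pulls end_fsb back to the start of whatever it remapped, so the
 * window shrinks from the right until it is empty or an error stops
 * the walk:
 *
 *	[offset_fsb ........................ end_fsb)
 *	[offset_fsb .............. end_fsb')		after one call
 *	[offset_fsb ...... end_fsb'')			after two calls
 */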
/*
 * Free leftover CoW reservations that didn't get cleaned out.
 */
int
xfs_reflink_recover_cow(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;
	int			error = 0;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_refcount_recover_cow_leftovers(mp, agno);
		if (error)
			break;
	}

	return error;
}

/*
 * Reflinking (Block) Ranges of Two Files Together
 *
 * First, ensure that the reflink flag is set on both inodes.  The flag is an
 * optimization to avoid unnecessary refcount btree lookups in the write path.
 *
 * Now we can iteratively remap the range of extents (and holes) in src to the
 * corresponding ranges in dest.  Let drange and srange denote the ranges of
 * logical blocks in dest and src touched by the reflink operation.
 *
 * While the length of drange is greater than zero,
 *    - Read src's bmbt at the start of srange ("imap")
 *    - If imap doesn't exist, make imap appear to start at the end of srange
 *      with zero length.
 *    - If imap starts before srange, advance imap to start at srange.
 *    - If imap goes beyond srange, truncate imap to end at the end of srange.
 *    - Punch (imap start - srange start + imap len) blocks from dest at
 *      offset (drange start).
 *    - If imap points to a real range of pblks,
 *         > Increase the refcount of the imap's pblks
 *         > Map imap's pblks into dest at the offset
 *           (drange start + imap start - srange start)
 *    - Advance drange and srange by (imap start - srange start + imap len)
 *
 * Finally, if the reflink made dest longer, update both the in-core and
 * on-disk file sizes.
 *
 * ASCII Art Demonstration:
 *
 * Let's say we want to reflink this source file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS (src file)
 *   <-------------------->
 *
 * into this destination file:
 *
 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
 *        <-------------------->
 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
 * Observe that the range has different logical offsets in either file.
 *
 * Consider that the first extent in the source file doesn't line up with our
 * reflink range.  Unmapping and remapping are separate operations, so we can
 * unmap more blocks from the destination file than we remap.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD---------DDDDD--DDD
 *        <------->
 *
 * Now remap the source extent into the destination file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD--SSSSSSSDDDDD--DDD
 *        <------->
 *
 * Do likewise with the second hole and extent in our range.  Holes in the
 * unmap range don't affect our operation.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *            <---->
 * --DDDDD--SSSSSSS-SSSSS-DDD
 *                 <---->
 *
 * Finally, unmap and remap part of the third extent.  This will increase the
 * size of the destination file.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *                  <----->
 * --DDDDD--SSSSSSS-SSSSS----SSS
 *                       <----->
 *
 * Once we update the destination file's i_size, we're done.
 */
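/*
 * Worked example of the punch/remap arithmetic above, with made-up
 * numbers: let srange start at src block 4 and drange at dest block
 * 100, and let the bmbt read return imap covering three blocks starting
 * at src block 6.  Then (imap start - srange start + imap len) =
 * 6 - 4 + 3 = 5, so five blocks are punched out of dest starting at
 * block 100; the three real blocks are mapped in at
 * (drange start + imap start - srange start) = 100 + 6 - 4 = 102; and
 * both drange and srange advance by 5 blocks.
 */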
/*
 * Ensure the reflink bit is set in both inodes.
 */
STATIC int
xfs_reflink_set_inode_flag(
	struct xfs_inode	*src,
	struct xfs_inode	*dest)
{
	struct xfs_mount	*mp = src->i_mount;
	int			error;
	struct xfs_trans	*tp;

	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	/* Lock both files against IO */
	if (src->i_ino == dest->i_ino)
		xfs_ilock(src, XFS_ILOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);

	if (!xfs_is_reflink_inode(src)) {
		trace_xfs_reflink_set_inode_flag(src);
		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
		xfs_ifork_init_cow(src);
	} else
		xfs_iunlock(src, XFS_ILOCK_EXCL);

	if (src->i_ino == dest->i_ino)
		goto commit_flags;

	if (!xfs_is_reflink_inode(dest)) {
		trace_xfs_reflink_set_inode_flag(dest);
		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
		xfs_ifork_init_cow(dest);
	} else
		xfs_iunlock(dest, XFS_ILOCK_EXCL);

commit_flags:
	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
	return error;
}

/*
 * Update destination inode size & cowextsize hint, if necessary.
 */
int
xfs_reflink_update_dest(
	struct xfs_inode	*dest,
	xfs_off_t		newlen,
	xfs_extlen_t		cowextsize,
	unsigned int		remap_flags)
{
	struct xfs_mount	*mp = dest->i_mount;
	struct xfs_trans	*tp;
	int			error;

	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	xfs_ilock(dest, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);

	if (newlen > i_size_read(VFS_I(dest))) {
		trace_xfs_reflink_update_inode_size(dest, newlen);
		i_size_write(VFS_I(dest), newlen);
		dest->i_d.di_size = newlen;
	}

	if (cowextsize) {
		dest->i_d.di_cowextsize = cowextsize;
		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
	}

	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
	return error;
}
/*
 * Do we have enough reserve in this AG to handle a reflink?  The refcount
 * btree already reserved all the space it needs, but the rmap btree can grow
 * infinitely, so we won't allow more reflinks when the AG is down to the
 * btree reserves.
 */
static int
xfs_reflink_ag_has_free_space(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno)
{
	struct xfs_perag	*pag;
	int			error = 0;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return 0;

	pag = xfs_perag_get(mp, agno);
	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
		error = -ENOSPC;
	xfs_perag_put(pag);
	return error;
}

/*
 * Unmap a range of blocks from a file, then map other blocks into the hole.
 * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
 * The extent irec is mapped into dest at irec->br_startoff.
 */
STATIC int
xfs_reflink_remap_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	xfs_fileoff_t		destoff,
	xfs_off_t		new_isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			real_extent = xfs_bmap_is_real_extent(irec);
	struct xfs_trans	*tp;
	unsigned int		resblks;
	struct xfs_bmbt_irec	uirec;
	xfs_filblks_t		rlen;
	xfs_filblks_t		unmap_len;
	xfs_off_t		newlen;
	int			error;

	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);

	/* No reflinking if we're low on space */
	if (real_extent) {
		error = xfs_reflink_ag_has_free_space(mp,
				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
		if (error)
			goto out;
	}

	/* Start a rolling transaction to switch the mappings */
	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error)
		goto out;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* If we're not just clearing space, then do we have enough quota? */
	if (real_extent) {
		error = xfs_trans_reserve_quota_nblks(tp, ip,
				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto out_cancel;
	}

	trace_xfs_reflink_remap(ip, irec->br_startoff,
			irec->br_blockcount, irec->br_startblock);

	/* Unmap the old blocks in the data fork. */
	rlen = unmap_len;
	while (rlen) {
		ASSERT(tp->t_firstblock == NULLFSBLOCK);
		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
		if (error)
			goto out_cancel;

		/*
		 * Trim the extent to whatever got unmapped.
		 * Remember, bunmapi works backwards.
		 */
		uirec.br_startblock = irec->br_startblock + rlen;
		uirec.br_startoff = irec->br_startoff + rlen;
		uirec.br_blockcount = unmap_len - rlen;
		unmap_len = rlen;

		/* If this isn't a real mapping, we're done. */
		if (!real_extent || uirec.br_blockcount == 0)
			goto next_extent;

		trace_xfs_reflink_remap(ip, uirec.br_startoff,
				uirec.br_blockcount, uirec.br_startblock);

		/* Update the refcount tree */
		error = xfs_refcount_increase_extent(tp, &uirec);
		if (error)
			goto out_cancel;

		/* Map the new blocks into the data fork. */
		error = xfs_bmap_map_extent(tp, ip, &uirec);
		if (error)
			goto out_cancel;

		/* Update quota accounting. */
		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
				uirec.br_blockcount);

		/* Update dest isize if needed. */
		newlen = XFS_FSB_TO_B(mp,
				uirec.br_startoff + uirec.br_blockcount);
		newlen = min_t(xfs_off_t, newlen, new_isize);
		if (newlen > i_size_read(VFS_I(ip))) {
			trace_xfs_reflink_update_inode_size(ip, newlen);
			i_size_write(VFS_I(ip), newlen);
			ip->i_d.di_size = newlen;
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

next_extent:
		/* Process all the deferred stuff. */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		goto out;
	return 0;

out_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
	return error;
}
/*
 * Iteratively remap one file's extents (and holes) to another's.
 */
int
xfs_reflink_remap_blocks(
	struct xfs_inode	*src,
	loff_t			pos_in,
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			remap_len,
	loff_t			*remapped)
{
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		srcoff;
	xfs_fileoff_t		destoff;
	xfs_filblks_t		len;
	xfs_filblks_t		range_len;
	xfs_filblks_t		remapped_len = 0;
	xfs_off_t		new_isize = pos_out + remap_len;
	int			nimaps;
	int			error = 0;

	destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
	srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
	len = XFS_B_TO_FSB(src->i_mount, remap_len);

	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
	while (len) {
		uint		lock_mode;

		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
				dest, destoff);

		/* Read extent from the source file */
		nimaps = 1;
		lock_mode = xfs_ilock_data_map_shared(src);
		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
		xfs_iunlock(src, lock_mode);
		if (error)
			break;
		ASSERT(nimaps == 1);

		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
				&imap);

		/* Translate imap into the destination file. */
		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
		imap.br_startoff += destoff - srcoff;

		/* Clear dest from destoff to the end of imap and map it in. */
		error = xfs_reflink_remap_extent(dest, &imap, destoff,
				new_isize);
		if (error)
			break;

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Advance drange/srange */
		srcoff += range_len;
		destoff += range_len;
		len -= range_len;
		remapped_len += range_len;
	}

	if (error)
		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	*remapped = min_t(loff_t, remap_len,
			XFS_FSB_TO_B(src->i_mount, remapped_len));
	return error;
}

/*
 * Grab the exclusive iolock for a data copy from src to dest, making
 * sure to abide by the vfs locking order (lowest pointer value goes first)
 * and breaking the pnfs layout leases on dest before proceeding.  The loop
 * is needed because we cannot call the blocking break_layout() with the
 * src iolock held, and therefore have to back out both locks.
 */
static int
xfs_iolock_two_inodes_and_break_layout(
	struct inode		*src,
	struct inode		*dest)
{
	int			error;

retry:
	if (src < dest) {
		inode_lock_shared(src);
		inode_lock_nested(dest, I_MUTEX_NONDIR2);
	} else {
		/* src >= dest */
		inode_lock(dest);
	}

	error = break_layout(dest, false);
	if (error == -EWOULDBLOCK) {
		inode_unlock(dest);
		if (src < dest)
			inode_unlock_shared(src);
		error = break_layout(dest, true);
		if (error)
			return error;
		goto retry;
	}
	if (error) {
		inode_unlock(dest);
		if (src < dest)
			inode_unlock_shared(src);
		return error;
	}
	if (src > dest)
		inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
	return 0;
}
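/*
 * Concrete example of the ordering rule above (illustrative): for two
 * distinct inodes at addresses A < B, a clone from A into B takes
 * inode_lock_shared(A) and then inode_lock_nested(B, I_MUTEX_NONDIR2),
 * while a clone from B into A takes inode_lock(A) first and only
 * acquires inode_lock_shared_nested(B, I_MUTEX_NONDIR2) after the
 * layout lease on A has been broken.  Either way the inode with the
 * lowest pointer value is locked first.
 */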
/* Unlock both inodes after they've been prepped for a range clone. */
void
xfs_reflink_remap_unlock(
	struct file		*file_in,
	struct file		*file_out)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	bool			same_inode = (inode_in == inode_out);

	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
	if (!same_inode)
		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
	inode_unlock(inode_out);
	if (!same_inode)
		inode_unlock_shared(inode_in);
}

/*
 * If we're reflinking to a point past the destination file's EOF, we must
 * zero any speculative post-EOF preallocations that sit between the old EOF
 * and the destination file offset.
 */
static int
xfs_reflink_zero_posteof(
	struct xfs_inode	*ip,
	loff_t			pos)
{
	loff_t			isize = i_size_read(VFS_I(ip));

	if (pos <= isize)
		return 0;

	trace_xfs_zero_eof(ip, isize, pos - isize);
	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
			&xfs_iomap_ops);
}

/*
 * Prepare two files for range cloning.  Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken.  This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it
 * hasn't checked that the bytes beyond EOF physically match.  Hence we cannot
 * use the EOF block in the source dedupe range because it's not a complete
 * block match, hence can introduce a corruption into the file that has its
 * block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to
 * be "block aligned" for the purposes of cloning entire files.  However, if
 * the source file range includes the EOF block and it lands within the
 * existing EOF of the destination file, then we can expose stale data from
 * beyond the source file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have to
 * check these cases ourselves.  For dedupe, we can simply round the length
 * to dedupe down to the previous whole block and ignore the partial EOF
 * block.  While this means we can't dedupe the last block of a file, this is
 * an acceptable tradeoff for simplicity of implementation.
 *
 * For cloning, we want to share the partial EOF block if it is also the new
 * EOF block of the destination file.  If the partial EOF block lies inside
 * the existing destination EOF, then we have to abort the clone to avoid
 * exposing stale data in the destination file.  Hence we reject these clone
 * attempts with -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	bool			same_inode = (inode_in == inode_out);
	ssize_t			ret;

	/* Lock both files against IO */
	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
	if (ret)
		return ret;
	if (same_inode)
		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
				XFS_MMAPLOCK_EXCL);

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data for now. */
	if (IS_DAX(inode_in) || IS_DAX(inode_out))
		goto out_unlock;

	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
			len, remap_flags);
	if (ret < 0 || *len == 0)
		goto out_unlock;

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out.  In that case, we need to extend the flush and unmap to
	 * cover from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	return 1;
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	return ret;
}
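/*
 * How the remap helpers in this file compose (simplified sketch; the
 * actual sequencing lives in the caller of these functions, and dedupe
 * comparison, flag checks, and cowextsize selection are all elided
 * here):
 *
 *	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
 *			&len, remap_flags);
 *	if (ret <= 0)
 *		return ret;
 *	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
 *			&remapped);
 *	if (!ret)
 *		ret = xfs_reflink_update_dest(dest, pos_out + remapped,
 *				cowextsize, remap_flags);
 *	xfs_reflink_remap_unlock(file_in, file_out);
 */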
/*
 * The user wants to preemptively CoW all shared blocks in this file,
 * which enables us to turn off the reflink flag.  Iterate all
 * extents which are not prealloc/delalloc to see which ranges are
 * mentioned in the refcount tree, then read those blocks into the
 * pagecache, dirty them, fsync them back out, and then we can update
 * the inode flag.  What happens if we run out of memory? :)
 */
STATIC int
xfs_reflink_dirty_extents(
	struct xfs_inode	*ip,
	xfs_fileoff_t		fbno,
	xfs_filblks_t		end,
	xfs_off_t		isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		rbno;
	xfs_extlen_t		rlen;
	xfs_off_t		fpos;
	xfs_off_t		flen;
	struct xfs_bmbt_irec	map[2];
	int			nmaps;
	int			error = 0;

	while (end - fbno > 0) {
		nmaps = 1;
		/*
		 * Look for extents in the file.  Skip holes, delalloc, or
		 * unwritten extents; they can't be reflinked.
		 */
		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
		if (error)
			goto out;
		if (nmaps == 0)
			break;
		if (!xfs_bmap_is_real_extent(&map[0]))
			goto next;

		map[1] = map[0];
		while (map[1].br_blockcount) {
			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
			aglen = map[1].br_blockcount;

			error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
					aglen, &rbno, &rlen, true);
			if (error)
				goto out;
			if (rbno == NULLAGBLOCK)
				break;

			/* Dirty the pages */
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
					(rbno - agbno));
			flen = XFS_FSB_TO_B(mp, rlen);
			if (fpos + flen > isize)
				flen = isize - fpos;
			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
					&xfs_iomap_ops);
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			if (error)
				goto out;

			map[1].br_blockcount -= (rbno - agbno + rlen);
			map[1].br_startoff += (rbno - agbno + rlen);
			map[1].br_startblock += (rbno - agbno + rlen);
		}

next:
		fbno = map[0].br_startoff + map[0].br_blockcount;
	}
out:
	return error;
}

/* Does this inode need the reflink flag? */
int
xfs_reflink_inode_has_shared_extents(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	bool			*has_shared)
{
	struct xfs_bmbt_irec	got;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp;
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		rbno;
	xfs_extlen_t		rlen;
	struct xfs_iext_cursor	icur;
	bool			found;
	int			error;

	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
		if (error)
			return error;
	}

	*has_shared = false;
	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
	while (found) {
		if (isnullstartblock(got.br_startblock) ||
		    got.br_state != XFS_EXT_NORM)
			goto next;
		agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
		aglen = got.br_blockcount;

		error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				&rbno, &rlen, false);
		if (error)
			return error;
		/* Is there still a shared block here? */
		if (rbno != NULLAGBLOCK) {
			*has_shared = true;
			return 0;
		}
next:
		found = xfs_iext_next_extent(ifp, &icur, &got);
	}

	return 0;
}
/*
 * Clear the inode reflink flag if there are no shared extents.
 *
 * The caller is responsible for joining the inode to the transaction passed
 * in.  The inode will be joined to the transaction that is returned to the
 * caller.
 */
int
xfs_reflink_clear_inode_flag(
	struct xfs_inode	*ip,
	struct xfs_trans	**tpp)
{
	bool			needs_flag;
	int			error = 0;

	ASSERT(xfs_is_reflink_inode(ip));

	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
	if (error || needs_flag)
		return error;

	/*
	 * We didn't find any shared blocks so turn off the reflink flag.
	 * First, get rid of any leftover CoW mappings.
	 */
	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
	if (error)
		return error;

	/* Clear the inode flag. */
	trace_xfs_reflink_unset_inode_flag(ip);
	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_inode_clear_cowblocks_tag(ip);
	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

	return error;
}

/*
 * Clear the inode reflink flag if there are no shared extents and the size
 * hasn't changed.
 */
STATIC int
xfs_reflink_try_clear_inode_flag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error = 0;

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_reflink_clear_inode_flag(ip, &tp);
	if (error)
		goto cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
cancel:
	xfs_trans_cancel(tp);
out:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * Pre-COW all shared blocks within a given byte range of a file and turn off
 * the reflink flag if we unshare all of the file's blocks.
 */
int
xfs_reflink_unshare(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		fbno;
	xfs_filblks_t		end;
	xfs_off_t		isize;
	int			error;

	if (!xfs_is_reflink_inode(ip))
		return 0;

	trace_xfs_reflink_unshare(ip, offset, len);

	inode_dio_wait(VFS_I(ip));

	/* Try to CoW the selected ranges */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	fbno = XFS_B_TO_FSBT(mp, offset);
	isize = i_size_read(VFS_I(ip));
	end = XFS_B_TO_FSB(mp, offset + len);
	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
	if (error)
		goto out_unlock;
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/* Wait for the IO to finish */
	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	if (error)
		goto out;

	/* Turn off the reflink flag if possible. */
	error = xfs_reflink_try_clear_inode_flag(ip);
	if (error)
		goto out;

	return 0;

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
	return error;
}