1 /* 2 * Copyright (C) 2016 Oracle. All Rights Reserved. 3 * 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 2 9 * of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it would be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 #include "xfs.h" 21 #include "xfs_fs.h" 22 #include "xfs_shared.h" 23 #include "xfs_format.h" 24 #include "xfs_log_format.h" 25 #include "xfs_trans_resv.h" 26 #include "xfs_mount.h" 27 #include "xfs_defer.h" 28 #include "xfs_da_format.h" 29 #include "xfs_da_btree.h" 30 #include "xfs_inode.h" 31 #include "xfs_trans.h" 32 #include "xfs_inode_item.h" 33 #include "xfs_bmap.h" 34 #include "xfs_bmap_util.h" 35 #include "xfs_error.h" 36 #include "xfs_dir2.h" 37 #include "xfs_dir2_priv.h" 38 #include "xfs_ioctl.h" 39 #include "xfs_trace.h" 40 #include "xfs_log.h" 41 #include "xfs_icache.h" 42 #include "xfs_pnfs.h" 43 #include "xfs_btree.h" 44 #include "xfs_refcount_btree.h" 45 #include "xfs_refcount.h" 46 #include "xfs_bmap_btree.h" 47 #include "xfs_trans_space.h" 48 #include "xfs_bit.h" 49 #include "xfs_alloc.h" 50 #include "xfs_quota_defs.h" 51 #include "xfs_quota.h" 52 #include "xfs_btree.h" 53 #include "xfs_bmap_btree.h" 54 #include "xfs_reflink.h" 55 #include "xfs_iomap.h" 56 #include "xfs_rmap_btree.h" 57 #include "xfs_sb.h" 58 #include "xfs_ag_resv.h" 59 60 /* 61 * Copy on Write of Shared Blocks 62 * 63 * XFS must 
preserve "the usual" file semantics even when two files share 64 * the same physical blocks. This means that a write to one file must not 65 * alter the blocks in a different file; the way that we'll do that is 66 * through the use of a copy-on-write mechanism. At a high level, that 67 * means that when we want to write to a shared block, we allocate a new 68 * block, write the data to the new block, and if that succeeds we map the 69 * new block into the file. 70 * 71 * XFS provides a "delayed allocation" mechanism that defers the allocation 72 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as 73 * possible. This reduces fragmentation by enabling the filesystem to ask 74 * for bigger chunks less often, which is exactly what we want for CoW. 75 * 76 * The delalloc mechanism begins when the kernel wants to make a block 77 * writable (write_begin or page_mkwrite). If the offset is not mapped, we 78 * create a delalloc mapping, which is a regular in-core extent, but without 79 * a real startblock. (For delalloc mappings, the startblock encodes both 80 * a flag that this is a delalloc mapping, and a worst-case estimate of how 81 * many blocks might be required to put the mapping into the BMBT.) delalloc 82 * mappings are a reservation against the free space in the filesystem; 83 * adjacent mappings can also be combined into fewer larger mappings. 84 * 85 * When dirty pages are being written out (typically in writepage), the 86 * delalloc reservations are converted into real mappings by allocating 87 * blocks and replacing the delalloc mapping with real ones. A delalloc 88 * mapping can be replaced by several real ones if the free space is 89 * fragmented. 90 * 91 * We want to adapt the delalloc mechanism for copy-on-write, since the 92 * write paths are similar. 
The first two steps (creating the reservation 93 * and allocating the blocks) are exactly the same as delalloc except that 94 * the mappings must be stored in a separate CoW fork because we do not want 95 * to disturb the mapping in the data fork until we're sure that the write 96 * succeeded. IO completion in this case is the process of removing the old 97 * mapping from the data fork and moving the new mapping from the CoW fork to 98 * the data fork. This will be discussed shortly. 99 * 100 * For now, unaligned directio writes will be bounced back to the page cache. 101 * Block-aligned directio writes will use the same mechanism as buffered 102 * writes. 103 * 104 * CoW remapping must be done after the data block write completes, 105 * because we don't want to destroy the old data fork map until we're sure 106 * the new block has been written. Since the new mappings are kept in a 107 * separate fork, we can simply iterate these mappings to find the ones 108 * that cover the file blocks that we just CoW'd. For each extent, simply 109 * unmap the corresponding range in the data fork, map the new range into 110 * the data fork, and remove the extent from the CoW fork. 111 * 112 * Since the remapping operation can be applied to an arbitrary file 113 * range, we record the need for the remap step as a flag in the ioend 114 * instead of declaring a new IO type. This is required for direct io 115 * because we only have ioend for the whole dio, and we have to be able to 116 * remember the presence of unwritten blocks and CoW blocks with a single 117 * ioend structure. Better yet, the more ground we can cover with one 118 * ioend, the better. 119 */ 120 121 /* 122 * Given an AG extent, find the lowest-numbered run of shared blocks 123 * within that range and return the range in fbno/flen. If 124 * find_end_of_shared is true, return the longest contiguous extent of 125 * shared blocks. 
If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_btree_cur	*rcur;
	struct xfs_buf		*agf_bp;
	int			error;

	/* The refcount btree cursor is built on top of this AG's AGF buffer. */
	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
	if (error)
		return error;

	/* Ask the refcount btree for the shared run inside the range. */
	rcur = xfs_refcountbt_init_cursor(mp, NULL, agf_bp, agno, NULL);
	error = xfs_refcount_find_shared(rcur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	/* Tear the cursor down according to how the lookup went. */
	if (error)
		xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
	else
		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);

	/* The AGF buffer is released on both the success and error paths. */
	xfs_buf_relse(agf_bp);
	return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
166 */ 167 int 168 xfs_reflink_trim_around_shared( 169 struct xfs_inode *ip, 170 struct xfs_bmbt_irec *irec, 171 bool *shared, 172 bool *trimmed) 173 { 174 xfs_agnumber_t agno; 175 xfs_agblock_t agbno; 176 xfs_extlen_t aglen; 177 xfs_agblock_t fbno; 178 xfs_extlen_t flen; 179 int error = 0; 180 181 /* Holes, unwritten, and delalloc extents cannot be shared */ 182 if (!xfs_is_reflink_inode(ip) || 183 ISUNWRITTEN(irec) || 184 irec->br_startblock == HOLESTARTBLOCK || 185 irec->br_startblock == DELAYSTARTBLOCK) { 186 *shared = false; 187 return 0; 188 } 189 190 trace_xfs_reflink_trim_around_shared(ip, irec); 191 192 agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); 193 agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); 194 aglen = irec->br_blockcount; 195 196 error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, 197 aglen, &fbno, &flen, true); 198 if (error) 199 return error; 200 201 *shared = *trimmed = false; 202 if (fbno == NULLAGBLOCK) { 203 /* No shared blocks at all. */ 204 return 0; 205 } else if (fbno == agbno) { 206 /* 207 * The start of this extent is shared. Truncate the 208 * mapping at the end of the shared region so that a 209 * subsequent iteration starts at the start of the 210 * unshared region. 211 */ 212 irec->br_blockcount = flen; 213 *shared = true; 214 if (flen != aglen) 215 *trimmed = true; 216 return 0; 217 } else { 218 /* 219 * There's a shared extent midway through this extent. 220 * Truncate the mapping at the start of the shared 221 * extent so that a subsequent iteration starts at the 222 * start of the shared region. 223 */ 224 irec->br_blockcount = fbno - agbno; 225 *trimmed = true; 226 return 0; 227 } 228 } 229 230 /* Create a CoW reservation for a range of blocks within a file. 
*/ 231 static int 232 __xfs_reflink_reserve_cow( 233 struct xfs_inode *ip, 234 xfs_fileoff_t *offset_fsb, 235 xfs_fileoff_t end_fsb, 236 bool *skipped) 237 { 238 struct xfs_bmbt_irec got, prev, imap; 239 xfs_fileoff_t orig_end_fsb; 240 int nimaps, eof = 0, error = 0; 241 bool shared = false, trimmed = false; 242 xfs_extnum_t idx; 243 xfs_extlen_t align; 244 245 /* Already reserved? Skip the refcount btree access. */ 246 xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, 247 &got, &prev); 248 if (!eof && got.br_startoff <= *offset_fsb) { 249 end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; 250 trace_xfs_reflink_cow_found(ip, &got); 251 goto done; 252 } 253 254 /* Read extent from the source file. */ 255 nimaps = 1; 256 error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, 257 &imap, &nimaps, 0); 258 if (error) 259 goto out_unlock; 260 ASSERT(nimaps == 1); 261 262 /* Trim the mapping to the nearest shared extent boundary. */ 263 error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); 264 if (error) 265 goto out_unlock; 266 267 end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; 268 269 /* Not shared? Just report the (potentially capped) extent. */ 270 if (!shared) { 271 *skipped = true; 272 goto done; 273 } 274 275 /* 276 * Fork all the shared blocks from our write offset until the end of 277 * the extent. 
278 */ 279 error = xfs_qm_dqattach_locked(ip, 0); 280 if (error) 281 goto out_unlock; 282 283 align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); 284 if (align) 285 end_fsb = roundup_64(end_fsb, align); 286 287 retry: 288 error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, 289 end_fsb - *offset_fsb, &got, 290 &prev, &idx, eof); 291 switch (error) { 292 case 0: 293 break; 294 case -ENOSPC: 295 case -EDQUOT: 296 /* retry without any preallocation */ 297 trace_xfs_reflink_cow_enospc(ip, &imap); 298 if (end_fsb != orig_end_fsb) { 299 end_fsb = orig_end_fsb; 300 goto retry; 301 } 302 /*FALLTHRU*/ 303 default: 304 goto out_unlock; 305 } 306 307 if (end_fsb != orig_end_fsb) 308 xfs_inode_set_cowblocks_tag(ip); 309 310 trace_xfs_reflink_cow_alloc(ip, &got); 311 done: 312 *offset_fsb = end_fsb; 313 out_unlock: 314 return error; 315 } 316 317 /* Create a CoW reservation for part of a file. */ 318 int 319 xfs_reflink_reserve_cow_range( 320 struct xfs_inode *ip, 321 xfs_off_t offset, 322 xfs_off_t count) 323 { 324 struct xfs_mount *mp = ip->i_mount; 325 xfs_fileoff_t offset_fsb, end_fsb; 326 bool skipped = false; 327 int error; 328 329 trace_xfs_reflink_reserve_cow_range(ip, offset, count); 330 331 offset_fsb = XFS_B_TO_FSBT(mp, offset); 332 end_fsb = XFS_B_TO_FSB(mp, offset + count); 333 334 xfs_ilock(ip, XFS_ILOCK_EXCL); 335 while (offset_fsb < end_fsb) { 336 error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, 337 &skipped); 338 if (error) { 339 trace_xfs_reflink_reserve_cow_range_error(ip, error, 340 _RET_IP_); 341 break; 342 } 343 } 344 xfs_iunlock(ip, XFS_ILOCK_EXCL); 345 346 return error; 347 } 348 349 /* Allocate all CoW reservations covering a range of blocks in a file. 
*/
/*
 * One step of the allocation walk: reserve (or find) the CoW extent at
 * *offset_fsb and convert it to a real allocation inside its own
 * transaction.  On success *offset_fsb is advanced past what was handled.
 */
static int
__xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*offset_fsb,
	xfs_fileoff_t		end_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_defer_ops	dfops;
	struct xfs_bmbt_irec	got;
	struct xfs_trans	*tp;
	xfs_fsblock_t		firstfsb;
	xfs_fileoff_t		resv_end;
	xfs_fileoff_t		mapped_end;
	bool			skipped = false;
	int			nmaps = 1;
	int			error;

	xfs_defer_init(&dfops, &firstfsb);

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
			XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	/* Make sure there is a reservation to convert. */
	resv_end = *offset_fsb;
	error = __xfs_reflink_reserve_cow(ip, &resv_end, end_fsb, &skipped);
	if (error)
		goto cancel;

	/* Unshared extent: nothing to allocate, just step over it. */
	if (skipped) {
		*offset_fsb = resv_end;
		goto cancel;
	}

	xfs_trans_ijoin(tp, ip, 0);
	error = xfs_bmapi_write(tp, ip, *offset_fsb, resv_end - *offset_fsb,
			XFS_BMAPI_COWFORK, &firstfsb,
			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
			&got, &nmaps, &dfops);
	if (error)
		goto cancel;

	/* We might not have been able to map the whole delalloc extent */
	mapped_end = *offset_fsb + got.br_blockcount;
	*offset_fsb = min(mapped_end, resv_end);

	error = xfs_defer_finish(&tp, &dfops, NULL);
	if (error)
		goto cancel;

	error = xfs_trans_commit(tp);
	goto unlock;

cancel:
	xfs_defer_cancel(&dfops);
	xfs_trans_cancel(tp);
unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/* Allocate all CoW reservations covering a part of a file.
*/ 411 int 412 xfs_reflink_allocate_cow_range( 413 struct xfs_inode *ip, 414 xfs_off_t offset, 415 xfs_off_t count) 416 { 417 struct xfs_mount *mp = ip->i_mount; 418 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 419 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 420 int error; 421 422 ASSERT(xfs_is_reflink_inode(ip)); 423 424 trace_xfs_reflink_allocate_cow_range(ip, offset, count); 425 426 /* 427 * Make sure that the dquots are there. 428 */ 429 error = xfs_qm_dqattach(ip, 0); 430 if (error) 431 return error; 432 433 while (offset_fsb < end_fsb) { 434 error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); 435 if (error) { 436 trace_xfs_reflink_allocate_cow_range_error(ip, error, 437 _RET_IP_); 438 break; 439 } 440 } 441 442 return error; 443 } 444 445 /* 446 * Find the CoW reservation (and whether or not it needs block allocation) 447 * for a given byte offset of a file. 448 */ 449 bool 450 xfs_reflink_find_cow_mapping( 451 struct xfs_inode *ip, 452 xfs_off_t offset, 453 struct xfs_bmbt_irec *imap, 454 bool *need_alloc) 455 { 456 struct xfs_bmbt_irec irec; 457 struct xfs_ifork *ifp; 458 struct xfs_bmbt_rec_host *gotp; 459 xfs_fileoff_t bno; 460 xfs_extnum_t idx; 461 462 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); 463 ASSERT(xfs_is_reflink_inode(ip)); 464 465 /* Find the extent in the CoW fork. */ 466 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 467 bno = XFS_B_TO_FSBT(ip->i_mount, offset); 468 gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); 469 if (!gotp) 470 return false; 471 472 xfs_bmbt_get_all(gotp, &irec); 473 if (bno >= irec.br_startoff + irec.br_blockcount || 474 bno < irec.br_startoff) 475 return false; 476 477 trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, 478 &irec); 479 480 /* If it's still delalloc, we must allocate later. 
*/ 481 *imap = irec; 482 *need_alloc = !!(isnullstartblock(irec.br_startblock)); 483 484 return true; 485 } 486 487 /* 488 * Trim an extent to end at the next CoW reservation past offset_fsb. 489 */ 490 int 491 xfs_reflink_trim_irec_to_next_cow( 492 struct xfs_inode *ip, 493 xfs_fileoff_t offset_fsb, 494 struct xfs_bmbt_irec *imap) 495 { 496 struct xfs_bmbt_irec irec; 497 struct xfs_ifork *ifp; 498 struct xfs_bmbt_rec_host *gotp; 499 xfs_extnum_t idx; 500 501 if (!xfs_is_reflink_inode(ip)) 502 return 0; 503 504 /* Find the extent in the CoW fork. */ 505 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 506 gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); 507 if (!gotp) 508 return 0; 509 xfs_bmbt_get_all(gotp, &irec); 510 511 /* This is the extent before; try sliding up one. */ 512 if (irec.br_startoff < offset_fsb) { 513 idx++; 514 if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) 515 return 0; 516 gotp = xfs_iext_get_ext(ifp, idx); 517 xfs_bmbt_get_all(gotp, &irec); 518 } 519 520 if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) 521 return 0; 522 523 imap->br_blockcount = irec.br_startoff - imap->br_startoff; 524 trace_xfs_reflink_trim_irec(ip, imap); 525 526 return 0; 527 } 528 529 /* 530 * Cancel all pending CoW reservations for some block range of an inode. 531 */ 532 int 533 xfs_reflink_cancel_cow_blocks( 534 struct xfs_inode *ip, 535 struct xfs_trans **tpp, 536 xfs_fileoff_t offset_fsb, 537 xfs_fileoff_t end_fsb) 538 { 539 struct xfs_bmbt_irec irec; 540 xfs_filblks_t count_fsb; 541 xfs_fsblock_t firstfsb; 542 struct xfs_defer_ops dfops; 543 int error = 0; 544 int nimaps; 545 546 if (!xfs_is_reflink_inode(ip)) 547 return 0; 548 549 /* Go find the old extent in the CoW fork. 
*/ 550 while (offset_fsb < end_fsb) { 551 nimaps = 1; 552 count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); 553 error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, 554 &nimaps, XFS_BMAPI_COWFORK); 555 if (error) 556 break; 557 ASSERT(nimaps == 1); 558 559 trace_xfs_reflink_cancel_cow(ip, &irec); 560 561 if (irec.br_startblock == DELAYSTARTBLOCK) { 562 /* Free a delayed allocation. */ 563 xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, 564 false); 565 ip->i_delayed_blks -= irec.br_blockcount; 566 567 /* Remove the mapping from the CoW fork. */ 568 error = xfs_bunmapi_cow(ip, &irec); 569 if (error) 570 break; 571 } else if (irec.br_startblock == HOLESTARTBLOCK) { 572 /* empty */ 573 } else { 574 xfs_trans_ijoin(*tpp, ip, 0); 575 xfs_defer_init(&dfops, &firstfsb); 576 577 /* Free the CoW orphan record. */ 578 error = xfs_refcount_free_cow_extent(ip->i_mount, 579 &dfops, irec.br_startblock, 580 irec.br_blockcount); 581 if (error) 582 break; 583 584 xfs_bmap_add_free(ip->i_mount, &dfops, 585 irec.br_startblock, irec.br_blockcount, 586 NULL); 587 588 /* Update quota accounting */ 589 xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, 590 -(long)irec.br_blockcount); 591 592 /* Roll the transaction */ 593 error = xfs_defer_finish(tpp, &dfops, ip); 594 if (error) { 595 xfs_defer_cancel(&dfops); 596 break; 597 } 598 599 /* Remove the mapping from the CoW fork. */ 600 error = xfs_bunmapi_cow(ip, &irec); 601 if (error) 602 break; 603 } 604 605 /* Roll on... */ 606 offset_fsb = irec.br_startoff + irec.br_blockcount; 607 } 608 609 return error; 610 } 611 612 /* 613 * Cancel all pending CoW reservations for some byte range of an inode. 
614 */ 615 int 616 xfs_reflink_cancel_cow_range( 617 struct xfs_inode *ip, 618 xfs_off_t offset, 619 xfs_off_t count) 620 { 621 struct xfs_trans *tp; 622 xfs_fileoff_t offset_fsb; 623 xfs_fileoff_t end_fsb; 624 int error; 625 626 trace_xfs_reflink_cancel_cow_range(ip, offset, count); 627 ASSERT(xfs_is_reflink_inode(ip)); 628 629 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 630 if (count == NULLFILEOFF) 631 end_fsb = NULLFILEOFF; 632 else 633 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 634 635 /* Start a rolling transaction to remove the mappings */ 636 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 637 0, 0, 0, &tp); 638 if (error) 639 goto out; 640 641 xfs_ilock(ip, XFS_ILOCK_EXCL); 642 xfs_trans_ijoin(tp, ip, 0); 643 644 /* Scrape out the old CoW reservations */ 645 error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); 646 if (error) 647 goto out_cancel; 648 649 error = xfs_trans_commit(tp); 650 651 xfs_iunlock(ip, XFS_ILOCK_EXCL); 652 return error; 653 654 out_cancel: 655 xfs_trans_cancel(tp); 656 xfs_iunlock(ip, XFS_ILOCK_EXCL); 657 out: 658 trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); 659 return error; 660 } 661 662 /* 663 * Remap parts of a file's data fork after a successful CoW. 
664 */ 665 int 666 xfs_reflink_end_cow( 667 struct xfs_inode *ip, 668 xfs_off_t offset, 669 xfs_off_t count) 670 { 671 struct xfs_bmbt_irec irec; 672 struct xfs_bmbt_irec uirec; 673 struct xfs_trans *tp; 674 xfs_fileoff_t offset_fsb; 675 xfs_fileoff_t end_fsb; 676 xfs_filblks_t count_fsb; 677 xfs_fsblock_t firstfsb; 678 struct xfs_defer_ops dfops; 679 int error; 680 unsigned int resblks; 681 xfs_filblks_t ilen; 682 xfs_filblks_t rlen; 683 int nimaps; 684 685 trace_xfs_reflink_end_cow(ip, offset, count); 686 687 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 688 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 689 count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); 690 691 /* Start a rolling transaction to switch the mappings */ 692 resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); 693 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 694 resblks, 0, 0, &tp); 695 if (error) 696 goto out; 697 698 xfs_ilock(ip, XFS_ILOCK_EXCL); 699 xfs_trans_ijoin(tp, ip, 0); 700 701 /* Go find the old extent in the CoW fork. */ 702 while (offset_fsb < end_fsb) { 703 /* Read extent from the source file */ 704 nimaps = 1; 705 count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); 706 error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, 707 &nimaps, XFS_BMAPI_COWFORK); 708 if (error) 709 goto out_cancel; 710 ASSERT(nimaps == 1); 711 712 ASSERT(irec.br_startblock != DELAYSTARTBLOCK); 713 trace_xfs_reflink_cow_remap(ip, &irec); 714 715 /* 716 * We can have a hole in the CoW fork if part of a directio 717 * write is CoW but part of it isn't. 718 */ 719 rlen = ilen = irec.br_blockcount; 720 if (irec.br_startblock == HOLESTARTBLOCK) 721 goto next_extent; 722 723 /* Unmap the old blocks in the data fork. */ 724 while (rlen) { 725 xfs_defer_init(&dfops, &firstfsb); 726 error = __xfs_bunmapi(tp, ip, irec.br_startoff, 727 &rlen, 0, 1, &firstfsb, &dfops); 728 if (error) 729 goto out_defer; 730 731 /* 732 * Trim the extent to whatever got unmapped. 
733 * Remember, bunmapi works backwards. 734 */ 735 uirec.br_startblock = irec.br_startblock + rlen; 736 uirec.br_startoff = irec.br_startoff + rlen; 737 uirec.br_blockcount = irec.br_blockcount - rlen; 738 irec.br_blockcount = rlen; 739 trace_xfs_reflink_cow_remap_piece(ip, &uirec); 740 741 /* Free the CoW orphan record. */ 742 error = xfs_refcount_free_cow_extent(tp->t_mountp, 743 &dfops, uirec.br_startblock, 744 uirec.br_blockcount); 745 if (error) 746 goto out_defer; 747 748 /* Map the new blocks into the data fork. */ 749 error = xfs_bmap_map_extent(tp->t_mountp, &dfops, 750 ip, &uirec); 751 if (error) 752 goto out_defer; 753 754 /* Remove the mapping from the CoW fork. */ 755 error = xfs_bunmapi_cow(ip, &uirec); 756 if (error) 757 goto out_defer; 758 759 error = xfs_defer_finish(&tp, &dfops, ip); 760 if (error) 761 goto out_defer; 762 } 763 764 next_extent: 765 /* Roll on... */ 766 offset_fsb = irec.br_startoff + ilen; 767 } 768 769 error = xfs_trans_commit(tp); 770 xfs_iunlock(ip, XFS_ILOCK_EXCL); 771 if (error) 772 goto out; 773 return 0; 774 775 out_defer: 776 xfs_defer_cancel(&dfops); 777 out_cancel: 778 xfs_trans_cancel(tp); 779 xfs_iunlock(ip, XFS_ILOCK_EXCL); 780 out: 781 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); 782 return error; 783 } 784 785 /* 786 * Free leftover CoW reservations that didn't get cleaned out. 787 */ 788 int 789 xfs_reflink_recover_cow( 790 struct xfs_mount *mp) 791 { 792 xfs_agnumber_t agno; 793 int error = 0; 794 795 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 796 return 0; 797 798 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 799 error = xfs_refcount_recover_cow_leftovers(mp, agno); 800 if (error) 801 break; 802 } 803 804 return error; 805 } 806 807 /* 808 * Reflinking (Block) Ranges of Two Files Together 809 * 810 * First, ensure that the reflink flag is set on both inodes. The flag is an 811 * optimization to avoid unnecessary refcount btree lookups in the write path. 
812 * 813 * Now we can iteratively remap the range of extents (and holes) in src to the 814 * corresponding ranges in dest. Let drange and srange denote the ranges of 815 * logical blocks in dest and src touched by the reflink operation. 816 * 817 * While the length of drange is greater than zero, 818 * - Read src's bmbt at the start of srange ("imap") 819 * - If imap doesn't exist, make imap appear to start at the end of srange 820 * with zero length. 821 * - If imap starts before srange, advance imap to start at srange. 822 * - If imap goes beyond srange, truncate imap to end at the end of srange. 823 * - Punch (imap start - srange start + imap len) blocks from dest at 824 * offset (drange start). 825 * - If imap points to a real range of pblks, 826 * > Increase the refcount of the imap's pblks 827 * > Map imap's pblks into dest at the offset 828 * (drange start + imap start - srange start) 829 * - Advance drange and srange by (imap start - srange start + imap len) 830 * 831 * Finally, if the reflink made dest longer, update both the in-core and 832 * on-disk file sizes. 833 * 834 * ASCII Art Demonstration: 835 * 836 * Let's say we want to reflink this source file: 837 * 838 * ----SSSSSSS-SSSSS----SSSSSS (src file) 839 * <--------------------> 840 * 841 * into this destination file: 842 * 843 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) 844 * <--------------------> 845 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. 846 * Observe that the range has different logical offsets in either file. 847 * 848 * Consider that the first extent in the source file doesn't line up with our 849 * reflink range. Unmapping and remapping are separate operations, so we can 850 * unmap more blocks from the destination file than we remap. 
851 * 852 * ----SSSSSSS-SSSSS----SSSSSS 853 * <-------> 854 * --DDDDD---------DDDDD--DDD 855 * <-------> 856 * 857 * Now remap the source extent into the destination file: 858 * 859 * ----SSSSSSS-SSSSS----SSSSSS 860 * <-------> 861 * --DDDDD--SSSSSSSDDDDD--DDD 862 * <-------> 863 * 864 * Do likewise with the second hole and extent in our range. Holes in the 865 * unmap range don't affect our operation. 866 * 867 * ----SSSSSSS-SSSSS----SSSSSS 868 * <----> 869 * --DDDDD--SSSSSSS-SSSSS-DDD 870 * <----> 871 * 872 * Finally, unmap and remap part of the third extent. This will increase the 873 * size of the destination file. 874 * 875 * ----SSSSSSS-SSSSS----SSSSSS 876 * <-----> 877 * --DDDDD--SSSSSSS-SSSSS----SSS 878 * <-----> 879 * 880 * Once we update the destination file's i_size, we're done. 881 */ 882 883 /* 884 * Ensure the reflink bit is set in both inodes. 885 */ 886 STATIC int 887 xfs_reflink_set_inode_flag( 888 struct xfs_inode *src, 889 struct xfs_inode *dest) 890 { 891 struct xfs_mount *mp = src->i_mount; 892 int error; 893 struct xfs_trans *tp; 894 895 if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) 896 return 0; 897 898 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 899 if (error) 900 goto out_error; 901 902 /* Lock both files against IO */ 903 if (src->i_ino == dest->i_ino) 904 xfs_ilock(src, XFS_ILOCK_EXCL); 905 else 906 xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); 907 908 if (!xfs_is_reflink_inode(src)) { 909 trace_xfs_reflink_set_inode_flag(src); 910 xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); 911 src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 912 xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); 913 xfs_ifork_init_cow(src); 914 } else 915 xfs_iunlock(src, XFS_ILOCK_EXCL); 916 917 if (src->i_ino == dest->i_ino) 918 goto commit_flags; 919 920 if (!xfs_is_reflink_inode(dest)) { 921 trace_xfs_reflink_set_inode_flag(dest); 922 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 923 dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 924 
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 925 xfs_ifork_init_cow(dest); 926 } else 927 xfs_iunlock(dest, XFS_ILOCK_EXCL); 928 929 commit_flags: 930 error = xfs_trans_commit(tp); 931 if (error) 932 goto out_error; 933 return error; 934 935 out_error: 936 trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); 937 return error; 938 } 939 940 /* 941 * Update destination inode size & cowextsize hint, if necessary. 942 */ 943 STATIC int 944 xfs_reflink_update_dest( 945 struct xfs_inode *dest, 946 xfs_off_t newlen, 947 xfs_extlen_t cowextsize) 948 { 949 struct xfs_mount *mp = dest->i_mount; 950 struct xfs_trans *tp; 951 int error; 952 953 if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) 954 return 0; 955 956 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 957 if (error) 958 goto out_error; 959 960 xfs_ilock(dest, XFS_ILOCK_EXCL); 961 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 962 963 if (newlen > i_size_read(VFS_I(dest))) { 964 trace_xfs_reflink_update_inode_size(dest, newlen); 965 i_size_write(VFS_I(dest), newlen); 966 dest->i_d.di_size = newlen; 967 } 968 969 if (cowextsize) { 970 dest->i_d.di_cowextsize = cowextsize; 971 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; 972 } 973 974 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 975 976 error = xfs_trans_commit(tp); 977 if (error) 978 goto out_error; 979 return error; 980 981 out_error: 982 trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); 983 return error; 984 } 985 986 /* 987 * Do we have enough reserve in this AG to handle a reflink? The refcount 988 * btree already reserved all the space it needs, but the rmap btree can grow 989 * infinitely, so we won't allow more reflinks when the AG is down to the 990 * btree reserves. 
991 */ 992 static int 993 xfs_reflink_ag_has_free_space( 994 struct xfs_mount *mp, 995 xfs_agnumber_t agno) 996 { 997 struct xfs_perag *pag; 998 int error = 0; 999 1000 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 1001 return 0; 1002 1003 pag = xfs_perag_get(mp, agno); 1004 if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || 1005 xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) 1006 error = -ENOSPC; 1007 xfs_perag_put(pag); 1008 return error; 1009 } 1010 1011 /* 1012 * Unmap a range of blocks from a file, then map other blocks into the hole. 1013 * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). 1014 * The extent irec is mapped into dest at irec->br_startoff. 1015 */ 1016 STATIC int 1017 xfs_reflink_remap_extent( 1018 struct xfs_inode *ip, 1019 struct xfs_bmbt_irec *irec, 1020 xfs_fileoff_t destoff, 1021 xfs_off_t new_isize) 1022 { 1023 struct xfs_mount *mp = ip->i_mount; 1024 struct xfs_trans *tp; 1025 xfs_fsblock_t firstfsb; 1026 unsigned int resblks; 1027 struct xfs_defer_ops dfops; 1028 struct xfs_bmbt_irec uirec; 1029 bool real_extent; 1030 xfs_filblks_t rlen; 1031 xfs_filblks_t unmap_len; 1032 xfs_off_t newlen; 1033 int error; 1034 1035 unmap_len = irec->br_startoff + irec->br_blockcount - destoff; 1036 trace_xfs_reflink_punch_range(ip, destoff, unmap_len); 1037 1038 /* Only remap normal extents. 
*/ 1039 real_extent = (irec->br_startblock != HOLESTARTBLOCK && 1040 irec->br_startblock != DELAYSTARTBLOCK && 1041 !ISUNWRITTEN(irec)); 1042 1043 /* No reflinking if we're low on space */ 1044 if (real_extent) { 1045 error = xfs_reflink_ag_has_free_space(mp, 1046 XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 1047 if (error) 1048 goto out; 1049 } 1050 1051 /* Start a rolling transaction to switch the mappings */ 1052 resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); 1053 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); 1054 if (error) 1055 goto out; 1056 1057 xfs_ilock(ip, XFS_ILOCK_EXCL); 1058 xfs_trans_ijoin(tp, ip, 0); 1059 1060 /* If we're not just clearing space, then do we have enough quota? */ 1061 if (real_extent) { 1062 error = xfs_trans_reserve_quota_nblks(tp, ip, 1063 irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); 1064 if (error) 1065 goto out_cancel; 1066 } 1067 1068 trace_xfs_reflink_remap(ip, irec->br_startoff, 1069 irec->br_blockcount, irec->br_startblock); 1070 1071 /* Unmap the old blocks in the data fork. */ 1072 rlen = unmap_len; 1073 while (rlen) { 1074 xfs_defer_init(&dfops, &firstfsb); 1075 error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, 1076 &firstfsb, &dfops); 1077 if (error) 1078 goto out_defer; 1079 1080 /* 1081 * Trim the extent to whatever got unmapped. 1082 * Remember, bunmapi works backwards. 1083 */ 1084 uirec.br_startblock = irec->br_startblock + rlen; 1085 uirec.br_startoff = irec->br_startoff + rlen; 1086 uirec.br_blockcount = unmap_len - rlen; 1087 unmap_len = rlen; 1088 1089 /* If this isn't a real mapping, we're done. */ 1090 if (!real_extent || uirec.br_blockcount == 0) 1091 goto next_extent; 1092 1093 trace_xfs_reflink_remap(ip, uirec.br_startoff, 1094 uirec.br_blockcount, uirec.br_startblock); 1095 1096 /* Update the refcount tree */ 1097 error = xfs_refcount_increase_extent(mp, &dfops, &uirec); 1098 if (error) 1099 goto out_defer; 1100 1101 /* Map the new blocks into the data fork. 
*/ 1102 error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); 1103 if (error) 1104 goto out_defer; 1105 1106 /* Update quota accounting. */ 1107 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1108 uirec.br_blockcount); 1109 1110 /* Update dest isize if needed. */ 1111 newlen = XFS_FSB_TO_B(mp, 1112 uirec.br_startoff + uirec.br_blockcount); 1113 newlen = min_t(xfs_off_t, newlen, new_isize); 1114 if (newlen > i_size_read(VFS_I(ip))) { 1115 trace_xfs_reflink_update_inode_size(ip, newlen); 1116 i_size_write(VFS_I(ip), newlen); 1117 ip->i_d.di_size = newlen; 1118 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1119 } 1120 1121 next_extent: 1122 /* Process all the deferred stuff. */ 1123 error = xfs_defer_finish(&tp, &dfops, ip); 1124 if (error) 1125 goto out_defer; 1126 } 1127 1128 error = xfs_trans_commit(tp); 1129 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1130 if (error) 1131 goto out; 1132 return 0; 1133 1134 out_defer: 1135 xfs_defer_cancel(&dfops); 1136 out_cancel: 1137 xfs_trans_cancel(tp); 1138 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1139 out: 1140 trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); 1141 return error; 1142 } 1143 1144 /* 1145 * Iteratively remap one file's extents (and holes) to another's. 
1146 */ 1147 STATIC int 1148 xfs_reflink_remap_blocks( 1149 struct xfs_inode *src, 1150 xfs_fileoff_t srcoff, 1151 struct xfs_inode *dest, 1152 xfs_fileoff_t destoff, 1153 xfs_filblks_t len, 1154 xfs_off_t new_isize) 1155 { 1156 struct xfs_bmbt_irec imap; 1157 int nimaps; 1158 int error = 0; 1159 xfs_filblks_t range_len; 1160 1161 /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ 1162 while (len) { 1163 trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, 1164 dest, destoff); 1165 /* Read extent from the source file */ 1166 nimaps = 1; 1167 xfs_ilock(src, XFS_ILOCK_EXCL); 1168 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); 1169 xfs_iunlock(src, XFS_ILOCK_EXCL); 1170 if (error) 1171 goto err; 1172 ASSERT(nimaps == 1); 1173 1174 trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, 1175 &imap); 1176 1177 /* Translate imap into the destination file. */ 1178 range_len = imap.br_startoff + imap.br_blockcount - srcoff; 1179 imap.br_startoff += destoff - srcoff; 1180 1181 /* Clear dest from destoff to the end of imap and map it in. */ 1182 error = xfs_reflink_remap_extent(dest, &imap, destoff, 1183 new_isize); 1184 if (error) 1185 goto err; 1186 1187 if (fatal_signal_pending(current)) { 1188 error = -EINTR; 1189 goto err; 1190 } 1191 1192 /* Advance drange/srange */ 1193 srcoff += range_len; 1194 destoff += range_len; 1195 len -= range_len; 1196 } 1197 1198 return 0; 1199 1200 err: 1201 trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); 1202 return error; 1203 } 1204 1205 /* 1206 * Read a page's worth of file data into the page cache. Return the page 1207 * locked. 
1208 */ 1209 static struct page * 1210 xfs_get_page( 1211 struct inode *inode, 1212 xfs_off_t offset) 1213 { 1214 struct address_space *mapping; 1215 struct page *page; 1216 pgoff_t n; 1217 1218 n = offset >> PAGE_SHIFT; 1219 mapping = inode->i_mapping; 1220 page = read_mapping_page(mapping, n, NULL); 1221 if (IS_ERR(page)) 1222 return page; 1223 if (!PageUptodate(page)) { 1224 put_page(page); 1225 return ERR_PTR(-EIO); 1226 } 1227 lock_page(page); 1228 return page; 1229 } 1230 1231 /* 1232 * Compare extents of two files to see if they are the same. 1233 */ 1234 static int 1235 xfs_compare_extents( 1236 struct inode *src, 1237 xfs_off_t srcoff, 1238 struct inode *dest, 1239 xfs_off_t destoff, 1240 xfs_off_t len, 1241 bool *is_same) 1242 { 1243 xfs_off_t src_poff; 1244 xfs_off_t dest_poff; 1245 void *src_addr; 1246 void *dest_addr; 1247 struct page *src_page; 1248 struct page *dest_page; 1249 xfs_off_t cmp_len; 1250 bool same; 1251 int error; 1252 1253 error = -EINVAL; 1254 same = true; 1255 while (len) { 1256 src_poff = srcoff & (PAGE_SIZE - 1); 1257 dest_poff = destoff & (PAGE_SIZE - 1); 1258 cmp_len = min(PAGE_SIZE - src_poff, 1259 PAGE_SIZE - dest_poff); 1260 cmp_len = min(cmp_len, len); 1261 ASSERT(cmp_len > 0); 1262 1263 trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, 1264 XFS_I(dest), destoff); 1265 1266 src_page = xfs_get_page(src, srcoff); 1267 if (IS_ERR(src_page)) { 1268 error = PTR_ERR(src_page); 1269 goto out_error; 1270 } 1271 dest_page = xfs_get_page(dest, destoff); 1272 if (IS_ERR(dest_page)) { 1273 error = PTR_ERR(dest_page); 1274 unlock_page(src_page); 1275 put_page(src_page); 1276 goto out_error; 1277 } 1278 src_addr = kmap_atomic(src_page); 1279 dest_addr = kmap_atomic(dest_page); 1280 1281 flush_dcache_page(src_page); 1282 flush_dcache_page(dest_page); 1283 1284 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) 1285 same = false; 1286 1287 kunmap_atomic(dest_addr); 1288 kunmap_atomic(src_addr); 1289 
unlock_page(dest_page); 1290 unlock_page(src_page); 1291 put_page(dest_page); 1292 put_page(src_page); 1293 1294 if (!same) 1295 break; 1296 1297 srcoff += cmp_len; 1298 destoff += cmp_len; 1299 len -= cmp_len; 1300 } 1301 1302 *is_same = same; 1303 return 0; 1304 1305 out_error: 1306 trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); 1307 return error; 1308 } 1309 1310 /* 1311 * Link a range of blocks from one file to another. 1312 */ 1313 int 1314 xfs_reflink_remap_range( 1315 struct xfs_inode *src, 1316 xfs_off_t srcoff, 1317 struct xfs_inode *dest, 1318 xfs_off_t destoff, 1319 xfs_off_t len, 1320 unsigned int flags) 1321 { 1322 struct xfs_mount *mp = src->i_mount; 1323 xfs_fileoff_t sfsbno, dfsbno; 1324 xfs_filblks_t fsblen; 1325 int error; 1326 xfs_extlen_t cowextsize; 1327 bool is_same; 1328 1329 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1330 return -EOPNOTSUPP; 1331 1332 if (XFS_FORCED_SHUTDOWN(mp)) 1333 return -EIO; 1334 1335 /* Don't reflink realtime inodes */ 1336 if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) 1337 return -EINVAL; 1338 1339 if (flags & ~XFS_REFLINK_ALL) 1340 return -EINVAL; 1341 1342 trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); 1343 1344 /* Lock both files against IO */ 1345 if (src->i_ino == dest->i_ino) { 1346 xfs_ilock(src, XFS_IOLOCK_EXCL); 1347 xfs_ilock(src, XFS_MMAPLOCK_EXCL); 1348 } else { 1349 xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); 1350 xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); 1351 } 1352 1353 /* 1354 * Check that the extents are the same. 
1355 */ 1356 if (flags & XFS_REFLINK_DEDUPE) { 1357 is_same = false; 1358 error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), 1359 destoff, len, &is_same); 1360 if (error) 1361 goto out_error; 1362 if (!is_same) { 1363 error = -EBADE; 1364 goto out_error; 1365 } 1366 } 1367 1368 error = xfs_reflink_set_inode_flag(src, dest); 1369 if (error) 1370 goto out_error; 1371 1372 /* 1373 * Invalidate the page cache so that we can clear any CoW mappings 1374 * in the destination file. 1375 */ 1376 truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, 1377 PAGE_ALIGN(destoff + len) - 1); 1378 1379 dfsbno = XFS_B_TO_FSBT(mp, destoff); 1380 sfsbno = XFS_B_TO_FSBT(mp, srcoff); 1381 fsblen = XFS_B_TO_FSB(mp, len); 1382 error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, 1383 destoff + len); 1384 if (error) 1385 goto out_error; 1386 1387 /* 1388 * Carry the cowextsize hint from src to dest if we're sharing the 1389 * entire source file to the entire destination file, the source file 1390 * has a cowextsize hint, and the destination file does not. 1391 */ 1392 cowextsize = 0; 1393 if (srcoff == 0 && len == i_size_read(VFS_I(src)) && 1394 (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && 1395 destoff == 0 && len >= i_size_read(VFS_I(dest)) && 1396 !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) 1397 cowextsize = src->i_d.di_cowextsize; 1398 1399 error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); 1400 if (error) 1401 goto out_error; 1402 1403 out_error: 1404 xfs_iunlock(src, XFS_MMAPLOCK_EXCL); 1405 xfs_iunlock(src, XFS_IOLOCK_EXCL); 1406 if (src->i_ino != dest->i_ino) { 1407 xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); 1408 xfs_iunlock(dest, XFS_IOLOCK_EXCL); 1409 } 1410 if (error) 1411 trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); 1412 return error; 1413 } 1414 1415 /* 1416 * The user wants to preemptively CoW all shared blocks in this file, 1417 * which enables us to turn off the reflink flag. 
Iterate all 1418 * extents which are not prealloc/delalloc to see which ranges are 1419 * mentioned in the refcount tree, then read those blocks into the 1420 * pagecache, dirty them, fsync them back out, and then we can update 1421 * the inode flag. What happens if we run out of memory? :) 1422 */ 1423 STATIC int 1424 xfs_reflink_dirty_extents( 1425 struct xfs_inode *ip, 1426 xfs_fileoff_t fbno, 1427 xfs_filblks_t end, 1428 xfs_off_t isize) 1429 { 1430 struct xfs_mount *mp = ip->i_mount; 1431 xfs_agnumber_t agno; 1432 xfs_agblock_t agbno; 1433 xfs_extlen_t aglen; 1434 xfs_agblock_t rbno; 1435 xfs_extlen_t rlen; 1436 xfs_off_t fpos; 1437 xfs_off_t flen; 1438 struct xfs_bmbt_irec map[2]; 1439 int nmaps; 1440 int error = 0; 1441 1442 while (end - fbno > 0) { 1443 nmaps = 1; 1444 /* 1445 * Look for extents in the file. Skip holes, delalloc, or 1446 * unwritten extents; they can't be reflinked. 1447 */ 1448 error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); 1449 if (error) 1450 goto out; 1451 if (nmaps == 0) 1452 break; 1453 if (map[0].br_startblock == HOLESTARTBLOCK || 1454 map[0].br_startblock == DELAYSTARTBLOCK || 1455 ISUNWRITTEN(&map[0])) 1456 goto next; 1457 1458 map[1] = map[0]; 1459 while (map[1].br_blockcount) { 1460 agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); 1461 agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); 1462 aglen = map[1].br_blockcount; 1463 1464 error = xfs_reflink_find_shared(mp, agno, agbno, aglen, 1465 &rbno, &rlen, true); 1466 if (error) 1467 goto out; 1468 if (rbno == NULLAGBLOCK) 1469 break; 1470 1471 /* Dirty the pages */ 1472 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1473 fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + 1474 (rbno - agbno)); 1475 flen = XFS_FSB_TO_B(mp, rlen); 1476 if (fpos + flen > isize) 1477 flen = isize - fpos; 1478 error = iomap_file_dirty(VFS_I(ip), fpos, flen, 1479 &xfs_iomap_ops); 1480 xfs_ilock(ip, XFS_ILOCK_EXCL); 1481 if (error) 1482 goto out; 1483 1484 map[1].br_blockcount -= (rbno - agbno + rlen); 1485 
map[1].br_startoff += (rbno - agbno + rlen); 1486 map[1].br_startblock += (rbno - agbno + rlen); 1487 } 1488 1489 next: 1490 fbno = map[0].br_startoff + map[0].br_blockcount; 1491 } 1492 out: 1493 return error; 1494 } 1495 1496 /* Clear the inode reflink flag if there are no shared extents. */ 1497 int 1498 xfs_reflink_clear_inode_flag( 1499 struct xfs_inode *ip, 1500 struct xfs_trans **tpp) 1501 { 1502 struct xfs_mount *mp = ip->i_mount; 1503 xfs_fileoff_t fbno; 1504 xfs_filblks_t end; 1505 xfs_agnumber_t agno; 1506 xfs_agblock_t agbno; 1507 xfs_extlen_t aglen; 1508 xfs_agblock_t rbno; 1509 xfs_extlen_t rlen; 1510 struct xfs_bmbt_irec map; 1511 int nmaps; 1512 int error = 0; 1513 1514 ASSERT(xfs_is_reflink_inode(ip)); 1515 1516 fbno = 0; 1517 end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); 1518 while (end - fbno > 0) { 1519 nmaps = 1; 1520 /* 1521 * Look for extents in the file. Skip holes, delalloc, or 1522 * unwritten extents; they can't be reflinked. 1523 */ 1524 error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); 1525 if (error) 1526 return error; 1527 if (nmaps == 0) 1528 break; 1529 if (map.br_startblock == HOLESTARTBLOCK || 1530 map.br_startblock == DELAYSTARTBLOCK || 1531 ISUNWRITTEN(&map)) 1532 goto next; 1533 1534 agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); 1535 agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); 1536 aglen = map.br_blockcount; 1537 1538 error = xfs_reflink_find_shared(mp, agno, agbno, aglen, 1539 &rbno, &rlen, false); 1540 if (error) 1541 return error; 1542 /* Is there still a shared block here? */ 1543 if (rbno != NULLAGBLOCK) 1544 return 0; 1545 next: 1546 fbno = map.br_startoff + map.br_blockcount; 1547 } 1548 1549 /* 1550 * We didn't find any shared blocks so turn off the reflink flag. 1551 * First, get rid of any leftover CoW mappings. 1552 */ 1553 error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); 1554 if (error) 1555 return error; 1556 1557 /* Clear the inode flag. 
*/ 1558 trace_xfs_reflink_unset_inode_flag(ip); 1559 ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; 1560 xfs_inode_clear_cowblocks_tag(ip); 1561 xfs_trans_ijoin(*tpp, ip, 0); 1562 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); 1563 1564 return error; 1565 } 1566 1567 /* 1568 * Clear the inode reflink flag if there are no shared extents and the size 1569 * hasn't changed. 1570 */ 1571 STATIC int 1572 xfs_reflink_try_clear_inode_flag( 1573 struct xfs_inode *ip) 1574 { 1575 struct xfs_mount *mp = ip->i_mount; 1576 struct xfs_trans *tp; 1577 int error = 0; 1578 1579 /* Start a rolling transaction to remove the mappings */ 1580 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); 1581 if (error) 1582 return error; 1583 1584 xfs_ilock(ip, XFS_ILOCK_EXCL); 1585 xfs_trans_ijoin(tp, ip, 0); 1586 1587 error = xfs_reflink_clear_inode_flag(ip, &tp); 1588 if (error) 1589 goto cancel; 1590 1591 error = xfs_trans_commit(tp); 1592 if (error) 1593 goto out; 1594 1595 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1596 return 0; 1597 cancel: 1598 xfs_trans_cancel(tp); 1599 out: 1600 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1601 return error; 1602 } 1603 1604 /* 1605 * Pre-COW all shared blocks within a given byte range of a file and turn off 1606 * the reflink flag if we unshare all of the file's blocks. 
1607 */ 1608 int 1609 xfs_reflink_unshare( 1610 struct xfs_inode *ip, 1611 xfs_off_t offset, 1612 xfs_off_t len) 1613 { 1614 struct xfs_mount *mp = ip->i_mount; 1615 xfs_fileoff_t fbno; 1616 xfs_filblks_t end; 1617 xfs_off_t isize; 1618 int error; 1619 1620 if (!xfs_is_reflink_inode(ip)) 1621 return 0; 1622 1623 trace_xfs_reflink_unshare(ip, offset, len); 1624 1625 inode_dio_wait(VFS_I(ip)); 1626 1627 /* Try to CoW the selected ranges */ 1628 xfs_ilock(ip, XFS_ILOCK_EXCL); 1629 fbno = XFS_B_TO_FSBT(mp, offset); 1630 isize = i_size_read(VFS_I(ip)); 1631 end = XFS_B_TO_FSB(mp, offset + len); 1632 error = xfs_reflink_dirty_extents(ip, fbno, end, isize); 1633 if (error) 1634 goto out_unlock; 1635 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1636 1637 /* Wait for the IO to finish */ 1638 error = filemap_write_and_wait(VFS_I(ip)->i_mapping); 1639 if (error) 1640 goto out; 1641 1642 /* Turn off the reflink flag if possible. */ 1643 error = xfs_reflink_try_clear_inode_flag(ip); 1644 if (error) 1645 goto out; 1646 1647 return 0; 1648 1649 out_unlock: 1650 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1651 out: 1652 trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); 1653 return error; 1654 } 1655 1656 /* 1657 * Does this inode have any real CoW reservations? 1658 */ 1659 bool 1660 xfs_reflink_has_real_cow_blocks( 1661 struct xfs_inode *ip) 1662 { 1663 struct xfs_bmbt_irec irec; 1664 struct xfs_ifork *ifp; 1665 struct xfs_bmbt_rec_host *gotp; 1666 xfs_extnum_t idx; 1667 1668 if (!xfs_is_reflink_inode(ip)) 1669 return false; 1670 1671 /* Go find the old extent in the CoW fork. */ 1672 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 1673 gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); 1674 while (gotp) { 1675 xfs_bmbt_get_all(gotp, &irec); 1676 1677 if (!isnullstartblock(irec.br_startblock)) 1678 return true; 1679 1680 /* Roll on... */ 1681 idx++; 1682 if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) 1683 break; 1684 gotp = xfs_iext_get_ext(ifp, idx); 1685 } 1686 1687 return false; 1688 } 1689