1 /* 2 * Copyright (C) 2016 Oracle. All Rights Reserved. 3 * 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 2 9 * of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it would be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 #include "xfs.h" 21 #include "xfs_fs.h" 22 #include "xfs_shared.h" 23 #include "xfs_format.h" 24 #include "xfs_log_format.h" 25 #include "xfs_trans_resv.h" 26 #include "xfs_mount.h" 27 #include "xfs_defer.h" 28 #include "xfs_da_format.h" 29 #include "xfs_da_btree.h" 30 #include "xfs_inode.h" 31 #include "xfs_trans.h" 32 #include "xfs_inode_item.h" 33 #include "xfs_bmap.h" 34 #include "xfs_bmap_util.h" 35 #include "xfs_error.h" 36 #include "xfs_dir2.h" 37 #include "xfs_dir2_priv.h" 38 #include "xfs_ioctl.h" 39 #include "xfs_trace.h" 40 #include "xfs_log.h" 41 #include "xfs_icache.h" 42 #include "xfs_pnfs.h" 43 #include "xfs_btree.h" 44 #include "xfs_refcount_btree.h" 45 #include "xfs_refcount.h" 46 #include "xfs_bmap_btree.h" 47 #include "xfs_trans_space.h" 48 #include "xfs_bit.h" 49 #include "xfs_alloc.h" 50 #include "xfs_quota_defs.h" 51 #include "xfs_quota.h" 52 #include "xfs_btree.h" 53 #include "xfs_bmap_btree.h" 54 #include "xfs_reflink.h" 55 #include "xfs_iomap.h" 56 #include "xfs_rmap_btree.h" 57 #include "xfs_sb.h" 58 #include "xfs_ag_resv.h" 59 60 /* 61 * Copy on Write of Shared Blocks 62 * 63 * XFS must 
preserve "the usual" file semantics even when two files share 64 * the same physical blocks. This means that a write to one file must not 65 * alter the blocks in a different file; the way that we'll do that is 66 * through the use of a copy-on-write mechanism. At a high level, that 67 * means that when we want to write to a shared block, we allocate a new 68 * block, write the data to the new block, and if that succeeds we map the 69 * new block into the file. 70 * 71 * XFS provides a "delayed allocation" mechanism that defers the allocation 72 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as 73 * possible. This reduces fragmentation by enabling the filesystem to ask 74 * for bigger chunks less often, which is exactly what we want for CoW. 75 * 76 * The delalloc mechanism begins when the kernel wants to make a block 77 * writable (write_begin or page_mkwrite). If the offset is not mapped, we 78 * create a delalloc mapping, which is a regular in-core extent, but without 79 * a real startblock. (For delalloc mappings, the startblock encodes both 80 * a flag that this is a delalloc mapping, and a worst-case estimate of how 81 * many blocks might be required to put the mapping into the BMBT.) delalloc 82 * mappings are a reservation against the free space in the filesystem; 83 * adjacent mappings can also be combined into fewer larger mappings. 84 * 85 * When dirty pages are being written out (typically in writepage), the 86 * delalloc reservations are converted into real mappings by allocating 87 * blocks and replacing the delalloc mapping with real ones. A delalloc 88 * mapping can be replaced by several real ones if the free space is 89 * fragmented. 90 * 91 * We want to adapt the delalloc mechanism for copy-on-write, since the 92 * write paths are similar. 
The first two steps (creating the reservation 93 * and allocating the blocks) are exactly the same as delalloc except that 94 * the mappings must be stored in a separate CoW fork because we do not want 95 * to disturb the mapping in the data fork until we're sure that the write 96 * succeeded. IO completion in this case is the process of removing the old 97 * mapping from the data fork and moving the new mapping from the CoW fork to 98 * the data fork. This will be discussed shortly. 99 * 100 * For now, unaligned directio writes will be bounced back to the page cache. 101 * Block-aligned directio writes will use the same mechanism as buffered 102 * writes. 103 * 104 * CoW remapping must be done after the data block write completes, 105 * because we don't want to destroy the old data fork map until we're sure 106 * the new block has been written. Since the new mappings are kept in a 107 * separate fork, we can simply iterate these mappings to find the ones 108 * that cover the file blocks that we just CoW'd. For each extent, simply 109 * unmap the corresponding range in the data fork, map the new range into 110 * the data fork, and remove the extent from the CoW fork. 111 * 112 * Since the remapping operation can be applied to an arbitrary file 113 * range, we record the need for the remap step as a flag in the ioend 114 * instead of declaring a new IO type. This is required for direct io 115 * because we only have ioend for the whole dio, and we have to be able to 116 * remember the presence of unwritten blocks and CoW blocks with a single 117 * ioend structure. Better yet, the more ground we can cover with one 118 * ioend, the better. 119 */ 120 121 /* 122 * Given an AG extent, find the lowest-numbered run of shared blocks 123 * within that range and return the range in fbno/flen. If 124 * find_end_of_shared is true, return the longest contiguous extent of 125 * shared blocks. 
If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_btree_cur	*rc_cur;
	struct xfs_buf		*agf_bp;
	int			error;

	/* The refcount btree cursor needs the AGF buffer of this AG. */
	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
	if (error)
		return error;

	/* Query the refcount btree without a transaction. */
	rc_cur = xfs_refcountbt_init_cursor(mp, NULL, agf_bp, agno, NULL);
	error = xfs_refcount_find_shared(rc_cur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	/* Tear the cursor down according to the outcome of the lookup. */
	if (error)
		xfs_btree_del_cursor(rc_cur, XFS_BTREE_ERROR);
	else
		xfs_btree_del_cursor(rc_cur, XFS_BTREE_NOERROR);

	xfs_buf_relse(agf_bp);
	return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
166 */ 167 int 168 xfs_reflink_trim_around_shared( 169 struct xfs_inode *ip, 170 struct xfs_bmbt_irec *irec, 171 bool *shared, 172 bool *trimmed) 173 { 174 xfs_agnumber_t agno; 175 xfs_agblock_t agbno; 176 xfs_extlen_t aglen; 177 xfs_agblock_t fbno; 178 xfs_extlen_t flen; 179 int error = 0; 180 181 /* Holes, unwritten, and delalloc extents cannot be shared */ 182 if (!xfs_is_reflink_inode(ip) || 183 ISUNWRITTEN(irec) || 184 irec->br_startblock == HOLESTARTBLOCK || 185 irec->br_startblock == DELAYSTARTBLOCK || 186 isnullstartblock(irec->br_startblock)) { 187 *shared = false; 188 return 0; 189 } 190 191 trace_xfs_reflink_trim_around_shared(ip, irec); 192 193 agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); 194 agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); 195 aglen = irec->br_blockcount; 196 197 error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, 198 aglen, &fbno, &flen, true); 199 if (error) 200 return error; 201 202 *shared = *trimmed = false; 203 if (fbno == NULLAGBLOCK) { 204 /* No shared blocks at all. */ 205 return 0; 206 } else if (fbno == agbno) { 207 /* 208 * The start of this extent is shared. Truncate the 209 * mapping at the end of the shared region so that a 210 * subsequent iteration starts at the start of the 211 * unshared region. 212 */ 213 irec->br_blockcount = flen; 214 *shared = true; 215 if (flen != aglen) 216 *trimmed = true; 217 return 0; 218 } else { 219 /* 220 * There's a shared extent midway through this extent. 221 * Truncate the mapping at the start of the shared 222 * extent so that a subsequent iteration starts at the 223 * start of the shared region. 224 */ 225 irec->br_blockcount = fbno - agbno; 226 *trimmed = true; 227 return 0; 228 } 229 } 230 231 /* 232 * Trim the passed in imap to the next shared/unshared extent boundary, and 233 * if imap->br_startoff points to a shared extent reserve space for it in the 234 * COW fork. In this case *shared is set to true, else to false. 
235 * 236 * Note that imap will always contain the block numbers for the existing blocks 237 * in the data fork, as the upper layers need them for read-modify-write 238 * operations. 239 */ 240 int 241 xfs_reflink_reserve_cow( 242 struct xfs_inode *ip, 243 struct xfs_bmbt_irec *imap, 244 bool *shared) 245 { 246 struct xfs_bmbt_irec got, prev; 247 xfs_fileoff_t end_fsb, orig_end_fsb; 248 int eof = 0, error = 0; 249 bool trimmed; 250 xfs_extnum_t idx; 251 xfs_extlen_t align; 252 253 /* 254 * Search the COW fork extent list first. This serves two purposes: 255 * first this implement the speculative preallocation using cowextisze, 256 * so that we also unshared block adjacent to shared blocks instead 257 * of just the shared blocks themselves. Second the lookup in the 258 * extent list is generally faster than going out to the shared extent 259 * tree. 260 */ 261 xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, 262 &got, &prev); 263 if (!eof && got.br_startoff <= imap->br_startoff) { 264 trace_xfs_reflink_cow_found(ip, imap); 265 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); 266 267 *shared = true; 268 return 0; 269 } 270 271 /* Trim the mapping to the nearest shared extent boundary. */ 272 error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); 273 if (error) 274 return error; 275 276 /* Not shared? Just report the (potentially capped) extent. */ 277 if (!*shared) 278 return 0; 279 280 /* 281 * Fork all the shared blocks from our write offset until the end of 282 * the extent. 
283 */ 284 error = xfs_qm_dqattach_locked(ip, 0); 285 if (error) 286 return error; 287 288 end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; 289 290 align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); 291 if (align) 292 end_fsb = roundup_64(end_fsb, align); 293 294 retry: 295 error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, 296 end_fsb - imap->br_startoff, &got, &prev, &idx, eof); 297 switch (error) { 298 case 0: 299 break; 300 case -ENOSPC: 301 case -EDQUOT: 302 /* retry without any preallocation */ 303 trace_xfs_reflink_cow_enospc(ip, imap); 304 if (end_fsb != orig_end_fsb) { 305 end_fsb = orig_end_fsb; 306 goto retry; 307 } 308 /*FALLTHRU*/ 309 default: 310 return error; 311 } 312 313 if (end_fsb != orig_end_fsb) 314 xfs_inode_set_cowblocks_tag(ip); 315 316 trace_xfs_reflink_cow_alloc(ip, &got); 317 return 0; 318 } 319 320 /* Allocate all CoW reservations covering a range of blocks in a file. */ 321 static int 322 __xfs_reflink_allocate_cow( 323 struct xfs_inode *ip, 324 xfs_fileoff_t *offset_fsb, 325 xfs_fileoff_t end_fsb) 326 { 327 struct xfs_mount *mp = ip->i_mount; 328 struct xfs_bmbt_irec imap; 329 struct xfs_defer_ops dfops; 330 struct xfs_trans *tp; 331 xfs_fsblock_t first_block; 332 int nimaps = 1, error; 333 bool shared; 334 335 xfs_defer_init(&dfops, &first_block); 336 337 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 338 XFS_TRANS_RESERVE, &tp); 339 if (error) 340 return error; 341 342 xfs_ilock(ip, XFS_ILOCK_EXCL); 343 344 /* Read extent from the source file. 
*/ 345 nimaps = 1; 346 error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, 347 &imap, &nimaps, 0); 348 if (error) 349 goto out_unlock; 350 ASSERT(nimaps == 1); 351 352 error = xfs_reflink_reserve_cow(ip, &imap, &shared); 353 if (error) 354 goto out_trans_cancel; 355 356 if (!shared) { 357 *offset_fsb = imap.br_startoff + imap.br_blockcount; 358 goto out_trans_cancel; 359 } 360 361 xfs_trans_ijoin(tp, ip, 0); 362 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, 363 XFS_BMAPI_COWFORK, &first_block, 364 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 365 &imap, &nimaps, &dfops); 366 if (error) 367 goto out_trans_cancel; 368 369 error = xfs_defer_finish(&tp, &dfops, NULL); 370 if (error) 371 goto out_trans_cancel; 372 373 error = xfs_trans_commit(tp); 374 375 *offset_fsb = imap.br_startoff + imap.br_blockcount; 376 out_unlock: 377 xfs_iunlock(ip, XFS_ILOCK_EXCL); 378 return error; 379 out_trans_cancel: 380 xfs_defer_cancel(&dfops); 381 xfs_trans_cancel(tp); 382 goto out_unlock; 383 } 384 385 /* Allocate all CoW reservations covering a part of a file. */ 386 int 387 xfs_reflink_allocate_cow_range( 388 struct xfs_inode *ip, 389 xfs_off_t offset, 390 xfs_off_t count) 391 { 392 struct xfs_mount *mp = ip->i_mount; 393 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 394 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 395 int error; 396 397 ASSERT(xfs_is_reflink_inode(ip)); 398 399 trace_xfs_reflink_allocate_cow_range(ip, offset, count); 400 401 /* 402 * Make sure that the dquots are there. 
403 */ 404 error = xfs_qm_dqattach(ip, 0); 405 if (error) 406 return error; 407 408 while (offset_fsb < end_fsb) { 409 error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); 410 if (error) { 411 trace_xfs_reflink_allocate_cow_range_error(ip, error, 412 _RET_IP_); 413 break; 414 } 415 } 416 417 return error; 418 } 419 420 /* 421 * Find the CoW reservation (and whether or not it needs block allocation) 422 * for a given byte offset of a file. 423 */ 424 bool 425 xfs_reflink_find_cow_mapping( 426 struct xfs_inode *ip, 427 xfs_off_t offset, 428 struct xfs_bmbt_irec *imap, 429 bool *need_alloc) 430 { 431 struct xfs_bmbt_irec irec; 432 struct xfs_ifork *ifp; 433 struct xfs_bmbt_rec_host *gotp; 434 xfs_fileoff_t bno; 435 xfs_extnum_t idx; 436 437 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); 438 ASSERT(xfs_is_reflink_inode(ip)); 439 440 /* Find the extent in the CoW fork. */ 441 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 442 bno = XFS_B_TO_FSBT(ip->i_mount, offset); 443 gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); 444 if (!gotp) 445 return false; 446 447 xfs_bmbt_get_all(gotp, &irec); 448 if (bno >= irec.br_startoff + irec.br_blockcount || 449 bno < irec.br_startoff) 450 return false; 451 452 trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, 453 &irec); 454 455 /* If it's still delalloc, we must allocate later. */ 456 *imap = irec; 457 *need_alloc = !!(isnullstartblock(irec.br_startblock)); 458 459 return true; 460 } 461 462 /* 463 * Trim an extent to end at the next CoW reservation past offset_fsb. 464 */ 465 int 466 xfs_reflink_trim_irec_to_next_cow( 467 struct xfs_inode *ip, 468 xfs_fileoff_t offset_fsb, 469 struct xfs_bmbt_irec *imap) 470 { 471 struct xfs_bmbt_irec irec; 472 struct xfs_ifork *ifp; 473 struct xfs_bmbt_rec_host *gotp; 474 xfs_extnum_t idx; 475 476 if (!xfs_is_reflink_inode(ip)) 477 return 0; 478 479 /* Find the extent in the CoW fork. 
*/ 480 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 481 gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); 482 if (!gotp) 483 return 0; 484 xfs_bmbt_get_all(gotp, &irec); 485 486 /* This is the extent before; try sliding up one. */ 487 if (irec.br_startoff < offset_fsb) { 488 idx++; 489 if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) 490 return 0; 491 gotp = xfs_iext_get_ext(ifp, idx); 492 xfs_bmbt_get_all(gotp, &irec); 493 } 494 495 if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) 496 return 0; 497 498 imap->br_blockcount = irec.br_startoff - imap->br_startoff; 499 trace_xfs_reflink_trim_irec(ip, imap); 500 501 return 0; 502 } 503 504 /* 505 * Cancel all pending CoW reservations for some block range of an inode. 506 */ 507 int 508 xfs_reflink_cancel_cow_blocks( 509 struct xfs_inode *ip, 510 struct xfs_trans **tpp, 511 xfs_fileoff_t offset_fsb, 512 xfs_fileoff_t end_fsb) 513 { 514 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 515 struct xfs_bmbt_irec got, prev, del; 516 xfs_extnum_t idx; 517 xfs_fsblock_t firstfsb; 518 struct xfs_defer_ops dfops; 519 int error = 0, eof = 0; 520 521 if (!xfs_is_reflink_inode(ip)) 522 return 0; 523 524 xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, 525 &got, &prev); 526 if (eof) 527 return 0; 528 529 while (got.br_startoff < end_fsb) { 530 del = got; 531 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); 532 trace_xfs_reflink_cancel_cow(ip, &del); 533 534 if (isnullstartblock(del.br_startblock)) { 535 error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, 536 &idx, &got, &del); 537 if (error) 538 break; 539 } else { 540 xfs_trans_ijoin(*tpp, ip, 0); 541 xfs_defer_init(&dfops, &firstfsb); 542 543 /* Free the CoW orphan record. 
*/ 544 error = xfs_refcount_free_cow_extent(ip->i_mount, 545 &dfops, del.br_startblock, 546 del.br_blockcount); 547 if (error) 548 break; 549 550 xfs_bmap_add_free(ip->i_mount, &dfops, 551 del.br_startblock, del.br_blockcount, 552 NULL); 553 554 /* Update quota accounting */ 555 xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, 556 -(long)del.br_blockcount); 557 558 /* Roll the transaction */ 559 error = xfs_defer_finish(tpp, &dfops, ip); 560 if (error) { 561 xfs_defer_cancel(&dfops); 562 break; 563 } 564 565 /* Remove the mapping from the CoW fork. */ 566 xfs_bmap_del_extent_cow(ip, &idx, &got, &del); 567 } 568 569 if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) 570 break; 571 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); 572 } 573 574 /* clear tag if cow fork is emptied */ 575 if (!ifp->if_bytes) 576 xfs_inode_clear_cowblocks_tag(ip); 577 578 return error; 579 } 580 581 /* 582 * Cancel all pending CoW reservations for some byte range of an inode. 583 */ 584 int 585 xfs_reflink_cancel_cow_range( 586 struct xfs_inode *ip, 587 xfs_off_t offset, 588 xfs_off_t count) 589 { 590 struct xfs_trans *tp; 591 xfs_fileoff_t offset_fsb; 592 xfs_fileoff_t end_fsb; 593 int error; 594 595 trace_xfs_reflink_cancel_cow_range(ip, offset, count); 596 ASSERT(xfs_is_reflink_inode(ip)); 597 598 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 599 if (count == NULLFILEOFF) 600 end_fsb = NULLFILEOFF; 601 else 602 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 603 604 /* Start a rolling transaction to remove the mappings */ 605 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 606 0, 0, 0, &tp); 607 if (error) 608 goto out; 609 610 xfs_ilock(ip, XFS_ILOCK_EXCL); 611 xfs_trans_ijoin(tp, ip, 0); 612 613 /* Scrape out the old CoW reservations */ 614 error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); 615 if (error) 616 goto out_cancel; 617 618 error = xfs_trans_commit(tp); 619 620 xfs_iunlock(ip, XFS_ILOCK_EXCL); 621 return 
error; 622 623 out_cancel: 624 xfs_trans_cancel(tp); 625 xfs_iunlock(ip, XFS_ILOCK_EXCL); 626 out: 627 trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); 628 return error; 629 } 630 631 /* 632 * Remap parts of a file's data fork after a successful CoW. 633 */ 634 int 635 xfs_reflink_end_cow( 636 struct xfs_inode *ip, 637 xfs_off_t offset, 638 xfs_off_t count) 639 { 640 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 641 struct xfs_bmbt_irec got, prev, del; 642 struct xfs_trans *tp; 643 xfs_fileoff_t offset_fsb; 644 xfs_fileoff_t end_fsb; 645 xfs_fsblock_t firstfsb; 646 struct xfs_defer_ops dfops; 647 int error, eof = 0; 648 unsigned int resblks; 649 xfs_filblks_t rlen; 650 xfs_extnum_t idx; 651 652 trace_xfs_reflink_end_cow(ip, offset, count); 653 654 /* No COW extents? That's easy! */ 655 if (ifp->if_bytes == 0) 656 return 0; 657 658 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 659 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 660 661 /* Start a rolling transaction to switch the mappings */ 662 resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); 663 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 664 resblks, 0, 0, &tp); 665 if (error) 666 goto out; 667 668 xfs_ilock(ip, XFS_ILOCK_EXCL); 669 xfs_trans_ijoin(tp, ip, 0); 670 671 xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, 672 &got, &prev); 673 674 /* If there is a hole at end_fsb - 1 go to the previous extent */ 675 if (eof || got.br_startoff > end_fsb) { 676 ASSERT(idx > 0); 677 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); 678 } 679 680 /* Walk backwards until we're out of the I/O range... 
*/ 681 while (got.br_startoff + got.br_blockcount > offset_fsb) { 682 del = got; 683 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); 684 685 /* Extent delete may have bumped idx forward */ 686 if (!del.br_blockcount) { 687 idx--; 688 goto next_extent; 689 } 690 691 ASSERT(!isnullstartblock(got.br_startblock)); 692 693 /* Unmap the old blocks in the data fork. */ 694 xfs_defer_init(&dfops, &firstfsb); 695 rlen = del.br_blockcount; 696 error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1, 697 &firstfsb, &dfops); 698 if (error) 699 goto out_defer; 700 701 /* Trim the extent to whatever got unmapped. */ 702 if (rlen) { 703 xfs_trim_extent(&del, del.br_startoff + rlen, 704 del.br_blockcount - rlen); 705 } 706 trace_xfs_reflink_cow_remap(ip, &del); 707 708 /* Free the CoW orphan record. */ 709 error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops, 710 del.br_startblock, del.br_blockcount); 711 if (error) 712 goto out_defer; 713 714 /* Map the new blocks into the data fork. */ 715 error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del); 716 if (error) 717 goto out_defer; 718 719 /* Remove the mapping from the CoW fork. */ 720 xfs_bmap_del_extent_cow(ip, &idx, &got, &del); 721 722 error = xfs_defer_finish(&tp, &dfops, ip); 723 if (error) 724 goto out_defer; 725 726 next_extent: 727 if (idx < 0) 728 break; 729 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); 730 } 731 732 error = xfs_trans_commit(tp); 733 xfs_iunlock(ip, XFS_ILOCK_EXCL); 734 if (error) 735 goto out; 736 return 0; 737 738 out_defer: 739 xfs_defer_cancel(&dfops); 740 xfs_trans_cancel(tp); 741 xfs_iunlock(ip, XFS_ILOCK_EXCL); 742 out: 743 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); 744 return error; 745 } 746 747 /* 748 * Free leftover CoW reservations that didn't get cleaned out. 
749 */ 750 int 751 xfs_reflink_recover_cow( 752 struct xfs_mount *mp) 753 { 754 xfs_agnumber_t agno; 755 int error = 0; 756 757 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 758 return 0; 759 760 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 761 error = xfs_refcount_recover_cow_leftovers(mp, agno); 762 if (error) 763 break; 764 } 765 766 return error; 767 } 768 769 /* 770 * Reflinking (Block) Ranges of Two Files Together 771 * 772 * First, ensure that the reflink flag is set on both inodes. The flag is an 773 * optimization to avoid unnecessary refcount btree lookups in the write path. 774 * 775 * Now we can iteratively remap the range of extents (and holes) in src to the 776 * corresponding ranges in dest. Let drange and srange denote the ranges of 777 * logical blocks in dest and src touched by the reflink operation. 778 * 779 * While the length of drange is greater than zero, 780 * - Read src's bmbt at the start of srange ("imap") 781 * - If imap doesn't exist, make imap appear to start at the end of srange 782 * with zero length. 783 * - If imap starts before srange, advance imap to start at srange. 784 * - If imap goes beyond srange, truncate imap to end at the end of srange. 785 * - Punch (imap start - srange start + imap len) blocks from dest at 786 * offset (drange start). 787 * - If imap points to a real range of pblks, 788 * > Increase the refcount of the imap's pblks 789 * > Map imap's pblks into dest at the offset 790 * (drange start + imap start - srange start) 791 * - Advance drange and srange by (imap start - srange start + imap len) 792 * 793 * Finally, if the reflink made dest longer, update both the in-core and 794 * on-disk file sizes. 
795 * 796 * ASCII Art Demonstration: 797 * 798 * Let's say we want to reflink this source file: 799 * 800 * ----SSSSSSS-SSSSS----SSSSSS (src file) 801 * <--------------------> 802 * 803 * into this destination file: 804 * 805 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) 806 * <--------------------> 807 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. 808 * Observe that the range has different logical offsets in either file. 809 * 810 * Consider that the first extent in the source file doesn't line up with our 811 * reflink range. Unmapping and remapping are separate operations, so we can 812 * unmap more blocks from the destination file than we remap. 813 * 814 * ----SSSSSSS-SSSSS----SSSSSS 815 * <-------> 816 * --DDDDD---------DDDDD--DDD 817 * <-------> 818 * 819 * Now remap the source extent into the destination file: 820 * 821 * ----SSSSSSS-SSSSS----SSSSSS 822 * <-------> 823 * --DDDDD--SSSSSSSDDDDD--DDD 824 * <-------> 825 * 826 * Do likewise with the second hole and extent in our range. Holes in the 827 * unmap range don't affect our operation. 828 * 829 * ----SSSSSSS-SSSSS----SSSSSS 830 * <----> 831 * --DDDDD--SSSSSSS-SSSSS-DDD 832 * <----> 833 * 834 * Finally, unmap and remap part of the third extent. This will increase the 835 * size of the destination file. 836 * 837 * ----SSSSSSS-SSSSS----SSSSSS 838 * <-----> 839 * --DDDDD--SSSSSSS-SSSSS----SSS 840 * <-----> 841 * 842 * Once we update the destination file's i_size, we're done. 843 */ 844 845 /* 846 * Ensure the reflink bit is set in both inodes. 
847 */ 848 STATIC int 849 xfs_reflink_set_inode_flag( 850 struct xfs_inode *src, 851 struct xfs_inode *dest) 852 { 853 struct xfs_mount *mp = src->i_mount; 854 int error; 855 struct xfs_trans *tp; 856 857 if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) 858 return 0; 859 860 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 861 if (error) 862 goto out_error; 863 864 /* Lock both files against IO */ 865 if (src->i_ino == dest->i_ino) 866 xfs_ilock(src, XFS_ILOCK_EXCL); 867 else 868 xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); 869 870 if (!xfs_is_reflink_inode(src)) { 871 trace_xfs_reflink_set_inode_flag(src); 872 xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); 873 src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 874 xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); 875 xfs_ifork_init_cow(src); 876 } else 877 xfs_iunlock(src, XFS_ILOCK_EXCL); 878 879 if (src->i_ino == dest->i_ino) 880 goto commit_flags; 881 882 if (!xfs_is_reflink_inode(dest)) { 883 trace_xfs_reflink_set_inode_flag(dest); 884 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 885 dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; 886 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 887 xfs_ifork_init_cow(dest); 888 } else 889 xfs_iunlock(dest, XFS_ILOCK_EXCL); 890 891 commit_flags: 892 error = xfs_trans_commit(tp); 893 if (error) 894 goto out_error; 895 return error; 896 897 out_error: 898 trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); 899 return error; 900 } 901 902 /* 903 * Update destination inode size & cowextsize hint, if necessary. 
904 */ 905 STATIC int 906 xfs_reflink_update_dest( 907 struct xfs_inode *dest, 908 xfs_off_t newlen, 909 xfs_extlen_t cowextsize) 910 { 911 struct xfs_mount *mp = dest->i_mount; 912 struct xfs_trans *tp; 913 int error; 914 915 if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) 916 return 0; 917 918 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 919 if (error) 920 goto out_error; 921 922 xfs_ilock(dest, XFS_ILOCK_EXCL); 923 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 924 925 if (newlen > i_size_read(VFS_I(dest))) { 926 trace_xfs_reflink_update_inode_size(dest, newlen); 927 i_size_write(VFS_I(dest), newlen); 928 dest->i_d.di_size = newlen; 929 } 930 931 if (cowextsize) { 932 dest->i_d.di_cowextsize = cowextsize; 933 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; 934 } 935 936 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 937 938 error = xfs_trans_commit(tp); 939 if (error) 940 goto out_error; 941 return error; 942 943 out_error: 944 trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); 945 return error; 946 } 947 948 /* 949 * Do we have enough reserve in this AG to handle a reflink? The refcount 950 * btree already reserved all the space it needs, but the rmap btree can grow 951 * infinitely, so we won't allow more reflinks when the AG is down to the 952 * btree reserves. 953 */ 954 static int 955 xfs_reflink_ag_has_free_space( 956 struct xfs_mount *mp, 957 xfs_agnumber_t agno) 958 { 959 struct xfs_perag *pag; 960 int error = 0; 961 962 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 963 return 0; 964 965 pag = xfs_perag_get(mp, agno); 966 if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || 967 xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) 968 error = -ENOSPC; 969 xfs_perag_put(pag); 970 return error; 971 } 972 973 /* 974 * Unmap a range of blocks from a file, then map other blocks into the hole. 975 * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). 
976 * The extent irec is mapped into dest at irec->br_startoff. 977 */ 978 STATIC int 979 xfs_reflink_remap_extent( 980 struct xfs_inode *ip, 981 struct xfs_bmbt_irec *irec, 982 xfs_fileoff_t destoff, 983 xfs_off_t new_isize) 984 { 985 struct xfs_mount *mp = ip->i_mount; 986 struct xfs_trans *tp; 987 xfs_fsblock_t firstfsb; 988 unsigned int resblks; 989 struct xfs_defer_ops dfops; 990 struct xfs_bmbt_irec uirec; 991 bool real_extent; 992 xfs_filblks_t rlen; 993 xfs_filblks_t unmap_len; 994 xfs_off_t newlen; 995 int error; 996 997 unmap_len = irec->br_startoff + irec->br_blockcount - destoff; 998 trace_xfs_reflink_punch_range(ip, destoff, unmap_len); 999 1000 /* Only remap normal extents. */ 1001 real_extent = (irec->br_startblock != HOLESTARTBLOCK && 1002 irec->br_startblock != DELAYSTARTBLOCK && 1003 !ISUNWRITTEN(irec)); 1004 1005 /* No reflinking if we're low on space */ 1006 if (real_extent) { 1007 error = xfs_reflink_ag_has_free_space(mp, 1008 XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 1009 if (error) 1010 goto out; 1011 } 1012 1013 /* Start a rolling transaction to switch the mappings */ 1014 resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); 1015 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); 1016 if (error) 1017 goto out; 1018 1019 xfs_ilock(ip, XFS_ILOCK_EXCL); 1020 xfs_trans_ijoin(tp, ip, 0); 1021 1022 /* If we're not just clearing space, then do we have enough quota? */ 1023 if (real_extent) { 1024 error = xfs_trans_reserve_quota_nblks(tp, ip, 1025 irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); 1026 if (error) 1027 goto out_cancel; 1028 } 1029 1030 trace_xfs_reflink_remap(ip, irec->br_startoff, 1031 irec->br_blockcount, irec->br_startblock); 1032 1033 /* Unmap the old blocks in the data fork. 
*/ 1034 rlen = unmap_len; 1035 while (rlen) { 1036 xfs_defer_init(&dfops, &firstfsb); 1037 error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, 1038 &firstfsb, &dfops); 1039 if (error) 1040 goto out_defer; 1041 1042 /* 1043 * Trim the extent to whatever got unmapped. 1044 * Remember, bunmapi works backwards. 1045 */ 1046 uirec.br_startblock = irec->br_startblock + rlen; 1047 uirec.br_startoff = irec->br_startoff + rlen; 1048 uirec.br_blockcount = unmap_len - rlen; 1049 unmap_len = rlen; 1050 1051 /* If this isn't a real mapping, we're done. */ 1052 if (!real_extent || uirec.br_blockcount == 0) 1053 goto next_extent; 1054 1055 trace_xfs_reflink_remap(ip, uirec.br_startoff, 1056 uirec.br_blockcount, uirec.br_startblock); 1057 1058 /* Update the refcount tree */ 1059 error = xfs_refcount_increase_extent(mp, &dfops, &uirec); 1060 if (error) 1061 goto out_defer; 1062 1063 /* Map the new blocks into the data fork. */ 1064 error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); 1065 if (error) 1066 goto out_defer; 1067 1068 /* Update quota accounting. */ 1069 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1070 uirec.br_blockcount); 1071 1072 /* Update dest isize if needed. */ 1073 newlen = XFS_FSB_TO_B(mp, 1074 uirec.br_startoff + uirec.br_blockcount); 1075 newlen = min_t(xfs_off_t, newlen, new_isize); 1076 if (newlen > i_size_read(VFS_I(ip))) { 1077 trace_xfs_reflink_update_inode_size(ip, newlen); 1078 i_size_write(VFS_I(ip), newlen); 1079 ip->i_d.di_size = newlen; 1080 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1081 } 1082 1083 next_extent: 1084 /* Process all the deferred stuff. 
*/ 1085 error = xfs_defer_finish(&tp, &dfops, ip); 1086 if (error) 1087 goto out_defer; 1088 } 1089 1090 error = xfs_trans_commit(tp); 1091 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1092 if (error) 1093 goto out; 1094 return 0; 1095 1096 out_defer: 1097 xfs_defer_cancel(&dfops); 1098 out_cancel: 1099 xfs_trans_cancel(tp); 1100 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1101 out: 1102 trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); 1103 return error; 1104 } 1105 1106 /* 1107 * Iteratively remap one file's extents (and holes) to another's. 1108 */ 1109 STATIC int 1110 xfs_reflink_remap_blocks( 1111 struct xfs_inode *src, 1112 xfs_fileoff_t srcoff, 1113 struct xfs_inode *dest, 1114 xfs_fileoff_t destoff, 1115 xfs_filblks_t len, 1116 xfs_off_t new_isize) 1117 { 1118 struct xfs_bmbt_irec imap; 1119 int nimaps; 1120 int error = 0; 1121 xfs_filblks_t range_len; 1122 1123 /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ 1124 while (len) { 1125 trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, 1126 dest, destoff); 1127 /* Read extent from the source file */ 1128 nimaps = 1; 1129 xfs_ilock(src, XFS_ILOCK_EXCL); 1130 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); 1131 xfs_iunlock(src, XFS_ILOCK_EXCL); 1132 if (error) 1133 goto err; 1134 ASSERT(nimaps == 1); 1135 1136 trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, 1137 &imap); 1138 1139 /* Translate imap into the destination file. */ 1140 range_len = imap.br_startoff + imap.br_blockcount - srcoff; 1141 imap.br_startoff += destoff - srcoff; 1142 1143 /* Clear dest from destoff to the end of imap and map it in. 
*/ 1144 error = xfs_reflink_remap_extent(dest, &imap, destoff, 1145 new_isize); 1146 if (error) 1147 goto err; 1148 1149 if (fatal_signal_pending(current)) { 1150 error = -EINTR; 1151 goto err; 1152 } 1153 1154 /* Advance drange/srange */ 1155 srcoff += range_len; 1156 destoff += range_len; 1157 len -= range_len; 1158 } 1159 1160 return 0; 1161 1162 err: 1163 trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); 1164 return error; 1165 } 1166 1167 /* 1168 * Read a page's worth of file data into the page cache. Return the page 1169 * locked. 1170 */ 1171 static struct page * 1172 xfs_get_page( 1173 struct inode *inode, 1174 xfs_off_t offset) 1175 { 1176 struct address_space *mapping; 1177 struct page *page; 1178 pgoff_t n; 1179 1180 n = offset >> PAGE_SHIFT; 1181 mapping = inode->i_mapping; 1182 page = read_mapping_page(mapping, n, NULL); 1183 if (IS_ERR(page)) 1184 return page; 1185 if (!PageUptodate(page)) { 1186 put_page(page); 1187 return ERR_PTR(-EIO); 1188 } 1189 lock_page(page); 1190 return page; 1191 } 1192 1193 /* 1194 * Compare extents of two files to see if they are the same. 
1195 */ 1196 static int 1197 xfs_compare_extents( 1198 struct inode *src, 1199 xfs_off_t srcoff, 1200 struct inode *dest, 1201 xfs_off_t destoff, 1202 xfs_off_t len, 1203 bool *is_same) 1204 { 1205 xfs_off_t src_poff; 1206 xfs_off_t dest_poff; 1207 void *src_addr; 1208 void *dest_addr; 1209 struct page *src_page; 1210 struct page *dest_page; 1211 xfs_off_t cmp_len; 1212 bool same; 1213 int error; 1214 1215 error = -EINVAL; 1216 same = true; 1217 while (len) { 1218 src_poff = srcoff & (PAGE_SIZE - 1); 1219 dest_poff = destoff & (PAGE_SIZE - 1); 1220 cmp_len = min(PAGE_SIZE - src_poff, 1221 PAGE_SIZE - dest_poff); 1222 cmp_len = min(cmp_len, len); 1223 ASSERT(cmp_len > 0); 1224 1225 trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, 1226 XFS_I(dest), destoff); 1227 1228 src_page = xfs_get_page(src, srcoff); 1229 if (IS_ERR(src_page)) { 1230 error = PTR_ERR(src_page); 1231 goto out_error; 1232 } 1233 dest_page = xfs_get_page(dest, destoff); 1234 if (IS_ERR(dest_page)) { 1235 error = PTR_ERR(dest_page); 1236 unlock_page(src_page); 1237 put_page(src_page); 1238 goto out_error; 1239 } 1240 src_addr = kmap_atomic(src_page); 1241 dest_addr = kmap_atomic(dest_page); 1242 1243 flush_dcache_page(src_page); 1244 flush_dcache_page(dest_page); 1245 1246 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) 1247 same = false; 1248 1249 kunmap_atomic(dest_addr); 1250 kunmap_atomic(src_addr); 1251 unlock_page(dest_page); 1252 unlock_page(src_page); 1253 put_page(dest_page); 1254 put_page(src_page); 1255 1256 if (!same) 1257 break; 1258 1259 srcoff += cmp_len; 1260 destoff += cmp_len; 1261 len -= cmp_len; 1262 } 1263 1264 *is_same = same; 1265 return 0; 1266 1267 out_error: 1268 trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); 1269 return error; 1270 } 1271 1272 /* 1273 * Link a range of blocks from one file to another. 
1274 */ 1275 int 1276 xfs_reflink_remap_range( 1277 struct file *file_in, 1278 loff_t pos_in, 1279 struct file *file_out, 1280 loff_t pos_out, 1281 u64 len, 1282 bool is_dedupe) 1283 { 1284 struct inode *inode_in = file_inode(file_in); 1285 struct xfs_inode *src = XFS_I(inode_in); 1286 struct inode *inode_out = file_inode(file_out); 1287 struct xfs_inode *dest = XFS_I(inode_out); 1288 struct xfs_mount *mp = src->i_mount; 1289 loff_t bs = inode_out->i_sb->s_blocksize; 1290 bool same_inode = (inode_in == inode_out); 1291 xfs_fileoff_t sfsbno, dfsbno; 1292 xfs_filblks_t fsblen; 1293 xfs_extlen_t cowextsize; 1294 loff_t isize; 1295 ssize_t ret; 1296 loff_t blen; 1297 1298 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1299 return -EOPNOTSUPP; 1300 1301 if (XFS_FORCED_SHUTDOWN(mp)) 1302 return -EIO; 1303 1304 /* Lock both files against IO */ 1305 if (same_inode) { 1306 xfs_ilock(src, XFS_IOLOCK_EXCL); 1307 xfs_ilock(src, XFS_MMAPLOCK_EXCL); 1308 } else { 1309 xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); 1310 xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); 1311 } 1312 1313 /* Don't touch certain kinds of inodes */ 1314 ret = -EPERM; 1315 if (IS_IMMUTABLE(inode_out)) 1316 goto out_unlock; 1317 1318 ret = -ETXTBSY; 1319 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) 1320 goto out_unlock; 1321 1322 1323 /* Don't reflink dirs, pipes, sockets... */ 1324 ret = -EISDIR; 1325 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1326 goto out_unlock; 1327 ret = -EINVAL; 1328 if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) 1329 goto out_unlock; 1330 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1331 goto out_unlock; 1332 1333 /* Don't reflink realtime inodes */ 1334 if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) 1335 goto out_unlock; 1336 1337 /* Don't share DAX file data for now. */ 1338 if (IS_DAX(inode_in) || IS_DAX(inode_out)) 1339 goto out_unlock; 1340 1341 /* Are we going all the way to the end? 
*/ 1342 isize = i_size_read(inode_in); 1343 if (isize == 0) { 1344 ret = 0; 1345 goto out_unlock; 1346 } 1347 1348 if (len == 0) 1349 len = isize - pos_in; 1350 1351 /* Ensure offsets don't wrap and the input is inside i_size */ 1352 if (pos_in + len < pos_in || pos_out + len < pos_out || 1353 pos_in + len > isize) 1354 goto out_unlock; 1355 1356 /* Don't allow dedupe past EOF in the dest file */ 1357 if (is_dedupe) { 1358 loff_t disize; 1359 1360 disize = i_size_read(inode_out); 1361 if (pos_out >= disize || pos_out + len > disize) 1362 goto out_unlock; 1363 } 1364 1365 /* If we're linking to EOF, continue to the block boundary. */ 1366 if (pos_in + len == isize) 1367 blen = ALIGN(isize, bs) - pos_in; 1368 else 1369 blen = len; 1370 1371 /* Only reflink if we're aligned to block boundaries */ 1372 if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || 1373 !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) 1374 goto out_unlock; 1375 1376 /* Don't allow overlapped reflink within the same file */ 1377 if (same_inode) { 1378 if (pos_out + blen > pos_in && pos_out < pos_in + blen) 1379 goto out_unlock; 1380 } 1381 1382 /* Wait for the completion of any pending IOs on both files */ 1383 inode_dio_wait(inode_in); 1384 if (!same_inode) 1385 inode_dio_wait(inode_out); 1386 1387 ret = filemap_write_and_wait_range(inode_in->i_mapping, 1388 pos_in, pos_in + len - 1); 1389 if (ret) 1390 goto out_unlock; 1391 1392 ret = filemap_write_and_wait_range(inode_out->i_mapping, 1393 pos_out, pos_out + len - 1); 1394 if (ret) 1395 goto out_unlock; 1396 1397 trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); 1398 1399 /* 1400 * Check that the extents are the same. 
1401 */ 1402 if (is_dedupe) { 1403 bool is_same = false; 1404 1405 ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, 1406 len, &is_same); 1407 if (ret) 1408 goto out_unlock; 1409 if (!is_same) { 1410 ret = -EBADE; 1411 goto out_unlock; 1412 } 1413 } 1414 1415 ret = xfs_reflink_set_inode_flag(src, dest); 1416 if (ret) 1417 goto out_unlock; 1418 1419 /* 1420 * Invalidate the page cache so that we can clear any CoW mappings 1421 * in the destination file. 1422 */ 1423 truncate_inode_pages_range(&inode_out->i_data, pos_out, 1424 PAGE_ALIGN(pos_out + len) - 1); 1425 1426 dfsbno = XFS_B_TO_FSBT(mp, pos_out); 1427 sfsbno = XFS_B_TO_FSBT(mp, pos_in); 1428 fsblen = XFS_B_TO_FSB(mp, len); 1429 ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, 1430 pos_out + len); 1431 if (ret) 1432 goto out_unlock; 1433 1434 /* 1435 * Carry the cowextsize hint from src to dest if we're sharing the 1436 * entire source file to the entire destination file, the source file 1437 * has a cowextsize hint, and the destination file does not. 1438 */ 1439 cowextsize = 0; 1440 if (pos_in == 0 && len == i_size_read(inode_in) && 1441 (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && 1442 pos_out == 0 && len >= i_size_read(inode_out) && 1443 !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) 1444 cowextsize = src->i_d.di_cowextsize; 1445 1446 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); 1447 1448 out_unlock: 1449 xfs_iunlock(src, XFS_MMAPLOCK_EXCL); 1450 xfs_iunlock(src, XFS_IOLOCK_EXCL); 1451 if (src->i_ino != dest->i_ino) { 1452 xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); 1453 xfs_iunlock(dest, XFS_IOLOCK_EXCL); 1454 } 1455 if (ret) 1456 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); 1457 return ret; 1458 } 1459 1460 /* 1461 * The user wants to preemptively CoW all shared blocks in this file, 1462 * which enables us to turn off the reflink flag. 
Iterate all 1463 * extents which are not prealloc/delalloc to see which ranges are 1464 * mentioned in the refcount tree, then read those blocks into the 1465 * pagecache, dirty them, fsync them back out, and then we can update 1466 * the inode flag. What happens if we run out of memory? :) 1467 */ 1468 STATIC int 1469 xfs_reflink_dirty_extents( 1470 struct xfs_inode *ip, 1471 xfs_fileoff_t fbno, 1472 xfs_filblks_t end, 1473 xfs_off_t isize) 1474 { 1475 struct xfs_mount *mp = ip->i_mount; 1476 xfs_agnumber_t agno; 1477 xfs_agblock_t agbno; 1478 xfs_extlen_t aglen; 1479 xfs_agblock_t rbno; 1480 xfs_extlen_t rlen; 1481 xfs_off_t fpos; 1482 xfs_off_t flen; 1483 struct xfs_bmbt_irec map[2]; 1484 int nmaps; 1485 int error = 0; 1486 1487 while (end - fbno > 0) { 1488 nmaps = 1; 1489 /* 1490 * Look for extents in the file. Skip holes, delalloc, or 1491 * unwritten extents; they can't be reflinked. 1492 */ 1493 error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); 1494 if (error) 1495 goto out; 1496 if (nmaps == 0) 1497 break; 1498 if (map[0].br_startblock == HOLESTARTBLOCK || 1499 map[0].br_startblock == DELAYSTARTBLOCK || 1500 ISUNWRITTEN(&map[0])) 1501 goto next; 1502 1503 map[1] = map[0]; 1504 while (map[1].br_blockcount) { 1505 agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); 1506 agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); 1507 aglen = map[1].br_blockcount; 1508 1509 error = xfs_reflink_find_shared(mp, agno, agbno, aglen, 1510 &rbno, &rlen, true); 1511 if (error) 1512 goto out; 1513 if (rbno == NULLAGBLOCK) 1514 break; 1515 1516 /* Dirty the pages */ 1517 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1518 fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + 1519 (rbno - agbno)); 1520 flen = XFS_FSB_TO_B(mp, rlen); 1521 if (fpos + flen > isize) 1522 flen = isize - fpos; 1523 error = iomap_file_dirty(VFS_I(ip), fpos, flen, 1524 &xfs_iomap_ops); 1525 xfs_ilock(ip, XFS_ILOCK_EXCL); 1526 if (error) 1527 goto out; 1528 1529 map[1].br_blockcount -= (rbno - agbno + rlen); 1530 
map[1].br_startoff += (rbno - agbno + rlen); 1531 map[1].br_startblock += (rbno - agbno + rlen); 1532 } 1533 1534 next: 1535 fbno = map[0].br_startoff + map[0].br_blockcount; 1536 } 1537 out: 1538 return error; 1539 } 1540 1541 /* Clear the inode reflink flag if there are no shared extents. */ 1542 int 1543 xfs_reflink_clear_inode_flag( 1544 struct xfs_inode *ip, 1545 struct xfs_trans **tpp) 1546 { 1547 struct xfs_mount *mp = ip->i_mount; 1548 xfs_fileoff_t fbno; 1549 xfs_filblks_t end; 1550 xfs_agnumber_t agno; 1551 xfs_agblock_t agbno; 1552 xfs_extlen_t aglen; 1553 xfs_agblock_t rbno; 1554 xfs_extlen_t rlen; 1555 struct xfs_bmbt_irec map; 1556 int nmaps; 1557 int error = 0; 1558 1559 ASSERT(xfs_is_reflink_inode(ip)); 1560 1561 fbno = 0; 1562 end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); 1563 while (end - fbno > 0) { 1564 nmaps = 1; 1565 /* 1566 * Look for extents in the file. Skip holes, delalloc, or 1567 * unwritten extents; they can't be reflinked. 1568 */ 1569 error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); 1570 if (error) 1571 return error; 1572 if (nmaps == 0) 1573 break; 1574 if (map.br_startblock == HOLESTARTBLOCK || 1575 map.br_startblock == DELAYSTARTBLOCK || 1576 ISUNWRITTEN(&map)) 1577 goto next; 1578 1579 agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); 1580 agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); 1581 aglen = map.br_blockcount; 1582 1583 error = xfs_reflink_find_shared(mp, agno, agbno, aglen, 1584 &rbno, &rlen, false); 1585 if (error) 1586 return error; 1587 /* Is there still a shared block here? */ 1588 if (rbno != NULLAGBLOCK) 1589 return 0; 1590 next: 1591 fbno = map.br_startoff + map.br_blockcount; 1592 } 1593 1594 /* 1595 * We didn't find any shared blocks so turn off the reflink flag. 1596 * First, get rid of any leftover CoW mappings. 1597 */ 1598 error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); 1599 if (error) 1600 return error; 1601 1602 /* Clear the inode flag. 
*/ 1603 trace_xfs_reflink_unset_inode_flag(ip); 1604 ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; 1605 xfs_inode_clear_cowblocks_tag(ip); 1606 xfs_trans_ijoin(*tpp, ip, 0); 1607 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); 1608 1609 return error; 1610 } 1611 1612 /* 1613 * Clear the inode reflink flag if there are no shared extents and the size 1614 * hasn't changed. 1615 */ 1616 STATIC int 1617 xfs_reflink_try_clear_inode_flag( 1618 struct xfs_inode *ip) 1619 { 1620 struct xfs_mount *mp = ip->i_mount; 1621 struct xfs_trans *tp; 1622 int error = 0; 1623 1624 /* Start a rolling transaction to remove the mappings */ 1625 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); 1626 if (error) 1627 return error; 1628 1629 xfs_ilock(ip, XFS_ILOCK_EXCL); 1630 xfs_trans_ijoin(tp, ip, 0); 1631 1632 error = xfs_reflink_clear_inode_flag(ip, &tp); 1633 if (error) 1634 goto cancel; 1635 1636 error = xfs_trans_commit(tp); 1637 if (error) 1638 goto out; 1639 1640 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1641 return 0; 1642 cancel: 1643 xfs_trans_cancel(tp); 1644 out: 1645 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1646 return error; 1647 } 1648 1649 /* 1650 * Pre-COW all shared blocks within a given byte range of a file and turn off 1651 * the reflink flag if we unshare all of the file's blocks. 
1652 */ 1653 int 1654 xfs_reflink_unshare( 1655 struct xfs_inode *ip, 1656 xfs_off_t offset, 1657 xfs_off_t len) 1658 { 1659 struct xfs_mount *mp = ip->i_mount; 1660 xfs_fileoff_t fbno; 1661 xfs_filblks_t end; 1662 xfs_off_t isize; 1663 int error; 1664 1665 if (!xfs_is_reflink_inode(ip)) 1666 return 0; 1667 1668 trace_xfs_reflink_unshare(ip, offset, len); 1669 1670 inode_dio_wait(VFS_I(ip)); 1671 1672 /* Try to CoW the selected ranges */ 1673 xfs_ilock(ip, XFS_ILOCK_EXCL); 1674 fbno = XFS_B_TO_FSBT(mp, offset); 1675 isize = i_size_read(VFS_I(ip)); 1676 end = XFS_B_TO_FSB(mp, offset + len); 1677 error = xfs_reflink_dirty_extents(ip, fbno, end, isize); 1678 if (error) 1679 goto out_unlock; 1680 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1681 1682 /* Wait for the IO to finish */ 1683 error = filemap_write_and_wait(VFS_I(ip)->i_mapping); 1684 if (error) 1685 goto out; 1686 1687 /* Turn off the reflink flag if possible. */ 1688 error = xfs_reflink_try_clear_inode_flag(ip); 1689 if (error) 1690 goto out; 1691 1692 return 0; 1693 1694 out_unlock: 1695 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1696 out: 1697 trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); 1698 return error; 1699 } 1700 1701 /* 1702 * Does this inode have any real CoW reservations? 1703 */ 1704 bool 1705 xfs_reflink_has_real_cow_blocks( 1706 struct xfs_inode *ip) 1707 { 1708 struct xfs_bmbt_irec irec; 1709 struct xfs_ifork *ifp; 1710 struct xfs_bmbt_rec_host *gotp; 1711 xfs_extnum_t idx; 1712 1713 if (!xfs_is_reflink_inode(ip)) 1714 return false; 1715 1716 /* Go find the old extent in the CoW fork. */ 1717 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 1718 gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); 1719 while (gotp) { 1720 xfs_bmbt_get_all(gotp, &irec); 1721 1722 if (!isnullstartblock(irec.br_startblock)) 1723 return true; 1724 1725 /* Roll on... */ 1726 idx++; 1727 if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) 1728 break; 1729 gotp = xfs_iext_get_ext(ifp, idx); 1730 } 1731 1732 return false; 1733 } 1734