// SPDX-License-Identifier: GPL-2.0

#include <linux/blkdev.h>
#include <linux/iversion.h>
#include "messages.h"
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"

#define BTRFS_MAX_DEDUPE_LEN	SZ_16M

static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update) {
		inode->i_mtime = current_time(inode);
		inode->i_ctime = inode->i_mtime;
	}
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	}

	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}
	ret = btrfs_end_transaction(trans);
out:
	return ret;
}

static int copy_inline_to_page(struct btrfs_inode *inode,
			       const u64 file_offset,
			       char *inline_data,
			       const u64 size,
			       const u64 datal,
			       const u8 comp_type)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 block_size = fs_info->sectorsize;
	const u64 range_end = file_offset + block_size - 1;
	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
	struct extent_changeset *data_reserved = NULL;
	struct page *page = NULL;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	ASSERT(IS_ALIGNED(file_offset, block_size));

	/*
	 * We have flushed and locked the ranges of the source and destination
	 * inodes, we also have locked the inodes, so we are safe to do a
	 * reservation here. Also we must not do the reservation while holding
	 * a transaction open, otherwise we would deadlock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
					   block_size);
	if (ret)
		goto out;

	page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
				   btrfs_alloc_write_mask(mapping));
	if (!page) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto out_unlock;

	clear_extent_bit(&inode->io_tree, file_offset, range_end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 NULL);
	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * After dirtying the page our caller will need to start a transaction,
	 * and if we are low on metadata free space, that can cause flushing of
	 * delalloc for all inodes in order to get metadata space released.
	 * However we are holding the range locked for the whole duration of
	 * the clone/dedupe operation, so we may deadlock if that happens and no
	 * other task releases enough space. So mark this inode as not being
	 * possible to flush to avoid such deadlock. We will clear that flag
	 * when we finish cloning all extents, since a transaction is started
	 * after finding each extent to clone.
	 */
	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
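
	/*
	 * Uncompressed inline data can be copied straight into the page, while
	 * compressed inline data must be decompressed into it. Inline extents
	 * never cover more than one block, so one block of the page is all we
	 * need to fill.
	 */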
	if (comp_type == BTRFS_COMPRESS_NONE) {
		memcpy_to_page(page, offset_in_page(file_offset), data_start,
			       datal);
	} else {
		ret = btrfs_decompress(comp_type, data_start, page,
				       offset_in_page(file_offset),
				       inline_size, datal);
		if (ret)
			goto out_unlock;
		flush_dcache_page(page);
	}

	/*
	 * If our inline data is smaller than the block/page size, then the
	 * remainder of the block/page is equivalent to zeroes. We had something
	 * like the following done:
	 *
	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
	 * $ sync  # (or fsync)
	 * $ xfs_io -c "falloc 0 4K" file
	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K" file
	 *
	 * So what's in the range [500, 4095] corresponds to zeroes.
	 */
	if (datal < block_size)
		memzero_page(page, datal, block_size - datal);

	btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
	btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
	btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	if (ret)
		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
					     block_size, true);
	btrfs_delalloc_release_extents(inode, block_size);
out:
	extent_changeset_free(data_reserved);

	return ret;
}

/*
 * Deal with cloning of inline extents. We try to copy the inline extent from
 * the source inode to the destination inode when possible. When not possible,
 * we copy the inline extent's data into the respective page of the inode.
 */
static int clone_copy_inline_extent(struct inode *dst,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 size,
				    const u8 comp_type,
				    char *inline_data,
				    struct btrfs_trans_handle **trans_out)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
	struct btrfs_root *root = BTRFS_I(dst)->root;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;
	struct btrfs_key key;

	if (new_key->offset > 0) {
		ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
					  inline_data, size, datal, comp_type);
		goto out;
	}
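
	/*
	 * The destination offset is 0. Check what currently exists at file
	 * offset 0 in the destination to decide whether the inline extent can
	 * be cloned as an item or its data must be copied into a page.
	 */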
	key.objectid = btrfs_ino(BTRFS_I(dst));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			/*
			 * There's an implicit hole at file offset 0, copy the
			 * inline extent's data to the page.
			 */
			ASSERT(key.offset > 0);
			goto copy_to_page;
		}
	} else if (i_size_read(dst) <= datal) {
		struct btrfs_file_extent_item *ei;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent replace it with the source inline
		 * extent, otherwise copy the source inline extent data into
		 * the respective page at the destination inode.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		goto copy_to_page;
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt with in the same
	 * way.
	 */
	if (i_size_read(dst) > datal) {
		/*
		 * At the destination offset 0 we have either a hole, a regular
		 * extent or an inline extent larger than the one we want to
		 * clone. Deal with all these cases by copying the inline extent
		 * data into the respective page at the destination inode.
		 */
		goto copy_to_page;
	}

	/*
	 * Release path before starting a new transaction so we don't hold locks
	 * that would confuse lockdep.
	 */
	btrfs_release_path(path);
	/*
	 * If we end up here it means we are copying the inline extent into a
	 * leaf of the destination inode. We know we will drop or adjust at
	 * most one extent item in the destination root.
	 *
	 * 1 unit - adjusting old extent (we may have to split it)
	 * 1 unit - add new extent
	 * 1 unit - inode update
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}
	drop_args.path = path;
	drop_args.start = drop_start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
	if (ret)
		goto out;
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (ret)
		goto out;

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
	btrfs_set_inode_full_sync(BTRFS_I(dst));
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
	if (!ret && !trans) {
		/*
		 * No transaction here means we copied the inline extent into a
		 * page of the destination inode.
		 *
		 * 1 unit to update inode item
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
		}
	}
	if (ret && trans) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}
	if (!ret)
		*trans_out = trans;

	return ret;

copy_to_page:
	/*
	 * Release our path because we don't need it anymore and also because
	 * copy_inline_to_page() needs to reserve data and metadata, which may
	 * need to flush delalloc when we are low on available space and
	 * therefore cause a deadlock if writeback of an inline extent needs to
	 * write to the same leaf or an ordered extent completion needs to write
	 * to the same leaf.
	 */
	btrfs_release_path(path);

	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				  inline_data, size, datal, comp_type);
	goto out;
}
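
/*
 * Note: the clone loop in btrfs_clone() below walks the source inode's file
 * extent items that overlap the source range and, for each item found, either
 * links the extent into the destination (regular and prealloc extents) or
 * copies its data (inline extents), with a transaction started per processed
 * item.
 */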

/**
 * btrfs_clone() - clone a range from one inode's file to another
 *
 * @src: Inode to clone from
 * @inode: Inode to clone to
 * @off: Offset within source to start clone from
 * @olen: Original length, passed by user, of range to clone
 * @olen_aligned: Block-aligned value of olen
 * @destoff: Offset within @inode to start clone
 * @no_time_update: Whether to update mtime/ctime on the target inode
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, int no_time_update)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path = NULL;
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char *buf = NULL;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	u64 last_dest_end = destoff;
	u64 prev_extent_end = off;

	ret = -ENOMEM;
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		kvfree(buf);
		return ret;
	}

	path->reada = READA_FORWARD;
	/* Clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	while (1) {
		struct btrfs_file_extent_item *extent;
		u64 extent_gen;
		int type;
		u32 size;
		struct btrfs_key new_key;
		u64 disko = 0, diskl = 0;
		u64 datao = 0, datal = 0;
		u8 comp;
		u64 drop_start;

		/* Note the key will change type as we walk through the tree */
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
					0, 0);
		if (ret < 0)
			goto out;
		/*
		 * First search, if no extent item that starts at offset off was
		 * found but the previous item is an extent item, it's possible
		 * it might overlap our target range, therefore process it.
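		 *
		 * For example (illustrative numbers): if the source has a
		 * single 1M extent at file offset 0 and off is 512K, the
		 * search leaves us right after that extent item, yet the item
		 * at the previous slot covers [0, 1M) and overlaps the range.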
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}

		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		extent_gen = btrfs_file_extent_generation(leaf, extent);
		comp = btrfs_file_extent_compression(leaf, extent);
		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
			datao = btrfs_file_extent_offset(leaf, extent);
			datal = btrfs_file_extent_num_bytes(leaf, extent);
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Take upper bound, may be compressed */
			datal = btrfs_file_extent_ram_bytes(leaf, extent);
		}

		/*
		 * The first search might have left us at an extent item that
		 * ends before our target range's start, which can happen if we
		 * have holes and the NO_HOLES feature enabled.
		 *
		 * Subsequent searches may leave us on a file range we have
		 * processed before - this happens due to a race with ordered
		 * extent completion for a file range that is outside our source
		 * range, but that range was part of a file extent item that
		 * also covered a leading part of our source range.
		 */
		if (key.offset + datal <= prev_extent_end) {
			path->slots[0]++;
			goto process_slot;
		} else if (key.offset >= off + len) {
			break;
		}

		prev_extent_end = key.offset + datal;
		size = btrfs_item_size(leaf, slot);
		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				   size);

		btrfs_release_path(path);

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.objectid = btrfs_ino(BTRFS_I(inode));
		if (off <= key.offset)
			new_key.offset = key.offset + destoff - off;
		else
			new_key.offset = destoff;

		/*
		 * Deal with a hole that doesn't have an extent item that
		 * represents it (NO_HOLES feature enabled).
		 * This hole is either in the middle of the cloning range or at
		 * the beginning (fully overlaps it or partially overlaps it).
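		 * Either way the hole must be reflected in the destination, so
		 * extend the drop range back to last_dest_end (see drop_start
		 * below) instead of starting it at this extent's offset.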
		 */
		if (new_key.offset != last_dest_end)
			drop_start = last_dest_end;
		else
			drop_start = new_key.offset;

		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct btrfs_replace_extent_info clone_info;

			/*
			 *    a  | --- range to clone ---|  b
			 * | ------------- extent ------------- |
			 */

			/* Subtract range b */
			if (key.offset + datal > off + len)
				datal = off + len - key.offset;

			/* Subtract range a */
			if (off > key.offset) {
				datao += off - key.offset;
				datal -= off - key.offset;
			}

			clone_info.disk_offset = disko;
			clone_info.disk_len = diskl;
			clone_info.data_offset = datao;
			clone_info.data_len = datal;
			clone_info.file_offset = new_key.offset;
			clone_info.extent_buf = buf;
			clone_info.is_new_extent = false;
			clone_info.update_times = !no_time_update;
			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
					drop_start, new_key.offset + datal - 1,
					&clone_info, &trans);
			if (ret)
				goto out;
		} else {
			ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
			/*
			 * Inline extents always have to start at file offset 0
			 * and can never be bigger than the sector size. We can
			 * never clone only parts of an inline extent, since all
			 * reflink operations must start at a sector size aligned
			 * offset, and the length must be aligned too or end at
			 * the i_size (which implies the whole inlined data).
			 */
			ASSERT(key.offset == 0);
			ASSERT(datal <= fs_info->sectorsize);
			if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
			    WARN_ON(key.offset != 0) ||
			    WARN_ON(datal > fs_info->sectorsize)) {
				ret = -EUCLEAN;
				goto out;
			}

			ret = clone_copy_inline_extent(inode, path, &new_key,
						       drop_start, datal, size,
						       comp, buf, &trans);
			if (ret)
				goto out;
		}

		btrfs_release_path(path);

		/*
		 * Whenever we share an extent we update the last_reflink_trans
		 * of each inode to the current transaction. This is needed to
		 * make sure fsync does not log multiple checksum items with
		 * overlapping ranges (because some extent items might refer
		 * only to sections of the original extent). For the destination
		 * inode we do this regardless of the generation of the extents
		 * or even if they are inline extents or explicit holes, to make
		 * sure a full fsync does not skip them. For the source inode,
		 * we only need to update last_reflink_trans in case it's a new
		 * extent that is not a hole or an inline extent, to deal with
		 * the checksums problem on fsync.
		 */
		if (extent_gen == trans->transid && disko > 0)
			BTRFS_I(src)->last_reflink_trans = trans->transid;

		BTRFS_I(inode)->last_reflink_trans = trans->transid;

		last_dest_end = ALIGN(new_key.offset + datal,
				      fs_info->sectorsize);
		ret = clone_finish_inode_update(trans, inode, last_dest_end,
						destoff, olen, no_time_update);
		if (ret)
			goto out;
		if (new_key.offset + datal >= destoff + len)
			break;

		btrfs_release_path(path);
		key.offset = prev_extent_end;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}

		cond_resched();
	}
	ret = 0;

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole that fully or partially overlaps our
		 * cloning range at its end. This means that we either have the
		 * NO_HOLES feature enabled or the implicit hole happened due to
		 * mixing buffered and direct IO writes against this file.
		 */
		btrfs_release_path(path);

		/*
		 * When using NO_HOLES and we are cloning a range that covers
		 * only a hole (no extents) into a range beyond the current
		 * i_size, punching a hole in the target range will not create
		 * an extent map defining a hole, because the range starts at or
		 * beyond current i_size. If the file previously had an i_size
		 * greater than the new i_size set by this clone operation, we
		 * need to make sure the next fsync is a full fsync, so that it
		 * detects and logs a hole covering a range from the current
		 * i_size to the new i_size. If the clone range covers extents,
		 * besides a hole, then we know the full sync flag was already
		 * set by previous calls to btrfs_replace_file_extents() that
		 * replaced file extent items.
		 */
		if (last_dest_end >= i_size_read(inode))
			btrfs_set_inode_full_sync(BTRFS_I(inode));
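
		/*
		 * Punch a hole covering the remainder of the destination
		 * range: passing a NULL extent info makes
		 * btrfs_replace_file_extents() drop any existing extent items
		 * in the range without linking in a new extent.
		 */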
		ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
				last_dest_end, destoff + len - 1, NULL,
				&trans);
		if (ret)
			goto out;

		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	btrfs_free_path(path);
	kvfree(buf);
	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);

	return ret;
}

static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}

static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	u64 range1_end = loff1 + len - 1;
	u64 range2_end = loff2 + len - 1;

	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	} else if (inode1 == inode2 && loff2 < loff1) {
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);

	btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
	btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}

static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
	if (inode1 < inode2)
		swap(inode1, inode2);
	down_write(&BTRFS_I(inode1)->i_mmap_lock);
	down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
}

static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
{
	up_write(&BTRFS_I(inode1)->i_mmap_lock);
	up_write(&BTRFS_I(inode2)->i_mmap_lock);
}

static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				   struct inode *dst, u64 dst_loff)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
	const u64 bs = fs_info->sb->s_blocksize;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	int ret = 0;
	u64 i, tail_len, chunk_count;
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;

	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      root_dst->root_key.objectid,
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
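
	/*
	 * Deduplicate in chunks of at most BTRFS_MAX_DEDUPE_LEN (16M). For
	 * example (illustrative numbers only): an olen of 40M gives a
	 * chunk_count of 2 full 16M chunks plus a tail_len of 8M, which is
	 * handled after the loop.
	 */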
	for (i = 0; i < chunk_count; i++) {
		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
					      dst, dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	if (tail_len > 0)
		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
out:
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}

static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
				      u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;
	int wb_ret;
	u64 len = olen;
	u64 bs = fs_info->sb->s_blocksize;
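
	/*
	 * If the range ends at the source's eof and that eof is not block
	 * aligned, extend the length up to the block-aligned i_size so the
	 * tail of the last block is included in the clone.
	 */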
	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in corruption
	 * if the file size is not blocksize aligned. So we don't need to check
	 * for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);

		ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right after
		 * we found the previous extent covering eof and before we
		 * attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(inode, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);

	/*
	 * We may have copied an inline extent into a page of the destination
	 * range, so wait for writeback to complete before truncating pages
	 * from the page cache. This is a rare case.
	 */
	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
	ret = ret ? ret : wb_ret;
	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	truncate_inode_pages_range(&inode->i_data,
				   round_down(destoff, PAGE_SIZE),
				   round_up(destoff + len, PAGE_SIZE) - 1);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
	u64 wb_len;
	int ret;

	if (!(remap_flags & REMAP_FILE_DEDUP)) {
		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;

		if (btrfs_root_readonly(root_out))
			return -EROFS;

		ASSERT(inode_in->i_sb == inode_out->i_sb);
	}

	/* Don't make the dst file partly checksummed */
	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
		return -EINVAL;
	}

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and cannot rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression we must call filemap_fdatawrite_range() twice
	 *    (btrfs_fdatawrite_range() does it for us), and the generic
	 *    helper only calls it once;
	 *
	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
	 *    waits for the writeback to complete, i.e. for IO to be done, and
	 *    not for the ordered extents to complete. We need to wait for them
	 *    to complete so that new file extent items are in the fs tree.
	 */
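	/*
	 * A zero length means "remap from pos_in to eof" for a clone, so in
	 * that case writeback must cover everything from the block containing
	 * pos_in up to the block-aligned i_size; otherwise the block-aligned
	 * request length is enough.
	 */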
	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
	else
		wb_len = ALIGN(*len, bs);

	/*
	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * NOCOW buffered writes without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, and thus could cause
	 * data loss.
	 *
	 * Here we take a shortcut by flushing the whole inode, so that all
	 * nocow writes should reach disk as nocow before we increase the
	 * reference of the extent. We could do better by only flushing NOCOW
	 * data, but that needs extra accounting.
	 *
	 * Also we don't need to check ASYNC_EXTENT, as async extents will be
	 * CoWed anyway, not affecting the nocow part.
	 */
	ret = filemap_flush(inode_in->i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				       wb_len);
	if (ret < 0)
		return ret;
	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				       wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					     len, remap_flags);
}

static bool file_sync_write(const struct file *file)
{
	if (file->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(file)))
		return true;

	return false;
}

loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
			      struct file *dst_file, loff_t destoff, loff_t len,
			      unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	bool same_inode = dst_inode == src_inode;
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (same_inode) {
		btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		lock_two_nondirectories(src_inode, dst_inode);
		btrfs_double_mmap_lock(src_inode, dst_inode);
	}

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	if (remap_flags & REMAP_FILE_DEDUP)
		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
	else
		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
	if (same_inode) {
		btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		btrfs_double_mmap_unlock(src_inode, dst_inode);
		unlock_two_nondirectories(src_inode, dst_inode);
	}

	/*
	 * If either the source or the destination file was opened with O_SYNC,
	 * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
	 * source files/ranges, so that a successful return (0) followed by a
	 * power failure still leaves the reflinked data readable from both
	 * files/ranges.
	 */
	if (ret == 0 && len > 0 &&
	    (file_sync_write(src_file) || file_sync_write(dst_file))) {
		ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
		if (ret == 0)
			ret = btrfs_sync_file(dst_file, destoff,
					      destoff + len - 1, 0);
	}

	return ret < 0 ? ret : len;
}