// SPDX-License-Identifier: GPL-2.0

#include <linux/blkdev.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "compression.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
#include "accessors.h"

#define BTRFS_MAX_DEDUPE_LEN	SZ_16M

static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update) {
		inode->i_mtime = current_time(inode);
		inode->i_ctime = inode->i_mtime;
	}
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	}

	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}
	ret = btrfs_end_transaction(trans);
out:
	return ret;
}

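/*
 * Worked example for the endoff clamping above (illustrative numbers,
 * assuming a 4K block size): cloning olen = 5000 bytes to destoff = 0 has
 * its extent range rounded up to 8192 bytes, so the caller may pass
 * endoff = 8192. Since destoff + olen = 5000, endoff is clamped back to
 * 5000 and i_size becomes 5000, not the block aligned 8192.
 */
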
static int copy_inline_to_page(struct btrfs_inode *inode,
			       const u64 file_offset,
			       char *inline_data,
			       const u64 size,
			       const u64 datal,
			       const u8 comp_type)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 block_size = fs_info->sectorsize;
	const u64 range_end = file_offset + block_size - 1;
	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
	struct extent_changeset *data_reserved = NULL;
	struct page *page = NULL;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	ASSERT(IS_ALIGNED(file_offset, block_size));

	/*
	 * We have flushed and locked the ranges of the source and destination
	 * inodes, we also have locked the inodes, so we are safe to do a
	 * reservation here. Also we must not do the reservation while holding
	 * a transaction open, otherwise we would deadlock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
					   block_size);
	if (ret)
		goto out;

	page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
				   btrfs_alloc_write_mask(mapping));
	if (!page) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto out_unlock;

	clear_extent_bit(&inode->io_tree, file_offset, range_end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 NULL);
	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * After dirtying the page our caller will need to start a transaction,
	 * and if we are low on metadata free space, that can cause flushing of
	 * delalloc for all inodes in order to get metadata space released.
	 * However we are holding the range locked for the whole duration of
	 * the clone/dedupe operation, so we may deadlock if that happens and
	 * no other task releases enough space. So mark this inode as not
	 * possible to flush to avoid such a deadlock. We will clear that flag
	 * when we finish cloning all extents, since a transaction is started
	 * after finding each extent to clone.
	 */
	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);

	if (comp_type == BTRFS_COMPRESS_NONE) {
		memcpy_to_page(page, offset_in_page(file_offset), data_start,
			       datal);
	} else {
		ret = btrfs_decompress(comp_type, data_start, page,
				       offset_in_page(file_offset),
				       inline_size, datal);
		if (ret)
			goto out_unlock;
		flush_dcache_page(page);
	}

	/*
	 * If our inline data is smaller than the block/page size, then the
	 * remainder of the block/page is equivalent to zeroes. We had something
	 * like the following done:
	 *
	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
	 * $ sync  # (or fsync)
	 * $ xfs_io -c "falloc 0 4K" file
	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K" file
	 *
	 * So what's in the range [500, 4095] corresponds to zeroes.
	 */
	if (datal < block_size)
		memzero_page(page, datal, block_size - datal);

	btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
	btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
	btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	if (ret)
		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
					     block_size, true);
	btrfs_delalloc_release_extents(inode, block_size);
out:
	extent_changeset_free(data_reserved);

	return ret;
}

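/*
 * Illustration of the tail zeroing above (hypothetical numbers, 4K block
 * size): for an inline extent with datal = 500, bytes [0, 499] of the page
 * receive the (possibly decompressed) inline data, and memzero_page()
 * clears bytes [500, 4095], matching the on-disk semantics where the part
 * of a block not covered by the inline data reads back as zeroes.
 */
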
/*
 * Deal with cloning of inline extents. We try to copy the inline extent from
 * the source inode to destination inode when possible. When not possible we
 * copy the inline extent's data into the respective page of the inode.
 */
static int clone_copy_inline_extent(struct inode *dst,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 size,
				    const u8 comp_type,
				    char *inline_data,
				    struct btrfs_trans_handle **trans_out)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
	struct btrfs_root *root = BTRFS_I(dst)->root;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;
	struct btrfs_key key;

	if (new_key->offset > 0) {
		ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
					  inline_data, size, datal, comp_type);
		goto out;
	}

	key.objectid = btrfs_ino(BTRFS_I(dst));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			/*
			 * There's an implicit hole at file offset 0, copy the
			 * inline extent's data to the page.
			 */
			ASSERT(key.offset > 0);
			goto copy_to_page;
		}
	} else if (i_size_read(dst) <= datal) {
		struct btrfs_file_extent_item *ei;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent replace it with the source inline
		 * extent, otherwise copy the source inline extent data into
		 * the respective page at the destination inode.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		goto copy_to_page;
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt with the same way.
	 */
	if (i_size_read(dst) > datal) {
		/*
		 * At the destination offset 0 we have either a hole, a regular
		 * extent or an inline extent larger than the one we want to
		 * clone. Deal with all these cases by copying the inline extent
		 * data into the respective page at the destination inode.
		 */
		goto copy_to_page;
	}

	/*
	 * Release path before starting a new transaction so we don't hold locks
	 * that would confuse lockdep.
	 */
	btrfs_release_path(path);
	/*
	 * If we end up here it means we are copying the inline extent into a
	 * leaf of the destination inode. We know we will drop or adjust at
	 * most one extent item in the destination root.
	 *
	 * 1 unit - adjusting old extent (we may have to split it)
	 * 1 unit - add new extent
	 * 1 unit - inode update
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}
	drop_args.path = path;
	drop_args.start = drop_start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
	if (ret)
		goto out;
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (ret)
		goto out;

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
	btrfs_set_inode_full_sync(BTRFS_I(dst));
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
	if (!ret && !trans) {
		/*
		 * No transaction here means we copied the inline extent into a
		 * page of the destination inode.
		 *
		 * 1 unit to update inode item
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
		}
	}
	if (ret && trans) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}
	if (!ret)
		*trans_out = trans;

	return ret;

copy_to_page:
	/*
	 * Release our path because we don't need it anymore and also because
	 * copy_inline_to_page() needs to reserve data and metadata, which may
	 * need to flush delalloc when we are low on available space and
	 * therefore cause a deadlock if writeback of an inline extent needs to
	 * write to the same leaf or an ordered extent completion needs to write
	 * to the same leaf.
	 */
	btrfs_release_path(path);

	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				  inline_data, size, datal, comp_type);
	goto out;
}

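/*
 * Summary of the cases handled above (illustrative): an inline extent can
 * only exist at file offset 0, so a destination offset greater than 0
 * always takes the copy_inline_to_page() path. At offset 0, the inline
 * extent item itself is cloned into the destination tree only when the
 * destination has no data there and its i_size does not exceed datal;
 * otherwise the inline data is copied into the destination's page cache.
 */
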
/*
 * Clone a range from one file to another.
 *
 * @src: Inode to clone from
 * @inode: Inode to clone to
 * @off: Offset within source to start clone from
 * @olen: Original length, passed by user, of range to clone
 * @olen_aligned: Block-aligned value of olen
 * @destoff: Offset within @inode to start clone
 * @no_time_update: Whether to update mtime/ctime on the target inode
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, int no_time_update)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path = NULL;
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char *buf = NULL;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	u64 last_dest_end = destoff;
	u64 prev_extent_end = off;

	ret = -ENOMEM;
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		kvfree(buf);
		return ret;
	}

	path->reada = READA_FORWARD;
	/* Clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	while (1) {
		struct btrfs_file_extent_item *extent;
		u64 extent_gen;
		int type;
		u32 size;
		struct btrfs_key new_key;
		u64 disko = 0, diskl = 0;
		u64 datao = 0, datal = 0;
		u8 comp;
		u64 drop_start;

		/* Note the key will change type as we walk through the tree */
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
					0, 0);
		if (ret < 0)
			goto out;
		/*
		 * On the first search, if no extent item starting at offset
		 * off was found but the previous item is an extent item, it
		 * may overlap our target range, so process it.
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}

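		/*
		 * Illustrative case for the step back above: with a 1M extent
		 * item at file offset 0 and off = 64K, the search returns a
		 * position after the item at offset 0 (ret > 0), yet that
		 * extent covers [0, 1M) and so overlaps our range starting at
		 * 64K, which is why we move back one slot to process it.
		 */
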
		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		extent_gen = btrfs_file_extent_generation(leaf, extent);
		comp = btrfs_file_extent_compression(leaf, extent);
		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
			datao = btrfs_file_extent_offset(leaf, extent);
			datal = btrfs_file_extent_num_bytes(leaf, extent);
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Take upper bound, may be compressed */
			datal = btrfs_file_extent_ram_bytes(leaf, extent);
		}

		/*
		 * The first search might have left us at an extent item that
		 * ends before our target range's start; this can happen if we
		 * have holes and the NO_HOLES feature enabled.
		 *
		 * Subsequent searches may leave us on a file range we have
		 * processed before - this happens due to a race with ordered
		 * extent completion for a file range that is outside our source
		 * range, but that range was part of a file extent item that
		 * also covered a leading part of our source range.
		 */
		if (key.offset + datal <= prev_extent_end) {
			path->slots[0]++;
			goto process_slot;
		} else if (key.offset >= off + len) {
			break;
		}

		prev_extent_end = key.offset + datal;
		size = btrfs_item_size(leaf, slot);
		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				   size);

		btrfs_release_path(path);

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.objectid = btrfs_ino(BTRFS_I(inode));
		if (off <= key.offset)
			new_key.offset = key.offset + destoff - off;
		else
			new_key.offset = destoff;

		/*
		 * Deal with a hole that doesn't have an extent item that
		 * represents it (NO_HOLES feature enabled).
		 * This hole is either in the middle of the cloning range or at
		 * the beginning (fully overlaps it or partially overlaps it).
		 */
		if (new_key.offset != last_dest_end)
			drop_start = last_dest_end;
		else
			drop_start = new_key.offset;

		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct btrfs_replace_extent_info clone_info;

			/*
			 *    a  | --- range to clone ---|  b
			 * | ------------- extent ------------- |
			 */

			/* Subtract range b */
			if (key.offset + datal > off + len)
				datal = off + len - key.offset;

			/* Subtract range a */
			if (off > key.offset) {
				datao += off - key.offset;
				datal -= off - key.offset;
			}

			clone_info.disk_offset = disko;
			clone_info.disk_len = diskl;
			clone_info.data_offset = datao;
			clone_info.data_len = datal;
			clone_info.file_offset = new_key.offset;
			clone_info.extent_buf = buf;
			clone_info.is_new_extent = false;
			clone_info.update_times = !no_time_update;
			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
					drop_start, new_key.offset + datal - 1,
					&clone_info, &trans);
			if (ret)
				goto out;
		} else {
			ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
			/*
			 * Inline extents always have to start at file offset 0
			 * and can never be bigger than the sector size. We can
			 * never clone only parts of an inline extent, since all
			 * reflink operations must start at a sector size aligned
			 * offset, and the length must be aligned too or end at
			 * the i_size (which implies the whole inlined data).
			 */
			ASSERT(key.offset == 0);
			ASSERT(datal <= fs_info->sectorsize);
			if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
			    WARN_ON(key.offset != 0) ||
			    WARN_ON(datal > fs_info->sectorsize)) {
				ret = -EUCLEAN;
				goto out;
			}

			ret = clone_copy_inline_extent(inode, path, &new_key,
						       drop_start, datal, size,
						       comp, buf, &trans);
			if (ret)
				goto out;
		}

		btrfs_release_path(path);

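		/*
		 * Worked example for the clipping above (hypothetical numbers):
		 * cloning off = 8K, len = 8K from an extent item at
		 * key.offset = 4K with datal = 16K, i.e. covering [4K, 20K).
		 * Range b: 4K + 16K > 8K + 8K, so datal becomes 16K - 4K = 12K.
		 * Range a: off > key.offset, so datao grows by 4K and datal
		 * shrinks by 4K to 8K, leaving exactly the source range
		 * [8K, 16K) to be cloned.
		 */
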
		/*
		 * Whenever we share an extent we update the last_reflink_trans
		 * of each inode to the current transaction. This is needed to
		 * make sure fsync does not log multiple checksum items with
		 * overlapping ranges (because some extent items might refer
		 * only to sections of the original extent). For the destination
		 * inode we do this regardless of the generation of the extents
		 * or even if they are inline extents or explicit holes, to make
		 * sure a full fsync does not skip them. For the source inode,
		 * we only need to update last_reflink_trans in case it's a new
		 * extent that is not a hole or an inline extent, to deal with
		 * the checksums problem on fsync.
		 */
		if (extent_gen == trans->transid && disko > 0)
			BTRFS_I(src)->last_reflink_trans = trans->transid;

		BTRFS_I(inode)->last_reflink_trans = trans->transid;

		last_dest_end = ALIGN(new_key.offset + datal,
				      fs_info->sectorsize);
		ret = clone_finish_inode_update(trans, inode, last_dest_end,
						destoff, olen, no_time_update);
		if (ret)
			goto out;
		if (new_key.offset + datal >= destoff + len)
			break;

		btrfs_release_path(path);
		key.offset = prev_extent_end;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}

		cond_resched();
	}
	ret = 0;

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole that fully or partially overlaps our
		 * cloning range at its end. This means that we either have the
		 * NO_HOLES feature enabled or the implicit hole happened due to
		 * mixing buffered and direct IO writes against this file.
		 */
		btrfs_release_path(path);

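		/*
		 * Illustrative numbers for the trailing hole handling below:
		 * cloning len = 1M to destoff = 0 from a source whose last
		 * extent item ends at 768K leaves last_dest_end = 768K, which
		 * is below destoff + len, so [768K, 1M - 1] is punched as a
		 * hole in the destination via btrfs_replace_file_extents()
		 * called with a NULL extent info below.
		 */
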
		/*
		 * When using NO_HOLES and we are cloning a range that covers
		 * only a hole (no extents) into a range beyond the current
		 * i_size, punching a hole in the target range will not create
		 * an extent map defining a hole, because the range starts at or
		 * beyond current i_size. If the file previously had an i_size
		 * greater than the new i_size set by this clone operation, we
		 * need to make sure the next fsync is a full fsync, so that it
		 * detects and logs a hole covering a range from the current
		 * i_size to the new i_size. If the clone range covers extents,
		 * besides a hole, then we know the full sync flag was already
		 * set by previous calls to btrfs_replace_file_extents() that
		 * replaced file extent items.
		 */
		if (last_dest_end >= i_size_read(inode))
			btrfs_set_inode_full_sync(BTRFS_I(inode));

		ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
				last_dest_end, destoff + len - 1, NULL, &trans);
		if (ret)
			goto out;

		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	btrfs_free_path(path);
	kvfree(buf);
	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);

	return ret;
}

static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}

static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	u64 range1_end = loff1 + len - 1;
	u64 range2_end = loff2 + len - 1;

	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	} else if (inode1 == inode2 && loff2 < loff1) {
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);

	btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
	btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}

static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
	if (inode1 < inode2)
		swap(inode1, inode2);
	down_write(&BTRFS_I(inode1)->i_mmap_lock);
	down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
}

static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
{
	up_write(&BTRFS_I(inode1)->i_mmap_lock);
	up_write(&BTRFS_I(inode2)->i_mmap_lock);
}

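/*
 * Note on the lock ordering above (editorial): both double lock helpers
 * order the two inodes by address before acquiring their locks, so two
 * concurrent reflinks in opposite directions (A -> B and B -> A) take the
 * locks in the same order and cannot deadlock against each other.
 */
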
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				   struct inode *dst, u64 dst_loff)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
	const u64 bs = fs_info->sb->s_blocksize;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	int ret = 0;
	u64 i, tail_len, chunk_count;
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;

	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      root_dst->root_key.objectid,
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);

	for (i = 0; i < chunk_count; i++) {
		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
					      dst, dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	if (tail_len > 0)
		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
out:
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}

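/*
 * Chunking example for btrfs_extent_same() above (hypothetical numbers):
 * with BTRFS_MAX_DEDUPE_LEN at 16M, a 36M dedupe request is processed as
 * two full 16M chunks (chunk_count = 2) followed by a 4M tail
 * (tail_len = 36M % 16M), each chunk locking and deduping its sub-range
 * independently so the extent range is never locked for the whole request.
 */
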
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
				      u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;
	int wb_ret;
	u64 len = olen;
	u64 bs = fs_info->sb->s_blocksize;

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in corruption
	 * if the file size is not blocksize aligned. So we don't need to check
	 * for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);

		ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right after
		 * we found the previous extent covering eof and before we
		 * attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(inode, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);

	/*
	 * We may have copied an inline extent into a page of the destination
	 * range, so wait for writeback to complete before truncating pages
	 * from the page cache. This is a rare case.
	 */
	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
	ret = ret ? ret : wb_ret;
	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	truncate_inode_pages_range(&inode->i_data,
				   round_down(destoff, PAGE_SIZE),
				   round_up(destoff + len, PAGE_SIZE) - 1);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
	u64 wb_len;
	int ret;

	if (!(remap_flags & REMAP_FILE_DEDUP)) {
		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;

		if (btrfs_root_readonly(root_out))
			return -EROFS;

		ASSERT(inode_in->i_sb == inode_out->i_sb);
	}

	/* Don't make the dst file partly checksummed */
	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
		return -EINVAL;
	}

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and can not rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression we must call filemap_fdatawrite_range() twice for
	 *    the range (btrfs_fdatawrite_range() does it for us), and the
	 *    generic helper only calls it once;
	 *
	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
	 *    waits for the writeback to complete, i.e. for IO to be done, and
	 *    not for the ordered extents to complete. We need to wait for them
	 *    to complete so that new file extent items are in the fs tree.
	 */
	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
	else
		wb_len = ALIGN(*len, bs);

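	/*
	 * Example for the wb_len computation above (hypothetical numbers, 4K
	 * blocks): a whole-file clone request (*len == 0 and not a dedupe) of
	 * a 10000 byte source starting at pos_in = 0 gives
	 * wb_len = ALIGN(10000, 4096) = 12288, so writeback and ordered extent
	 * completion are awaited for the entire aligned range below before the
	 * generic prep runs.
	 */
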
	/*
	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * NOCOW buffered write without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, thus could cause
	 * data loss.
	 *
	 * Here we take a shortcut by flushing the whole inode, so that all
	 * NOCOW writes reach disk as NOCOW before we increase the reference
	 * of the extent. We could do better by only flushing NOCOW data, but
	 * that needs extra accounting.
	 *
	 * Also we don't need to check ASYNC_EXTENT, as async extents will be
	 * CoWed anyway, not affecting the NOCOW part.
	 */
	ret = filemap_flush(inode_in->i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				       wb_len);
	if (ret < 0)
		return ret;
	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				       wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					     len, remap_flags);
}

static bool file_sync_write(const struct file *file)
{
	if (file->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(file)))
		return true;

	return false;
}

loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
			      struct file *dst_file, loff_t destoff,
			      loff_t len, unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	bool same_inode = dst_inode == src_inode;
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (same_inode) {
		btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		lock_two_nondirectories(src_inode, dst_inode);
		btrfs_double_mmap_lock(src_inode, dst_inode);
	}

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	if (remap_flags & REMAP_FILE_DEDUP)
		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
	else
		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
	if (same_inode) {
		btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		btrfs_double_mmap_unlock(src_inode, dst_inode);
		unlock_two_nondirectories(src_inode, dst_inode);
	}

	/*
	 * If either the source or the destination file was opened with O_SYNC,
	 * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
	 * source files/ranges, so that a successful return (0) followed by a
	 * power failure still leaves the reflinked data readable from both
	 * files/ranges.
	 */
	if (ret == 0 && len > 0 &&
	    (file_sync_write(src_file) || file_sync_write(dst_file))) {
		ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
		if (ret == 0)
			ret = btrfs_sync_file(dst_file, destoff,
					      destoff + len - 1, 0);
	}

	return ret < 0 ? ret : len;
}

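/*
 * Editorial sketch of how this entry point is reached (assumed wiring, not
 * shown in this file): btrfs points its file_operations ->remap_file_range
 * callback at btrfs_remap_file_range(), so FICLONE/FICLONERANGE reflinks and
 * FIDEDUPERANGE requests from the VFS both land here, with REMAP_FILE_DEDUP
 * selecting the dedupe path.
 */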