// SPDX-License-Identifier: GPL-2.0

#include <linux/blkdev.h>
#include <linux/iversion.h>
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"

#define BTRFS_MAX_DEDUPE_LEN	SZ_16M

static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update)
		inode->i_mtime = inode->i_ctime = current_time(inode);
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	}

	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}
	ret = btrfs_end_transaction(trans);
out:
	return ret;
}

static int copy_inline_to_page(struct btrfs_inode *inode,
			       const u64 file_offset,
			       char *inline_data,
			       const u64 size,
			       const u64 datal,
			       const u8 comp_type)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 block_size = fs_info->sectorsize;
	const u64 range_end = file_offset + block_size - 1;
	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
	struct extent_changeset *data_reserved = NULL;
	struct page *page = NULL;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	ASSERT(IS_ALIGNED(file_offset, block_size));

	/*
	 * We have flushed and locked the ranges of the source and destination
	 * inodes, and we have locked the inodes themselves, so we are safe to
	 * do a reservation here. Also we must not do the reservation while
	 * holding a transaction open, otherwise we would deadlock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
					   block_size);
	if (ret)
		goto out;

	page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
				   btrfs_alloc_write_mask(mapping));
	if (!page) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto out_unlock;

	clear_extent_bit(&inode->io_tree, file_offset, range_end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, NULL);
	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * After dirtying the page our caller will need to start a transaction,
	 * and if we are low on metadata free space, that can cause flushing of
	 * delalloc for all inodes in order to get metadata space released.
	 * However we are holding the range locked for the whole duration of
	 * the clone/dedupe operation, so we may deadlock if that happens and
	 * no other task releases enough space. So mark this inode as not being
	 * possible to flush to avoid such deadlock. We will clear that flag
	 * when we finish cloning all extents, since a transaction is started
	 * after finding each extent to clone.
	 */
	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
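
	/*
	 * Copy the inline data into the page below. Uncompressed data is a
	 * plain memcpy of @datal bytes, while compressed data (zlib, lzo or
	 * zstd) is decompressed straight into the destination page; either
	 * way one block is enough, since inline extents never exceed the
	 * sector size.
	 */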
	if (comp_type == BTRFS_COMPRESS_NONE) {
		memcpy_to_page(page, offset_in_page(file_offset), data_start,
			       datal);
	} else {
		ret = btrfs_decompress(comp_type, data_start, page,
				       offset_in_page(file_offset),
				       inline_size, datal);
		if (ret)
			goto out_unlock;
		flush_dcache_page(page);
	}

	/*
	 * If our inline data is smaller than the block/page size, then the
	 * remainder of the block/page is equivalent to zeroes. We had
	 * something like the following done:
	 *
	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
	 * $ sync  # (or fsync)
	 * $ xfs_io -c "falloc 0 4K" file
	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K" file
	 *
	 * So what's in the range [500, 4095] corresponds to zeroes.
	 */
	if (datal < block_size)
		memzero_page(page, datal, block_size - datal);

	btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
	btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
	btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	if (ret)
		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
					     block_size, true);
	btrfs_delalloc_release_extents(inode, block_size);
out:
	extent_changeset_free(data_reserved);

	return ret;
}

/*
 * Deal with cloning of inline extents. We try to copy the inline extent from
 * the source inode to the destination inode when possible. When not possible
 * we copy the inline extent's data into the respective page of the
 * destination inode.
 */
static int clone_copy_inline_extent(struct inode *dst,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 size,
				    const u8 comp_type,
				    char *inline_data,
				    struct btrfs_trans_handle **trans_out)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
	struct btrfs_root *root = BTRFS_I(dst)->root;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;
	struct btrfs_key key;

	if (new_key->offset > 0) {
		ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
					  inline_data, size, datal, comp_type);
		goto out;
	}

	key.objectid = btrfs_ino(BTRFS_I(dst));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			/*
			 * There's an implicit hole at file offset 0, copy the
			 * inline extent's data to the page.
			 */
			ASSERT(key.offset > 0);
			goto copy_to_page;
		}
	} else if (i_size_read(dst) <= datal) {
		struct btrfs_file_extent_item *ei;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent replace it with the source inline
		 * extent, otherwise copy the source inline extent data into
		 * the respective page at the destination inode.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		goto copy_to_page;
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt the same way.
	 */
	if (i_size_read(dst) > datal) {
		/*
		 * At the destination offset 0 we have either a hole, a regular
		 * extent or an inline extent larger than the one we want to
		 * clone. Deal with all these cases by copying the inline
		 * extent data into the respective page at the destination
		 * inode.
		 */
		goto copy_to_page;
	}

	/*
	 * Release path before starting a new transaction so we don't hold
	 * locks that would confuse lockdep.
	 */
	btrfs_release_path(path);
	/*
	 * If we end up here it means we are copying the inline extent into a
	 * leaf of the destination inode. We know we will drop or adjust at
	 * most one extent item in the destination root.
	 *
	 * 1 unit - adjusting old extent (we may have to split it)
	 * 1 unit - add new extent
	 * 1 unit - inode update
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}
	drop_args.path = path;
	drop_args.start = drop_start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
	if (ret)
		goto out;
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (ret)
		goto out;

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
	btrfs_set_inode_full_sync(BTRFS_I(dst));
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
	if (!ret && !trans) {
		/*
		 * No transaction here means we copied the inline extent into a
		 * page of the destination inode.
		 *
		 * 1 unit to update inode item
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
		}
	}
	if (ret && trans) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}
	if (!ret)
		*trans_out = trans;

	return ret;

copy_to_page:
	/*
	 * Release our path because we don't need it anymore and also because
	 * copy_inline_to_page() needs to reserve data and metadata, which may
	 * need to flush delalloc when we are low on available space and
	 * therefore cause a deadlock if writeback of an inline extent needs
	 * to write to the same leaf or an ordered extent completion needs to
	 * write to the same leaf.
	 */
	btrfs_release_path(path);

	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				  inline_data, size, datal, comp_type);
	goto out;
}
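
/*
 * To summarize the cases above (illustrative): we insert the source's inline
 * extent item verbatim only when the destination offset is 0, the
 * destination's i_size does not exceed the inline data's length, and the
 * destination either has no extent items at all or has an inline extent at
 * offset 0. Every other combination falls back to copy_to_page, which writes
 * the (decompressed) inline data into the destination's first block.
 */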

/**
 * btrfs_clone() - clone a range from one file's inode to another
 *
 * @src: Inode to clone from
 * @inode: Inode to clone to
 * @off: Offset within source to start clone from
 * @olen: Original length, passed by user, of range to clone
 * @olen_aligned: Block-aligned value of olen
 * @destoff: Offset within @inode to start clone
 * @no_time_update: Whether to update mtime/ctime on the target inode
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, int no_time_update)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path = NULL;
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char *buf = NULL;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	u64 last_dest_end = destoff;
	u64 prev_extent_end = off;

	ret = -ENOMEM;
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		kvfree(buf);
		return ret;
	}

	path->reada = READA_FORWARD;
	/* Clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	while (1) {
		struct btrfs_file_extent_item *extent;
		u64 extent_gen;
		int type;
		u32 size;
		struct btrfs_key new_key;
		u64 disko = 0, diskl = 0;
		u64 datao = 0, datal = 0;
		u8 comp;
		u64 drop_start;

		/* Note the key will change type as we walk through the tree */
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
					0, 0);
		if (ret < 0)
			goto out;
		/*
		 * First search, if no extent item that starts at offset off
		 * was found but the previous item is an extent item, it's
		 * possible it might overlap our target range, therefore
		 * process it.
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
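
		/*
		 * Illustrative case for the rewind above: with off == 8K and
		 * the previous item being an extent at file offset 4K that
		 * covers 16K, the item overlaps [8K, 20K) of our source range
		 * even though no extent item starts exactly at offset 8K.
		 */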

		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		extent_gen = btrfs_file_extent_generation(leaf, extent);
		comp = btrfs_file_extent_compression(leaf, extent);
		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
			datao = btrfs_file_extent_offset(leaf, extent);
			datal = btrfs_file_extent_num_bytes(leaf, extent);
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Take upper bound, may be compressed */
			datal = btrfs_file_extent_ram_bytes(leaf, extent);
		}

		/*
		 * The first search might have left us at an extent item that
		 * ends before our target range's start; this can happen if we
		 * have holes and the NO_HOLES feature enabled.
		 *
		 * Subsequent searches may leave us on a file range we have
		 * processed before - this happens due to a race with ordered
		 * extent completion for a file range that is outside our
		 * source range, but that range was part of a file extent item
		 * that also covered a leading part of our source range.
		 */
		if (key.offset + datal <= prev_extent_end) {
			path->slots[0]++;
			goto process_slot;
		} else if (key.offset >= off + len) {
			break;
		}

		prev_extent_end = key.offset + datal;
		size = btrfs_item_size(leaf, slot);
		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				   size);

		btrfs_release_path(path);

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.objectid = btrfs_ino(BTRFS_I(inode));
		if (off <= key.offset)
			new_key.offset = key.offset + destoff - off;
		else
			new_key.offset = destoff;

		/*
		 * Deal with a hole that doesn't have an extent item that
		 * represents it (NO_HOLES feature enabled).
		 * This hole is either in the middle of the cloning range or at
		 * the beginning (fully overlaps it or partially overlaps it).
		 */
		if (new_key.offset != last_dest_end)
			drop_start = last_dest_end;
		else
			drop_start = new_key.offset;
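
		/*
		 * Worked example for the trimming below (illustrative): with
		 * off = 4K, len = 8K and an extent item at key.offset = 0
		 * covering 16K of data, range b first caps datal at
		 * off + len - key.offset = 12K, then range a trims the leading
		 * 4K (datao += 4K, datal -= 4K), so only the middle 8K of the
		 * extent's data gets cloned.
		 */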

		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct btrfs_replace_extent_info clone_info;

			/*
			 *    a  | --- range to clone ---|  b
			 * | ------------- extent ------------- |
			 */

			/* Subtract range b */
			if (key.offset + datal > off + len)
				datal = off + len - key.offset;

			/* Subtract range a */
			if (off > key.offset) {
				datao += off - key.offset;
				datal -= off - key.offset;
			}

			clone_info.disk_offset = disko;
			clone_info.disk_len = diskl;
			clone_info.data_offset = datao;
			clone_info.data_len = datal;
			clone_info.file_offset = new_key.offset;
			clone_info.extent_buf = buf;
			clone_info.is_new_extent = false;
			clone_info.update_times = !no_time_update;
			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
					drop_start, new_key.offset + datal - 1,
					&clone_info, &trans);
			if (ret)
				goto out;
		} else {
			ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
			/*
			 * Inline extents always have to start at file offset 0
			 * and can never be bigger than the sector size. We can
			 * never clone only parts of an inline extent, since
			 * all reflink operations must start at a sector size
			 * aligned offset, and the length must be aligned too
			 * or end at the i_size (which implies the whole
			 * inlined data).
			 */
			ASSERT(key.offset == 0);
			ASSERT(datal <= fs_info->sectorsize);
			if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
			    WARN_ON(key.offset != 0) ||
			    WARN_ON(datal > fs_info->sectorsize)) {
				ret = -EUCLEAN;
				goto out;
			}

			ret = clone_copy_inline_extent(inode, path, &new_key,
						       drop_start, datal, size,
						       comp, buf, &trans);
			if (ret)
				goto out;
		}

		btrfs_release_path(path);

		/*
		 * Whenever we share an extent we update the last_reflink_trans
		 * of each inode to the current transaction. This is needed to
		 * make sure fsync does not log multiple checksum items with
		 * overlapping ranges (because some extent items might refer
		 * only to sections of the original extent). For the
		 * destination inode we do this regardless of the generation of
		 * the extents or even if they are inline extents or explicit
		 * holes, to make sure a full fsync does not skip them. For the
		 * source inode, we only need to update last_reflink_trans in
		 * case it's a new extent that is not a hole or an inline
		 * extent, to deal with the checksums problem on fsync.
		 */
		if (extent_gen == trans->transid && disko > 0)
			BTRFS_I(src)->last_reflink_trans = trans->transid;

		BTRFS_I(inode)->last_reflink_trans = trans->transid;

		last_dest_end = ALIGN(new_key.offset + datal,
				      fs_info->sectorsize);
		ret = clone_finish_inode_update(trans, inode, last_dest_end,
						destoff, olen, no_time_update);
		if (ret)
			goto out;
		if (new_key.offset + datal >= destoff + len)
			break;

		btrfs_release_path(path);
		key.offset = prev_extent_end;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}

		cond_resched();
	}
	ret = 0;
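
	/*
	 * At this point only a possible implicit hole at the tail of the
	 * cloning range remains. E.g. (illustrative): with NO_HOLES enabled
	 * and a source range whose last 8K is a hole, no extent item exists
	 * for that tail, so the loop above stops with last_dest_end 8K short
	 * of destoff + len, and the branch below punches out the remainder.
	 */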

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole that fully or partially overlaps
		 * our cloning range at its end. This means that we either have
		 * the NO_HOLES feature enabled or the implicit hole happened
		 * due to mixing buffered and direct IO writes against this
		 * file.
		 */
		btrfs_release_path(path);

		/*
		 * When using NO_HOLES and we are cloning a range that covers
		 * only a hole (no extents) into a range beyond the current
		 * i_size, punching a hole in the target range will not create
		 * an extent map defining a hole, because the range starts at
		 * or beyond current i_size. If the file previously had an
		 * i_size greater than the new i_size set by this clone
		 * operation, we need to make sure the next fsync is a full
		 * fsync, so that it detects and logs a hole covering a range
		 * from the current i_size to the new i_size. If the clone
		 * range covers extents, besides a hole, then we know the full
		 * sync flag was already set by previous calls to
		 * btrfs_replace_file_extents() that replaced file extent
		 * items.
		 */
		if (last_dest_end >= i_size_read(inode))
			btrfs_set_inode_full_sync(BTRFS_I(inode));

		ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
				last_dest_end, destoff + len - 1, NULL, &trans);
		if (ret)
			goto out;

		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	btrfs_free_path(path);
	kvfree(buf);
	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);

	return ret;
}

static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}

static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	u64 range1_end = loff1 + len - 1;
	u64 range2_end = loff2 + len - 1;

	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	} else if (inode1 == inode2 && loff2 < loff1) {
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);

	btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
	btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}

static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
	if (inode1 < inode2)
		swap(inode1, inode2);
	down_write(&BTRFS_I(inode1)->i_mmap_lock);
	down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
}

static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
{
	up_write(&BTRFS_I(inode1)->i_mmap_lock);
	up_write(&BTRFS_I(inode2)->i_mmap_lock);
}

static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				   struct inode *dst, u64 dst_loff)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
	const u64 bs = fs_info->sb->s_blocksize;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}
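
/*
 * Illustrative numbers for the chunking below: with olen = 40 MiB and
 * BTRFS_MAX_DEDUPE_LEN = 16 MiB, chunk_count is 2 and tail_len is 8 MiB, so
 * the range is deduplicated as 16M + 16M + 8M, locking and unlocking the
 * extent ranges around each chunk instead of holding one 40 MiB wide lock.
 */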
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	int ret = 0;
	u64 i, tail_len, chunk_count;
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;

	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      root_dst->root_key.objectid,
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);

	for (i = 0; i < chunk_count; i++) {
		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
					      dst, dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	if (tail_len > 0)
		ret = btrfs_extent_same_range(src, loff, tail_len, dst,
					      dst_loff);
out:
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}

static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
				      u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;
	int wb_ret;
	u64 len = olen;
	u64 bs = fs_info->sb->s_blocksize;

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in
	 * corruption if the file size is not blocksize aligned. So we don't
	 * need to check for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;
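
	/*
	 * Illustrative example for the adjustment above: with a 4K block size
	 * and a source i_size of 10000 bytes, cloning the tail at off = 8192
	 * with len = 1808 (off + len == i_size) widens len to 4096, so the
	 * whole eof block is cloned; generic_remap_file_range_prep() already
	 * rejected any request that would place a non-aligned eof block in
	 * the middle of the destination.
	 */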

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);

		ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right
		 * after we found the previous extent covering eof and before
		 * we attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(inode, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);

	/*
	 * We may have copied an inline extent into a page of the destination
	 * range, so wait for writeback to complete before truncating pages
	 * from the page cache. This is a rare case.
	 */
	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
	ret = ret ? ret : wb_ret;
	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	truncate_inode_pages_range(&inode->i_data,
				   round_down(destoff, PAGE_SIZE),
				   round_up(destoff + len, PAGE_SIZE) - 1);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
	u64 wb_len;
	int ret;

	if (!(remap_flags & REMAP_FILE_DEDUP)) {
		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;

		if (btrfs_root_readonly(root_out))
			return -EROFS;

		ASSERT(inode_in->i_sb == inode_out->i_sb);
	}

	/* Don't make the dst file partly checksummed */
	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
		return -EINVAL;
	}

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and can not rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression we must call filemap_fdatawrite_range() twice
	 *    (btrfs_fdatawrite_range() does it for us), and the generic helper
	 *    only calls it once;
	 *
	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
	 *    waits for the writeback to complete, i.e. for IO to be done, and
	 *    not for the ordered extents to complete. We need to wait for them
	 *    to complete so that new file extent items are in the fs tree.
	 */
	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
	else
		wb_len = ALIGN(*len, bs);
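
	/*
	 * E.g. (illustrative): with a 4K block size, pos_in = 5000 and
	 * *len = 6000, wb_len is 8192 and the flush/wait below covers the
	 * block-aligned span [4096, 12288) that fully contains the source
	 * range; for a whole-file clone (*len == 0) it covers everything up
	 * to the block-aligned i_size.
	 */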

	/*
	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * A NOCOW buffered write without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, and thus could cause
	 * data loss.
	 *
	 * Here we take a shortcut by flushing the whole inode, so that all
	 * NOCOW writes reach disk as NOCOW before we increase the reference
	 * of the extent. We could do better by only flushing NOCOW data, but
	 * that needs extra accounting.
	 *
	 * Also we don't need to check ASYNC_EXTENT, as async extents will be
	 * CoWed anyway, not affecting the NOCOW part.
	 */
	ret = filemap_flush(inode_in->i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				       wb_len);
	if (ret < 0)
		return ret;
	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				       wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					     len, remap_flags);
}

static bool file_sync_write(const struct file *file)
{
	if (file->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(file)))
		return true;

	return false;
}

loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
			      struct file *dst_file, loff_t destoff,
			      loff_t len, unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	bool same_inode = dst_inode == src_inode;
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (same_inode) {
		btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		lock_two_nondirectories(src_inode, dst_inode);
		btrfs_double_mmap_lock(src_inode, dst_inode);
	}

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	if (remap_flags & REMAP_FILE_DEDUP)
		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
	else
		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
	if (same_inode) {
		btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		btrfs_double_mmap_unlock(src_inode, dst_inode);
		unlock_two_nondirectories(src_inode, dst_inode);
	}

	/*
	 * If either the source or the destination file was opened with O_SYNC,
	 * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
	 * source files/ranges, so that a successful return (0) followed by a
	 * power failure still leaves the reflinked data readable from both
	 * files/ranges.
	 */
	if (ret == 0 && len > 0 &&
	    (file_sync_write(src_file) || file_sync_write(dst_file))) {
		ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
		if (ret == 0)
			ret = btrfs_sync_file(dst_file, destoff,
					      destoff + len - 1, 0);
	}

	return ret < 0 ? ret : len;
}
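
/*
 * For reference (illustrative, not part of this file):
 * btrfs_remap_file_range() implements the ->remap_file_range() hook in
 * btrfs's file_operations and is reached from userspace through the
 * FICLONE/FICLONERANGE ioctls (and FIDEDUPERANGE for the dedupe path), e.g.:
 *
 *	struct file_clone_range arg = {
 *		.src_fd = src_fd,
 *		.src_offset = 0,
 *		.src_length = 0,	// 0 clones up to source EOF
 *		.dest_offset = 0,
 *	};
 *	ioctl(dst_fd, FICLONERANGE, &arg);
 */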