1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Alexander Block. All rights reserved. 4 */ 5 6 #include <linux/bsearch.h> 7 #include <linux/fs.h> 8 #include <linux/file.h> 9 #include <linux/sort.h> 10 #include <linux/mount.h> 11 #include <linux/xattr.h> 12 #include <linux/posix_acl_xattr.h> 13 #include <linux/radix-tree.h> 14 #include <linux/vmalloc.h> 15 #include <linux/string.h> 16 #include <linux/compat.h> 17 #include <linux/crc32c.h> 18 #include <linux/fsverity.h> 19 20 #include "send.h" 21 #include "ctree.h" 22 #include "backref.h" 23 #include "locking.h" 24 #include "disk-io.h" 25 #include "btrfs_inode.h" 26 #include "transaction.h" 27 #include "compression.h" 28 #include "xattr.h" 29 #include "print-tree.h" 30 #include "accessors.h" 31 #include "dir-item.h" 32 #include "file-item.h" 33 #include "ioctl.h" 34 #include "verity.h" 35 36 /* 37 * Maximum number of references an extent can have in order for us to attempt to 38 * issue clone operations instead of write operations. This currently exists to 39 * avoid hitting limitations of the backreference walking code (taking a lot of 40 * time and using too much memory for extents with large number of references). 41 */ 42 #define SEND_MAX_EXTENT_REFS 1024 43 44 /* 45 * A fs_path is a helper to dynamically build path names with unknown size. 46 * It reallocates the internal buffer on demand. 47 * It allows fast adding of path elements on the right side (normal path) and 48 * fast adding to the left side (reversed path). A reversed path can also be 49 * unreversed if needed. 50 */ 51 struct fs_path { 52 union { 53 struct { 54 char *start; 55 char *end; 56 57 char *buf; 58 unsigned short buf_len:15; 59 unsigned short reversed:1; 60 char inline_buf[]; 61 }; 62 /* 63 * Average path length does not exceed 200 bytes, we'll have 64 * better packing in the slab and higher chance to satisfy 65 * a allocation later during send. 
66 */ 67 char pad[256]; 68 }; 69 }; 70 #define FS_PATH_INLINE_SIZE \ 71 (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) 72 73 74 /* reused for each extent */ 75 struct clone_root { 76 struct btrfs_root *root; 77 u64 ino; 78 u64 offset; 79 u64 num_bytes; 80 bool found_ref; 81 }; 82 83 #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 84 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) 85 86 /* 87 * Limit the root_ids array of struct backref_cache_entry to 12 elements. 88 * This makes the size of a cache entry to be exactly 128 bytes on x86_64. 89 * The most common case is to have a single root for cloning, which corresponds 90 * to the send root. Having the user specify more than 11 clone roots is not 91 * common, and in such rare cases we simply don't use caching if the number of 92 * cloning roots that lead down to a leaf is more than 12. 93 */ 94 #define SEND_MAX_BACKREF_CACHE_ROOTS 12 95 96 /* 97 * Max number of entries in the cache. 98 * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding 99 * maple tree's internal nodes, is 16K. 100 */ 101 #define SEND_MAX_BACKREF_CACHE_SIZE 128 102 103 /* 104 * A backref cache entry maps a leaf to a list of IDs of roots from which the 105 * leaf is accessible and we can use for clone operations. 106 * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on 107 * x86_64). 108 */ 109 struct backref_cache_entry { 110 /* List to link to the cache's lru list. */ 111 struct list_head list; 112 /* The key for this entry in the cache. */ 113 u64 key; 114 u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; 115 /* Number of valid elements in the root_ids array. */ 116 int num_roots; 117 }; 118 119 struct send_ctx { 120 struct file *send_filp; 121 loff_t send_off; 122 char *send_buf; 123 u32 send_size; 124 u32 send_max_size; 125 /* 126 * Whether BTRFS_SEND_A_DATA attribute was already added to current 127 * command (since protocol v2, data must be the last attribute). 
128 */ 129 bool put_data; 130 struct page **send_buf_pages; 131 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ 132 /* Protocol version compatibility requested */ 133 u32 proto; 134 135 struct btrfs_root *send_root; 136 struct btrfs_root *parent_root; 137 struct clone_root *clone_roots; 138 int clone_roots_cnt; 139 140 /* current state of the compare_tree call */ 141 struct btrfs_path *left_path; 142 struct btrfs_path *right_path; 143 struct btrfs_key *cmp_key; 144 145 /* 146 * Keep track of the generation of the last transaction that was used 147 * for relocating a block group. This is periodically checked in order 148 * to detect if a relocation happened since the last check, so that we 149 * don't operate on stale extent buffers for nodes (level >= 1) or on 150 * stale disk_bytenr values of file extent items. 151 */ 152 u64 last_reloc_trans; 153 154 /* 155 * infos of the currently processed inode. In case of deleted inodes, 156 * these are the values from the deleted inode. 157 */ 158 u64 cur_ino; 159 u64 cur_inode_gen; 160 u64 cur_inode_size; 161 u64 cur_inode_mode; 162 u64 cur_inode_rdev; 163 u64 cur_inode_last_extent; 164 u64 cur_inode_next_write_offset; 165 bool cur_inode_new; 166 bool cur_inode_new_gen; 167 bool cur_inode_deleted; 168 bool ignore_cur_inode; 169 bool cur_inode_needs_verity; 170 void *verity_descriptor; 171 172 u64 send_progress; 173 174 struct list_head new_refs; 175 struct list_head deleted_refs; 176 177 struct radix_tree_root name_cache; 178 struct list_head name_cache_list; 179 int name_cache_size; 180 181 /* 182 * The inode we are currently processing. It's not NULL only when we 183 * need to issue write commands for data extents from this inode. 
184 */ 185 struct inode *cur_inode; 186 struct file_ra_state ra; 187 u64 page_cache_clear_start; 188 bool clean_page_cache; 189 190 /* 191 * We process inodes by their increasing order, so if before an 192 * incremental send we reverse the parent/child relationship of 193 * directories such that a directory with a lower inode number was 194 * the parent of a directory with a higher inode number, and the one 195 * becoming the new parent got renamed too, we can't rename/move the 196 * directory with lower inode number when we finish processing it - we 197 * must process the directory with higher inode number first, then 198 * rename/move it and then rename/move the directory with lower inode 199 * number. Example follows. 200 * 201 * Tree state when the first send was performed: 202 * 203 * . 204 * |-- a (ino 257) 205 * |-- b (ino 258) 206 * | 207 * | 208 * |-- c (ino 259) 209 * | |-- d (ino 260) 210 * | 211 * |-- c2 (ino 261) 212 * 213 * Tree state when the second (incremental) send is performed: 214 * 215 * . 216 * |-- a (ino 257) 217 * |-- b (ino 258) 218 * |-- c2 (ino 261) 219 * |-- d2 (ino 260) 220 * |-- cc (ino 259) 221 * 222 * The sequence of steps that lead to the second state was: 223 * 224 * mv /a/b/c/d /a/b/c2/d2 225 * mv /a/b/c /a/b/c2/d2/cc 226 * 227 * "c" has lower inode number, but we can't move it (2nd mv operation) 228 * before we move "d", which has higher inode number. 229 * 230 * So we just memorize which move/rename operations must be performed 231 * later when their respective parent is processed and moved/renamed. 232 */ 233 234 /* Indexed by parent directory inode number. */ 235 struct rb_root pending_dir_moves; 236 237 /* 238 * Reverse index, indexed by the inode number of a directory that 239 * is waiting for the move/rename of its immediate parent before its 240 * own move/rename can be performed. 
241 */ 242 struct rb_root waiting_dir_moves; 243 244 /* 245 * A directory that is going to be rm'ed might have a child directory 246 * which is in the pending directory moves index above. In this case, 247 * the directory can only be removed after the move/rename of its child 248 * is performed. Example: 249 * 250 * Parent snapshot: 251 * 252 * . (ino 256) 253 * |-- a/ (ino 257) 254 * |-- b/ (ino 258) 255 * |-- c/ (ino 259) 256 * | |-- x/ (ino 260) 257 * | 258 * |-- y/ (ino 261) 259 * 260 * Send snapshot: 261 * 262 * . (ino 256) 263 * |-- a/ (ino 257) 264 * |-- b/ (ino 258) 265 * |-- YY/ (ino 261) 266 * |-- x/ (ino 260) 267 * 268 * Sequence of steps that lead to the send snapshot: 269 * rm -f /a/b/c/foo.txt 270 * mv /a/b/y /a/b/YY 271 * mv /a/b/c/x /a/b/YY 272 * rmdir /a/b/c 273 * 274 * When the child is processed, its move/rename is delayed until its 275 * parent is processed (as explained above), but all other operations 276 * like update utimes, chown, chgrp, etc, are performed and the paths 277 * that it uses for those operations must use the orphanized name of 278 * its parent (the directory we're going to rm later), so we need to 279 * memorize that name. 280 * 281 * Indexed by the inode number of the directory to be deleted. 282 */ 283 struct rb_root orphan_dirs; 284 285 struct rb_root rbtree_new_refs; 286 struct rb_root rbtree_deleted_refs; 287 288 struct { 289 u64 last_reloc_trans; 290 struct list_head lru_list; 291 struct maple_tree entries; 292 /* Number of entries stored in the cache. */ 293 int size; 294 } backref_cache; 295 }; 296 297 struct pending_dir_move { 298 struct rb_node node; 299 struct list_head list; 300 u64 parent_ino; 301 u64 ino; 302 u64 gen; 303 struct list_head update_refs; 304 }; 305 306 struct waiting_dir_move { 307 struct rb_node node; 308 u64 ino; 309 /* 310 * There might be some directory that could not be removed because it 311 * was waiting for this directory inode to be moved first. 
Therefore 312 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 313 */ 314 u64 rmdir_ino; 315 u64 rmdir_gen; 316 bool orphanized; 317 }; 318 319 struct orphan_dir_info { 320 struct rb_node node; 321 u64 ino; 322 u64 gen; 323 u64 last_dir_index_offset; 324 }; 325 326 struct name_cache_entry { 327 struct list_head list; 328 /* 329 * radix_tree has only 32bit entries but we need to handle 64bit inums. 330 * We use the lower 32bit of the 64bit inum to store it in the tree. If 331 * more then one inum would fall into the same entry, we use radix_list 332 * to store the additional entries. radix_list is also used to store 333 * entries where two entries have the same inum but different 334 * generations. 335 */ 336 struct list_head radix_list; 337 u64 ino; 338 u64 gen; 339 u64 parent_ino; 340 u64 parent_gen; 341 int ret; 342 int need_later_update; 343 int name_len; 344 char name[]; 345 }; 346 347 #define ADVANCE 1 348 #define ADVANCE_ONLY_NEXT -1 349 350 enum btrfs_compare_tree_result { 351 BTRFS_COMPARE_TREE_NEW, 352 BTRFS_COMPARE_TREE_DELETED, 353 BTRFS_COMPARE_TREE_CHANGED, 354 BTRFS_COMPARE_TREE_SAME, 355 }; 356 357 __cold 358 static void inconsistent_snapshot_error(struct send_ctx *sctx, 359 enum btrfs_compare_tree_result result, 360 const char *what) 361 { 362 const char *result_string; 363 364 switch (result) { 365 case BTRFS_COMPARE_TREE_NEW: 366 result_string = "new"; 367 break; 368 case BTRFS_COMPARE_TREE_DELETED: 369 result_string = "deleted"; 370 break; 371 case BTRFS_COMPARE_TREE_CHANGED: 372 result_string = "updated"; 373 break; 374 case BTRFS_COMPARE_TREE_SAME: 375 ASSERT(0); 376 result_string = "unchanged"; 377 break; 378 default: 379 ASSERT(0); 380 result_string = "unexpected"; 381 } 382 383 btrfs_err(sctx->send_root->fs_info, 384 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", 385 result_string, what, sctx->cmp_key->objectid, 386 
sctx->send_root->root_key.objectid, 387 (sctx->parent_root ? 388 sctx->parent_root->root_key.objectid : 0)); 389 } 390 391 __maybe_unused 392 static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) 393 { 394 switch (sctx->proto) { 395 case 1: return cmd <= BTRFS_SEND_C_MAX_V1; 396 case 2: return cmd <= BTRFS_SEND_C_MAX_V2; 397 case 3: return cmd <= BTRFS_SEND_C_MAX_V3; 398 default: return false; 399 } 400 } 401 402 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 403 404 static struct waiting_dir_move * 405 get_waiting_dir_move(struct send_ctx *sctx, u64 ino); 406 407 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); 408 409 static int need_send_hole(struct send_ctx *sctx) 410 { 411 return (sctx->parent_root && !sctx->cur_inode_new && 412 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && 413 S_ISREG(sctx->cur_inode_mode)); 414 } 415 416 static void fs_path_reset(struct fs_path *p) 417 { 418 if (p->reversed) { 419 p->start = p->buf + p->buf_len - 1; 420 p->end = p->start; 421 *p->start = 0; 422 } else { 423 p->start = p->buf; 424 p->end = p->start; 425 *p->start = 0; 426 } 427 } 428 429 static struct fs_path *fs_path_alloc(void) 430 { 431 struct fs_path *p; 432 433 p = kmalloc(sizeof(*p), GFP_KERNEL); 434 if (!p) 435 return NULL; 436 p->reversed = 0; 437 p->buf = p->inline_buf; 438 p->buf_len = FS_PATH_INLINE_SIZE; 439 fs_path_reset(p); 440 return p; 441 } 442 443 static struct fs_path *fs_path_alloc_reversed(void) 444 { 445 struct fs_path *p; 446 447 p = fs_path_alloc(); 448 if (!p) 449 return NULL; 450 p->reversed = 1; 451 fs_path_reset(p); 452 return p; 453 } 454 455 static void fs_path_free(struct fs_path *p) 456 { 457 if (!p) 458 return; 459 if (p->buf != p->inline_buf) 460 kfree(p->buf); 461 kfree(p); 462 } 463 464 static int fs_path_len(struct fs_path *p) 465 { 466 return p->end - p->start; 467 } 468 469 static int fs_path_ensure_buf(struct fs_path *p, int len) 470 { 471 char *tmp_buf; 472 int path_len; 473 
int old_buf_len; 474 475 len++; 476 477 if (p->buf_len >= len) 478 return 0; 479 480 if (len > PATH_MAX) { 481 WARN_ON(1); 482 return -ENOMEM; 483 } 484 485 path_len = p->end - p->start; 486 old_buf_len = p->buf_len; 487 488 /* 489 * Allocate to the next largest kmalloc bucket size, to let 490 * the fast path happen most of the time. 491 */ 492 len = kmalloc_size_roundup(len); 493 /* 494 * First time the inline_buf does not suffice 495 */ 496 if (p->buf == p->inline_buf) { 497 tmp_buf = kmalloc(len, GFP_KERNEL); 498 if (tmp_buf) 499 memcpy(tmp_buf, p->buf, old_buf_len); 500 } else { 501 tmp_buf = krealloc(p->buf, len, GFP_KERNEL); 502 } 503 if (!tmp_buf) 504 return -ENOMEM; 505 p->buf = tmp_buf; 506 p->buf_len = len; 507 508 if (p->reversed) { 509 tmp_buf = p->buf + old_buf_len - path_len - 1; 510 p->end = p->buf + p->buf_len - 1; 511 p->start = p->end - path_len; 512 memmove(p->start, tmp_buf, path_len + 1); 513 } else { 514 p->start = p->buf; 515 p->end = p->start + path_len; 516 } 517 return 0; 518 } 519 520 static int fs_path_prepare_for_add(struct fs_path *p, int name_len, 521 char **prepared) 522 { 523 int ret; 524 int new_len; 525 526 new_len = p->end - p->start + name_len; 527 if (p->start != p->end) 528 new_len++; 529 ret = fs_path_ensure_buf(p, new_len); 530 if (ret < 0) 531 goto out; 532 533 if (p->reversed) { 534 if (p->start != p->end) 535 *--p->start = '/'; 536 p->start -= name_len; 537 *prepared = p->start; 538 } else { 539 if (p->start != p->end) 540 *p->end++ = '/'; 541 *prepared = p->end; 542 p->end += name_len; 543 *p->end = 0; 544 } 545 546 out: 547 return ret; 548 } 549 550 static int fs_path_add(struct fs_path *p, const char *name, int name_len) 551 { 552 int ret; 553 char *prepared; 554 555 ret = fs_path_prepare_for_add(p, name_len, &prepared); 556 if (ret < 0) 557 goto out; 558 memcpy(prepared, name, name_len); 559 560 out: 561 return ret; 562 } 563 564 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 565 { 566 int ret; 567 
char *prepared; 568 569 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); 570 if (ret < 0) 571 goto out; 572 memcpy(prepared, p2->start, p2->end - p2->start); 573 574 out: 575 return ret; 576 } 577 578 static int fs_path_add_from_extent_buffer(struct fs_path *p, 579 struct extent_buffer *eb, 580 unsigned long off, int len) 581 { 582 int ret; 583 char *prepared; 584 585 ret = fs_path_prepare_for_add(p, len, &prepared); 586 if (ret < 0) 587 goto out; 588 589 read_extent_buffer(eb, prepared, off, len); 590 591 out: 592 return ret; 593 } 594 595 static int fs_path_copy(struct fs_path *p, struct fs_path *from) 596 { 597 p->reversed = from->reversed; 598 fs_path_reset(p); 599 600 return fs_path_add_path(p, from); 601 } 602 603 static void fs_path_unreverse(struct fs_path *p) 604 { 605 char *tmp; 606 int len; 607 608 if (!p->reversed) 609 return; 610 611 tmp = p->start; 612 len = p->end - p->start; 613 p->start = p->buf; 614 p->end = p->start + len; 615 memmove(p->start, tmp, len + 1); 616 p->reversed = 0; 617 } 618 619 static struct btrfs_path *alloc_path_for_send(void) 620 { 621 struct btrfs_path *path; 622 623 path = btrfs_alloc_path(); 624 if (!path) 625 return NULL; 626 path->search_commit_root = 1; 627 path->skip_locking = 1; 628 path->need_commit_sem = 1; 629 return path; 630 } 631 632 static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) 633 { 634 int ret; 635 u32 pos = 0; 636 637 while (pos < len) { 638 ret = kernel_write(filp, buf + pos, len - pos, off); 639 if (ret < 0) 640 return ret; 641 if (ret == 0) 642 return -EIO; 643 pos += ret; 644 } 645 646 return 0; 647 } 648 649 static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) 650 { 651 struct btrfs_tlv_header *hdr; 652 int total_len = sizeof(*hdr) + len; 653 int left = sctx->send_max_size - sctx->send_size; 654 655 if (WARN_ON_ONCE(sctx->put_data)) 656 return -EINVAL; 657 658 if (unlikely(left < total_len)) 659 return -EOVERFLOW; 660 661 hdr = 
(struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); 662 put_unaligned_le16(attr, &hdr->tlv_type); 663 put_unaligned_le16(len, &hdr->tlv_len); 664 memcpy(hdr + 1, data, len); 665 sctx->send_size += total_len; 666 667 return 0; 668 } 669 670 #define TLV_PUT_DEFINE_INT(bits) \ 671 static int tlv_put_u##bits(struct send_ctx *sctx, \ 672 u##bits attr, u##bits value) \ 673 { \ 674 __le##bits __tmp = cpu_to_le##bits(value); \ 675 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ 676 } 677 678 TLV_PUT_DEFINE_INT(8) 679 TLV_PUT_DEFINE_INT(32) 680 TLV_PUT_DEFINE_INT(64) 681 682 static int tlv_put_string(struct send_ctx *sctx, u16 attr, 683 const char *str, int len) 684 { 685 if (len == -1) 686 len = strlen(str); 687 return tlv_put(sctx, attr, str, len); 688 } 689 690 static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, 691 const u8 *uuid) 692 { 693 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); 694 } 695 696 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, 697 struct extent_buffer *eb, 698 struct btrfs_timespec *ts) 699 { 700 struct btrfs_timespec bts; 701 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); 702 return tlv_put(sctx, attr, &bts, sizeof(bts)); 703 } 704 705 706 #define TLV_PUT(sctx, attrtype, data, attrlen) \ 707 do { \ 708 ret = tlv_put(sctx, attrtype, data, attrlen); \ 709 if (ret < 0) \ 710 goto tlv_put_failure; \ 711 } while (0) 712 713 #define TLV_PUT_INT(sctx, attrtype, bits, value) \ 714 do { \ 715 ret = tlv_put_u##bits(sctx, attrtype, value); \ 716 if (ret < 0) \ 717 goto tlv_put_failure; \ 718 } while (0) 719 720 #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) 721 #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) 722 #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) 723 #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) 724 #define TLV_PUT_STRING(sctx, attrtype, str, len) \ 725 do { \ 
726 ret = tlv_put_string(sctx, attrtype, str, len); \ 727 if (ret < 0) \ 728 goto tlv_put_failure; \ 729 } while (0) 730 #define TLV_PUT_PATH(sctx, attrtype, p) \ 731 do { \ 732 ret = tlv_put_string(sctx, attrtype, p->start, \ 733 p->end - p->start); \ 734 if (ret < 0) \ 735 goto tlv_put_failure; \ 736 } while(0) 737 #define TLV_PUT_UUID(sctx, attrtype, uuid) \ 738 do { \ 739 ret = tlv_put_uuid(sctx, attrtype, uuid); \ 740 if (ret < 0) \ 741 goto tlv_put_failure; \ 742 } while (0) 743 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ 744 do { \ 745 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ 746 if (ret < 0) \ 747 goto tlv_put_failure; \ 748 } while (0) 749 750 static int send_header(struct send_ctx *sctx) 751 { 752 struct btrfs_stream_header hdr; 753 754 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 755 hdr.version = cpu_to_le32(sctx->proto); 756 return write_buf(sctx->send_filp, &hdr, sizeof(hdr), 757 &sctx->send_off); 758 } 759 760 /* 761 * For each command/item we want to send to userspace, we call this function. 
762 */ 763 static int begin_cmd(struct send_ctx *sctx, int cmd) 764 { 765 struct btrfs_cmd_header *hdr; 766 767 if (WARN_ON(!sctx->send_buf)) 768 return -EINVAL; 769 770 BUG_ON(sctx->send_size); 771 772 sctx->send_size += sizeof(*hdr); 773 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 774 put_unaligned_le16(cmd, &hdr->cmd); 775 776 return 0; 777 } 778 779 static int send_cmd(struct send_ctx *sctx) 780 { 781 int ret; 782 struct btrfs_cmd_header *hdr; 783 u32 crc; 784 785 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 786 put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); 787 put_unaligned_le32(0, &hdr->crc); 788 789 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 790 put_unaligned_le32(crc, &hdr->crc); 791 792 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 793 &sctx->send_off); 794 795 sctx->send_size = 0; 796 sctx->put_data = false; 797 798 return ret; 799 } 800 801 /* 802 * Sends a move instruction to user space 803 */ 804 static int send_rename(struct send_ctx *sctx, 805 struct fs_path *from, struct fs_path *to) 806 { 807 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 808 int ret; 809 810 btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); 811 812 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); 813 if (ret < 0) 814 goto out; 815 816 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); 817 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); 818 819 ret = send_cmd(sctx); 820 821 tlv_put_failure: 822 out: 823 return ret; 824 } 825 826 /* 827 * Sends a link instruction to user space 828 */ 829 static int send_link(struct send_ctx *sctx, 830 struct fs_path *path, struct fs_path *lnk) 831 { 832 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 833 int ret; 834 835 btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); 836 837 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); 838 if (ret < 0) 839 goto out; 840 841 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 842 TLV_PUT_PATH(sctx, 
BTRFS_SEND_A_PATH_LINK, lnk); 843 844 ret = send_cmd(sctx); 845 846 tlv_put_failure: 847 out: 848 return ret; 849 } 850 851 /* 852 * Sends an unlink instruction to user space 853 */ 854 static int send_unlink(struct send_ctx *sctx, struct fs_path *path) 855 { 856 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 857 int ret; 858 859 btrfs_debug(fs_info, "send_unlink %s", path->start); 860 861 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); 862 if (ret < 0) 863 goto out; 864 865 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 866 867 ret = send_cmd(sctx); 868 869 tlv_put_failure: 870 out: 871 return ret; 872 } 873 874 /* 875 * Sends a rmdir instruction to user space 876 */ 877 static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) 878 { 879 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 880 int ret; 881 882 btrfs_debug(fs_info, "send_rmdir %s", path->start); 883 884 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); 885 if (ret < 0) 886 goto out; 887 888 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 889 890 ret = send_cmd(sctx); 891 892 tlv_put_failure: 893 out: 894 return ret; 895 } 896 897 struct btrfs_inode_info { 898 u64 size; 899 u64 gen; 900 u64 mode; 901 u64 uid; 902 u64 gid; 903 u64 rdev; 904 u64 fileattr; 905 u64 nlink; 906 }; 907 908 /* 909 * Helper function to retrieve some fields from an inode item. 
910 */ 911 static int get_inode_info(struct btrfs_root *root, u64 ino, 912 struct btrfs_inode_info *info) 913 { 914 int ret; 915 struct btrfs_path *path; 916 struct btrfs_inode_item *ii; 917 struct btrfs_key key; 918 919 path = alloc_path_for_send(); 920 if (!path) 921 return -ENOMEM; 922 923 key.objectid = ino; 924 key.type = BTRFS_INODE_ITEM_KEY; 925 key.offset = 0; 926 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 927 if (ret) { 928 if (ret > 0) 929 ret = -ENOENT; 930 goto out; 931 } 932 933 if (!info) 934 goto out; 935 936 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 937 struct btrfs_inode_item); 938 info->size = btrfs_inode_size(path->nodes[0], ii); 939 info->gen = btrfs_inode_generation(path->nodes[0], ii); 940 info->mode = btrfs_inode_mode(path->nodes[0], ii); 941 info->uid = btrfs_inode_uid(path->nodes[0], ii); 942 info->gid = btrfs_inode_gid(path->nodes[0], ii); 943 info->rdev = btrfs_inode_rdev(path->nodes[0], ii); 944 info->nlink = btrfs_inode_nlink(path->nodes[0], ii); 945 /* 946 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's 947 * otherwise logically split to 32/32 parts. 948 */ 949 info->fileattr = btrfs_inode_flags(path->nodes[0], ii); 950 951 out: 952 btrfs_free_path(path); 953 return ret; 954 } 955 956 static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) 957 { 958 int ret; 959 struct btrfs_inode_info info; 960 961 if (!gen) 962 return -EPERM; 963 964 ret = get_inode_info(root, ino, &info); 965 if (!ret) 966 *gen = info.gen; 967 return ret; 968 } 969 970 typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, 971 struct fs_path *p, 972 void *ctx); 973 974 /* 975 * Helper function to iterate the entries in ONE btrfs_inode_ref or 976 * btrfs_inode_extref. 977 * The iterate callback may return a non zero value to stop iteration. This can 978 * be a negative value for error codes or 1 to simply stop it. 979 * 980 * path must point to the INODE_REF or INODE_EXTREF when called. 
981 */ 982 static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, 983 struct btrfs_key *found_key, int resolve, 984 iterate_inode_ref_t iterate, void *ctx) 985 { 986 struct extent_buffer *eb = path->nodes[0]; 987 struct btrfs_inode_ref *iref; 988 struct btrfs_inode_extref *extref; 989 struct btrfs_path *tmp_path; 990 struct fs_path *p; 991 u32 cur = 0; 992 u32 total; 993 int slot = path->slots[0]; 994 u32 name_len; 995 char *start; 996 int ret = 0; 997 int num = 0; 998 int index; 999 u64 dir; 1000 unsigned long name_off; 1001 unsigned long elem_size; 1002 unsigned long ptr; 1003 1004 p = fs_path_alloc_reversed(); 1005 if (!p) 1006 return -ENOMEM; 1007 1008 tmp_path = alloc_path_for_send(); 1009 if (!tmp_path) { 1010 fs_path_free(p); 1011 return -ENOMEM; 1012 } 1013 1014 1015 if (found_key->type == BTRFS_INODE_REF_KEY) { 1016 ptr = (unsigned long)btrfs_item_ptr(eb, slot, 1017 struct btrfs_inode_ref); 1018 total = btrfs_item_size(eb, slot); 1019 elem_size = sizeof(*iref); 1020 } else { 1021 ptr = btrfs_item_ptr_offset(eb, slot); 1022 total = btrfs_item_size(eb, slot); 1023 elem_size = sizeof(*extref); 1024 } 1025 1026 while (cur < total) { 1027 fs_path_reset(p); 1028 1029 if (found_key->type == BTRFS_INODE_REF_KEY) { 1030 iref = (struct btrfs_inode_ref *)(ptr + cur); 1031 name_len = btrfs_inode_ref_name_len(eb, iref); 1032 name_off = (unsigned long)(iref + 1); 1033 index = btrfs_inode_ref_index(eb, iref); 1034 dir = found_key->offset; 1035 } else { 1036 extref = (struct btrfs_inode_extref *)(ptr + cur); 1037 name_len = btrfs_inode_extref_name_len(eb, extref); 1038 name_off = (unsigned long)&extref->name; 1039 index = btrfs_inode_extref_index(eb, extref); 1040 dir = btrfs_inode_extref_parent(eb, extref); 1041 } 1042 1043 if (resolve) { 1044 start = btrfs_ref_to_path(root, tmp_path, name_len, 1045 name_off, eb, dir, 1046 p->buf, p->buf_len); 1047 if (IS_ERR(start)) { 1048 ret = PTR_ERR(start); 1049 goto out; 1050 } 1051 if (start < p->buf) { 
1052 /* overflow , try again with larger buffer */ 1053 ret = fs_path_ensure_buf(p, 1054 p->buf_len + p->buf - start); 1055 if (ret < 0) 1056 goto out; 1057 start = btrfs_ref_to_path(root, tmp_path, 1058 name_len, name_off, 1059 eb, dir, 1060 p->buf, p->buf_len); 1061 if (IS_ERR(start)) { 1062 ret = PTR_ERR(start); 1063 goto out; 1064 } 1065 BUG_ON(start < p->buf); 1066 } 1067 p->start = start; 1068 } else { 1069 ret = fs_path_add_from_extent_buffer(p, eb, name_off, 1070 name_len); 1071 if (ret < 0) 1072 goto out; 1073 } 1074 1075 cur += elem_size + name_len; 1076 ret = iterate(num, dir, index, p, ctx); 1077 if (ret) 1078 goto out; 1079 num++; 1080 } 1081 1082 out: 1083 btrfs_free_path(tmp_path); 1084 fs_path_free(p); 1085 return ret; 1086 } 1087 1088 typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, 1089 const char *name, int name_len, 1090 const char *data, int data_len, 1091 void *ctx); 1092 1093 /* 1094 * Helper function to iterate the entries in ONE btrfs_dir_item. 1095 * The iterate callback may return a non zero value to stop iteration. This can 1096 * be a negative value for error codes or 1 to simply stop it. 1097 * 1098 * path must point to the dir item when called. 1099 */ 1100 static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, 1101 iterate_dir_item_t iterate, void *ctx) 1102 { 1103 int ret = 0; 1104 struct extent_buffer *eb; 1105 struct btrfs_dir_item *di; 1106 struct btrfs_key di_key; 1107 char *buf = NULL; 1108 int buf_len; 1109 u32 name_len; 1110 u32 data_len; 1111 u32 cur; 1112 u32 len; 1113 u32 total; 1114 int slot; 1115 int num; 1116 1117 /* 1118 * Start with a small buffer (1 page). If later we end up needing more 1119 * space, which can happen for xattrs on a fs with a leaf size greater 1120 * then the page size, attempt to increase the buffer. Typically xattr 1121 * values are small. 
1122 */ 1123 buf_len = PATH_MAX; 1124 buf = kmalloc(buf_len, GFP_KERNEL); 1125 if (!buf) { 1126 ret = -ENOMEM; 1127 goto out; 1128 } 1129 1130 eb = path->nodes[0]; 1131 slot = path->slots[0]; 1132 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 1133 cur = 0; 1134 len = 0; 1135 total = btrfs_item_size(eb, slot); 1136 1137 num = 0; 1138 while (cur < total) { 1139 name_len = btrfs_dir_name_len(eb, di); 1140 data_len = btrfs_dir_data_len(eb, di); 1141 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1142 1143 if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { 1144 if (name_len > XATTR_NAME_MAX) { 1145 ret = -ENAMETOOLONG; 1146 goto out; 1147 } 1148 if (name_len + data_len > 1149 BTRFS_MAX_XATTR_SIZE(root->fs_info)) { 1150 ret = -E2BIG; 1151 goto out; 1152 } 1153 } else { 1154 /* 1155 * Path too long 1156 */ 1157 if (name_len + data_len > PATH_MAX) { 1158 ret = -ENAMETOOLONG; 1159 goto out; 1160 } 1161 } 1162 1163 if (name_len + data_len > buf_len) { 1164 buf_len = name_len + data_len; 1165 if (is_vmalloc_addr(buf)) { 1166 vfree(buf); 1167 buf = NULL; 1168 } else { 1169 char *tmp = krealloc(buf, buf_len, 1170 GFP_KERNEL | __GFP_NOWARN); 1171 1172 if (!tmp) 1173 kfree(buf); 1174 buf = tmp; 1175 } 1176 if (!buf) { 1177 buf = kvmalloc(buf_len, GFP_KERNEL); 1178 if (!buf) { 1179 ret = -ENOMEM; 1180 goto out; 1181 } 1182 } 1183 } 1184 1185 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1186 name_len + data_len); 1187 1188 len = sizeof(*di) + name_len + data_len; 1189 di = (struct btrfs_dir_item *)((char *)di + len); 1190 cur += len; 1191 1192 ret = iterate(num, &di_key, buf, name_len, buf + name_len, 1193 data_len, ctx); 1194 if (ret < 0) 1195 goto out; 1196 if (ret) { 1197 ret = 0; 1198 goto out; 1199 } 1200 1201 num++; 1202 } 1203 1204 out: 1205 kvfree(buf); 1206 return ret; 1207 } 1208 1209 static int __copy_first_ref(int num, u64 dir, int index, 1210 struct fs_path *p, void *ctx) 1211 { 1212 int ret; 1213 struct fs_path *pt = ctx; 1214 1215 ret = fs_path_copy(pt, 
p); 1216 if (ret < 0) 1217 return ret; 1218 1219 /* we want the first only */ 1220 return 1; 1221 } 1222 1223 /* 1224 * Retrieve the first path of an inode. If an inode has more then one 1225 * ref/hardlink, this is ignored. 1226 */ 1227 static int get_inode_path(struct btrfs_root *root, 1228 u64 ino, struct fs_path *path) 1229 { 1230 int ret; 1231 struct btrfs_key key, found_key; 1232 struct btrfs_path *p; 1233 1234 p = alloc_path_for_send(); 1235 if (!p) 1236 return -ENOMEM; 1237 1238 fs_path_reset(path); 1239 1240 key.objectid = ino; 1241 key.type = BTRFS_INODE_REF_KEY; 1242 key.offset = 0; 1243 1244 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); 1245 if (ret < 0) 1246 goto out; 1247 if (ret) { 1248 ret = 1; 1249 goto out; 1250 } 1251 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); 1252 if (found_key.objectid != ino || 1253 (found_key.type != BTRFS_INODE_REF_KEY && 1254 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 1255 ret = -ENOENT; 1256 goto out; 1257 } 1258 1259 ret = iterate_inode_ref(root, p, &found_key, 1, 1260 __copy_first_ref, path); 1261 if (ret < 0) 1262 goto out; 1263 ret = 0; 1264 1265 out: 1266 btrfs_free_path(p); 1267 return ret; 1268 } 1269 1270 struct backref_ctx { 1271 struct send_ctx *sctx; 1272 1273 /* number of total found references */ 1274 u64 found; 1275 1276 /* 1277 * used for clones found in send_root. clones found behind cur_objectid 1278 * and cur_offset are not considered as allowed clones. 1279 */ 1280 u64 cur_objectid; 1281 u64 cur_offset; 1282 1283 /* may be truncated in case it's the last extent in a file */ 1284 u64 extent_len; 1285 1286 /* The bytenr the file extent item we are processing refers to. */ 1287 u64 bytenr; 1288 /* The owner (root id) of the data backref for the current extent. */ 1289 u64 backref_owner; 1290 /* The offset of the data backref for the current extent. 
*/ 1291 u64 backref_offset; 1292 }; 1293 1294 static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1295 { 1296 u64 root = (u64)(uintptr_t)key; 1297 const struct clone_root *cr = elt; 1298 1299 if (root < cr->root->root_key.objectid) 1300 return -1; 1301 if (root > cr->root->root_key.objectid) 1302 return 1; 1303 return 0; 1304 } 1305 1306 static int __clone_root_cmp_sort(const void *e1, const void *e2) 1307 { 1308 const struct clone_root *cr1 = e1; 1309 const struct clone_root *cr2 = e2; 1310 1311 if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) 1312 return -1; 1313 if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) 1314 return 1; 1315 return 0; 1316 } 1317 1318 /* 1319 * Called for every backref that is found for the current extent. 1320 * Results are collected in sctx->clone_roots->ino/offset. 1321 */ 1322 static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, 1323 void *ctx_) 1324 { 1325 struct backref_ctx *bctx = ctx_; 1326 struct clone_root *clone_root; 1327 1328 /* First check if the root is in the list of accepted clone sources */ 1329 clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, 1330 bctx->sctx->clone_roots_cnt, 1331 sizeof(struct clone_root), 1332 __clone_root_cmp_bsearch); 1333 if (!clone_root) 1334 return 0; 1335 1336 /* This is our own reference, bail out as we can't clone from it. */ 1337 if (clone_root->root == bctx->sctx->send_root && 1338 ino == bctx->cur_objectid && 1339 offset == bctx->cur_offset) 1340 return 0; 1341 1342 /* 1343 * Make sure we don't consider clones from send_root that are 1344 * behind the current inode/offset. 1345 */ 1346 if (clone_root->root == bctx->sctx->send_root) { 1347 /* 1348 * If the source inode was not yet processed we can't issue a 1349 * clone operation, as the source extent does not exist yet at 1350 * the destination of the stream. 
1351 */ 1352 if (ino > bctx->cur_objectid) 1353 return 0; 1354 /* 1355 * We clone from the inode currently being sent as long as the 1356 * source extent is already processed, otherwise we could try 1357 * to clone from an extent that does not exist yet at the 1358 * destination of the stream. 1359 */ 1360 if (ino == bctx->cur_objectid && 1361 offset + bctx->extent_len > 1362 bctx->sctx->cur_inode_next_write_offset) 1363 return 0; 1364 } 1365 1366 bctx->found++; 1367 clone_root->found_ref = true; 1368 1369 /* 1370 * If the given backref refers to a file extent item with a larger 1371 * number of bytes than what we found before, use the new one so that 1372 * we clone more optimally and end up doing less writes and getting 1373 * less exclusive, non-shared extents at the destination. 1374 */ 1375 if (num_bytes > clone_root->num_bytes) { 1376 clone_root->ino = ino; 1377 clone_root->offset = offset; 1378 clone_root->num_bytes = num_bytes; 1379 1380 /* 1381 * Found a perfect candidate, so there's no need to continue 1382 * backref walking. 
1383 */ 1384 if (num_bytes >= bctx->extent_len) 1385 return BTRFS_ITERATE_EXTENT_INODES_STOP; 1386 } 1387 1388 return 0; 1389 } 1390 1391 static void empty_backref_cache(struct send_ctx *sctx) 1392 { 1393 struct backref_cache_entry *entry; 1394 struct backref_cache_entry *tmp; 1395 1396 list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) 1397 kfree(entry); 1398 1399 INIT_LIST_HEAD(&sctx->backref_cache.lru_list); 1400 mtree_destroy(&sctx->backref_cache.entries); 1401 sctx->backref_cache.size = 0; 1402 } 1403 1404 static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, 1405 const u64 **root_ids_ret, int *root_count_ret) 1406 { 1407 struct backref_ctx *bctx = ctx; 1408 struct send_ctx *sctx = bctx->sctx; 1409 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1410 const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; 1411 struct backref_cache_entry *entry; 1412 1413 if (sctx->backref_cache.size == 0) 1414 return false; 1415 1416 /* 1417 * If relocation happened since we first filled the cache, then we must 1418 * empty the cache and can not use it, because even though we operate on 1419 * read-only roots, their leaves and nodes may have been reallocated and 1420 * now be used for different nodes/leaves of the same tree or some other 1421 * tree. 1422 * 1423 * We are called from iterate_extent_inodes() while either holding a 1424 * transaction handle or holding fs_info->commit_root_sem, so no need 1425 * to take any lock here. 
1426 */ 1427 if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { 1428 empty_backref_cache(sctx); 1429 return false; 1430 } 1431 1432 entry = mtree_load(&sctx->backref_cache.entries, key); 1433 if (!entry) 1434 return false; 1435 1436 *root_ids_ret = entry->root_ids; 1437 *root_count_ret = entry->num_roots; 1438 list_move_tail(&entry->list, &sctx->backref_cache.lru_list); 1439 1440 return true; 1441 } 1442 1443 static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, 1444 void *ctx) 1445 { 1446 struct backref_ctx *bctx = ctx; 1447 struct send_ctx *sctx = bctx->sctx; 1448 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1449 struct backref_cache_entry *new_entry; 1450 struct ulist_iterator uiter; 1451 struct ulist_node *node; 1452 int ret; 1453 1454 /* 1455 * We're called while holding a transaction handle or while holding 1456 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a 1457 * NOFS allocation. 1458 */ 1459 new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); 1460 /* No worries, cache is optional. */ 1461 if (!new_entry) 1462 return; 1463 1464 new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; 1465 new_entry->num_roots = 0; 1466 ULIST_ITER_INIT(&uiter); 1467 while ((node = ulist_next(root_ids, &uiter)) != NULL) { 1468 const u64 root_id = node->val; 1469 struct clone_root *root; 1470 1471 root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, 1472 sctx->clone_roots_cnt, sizeof(struct clone_root), 1473 __clone_root_cmp_bsearch); 1474 if (!root) 1475 continue; 1476 1477 /* Too many roots, just exit, no worries as caching is optional. 
*/ 1478 if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { 1479 kfree(new_entry); 1480 return; 1481 } 1482 1483 new_entry->root_ids[new_entry->num_roots] = root_id; 1484 new_entry->num_roots++; 1485 } 1486 1487 /* 1488 * We may have not added any roots to the new cache entry, which means 1489 * none of the roots is part of the list of roots from which we are 1490 * allowed to clone. Cache the new entry as it's still useful to avoid 1491 * backref walking to determine which roots have a path to the leaf. 1492 */ 1493 1494 if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { 1495 struct backref_cache_entry *lru_entry; 1496 struct backref_cache_entry *mt_entry; 1497 1498 lru_entry = list_first_entry(&sctx->backref_cache.lru_list, 1499 struct backref_cache_entry, list); 1500 mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); 1501 ASSERT(mt_entry == lru_entry); 1502 list_del(&mt_entry->list); 1503 kfree(mt_entry); 1504 sctx->backref_cache.size--; 1505 } 1506 1507 ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, 1508 new_entry, GFP_NOFS); 1509 ASSERT(ret == 0 || ret == -ENOMEM); 1510 if (ret) { 1511 /* Caching is optional, no worries. */ 1512 kfree(new_entry); 1513 return; 1514 } 1515 1516 list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); 1517 1518 /* 1519 * We are called from iterate_extent_inodes() while either holding a 1520 * transaction handle or holding fs_info->commit_root_sem, so no need 1521 * to take any lock here. 
1522 */ 1523 if (sctx->backref_cache.size == 0) 1524 sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; 1525 1526 sctx->backref_cache.size++; 1527 } 1528 1529 static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, 1530 const struct extent_buffer *leaf, void *ctx) 1531 { 1532 const u64 refs = btrfs_extent_refs(leaf, ei); 1533 const struct backref_ctx *bctx = ctx; 1534 const struct send_ctx *sctx = bctx->sctx; 1535 1536 if (bytenr == bctx->bytenr) { 1537 const u64 flags = btrfs_extent_flags(leaf, ei); 1538 1539 if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 1540 return -EUCLEAN; 1541 1542 /* 1543 * If we have only one reference and only the send root as a 1544 * clone source - meaning no clone roots were given in the 1545 * struct btrfs_ioctl_send_args passed to the send ioctl - then 1546 * it's our reference and there's no point in doing backref 1547 * walking which is expensive, so exit early. 1548 */ 1549 if (refs == 1 && sctx->clone_roots_cnt == 1) 1550 return -ENOENT; 1551 } 1552 1553 /* 1554 * Backreference walking (iterate_extent_inodes() below) is currently 1555 * too expensive when an extent has a large number of references, both 1556 * in time spent and used memory. So for now just fallback to write 1557 * operations instead of clone operations when an extent has more than 1558 * a certain amount of references. 1559 */ 1560 if (refs > SEND_MAX_EXTENT_REFS) 1561 return -ENOENT; 1562 1563 return 0; 1564 } 1565 1566 static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) 1567 { 1568 const struct backref_ctx *bctx = ctx; 1569 1570 if (ino == bctx->cur_objectid && 1571 root == bctx->backref_owner && 1572 offset == bctx->backref_offset) 1573 return true; 1574 1575 return false; 1576 } 1577 1578 /* 1579 * Given an inode, offset and extent item, it finds a good clone for a clone 1580 * instruction. Returns -ENOENT when none could be found. 
The function makes 1581 * sure that the returned clone is usable at the point where sending is at the 1582 * moment. This means, that no clones are accepted which lie behind the current 1583 * inode+offset. 1584 * 1585 * path must point to the extent item when called. 1586 */ 1587 static int find_extent_clone(struct send_ctx *sctx, 1588 struct btrfs_path *path, 1589 u64 ino, u64 data_offset, 1590 u64 ino_size, 1591 struct clone_root **found) 1592 { 1593 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1594 int ret; 1595 int extent_type; 1596 u64 logical; 1597 u64 disk_byte; 1598 u64 num_bytes; 1599 struct btrfs_file_extent_item *fi; 1600 struct extent_buffer *eb = path->nodes[0]; 1601 struct backref_ctx backref_ctx = { 0 }; 1602 struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; 1603 struct clone_root *cur_clone_root; 1604 int compressed; 1605 u32 i; 1606 1607 /* 1608 * With fallocate we can get prealloc extents beyond the inode's i_size, 1609 * so we don't do anything here because clone operations can not clone 1610 * to a range beyond i_size without increasing the i_size of the 1611 * destination inode. 1612 */ 1613 if (data_offset >= ino_size) 1614 return 0; 1615 1616 fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); 1617 extent_type = btrfs_file_extent_type(eb, fi); 1618 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1619 return -ENOENT; 1620 1621 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1622 if (disk_byte == 0) 1623 return -ENOENT; 1624 1625 compressed = btrfs_file_extent_compression(eb, fi); 1626 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1627 logical = disk_byte + btrfs_file_extent_offset(eb, fi); 1628 1629 /* 1630 * Setup the clone roots. 
1631 */ 1632 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1633 cur_clone_root = sctx->clone_roots + i; 1634 cur_clone_root->ino = (u64)-1; 1635 cur_clone_root->offset = 0; 1636 cur_clone_root->num_bytes = 0; 1637 cur_clone_root->found_ref = false; 1638 } 1639 1640 backref_ctx.sctx = sctx; 1641 backref_ctx.cur_objectid = ino; 1642 backref_ctx.cur_offset = data_offset; 1643 backref_ctx.bytenr = disk_byte; 1644 /* 1645 * Use the header owner and not the send root's id, because in case of a 1646 * snapshot we can have shared subtrees. 1647 */ 1648 backref_ctx.backref_owner = btrfs_header_owner(eb); 1649 backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); 1650 1651 /* 1652 * The last extent of a file may be too large due to page alignment. 1653 * We need to adjust extent_len in this case so that the checks in 1654 * iterate_backrefs() work. 1655 */ 1656 if (data_offset + num_bytes >= ino_size) 1657 backref_ctx.extent_len = ino_size - data_offset; 1658 else 1659 backref_ctx.extent_len = num_bytes; 1660 1661 /* 1662 * Now collect all backrefs. 1663 */ 1664 backref_walk_ctx.bytenr = disk_byte; 1665 if (compressed == BTRFS_COMPRESS_NONE) 1666 backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); 1667 backref_walk_ctx.fs_info = fs_info; 1668 backref_walk_ctx.cache_lookup = lookup_backref_cache; 1669 backref_walk_ctx.cache_store = store_backref_cache; 1670 backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; 1671 backref_walk_ctx.check_extent_item = check_extent_item; 1672 backref_walk_ctx.user_ctx = &backref_ctx; 1673 1674 /* 1675 * If have a single clone root, then it's the send root and we can tell 1676 * the backref walking code to skip our own backref and not resolve it, 1677 * since we can not use it for cloning - the source and destination 1678 * ranges can't overlap and in case the leaf is shared through a subtree 1679 * due to snapshots, we can't use those other roots since they are not 1680 * in the list of clone roots. 
1681 */ 1682 if (sctx->clone_roots_cnt == 1) 1683 backref_walk_ctx.skip_data_ref = skip_self_data_ref; 1684 1685 ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, 1686 &backref_ctx); 1687 if (ret < 0) 1688 return ret; 1689 1690 down_read(&fs_info->commit_root_sem); 1691 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { 1692 /* 1693 * A transaction commit for a transaction in which block group 1694 * relocation was done just happened. 1695 * The disk_bytenr of the file extent item we processed is 1696 * possibly stale, referring to the extent's location before 1697 * relocation. So act as if we haven't found any clone sources 1698 * and fallback to write commands, which will read the correct 1699 * data from the new extent location. Otherwise we will fail 1700 * below because we haven't found our own back reference or we 1701 * could be getting incorrect sources in case the old extent 1702 * was already reallocated after the relocation. 1703 */ 1704 up_read(&fs_info->commit_root_sem); 1705 return -ENOENT; 1706 } 1707 up_read(&fs_info->commit_root_sem); 1708 1709 btrfs_debug(fs_info, 1710 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", 1711 data_offset, ino, num_bytes, logical); 1712 1713 if (!backref_ctx.found) { 1714 btrfs_debug(fs_info, "no clones found"); 1715 return -ENOENT; 1716 } 1717 1718 cur_clone_root = NULL; 1719 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1720 struct clone_root *clone_root = &sctx->clone_roots[i]; 1721 1722 if (!clone_root->found_ref) 1723 continue; 1724 1725 /* 1726 * Choose the root from which we can clone more bytes, to 1727 * minimize write operations and therefore have more extent 1728 * sharing at the destination (the same as in the source). 
1729 */ 1730 if (!cur_clone_root || 1731 clone_root->num_bytes > cur_clone_root->num_bytes) { 1732 cur_clone_root = clone_root; 1733 1734 /* 1735 * We found an optimal clone candidate (any inode from 1736 * any root is fine), so we're done. 1737 */ 1738 if (clone_root->num_bytes >= backref_ctx.extent_len) 1739 break; 1740 } 1741 } 1742 1743 if (cur_clone_root) { 1744 *found = cur_clone_root; 1745 ret = 0; 1746 } else { 1747 ret = -ENOENT; 1748 } 1749 1750 return ret; 1751 } 1752 1753 static int read_symlink(struct btrfs_root *root, 1754 u64 ino, 1755 struct fs_path *dest) 1756 { 1757 int ret; 1758 struct btrfs_path *path; 1759 struct btrfs_key key; 1760 struct btrfs_file_extent_item *ei; 1761 u8 type; 1762 u8 compression; 1763 unsigned long off; 1764 int len; 1765 1766 path = alloc_path_for_send(); 1767 if (!path) 1768 return -ENOMEM; 1769 1770 key.objectid = ino; 1771 key.type = BTRFS_EXTENT_DATA_KEY; 1772 key.offset = 0; 1773 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1774 if (ret < 0) 1775 goto out; 1776 if (ret) { 1777 /* 1778 * An empty symlink inode. Can happen in rare error paths when 1779 * creating a symlink (transaction committed before the inode 1780 * eviction handler removed the symlink inode items and a crash 1781 * happened in between or the subvol was snapshoted in between). 1782 * Print an informative message to dmesg/syslog so that the user 1783 * can delete the symlink. 
1784 */ 1785 btrfs_err(root->fs_info, 1786 "Found empty symlink inode %llu at root %llu", 1787 ino, root->root_key.objectid); 1788 ret = -EIO; 1789 goto out; 1790 } 1791 1792 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 1793 struct btrfs_file_extent_item); 1794 type = btrfs_file_extent_type(path->nodes[0], ei); 1795 compression = btrfs_file_extent_compression(path->nodes[0], ei); 1796 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); 1797 BUG_ON(compression); 1798 1799 off = btrfs_file_extent_inline_start(ei); 1800 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); 1801 1802 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1803 1804 out: 1805 btrfs_free_path(path); 1806 return ret; 1807 } 1808 1809 /* 1810 * Helper function to generate a file name that is unique in the root of 1811 * send_root and parent_root. This is used to generate names for orphan inodes. 1812 */ 1813 static int gen_unique_name(struct send_ctx *sctx, 1814 u64 ino, u64 gen, 1815 struct fs_path *dest) 1816 { 1817 int ret = 0; 1818 struct btrfs_path *path; 1819 struct btrfs_dir_item *di; 1820 char tmp[64]; 1821 int len; 1822 u64 idx = 0; 1823 1824 path = alloc_path_for_send(); 1825 if (!path) 1826 return -ENOMEM; 1827 1828 while (1) { 1829 struct fscrypt_str tmp_name; 1830 1831 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1832 ino, gen, idx); 1833 ASSERT(len < sizeof(tmp)); 1834 tmp_name.name = tmp; 1835 tmp_name.len = strlen(tmp); 1836 1837 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1838 path, BTRFS_FIRST_FREE_OBJECTID, 1839 &tmp_name, 0); 1840 btrfs_release_path(path); 1841 if (IS_ERR(di)) { 1842 ret = PTR_ERR(di); 1843 goto out; 1844 } 1845 if (di) { 1846 /* not unique, try again */ 1847 idx++; 1848 continue; 1849 } 1850 1851 if (!sctx->parent_root) { 1852 /* unique */ 1853 ret = 0; 1854 break; 1855 } 1856 1857 di = btrfs_lookup_dir_item(NULL, sctx->parent_root, 1858 path, BTRFS_FIRST_FREE_OBJECTID, 1859 &tmp_name, 0); 1860 btrfs_release_path(path); 1861 if 
(IS_ERR(di)) { 1862 ret = PTR_ERR(di); 1863 goto out; 1864 } 1865 if (di) { 1866 /* not unique, try again */ 1867 idx++; 1868 continue; 1869 } 1870 /* unique */ 1871 break; 1872 } 1873 1874 ret = fs_path_add(dest, tmp, strlen(tmp)); 1875 1876 out: 1877 btrfs_free_path(path); 1878 return ret; 1879 } 1880 1881 enum inode_state { 1882 inode_state_no_change, 1883 inode_state_will_create, 1884 inode_state_did_create, 1885 inode_state_will_delete, 1886 inode_state_did_delete, 1887 }; 1888 1889 static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) 1890 { 1891 int ret; 1892 int left_ret; 1893 int right_ret; 1894 u64 left_gen; 1895 u64 right_gen; 1896 struct btrfs_inode_info info; 1897 1898 ret = get_inode_info(sctx->send_root, ino, &info); 1899 if (ret < 0 && ret != -ENOENT) 1900 goto out; 1901 left_ret = (info.nlink == 0) ? -ENOENT : ret; 1902 left_gen = info.gen; 1903 1904 if (!sctx->parent_root) { 1905 right_ret = -ENOENT; 1906 } else { 1907 ret = get_inode_info(sctx->parent_root, ino, &info); 1908 if (ret < 0 && ret != -ENOENT) 1909 goto out; 1910 right_ret = (info.nlink == 0) ? 
-ENOENT : ret; 1911 right_gen = info.gen; 1912 } 1913 1914 if (!left_ret && !right_ret) { 1915 if (left_gen == gen && right_gen == gen) { 1916 ret = inode_state_no_change; 1917 } else if (left_gen == gen) { 1918 if (ino < sctx->send_progress) 1919 ret = inode_state_did_create; 1920 else 1921 ret = inode_state_will_create; 1922 } else if (right_gen == gen) { 1923 if (ino < sctx->send_progress) 1924 ret = inode_state_did_delete; 1925 else 1926 ret = inode_state_will_delete; 1927 } else { 1928 ret = -ENOENT; 1929 } 1930 } else if (!left_ret) { 1931 if (left_gen == gen) { 1932 if (ino < sctx->send_progress) 1933 ret = inode_state_did_create; 1934 else 1935 ret = inode_state_will_create; 1936 } else { 1937 ret = -ENOENT; 1938 } 1939 } else if (!right_ret) { 1940 if (right_gen == gen) { 1941 if (ino < sctx->send_progress) 1942 ret = inode_state_did_delete; 1943 else 1944 ret = inode_state_will_delete; 1945 } else { 1946 ret = -ENOENT; 1947 } 1948 } else { 1949 ret = -ENOENT; 1950 } 1951 1952 out: 1953 return ret; 1954 } 1955 1956 static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) 1957 { 1958 int ret; 1959 1960 if (ino == BTRFS_FIRST_FREE_OBJECTID) 1961 return 1; 1962 1963 ret = get_cur_inode_state(sctx, ino, gen); 1964 if (ret < 0) 1965 goto out; 1966 1967 if (ret == inode_state_no_change || 1968 ret == inode_state_did_create || 1969 ret == inode_state_will_delete) 1970 ret = 1; 1971 else 1972 ret = 0; 1973 1974 out: 1975 return ret; 1976 } 1977 1978 /* 1979 * Helper function to lookup a dir item in a dir. 
1980 */ 1981 static int lookup_dir_item_inode(struct btrfs_root *root, 1982 u64 dir, const char *name, int name_len, 1983 u64 *found_inode) 1984 { 1985 int ret = 0; 1986 struct btrfs_dir_item *di; 1987 struct btrfs_key key; 1988 struct btrfs_path *path; 1989 struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); 1990 1991 path = alloc_path_for_send(); 1992 if (!path) 1993 return -ENOMEM; 1994 1995 di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); 1996 if (IS_ERR_OR_NULL(di)) { 1997 ret = di ? PTR_ERR(di) : -ENOENT; 1998 goto out; 1999 } 2000 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 2001 if (key.type == BTRFS_ROOT_ITEM_KEY) { 2002 ret = -ENOENT; 2003 goto out; 2004 } 2005 *found_inode = key.objectid; 2006 2007 out: 2008 btrfs_free_path(path); 2009 return ret; 2010 } 2011 2012 /* 2013 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 2014 * generation of the parent dir and the name of the dir entry. 2015 */ 2016 static int get_first_ref(struct btrfs_root *root, u64 ino, 2017 u64 *dir, u64 *dir_gen, struct fs_path *name) 2018 { 2019 int ret; 2020 struct btrfs_key key; 2021 struct btrfs_key found_key; 2022 struct btrfs_path *path; 2023 int len; 2024 u64 parent_dir; 2025 2026 path = alloc_path_for_send(); 2027 if (!path) 2028 return -ENOMEM; 2029 2030 key.objectid = ino; 2031 key.type = BTRFS_INODE_REF_KEY; 2032 key.offset = 0; 2033 2034 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2035 if (ret < 0) 2036 goto out; 2037 if (!ret) 2038 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2039 path->slots[0]); 2040 if (ret || found_key.objectid != ino || 2041 (found_key.type != BTRFS_INODE_REF_KEY && 2042 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 2043 ret = -ENOENT; 2044 goto out; 2045 } 2046 2047 if (found_key.type == BTRFS_INODE_REF_KEY) { 2048 struct btrfs_inode_ref *iref; 2049 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 2050 struct btrfs_inode_ref); 2051 len = 
btrfs_inode_ref_name_len(path->nodes[0], iref); 2052 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 2053 (unsigned long)(iref + 1), 2054 len); 2055 parent_dir = found_key.offset; 2056 } else { 2057 struct btrfs_inode_extref *extref; 2058 extref = btrfs_item_ptr(path->nodes[0], path->slots[0], 2059 struct btrfs_inode_extref); 2060 len = btrfs_inode_extref_name_len(path->nodes[0], extref); 2061 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 2062 (unsigned long)&extref->name, len); 2063 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); 2064 } 2065 if (ret < 0) 2066 goto out; 2067 btrfs_release_path(path); 2068 2069 if (dir_gen) { 2070 ret = get_inode_gen(root, parent_dir, dir_gen); 2071 if (ret < 0) 2072 goto out; 2073 } 2074 2075 *dir = parent_dir; 2076 2077 out: 2078 btrfs_free_path(path); 2079 return ret; 2080 } 2081 2082 static int is_first_ref(struct btrfs_root *root, 2083 u64 ino, u64 dir, 2084 const char *name, int name_len) 2085 { 2086 int ret; 2087 struct fs_path *tmp_name; 2088 u64 tmp_dir; 2089 2090 tmp_name = fs_path_alloc(); 2091 if (!tmp_name) 2092 return -ENOMEM; 2093 2094 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); 2095 if (ret < 0) 2096 goto out; 2097 2098 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { 2099 ret = 0; 2100 goto out; 2101 } 2102 2103 ret = !memcmp(tmp_name->start, name, name_len); 2104 2105 out: 2106 fs_path_free(tmp_name); 2107 return ret; 2108 } 2109 2110 /* 2111 * Used by process_recorded_refs to determine if a new ref would overwrite an 2112 * already existing ref. In case it detects an overwrite, it returns the 2113 * inode/gen in who_ino/who_gen. 2114 * When an overwrite is detected, process_recorded_refs does proper orphanizing 2115 * to make sure later references to the overwritten inode are possible. 2116 * Orphanizing is however only required for the first ref of an inode. 
2117 * process_recorded_refs does an additional is_first_ref check to see if 2118 * orphanizing is really required. 2119 */ 2120 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 2121 const char *name, int name_len, 2122 u64 *who_ino, u64 *who_gen, u64 *who_mode) 2123 { 2124 int ret = 0; 2125 u64 gen; 2126 u64 other_inode = 0; 2127 struct btrfs_inode_info info; 2128 2129 if (!sctx->parent_root) 2130 goto out; 2131 2132 ret = is_inode_existent(sctx, dir, dir_gen); 2133 if (ret <= 0) 2134 goto out; 2135 2136 /* 2137 * If we have a parent root we need to verify that the parent dir was 2138 * not deleted and then re-created, if it was then we have no overwrite 2139 * and we can just unlink this entry. 2140 */ 2141 if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { 2142 ret = get_inode_gen(sctx->parent_root, dir, &gen); 2143 if (ret < 0 && ret != -ENOENT) 2144 goto out; 2145 if (ret) { 2146 ret = 0; 2147 goto out; 2148 } 2149 if (gen != dir_gen) 2150 goto out; 2151 } 2152 2153 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, 2154 &other_inode); 2155 if (ret < 0 && ret != -ENOENT) 2156 goto out; 2157 if (ret) { 2158 ret = 0; 2159 goto out; 2160 } 2161 2162 /* 2163 * Check if the overwritten ref was already processed. If yes, the ref 2164 * was already unlinked/moved, so we can safely assume that we will not 2165 * overwrite anything at this point in time. 2166 */ 2167 if (other_inode > sctx->send_progress || 2168 is_waiting_for_move(sctx, other_inode)) { 2169 ret = get_inode_info(sctx->parent_root, other_inode, &info); 2170 if (ret < 0) 2171 goto out; 2172 2173 ret = 1; 2174 *who_ino = other_inode; 2175 *who_gen = info.gen; 2176 *who_mode = info.mode; 2177 } else { 2178 ret = 0; 2179 } 2180 2181 out: 2182 return ret; 2183 } 2184 2185 /* 2186 * Checks if the ref was overwritten by an already processed inode. 
This is 2187 * used by __get_cur_name_and_parent to find out if the ref was orphanized and 2188 * thus the orphan name needs be used. 2189 * process_recorded_refs also uses it to avoid unlinking of refs that were 2190 * overwritten. 2191 */ 2192 static int did_overwrite_ref(struct send_ctx *sctx, 2193 u64 dir, u64 dir_gen, 2194 u64 ino, u64 ino_gen, 2195 const char *name, int name_len) 2196 { 2197 int ret = 0; 2198 u64 gen; 2199 u64 ow_inode; 2200 2201 if (!sctx->parent_root) 2202 goto out; 2203 2204 ret = is_inode_existent(sctx, dir, dir_gen); 2205 if (ret <= 0) 2206 goto out; 2207 2208 if (dir != BTRFS_FIRST_FREE_OBJECTID) { 2209 ret = get_inode_gen(sctx->send_root, dir, &gen); 2210 if (ret < 0 && ret != -ENOENT) 2211 goto out; 2212 if (ret) { 2213 ret = 0; 2214 goto out; 2215 } 2216 if (gen != dir_gen) 2217 goto out; 2218 } 2219 2220 /* check if the ref was overwritten by another ref */ 2221 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, 2222 &ow_inode); 2223 if (ret < 0 && ret != -ENOENT) 2224 goto out; 2225 if (ret) { 2226 /* was never and will never be overwritten */ 2227 ret = 0; 2228 goto out; 2229 } 2230 2231 ret = get_inode_gen(sctx->send_root, ow_inode, &gen); 2232 if (ret < 0) 2233 goto out; 2234 2235 if (ow_inode == ino && gen == ino_gen) { 2236 ret = 0; 2237 goto out; 2238 } 2239 2240 /* 2241 * We know that it is or will be overwritten. Check this now. 2242 * The current inode being processed might have been the one that caused 2243 * inode 'ino' to be orphanized, therefore check if ow_inode matches 2244 * the current inode being processed. 2245 */ 2246 if ((ow_inode < sctx->send_progress) || 2247 (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && 2248 gen == sctx->cur_inode_gen)) 2249 ret = 1; 2250 else 2251 ret = 0; 2252 2253 out: 2254 return ret; 2255 } 2256 2257 /* 2258 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode 2259 * that got overwritten. 
This is used by process_recorded_refs to determine 2260 * if it has to use the path as returned by get_cur_path or the orphan name. 2261 */ 2262 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 2263 { 2264 int ret = 0; 2265 struct fs_path *name = NULL; 2266 u64 dir; 2267 u64 dir_gen; 2268 2269 if (!sctx->parent_root) 2270 goto out; 2271 2272 name = fs_path_alloc(); 2273 if (!name) 2274 return -ENOMEM; 2275 2276 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); 2277 if (ret < 0) 2278 goto out; 2279 2280 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 2281 name->start, fs_path_len(name)); 2282 2283 out: 2284 fs_path_free(name); 2285 return ret; 2286 } 2287 2288 /* 2289 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, 2290 * so we need to do some special handling in case we have clashes. This function 2291 * takes care of this with the help of name_cache_entry::radix_list. 2292 * In case of error, nce is kfreed. 2293 */ 2294 static int name_cache_insert(struct send_ctx *sctx, 2295 struct name_cache_entry *nce) 2296 { 2297 int ret = 0; 2298 struct list_head *nce_head; 2299 2300 nce_head = radix_tree_lookup(&sctx->name_cache, 2301 (unsigned long)nce->ino); 2302 if (!nce_head) { 2303 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); 2304 if (!nce_head) { 2305 kfree(nce); 2306 return -ENOMEM; 2307 } 2308 INIT_LIST_HEAD(nce_head); 2309 2310 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 2311 if (ret < 0) { 2312 kfree(nce_head); 2313 kfree(nce); 2314 return ret; 2315 } 2316 } 2317 list_add_tail(&nce->radix_list, nce_head); 2318 list_add_tail(&nce->list, &sctx->name_cache_list); 2319 sctx->name_cache_size++; 2320 2321 return ret; 2322 } 2323 2324 static void name_cache_delete(struct send_ctx *sctx, 2325 struct name_cache_entry *nce) 2326 { 2327 struct list_head *nce_head; 2328 2329 nce_head = radix_tree_lookup(&sctx->name_cache, 2330 (unsigned long)nce->ino); 2331 if (!nce_head) { 
2332 btrfs_err(sctx->send_root->fs_info, 2333 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", 2334 nce->ino, sctx->name_cache_size); 2335 } 2336 2337 list_del(&nce->radix_list); 2338 list_del(&nce->list); 2339 sctx->name_cache_size--; 2340 2341 /* 2342 * We may not get to the final release of nce_head if the lookup fails 2343 */ 2344 if (nce_head && list_empty(nce_head)) { 2345 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 2346 kfree(nce_head); 2347 } 2348 } 2349 2350 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 2351 u64 ino, u64 gen) 2352 { 2353 struct list_head *nce_head; 2354 struct name_cache_entry *cur; 2355 2356 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); 2357 if (!nce_head) 2358 return NULL; 2359 2360 list_for_each_entry(cur, nce_head, radix_list) { 2361 if (cur->ino == ino && cur->gen == gen) 2362 return cur; 2363 } 2364 return NULL; 2365 } 2366 2367 /* 2368 * Remove some entries from the beginning of name_cache_list. 2369 */ 2370 static void name_cache_clean_unused(struct send_ctx *sctx) 2371 { 2372 struct name_cache_entry *nce; 2373 2374 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) 2375 return; 2376 2377 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { 2378 nce = list_entry(sctx->name_cache_list.next, 2379 struct name_cache_entry, list); 2380 name_cache_delete(sctx, nce); 2381 kfree(nce); 2382 } 2383 } 2384 2385 static void name_cache_free(struct send_ctx *sctx) 2386 { 2387 struct name_cache_entry *nce; 2388 2389 while (!list_empty(&sctx->name_cache_list)) { 2390 nce = list_entry(sctx->name_cache_list.next, 2391 struct name_cache_entry, list); 2392 name_cache_delete(sctx, nce); 2393 kfree(nce); 2394 } 2395 } 2396 2397 /* 2398 * Used by get_cur_path for each ref up to the root. 2399 * Returns 0 if it succeeded. 2400 * Returns 1 if the inode is not existent or got overwritten. In that case, the 2401 * name is an orphan name. 
This instructs get_cur_path to stop iterating. If 1
 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
 * Returns <0 in case of error.
 */
static int __get_cur_name_and_parent(struct send_ctx *sctx,
				     u64 ino, u64 gen,
				     u64 *parent_ino,
				     u64 *parent_gen,
				     struct fs_path *dest)
{
	struct name_cache_entry *cached;
	struct name_cache_entry *fresh;
	int insert_ret;
	int ret;

	/*
	 * A previous call for the same ino/gen may have left a cache entry.
	 * An entry flagged need_later_update is dropped once the inode is
	 * below send_progress, and the lookup is redone below.
	 */
	cached = name_cache_search(sctx, ino, gen);
	if (cached && ino < sctx->send_progress && cached->need_later_update) {
		name_cache_delete(sctx, cached);
		kfree(cached);
		cached = NULL;
	}

	if (cached) {
		/*
		 * Re-queue the entry at the tail of the list so that
		 * name_cache_clean_unused, which trims from the head, treats
		 * it as recently used.
		 */
		list_move_tail(&cached->list, &sctx->name_cache_list);

		*parent_ino = cached->parent_ino;
		*parent_gen = cached->parent_gen;
		ret = fs_path_add(dest, cached->name, cached->name_len);
		if (ret < 0)
			return ret;
		return cached->ret;
	}

	ret = is_inode_existent(sctx, ino, gen);
	if (ret < 0)
		return ret;

	if (!ret) {
		/*
		 * The inode does not exist yet, so hand out its orphan name.
		 * This should only happen for the parent dir that we determine
		 * in record_new_ref_if_needed().
		 */
		ret = gen_unique_name(sctx, ino, gen, dest);
		if (ret < 0)
			return ret;
		ret = 1;
	} else {
		struct btrfs_root *lookup_root;

		/*
		 * Already processed inodes are looked up in send_root, the
		 * others in parent_root.
		 */
		if (ino < sctx->send_progress)
			lookup_root = sctx->send_root;
		else
			lookup_root = sctx->parent_root;

		ret = get_first_ref(lookup_root, ino, parent_ino, parent_gen,
				    dest);
		if (ret < 0)
			return ret;

		/*
		 * If an inode processed earlier overwrote this ref, the inode
		 * is treated as an orphan and 1 is returned.
		 */
		ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino,
					gen, dest->start,
					dest->end - dest->start);
		if (ret < 0)
			return ret;
		if (ret) {
			fs_path_reset(dest);
			ret = gen_unique_name(sctx, ino, gen, dest);
			if (ret < 0)
				return ret;
			ret = 1;
		}
	}

	/* Remember the outcome (name, parent and return value) in the cache. */
	fresh = kmalloc(sizeof(*fresh) + fs_path_len(dest) + 1, GFP_KERNEL);
	if (!fresh)
		return -ENOMEM;

	fresh->ino = ino;
	fresh->gen = gen;
	fresh->parent_ino = *parent_ino;
	fresh->parent_gen = *parent_gen;
	fresh->ret = ret;
	fresh->name_len = fs_path_len(dest);
	strcpy(fresh->name, dest->start);
	/* Results for not yet processed inodes must be re-evaluated later. */
	fresh->need_later_update = (ino >= sctx->send_progress);

	/* name_cache_insert() frees the entry itself when it fails. */
	insert_ret = name_cache_insert(sctx, fresh);
	if (insert_ret < 0)
		ret = insert_ret;
	name_cache_clean_unused(sctx);

	return ret;
}

/*
 * Magic happens here. This function returns the first ref to an inode as it
 * would look like while receiving the stream at this point in time.
 * We walk the path up to the root. For every inode in between, we check if it
 * was already processed/sent. If yes, we continue with the parent as found
 * in send_root. If not, we continue with the parent as found in parent_root.
 * If we encounter an inode that was deleted at this point in time, we use the
 * inode's "orphan" name instead of the real name and stop.
Same with new inodes 2530 * that were not created yet and overwritten inodes/refs. 2531 * 2532 * When do we have orphan inodes: 2533 * 1. When an inode is freshly created and thus no valid refs are available yet 2534 * 2. When a directory lost all it's refs (deleted) but still has dir items 2535 * inside which were not processed yet (pending for move/delete). If anyone 2536 * tried to get the path to the dir items, it would get a path inside that 2537 * orphan directory. 2538 * 3. When an inode is moved around or gets new links, it may overwrite the ref 2539 * of an unprocessed inode. If in that case the first ref would be 2540 * overwritten, the overwritten inode gets "orphanized". Later when we 2541 * process this overwritten inode, it is restored at a new place by moving 2542 * the orphan inode. 2543 * 2544 * sctx->send_progress tells this function at which point in time receiving 2545 * would be. 2546 */ 2547 static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, 2548 struct fs_path *dest) 2549 { 2550 int ret = 0; 2551 struct fs_path *name = NULL; 2552 u64 parent_inode = 0; 2553 u64 parent_gen = 0; 2554 int stop = 0; 2555 2556 name = fs_path_alloc(); 2557 if (!name) { 2558 ret = -ENOMEM; 2559 goto out; 2560 } 2561 2562 dest->reversed = 1; 2563 fs_path_reset(dest); 2564 2565 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2566 struct waiting_dir_move *wdm; 2567 2568 fs_path_reset(name); 2569 2570 if (is_waiting_for_rm(sctx, ino, gen)) { 2571 ret = gen_unique_name(sctx, ino, gen, name); 2572 if (ret < 0) 2573 goto out; 2574 ret = fs_path_add_path(dest, name); 2575 break; 2576 } 2577 2578 wdm = get_waiting_dir_move(sctx, ino); 2579 if (wdm && wdm->orphanized) { 2580 ret = gen_unique_name(sctx, ino, gen, name); 2581 stop = 1; 2582 } else if (wdm) { 2583 ret = get_first_ref(sctx->parent_root, ino, 2584 &parent_inode, &parent_gen, name); 2585 } else { 2586 ret = __get_cur_name_and_parent(sctx, ino, gen, 2587 &parent_inode, 2588 &parent_gen, name); 2589 
if (ret) 2590 stop = 1; 2591 } 2592 2593 if (ret < 0) 2594 goto out; 2595 2596 ret = fs_path_add_path(dest, name); 2597 if (ret < 0) 2598 goto out; 2599 2600 ino = parent_inode; 2601 gen = parent_gen; 2602 } 2603 2604 out: 2605 fs_path_free(name); 2606 if (!ret) 2607 fs_path_unreverse(dest); 2608 return ret; 2609 } 2610 2611 /* 2612 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace 2613 */ 2614 static int send_subvol_begin(struct send_ctx *sctx) 2615 { 2616 int ret; 2617 struct btrfs_root *send_root = sctx->send_root; 2618 struct btrfs_root *parent_root = sctx->parent_root; 2619 struct btrfs_path *path; 2620 struct btrfs_key key; 2621 struct btrfs_root_ref *ref; 2622 struct extent_buffer *leaf; 2623 char *name = NULL; 2624 int namelen; 2625 2626 path = btrfs_alloc_path(); 2627 if (!path) 2628 return -ENOMEM; 2629 2630 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); 2631 if (!name) { 2632 btrfs_free_path(path); 2633 return -ENOMEM; 2634 } 2635 2636 key.objectid = send_root->root_key.objectid; 2637 key.type = BTRFS_ROOT_BACKREF_KEY; 2638 key.offset = 0; 2639 2640 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, 2641 &key, path, 1, 0); 2642 if (ret < 0) 2643 goto out; 2644 if (ret) { 2645 ret = -ENOENT; 2646 goto out; 2647 } 2648 2649 leaf = path->nodes[0]; 2650 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2651 if (key.type != BTRFS_ROOT_BACKREF_KEY || 2652 key.objectid != send_root->root_key.objectid) { 2653 ret = -ENOENT; 2654 goto out; 2655 } 2656 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 2657 namelen = btrfs_root_ref_name_len(leaf, ref); 2658 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2659 btrfs_release_path(path); 2660 2661 if (parent_root) { 2662 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2663 if (ret < 0) 2664 goto out; 2665 } else { 2666 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); 2667 if (ret < 0) 2668 goto out; 2669 } 2670 2671 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, 
namelen); 2672 2673 if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) 2674 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2675 sctx->send_root->root_item.received_uuid); 2676 else 2677 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2678 sctx->send_root->root_item.uuid); 2679 2680 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2681 btrfs_root_ctransid(&sctx->send_root->root_item)); 2682 if (parent_root) { 2683 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) 2684 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2685 parent_root->root_item.received_uuid); 2686 else 2687 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2688 parent_root->root_item.uuid); 2689 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2690 btrfs_root_ctransid(&sctx->parent_root->root_item)); 2691 } 2692 2693 ret = send_cmd(sctx); 2694 2695 tlv_put_failure: 2696 out: 2697 btrfs_free_path(path); 2698 kfree(name); 2699 return ret; 2700 } 2701 2702 static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) 2703 { 2704 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2705 int ret = 0; 2706 struct fs_path *p; 2707 2708 btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); 2709 2710 p = fs_path_alloc(); 2711 if (!p) 2712 return -ENOMEM; 2713 2714 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); 2715 if (ret < 0) 2716 goto out; 2717 2718 ret = get_cur_path(sctx, ino, gen, p); 2719 if (ret < 0) 2720 goto out; 2721 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2722 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); 2723 2724 ret = send_cmd(sctx); 2725 2726 tlv_put_failure: 2727 out: 2728 fs_path_free(p); 2729 return ret; 2730 } 2731 2732 static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) 2733 { 2734 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2735 int ret = 0; 2736 struct fs_path *p; 2737 2738 btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); 2739 2740 p = fs_path_alloc(); 2741 if (!p) 2742 return -ENOMEM; 2743 2744 ret = begin_cmd(sctx, 
BTRFS_SEND_C_CHMOD); 2745 if (ret < 0) 2746 goto out; 2747 2748 ret = get_cur_path(sctx, ino, gen, p); 2749 if (ret < 0) 2750 goto out; 2751 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2752 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); 2753 2754 ret = send_cmd(sctx); 2755 2756 tlv_put_failure: 2757 out: 2758 fs_path_free(p); 2759 return ret; 2760 } 2761 2762 static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) 2763 { 2764 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2765 int ret = 0; 2766 struct fs_path *p; 2767 2768 if (sctx->proto < 2) 2769 return 0; 2770 2771 btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); 2772 2773 p = fs_path_alloc(); 2774 if (!p) 2775 return -ENOMEM; 2776 2777 ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); 2778 if (ret < 0) 2779 goto out; 2780 2781 ret = get_cur_path(sctx, ino, gen, p); 2782 if (ret < 0) 2783 goto out; 2784 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2785 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); 2786 2787 ret = send_cmd(sctx); 2788 2789 tlv_put_failure: 2790 out: 2791 fs_path_free(p); 2792 return ret; 2793 } 2794 2795 static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) 2796 { 2797 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2798 int ret = 0; 2799 struct fs_path *p; 2800 2801 btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", 2802 ino, uid, gid); 2803 2804 p = fs_path_alloc(); 2805 if (!p) 2806 return -ENOMEM; 2807 2808 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); 2809 if (ret < 0) 2810 goto out; 2811 2812 ret = get_cur_path(sctx, ino, gen, p); 2813 if (ret < 0) 2814 goto out; 2815 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2816 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); 2817 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); 2818 2819 ret = send_cmd(sctx); 2820 2821 tlv_put_failure: 2822 out: 2823 fs_path_free(p); 2824 return ret; 2825 } 2826 2827 static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) 
2828 { 2829 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2830 int ret = 0; 2831 struct fs_path *p = NULL; 2832 struct btrfs_inode_item *ii; 2833 struct btrfs_path *path = NULL; 2834 struct extent_buffer *eb; 2835 struct btrfs_key key; 2836 int slot; 2837 2838 btrfs_debug(fs_info, "send_utimes %llu", ino); 2839 2840 p = fs_path_alloc(); 2841 if (!p) 2842 return -ENOMEM; 2843 2844 path = alloc_path_for_send(); 2845 if (!path) { 2846 ret = -ENOMEM; 2847 goto out; 2848 } 2849 2850 key.objectid = ino; 2851 key.type = BTRFS_INODE_ITEM_KEY; 2852 key.offset = 0; 2853 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2854 if (ret > 0) 2855 ret = -ENOENT; 2856 if (ret < 0) 2857 goto out; 2858 2859 eb = path->nodes[0]; 2860 slot = path->slots[0]; 2861 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2862 2863 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); 2864 if (ret < 0) 2865 goto out; 2866 2867 ret = get_cur_path(sctx, ino, gen, p); 2868 if (ret < 0) 2869 goto out; 2870 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2871 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); 2872 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); 2873 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); 2874 if (sctx->proto >= 2) 2875 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); 2876 2877 ret = send_cmd(sctx); 2878 2879 tlv_put_failure: 2880 out: 2881 fs_path_free(p); 2882 btrfs_free_path(path); 2883 return ret; 2884 } 2885 2886 /* 2887 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have 2888 * a valid path yet because we did not process the refs yet. So, the inode 2889 * is created as orphan. 
2890 */ 2891 static int send_create_inode(struct send_ctx *sctx, u64 ino) 2892 { 2893 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2894 int ret = 0; 2895 struct fs_path *p; 2896 int cmd; 2897 struct btrfs_inode_info info; 2898 u64 gen; 2899 u64 mode; 2900 u64 rdev; 2901 2902 btrfs_debug(fs_info, "send_create_inode %llu", ino); 2903 2904 p = fs_path_alloc(); 2905 if (!p) 2906 return -ENOMEM; 2907 2908 if (ino != sctx->cur_ino) { 2909 ret = get_inode_info(sctx->send_root, ino, &info); 2910 if (ret < 0) 2911 goto out; 2912 gen = info.gen; 2913 mode = info.mode; 2914 rdev = info.rdev; 2915 } else { 2916 gen = sctx->cur_inode_gen; 2917 mode = sctx->cur_inode_mode; 2918 rdev = sctx->cur_inode_rdev; 2919 } 2920 2921 if (S_ISREG(mode)) { 2922 cmd = BTRFS_SEND_C_MKFILE; 2923 } else if (S_ISDIR(mode)) { 2924 cmd = BTRFS_SEND_C_MKDIR; 2925 } else if (S_ISLNK(mode)) { 2926 cmd = BTRFS_SEND_C_SYMLINK; 2927 } else if (S_ISCHR(mode) || S_ISBLK(mode)) { 2928 cmd = BTRFS_SEND_C_MKNOD; 2929 } else if (S_ISFIFO(mode)) { 2930 cmd = BTRFS_SEND_C_MKFIFO; 2931 } else if (S_ISSOCK(mode)) { 2932 cmd = BTRFS_SEND_C_MKSOCK; 2933 } else { 2934 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", 2935 (int)(mode & S_IFMT)); 2936 ret = -EOPNOTSUPP; 2937 goto out; 2938 } 2939 2940 ret = begin_cmd(sctx, cmd); 2941 if (ret < 0) 2942 goto out; 2943 2944 ret = gen_unique_name(sctx, ino, gen, p); 2945 if (ret < 0) 2946 goto out; 2947 2948 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2949 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); 2950 2951 if (S_ISLNK(mode)) { 2952 fs_path_reset(p); 2953 ret = read_symlink(sctx->send_root, ino, p); 2954 if (ret < 0) 2955 goto out; 2956 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2957 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2958 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2959 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); 2960 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); 2961 } 2962 2963 ret = send_cmd(sctx); 2964 if (ret < 0) 2965 
goto out; 2966 2967 2968 tlv_put_failure: 2969 out: 2970 fs_path_free(p); 2971 return ret; 2972 } 2973 2974 /* 2975 * We need some special handling for inodes that get processed before the parent 2976 * directory got created. See process_recorded_refs for details. 2977 * This function does the check if we already created the dir out of order. 2978 */ 2979 static int did_create_dir(struct send_ctx *sctx, u64 dir) 2980 { 2981 int ret = 0; 2982 int iter_ret = 0; 2983 struct btrfs_path *path = NULL; 2984 struct btrfs_key key; 2985 struct btrfs_key found_key; 2986 struct btrfs_key di_key; 2987 struct btrfs_dir_item *di; 2988 2989 path = alloc_path_for_send(); 2990 if (!path) 2991 return -ENOMEM; 2992 2993 key.objectid = dir; 2994 key.type = BTRFS_DIR_INDEX_KEY; 2995 key.offset = 0; 2996 2997 btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { 2998 struct extent_buffer *eb = path->nodes[0]; 2999 3000 if (found_key.objectid != key.objectid || 3001 found_key.type != key.type) { 3002 ret = 0; 3003 break; 3004 } 3005 3006 di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); 3007 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 3008 3009 if (di_key.type != BTRFS_ROOT_ITEM_KEY && 3010 di_key.objectid < sctx->send_progress) { 3011 ret = 1; 3012 break; 3013 } 3014 } 3015 /* Catch error found during iteration */ 3016 if (iter_ret < 0) 3017 ret = iter_ret; 3018 3019 btrfs_free_path(path); 3020 return ret; 3021 } 3022 3023 /* 3024 * Only creates the inode if it is: 3025 * 1. Not a directory 3026 * 2. Or a directory which was not created already due to out of order 3027 * directories. See did_create_dir and process_recorded_refs for details. 
 */
static int send_create_inode_if_needed(struct send_ctx *sctx)
{
	int ret;

	if (S_ISDIR(sctx->cur_inode_mode)) {
		ret = did_create_dir(sctx, sctx->cur_ino);
		if (ret < 0)
			return ret;
		else if (ret > 0)
			/* The directory was already created, nothing to send. */
			return 0;
	}

	return send_create_inode(sctx, sctx->cur_ino);
}

/*
 * One recorded reference (name inside a directory) of an inode.
 */
struct recorded_ref {
	/* Links the ref into a list of refs (e.g. new_refs/deleted_refs). */
	struct list_head list;
	/* Last component of full_path; points into full_path's buffer. */
	char *name;
	/* Path of the ref; freed together with the ref. */
	struct fs_path *full_path;
	/* Inode number and generation of the directory holding the ref. */
	u64 dir;
	u64 dir_gen;
	/* Length of name. */
	int name_len;
	/* rb-tree linkage; the node is erased from 'root' when the ref is freed. */
	struct rb_node node;
	struct rb_root *root;
};

/*
 * Allocate a zeroed recorded_ref with an unlinked rb node and an empty list
 * head. Returns NULL if the allocation fails.
 */
static struct recorded_ref *recorded_ref_alloc(void)
{
	struct recorded_ref *ref;

	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
	if (!ref)
		return NULL;
	RB_CLEAR_NODE(&ref->node);
	INIT_LIST_HEAD(&ref->list);
	return ref;
}

/*
 * Unlink @ref from its rb-tree (if linked) and from its list, then free its
 * path and the ref itself. A NULL @ref is ignored.
 */
static void recorded_ref_free(struct recorded_ref *ref)
{
	if (!ref)
		return;
	if (!RB_EMPTY_NODE(&ref->node))
		rb_erase(&ref->node, ref->root);
	list_del(&ref->list);
	fs_path_free(ref->full_path);
	kfree(ref);
}

/*
 * Attach @path to @ref and derive name/name_len from its last component.
 * The ref takes over @path: recorded_ref_free() frees it.
 */
static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
{
	ref->full_path = path;
	ref->name = (char *)kbasename(ref->full_path->start);
	ref->name_len = ref->full_path->end - ref->name;
}

/*
 * Append a new ref to @list that carries only the dir/dir_gen of @ref; the
 * name and path are not copied. Returns 0 or -ENOMEM.
 */
static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
	struct recorded_ref *new;

	new = recorded_ref_alloc();
	if (!new)
		return -ENOMEM;

	new->dir = ref->dir;
	new->dir_gen = ref->dir_gen;
	list_add_tail(&new->list, list);
	return 0;
}

/*
 * Free every ref on the list @head.
 */
static void __free_recorded_refs(struct list_head *head)
{
	struct recorded_ref *cur;

	while (!list_empty(head)) {
		cur = list_entry(head->next, struct recorded_ref, list);
		/* recorded_ref_free() also unlinks cur, so the loop advances. */
		recorded_ref_free(cur);
	}
}

/*
 * Free all refs recorded in sctx->new_refs and sctx->deleted_refs.
 */
static void free_recorded_refs(struct send_ctx *sctx)
{
	__free_recorded_refs(&sctx->new_refs);
	__free_recorded_refs(&sctx->deleted_refs);
}

/*
 * Renames/moves a file/dir to its orphan name. Used when the first
 * ref of an unprocessed inode gets overwritten and for all non empty
 * directories.
 *
 * @path is the current path of the inode; the orphan name is generated from
 * @ino/@gen. Returns 0 on success or <0 on error.
 */
static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
			  struct fs_path *path)
{
	int ret;
	struct fs_path *orphan;

	orphan = fs_path_alloc();
	if (!orphan)
		return -ENOMEM;

	ret = gen_unique_name(sctx, ino, gen, orphan);
	if (ret < 0)
		goto out;

	ret = send_rename(sctx, path, orphan);

out:
	fs_path_free(orphan);
	return ret;
}

/*
 * Look up the orphan_dir_info for @dir_ino/@dir_gen in sctx->orphan_dirs and
 * create it if it does not exist yet. The tree is ordered by inode number
 * first and generation second.
 *
 * Returns the existing or the newly inserted entry (the latter with
 * last_dir_index_offset set to 0), or ERR_PTR(-ENOMEM).
 */
static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
						    u64 dir_ino, u64 dir_gen)
{
	struct rb_node **p = &sctx->orphan_dirs.rb_node;
	struct rb_node *parent = NULL;
	struct orphan_dir_info *entry, *odi;

	/* Find either the existing entry or the link to insert the new one at. */
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct orphan_dir_info, node);
		if (dir_ino < entry->ino)
			p = &(*p)->rb_left;
		else if (dir_ino > entry->ino)
			p = &(*p)->rb_right;
		else if (dir_gen < entry->gen)
			p = &(*p)->rb_left;
		else if (dir_gen > entry->gen)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
	if (!odi)
		return ERR_PTR(-ENOMEM);
	odi->ino = dir_ino;
	odi->gen = dir_gen;
	odi->last_dir_index_offset = 0;

	rb_link_node(&odi->node, parent, p);
	rb_insert_color(&odi->node, &sctx->orphan_dirs);
	return odi;
}

/*
 * Find the orphan_dir_info for @dir_ino/@gen in sctx->orphan_dirs.
 * Returns NULL if there is none.
 */
static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
		u64 dir_ino, u64 gen)
{
	struct rb_node *n = sctx->orphan_dirs.rb_node;
	struct orphan_dir_info *entry;

	while (n) {
		entry = rb_entry(n, struct orphan_dir_info, node);
		if (dir_ino < entry->ino)
			n = n->rb_left;
		else if (dir_ino > entry->ino)
			n = n->rb_right;
		else if (gen < entry->gen)
			n = n->rb_left;
		else if (gen > entry->gen)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Returns non-zero if an orphan_dir_info exists for @dir_ino/@gen, i.e. the
 * directory is recorded as waiting for its rmdir.
 */
static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
{
	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);

	return odi != NULL;
}

/*
 * Remove @odi from sctx->orphan_dirs and free it. A NULL @odi is ignored.
 */
static void free_orphan_dir_info(struct send_ctx *sctx,
				 struct orphan_dir_info *odi)
{
	if (!odi)
		return;
	rb_erase(&odi->node, &sctx->orphan_dirs);
	kfree(odi);
}

/*
 * Returns 1 if a directory can be removed at this point in time.
 * We check this by iterating all dir items and checking if the inode behind
 * the dir item was already processed.
 *
 * Returns 0 if the rmdir has to wait; in that case an orphan_dir_info for
 * @dir records the dir index offset at which the check stopped, and a later
 * call resumes from there. Returns <0 on error.
 */
static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
		     u64 send_progress)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root = sctx->parent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key loc;
	struct btrfs_dir_item *di;
	struct orphan_dir_info *odi = NULL;

	/*
	 * Don't try to rmdir the top/root subvolume dir.
	 */
	if (dir == BTRFS_FIRST_FREE_OBJECTID)
		return 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = dir;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	/* Resume from where a previous, unsuccessful check stopped. */
	odi = get_orphan_dir_info(sctx, dir, dir_gen);
	if (odi)
		key.offset = odi->last_dir_index_offset;

	/* The dir index items are walked in the parent root. */
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct waiting_dir_move *dm;

		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;

		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
				struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);

		/*
		 * The child has a pending move. Remember on that move which
		 * directory waits for it, so that apply_dir_move() retries
		 * the rmdir afterwards.
		 */
		dm = get_waiting_dir_move(sctx, loc.objectid);
		if (dm) {
			odi = add_orphan_dir_info(sctx, dir, dir_gen);
			if (IS_ERR(odi)) {
				ret = PTR_ERR(odi);
				goto out;
			}
			odi->gen = dir_gen;
			odi->last_dir_index_offset = found_key.offset;
			dm->rmdir_ino = dir;
			dm->rmdir_gen = dir_gen;
			ret = 0;
			goto out;
		}

		/* The child has an inode number above send_progress. */
		if (loc.objectid > send_progress) {
			odi = add_orphan_dir_info(sctx, dir, dir_gen);
			if (IS_ERR(odi)) {
				ret = PTR_ERR(odi);
				goto out;
			}
			odi->gen = dir_gen;
			odi->last_dir_index_offset = found_key.offset;
			ret = 0;
			goto out;
		}
	}
	if (iter_ret < 0) {
		ret = iter_ret;
		goto out;
	}
	/* No entry blocks the rmdir, the bookkeeping is no longer needed. */
	free_orphan_dir_info(sctx, odi);

	ret = 1;

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Returns non-zero if @ino has an entry in sctx->waiting_dir_moves.
 */
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);

	return entry != NULL;
}

/*
 * Insert a waiting_dir_move for @ino into sctx->waiting_dir_moves, with no
 * rmdir attached (rmdir_ino/rmdir_gen are 0).
 * Returns 0 on success, -ENOMEM, or -EEXIST if @ino already has an entry.
 */
static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
	struct rb_node *parent = NULL;
	struct waiting_dir_move *entry, *dm;

	dm = kmalloc(sizeof(*dm), GFP_KERNEL);
	if (!dm)
		return -ENOMEM;
	dm->ino = ino;
	dm->rmdir_ino = 0;
	dm->rmdir_gen = 0;
	dm->orphanized = orphanized;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct waiting_dir_move, node);
		if (ino < entry->ino) {
			p = &(*p)->rb_left;
		} else if (ino > entry->ino) {
			p = &(*p)->rb_right;
		} else {
			/* Already present, drop the entry allocated above. */
			kfree(dm);
			return -EEXIST;
		}
	}

	rb_link_node(&dm->node, parent, p);
	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
	return 0;
}

/*
 * Find the waiting_dir_move for @ino. Returns NULL if there is none.
 */
static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
{
	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
	struct waiting_dir_move *entry;

	while (n) {
		entry = rb_entry(n, struct waiting_dir_move, node);
		if (ino < entry->ino)
			n = n->rb_left;
		else if (ino > entry->ino)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Remove @dm from sctx->waiting_dir_moves and free it. A NULL @dm is ignored.
 */
static void free_waiting_dir_move(struct send_ctx *sctx,
				  struct waiting_dir_move *dm)
{
	if (!dm)
		return;
	rb_erase(&dm->node, &sctx->waiting_dir_moves);
	kfree(dm);
}

/*
 * Record that the move of directory @ino/@ino_gen has to wait for
 * @parent_ino.
 *
 * sctx->pending_dir_moves is keyed by parent_ino. If an entry for
 * @parent_ino already exists, the new pending move is chained onto that
 * entry's list instead of being inserted into the tree. The dir/dir_gen of
 * all @deleted_refs and @new_refs are copied into the pending move's
 * update_refs list, and @ino is added to sctx->waiting_dir_moves.
 *
 * Returns 0 on success. On failure everything allocated here is freed and
 * the error is returned.
 */
static int add_pending_dir_move(struct send_ctx *sctx,
				u64 ino,
				u64 ino_gen,
				u64 parent_ino,
				struct list_head *new_refs,
				struct list_head *deleted_refs,
				const bool is_orphan)
{
	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
	struct rb_node *parent = NULL;
	struct pending_dir_move *entry = NULL, *pm;
	struct recorded_ref *cur;
	int exists = 0;
	int ret;

	pm = kmalloc(sizeof(*pm), GFP_KERNEL);
	if (!pm)
		return -ENOMEM;
	pm->parent_ino = parent_ino;
	pm->ino = ino;
	pm->gen = ino_gen;
	INIT_LIST_HEAD(&pm->list);
	INIT_LIST_HEAD(&pm->update_refs);
	RB_CLEAR_NODE(&pm->node);

	/* Locate an existing entry for parent_ino or the insertion point. */
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct pending_dir_move, node);
		if (parent_ino < entry->parent_ino) {
			p = &(*p)->rb_left;
		} else if (parent_ino > entry->parent_ino) {
			p = &(*p)->rb_right;
		} else {
			exists = 1;
			break;
		}
	}

	list_for_each_entry(cur, deleted_refs, list) {
		ret = dup_ref(cur, &pm->update_refs);
		if (ret < 0)
			goto out;
	}
	list_for_each_entry(cur, new_refs, list) {
		ret = dup_ref(cur, &pm->update_refs);
		if (ret < 0)
			goto out;
	}

	ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
	if (ret)
		goto out;

	if (exists) {
		/* 'entry' is the pending move already in the tree for parent_ino. */
		list_add_tail(&pm->list, &entry->list);
	} else {
		rb_link_node(&pm->node, parent, p);
		rb_insert_color(&pm->node, &sctx->pending_dir_moves);
	}
	ret = 0;
out:
	if (ret) {
		__free_recorded_refs(&pm->update_refs);
		kfree(pm);
	}
	return ret;
}

/*
 * Find the pending_dir_move stored in the tree for @parent_ino. Further
 * moves waiting for the same parent hang off the returned entry's list.
 * Returns NULL if there is none.
 */
static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
						      u64 parent_ino)
{
	struct rb_node *n = sctx->pending_dir_moves.rb_node;
	struct pending_dir_move *entry;

	while (n) {
		entry = rb_entry(n, struct pending_dir_move, node);
		if (parent_ino < entry->parent_ino)
			n = n->rb_left;
		else if (parent_ino > entry->parent_ino)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Walk from @ino/@gen towards the subvolume root and check whether the walk
 * leads back to @ino itself.
 *
 * @name is used as scratch space for the names looked up on the way.
 * *@ancestor_ino is set to the first inode on the walk that is waiting for a
 * move; if there is none and a loop is found, it is set to the inode whose
 * parent is the starting inode. It stays 0 otherwise.
 *
 * Returns 1 if a loop was detected, 0 if not and <0 on error.
 */
static int path_loop(struct send_ctx *sctx, struct fs_path *name,
		     u64 ino, u64 gen, u64 *ancestor_ino)
{
	int ret = 0;
	u64 parent_inode = 0;
	u64 parent_gen = 0;
	u64 start_ino = ino;

	*ancestor_ino = 0;
	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
		fs_path_reset(name);

		if (is_waiting_for_rm(sctx, ino, gen))
			break;
		if (is_waiting_for_move(sctx, ino)) {
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			ret = get_first_ref(sctx->parent_root, ino,
					    &parent_inode, &parent_gen, name);
		} else {
			ret = __get_cur_name_and_parent(sctx, ino, gen,
							&parent_inode,
							&parent_gen, name);
			/* An orphan name ends the walk without a loop. */
			if (ret > 0) {
				ret = 0;
				break;
			}
		}
		if (ret < 0)
			break;
		if (parent_inode == start_ino) {
			ret = 1;
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			break;
		}
		ino = parent_inode;
		gen = parent_gen;
	}
	return ret;
}

/*
 * Carry out the delayed move described by @pm.
 *
 * The waiting_dir_move of pm->ino is consumed. If the destination path would
 * contain a loop, the move is queued again behind the ancestor reported by
 * path_loop(), keeping any rmdir that was attached to it. Otherwise the
 * rename is sent, followed by the attached rmdir if can_rmdir() now allows
 * it, and by utimes updates for the moved inode and for the directories in
 * pm->update_refs.
 *
 * sctx->send_progress is changed while this runs and restored before
 * returning. @pm itself is not freed here.
 * Returns 0 on success or <0 on error.
 */
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
	struct fs_path *from_path = NULL;
	struct fs_path *to_path = NULL;
	struct fs_path *name = NULL;
	u64 orig_progress = sctx->send_progress;
	struct recorded_ref *cur;
	u64 parent_ino, parent_gen;
	struct waiting_dir_move *dm = NULL;
	u64 rmdir_ino = 0;
	u64 rmdir_gen;
	u64 ancestor;
	bool is_orphan;
	int ret;

	name = fs_path_alloc();
	from_path = fs_path_alloc();
	if (!name || !from_path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Take over what the waiting entry recorded, then drop the entry. */
	dm = get_waiting_dir_move(sctx, pm->ino);
	ASSERT(dm);
	rmdir_ino = dm->rmdir_ino;
	rmdir_gen = dm->rmdir_gen;
	is_orphan = dm->orphanized;
	free_waiting_dir_move(sctx, dm);

	/* Build the source path of the rename. */
	if (is_orphan) {
		ret = gen_unique_name(sctx, pm->ino,
				      pm->gen, from_path);
	} else {
		ret = get_first_ref(sctx->parent_root, pm->ino,
				    &parent_ino, &parent_gen, name);
		if (ret < 0)
			goto out;
		ret = get_cur_path(sctx, parent_ino, parent_gen,
				   from_path);
		if (ret < 0)
			goto out;
		ret = fs_path_add_path(from_path, name);
	}
	if (ret < 0)
		goto out;

	sctx->send_progress = sctx->cur_ino + 1;
	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
	if (ret < 0)
		goto out;
	if (ret) {
		/* Loop detected: delay the move again, now behind 'ancestor'. */
		LIST_HEAD(deleted_refs);
		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
					   &pm->update_refs, &deleted_refs,
					   is_orphan);
		if (ret < 0)
			goto out;
		if (rmdir_ino) {
			/* Carry the attached rmdir over to the new waiting entry. */
			dm = get_waiting_dir_move(sctx, pm->ino);
			ASSERT(dm);
			dm->rmdir_ino = rmdir_ino;
			dm->rmdir_gen = rmdir_gen;
		}
		goto out;
	}
	/* Reuse 'name' as the destination path; it is freed as to_path. */
	fs_path_reset(name);
	to_path = name;
	name = NULL;
	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
	if (ret < 0)
		goto out;

	ret = send_rename(sctx, from_path, to_path);
	if (ret < 0)
		goto out;

	if (rmdir_ino) {
		struct orphan_dir_info *odi;
		u64 gen;

		odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
		if (!odi) {
			/* already deleted */
			goto finish;
		}
		gen = odi->gen;

		ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
		if (ret < 0)
			goto out;
		if (!ret)
			goto finish;

		name = fs_path_alloc();
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		ret = get_cur_path(sctx, rmdir_ino, gen, name);
		if (ret < 0)
			goto out;
		ret = send_rmdir(sctx, name);
		if (ret < 0)
			goto out;
	}

finish:
	ret = send_utimes(sctx, pm->ino, pm->gen);
	if (ret < 0)
		goto out;

	/*
	 * After rename/move, need to update the utimes of both new parent(s)
	 * and old parent(s).
	 */
	list_for_each_entry(cur, &pm->update_refs, list) {
		/*
		 * The parent inode might have been deleted in the send snapshot
		 */
		ret = get_inode_info(sctx->send_root, cur->dir, NULL);
		if (ret == -ENOENT) {
			ret = 0;
			continue;
		}
		if (ret < 0)
			goto out;

		ret = send_utimes(sctx, cur->dir, cur->dir_gen);
		if (ret < 0)
			goto out;
	}

out:
	fs_path_free(name);
	fs_path_free(from_path);
	fs_path_free(to_path);
	sctx->send_progress = orig_progress;

	return ret;
}

/*
 * Unlink @m from whatever list and rb-tree it is still on and free it
 * together with its update_refs.
 */
static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
{
	if (!list_empty(&m->list))
		list_del(&m->list);
	if (!RB_EMPTY_NODE(&m->node))
		rb_erase(&m->node, &sctx->pending_dir_moves);
	__free_recorded_refs(&m->update_refs);
	kfree(m);
}

/*
 * Append @moves, followed by all pending moves chained on its list, to the
 * tail of @stack, and take @moves out of sctx->pending_dir_moves.
 */
static void tail_append_pending_moves(struct send_ctx *sctx,
				      struct pending_dir_move *moves,
				      struct list_head *stack)
{
	if (list_empty(&moves->list)) {
		list_add_tail(&moves->list, stack);
	} else {
		/*
		 * Detach the chained moves first so that 'moves' itself can
		 * be queued, then queue the chained ones behind it.
		 */
		LIST_HEAD(list);
		list_splice_init(&moves->list, &list);
		list_add_tail(&moves->list, stack);
		list_splice_tail(&list, stack);
	}
	if (!RB_EMPTY_NODE(&moves->node)) {
		rb_erase(&moves->node, &sctx->pending_dir_moves);
		RB_CLEAR_NODE(&moves->node);
	}
}

/*
 * Apply all directory moves that were waiting for sctx->cur_ino, and then,
 * transitively, the moves that were waiting for each directory moved this
 * way. Returns 0 on success; on error the moves still queued are freed and
 * the error is returned.
 */
static int apply_children_dir_moves(struct send_ctx *sctx)
{
	struct pending_dir_move *pm;
	struct list_head stack;
	u64 parent_ino = sctx->cur_ino;
	int ret = 0;

	pm = get_pending_dir_moves(sctx, parent_ino);
	if (!pm)
		return 0;

	INIT_LIST_HEAD(&stack);
	tail_append_pending_moves(sctx, pm, &stack);

	while (!list_empty(&stack)) {
		pm = list_first_entry(&stack, struct pending_dir_move, list);
		/* Save the inode number, pm is freed before it is needed again. */
		parent_ino = pm->ino;
		ret = apply_dir_move(sctx, pm);
		free_pending_move(sctx, pm);
		if (ret)
			goto out;
		/* Queue the moves that were waiting for the one just applied. */
		pm = get_pending_dir_moves(sctx, parent_ino);
		if (pm)
			tail_append_pending_moves(sctx, pm, &stack);
	}
	return 0;

out:
	while (!list_empty(&stack)) {
		pm = list_first_entry(&stack, struct pending_dir_move, list);
		free_pending_move(sctx, pm);
	}
	return ret;
}

/*
 * We might need to delay a directory rename even when no ancestor directory
 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
 * renamed. This happens when we rename a directory to the old name (the name
 * in the parent root) of some other unrelated directory that got its rename
 * delayed due to some ancestor with higher number that got renamed.
 *
 * Example:
 *
 * Parent snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 257)
 * |     |---- file                        (ino 260)
 * |
 * |---- b/                                (ino 258)
 * |---- c/                                (ino 259)
 *
 * Send snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 258)
 * |---- x/                                (ino 259)
 *       |---- y/                          (ino 257)
 *             |----- file                 (ino 260)
 *
 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
 * must issue is:
 *
 * 1 - rename 259 from 'c' to 'x'
 * 2 - rename 257 from 'a' to 'x/y'
 * 3 - rename 258 from 'b' to 'a'
 *
 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
 * be done right away and < 0 on error.
3734 */ 3735 static int wait_for_dest_dir_move(struct send_ctx *sctx, 3736 struct recorded_ref *parent_ref, 3737 const bool is_orphan) 3738 { 3739 struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; 3740 struct btrfs_path *path; 3741 struct btrfs_key key; 3742 struct btrfs_key di_key; 3743 struct btrfs_dir_item *di; 3744 u64 left_gen; 3745 u64 right_gen; 3746 int ret = 0; 3747 struct waiting_dir_move *wdm; 3748 3749 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) 3750 return 0; 3751 3752 path = alloc_path_for_send(); 3753 if (!path) 3754 return -ENOMEM; 3755 3756 key.objectid = parent_ref->dir; 3757 key.type = BTRFS_DIR_ITEM_KEY; 3758 key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); 3759 3760 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); 3761 if (ret < 0) { 3762 goto out; 3763 } else if (ret > 0) { 3764 ret = 0; 3765 goto out; 3766 } 3767 3768 di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, 3769 parent_ref->name_len); 3770 if (!di) { 3771 ret = 0; 3772 goto out; 3773 } 3774 /* 3775 * di_key.objectid has the number of the inode that has a dentry in the 3776 * parent directory with the same name that sctx->cur_ino is being 3777 * renamed to. We need to check if that inode is in the send root as 3778 * well and if it is currently marked as an inode with a pending rename, 3779 * if it is, we need to delay the rename of sctx->cur_ino as well, so 3780 * that it happens after that other inode is renamed. 
3781 */ 3782 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); 3783 if (di_key.type != BTRFS_INODE_ITEM_KEY) { 3784 ret = 0; 3785 goto out; 3786 } 3787 3788 ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); 3789 if (ret < 0) 3790 goto out; 3791 ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); 3792 if (ret < 0) { 3793 if (ret == -ENOENT) 3794 ret = 0; 3795 goto out; 3796 } 3797 3798 /* Different inode, no need to delay the rename of sctx->cur_ino */ 3799 if (right_gen != left_gen) { 3800 ret = 0; 3801 goto out; 3802 } 3803 3804 wdm = get_waiting_dir_move(sctx, di_key.objectid); 3805 if (wdm && !wdm->orphanized) { 3806 ret = add_pending_dir_move(sctx, 3807 sctx->cur_ino, 3808 sctx->cur_inode_gen, 3809 di_key.objectid, 3810 &sctx->new_refs, 3811 &sctx->deleted_refs, 3812 is_orphan); 3813 if (!ret) 3814 ret = 1; 3815 } 3816 out: 3817 btrfs_free_path(path); 3818 return ret; 3819 } 3820 3821 /* 3822 * Check if inode ino2, or any of its ancestors, is inode ino1. 3823 * Return 1 if true, 0 if false and < 0 on error. 3824 */ 3825 static int check_ino_in_path(struct btrfs_root *root, 3826 const u64 ino1, 3827 const u64 ino1_gen, 3828 const u64 ino2, 3829 const u64 ino2_gen, 3830 struct fs_path *fs_path) 3831 { 3832 u64 ino = ino2; 3833 3834 if (ino1 == ino2) 3835 return ino1_gen == ino2_gen; 3836 3837 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3838 u64 parent; 3839 u64 parent_gen; 3840 int ret; 3841 3842 fs_path_reset(fs_path); 3843 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); 3844 if (ret < 0) 3845 return ret; 3846 if (parent == ino1) 3847 return parent_gen == ino1_gen; 3848 ino = parent; 3849 } 3850 return 0; 3851 } 3852 3853 /* 3854 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any 3855 * possible path (in case ino2 is not a directory and has multiple hard links). 3856 * Return 1 if true, 0 if false and < 0 on error. 
3857 */ 3858 static int is_ancestor(struct btrfs_root *root, 3859 const u64 ino1, 3860 const u64 ino1_gen, 3861 const u64 ino2, 3862 struct fs_path *fs_path) 3863 { 3864 bool free_fs_path = false; 3865 int ret = 0; 3866 int iter_ret = 0; 3867 struct btrfs_path *path = NULL; 3868 struct btrfs_key key; 3869 3870 if (!fs_path) { 3871 fs_path = fs_path_alloc(); 3872 if (!fs_path) 3873 return -ENOMEM; 3874 free_fs_path = true; 3875 } 3876 3877 path = alloc_path_for_send(); 3878 if (!path) { 3879 ret = -ENOMEM; 3880 goto out; 3881 } 3882 3883 key.objectid = ino2; 3884 key.type = BTRFS_INODE_REF_KEY; 3885 key.offset = 0; 3886 3887 btrfs_for_each_slot(root, &key, &key, path, iter_ret) { 3888 struct extent_buffer *leaf = path->nodes[0]; 3889 int slot = path->slots[0]; 3890 u32 cur_offset = 0; 3891 u32 item_size; 3892 3893 if (key.objectid != ino2) 3894 break; 3895 if (key.type != BTRFS_INODE_REF_KEY && 3896 key.type != BTRFS_INODE_EXTREF_KEY) 3897 break; 3898 3899 item_size = btrfs_item_size(leaf, slot); 3900 while (cur_offset < item_size) { 3901 u64 parent; 3902 u64 parent_gen; 3903 3904 if (key.type == BTRFS_INODE_EXTREF_KEY) { 3905 unsigned long ptr; 3906 struct btrfs_inode_extref *extref; 3907 3908 ptr = btrfs_item_ptr_offset(leaf, slot); 3909 extref = (struct btrfs_inode_extref *) 3910 (ptr + cur_offset); 3911 parent = btrfs_inode_extref_parent(leaf, 3912 extref); 3913 cur_offset += sizeof(*extref); 3914 cur_offset += btrfs_inode_extref_name_len(leaf, 3915 extref); 3916 } else { 3917 parent = key.offset; 3918 cur_offset = item_size; 3919 } 3920 3921 ret = get_inode_gen(root, parent, &parent_gen); 3922 if (ret < 0) 3923 goto out; 3924 ret = check_ino_in_path(root, ino1, ino1_gen, 3925 parent, parent_gen, fs_path); 3926 if (ret) 3927 goto out; 3928 } 3929 } 3930 ret = 0; 3931 if (iter_ret < 0) 3932 ret = iter_ret; 3933 3934 out: 3935 btrfs_free_path(path); 3936 if (free_fs_path) 3937 fs_path_free(fs_path); 3938 return ret; 3939 } 3940 3941 static int 
wait_for_parent_move(struct send_ctx *sctx, 3942 struct recorded_ref *parent_ref, 3943 const bool is_orphan) 3944 { 3945 int ret = 0; 3946 u64 ino = parent_ref->dir; 3947 u64 ino_gen = parent_ref->dir_gen; 3948 u64 parent_ino_before, parent_ino_after; 3949 struct fs_path *path_before = NULL; 3950 struct fs_path *path_after = NULL; 3951 int len1, len2; 3952 3953 path_after = fs_path_alloc(); 3954 path_before = fs_path_alloc(); 3955 if (!path_after || !path_before) { 3956 ret = -ENOMEM; 3957 goto out; 3958 } 3959 3960 /* 3961 * Our current directory inode may not yet be renamed/moved because some 3962 * ancestor (immediate or not) has to be renamed/moved first. So find if 3963 * such ancestor exists and make sure our own rename/move happens after 3964 * that ancestor is processed to avoid path build infinite loops (done 3965 * at get_cur_path()). 3966 */ 3967 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3968 u64 parent_ino_after_gen; 3969 3970 if (is_waiting_for_move(sctx, ino)) { 3971 /* 3972 * If the current inode is an ancestor of ino in the 3973 * parent root, we need to delay the rename of the 3974 * current inode, otherwise don't delayed the rename 3975 * because we can end up with a circular dependency 3976 * of renames, resulting in some directories never 3977 * getting the respective rename operations issued in 3978 * the send stream or getting into infinite path build 3979 * loops. 
3980 */ 3981 ret = is_ancestor(sctx->parent_root, 3982 sctx->cur_ino, sctx->cur_inode_gen, 3983 ino, path_before); 3984 if (ret) 3985 break; 3986 } 3987 3988 fs_path_reset(path_before); 3989 fs_path_reset(path_after); 3990 3991 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3992 &parent_ino_after_gen, path_after); 3993 if (ret < 0) 3994 goto out; 3995 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, 3996 NULL, path_before); 3997 if (ret < 0 && ret != -ENOENT) { 3998 goto out; 3999 } else if (ret == -ENOENT) { 4000 ret = 0; 4001 break; 4002 } 4003 4004 len1 = fs_path_len(path_before); 4005 len2 = fs_path_len(path_after); 4006 if (ino > sctx->cur_ino && 4007 (parent_ino_before != parent_ino_after || len1 != len2 || 4008 memcmp(path_before->start, path_after->start, len1))) { 4009 u64 parent_ino_gen; 4010 4011 ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); 4012 if (ret < 0) 4013 goto out; 4014 if (ino_gen == parent_ino_gen) { 4015 ret = 1; 4016 break; 4017 } 4018 } 4019 ino = parent_ino_after; 4020 ino_gen = parent_ino_after_gen; 4021 } 4022 4023 out: 4024 fs_path_free(path_before); 4025 fs_path_free(path_after); 4026 4027 if (ret == 1) { 4028 ret = add_pending_dir_move(sctx, 4029 sctx->cur_ino, 4030 sctx->cur_inode_gen, 4031 ino, 4032 &sctx->new_refs, 4033 &sctx->deleted_refs, 4034 is_orphan); 4035 if (!ret) 4036 ret = 1; 4037 } 4038 4039 return ret; 4040 } 4041 4042 static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 4043 { 4044 int ret; 4045 struct fs_path *new_path; 4046 4047 /* 4048 * Our reference's name member points to its full_path member string, so 4049 * we use here a new path. 
4050 */ 4051 new_path = fs_path_alloc(); 4052 if (!new_path) 4053 return -ENOMEM; 4054 4055 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); 4056 if (ret < 0) { 4057 fs_path_free(new_path); 4058 return ret; 4059 } 4060 ret = fs_path_add(new_path, ref->name, ref->name_len); 4061 if (ret < 0) { 4062 fs_path_free(new_path); 4063 return ret; 4064 } 4065 4066 fs_path_free(ref->full_path); 4067 set_ref_path(ref, new_path); 4068 4069 return 0; 4070 } 4071 4072 /* 4073 * When processing the new references for an inode we may orphanize an existing 4074 * directory inode because its old name conflicts with one of the new references 4075 * of the current inode. Later, when processing another new reference of our 4076 * inode, we might need to orphanize another inode, but the path we have in the 4077 * reference reflects the pre-orphanization name of the directory we previously 4078 * orphanized. For example: 4079 * 4080 * parent snapshot looks like: 4081 * 4082 * . (ino 256) 4083 * |----- f1 (ino 257) 4084 * |----- f2 (ino 258) 4085 * |----- d1/ (ino 259) 4086 * |----- d2/ (ino 260) 4087 * 4088 * send snapshot looks like: 4089 * 4090 * . (ino 256) 4091 * |----- d1 (ino 258) 4092 * |----- f2/ (ino 259) 4093 * |----- f2_link/ (ino 260) 4094 * | |----- f1 (ino 257) 4095 * | 4096 * |----- d2 (ino 258) 4097 * 4098 * When processing inode 257 we compute the name for inode 259 as "d1", and we 4099 * cache it in the name cache. Later when we start processing inode 258, when 4100 * collecting all its new references we set a full path of "d1/d2" for its new 4101 * reference with name "d2". When we start processing the new references we 4102 * start by processing the new reference with name "d1", and this results in 4103 * orphanizing inode 259, since its old reference causes a conflict. 
Then we 4104 * move on the next new reference, with name "d2", and we find out we must 4105 * orphanize inode 260, as its old reference conflicts with ours - but for the 4106 * orphanization we use a source path corresponding to the path we stored in the 4107 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the 4108 * receiver fail since the path component "d1/" no longer exists, it was renamed 4109 * to "o259-6-0/" when processing the previous new reference. So in this case we 4110 * must recompute the path in the new reference and use it for the new 4111 * orphanization operation. 4112 */ 4113 static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 4114 { 4115 char *name; 4116 int ret; 4117 4118 name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); 4119 if (!name) 4120 return -ENOMEM; 4121 4122 fs_path_reset(ref->full_path); 4123 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); 4124 if (ret < 0) 4125 goto out; 4126 4127 ret = fs_path_add(ref->full_path, name, ref->name_len); 4128 if (ret < 0) 4129 goto out; 4130 4131 /* Update the reference's base name pointer. */ 4132 set_ref_path(ref, ref->full_path); 4133 out: 4134 kfree(name); 4135 return ret; 4136 } 4137 4138 /* 4139 * This does all the move/link/unlink/rmdir magic. 
4140 */ 4141 static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) 4142 { 4143 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 4144 int ret = 0; 4145 struct recorded_ref *cur; 4146 struct recorded_ref *cur2; 4147 struct list_head check_dirs; 4148 struct fs_path *valid_path = NULL; 4149 u64 ow_inode = 0; 4150 u64 ow_gen; 4151 u64 ow_mode; 4152 int did_overwrite = 0; 4153 int is_orphan = 0; 4154 u64 last_dir_ino_rm = 0; 4155 bool can_rename = true; 4156 bool orphanized_dir = false; 4157 bool orphanized_ancestor = false; 4158 4159 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); 4160 4161 /* 4162 * This should never happen as the root dir always has the same ref 4163 * which is always '..' 4164 */ 4165 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 4166 INIT_LIST_HEAD(&check_dirs); 4167 4168 valid_path = fs_path_alloc(); 4169 if (!valid_path) { 4170 ret = -ENOMEM; 4171 goto out; 4172 } 4173 4174 /* 4175 * First, check if the first ref of the current inode was overwritten 4176 * before. If yes, we know that the current inode was already orphanized 4177 * and thus use the orphan name. If not, we can use get_cur_path to 4178 * get the path of the first ref as it would like while receiving at 4179 * this point in time. 4180 * New inodes are always orphan at the beginning, so force to use the 4181 * orphan name in this case. 4182 * The first ref is stored in valid_path and will be updated if it 4183 * gets moved around. 
4184 */ 4185 if (!sctx->cur_inode_new) { 4186 ret = did_overwrite_first_ref(sctx, sctx->cur_ino, 4187 sctx->cur_inode_gen); 4188 if (ret < 0) 4189 goto out; 4190 if (ret) 4191 did_overwrite = 1; 4192 } 4193 if (sctx->cur_inode_new || did_overwrite) { 4194 ret = gen_unique_name(sctx, sctx->cur_ino, 4195 sctx->cur_inode_gen, valid_path); 4196 if (ret < 0) 4197 goto out; 4198 is_orphan = 1; 4199 } else { 4200 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 4201 valid_path); 4202 if (ret < 0) 4203 goto out; 4204 } 4205 4206 /* 4207 * Before doing any rename and link operations, do a first pass on the 4208 * new references to orphanize any unprocessed inodes that may have a 4209 * reference that conflicts with one of the new references of the current 4210 * inode. This needs to happen first because a new reference may conflict 4211 * with the old reference of a parent directory, so we must make sure 4212 * that the path used for link and rename commands don't use an 4213 * orphanized name when an ancestor was not yet orphanized. 4214 * 4215 * Example: 4216 * 4217 * Parent snapshot: 4218 * 4219 * . (ino 256) 4220 * |----- testdir/ (ino 259) 4221 * | |----- a (ino 257) 4222 * | 4223 * |----- b (ino 258) 4224 * 4225 * Send snapshot: 4226 * 4227 * . (ino 256) 4228 * |----- testdir_2/ (ino 259) 4229 * | |----- a (ino 260) 4230 * | 4231 * |----- testdir (ino 257) 4232 * |----- b (ino 257) 4233 * |----- b2 (ino 258) 4234 * 4235 * Processing the new reference for inode 257 with name "b" may happen 4236 * before processing the new reference with name "testdir". If so, we 4237 * must make sure that by the time we send a link command to create the 4238 * hard link "b", inode 259 was already orphanized, since the generated 4239 * path in "valid_path" already contains the orphanized name for 259. 4240 * We are processing inode 257, so only later when processing 259 we do 4241 * the rename operation to change its temporary (orphanized) name to 4242 * "testdir_2". 
4243 */ 4244 list_for_each_entry(cur, &sctx->new_refs, list) { 4245 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 4246 if (ret < 0) 4247 goto out; 4248 if (ret == inode_state_will_create) 4249 continue; 4250 4251 /* 4252 * Check if this new ref would overwrite the first ref of another 4253 * unprocessed inode. If yes, orphanize the overwritten inode. 4254 * If we find an overwritten ref that is not the first ref, 4255 * simply unlink it. 4256 */ 4257 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, 4258 cur->name, cur->name_len, 4259 &ow_inode, &ow_gen, &ow_mode); 4260 if (ret < 0) 4261 goto out; 4262 if (ret) { 4263 ret = is_first_ref(sctx->parent_root, 4264 ow_inode, cur->dir, cur->name, 4265 cur->name_len); 4266 if (ret < 0) 4267 goto out; 4268 if (ret) { 4269 struct name_cache_entry *nce; 4270 struct waiting_dir_move *wdm; 4271 4272 if (orphanized_dir) { 4273 ret = refresh_ref_path(sctx, cur); 4274 if (ret < 0) 4275 goto out; 4276 } 4277 4278 ret = orphanize_inode(sctx, ow_inode, ow_gen, 4279 cur->full_path); 4280 if (ret < 0) 4281 goto out; 4282 if (S_ISDIR(ow_mode)) 4283 orphanized_dir = true; 4284 4285 /* 4286 * If ow_inode has its rename operation delayed 4287 * make sure that its orphanized name is used in 4288 * the source path when performing its rename 4289 * operation. 4290 */ 4291 if (is_waiting_for_move(sctx, ow_inode)) { 4292 wdm = get_waiting_dir_move(sctx, 4293 ow_inode); 4294 ASSERT(wdm); 4295 wdm->orphanized = true; 4296 } 4297 4298 /* 4299 * Make sure we clear our orphanized inode's 4300 * name from the name cache. This is because the 4301 * inode ow_inode might be an ancestor of some 4302 * other inode that will be orphanized as well 4303 * later and has an inode number greater than 4304 * sctx->send_progress. We need to prevent 4305 * future name lookups from using the old name 4306 * and get instead the orphan name. 
4307 */ 4308 nce = name_cache_search(sctx, ow_inode, ow_gen); 4309 if (nce) { 4310 name_cache_delete(sctx, nce); 4311 kfree(nce); 4312 } 4313 4314 /* 4315 * ow_inode might currently be an ancestor of 4316 * cur_ino, therefore compute valid_path (the 4317 * current path of cur_ino) again because it 4318 * might contain the pre-orphanization name of 4319 * ow_inode, which is no longer valid. 4320 */ 4321 ret = is_ancestor(sctx->parent_root, 4322 ow_inode, ow_gen, 4323 sctx->cur_ino, NULL); 4324 if (ret > 0) { 4325 orphanized_ancestor = true; 4326 fs_path_reset(valid_path); 4327 ret = get_cur_path(sctx, sctx->cur_ino, 4328 sctx->cur_inode_gen, 4329 valid_path); 4330 } 4331 if (ret < 0) 4332 goto out; 4333 } else { 4334 /* 4335 * If we previously orphanized a directory that 4336 * collided with a new reference that we already 4337 * processed, recompute the current path because 4338 * that directory may be part of the path. 4339 */ 4340 if (orphanized_dir) { 4341 ret = refresh_ref_path(sctx, cur); 4342 if (ret < 0) 4343 goto out; 4344 } 4345 ret = send_unlink(sctx, cur->full_path); 4346 if (ret < 0) 4347 goto out; 4348 } 4349 } 4350 4351 } 4352 4353 list_for_each_entry(cur, &sctx->new_refs, list) { 4354 /* 4355 * We may have refs where the parent directory does not exist 4356 * yet. This happens if the parent directories inum is higher 4357 * than the current inum. To handle this case, we create the 4358 * parent directory out of order. But we need to check if this 4359 * did already happen before due to other refs in the same dir. 4360 */ 4361 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 4362 if (ret < 0) 4363 goto out; 4364 if (ret == inode_state_will_create) { 4365 ret = 0; 4366 /* 4367 * First check if any of the current inodes refs did 4368 * already create the dir. 
4369 */ 4370 list_for_each_entry(cur2, &sctx->new_refs, list) { 4371 if (cur == cur2) 4372 break; 4373 if (cur2->dir == cur->dir) { 4374 ret = 1; 4375 break; 4376 } 4377 } 4378 4379 /* 4380 * If that did not happen, check if a previous inode 4381 * did already create the dir. 4382 */ 4383 if (!ret) 4384 ret = did_create_dir(sctx, cur->dir); 4385 if (ret < 0) 4386 goto out; 4387 if (!ret) { 4388 ret = send_create_inode(sctx, cur->dir); 4389 if (ret < 0) 4390 goto out; 4391 } 4392 } 4393 4394 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { 4395 ret = wait_for_dest_dir_move(sctx, cur, is_orphan); 4396 if (ret < 0) 4397 goto out; 4398 if (ret == 1) { 4399 can_rename = false; 4400 *pending_move = 1; 4401 } 4402 } 4403 4404 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && 4405 can_rename) { 4406 ret = wait_for_parent_move(sctx, cur, is_orphan); 4407 if (ret < 0) 4408 goto out; 4409 if (ret == 1) { 4410 can_rename = false; 4411 *pending_move = 1; 4412 } 4413 } 4414 4415 /* 4416 * link/move the ref to the new place. If we have an orphan 4417 * inode, move it and update valid_path. If not, link or move 4418 * it depending on the inode mode. 4419 */ 4420 if (is_orphan && can_rename) { 4421 ret = send_rename(sctx, valid_path, cur->full_path); 4422 if (ret < 0) 4423 goto out; 4424 is_orphan = 0; 4425 ret = fs_path_copy(valid_path, cur->full_path); 4426 if (ret < 0) 4427 goto out; 4428 } else if (can_rename) { 4429 if (S_ISDIR(sctx->cur_inode_mode)) { 4430 /* 4431 * Dirs can't be linked, so move it. For moved 4432 * dirs, we always have one new and one deleted 4433 * ref. The deleted ref is ignored later. 
4434 */ 4435 ret = send_rename(sctx, valid_path, 4436 cur->full_path); 4437 if (!ret) 4438 ret = fs_path_copy(valid_path, 4439 cur->full_path); 4440 if (ret < 0) 4441 goto out; 4442 } else { 4443 /* 4444 * We might have previously orphanized an inode 4445 * which is an ancestor of our current inode, 4446 * so our reference's full path, which was 4447 * computed before any such orphanizations, must 4448 * be updated. 4449 */ 4450 if (orphanized_dir) { 4451 ret = update_ref_path(sctx, cur); 4452 if (ret < 0) 4453 goto out; 4454 } 4455 ret = send_link(sctx, cur->full_path, 4456 valid_path); 4457 if (ret < 0) 4458 goto out; 4459 } 4460 } 4461 ret = dup_ref(cur, &check_dirs); 4462 if (ret < 0) 4463 goto out; 4464 } 4465 4466 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { 4467 /* 4468 * Check if we can already rmdir the directory. If not, 4469 * orphanize it. For every dir item inside that gets deleted 4470 * later, we do this check again and rmdir it then if possible. 4471 * See the use of check_dirs for more details. 4472 */ 4473 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, 4474 sctx->cur_ino); 4475 if (ret < 0) 4476 goto out; 4477 if (ret) { 4478 ret = send_rmdir(sctx, valid_path); 4479 if (ret < 0) 4480 goto out; 4481 } else if (!is_orphan) { 4482 ret = orphanize_inode(sctx, sctx->cur_ino, 4483 sctx->cur_inode_gen, valid_path); 4484 if (ret < 0) 4485 goto out; 4486 is_orphan = 1; 4487 } 4488 4489 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4490 ret = dup_ref(cur, &check_dirs); 4491 if (ret < 0) 4492 goto out; 4493 } 4494 } else if (S_ISDIR(sctx->cur_inode_mode) && 4495 !list_empty(&sctx->deleted_refs)) { 4496 /* 4497 * We have a moved dir. Add the old parent to check_dirs 4498 */ 4499 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, 4500 list); 4501 ret = dup_ref(cur, &check_dirs); 4502 if (ret < 0) 4503 goto out; 4504 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 4505 /* 4506 * We have a non dir inode. 
Go through all deleted refs and 4507 * unlink them if they were not already overwritten by other 4508 * inodes. 4509 */ 4510 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4511 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, 4512 sctx->cur_ino, sctx->cur_inode_gen, 4513 cur->name, cur->name_len); 4514 if (ret < 0) 4515 goto out; 4516 if (!ret) { 4517 /* 4518 * If we orphanized any ancestor before, we need 4519 * to recompute the full path for deleted names, 4520 * since any such path was computed before we 4521 * processed any references and orphanized any 4522 * ancestor inode. 4523 */ 4524 if (orphanized_ancestor) { 4525 ret = update_ref_path(sctx, cur); 4526 if (ret < 0) 4527 goto out; 4528 } 4529 ret = send_unlink(sctx, cur->full_path); 4530 if (ret < 0) 4531 goto out; 4532 } 4533 ret = dup_ref(cur, &check_dirs); 4534 if (ret < 0) 4535 goto out; 4536 } 4537 /* 4538 * If the inode is still orphan, unlink the orphan. This may 4539 * happen when a previous inode did overwrite the first ref 4540 * of this inode and no new refs were added for the current 4541 * inode. Unlinking does not mean that the inode is deleted in 4542 * all cases. There may still be links to this inode in other 4543 * places. 4544 */ 4545 if (is_orphan) { 4546 ret = send_unlink(sctx, valid_path); 4547 if (ret < 0) 4548 goto out; 4549 } 4550 } 4551 4552 /* 4553 * We did collect all parent dirs where cur_inode was once located. We 4554 * now go through all these dirs and check if they are pending for 4555 * deletion and if it's finally possible to perform the rmdir now. 4556 * We also update the inode stats of the parent dirs here. 4557 */ 4558 list_for_each_entry(cur, &check_dirs, list) { 4559 /* 4560 * In case we had refs into dirs that were not processed yet, 4561 * we don't need to do the utime and rmdir logic for these dirs. 4562 * The dir will be processed later. 
4563 */ 4564 if (cur->dir > sctx->cur_ino) 4565 continue; 4566 4567 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 4568 if (ret < 0) 4569 goto out; 4570 4571 if (ret == inode_state_did_create || 4572 ret == inode_state_no_change) { 4573 /* TODO delayed utimes */ 4574 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 4575 if (ret < 0) 4576 goto out; 4577 } else if (ret == inode_state_did_delete && 4578 cur->dir != last_dir_ino_rm) { 4579 ret = can_rmdir(sctx, cur->dir, cur->dir_gen, 4580 sctx->cur_ino); 4581 if (ret < 0) 4582 goto out; 4583 if (ret) { 4584 ret = get_cur_path(sctx, cur->dir, 4585 cur->dir_gen, valid_path); 4586 if (ret < 0) 4587 goto out; 4588 ret = send_rmdir(sctx, valid_path); 4589 if (ret < 0) 4590 goto out; 4591 last_dir_ino_rm = cur->dir; 4592 } 4593 } 4594 } 4595 4596 ret = 0; 4597 4598 out: 4599 __free_recorded_refs(&check_dirs); 4600 free_recorded_refs(sctx); 4601 fs_path_free(valid_path); 4602 return ret; 4603 } 4604 4605 static int rbtree_ref_comp(const void *k, const struct rb_node *node) 4606 { 4607 const struct recorded_ref *data = k; 4608 const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); 4609 int result; 4610 4611 if (data->dir > ref->dir) 4612 return 1; 4613 if (data->dir < ref->dir) 4614 return -1; 4615 if (data->dir_gen > ref->dir_gen) 4616 return 1; 4617 if (data->dir_gen < ref->dir_gen) 4618 return -1; 4619 if (data->name_len > ref->name_len) 4620 return 1; 4621 if (data->name_len < ref->name_len) 4622 return -1; 4623 result = strcmp(data->name, ref->name); 4624 if (result > 0) 4625 return 1; 4626 if (result < 0) 4627 return -1; 4628 return 0; 4629 } 4630 4631 static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) 4632 { 4633 const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); 4634 4635 return rbtree_ref_comp(entry, parent) < 0; 4636 } 4637 4638 static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, 4639 struct fs_path *name, 
u64 dir, u64 dir_gen, 4640 struct send_ctx *sctx) 4641 { 4642 int ret = 0; 4643 struct fs_path *path = NULL; 4644 struct recorded_ref *ref = NULL; 4645 4646 path = fs_path_alloc(); 4647 if (!path) { 4648 ret = -ENOMEM; 4649 goto out; 4650 } 4651 4652 ref = recorded_ref_alloc(); 4653 if (!ref) { 4654 ret = -ENOMEM; 4655 goto out; 4656 } 4657 4658 ret = get_cur_path(sctx, dir, dir_gen, path); 4659 if (ret < 0) 4660 goto out; 4661 ret = fs_path_add_path(path, name); 4662 if (ret < 0) 4663 goto out; 4664 4665 ref->dir = dir; 4666 ref->dir_gen = dir_gen; 4667 set_ref_path(ref, path); 4668 list_add_tail(&ref->list, refs); 4669 rb_add(&ref->node, root, rbtree_ref_less); 4670 ref->root = root; 4671 out: 4672 if (ret) { 4673 if (path && (!ref || !ref->full_path)) 4674 fs_path_free(path); 4675 recorded_ref_free(ref); 4676 } 4677 return ret; 4678 } 4679 4680 static int record_new_ref_if_needed(int num, u64 dir, int index, 4681 struct fs_path *name, void *ctx) 4682 { 4683 int ret = 0; 4684 struct send_ctx *sctx = ctx; 4685 struct rb_node *node = NULL; 4686 struct recorded_ref data; 4687 struct recorded_ref *ref; 4688 u64 dir_gen; 4689 4690 ret = get_inode_gen(sctx->send_root, dir, &dir_gen); 4691 if (ret < 0) 4692 goto out; 4693 4694 data.dir = dir; 4695 data.dir_gen = dir_gen; 4696 set_ref_path(&data, name); 4697 node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); 4698 if (node) { 4699 ref = rb_entry(node, struct recorded_ref, node); 4700 recorded_ref_free(ref); 4701 } else { 4702 ret = record_ref_in_tree(&sctx->rbtree_new_refs, 4703 &sctx->new_refs, name, dir, dir_gen, 4704 sctx); 4705 } 4706 out: 4707 return ret; 4708 } 4709 4710 static int record_deleted_ref_if_needed(int num, u64 dir, int index, 4711 struct fs_path *name, void *ctx) 4712 { 4713 int ret = 0; 4714 struct send_ctx *sctx = ctx; 4715 struct rb_node *node = NULL; 4716 struct recorded_ref data; 4717 struct recorded_ref *ref; 4718 u64 dir_gen; 4719 4720 ret = get_inode_gen(sctx->parent_root, dir, 
&dir_gen); 4721 if (ret < 0) 4722 goto out; 4723 4724 data.dir = dir; 4725 data.dir_gen = dir_gen; 4726 set_ref_path(&data, name); 4727 node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); 4728 if (node) { 4729 ref = rb_entry(node, struct recorded_ref, node); 4730 recorded_ref_free(ref); 4731 } else { 4732 ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, 4733 &sctx->deleted_refs, name, dir, 4734 dir_gen, sctx); 4735 } 4736 out: 4737 return ret; 4738 } 4739 4740 static int record_new_ref(struct send_ctx *sctx) 4741 { 4742 int ret; 4743 4744 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4745 sctx->cmp_key, 0, record_new_ref_if_needed, sctx); 4746 if (ret < 0) 4747 goto out; 4748 ret = 0; 4749 4750 out: 4751 return ret; 4752 } 4753 4754 static int record_deleted_ref(struct send_ctx *sctx) 4755 { 4756 int ret; 4757 4758 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4759 sctx->cmp_key, 0, record_deleted_ref_if_needed, 4760 sctx); 4761 if (ret < 0) 4762 goto out; 4763 ret = 0; 4764 4765 out: 4766 return ret; 4767 } 4768 4769 static int record_changed_ref(struct send_ctx *sctx) 4770 { 4771 int ret = 0; 4772 4773 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4774 sctx->cmp_key, 0, record_new_ref_if_needed, sctx); 4775 if (ret < 0) 4776 goto out; 4777 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4778 sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); 4779 if (ret < 0) 4780 goto out; 4781 ret = 0; 4782 4783 out: 4784 return ret; 4785 } 4786 4787 /* 4788 * Record and process all refs at once. Needed when an inode changes the 4789 * generation number, which means that it was deleted and recreated. 
4790 */ 4791 static int process_all_refs(struct send_ctx *sctx, 4792 enum btrfs_compare_tree_result cmd) 4793 { 4794 int ret = 0; 4795 int iter_ret = 0; 4796 struct btrfs_root *root; 4797 struct btrfs_path *path; 4798 struct btrfs_key key; 4799 struct btrfs_key found_key; 4800 iterate_inode_ref_t cb; 4801 int pending_move = 0; 4802 4803 path = alloc_path_for_send(); 4804 if (!path) 4805 return -ENOMEM; 4806 4807 if (cmd == BTRFS_COMPARE_TREE_NEW) { 4808 root = sctx->send_root; 4809 cb = record_new_ref_if_needed; 4810 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { 4811 root = sctx->parent_root; 4812 cb = record_deleted_ref_if_needed; 4813 } else { 4814 btrfs_err(sctx->send_root->fs_info, 4815 "Wrong command %d in process_all_refs", cmd); 4816 ret = -EINVAL; 4817 goto out; 4818 } 4819 4820 key.objectid = sctx->cmp_key->objectid; 4821 key.type = BTRFS_INODE_REF_KEY; 4822 key.offset = 0; 4823 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 4824 if (found_key.objectid != key.objectid || 4825 (found_key.type != BTRFS_INODE_REF_KEY && 4826 found_key.type != BTRFS_INODE_EXTREF_KEY)) 4827 break; 4828 4829 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 4830 if (ret < 0) 4831 goto out; 4832 } 4833 /* Catch error found during iteration */ 4834 if (iter_ret < 0) { 4835 ret = iter_ret; 4836 goto out; 4837 } 4838 btrfs_release_path(path); 4839 4840 /* 4841 * We don't actually care about pending_move as we are simply 4842 * re-creating this inode and will be rename'ing it into place once we 4843 * rename the parent directory. 
4844 */ 4845 ret = process_recorded_refs(sctx, &pending_move); 4846 out: 4847 btrfs_free_path(path); 4848 return ret; 4849 } 4850 4851 static int send_set_xattr(struct send_ctx *sctx, 4852 struct fs_path *path, 4853 const char *name, int name_len, 4854 const char *data, int data_len) 4855 { 4856 int ret = 0; 4857 4858 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); 4859 if (ret < 0) 4860 goto out; 4861 4862 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 4863 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); 4864 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); 4865 4866 ret = send_cmd(sctx); 4867 4868 tlv_put_failure: 4869 out: 4870 return ret; 4871 } 4872 4873 static int send_remove_xattr(struct send_ctx *sctx, 4874 struct fs_path *path, 4875 const char *name, int name_len) 4876 { 4877 int ret = 0; 4878 4879 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); 4880 if (ret < 0) 4881 goto out; 4882 4883 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 4884 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); 4885 4886 ret = send_cmd(sctx); 4887 4888 tlv_put_failure: 4889 out: 4890 return ret; 4891 } 4892 4893 static int __process_new_xattr(int num, struct btrfs_key *di_key, 4894 const char *name, int name_len, const char *data, 4895 int data_len, void *ctx) 4896 { 4897 int ret; 4898 struct send_ctx *sctx = ctx; 4899 struct fs_path *p; 4900 struct posix_acl_xattr_header dummy_acl; 4901 4902 /* Capabilities are emitted by finish_inode_if_needed */ 4903 if (!strncmp(name, XATTR_NAME_CAPS, name_len)) 4904 return 0; 4905 4906 p = fs_path_alloc(); 4907 if (!p) 4908 return -ENOMEM; 4909 4910 /* 4911 * This hack is needed because empty acls are stored as zero byte 4912 * data in xattrs. Problem with that is, that receiving these zero byte 4913 * acls will fail later. To fix this, we send a dummy acl list that 4914 * only contains the version number and no entries. 
4915 */ 4916 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || 4917 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { 4918 if (data_len == 0) { 4919 dummy_acl.a_version = 4920 cpu_to_le32(POSIX_ACL_XATTR_VERSION); 4921 data = (char *)&dummy_acl; 4922 data_len = sizeof(dummy_acl); 4923 } 4924 } 4925 4926 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 4927 if (ret < 0) 4928 goto out; 4929 4930 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 4931 4932 out: 4933 fs_path_free(p); 4934 return ret; 4935 } 4936 4937 static int __process_deleted_xattr(int num, struct btrfs_key *di_key, 4938 const char *name, int name_len, 4939 const char *data, int data_len, void *ctx) 4940 { 4941 int ret; 4942 struct send_ctx *sctx = ctx; 4943 struct fs_path *p; 4944 4945 p = fs_path_alloc(); 4946 if (!p) 4947 return -ENOMEM; 4948 4949 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 4950 if (ret < 0) 4951 goto out; 4952 4953 ret = send_remove_xattr(sctx, p, name, name_len); 4954 4955 out: 4956 fs_path_free(p); 4957 return ret; 4958 } 4959 4960 static int process_new_xattr(struct send_ctx *sctx) 4961 { 4962 int ret = 0; 4963 4964 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4965 __process_new_xattr, sctx); 4966 4967 return ret; 4968 } 4969 4970 static int process_deleted_xattr(struct send_ctx *sctx) 4971 { 4972 return iterate_dir_item(sctx->parent_root, sctx->right_path, 4973 __process_deleted_xattr, sctx); 4974 } 4975 4976 struct find_xattr_ctx { 4977 const char *name; 4978 int name_len; 4979 int found_idx; 4980 char *found_data; 4981 int found_data_len; 4982 }; 4983 4984 static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, 4985 int name_len, const char *data, int data_len, void *vctx) 4986 { 4987 struct find_xattr_ctx *ctx = vctx; 4988 4989 if (name_len == ctx->name_len && 4990 strncmp(name, ctx->name, name_len) == 0) { 4991 ctx->found_idx = num; 4992 ctx->found_data_len = data_len; 
4993 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); 4994 if (!ctx->found_data) 4995 return -ENOMEM; 4996 return 1; 4997 } 4998 return 0; 4999 } 5000 5001 static int find_xattr(struct btrfs_root *root, 5002 struct btrfs_path *path, 5003 struct btrfs_key *key, 5004 const char *name, int name_len, 5005 char **data, int *data_len) 5006 { 5007 int ret; 5008 struct find_xattr_ctx ctx; 5009 5010 ctx.name = name; 5011 ctx.name_len = name_len; 5012 ctx.found_idx = -1; 5013 ctx.found_data = NULL; 5014 ctx.found_data_len = 0; 5015 5016 ret = iterate_dir_item(root, path, __find_xattr, &ctx); 5017 if (ret < 0) 5018 return ret; 5019 5020 if (ctx.found_idx == -1) 5021 return -ENOENT; 5022 if (data) { 5023 *data = ctx.found_data; 5024 *data_len = ctx.found_data_len; 5025 } else { 5026 kfree(ctx.found_data); 5027 } 5028 return ctx.found_idx; 5029 } 5030 5031 5032 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, 5033 const char *name, int name_len, 5034 const char *data, int data_len, 5035 void *ctx) 5036 { 5037 int ret; 5038 struct send_ctx *sctx = ctx; 5039 char *found_data = NULL; 5040 int found_data_len = 0; 5041 5042 ret = find_xattr(sctx->parent_root, sctx->right_path, 5043 sctx->cmp_key, name, name_len, &found_data, 5044 &found_data_len); 5045 if (ret == -ENOENT) { 5046 ret = __process_new_xattr(num, di_key, name, name_len, data, 5047 data_len, ctx); 5048 } else if (ret >= 0) { 5049 if (data_len != found_data_len || 5050 memcmp(data, found_data, data_len)) { 5051 ret = __process_new_xattr(num, di_key, name, name_len, 5052 data, data_len, ctx); 5053 } else { 5054 ret = 0; 5055 } 5056 } 5057 5058 kfree(found_data); 5059 return ret; 5060 } 5061 5062 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, 5063 const char *name, int name_len, 5064 const char *data, int data_len, 5065 void *ctx) 5066 { 5067 int ret; 5068 struct send_ctx *sctx = ctx; 5069 5070 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, 
5071 name, name_len, NULL, NULL); 5072 if (ret == -ENOENT) 5073 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 5074 data_len, ctx); 5075 else if (ret >= 0) 5076 ret = 0; 5077 5078 return ret; 5079 } 5080 5081 static int process_changed_xattr(struct send_ctx *sctx) 5082 { 5083 int ret = 0; 5084 5085 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 5086 __process_changed_new_xattr, sctx); 5087 if (ret < 0) 5088 goto out; 5089 ret = iterate_dir_item(sctx->parent_root, sctx->right_path, 5090 __process_changed_deleted_xattr, sctx); 5091 5092 out: 5093 return ret; 5094 } 5095 5096 static int process_all_new_xattrs(struct send_ctx *sctx) 5097 { 5098 int ret = 0; 5099 int iter_ret = 0; 5100 struct btrfs_root *root; 5101 struct btrfs_path *path; 5102 struct btrfs_key key; 5103 struct btrfs_key found_key; 5104 5105 path = alloc_path_for_send(); 5106 if (!path) 5107 return -ENOMEM; 5108 5109 root = sctx->send_root; 5110 5111 key.objectid = sctx->cmp_key->objectid; 5112 key.type = BTRFS_XATTR_ITEM_KEY; 5113 key.offset = 0; 5114 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 5115 if (found_key.objectid != key.objectid || 5116 found_key.type != key.type) { 5117 ret = 0; 5118 break; 5119 } 5120 5121 ret = iterate_dir_item(root, path, __process_new_xattr, sctx); 5122 if (ret < 0) 5123 break; 5124 } 5125 /* Catch error found during iteration */ 5126 if (iter_ret < 0) 5127 ret = iter_ret; 5128 5129 btrfs_free_path(path); 5130 return ret; 5131 } 5132 5133 static int send_verity(struct send_ctx *sctx, struct fs_path *path, 5134 struct fsverity_descriptor *desc) 5135 { 5136 int ret; 5137 5138 ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); 5139 if (ret < 0) 5140 goto out; 5141 5142 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 5143 TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, 5144 le8_to_cpu(desc->hash_algorithm)); 5145 TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, 5146 1U << le8_to_cpu(desc->log_blocksize)); 5147 TLV_PUT(sctx, 
BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, 5148 le8_to_cpu(desc->salt_size)); 5149 TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, 5150 le32_to_cpu(desc->sig_size)); 5151 5152 ret = send_cmd(sctx); 5153 5154 tlv_put_failure: 5155 out: 5156 return ret; 5157 } 5158 5159 static int process_verity(struct send_ctx *sctx) 5160 { 5161 int ret = 0; 5162 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 5163 struct inode *inode; 5164 struct fs_path *p; 5165 5166 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); 5167 if (IS_ERR(inode)) 5168 return PTR_ERR(inode); 5169 5170 ret = btrfs_get_verity_descriptor(inode, NULL, 0); 5171 if (ret < 0) 5172 goto iput; 5173 5174 if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { 5175 ret = -EMSGSIZE; 5176 goto iput; 5177 } 5178 if (!sctx->verity_descriptor) { 5179 sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, 5180 GFP_KERNEL); 5181 if (!sctx->verity_descriptor) { 5182 ret = -ENOMEM; 5183 goto iput; 5184 } 5185 } 5186 5187 ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); 5188 if (ret < 0) 5189 goto iput; 5190 5191 p = fs_path_alloc(); 5192 if (!p) { 5193 ret = -ENOMEM; 5194 goto iput; 5195 } 5196 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5197 if (ret < 0) 5198 goto free_path; 5199 5200 ret = send_verity(sctx, p, sctx->verity_descriptor); 5201 if (ret < 0) 5202 goto free_path; 5203 5204 free_path: 5205 fs_path_free(p); 5206 iput: 5207 iput(inode); 5208 return ret; 5209 } 5210 5211 static inline u64 max_send_read_size(const struct send_ctx *sctx) 5212 { 5213 return sctx->send_max_size - SZ_16K; 5214 } 5215 5216 static int put_data_header(struct send_ctx *sctx, u32 len) 5217 { 5218 if (WARN_ON_ONCE(sctx->put_data)) 5219 return -EINVAL; 5220 sctx->put_data = true; 5221 if (sctx->proto >= 2) { 5222 /* 5223 * Since v2, the data attribute header doesn't include a length, 5224 * it is implicitly to the end of the command. 
5225 */ 5226 if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) 5227 return -EOVERFLOW; 5228 put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); 5229 sctx->send_size += sizeof(__le16); 5230 } else { 5231 struct btrfs_tlv_header *hdr; 5232 5233 if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) 5234 return -EOVERFLOW; 5235 hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); 5236 put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); 5237 put_unaligned_le16(len, &hdr->tlv_len); 5238 sctx->send_size += sizeof(*hdr); 5239 } 5240 return 0; 5241 } 5242 5243 static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) 5244 { 5245 struct btrfs_root *root = sctx->send_root; 5246 struct btrfs_fs_info *fs_info = root->fs_info; 5247 struct page *page; 5248 pgoff_t index = offset >> PAGE_SHIFT; 5249 pgoff_t last_index; 5250 unsigned pg_offset = offset_in_page(offset); 5251 int ret; 5252 5253 ret = put_data_header(sctx, len); 5254 if (ret) 5255 return ret; 5256 5257 last_index = (offset + len - 1) >> PAGE_SHIFT; 5258 5259 while (index <= last_index) { 5260 unsigned cur_len = min_t(unsigned, len, 5261 PAGE_SIZE - pg_offset); 5262 5263 page = find_lock_page(sctx->cur_inode->i_mapping, index); 5264 if (!page) { 5265 page_cache_sync_readahead(sctx->cur_inode->i_mapping, 5266 &sctx->ra, NULL, index, 5267 last_index + 1 - index); 5268 5269 page = find_or_create_page(sctx->cur_inode->i_mapping, 5270 index, GFP_KERNEL); 5271 if (!page) { 5272 ret = -ENOMEM; 5273 break; 5274 } 5275 } 5276 5277 if (PageReadahead(page)) 5278 page_cache_async_readahead(sctx->cur_inode->i_mapping, 5279 &sctx->ra, NULL, page_folio(page), 5280 index, last_index + 1 - index); 5281 5282 if (!PageUptodate(page)) { 5283 btrfs_read_folio(NULL, page_folio(page)); 5284 lock_page(page); 5285 if (!PageUptodate(page)) { 5286 unlock_page(page); 5287 btrfs_err(fs_info, 5288 "send: IO error at offset %llu for inode %llu root %llu", 5289 
page_offset(page), sctx->cur_ino, 5290 sctx->send_root->root_key.objectid); 5291 put_page(page); 5292 ret = -EIO; 5293 break; 5294 } 5295 } 5296 5297 memcpy_from_page(sctx->send_buf + sctx->send_size, page, 5298 pg_offset, cur_len); 5299 unlock_page(page); 5300 put_page(page); 5301 index++; 5302 pg_offset = 0; 5303 len -= cur_len; 5304 sctx->send_size += cur_len; 5305 } 5306 5307 return ret; 5308 } 5309 5310 /* 5311 * Read some bytes from the current inode/file and send a write command to 5312 * user space. 5313 */ 5314 static int send_write(struct send_ctx *sctx, u64 offset, u32 len) 5315 { 5316 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 5317 int ret = 0; 5318 struct fs_path *p; 5319 5320 p = fs_path_alloc(); 5321 if (!p) 5322 return -ENOMEM; 5323 5324 btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); 5325 5326 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5327 if (ret < 0) 5328 goto out; 5329 5330 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5331 if (ret < 0) 5332 goto out; 5333 5334 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5335 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5336 ret = put_file_data(sctx, offset, len); 5337 if (ret < 0) 5338 goto out; 5339 5340 ret = send_cmd(sctx); 5341 5342 tlv_put_failure: 5343 out: 5344 fs_path_free(p); 5345 return ret; 5346 } 5347 5348 /* 5349 * Send a clone command to user space. 
5350 */ 5351 static int send_clone(struct send_ctx *sctx, 5352 u64 offset, u32 len, 5353 struct clone_root *clone_root) 5354 { 5355 int ret = 0; 5356 struct fs_path *p; 5357 u64 gen; 5358 5359 btrfs_debug(sctx->send_root->fs_info, 5360 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", 5361 offset, len, clone_root->root->root_key.objectid, 5362 clone_root->ino, clone_root->offset); 5363 5364 p = fs_path_alloc(); 5365 if (!p) 5366 return -ENOMEM; 5367 5368 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); 5369 if (ret < 0) 5370 goto out; 5371 5372 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5373 if (ret < 0) 5374 goto out; 5375 5376 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5377 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 5378 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5379 5380 if (clone_root->root == sctx->send_root) { 5381 ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); 5382 if (ret < 0) 5383 goto out; 5384 ret = get_cur_path(sctx, clone_root->ino, gen, p); 5385 } else { 5386 ret = get_inode_path(clone_root->root, clone_root->ino, p); 5387 } 5388 if (ret < 0) 5389 goto out; 5390 5391 /* 5392 * If the parent we're using has a received_uuid set then use that as 5393 * our clone source as that is what we will look for when doing a 5394 * receive. 5395 * 5396 * This covers the case that we create a snapshot off of a received 5397 * subvolume and then use that as the parent and try to receive on a 5398 * different host. 
5399 */ 5400 if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) 5401 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5402 clone_root->root->root_item.received_uuid); 5403 else 5404 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5405 clone_root->root->root_item.uuid); 5406 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 5407 btrfs_root_ctransid(&clone_root->root->root_item)); 5408 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 5409 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 5410 clone_root->offset); 5411 5412 ret = send_cmd(sctx); 5413 5414 tlv_put_failure: 5415 out: 5416 fs_path_free(p); 5417 return ret; 5418 } 5419 5420 /* 5421 * Send an update extent command to user space. 5422 */ 5423 static int send_update_extent(struct send_ctx *sctx, 5424 u64 offset, u32 len) 5425 { 5426 int ret = 0; 5427 struct fs_path *p; 5428 5429 p = fs_path_alloc(); 5430 if (!p) 5431 return -ENOMEM; 5432 5433 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); 5434 if (ret < 0) 5435 goto out; 5436 5437 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5438 if (ret < 0) 5439 goto out; 5440 5441 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5442 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5443 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); 5444 5445 ret = send_cmd(sctx); 5446 5447 tlv_put_failure: 5448 out: 5449 fs_path_free(p); 5450 return ret; 5451 } 5452 5453 static int send_hole(struct send_ctx *sctx, u64 end) 5454 { 5455 struct fs_path *p = NULL; 5456 u64 read_size = max_send_read_size(sctx); 5457 u64 offset = sctx->cur_inode_last_extent; 5458 int ret = 0; 5459 5460 /* 5461 * A hole that starts at EOF or beyond it. Since we do not yet support 5462 * fallocate (for extent preallocation and hole punching), sending a 5463 * write of zeroes starting at EOF or beyond would later require issuing 5464 * a truncate operation which would undo the write and achieve nothing. 
5465 */ 5466 if (offset >= sctx->cur_inode_size) 5467 return 0; 5468 5469 /* 5470 * Don't go beyond the inode's i_size due to prealloc extents that start 5471 * after the i_size. 5472 */ 5473 end = min_t(u64, end, sctx->cur_inode_size); 5474 5475 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5476 return send_update_extent(sctx, offset, end - offset); 5477 5478 p = fs_path_alloc(); 5479 if (!p) 5480 return -ENOMEM; 5481 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5482 if (ret < 0) 5483 goto tlv_put_failure; 5484 while (offset < end) { 5485 u64 len = min(end - offset, read_size); 5486 5487 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5488 if (ret < 0) 5489 break; 5490 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5491 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5492 ret = put_data_header(sctx, len); 5493 if (ret < 0) 5494 break; 5495 memset(sctx->send_buf + sctx->send_size, 0, len); 5496 sctx->send_size += len; 5497 ret = send_cmd(sctx); 5498 if (ret < 0) 5499 break; 5500 offset += len; 5501 } 5502 sctx->cur_inode_next_write_offset = offset; 5503 tlv_put_failure: 5504 fs_path_free(p); 5505 return ret; 5506 } 5507 5508 static int send_encoded_inline_extent(struct send_ctx *sctx, 5509 struct btrfs_path *path, u64 offset, 5510 u64 len) 5511 { 5512 struct btrfs_root *root = sctx->send_root; 5513 struct btrfs_fs_info *fs_info = root->fs_info; 5514 struct inode *inode; 5515 struct fs_path *fspath; 5516 struct extent_buffer *leaf = path->nodes[0]; 5517 struct btrfs_key key; 5518 struct btrfs_file_extent_item *ei; 5519 u64 ram_bytes; 5520 size_t inline_size; 5521 int ret; 5522 5523 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); 5524 if (IS_ERR(inode)) 5525 return PTR_ERR(inode); 5526 5527 fspath = fs_path_alloc(); 5528 if (!fspath) { 5529 ret = -ENOMEM; 5530 goto out; 5531 } 5532 5533 ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); 5534 if (ret < 0) 5535 goto out; 5536 5537 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 
fspath); 5538 if (ret < 0) 5539 goto out; 5540 5541 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5542 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 5543 ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); 5544 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); 5545 5546 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); 5547 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5548 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, 5549 min(key.offset + ram_bytes - offset, len)); 5550 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); 5551 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); 5552 ret = btrfs_encoded_io_compression_from_extent(fs_info, 5553 btrfs_file_extent_compression(leaf, ei)); 5554 if (ret < 0) 5555 goto out; 5556 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); 5557 5558 ret = put_data_header(sctx, inline_size); 5559 if (ret < 0) 5560 goto out; 5561 read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, 5562 btrfs_file_extent_inline_start(ei), inline_size); 5563 sctx->send_size += inline_size; 5564 5565 ret = send_cmd(sctx); 5566 5567 tlv_put_failure: 5568 out: 5569 fs_path_free(fspath); 5570 iput(inode); 5571 return ret; 5572 } 5573 5574 static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, 5575 u64 offset, u64 len) 5576 { 5577 struct btrfs_root *root = sctx->send_root; 5578 struct btrfs_fs_info *fs_info = root->fs_info; 5579 struct inode *inode; 5580 struct fs_path *fspath; 5581 struct extent_buffer *leaf = path->nodes[0]; 5582 struct btrfs_key key; 5583 struct btrfs_file_extent_item *ei; 5584 u64 disk_bytenr, disk_num_bytes; 5585 u32 data_offset; 5586 struct btrfs_cmd_header *hdr; 5587 u32 crc; 5588 int ret; 5589 5590 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); 5591 if (IS_ERR(inode)) 5592 return PTR_ERR(inode); 5593 5594 fspath = fs_path_alloc(); 5595 if (!fspath) { 5596 ret = -ENOMEM; 5597 goto out; 5598 } 5599 5600 ret = 
begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); 5601 if (ret < 0) 5602 goto out; 5603 5604 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); 5605 if (ret < 0) 5606 goto out; 5607 5608 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5609 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 5610 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 5611 disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); 5612 5613 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); 5614 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5615 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, 5616 min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, 5617 len)); 5618 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, 5619 btrfs_file_extent_ram_bytes(leaf, ei)); 5620 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, 5621 offset - key.offset + btrfs_file_extent_offset(leaf, ei)); 5622 ret = btrfs_encoded_io_compression_from_extent(fs_info, 5623 btrfs_file_extent_compression(leaf, ei)); 5624 if (ret < 0) 5625 goto out; 5626 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); 5627 TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); 5628 5629 ret = put_data_header(sctx, disk_num_bytes); 5630 if (ret < 0) 5631 goto out; 5632 5633 /* 5634 * We want to do I/O directly into the send buffer, so get the next page 5635 * boundary in the send buffer. This means that there may be a gap 5636 * between the beginning of the command and the file data. 5637 */ 5638 data_offset = ALIGN(sctx->send_size, PAGE_SIZE); 5639 if (data_offset > sctx->send_max_size || 5640 sctx->send_max_size - data_offset < disk_num_bytes) { 5641 ret = -EOVERFLOW; 5642 goto out; 5643 } 5644 5645 /* 5646 * Note that send_buf is a mapping of send_buf_pages, so this is really 5647 * reading into send_buf. 
5648 */ 5649 ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, 5650 disk_bytenr, disk_num_bytes, 5651 sctx->send_buf_pages + 5652 (data_offset >> PAGE_SHIFT)); 5653 if (ret) 5654 goto out; 5655 5656 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 5657 hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); 5658 hdr->crc = 0; 5659 crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); 5660 crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); 5661 hdr->crc = cpu_to_le32(crc); 5662 5663 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 5664 &sctx->send_off); 5665 if (!ret) { 5666 ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, 5667 disk_num_bytes, &sctx->send_off); 5668 } 5669 sctx->send_size = 0; 5670 sctx->put_data = false; 5671 5672 tlv_put_failure: 5673 out: 5674 fs_path_free(fspath); 5675 iput(inode); 5676 return ret; 5677 } 5678 5679 static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, 5680 const u64 offset, const u64 len) 5681 { 5682 const u64 end = offset + len; 5683 struct extent_buffer *leaf = path->nodes[0]; 5684 struct btrfs_file_extent_item *ei; 5685 u64 read_size = max_send_read_size(sctx); 5686 u64 sent = 0; 5687 5688 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5689 return send_update_extent(sctx, offset, len); 5690 5691 ei = btrfs_item_ptr(leaf, path->slots[0], 5692 struct btrfs_file_extent_item); 5693 if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && 5694 btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { 5695 bool is_inline = (btrfs_file_extent_type(leaf, ei) == 5696 BTRFS_FILE_EXTENT_INLINE); 5697 5698 /* 5699 * Send the compressed extent unless the compressed data is 5700 * larger than the decompressed data. This can happen if we're 5701 * not sending the entire extent, either because it has been 5702 * partially overwritten/truncated or because this is a part of 5703 * the extent that we couldn't clone in clone_range(). 
5704 */ 5705 if (is_inline && 5706 btrfs_file_extent_inline_item_len(leaf, 5707 path->slots[0]) <= len) { 5708 return send_encoded_inline_extent(sctx, path, offset, 5709 len); 5710 } else if (!is_inline && 5711 btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { 5712 return send_encoded_extent(sctx, path, offset, len); 5713 } 5714 } 5715 5716 if (sctx->cur_inode == NULL) { 5717 struct btrfs_root *root = sctx->send_root; 5718 5719 sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); 5720 if (IS_ERR(sctx->cur_inode)) { 5721 int err = PTR_ERR(sctx->cur_inode); 5722 5723 sctx->cur_inode = NULL; 5724 return err; 5725 } 5726 memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 5727 file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); 5728 5729 /* 5730 * It's very likely there are no pages from this inode in the page 5731 * cache, so after reading extents and sending their data, we clean 5732 * the page cache to avoid trashing the page cache (adding pressure 5733 * to the page cache and forcing eviction of other data more useful 5734 * for applications). 5735 * 5736 * We decide if we should clean the page cache simply by checking 5737 * if the inode's mapping nrpages is 0 when we first open it, and 5738 * not by using something like filemap_range_has_page() before 5739 * reading an extent because when we ask the readahead code to 5740 * read a given file range, it may (and almost always does) read 5741 * pages from beyond that range (see the documentation for 5742 * page_cache_sync_readahead()), so it would not be reliable, 5743 * because after reading the first extent future calls to 5744 * filemap_range_has_page() would return true because the readahead 5745 * on the previous extent resulted in reading pages of the current 5746 * extent as well. 
5747 */ 5748 sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); 5749 sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); 5750 } 5751 5752 while (sent < len) { 5753 u64 size = min(len - sent, read_size); 5754 int ret; 5755 5756 ret = send_write(sctx, offset + sent, size); 5757 if (ret < 0) 5758 return ret; 5759 sent += size; 5760 } 5761 5762 if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { 5763 /* 5764 * Always operate only on ranges that are a multiple of the page 5765 * size. This is not only to prevent zeroing parts of a page in 5766 * the case of subpage sector size, but also to guarantee we evict 5767 * pages, as passing a range that is smaller than page size does 5768 * not evict the respective page (only zeroes part of its content). 5769 * 5770 * Always start from the end offset of the last range cleared. 5771 * This is because the readahead code may (and very often does) 5772 * reads pages beyond the range we request for readahead. So if 5773 * we have an extent layout like this: 5774 * 5775 * [ extent A ] [ extent B ] [ extent C ] 5776 * 5777 * When we ask page_cache_sync_readahead() to read extent A, it 5778 * may also trigger reads for pages of extent B. If we are doing 5779 * an incremental send and extent B has not changed between the 5780 * parent and send snapshots, some or all of its pages may end 5781 * up being read and placed in the page cache. So when truncating 5782 * the page cache we always start from the end offset of the 5783 * previously processed extent up to the end of the current 5784 * extent. 5785 */ 5786 truncate_inode_pages_range(&sctx->cur_inode->i_data, 5787 sctx->page_cache_clear_start, 5788 end - 1); 5789 sctx->page_cache_clear_start = end; 5790 } 5791 5792 return 0; 5793 } 5794 5795 /* 5796 * Search for a capability xattr related to sctx->cur_ino. If the capability is 5797 * found, call send_set_xattr function to emit it. 
5798 * 5799 * Return 0 if there isn't a capability, or when the capability was emitted 5800 * successfully, or < 0 if an error occurred. 5801 */ 5802 static int send_capabilities(struct send_ctx *sctx) 5803 { 5804 struct fs_path *fspath = NULL; 5805 struct btrfs_path *path; 5806 struct btrfs_dir_item *di; 5807 struct extent_buffer *leaf; 5808 unsigned long data_ptr; 5809 char *buf = NULL; 5810 int buf_len; 5811 int ret = 0; 5812 5813 path = alloc_path_for_send(); 5814 if (!path) 5815 return -ENOMEM; 5816 5817 di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, 5818 XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); 5819 if (!di) { 5820 /* There is no xattr for this inode */ 5821 goto out; 5822 } else if (IS_ERR(di)) { 5823 ret = PTR_ERR(di); 5824 goto out; 5825 } 5826 5827 leaf = path->nodes[0]; 5828 buf_len = btrfs_dir_data_len(leaf, di); 5829 5830 fspath = fs_path_alloc(); 5831 buf = kmalloc(buf_len, GFP_KERNEL); 5832 if (!fspath || !buf) { 5833 ret = -ENOMEM; 5834 goto out; 5835 } 5836 5837 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); 5838 if (ret < 0) 5839 goto out; 5840 5841 data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); 5842 read_extent_buffer(leaf, buf, data_ptr, buf_len); 5843 5844 ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, 5845 strlen(XATTR_NAME_CAPS), buf, buf_len); 5846 out: 5847 kfree(buf); 5848 fs_path_free(fspath); 5849 btrfs_free_path(path); 5850 return ret; 5851 } 5852 5853 static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, 5854 struct clone_root *clone_root, const u64 disk_byte, 5855 u64 data_offset, u64 offset, u64 len) 5856 { 5857 struct btrfs_path *path; 5858 struct btrfs_key key; 5859 int ret; 5860 struct btrfs_inode_info info; 5861 u64 clone_src_i_size = 0; 5862 5863 /* 5864 * Prevent cloning from a zero offset with a length matching the sector 5865 * size because in some scenarios this will make the receiver fail. 
5866 * 5867 * For example, if in the source filesystem the extent at offset 0 5868 * has a length of sectorsize and it was written using direct IO, then 5869 * it can never be an inline extent (even if compression is enabled). 5870 * Then this extent can be cloned in the original filesystem to a non 5871 * zero file offset, but it may not be possible to clone in the 5872 * destination filesystem because it can be inlined due to compression 5873 * on the destination filesystem (as the receiver's write operations are 5874 * always done using buffered IO). The same happens when the original 5875 * filesystem does not have compression enabled but the destination 5876 * filesystem has. 5877 */ 5878 if (clone_root->offset == 0 && 5879 len == sctx->send_root->fs_info->sectorsize) 5880 return send_extent_data(sctx, dst_path, offset, len); 5881 5882 path = alloc_path_for_send(); 5883 if (!path) 5884 return -ENOMEM; 5885 5886 /* 5887 * There are inodes that have extents that lie behind its i_size. Don't 5888 * accept clones from these extents. 5889 */ 5890 ret = get_inode_info(clone_root->root, clone_root->ino, &info); 5891 btrfs_release_path(path); 5892 if (ret < 0) 5893 goto out; 5894 clone_src_i_size = info.size; 5895 5896 /* 5897 * We can't send a clone operation for the entire range if we find 5898 * extent items in the respective range in the source file that 5899 * refer to different extents or if we find holes. 5900 * So check for that and do a mix of clone and regular write/copy 5901 * operations if needed. 
5902 * 5903 * Example: 5904 * 5905 * mkfs.btrfs -f /dev/sda 5906 * mount /dev/sda /mnt 5907 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo 5908 * cp --reflink=always /mnt/foo /mnt/bar 5909 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo 5910 * btrfs subvolume snapshot -r /mnt /mnt/snap 5911 * 5912 * If when we send the snapshot and we are processing file bar (which 5913 * has a higher inode number than foo) we blindly send a clone operation 5914 * for the [0, 100K[ range from foo to bar, the receiver ends up getting 5915 * a file bar that matches the content of file foo - iow, doesn't match 5916 * the content from bar in the original filesystem. 5917 */ 5918 key.objectid = clone_root->ino; 5919 key.type = BTRFS_EXTENT_DATA_KEY; 5920 key.offset = clone_root->offset; 5921 ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); 5922 if (ret < 0) 5923 goto out; 5924 if (ret > 0 && path->slots[0] > 0) { 5925 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 5926 if (key.objectid == clone_root->ino && 5927 key.type == BTRFS_EXTENT_DATA_KEY) 5928 path->slots[0]--; 5929 } 5930 5931 while (true) { 5932 struct extent_buffer *leaf = path->nodes[0]; 5933 int slot = path->slots[0]; 5934 struct btrfs_file_extent_item *ei; 5935 u8 type; 5936 u64 ext_len; 5937 u64 clone_len; 5938 u64 clone_data_offset; 5939 bool crossed_src_i_size = false; 5940 5941 if (slot >= btrfs_header_nritems(leaf)) { 5942 ret = btrfs_next_leaf(clone_root->root, path); 5943 if (ret < 0) 5944 goto out; 5945 else if (ret > 0) 5946 break; 5947 continue; 5948 } 5949 5950 btrfs_item_key_to_cpu(leaf, &key, slot); 5951 5952 /* 5953 * We might have an implicit trailing hole (NO_HOLES feature 5954 * enabled). We deal with it after leaving this loop. 
5955 */ 5956 if (key.objectid != clone_root->ino || 5957 key.type != BTRFS_EXTENT_DATA_KEY) 5958 break; 5959 5960 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5961 type = btrfs_file_extent_type(leaf, ei); 5962 if (type == BTRFS_FILE_EXTENT_INLINE) { 5963 ext_len = btrfs_file_extent_ram_bytes(leaf, ei); 5964 ext_len = PAGE_ALIGN(ext_len); 5965 } else { 5966 ext_len = btrfs_file_extent_num_bytes(leaf, ei); 5967 } 5968 5969 if (key.offset + ext_len <= clone_root->offset) 5970 goto next; 5971 5972 if (key.offset > clone_root->offset) { 5973 /* Implicit hole, NO_HOLES feature enabled. */ 5974 u64 hole_len = key.offset - clone_root->offset; 5975 5976 if (hole_len > len) 5977 hole_len = len; 5978 ret = send_extent_data(sctx, dst_path, offset, 5979 hole_len); 5980 if (ret < 0) 5981 goto out; 5982 5983 len -= hole_len; 5984 if (len == 0) 5985 break; 5986 offset += hole_len; 5987 clone_root->offset += hole_len; 5988 data_offset += hole_len; 5989 } 5990 5991 if (key.offset >= clone_root->offset + len) 5992 break; 5993 5994 if (key.offset >= clone_src_i_size) 5995 break; 5996 5997 if (key.offset + ext_len > clone_src_i_size) { 5998 ext_len = clone_src_i_size - key.offset; 5999 crossed_src_i_size = true; 6000 } 6001 6002 clone_data_offset = btrfs_file_extent_offset(leaf, ei); 6003 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { 6004 clone_root->offset = key.offset; 6005 if (clone_data_offset < data_offset && 6006 clone_data_offset + ext_len > data_offset) { 6007 u64 extent_offset; 6008 6009 extent_offset = data_offset - clone_data_offset; 6010 ext_len -= extent_offset; 6011 clone_data_offset += extent_offset; 6012 clone_root->offset += extent_offset; 6013 } 6014 } 6015 6016 clone_len = min_t(u64, ext_len, len); 6017 6018 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && 6019 clone_data_offset == data_offset) { 6020 const u64 src_end = clone_root->offset + clone_len; 6021 const u64 sectorsize = SZ_64K; 6022 6023 /* 6024 * We can't clone 
the last block, when its size is not 6025 * sector size aligned, into the middle of a file. If we 6026 * do so, the receiver will get a failure (-EINVAL) when 6027 * trying to clone or will silently corrupt the data in 6028 * the destination file if it's on a kernel without the 6029 * fix introduced by commit ac765f83f1397646 6030 * ("Btrfs: fix data corruption due to cloning of eof 6031 * block). 6032 * 6033 * So issue a clone of the aligned down range plus a 6034 * regular write for the eof block, if we hit that case. 6035 * 6036 * Also, we use the maximum possible sector size, 64K, 6037 * because we don't know what's the sector size of the 6038 * filesystem that receives the stream, so we have to 6039 * assume the largest possible sector size. 6040 */ 6041 if (src_end == clone_src_i_size && 6042 !IS_ALIGNED(src_end, sectorsize) && 6043 offset + clone_len < sctx->cur_inode_size) { 6044 u64 slen; 6045 6046 slen = ALIGN_DOWN(src_end - clone_root->offset, 6047 sectorsize); 6048 if (slen > 0) { 6049 ret = send_clone(sctx, offset, slen, 6050 clone_root); 6051 if (ret < 0) 6052 goto out; 6053 } 6054 ret = send_extent_data(sctx, dst_path, 6055 offset + slen, 6056 clone_len - slen); 6057 } else { 6058 ret = send_clone(sctx, offset, clone_len, 6059 clone_root); 6060 } 6061 } else if (crossed_src_i_size && clone_len < len) { 6062 /* 6063 * If we are at i_size of the clone source inode and we 6064 * can not clone from it, terminate the loop. This is 6065 * to avoid sending two write operations, one with a 6066 * length matching clone_len and the final one after 6067 * this loop with a length of len - clone_len. 6068 * 6069 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED 6070 * was passed to the send ioctl), this helps avoid 6071 * sending an encoded write for an offset that is not 6072 * sector size aligned, in case the i_size of the source 6073 * inode is not sector size aligned. 
That will make the 6074 * receiver fallback to decompression of the data and 6075 * writing it using regular buffered IO, therefore while 6076 * not incorrect, it's not optimal due decompression and 6077 * possible re-compression at the receiver. 6078 */ 6079 break; 6080 } else { 6081 ret = send_extent_data(sctx, dst_path, offset, 6082 clone_len); 6083 } 6084 6085 if (ret < 0) 6086 goto out; 6087 6088 len -= clone_len; 6089 if (len == 0) 6090 break; 6091 offset += clone_len; 6092 clone_root->offset += clone_len; 6093 6094 /* 6095 * If we are cloning from the file we are currently processing, 6096 * and using the send root as the clone root, we must stop once 6097 * the current clone offset reaches the current eof of the file 6098 * at the receiver, otherwise we would issue an invalid clone 6099 * operation (source range going beyond eof) and cause the 6100 * receiver to fail. So if we reach the current eof, bail out 6101 * and fallback to a regular write. 6102 */ 6103 if (clone_root->root == sctx->send_root && 6104 clone_root->ino == sctx->cur_ino && 6105 clone_root->offset >= sctx->cur_inode_next_write_offset) 6106 break; 6107 6108 data_offset += clone_len; 6109 next: 6110 path->slots[0]++; 6111 } 6112 6113 if (len > 0) 6114 ret = send_extent_data(sctx, dst_path, offset, len); 6115 else 6116 ret = 0; 6117 out: 6118 btrfs_free_path(path); 6119 return ret; 6120 } 6121 6122 static int send_write_or_clone(struct send_ctx *sctx, 6123 struct btrfs_path *path, 6124 struct btrfs_key *key, 6125 struct clone_root *clone_root) 6126 { 6127 int ret = 0; 6128 u64 offset = key->offset; 6129 u64 end; 6130 u64 bs = sctx->send_root->fs_info->sb->s_blocksize; 6131 6132 end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); 6133 if (offset >= end) 6134 return 0; 6135 6136 if (clone_root && IS_ALIGNED(end, bs)) { 6137 struct btrfs_file_extent_item *ei; 6138 u64 disk_byte; 6139 u64 data_offset; 6140 6141 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 6142 struct 
btrfs_file_extent_item); 6143 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); 6144 data_offset = btrfs_file_extent_offset(path->nodes[0], ei); 6145 ret = clone_range(sctx, path, clone_root, disk_byte, 6146 data_offset, offset, end - offset); 6147 } else { 6148 ret = send_extent_data(sctx, path, offset, end - offset); 6149 } 6150 sctx->cur_inode_next_write_offset = end; 6151 return ret; 6152 } 6153 6154 static int is_extent_unchanged(struct send_ctx *sctx, 6155 struct btrfs_path *left_path, 6156 struct btrfs_key *ekey) 6157 { 6158 int ret = 0; 6159 struct btrfs_key key; 6160 struct btrfs_path *path = NULL; 6161 struct extent_buffer *eb; 6162 int slot; 6163 struct btrfs_key found_key; 6164 struct btrfs_file_extent_item *ei; 6165 u64 left_disknr; 6166 u64 right_disknr; 6167 u64 left_offset; 6168 u64 right_offset; 6169 u64 left_offset_fixed; 6170 u64 left_len; 6171 u64 right_len; 6172 u64 left_gen; 6173 u64 right_gen; 6174 u8 left_type; 6175 u8 right_type; 6176 6177 path = alloc_path_for_send(); 6178 if (!path) 6179 return -ENOMEM; 6180 6181 eb = left_path->nodes[0]; 6182 slot = left_path->slots[0]; 6183 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 6184 left_type = btrfs_file_extent_type(eb, ei); 6185 6186 if (left_type != BTRFS_FILE_EXTENT_REG) { 6187 ret = 0; 6188 goto out; 6189 } 6190 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 6191 left_len = btrfs_file_extent_num_bytes(eb, ei); 6192 left_offset = btrfs_file_extent_offset(eb, ei); 6193 left_gen = btrfs_file_extent_generation(eb, ei); 6194 6195 /* 6196 * Following comments will refer to these graphics. L is the left 6197 * extents which we are checking at the moment. 1-8 are the right 6198 * extents that we iterate. 6199 * 6200 * |-----L-----| 6201 * |-1-|-2a-|-3-|-4-|-5-|-6-| 6202 * 6203 * |-----L-----| 6204 * |--1--|-2b-|...(same as above) 6205 * 6206 * Alternative situation. Happens on files where extents got split. 
6207 * |-----L-----| 6208 * |-----------7-----------|-6-| 6209 * 6210 * Alternative situation. Happens on files which got larger. 6211 * |-----L-----| 6212 * |-8-| 6213 * Nothing follows after 8. 6214 */ 6215 6216 key.objectid = ekey->objectid; 6217 key.type = BTRFS_EXTENT_DATA_KEY; 6218 key.offset = ekey->offset; 6219 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); 6220 if (ret < 0) 6221 goto out; 6222 if (ret) { 6223 ret = 0; 6224 goto out; 6225 } 6226 6227 /* 6228 * Handle special case where the right side has no extents at all. 6229 */ 6230 eb = path->nodes[0]; 6231 slot = path->slots[0]; 6232 btrfs_item_key_to_cpu(eb, &found_key, slot); 6233 if (found_key.objectid != key.objectid || 6234 found_key.type != key.type) { 6235 /* If we're a hole then just pretend nothing changed */ 6236 ret = (left_disknr) ? 0 : 1; 6237 goto out; 6238 } 6239 6240 /* 6241 * We're now on 2a, 2b or 7. 6242 */ 6243 key = found_key; 6244 while (key.offset < ekey->offset + left_len) { 6245 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 6246 right_type = btrfs_file_extent_type(eb, ei); 6247 if (right_type != BTRFS_FILE_EXTENT_REG && 6248 right_type != BTRFS_FILE_EXTENT_INLINE) { 6249 ret = 0; 6250 goto out; 6251 } 6252 6253 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 6254 right_len = btrfs_file_extent_ram_bytes(eb, ei); 6255 right_len = PAGE_ALIGN(right_len); 6256 } else { 6257 right_len = btrfs_file_extent_num_bytes(eb, ei); 6258 } 6259 6260 /* 6261 * Are we at extent 8? If yes, we know the extent is changed. 6262 * This may only happen on the first iteration. 6263 */ 6264 if (found_key.offset + right_len <= ekey->offset) { 6265 /* If we're a hole just pretend nothing changed */ 6266 ret = (left_disknr) ? 0 : 1; 6267 goto out; 6268 } 6269 6270 /* 6271 * We just wanted to see if when we have an inline extent, what 6272 * follows it is a regular extent (wanted to check the above 6273 * condition for inline extents too). 
This should normally not 6274 * happen but it's possible for example when we have an inline 6275 * compressed extent representing data with a size matching 6276 * the page size (currently the same as sector size). 6277 */ 6278 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 6279 ret = 0; 6280 goto out; 6281 } 6282 6283 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 6284 right_offset = btrfs_file_extent_offset(eb, ei); 6285 right_gen = btrfs_file_extent_generation(eb, ei); 6286 6287 left_offset_fixed = left_offset; 6288 if (key.offset < ekey->offset) { 6289 /* Fix the right offset for 2a and 7. */ 6290 right_offset += ekey->offset - key.offset; 6291 } else { 6292 /* Fix the left offset for all behind 2a and 2b */ 6293 left_offset_fixed += key.offset - ekey->offset; 6294 } 6295 6296 /* 6297 * Check if we have the same extent. 6298 */ 6299 if (left_disknr != right_disknr || 6300 left_offset_fixed != right_offset || 6301 left_gen != right_gen) { 6302 ret = 0; 6303 goto out; 6304 } 6305 6306 /* 6307 * Go to the next extent. 6308 */ 6309 ret = btrfs_next_item(sctx->parent_root, path); 6310 if (ret < 0) 6311 goto out; 6312 if (!ret) { 6313 eb = path->nodes[0]; 6314 slot = path->slots[0]; 6315 btrfs_item_key_to_cpu(eb, &found_key, slot); 6316 } 6317 if (ret || found_key.objectid != key.objectid || 6318 found_key.type != key.type) { 6319 key.offset += right_len; 6320 break; 6321 } 6322 if (found_key.offset != key.offset + right_len) { 6323 ret = 0; 6324 goto out; 6325 } 6326 key = found_key; 6327 } 6328 6329 /* 6330 * We're now behind the left extent (treat as unchanged) or at the end 6331 * of the right side (treat as changed). 
6332 */ 6333 if (key.offset >= ekey->offset + left_len) 6334 ret = 1; 6335 else 6336 ret = 0; 6337 6338 6339 out: 6340 btrfs_free_path(path); 6341 return ret; 6342 } 6343 6344 static int get_last_extent(struct send_ctx *sctx, u64 offset) 6345 { 6346 struct btrfs_path *path; 6347 struct btrfs_root *root = sctx->send_root; 6348 struct btrfs_key key; 6349 int ret; 6350 6351 path = alloc_path_for_send(); 6352 if (!path) 6353 return -ENOMEM; 6354 6355 sctx->cur_inode_last_extent = 0; 6356 6357 key.objectid = sctx->cur_ino; 6358 key.type = BTRFS_EXTENT_DATA_KEY; 6359 key.offset = offset; 6360 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); 6361 if (ret < 0) 6362 goto out; 6363 ret = 0; 6364 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 6365 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) 6366 goto out; 6367 6368 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 6369 out: 6370 btrfs_free_path(path); 6371 return ret; 6372 } 6373 6374 static int range_is_hole_in_parent(struct send_ctx *sctx, 6375 const u64 start, 6376 const u64 end) 6377 { 6378 struct btrfs_path *path; 6379 struct btrfs_key key; 6380 struct btrfs_root *root = sctx->parent_root; 6381 u64 search_start = start; 6382 int ret; 6383 6384 path = alloc_path_for_send(); 6385 if (!path) 6386 return -ENOMEM; 6387 6388 key.objectid = sctx->cur_ino; 6389 key.type = BTRFS_EXTENT_DATA_KEY; 6390 key.offset = search_start; 6391 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6392 if (ret < 0) 6393 goto out; 6394 if (ret > 0 && path->slots[0] > 0) 6395 path->slots[0]--; 6396 6397 while (search_start < end) { 6398 struct extent_buffer *leaf = path->nodes[0]; 6399 int slot = path->slots[0]; 6400 struct btrfs_file_extent_item *fi; 6401 u64 extent_end; 6402 6403 if (slot >= btrfs_header_nritems(leaf)) { 6404 ret = btrfs_next_leaf(root, path); 6405 if (ret < 0) 6406 goto out; 6407 else if (ret > 0) 6408 break; 6409 continue; 6410 } 6411 6412 
btrfs_item_key_to_cpu(leaf, &key, slot); 6413 if (key.objectid < sctx->cur_ino || 6414 key.type < BTRFS_EXTENT_DATA_KEY) 6415 goto next; 6416 if (key.objectid > sctx->cur_ino || 6417 key.type > BTRFS_EXTENT_DATA_KEY || 6418 key.offset >= end) 6419 break; 6420 6421 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 6422 extent_end = btrfs_file_extent_end(path); 6423 if (extent_end <= start) 6424 goto next; 6425 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { 6426 search_start = extent_end; 6427 goto next; 6428 } 6429 ret = 0; 6430 goto out; 6431 next: 6432 path->slots[0]++; 6433 } 6434 ret = 1; 6435 out: 6436 btrfs_free_path(path); 6437 return ret; 6438 } 6439 6440 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, 6441 struct btrfs_key *key) 6442 { 6443 int ret = 0; 6444 6445 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) 6446 return 0; 6447 6448 if (sctx->cur_inode_last_extent == (u64)-1) { 6449 ret = get_last_extent(sctx, key->offset - 1); 6450 if (ret) 6451 return ret; 6452 } 6453 6454 if (path->slots[0] == 0 && 6455 sctx->cur_inode_last_extent < key->offset) { 6456 /* 6457 * We might have skipped entire leafs that contained only 6458 * file extent items for our current inode. These leafs have 6459 * a generation number smaller (older) than the one in the 6460 * current leaf and the leaf our last extent came from, and 6461 * are located between these 2 leafs. 
6462 */ 6463 ret = get_last_extent(sctx, key->offset - 1); 6464 if (ret) 6465 return ret; 6466 } 6467 6468 if (sctx->cur_inode_last_extent < key->offset) { 6469 ret = range_is_hole_in_parent(sctx, 6470 sctx->cur_inode_last_extent, 6471 key->offset); 6472 if (ret < 0) 6473 return ret; 6474 else if (ret == 0) 6475 ret = send_hole(sctx, key->offset); 6476 else 6477 ret = 0; 6478 } 6479 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 6480 return ret; 6481 } 6482 6483 static int process_extent(struct send_ctx *sctx, 6484 struct btrfs_path *path, 6485 struct btrfs_key *key) 6486 { 6487 struct clone_root *found_clone = NULL; 6488 int ret = 0; 6489 6490 if (S_ISLNK(sctx->cur_inode_mode)) 6491 return 0; 6492 6493 if (sctx->parent_root && !sctx->cur_inode_new) { 6494 ret = is_extent_unchanged(sctx, path, key); 6495 if (ret < 0) 6496 goto out; 6497 if (ret) { 6498 ret = 0; 6499 goto out_hole; 6500 } 6501 } else { 6502 struct btrfs_file_extent_item *ei; 6503 u8 type; 6504 6505 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 6506 struct btrfs_file_extent_item); 6507 type = btrfs_file_extent_type(path->nodes[0], ei); 6508 if (type == BTRFS_FILE_EXTENT_PREALLOC || 6509 type == BTRFS_FILE_EXTENT_REG) { 6510 /* 6511 * The send spec does not have a prealloc command yet, 6512 * so just leave a hole for prealloc'ed extents until 6513 * we have enough commands queued up to justify rev'ing 6514 * the send spec. 6515 */ 6516 if (type == BTRFS_FILE_EXTENT_PREALLOC) { 6517 ret = 0; 6518 goto out; 6519 } 6520 6521 /* Have a hole, just skip it. 
*/ 6522 if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { 6523 ret = 0; 6524 goto out; 6525 } 6526 } 6527 } 6528 6529 ret = find_extent_clone(sctx, path, key->objectid, key->offset, 6530 sctx->cur_inode_size, &found_clone); 6531 if (ret != -ENOENT && ret < 0) 6532 goto out; 6533 6534 ret = send_write_or_clone(sctx, path, key, found_clone); 6535 if (ret) 6536 goto out; 6537 out_hole: 6538 ret = maybe_send_hole(sctx, path, key); 6539 out: 6540 return ret; 6541 } 6542 6543 static int process_all_extents(struct send_ctx *sctx) 6544 { 6545 int ret = 0; 6546 int iter_ret = 0; 6547 struct btrfs_root *root; 6548 struct btrfs_path *path; 6549 struct btrfs_key key; 6550 struct btrfs_key found_key; 6551 6552 root = sctx->send_root; 6553 path = alloc_path_for_send(); 6554 if (!path) 6555 return -ENOMEM; 6556 6557 key.objectid = sctx->cmp_key->objectid; 6558 key.type = BTRFS_EXTENT_DATA_KEY; 6559 key.offset = 0; 6560 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 6561 if (found_key.objectid != key.objectid || 6562 found_key.type != key.type) { 6563 ret = 0; 6564 break; 6565 } 6566 6567 ret = process_extent(sctx, path, &found_key); 6568 if (ret < 0) 6569 break; 6570 } 6571 /* Catch error found during iteration */ 6572 if (iter_ret < 0) 6573 ret = iter_ret; 6574 6575 btrfs_free_path(path); 6576 return ret; 6577 } 6578 6579 static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, 6580 int *pending_move, 6581 int *refs_processed) 6582 { 6583 int ret = 0; 6584 6585 if (sctx->cur_ino == 0) 6586 goto out; 6587 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && 6588 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) 6589 goto out; 6590 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 6591 goto out; 6592 6593 ret = process_recorded_refs(sctx, pending_move); 6594 if (ret < 0) 6595 goto out; 6596 6597 *refs_processed = 1; 6598 out: 6599 return ret; 6600 } 6601 6602 static int finish_inode_if_needed(struct send_ctx 
*sctx, int at_end) 6603 { 6604 int ret = 0; 6605 struct btrfs_inode_info info; 6606 u64 left_mode; 6607 u64 left_uid; 6608 u64 left_gid; 6609 u64 left_fileattr; 6610 u64 right_mode; 6611 u64 right_uid; 6612 u64 right_gid; 6613 u64 right_fileattr; 6614 int need_chmod = 0; 6615 int need_chown = 0; 6616 bool need_fileattr = false; 6617 int need_truncate = 1; 6618 int pending_move = 0; 6619 int refs_processed = 0; 6620 6621 if (sctx->ignore_cur_inode) 6622 return 0; 6623 6624 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, 6625 &refs_processed); 6626 if (ret < 0) 6627 goto out; 6628 6629 /* 6630 * We have processed the refs and thus need to advance send_progress. 6631 * Now, calls to get_cur_xxx will take the updated refs of the current 6632 * inode into account. 6633 * 6634 * On the other hand, if our current inode is a directory and couldn't 6635 * be moved/renamed because its parent was renamed/moved too and it has 6636 * a higher inode number, we can only move/rename our current inode 6637 * after we moved/renamed its parent. Therefore in this case operate on 6638 * the old path (pre move/rename) of our current inode, and the 6639 * move/rename will be performed later. 
6640 */ 6641 if (refs_processed && !pending_move) 6642 sctx->send_progress = sctx->cur_ino + 1; 6643 6644 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) 6645 goto out; 6646 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) 6647 goto out; 6648 ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); 6649 if (ret < 0) 6650 goto out; 6651 left_mode = info.mode; 6652 left_uid = info.uid; 6653 left_gid = info.gid; 6654 left_fileattr = info.fileattr; 6655 6656 if (!sctx->parent_root || sctx->cur_inode_new) { 6657 need_chown = 1; 6658 if (!S_ISLNK(sctx->cur_inode_mode)) 6659 need_chmod = 1; 6660 if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) 6661 need_truncate = 0; 6662 } else { 6663 u64 old_size; 6664 6665 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); 6666 if (ret < 0) 6667 goto out; 6668 old_size = info.size; 6669 right_mode = info.mode; 6670 right_uid = info.uid; 6671 right_gid = info.gid; 6672 right_fileattr = info.fileattr; 6673 6674 if (left_uid != right_uid || left_gid != right_gid) 6675 need_chown = 1; 6676 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) 6677 need_chmod = 1; 6678 if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) 6679 need_fileattr = true; 6680 if ((old_size == sctx->cur_inode_size) || 6681 (sctx->cur_inode_size > old_size && 6682 sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) 6683 need_truncate = 0; 6684 } 6685 6686 if (S_ISREG(sctx->cur_inode_mode)) { 6687 if (need_send_hole(sctx)) { 6688 if (sctx->cur_inode_last_extent == (u64)-1 || 6689 sctx->cur_inode_last_extent < 6690 sctx->cur_inode_size) { 6691 ret = get_last_extent(sctx, (u64)-1); 6692 if (ret) 6693 goto out; 6694 } 6695 if (sctx->cur_inode_last_extent < 6696 sctx->cur_inode_size) { 6697 ret = send_hole(sctx, sctx->cur_inode_size); 6698 if (ret) 6699 goto out; 6700 } 6701 } 6702 if (need_truncate) { 6703 ret = send_truncate(sctx, sctx->cur_ino, 6704 sctx->cur_inode_gen, 6705 
sctx->cur_inode_size); 6706 if (ret < 0) 6707 goto out; 6708 } 6709 } 6710 6711 if (need_chown) { 6712 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6713 left_uid, left_gid); 6714 if (ret < 0) 6715 goto out; 6716 } 6717 if (need_chmod) { 6718 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6719 left_mode); 6720 if (ret < 0) 6721 goto out; 6722 } 6723 if (need_fileattr) { 6724 ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6725 left_fileattr); 6726 if (ret < 0) 6727 goto out; 6728 } 6729 6730 if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) 6731 && sctx->cur_inode_needs_verity) { 6732 ret = process_verity(sctx); 6733 if (ret < 0) 6734 goto out; 6735 } 6736 6737 ret = send_capabilities(sctx); 6738 if (ret < 0) 6739 goto out; 6740 6741 /* 6742 * If other directory inodes depended on our current directory 6743 * inode's move/rename, now do their move/rename operations. 6744 */ 6745 if (!is_waiting_for_move(sctx, sctx->cur_ino)) { 6746 ret = apply_children_dir_moves(sctx); 6747 if (ret) 6748 goto out; 6749 /* 6750 * Need to send that every time, no matter if it actually 6751 * changed between the two trees as we have done changes to 6752 * the inode before. If our inode is a directory and it's 6753 * waiting to be moved/renamed, we will send its utimes when 6754 * it's moved/renamed, therefore we don't need to do it here. 
6755 */ 6756 sctx->send_progress = sctx->cur_ino + 1; 6757 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 6758 if (ret < 0) 6759 goto out; 6760 } 6761 6762 out: 6763 return ret; 6764 } 6765 6766 static void close_current_inode(struct send_ctx *sctx) 6767 { 6768 u64 i_size; 6769 6770 if (sctx->cur_inode == NULL) 6771 return; 6772 6773 i_size = i_size_read(sctx->cur_inode); 6774 6775 /* 6776 * If we are doing an incremental send, we may have extents between the 6777 * last processed extent and the i_size that have not been processed 6778 * because they haven't changed but we may have read some of their pages 6779 * through readahead, see the comments at send_extent_data(). 6780 */ 6781 if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) 6782 truncate_inode_pages_range(&sctx->cur_inode->i_data, 6783 sctx->page_cache_clear_start, 6784 round_up(i_size, PAGE_SIZE) - 1); 6785 6786 iput(sctx->cur_inode); 6787 sctx->cur_inode = NULL; 6788 } 6789 6790 static int changed_inode(struct send_ctx *sctx, 6791 enum btrfs_compare_tree_result result) 6792 { 6793 int ret = 0; 6794 struct btrfs_key *key = sctx->cmp_key; 6795 struct btrfs_inode_item *left_ii = NULL; 6796 struct btrfs_inode_item *right_ii = NULL; 6797 u64 left_gen = 0; 6798 u64 right_gen = 0; 6799 6800 close_current_inode(sctx); 6801 6802 sctx->cur_ino = key->objectid; 6803 sctx->cur_inode_new_gen = false; 6804 sctx->cur_inode_last_extent = (u64)-1; 6805 sctx->cur_inode_next_write_offset = 0; 6806 sctx->ignore_cur_inode = false; 6807 6808 /* 6809 * Set send_progress to current inode. This will tell all get_cur_xxx 6810 * functions that the current inode's refs are not updated yet. Later, 6811 * when process_recorded_refs is finished, it is set to cur_ino + 1. 
6812 */ 6813 sctx->send_progress = sctx->cur_ino; 6814 6815 if (result == BTRFS_COMPARE_TREE_NEW || 6816 result == BTRFS_COMPARE_TREE_CHANGED) { 6817 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], 6818 sctx->left_path->slots[0], 6819 struct btrfs_inode_item); 6820 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], 6821 left_ii); 6822 } else { 6823 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6824 sctx->right_path->slots[0], 6825 struct btrfs_inode_item); 6826 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6827 right_ii); 6828 } 6829 if (result == BTRFS_COMPARE_TREE_CHANGED) { 6830 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6831 sctx->right_path->slots[0], 6832 struct btrfs_inode_item); 6833 6834 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6835 right_ii); 6836 6837 /* 6838 * The cur_ino = root dir case is special here. We can't treat 6839 * the inode as deleted+reused because it would generate a 6840 * stream that tries to delete/mkdir the root dir. 6841 */ 6842 if (left_gen != right_gen && 6843 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6844 sctx->cur_inode_new_gen = true; 6845 } 6846 6847 /* 6848 * Normally we do not find inodes with a link count of zero (orphans) 6849 * because the most common case is to create a snapshot and use it 6850 * for a send operation. However other less common use cases involve 6851 * using a subvolume and send it after turning it to RO mode just 6852 * after deleting all hard links of a file while holding an open 6853 * file descriptor against it or turning a RO snapshot into RW mode, 6854 * keep an open file descriptor against a file, delete it and then 6855 * turn the snapshot back to RO mode before using it for a send 6856 * operation. The former is what the receiver operation does. 6857 * Therefore, if we want to send these snapshots soon after they're 6858 * received, we need to handle orphan inodes as well. 
Moreover, orphans 6859 * can appear not only in the send snapshot but also in the parent 6860 * snapshot. Here are several cases: 6861 * 6862 * Case 1: BTRFS_COMPARE_TREE_NEW 6863 * | send snapshot | action 6864 * -------------------------------- 6865 * nlink | 0 | ignore 6866 * 6867 * Case 2: BTRFS_COMPARE_TREE_DELETED 6868 * | parent snapshot | action 6869 * ---------------------------------- 6870 * nlink | 0 | as usual 6871 * Note: No unlinks will be sent because there're no paths for it. 6872 * 6873 * Case 3: BTRFS_COMPARE_TREE_CHANGED 6874 * | | parent snapshot | send snapshot | action 6875 * ----------------------------------------------------------------------- 6876 * subcase 1 | nlink | 0 | 0 | ignore 6877 * subcase 2 | nlink | >0 | 0 | new_gen(deletion) 6878 * subcase 3 | nlink | 0 | >0 | new_gen(creation) 6879 * 6880 */ 6881 if (result == BTRFS_COMPARE_TREE_NEW) { 6882 if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { 6883 sctx->ignore_cur_inode = true; 6884 goto out; 6885 } 6886 sctx->cur_inode_gen = left_gen; 6887 sctx->cur_inode_new = true; 6888 sctx->cur_inode_deleted = false; 6889 sctx->cur_inode_size = btrfs_inode_size( 6890 sctx->left_path->nodes[0], left_ii); 6891 sctx->cur_inode_mode = btrfs_inode_mode( 6892 sctx->left_path->nodes[0], left_ii); 6893 sctx->cur_inode_rdev = btrfs_inode_rdev( 6894 sctx->left_path->nodes[0], left_ii); 6895 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6896 ret = send_create_inode_if_needed(sctx); 6897 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 6898 sctx->cur_inode_gen = right_gen; 6899 sctx->cur_inode_new = false; 6900 sctx->cur_inode_deleted = true; 6901 sctx->cur_inode_size = btrfs_inode_size( 6902 sctx->right_path->nodes[0], right_ii); 6903 sctx->cur_inode_mode = btrfs_inode_mode( 6904 sctx->right_path->nodes[0], right_ii); 6905 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 6906 u32 new_nlinks, old_nlinks; 6907 6908 new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); 
6909 old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); 6910 if (new_nlinks == 0 && old_nlinks == 0) { 6911 sctx->ignore_cur_inode = true; 6912 goto out; 6913 } else if (new_nlinks == 0 || old_nlinks == 0) { 6914 sctx->cur_inode_new_gen = 1; 6915 } 6916 /* 6917 * We need to do some special handling in case the inode was 6918 * reported as changed with a changed generation number. This 6919 * means that the original inode was deleted and new inode 6920 * reused the same inum. So we have to treat the old inode as 6921 * deleted and the new one as new. 6922 */ 6923 if (sctx->cur_inode_new_gen) { 6924 /* 6925 * First, process the inode as if it was deleted. 6926 */ 6927 if (old_nlinks > 0) { 6928 sctx->cur_inode_gen = right_gen; 6929 sctx->cur_inode_new = false; 6930 sctx->cur_inode_deleted = true; 6931 sctx->cur_inode_size = btrfs_inode_size( 6932 sctx->right_path->nodes[0], right_ii); 6933 sctx->cur_inode_mode = btrfs_inode_mode( 6934 sctx->right_path->nodes[0], right_ii); 6935 ret = process_all_refs(sctx, 6936 BTRFS_COMPARE_TREE_DELETED); 6937 if (ret < 0) 6938 goto out; 6939 } 6940 6941 /* 6942 * Now process the inode as if it was new. 6943 */ 6944 if (new_nlinks > 0) { 6945 sctx->cur_inode_gen = left_gen; 6946 sctx->cur_inode_new = true; 6947 sctx->cur_inode_deleted = false; 6948 sctx->cur_inode_size = btrfs_inode_size( 6949 sctx->left_path->nodes[0], 6950 left_ii); 6951 sctx->cur_inode_mode = btrfs_inode_mode( 6952 sctx->left_path->nodes[0], 6953 left_ii); 6954 sctx->cur_inode_rdev = btrfs_inode_rdev( 6955 sctx->left_path->nodes[0], 6956 left_ii); 6957 ret = send_create_inode_if_needed(sctx); 6958 if (ret < 0) 6959 goto out; 6960 6961 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 6962 if (ret < 0) 6963 goto out; 6964 /* 6965 * Advance send_progress now as we did not get 6966 * into process_recorded_refs_if_needed in the 6967 * new_gen case. 
6968 */ 6969 sctx->send_progress = sctx->cur_ino + 1; 6970 6971 /* 6972 * Now process all extents and xattrs of the 6973 * inode as if they were all new. 6974 */ 6975 ret = process_all_extents(sctx); 6976 if (ret < 0) 6977 goto out; 6978 ret = process_all_new_xattrs(sctx); 6979 if (ret < 0) 6980 goto out; 6981 } 6982 } else { 6983 sctx->cur_inode_gen = left_gen; 6984 sctx->cur_inode_new = false; 6985 sctx->cur_inode_new_gen = false; 6986 sctx->cur_inode_deleted = false; 6987 sctx->cur_inode_size = btrfs_inode_size( 6988 sctx->left_path->nodes[0], left_ii); 6989 sctx->cur_inode_mode = btrfs_inode_mode( 6990 sctx->left_path->nodes[0], left_ii); 6991 } 6992 } 6993 6994 out: 6995 return ret; 6996 } 6997 6998 /* 6999 * We have to process new refs before deleted refs, but compare_trees gives us 7000 * the new and deleted refs mixed. To fix this, we record the new/deleted refs 7001 * first and later process them in process_recorded_refs. 7002 * For the cur_inode_new_gen case, we skip recording completely because 7003 * changed_inode did already initiate processing of refs. The reason for this is 7004 * that in this case, compare_tree actually compares the refs of 2 different 7005 * inodes. To fix this, process_all_refs is used in changed_inode to handle all 7006 * refs of the right tree as deleted and all refs of the left tree as new. 
7007 */ 7008 static int changed_ref(struct send_ctx *sctx, 7009 enum btrfs_compare_tree_result result) 7010 { 7011 int ret = 0; 7012 7013 if (sctx->cur_ino != sctx->cmp_key->objectid) { 7014 inconsistent_snapshot_error(sctx, result, "reference"); 7015 return -EIO; 7016 } 7017 7018 if (!sctx->cur_inode_new_gen && 7019 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { 7020 if (result == BTRFS_COMPARE_TREE_NEW) 7021 ret = record_new_ref(sctx); 7022 else if (result == BTRFS_COMPARE_TREE_DELETED) 7023 ret = record_deleted_ref(sctx); 7024 else if (result == BTRFS_COMPARE_TREE_CHANGED) 7025 ret = record_changed_ref(sctx); 7026 } 7027 7028 return ret; 7029 } 7030 7031 /* 7032 * Process new/deleted/changed xattrs. We skip processing in the 7033 * cur_inode_new_gen case because changed_inode did already initiate processing 7034 * of xattrs. The reason is the same as in changed_ref 7035 */ 7036 static int changed_xattr(struct send_ctx *sctx, 7037 enum btrfs_compare_tree_result result) 7038 { 7039 int ret = 0; 7040 7041 if (sctx->cur_ino != sctx->cmp_key->objectid) { 7042 inconsistent_snapshot_error(sctx, result, "xattr"); 7043 return -EIO; 7044 } 7045 7046 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7047 if (result == BTRFS_COMPARE_TREE_NEW) 7048 ret = process_new_xattr(sctx); 7049 else if (result == BTRFS_COMPARE_TREE_DELETED) 7050 ret = process_deleted_xattr(sctx); 7051 else if (result == BTRFS_COMPARE_TREE_CHANGED) 7052 ret = process_changed_xattr(sctx); 7053 } 7054 7055 return ret; 7056 } 7057 7058 /* 7059 * Process new/deleted/changed extents. We skip processing in the 7060 * cur_inode_new_gen case because changed_inode did already initiate processing 7061 * of extents. The reason is the same as in changed_ref 7062 */ 7063 static int changed_extent(struct send_ctx *sctx, 7064 enum btrfs_compare_tree_result result) 7065 { 7066 int ret = 0; 7067 7068 /* 7069 * We have found an extent item that changed without the inode item 7070 * having changed. 
This can happen either after relocation (where the 7071 * disk_bytenr of an extent item is replaced at 7072 * relocation.c:replace_file_extents()) or after deduplication into a 7073 * file in both the parent and send snapshots (where an extent item can 7074 * get modified or replaced with a new one). Note that deduplication 7075 * updates the inode item, but it only changes the iversion (sequence 7076 * field in the inode item) of the inode, so if a file is deduplicated 7077 * the same amount of times in both the parent and send snapshots, its 7078 * iversion becomes the same in both snapshots, whence the inode item is 7079 * the same on both snapshots. 7080 */ 7081 if (sctx->cur_ino != sctx->cmp_key->objectid) 7082 return 0; 7083 7084 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7085 if (result != BTRFS_COMPARE_TREE_DELETED) 7086 ret = process_extent(sctx, sctx->left_path, 7087 sctx->cmp_key); 7088 } 7089 7090 return ret; 7091 } 7092 7093 static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) 7094 { 7095 int ret = 0; 7096 7097 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7098 if (result == BTRFS_COMPARE_TREE_NEW) 7099 sctx->cur_inode_needs_verity = true; 7100 } 7101 return ret; 7102 } 7103 7104 static int dir_changed(struct send_ctx *sctx, u64 dir) 7105 { 7106 u64 orig_gen, new_gen; 7107 int ret; 7108 7109 ret = get_inode_gen(sctx->send_root, dir, &new_gen); 7110 if (ret) 7111 return ret; 7112 7113 ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); 7114 if (ret) 7115 return ret; 7116 7117 return (orig_gen != new_gen) ? 
1 : 0; 7118 } 7119 7120 static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, 7121 struct btrfs_key *key) 7122 { 7123 struct btrfs_inode_extref *extref; 7124 struct extent_buffer *leaf; 7125 u64 dirid = 0, last_dirid = 0; 7126 unsigned long ptr; 7127 u32 item_size; 7128 u32 cur_offset = 0; 7129 int ref_name_len; 7130 int ret = 0; 7131 7132 /* Easy case, just check this one dirid */ 7133 if (key->type == BTRFS_INODE_REF_KEY) { 7134 dirid = key->offset; 7135 7136 ret = dir_changed(sctx, dirid); 7137 goto out; 7138 } 7139 7140 leaf = path->nodes[0]; 7141 item_size = btrfs_item_size(leaf, path->slots[0]); 7142 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 7143 while (cur_offset < item_size) { 7144 extref = (struct btrfs_inode_extref *)(ptr + 7145 cur_offset); 7146 dirid = btrfs_inode_extref_parent(leaf, extref); 7147 ref_name_len = btrfs_inode_extref_name_len(leaf, extref); 7148 cur_offset += ref_name_len + sizeof(*extref); 7149 if (dirid == last_dirid) 7150 continue; 7151 ret = dir_changed(sctx, dirid); 7152 if (ret) 7153 break; 7154 last_dirid = dirid; 7155 } 7156 out: 7157 return ret; 7158 } 7159 7160 /* 7161 * Updates compare related fields in sctx and simply forwards to the actual 7162 * changed_xxx functions. 7163 */ 7164 static int changed_cb(struct btrfs_path *left_path, 7165 struct btrfs_path *right_path, 7166 struct btrfs_key *key, 7167 enum btrfs_compare_tree_result result, 7168 struct send_ctx *sctx) 7169 { 7170 int ret = 0; 7171 7172 /* 7173 * We can not hold the commit root semaphore here. 
This is because in 7174 * the case of sending and receiving to the same filesystem, using a 7175 * pipe, could result in a deadlock: 7176 * 7177 * 1) The task running send blocks on the pipe because it's full; 7178 * 7179 * 2) The task running receive, which is the only consumer of the pipe, 7180 * is waiting for a transaction commit (for example due to a space 7181 * reservation when doing a write or triggering a transaction commit 7182 * when creating a subvolume); 7183 * 7184 * 3) The transaction is waiting to write lock the commit root semaphore, 7185 * but can not acquire it since it's being held at 1). 7186 * 7187 * Down this call chain we write to the pipe through kernel_write(). 7188 * The same type of problem can also happen when sending to a file that 7189 * is stored in the same filesystem - when reserving space for a write 7190 * into the file, we can trigger a transaction commit. 7191 * 7192 * Our caller has supplied us with clones of leaves from the send and 7193 * parent roots, so we're safe here from a concurrent relocation and 7194 * further reallocation of metadata extents while we are here. Below we 7195 * also assert that the leaves are clones. 7196 */ 7197 lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); 7198 7199 /* 7200 * We always have a send root, so left_path is never NULL. We will not 7201 * have a leaf when we have reached the end of the send root but have 7202 * not yet reached the end of the parent root. 7203 */ 7204 if (left_path->nodes[0]) 7205 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, 7206 &left_path->nodes[0]->bflags)); 7207 /* 7208 * When doing a full send we don't have a parent root, so right_path is 7209 * NULL. When doing an incremental send, we may have reached the end of 7210 * the parent root already, so we don't have a leaf at right_path. 
7211 */ 7212 if (right_path && right_path->nodes[0]) 7213 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, 7214 &right_path->nodes[0]->bflags)); 7215 7216 if (result == BTRFS_COMPARE_TREE_SAME) { 7217 if (key->type == BTRFS_INODE_REF_KEY || 7218 key->type == BTRFS_INODE_EXTREF_KEY) { 7219 ret = compare_refs(sctx, left_path, key); 7220 if (!ret) 7221 return 0; 7222 if (ret < 0) 7223 return ret; 7224 } else if (key->type == BTRFS_EXTENT_DATA_KEY) { 7225 return maybe_send_hole(sctx, left_path, key); 7226 } else { 7227 return 0; 7228 } 7229 result = BTRFS_COMPARE_TREE_CHANGED; 7230 ret = 0; 7231 } 7232 7233 sctx->left_path = left_path; 7234 sctx->right_path = right_path; 7235 sctx->cmp_key = key; 7236 7237 ret = finish_inode_if_needed(sctx, 0); 7238 if (ret < 0) 7239 goto out; 7240 7241 /* Ignore non-FS objects */ 7242 if (key->objectid == BTRFS_FREE_INO_OBJECTID || 7243 key->objectid == BTRFS_FREE_SPACE_OBJECTID) 7244 goto out; 7245 7246 if (key->type == BTRFS_INODE_ITEM_KEY) { 7247 ret = changed_inode(sctx, result); 7248 } else if (!sctx->ignore_cur_inode) { 7249 if (key->type == BTRFS_INODE_REF_KEY || 7250 key->type == BTRFS_INODE_EXTREF_KEY) 7251 ret = changed_ref(sctx, result); 7252 else if (key->type == BTRFS_XATTR_ITEM_KEY) 7253 ret = changed_xattr(sctx, result); 7254 else if (key->type == BTRFS_EXTENT_DATA_KEY) 7255 ret = changed_extent(sctx, result); 7256 else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && 7257 key->offset == 0) 7258 ret = changed_verity(sctx, result); 7259 } 7260 7261 out: 7262 return ret; 7263 } 7264 7265 static int search_key_again(const struct send_ctx *sctx, 7266 struct btrfs_root *root, 7267 struct btrfs_path *path, 7268 const struct btrfs_key *key) 7269 { 7270 int ret; 7271 7272 if (!path->need_commit_sem) 7273 lockdep_assert_held_read(&root->fs_info->commit_root_sem); 7274 7275 /* 7276 * Roots used for send operations are readonly and no one can add, 7277 * update or remove keys from them, so we should be able to find our 7278 * key again. 
The only exception is deduplication, which can operate on 7279 * readonly roots and add, update or remove keys to/from them - but at 7280 * the moment we don't allow it to run in parallel with send. 7281 */ 7282 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7283 ASSERT(ret <= 0); 7284 if (ret > 0) { 7285 btrfs_print_tree(path->nodes[path->lowest_level], false); 7286 btrfs_err(root->fs_info, 7287 "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", 7288 key->objectid, key->type, key->offset, 7289 (root == sctx->parent_root ? "parent" : "send"), 7290 root->root_key.objectid, path->lowest_level, 7291 path->slots[path->lowest_level]); 7292 return -EUCLEAN; 7293 } 7294 7295 return ret; 7296 } 7297 7298 static int full_send_tree(struct send_ctx *sctx) 7299 { 7300 int ret; 7301 struct btrfs_root *send_root = sctx->send_root; 7302 struct btrfs_key key; 7303 struct btrfs_fs_info *fs_info = send_root->fs_info; 7304 struct btrfs_path *path; 7305 7306 path = alloc_path_for_send(); 7307 if (!path) 7308 return -ENOMEM; 7309 path->reada = READA_FORWARD_ALWAYS; 7310 7311 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 7312 key.type = BTRFS_INODE_ITEM_KEY; 7313 key.offset = 0; 7314 7315 down_read(&fs_info->commit_root_sem); 7316 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7317 up_read(&fs_info->commit_root_sem); 7318 7319 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 7320 if (ret < 0) 7321 goto out; 7322 if (ret) 7323 goto out_finish; 7324 7325 while (1) { 7326 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 7327 7328 ret = changed_cb(path, NULL, &key, 7329 BTRFS_COMPARE_TREE_NEW, sctx); 7330 if (ret < 0) 7331 goto out; 7332 7333 down_read(&fs_info->commit_root_sem); 7334 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { 7335 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7336 up_read(&fs_info->commit_root_sem); 7337 /* 7338 * A transaction used for relocating a block group was 7339 * committed or is 
about to finish its commit. Release 7340 * our path (leaf) and restart the search, so that we 7341 * avoid operating on any file extent items that are 7342 * stale, with a disk_bytenr that reflects a pre 7343 * relocation value. This way we avoid as much as 7344 * possible to fallback to regular writes when checking 7345 * if we can clone file ranges. 7346 */ 7347 btrfs_release_path(path); 7348 ret = search_key_again(sctx, send_root, path, &key); 7349 if (ret < 0) 7350 goto out; 7351 } else { 7352 up_read(&fs_info->commit_root_sem); 7353 } 7354 7355 ret = btrfs_next_item(send_root, path); 7356 if (ret < 0) 7357 goto out; 7358 if (ret) { 7359 ret = 0; 7360 break; 7361 } 7362 } 7363 7364 out_finish: 7365 ret = finish_inode_if_needed(sctx, 1); 7366 7367 out: 7368 btrfs_free_path(path); 7369 return ret; 7370 } 7371 7372 static int replace_node_with_clone(struct btrfs_path *path, int level) 7373 { 7374 struct extent_buffer *clone; 7375 7376 clone = btrfs_clone_extent_buffer(path->nodes[level]); 7377 if (!clone) 7378 return -ENOMEM; 7379 7380 free_extent_buffer(path->nodes[level]); 7381 path->nodes[level] = clone; 7382 7383 return 0; 7384 } 7385 7386 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) 7387 { 7388 struct extent_buffer *eb; 7389 struct extent_buffer *parent = path->nodes[*level]; 7390 int slot = path->slots[*level]; 7391 const int nritems = btrfs_header_nritems(parent); 7392 u64 reada_max; 7393 u64 reada_done = 0; 7394 7395 lockdep_assert_held_read(&parent->fs_info->commit_root_sem); 7396 7397 BUG_ON(*level == 0); 7398 eb = btrfs_read_node_slot(parent, slot); 7399 if (IS_ERR(eb)) 7400 return PTR_ERR(eb); 7401 7402 /* 7403 * Trigger readahead for the next leaves we will process, so that it is 7404 * very likely that when we need them they are already in memory and we 7405 * will not block on disk IO. For nodes we only do readahead for one, 7406 * since the time window between processing nodes is typically larger. 
7407 */ 7408 reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); 7409 7410 for (slot++; slot < nritems && reada_done < reada_max; slot++) { 7411 if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { 7412 btrfs_readahead_node_child(parent, slot); 7413 reada_done += eb->fs_info->nodesize; 7414 } 7415 } 7416 7417 path->nodes[*level - 1] = eb; 7418 path->slots[*level - 1] = 0; 7419 (*level)--; 7420 7421 if (*level == 0) 7422 return replace_node_with_clone(path, 0); 7423 7424 return 0; 7425 } 7426 7427 static int tree_move_next_or_upnext(struct btrfs_path *path, 7428 int *level, int root_level) 7429 { 7430 int ret = 0; 7431 int nritems; 7432 nritems = btrfs_header_nritems(path->nodes[*level]); 7433 7434 path->slots[*level]++; 7435 7436 while (path->slots[*level] >= nritems) { 7437 if (*level == root_level) { 7438 path->slots[*level] = nritems - 1; 7439 return -1; 7440 } 7441 7442 /* move upnext */ 7443 path->slots[*level] = 0; 7444 free_extent_buffer(path->nodes[*level]); 7445 path->nodes[*level] = NULL; 7446 (*level)++; 7447 path->slots[*level]++; 7448 7449 nritems = btrfs_header_nritems(path->nodes[*level]); 7450 ret = 1; 7451 } 7452 return ret; 7453 } 7454 7455 /* 7456 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 7457 * or down. 7458 */ 7459 static int tree_advance(struct btrfs_path *path, 7460 int *level, int root_level, 7461 int allow_down, 7462 struct btrfs_key *key, 7463 u64 reada_min_gen) 7464 { 7465 int ret; 7466 7467 if (*level == 0 || !allow_down) { 7468 ret = tree_move_next_or_upnext(path, level, root_level); 7469 } else { 7470 ret = tree_move_down(path, level, reada_min_gen); 7471 } 7472 7473 /* 7474 * Even if we have reached the end of a tree, ret is -1, update the key 7475 * anyway, so that in case we need to restart due to a block group 7476 * relocation, we can assert that the last key of the root node still 7477 * exists in the tree. 
7478 */ 7479 if (*level == 0) 7480 btrfs_item_key_to_cpu(path->nodes[*level], key, 7481 path->slots[*level]); 7482 else 7483 btrfs_node_key_to_cpu(path->nodes[*level], key, 7484 path->slots[*level]); 7485 7486 return ret; 7487 } 7488 7489 static int tree_compare_item(struct btrfs_path *left_path, 7490 struct btrfs_path *right_path, 7491 char *tmp_buf) 7492 { 7493 int cmp; 7494 int len1, len2; 7495 unsigned long off1, off2; 7496 7497 len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); 7498 len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); 7499 if (len1 != len2) 7500 return 1; 7501 7502 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 7503 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 7504 right_path->slots[0]); 7505 7506 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 7507 7508 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 7509 if (cmp) 7510 return 1; 7511 return 0; 7512 } 7513 7514 /* 7515 * A transaction used for relocating a block group was committed or is about to 7516 * finish its commit. Release our paths and restart the search, so that we are 7517 * not using stale extent buffers: 7518 * 7519 * 1) For levels > 0, we are only holding references of extent buffers, without 7520 * any locks on them, which does not prevent them from having been relocated 7521 * and reallocated after the last time we released the commit root semaphore. 7522 * The exception are the root nodes, for which we always have a clone, see 7523 * the comment at btrfs_compare_trees(); 7524 * 7525 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so 7526 * we are safe from the concurrent relocation and reallocation. However they 7527 * can have file extent items with a pre relocation disk_bytenr value, so we 7528 * restart the start from the current commit roots and clone the new leaves so 7529 * that we get the post relocation disk_bytenr values. 
Not doing so, could 7530 * make us clone the wrong data in case there are new extents using the old 7531 * disk_bytenr that happen to be shared. 7532 */ 7533 static int restart_after_relocation(struct btrfs_path *left_path, 7534 struct btrfs_path *right_path, 7535 const struct btrfs_key *left_key, 7536 const struct btrfs_key *right_key, 7537 int left_level, 7538 int right_level, 7539 const struct send_ctx *sctx) 7540 { 7541 int root_level; 7542 int ret; 7543 7544 lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); 7545 7546 btrfs_release_path(left_path); 7547 btrfs_release_path(right_path); 7548 7549 /* 7550 * Since keys can not be added or removed to/from our roots because they 7551 * are readonly and we do not allow deduplication to run in parallel 7552 * (which can add, remove or change keys), the layout of the trees should 7553 * not change. 7554 */ 7555 left_path->lowest_level = left_level; 7556 ret = search_key_again(sctx, sctx->send_root, left_path, left_key); 7557 if (ret < 0) 7558 return ret; 7559 7560 right_path->lowest_level = right_level; 7561 ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); 7562 if (ret < 0) 7563 return ret; 7564 7565 /* 7566 * If the lowest level nodes are leaves, clone them so that they can be 7567 * safely used by changed_cb() while not under the protection of the 7568 * commit root semaphore, even if relocation and reallocation happens in 7569 * parallel. 7570 */ 7571 if (left_level == 0) { 7572 ret = replace_node_with_clone(left_path, 0); 7573 if (ret < 0) 7574 return ret; 7575 } 7576 7577 if (right_level == 0) { 7578 ret = replace_node_with_clone(right_path, 0); 7579 if (ret < 0) 7580 return ret; 7581 } 7582 7583 /* 7584 * Now clone the root nodes (unless they happen to be the leaves we have 7585 * already cloned). This is to protect against concurrent snapshotting of 7586 * the send and parent roots (see the comment at btrfs_compare_trees()). 
7587 */ 7588 root_level = btrfs_header_level(sctx->send_root->commit_root); 7589 if (root_level > 0) { 7590 ret = replace_node_with_clone(left_path, root_level); 7591 if (ret < 0) 7592 return ret; 7593 } 7594 7595 root_level = btrfs_header_level(sctx->parent_root->commit_root); 7596 if (root_level > 0) { 7597 ret = replace_node_with_clone(right_path, root_level); 7598 if (ret < 0) 7599 return ret; 7600 } 7601 7602 return 0; 7603 } 7604 7605 /* 7606 * This function compares two trees and calls the provided callback for 7607 * every changed/new/deleted item it finds. 7608 * If shared tree blocks are encountered, whole subtrees are skipped, making 7609 * the compare pretty fast on snapshotted subvolumes. 7610 * 7611 * This currently works on commit roots only. As commit roots are read only, 7612 * we don't do any locking. The commit roots are protected with transactions. 7613 * Transactions are ended and rejoined when a commit is tried in between. 7614 * 7615 * This function checks for modifications done to the trees while comparing. 7616 * If it detects a change, it aborts immediately. 
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scratch buffer handed to tree_compare_item() for item comparison. */
	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	/* Load the first key of each root; leaves hold items, nodes hold key pointers. */
	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;

	while (1) {
		/*
		 * Briefly drop the commit root semaphore if we should
		 * reschedule or someone else is contending for it.
		 */
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		/*
		 * last_reloc_trans moved forward since we last sampled it:
		 * re-search both paths at the current keys and levels before
		 * going on (see restart_after_relocation()).
		 */
		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		/*
		 * Perform the advances requested by the previous iteration.
		 * tree_advance() returning -1 means the end of that tree was
		 * reached, any other negative value is an error.
		 */
		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			/* Only the right tree has items left: report them as deleted. */
			if (right_level == 0) {
				/*
				 * changed_cb() must not run with the commit
				 * root semaphore held, so drop it around the
				 * call. On error we jump to 'out' (not
				 * 'out_unlock') because it is not held.
				 */
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			/* Only the left tree has items left: report them as new. */
			if (left_level == 0) {
				/* Same semaphore handling as in the branch above. */
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			/*
			 * Both paths are on leaves. The semaphore is released
			 * for the whole branch since every arm calls
			 * changed_cb(); it is taken again at the end.
			 */
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				/* Same key on both sides: compare the item contents. */
				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			/* Semaphore not held here, hence 'out' and not 'out_unlock'. */
			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			/* Levels differ: advance the tree that is at the higher level. */
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}

/*
 * Send one subvolume: call send_header() unless
 * BTRFS_SEND_FLAG_OMIT_STREAM_HEADER is set, then send_subvol_begin(), then
 * either compare the send root against the parent root (when there is one)
 * or walk the whole send root. Recorded refs are freed on every exit path.
 */
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		/* Finish whatever inode was still in progress when the compare ended. */
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}

/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error.
So make 7944 * sure a send does not see any orphans we have just removed, and that it will 7945 * see the same inodes regardless of whether a transaction commit happened 7946 * before it started (meaning that the commit root will be the same as the 7947 * current root) or not. 7948 */ 7949 static int ensure_commit_roots_uptodate(struct send_ctx *sctx) 7950 { 7951 int i; 7952 struct btrfs_trans_handle *trans = NULL; 7953 7954 again: 7955 if (sctx->parent_root && 7956 sctx->parent_root->node != sctx->parent_root->commit_root) 7957 goto commit_trans; 7958 7959 for (i = 0; i < sctx->clone_roots_cnt; i++) 7960 if (sctx->clone_roots[i].root->node != 7961 sctx->clone_roots[i].root->commit_root) 7962 goto commit_trans; 7963 7964 if (trans) 7965 return btrfs_end_transaction(trans); 7966 7967 return 0; 7968 7969 commit_trans: 7970 /* Use any root, all fs roots will get their commit roots updated. */ 7971 if (!trans) { 7972 trans = btrfs_join_transaction(sctx->send_root); 7973 if (IS_ERR(trans)) 7974 return PTR_ERR(trans); 7975 goto again; 7976 } 7977 7978 return btrfs_commit_transaction(trans); 7979 } 7980 7981 /* 7982 * Make sure any existing dellaloc is flushed for any root used by a send 7983 * operation so that we do not miss any data and we do not race with writeback 7984 * finishing and changing a tree while send is using the tree. This could 7985 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and 7986 * a send operation then uses the subvolume. 7987 * After flushing delalloc ensure_commit_roots_uptodate() must be called. 
7988 */ 7989 static int flush_delalloc_roots(struct send_ctx *sctx) 7990 { 7991 struct btrfs_root *root = sctx->parent_root; 7992 int ret; 7993 int i; 7994 7995 if (root) { 7996 ret = btrfs_start_delalloc_snapshot(root, false); 7997 if (ret) 7998 return ret; 7999 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); 8000 } 8001 8002 for (i = 0; i < sctx->clone_roots_cnt; i++) { 8003 root = sctx->clone_roots[i].root; 8004 ret = btrfs_start_delalloc_snapshot(root, false); 8005 if (ret) 8006 return ret; 8007 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); 8008 } 8009 8010 return 0; 8011 } 8012 8013 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 8014 { 8015 spin_lock(&root->root_item_lock); 8016 root->send_in_progress--; 8017 /* 8018 * Not much left to do, we don't know why it's unbalanced and 8019 * can't blindly reset it to 0. 8020 */ 8021 if (root->send_in_progress < 0) 8022 btrfs_err(root->fs_info, 8023 "send_in_progress unbalanced %d root %llu", 8024 root->send_in_progress, root->root_key.objectid); 8025 spin_unlock(&root->root_item_lock); 8026 } 8027 8028 static void dedupe_in_progress_warn(const struct btrfs_root *root) 8029 { 8030 btrfs_warn_rl(root->fs_info, 8031 "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", 8032 root->root_key.objectid, root->dedupe_in_progress); 8033 } 8034 8035 long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) 8036 { 8037 int ret = 0; 8038 struct btrfs_root *send_root = BTRFS_I(inode)->root; 8039 struct btrfs_fs_info *fs_info = send_root->fs_info; 8040 struct btrfs_root *clone_root; 8041 struct send_ctx *sctx = NULL; 8042 u32 i; 8043 u64 *clone_sources_tmp = NULL; 8044 int clone_sources_to_rollback = 0; 8045 size_t alloc_size; 8046 int sort_clone_roots = 0; 8047 8048 if (!capable(CAP_SYS_ADMIN)) 8049 return -EPERM; 8050 8051 /* 8052 * The subvolume must remain read-only during send, protect against 8053 * making it RW. 
This also protects against deletion. 8054 */ 8055 spin_lock(&send_root->root_item_lock); 8056 if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { 8057 dedupe_in_progress_warn(send_root); 8058 spin_unlock(&send_root->root_item_lock); 8059 return -EAGAIN; 8060 } 8061 send_root->send_in_progress++; 8062 spin_unlock(&send_root->root_item_lock); 8063 8064 /* 8065 * Userspace tools do the checks and warn the user if it's 8066 * not RO. 8067 */ 8068 if (!btrfs_root_readonly(send_root)) { 8069 ret = -EPERM; 8070 goto out; 8071 } 8072 8073 /* 8074 * Check that we don't overflow at later allocations, we request 8075 * clone_sources_count + 1 items, and compare to unsigned long inside 8076 * access_ok. Also set an upper limit for allocation size so this can't 8077 * easily exhaust memory. Max number of clone sources is about 200K. 8078 */ 8079 if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { 8080 ret = -EINVAL; 8081 goto out; 8082 } 8083 8084 if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { 8085 ret = -EINVAL; 8086 goto out; 8087 } 8088 8089 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL); 8090 if (!sctx) { 8091 ret = -ENOMEM; 8092 goto out; 8093 } 8094 8095 INIT_LIST_HEAD(&sctx->new_refs); 8096 INIT_LIST_HEAD(&sctx->deleted_refs); 8097 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); 8098 INIT_LIST_HEAD(&sctx->name_cache_list); 8099 8100 INIT_LIST_HEAD(&sctx->backref_cache.lru_list); 8101 mt_init(&sctx->backref_cache.entries); 8102 8103 sctx->flags = arg->flags; 8104 8105 if (arg->flags & BTRFS_SEND_FLAG_VERSION) { 8106 if (arg->version > BTRFS_SEND_STREAM_VERSION) { 8107 ret = -EPROTO; 8108 goto out; 8109 } 8110 /* Zero means "use the highest version" */ 8111 sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; 8112 } else { 8113 sctx->proto = 1; 8114 } 8115 if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { 8116 ret = -EINVAL; 8117 goto out; 8118 } 8119 8120 sctx->send_filp = fget(arg->send_fd); 8121 if 
(!sctx->send_filp) { 8122 ret = -EBADF; 8123 goto out; 8124 } 8125 8126 sctx->send_root = send_root; 8127 /* 8128 * Unlikely but possible, if the subvolume is marked for deletion but 8129 * is slow to remove the directory entry, send can still be started 8130 */ 8131 if (btrfs_root_dead(sctx->send_root)) { 8132 ret = -EPERM; 8133 goto out; 8134 } 8135 8136 sctx->clone_roots_cnt = arg->clone_sources_count; 8137 8138 if (sctx->proto >= 2) { 8139 u32 send_buf_num_pages; 8140 8141 sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; 8142 sctx->send_buf = vmalloc(sctx->send_max_size); 8143 if (!sctx->send_buf) { 8144 ret = -ENOMEM; 8145 goto out; 8146 } 8147 send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; 8148 sctx->send_buf_pages = kcalloc(send_buf_num_pages, 8149 sizeof(*sctx->send_buf_pages), 8150 GFP_KERNEL); 8151 if (!sctx->send_buf_pages) { 8152 ret = -ENOMEM; 8153 goto out; 8154 } 8155 for (i = 0; i < send_buf_num_pages; i++) { 8156 sctx->send_buf_pages[i] = 8157 vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); 8158 } 8159 } else { 8160 sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; 8161 sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); 8162 } 8163 if (!sctx->send_buf) { 8164 ret = -ENOMEM; 8165 goto out; 8166 } 8167 8168 sctx->pending_dir_moves = RB_ROOT; 8169 sctx->waiting_dir_moves = RB_ROOT; 8170 sctx->orphan_dirs = RB_ROOT; 8171 sctx->rbtree_new_refs = RB_ROOT; 8172 sctx->rbtree_deleted_refs = RB_ROOT; 8173 8174 sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), 8175 arg->clone_sources_count + 1, 8176 GFP_KERNEL); 8177 if (!sctx->clone_roots) { 8178 ret = -ENOMEM; 8179 goto out; 8180 } 8181 8182 alloc_size = array_size(sizeof(*arg->clone_sources), 8183 arg->clone_sources_count); 8184 8185 if (arg->clone_sources_count) { 8186 clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); 8187 if (!clone_sources_tmp) { 8188 ret = -ENOMEM; 8189 goto out; 8190 } 8191 8192 ret = copy_from_user(clone_sources_tmp, arg->clone_sources, 8193 alloc_size); 
8194 if (ret) { 8195 ret = -EFAULT; 8196 goto out; 8197 } 8198 8199 for (i = 0; i < arg->clone_sources_count; i++) { 8200 clone_root = btrfs_get_fs_root(fs_info, 8201 clone_sources_tmp[i], true); 8202 if (IS_ERR(clone_root)) { 8203 ret = PTR_ERR(clone_root); 8204 goto out; 8205 } 8206 spin_lock(&clone_root->root_item_lock); 8207 if (!btrfs_root_readonly(clone_root) || 8208 btrfs_root_dead(clone_root)) { 8209 spin_unlock(&clone_root->root_item_lock); 8210 btrfs_put_root(clone_root); 8211 ret = -EPERM; 8212 goto out; 8213 } 8214 if (clone_root->dedupe_in_progress) { 8215 dedupe_in_progress_warn(clone_root); 8216 spin_unlock(&clone_root->root_item_lock); 8217 btrfs_put_root(clone_root); 8218 ret = -EAGAIN; 8219 goto out; 8220 } 8221 clone_root->send_in_progress++; 8222 spin_unlock(&clone_root->root_item_lock); 8223 8224 sctx->clone_roots[i].root = clone_root; 8225 clone_sources_to_rollback = i + 1; 8226 } 8227 kvfree(clone_sources_tmp); 8228 clone_sources_tmp = NULL; 8229 } 8230 8231 if (arg->parent_root) { 8232 sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, 8233 true); 8234 if (IS_ERR(sctx->parent_root)) { 8235 ret = PTR_ERR(sctx->parent_root); 8236 goto out; 8237 } 8238 8239 spin_lock(&sctx->parent_root->root_item_lock); 8240 sctx->parent_root->send_in_progress++; 8241 if (!btrfs_root_readonly(sctx->parent_root) || 8242 btrfs_root_dead(sctx->parent_root)) { 8243 spin_unlock(&sctx->parent_root->root_item_lock); 8244 ret = -EPERM; 8245 goto out; 8246 } 8247 if (sctx->parent_root->dedupe_in_progress) { 8248 dedupe_in_progress_warn(sctx->parent_root); 8249 spin_unlock(&sctx->parent_root->root_item_lock); 8250 ret = -EAGAIN; 8251 goto out; 8252 } 8253 spin_unlock(&sctx->parent_root->root_item_lock); 8254 } 8255 8256 /* 8257 * Clones from send_root are allowed, but only if the clone source 8258 * is behind the current send position. This is checked while searching 8259 * for possible clone sources. 
8260 */ 8261 sctx->clone_roots[sctx->clone_roots_cnt++].root = 8262 btrfs_grab_root(sctx->send_root); 8263 8264 /* We do a bsearch later */ 8265 sort(sctx->clone_roots, sctx->clone_roots_cnt, 8266 sizeof(*sctx->clone_roots), __clone_root_cmp_sort, 8267 NULL); 8268 sort_clone_roots = 1; 8269 8270 ret = flush_delalloc_roots(sctx); 8271 if (ret) 8272 goto out; 8273 8274 ret = ensure_commit_roots_uptodate(sctx); 8275 if (ret) 8276 goto out; 8277 8278 ret = send_subvol(sctx); 8279 if (ret < 0) 8280 goto out; 8281 8282 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { 8283 ret = begin_cmd(sctx, BTRFS_SEND_C_END); 8284 if (ret < 0) 8285 goto out; 8286 ret = send_cmd(sctx); 8287 if (ret < 0) 8288 goto out; 8289 } 8290 8291 out: 8292 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); 8293 while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { 8294 struct rb_node *n; 8295 struct pending_dir_move *pm; 8296 8297 n = rb_first(&sctx->pending_dir_moves); 8298 pm = rb_entry(n, struct pending_dir_move, node); 8299 while (!list_empty(&pm->list)) { 8300 struct pending_dir_move *pm2; 8301 8302 pm2 = list_first_entry(&pm->list, 8303 struct pending_dir_move, list); 8304 free_pending_move(sctx, pm2); 8305 } 8306 free_pending_move(sctx, pm); 8307 } 8308 8309 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); 8310 while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { 8311 struct rb_node *n; 8312 struct waiting_dir_move *dm; 8313 8314 n = rb_first(&sctx->waiting_dir_moves); 8315 dm = rb_entry(n, struct waiting_dir_move, node); 8316 rb_erase(&dm->node, &sctx->waiting_dir_moves); 8317 kfree(dm); 8318 } 8319 8320 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); 8321 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { 8322 struct rb_node *n; 8323 struct orphan_dir_info *odi; 8324 8325 n = rb_first(&sctx->orphan_dirs); 8326 odi = rb_entry(n, struct orphan_dir_info, node); 8327 free_orphan_dir_info(sctx, odi); 8328 } 8329 8330 if 
(sort_clone_roots) { 8331 for (i = 0; i < sctx->clone_roots_cnt; i++) { 8332 btrfs_root_dec_send_in_progress( 8333 sctx->clone_roots[i].root); 8334 btrfs_put_root(sctx->clone_roots[i].root); 8335 } 8336 } else { 8337 for (i = 0; sctx && i < clone_sources_to_rollback; i++) { 8338 btrfs_root_dec_send_in_progress( 8339 sctx->clone_roots[i].root); 8340 btrfs_put_root(sctx->clone_roots[i].root); 8341 } 8342 8343 btrfs_root_dec_send_in_progress(send_root); 8344 } 8345 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { 8346 btrfs_root_dec_send_in_progress(sctx->parent_root); 8347 btrfs_put_root(sctx->parent_root); 8348 } 8349 8350 kvfree(clone_sources_tmp); 8351 8352 if (sctx) { 8353 if (sctx->send_filp) 8354 fput(sctx->send_filp); 8355 8356 kvfree(sctx->clone_roots); 8357 kfree(sctx->send_buf_pages); 8358 kvfree(sctx->send_buf); 8359 kvfree(sctx->verity_descriptor); 8360 8361 name_cache_free(sctx); 8362 8363 close_current_inode(sctx); 8364 8365 empty_backref_cache(sctx); 8366 8367 kfree(sctx); 8368 } 8369 8370 return ret; 8371 } 8372