1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Alexander Block. All rights reserved. 4 */ 5 6 #include <linux/bsearch.h> 7 #include <linux/fs.h> 8 #include <linux/file.h> 9 #include <linux/sort.h> 10 #include <linux/mount.h> 11 #include <linux/xattr.h> 12 #include <linux/posix_acl_xattr.h> 13 #include <linux/radix-tree.h> 14 #include <linux/vmalloc.h> 15 #include <linux/string.h> 16 #include <linux/compat.h> 17 #include <linux/crc32c.h> 18 19 #include "send.h" 20 #include "backref.h" 21 #include "locking.h" 22 #include "disk-io.h" 23 #include "btrfs_inode.h" 24 #include "transaction.h" 25 #include "compression.h" 26 #include "xattr.h" 27 28 /* 29 * Maximum number of references an extent can have in order for us to attempt to 30 * issue clone operations instead of write operations. This currently exists to 31 * avoid hitting limitations of the backreference walking code (taking a lot of 32 * time and using too much memory for extents with large number of references). 33 */ 34 #define SEND_MAX_EXTENT_REFS 64 35 36 /* 37 * A fs_path is a helper to dynamically build path names with unknown size. 38 * It reallocates the internal buffer on demand. 39 * It allows fast adding of path elements on the right side (normal path) and 40 * fast adding to the left side (reversed path). A reversed path can also be 41 * unreversed if needed. 42 */ 43 struct fs_path { 44 union { 45 struct { 46 char *start; 47 char *end; 48 49 char *buf; 50 unsigned short buf_len:15; 51 unsigned short reversed:1; 52 char inline_buf[]; 53 }; 54 /* 55 * Average path length does not exceed 200 bytes, we'll have 56 * better packing in the slab and higher chance to satisfy 57 * a allocation later during send. 58 */ 59 char pad[256]; 60 }; 61 }; 62 #define FS_PATH_INLINE_SIZE \ 63 (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) 64 65 66 /* reused for each extent */ 67 struct clone_root { 68 struct btrfs_root *root; 69 u64 ino; 70 u64 offset; 71 72 u64 found_refs; 73 }; 74 75 #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 76 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) 77 78 struct send_ctx { 79 struct file *send_filp; 80 loff_t send_off; 81 char *send_buf; 82 u32 send_size; 83 u32 send_max_size; 84 u64 total_send_size; 85 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 86 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ 87 /* Protocol version compatibility requested */ 88 u32 proto; 89 90 struct btrfs_root *send_root; 91 struct btrfs_root *parent_root; 92 struct clone_root *clone_roots; 93 int clone_roots_cnt; 94 95 /* current state of the compare_tree call */ 96 struct btrfs_path *left_path; 97 struct btrfs_path *right_path; 98 struct btrfs_key *cmp_key; 99 100 /* 101 * infos of the currently processed inode. In case of deleted inodes, 102 * these are the values from the deleted inode. 103 */ 104 u64 cur_ino; 105 u64 cur_inode_gen; 106 int cur_inode_new; 107 int cur_inode_new_gen; 108 int cur_inode_deleted; 109 u64 cur_inode_size; 110 u64 cur_inode_mode; 111 u64 cur_inode_rdev; 112 u64 cur_inode_last_extent; 113 u64 cur_inode_next_write_offset; 114 bool ignore_cur_inode; 115 116 u64 send_progress; 117 118 struct list_head new_refs; 119 struct list_head deleted_refs; 120 121 struct radix_tree_root name_cache; 122 struct list_head name_cache_list; 123 int name_cache_size; 124 125 struct file_ra_state ra; 126 127 /* 128 * We process inodes by their increasing order, so if before an 129 * incremental send we reverse the parent/child relationship of 130 * directories such that a directory with a lower inode number was 131 * the parent of a directory with a higher inode number, and the one 132 * becoming the new parent got renamed too, we can't rename/move the 133 * directory with lower inode number when we finish processing it - we 134 * must process the directory with higher inode number first, then 135 * rename/move it and then rename/move the directory with lower inode 136 * number. Example follows. 137 * 138 * Tree state when the first send was performed: 139 * 140 * . 141 * |-- a (ino 257) 142 * |-- b (ino 258) 143 * | 144 * | 145 * |-- c (ino 259) 146 * | |-- d (ino 260) 147 * | 148 * |-- c2 (ino 261) 149 * 150 * Tree state when the second (incremental) send is performed: 151 * 152 * . 153 * |-- a (ino 257) 154 * |-- b (ino 258) 155 * |-- c2 (ino 261) 156 * |-- d2 (ino 260) 157 * |-- cc (ino 259) 158 * 159 * The sequence of steps that lead to the second state was: 160 * 161 * mv /a/b/c/d /a/b/c2/d2 162 * mv /a/b/c /a/b/c2/d2/cc 163 * 164 * "c" has lower inode number, but we can't move it (2nd mv operation) 165 * before we move "d", which has higher inode number. 166 * 167 * So we just memorize which move/rename operations must be performed 168 * later when their respective parent is processed and moved/renamed. 169 */ 170 171 /* Indexed by parent directory inode number. */ 172 struct rb_root pending_dir_moves; 173 174 /* 175 * Reverse index, indexed by the inode number of a directory that 176 * is waiting for the move/rename of its immediate parent before its 177 * own move/rename can be performed. 178 */ 179 struct rb_root waiting_dir_moves; 180 181 /* 182 * A directory that is going to be rm'ed might have a child directory 183 * which is in the pending directory moves index above. In this case, 184 * the directory can only be removed after the move/rename of its child 185 * is performed. Example: 186 * 187 * Parent snapshot: 188 * 189 * . (ino 256) 190 * |-- a/ (ino 257) 191 * |-- b/ (ino 258) 192 * |-- c/ (ino 259) 193 * | |-- x/ (ino 260) 194 * | 195 * |-- y/ (ino 261) 196 * 197 * Send snapshot: 198 * 199 * . (ino 256) 200 * |-- a/ (ino 257) 201 * |-- b/ (ino 258) 202 * |-- YY/ (ino 261) 203 * |-- x/ (ino 260) 204 * 205 * Sequence of steps that lead to the send snapshot: 206 * rm -f /a/b/c/foo.txt 207 * mv /a/b/y /a/b/YY 208 * mv /a/b/c/x /a/b/YY 209 * rmdir /a/b/c 210 * 211 * When the child is processed, its move/rename is delayed until its 212 * parent is processed (as explained above), but all other operations 213 * like update utimes, chown, chgrp, etc, are performed and the paths 214 * that it uses for those operations must use the orphanized name of 215 * its parent (the directory we're going to rm later), so we need to 216 * memorize that name. 217 * 218 * Indexed by the inode number of the directory to be deleted. 219 */ 220 struct rb_root orphan_dirs; 221 }; 222 223 struct pending_dir_move { 224 struct rb_node node; 225 struct list_head list; 226 u64 parent_ino; 227 u64 ino; 228 u64 gen; 229 struct list_head update_refs; 230 }; 231 232 struct waiting_dir_move { 233 struct rb_node node; 234 u64 ino; 235 /* 236 * There might be some directory that could not be removed because it 237 * was waiting for this directory inode to be moved first. Therefore 238 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 239 */ 240 u64 rmdir_ino; 241 u64 rmdir_gen; 242 bool orphanized; 243 }; 244 245 struct orphan_dir_info { 246 struct rb_node node; 247 u64 ino; 248 u64 gen; 249 u64 last_dir_index_offset; 250 }; 251 252 struct name_cache_entry { 253 struct list_head list; 254 /* 255 * radix_tree has only 32bit entries but we need to handle 64bit inums. 256 * We use the lower 32bit of the 64bit inum to store it in the tree. If 257 * more then one inum would fall into the same entry, we use radix_list 258 * to store the additional entries. radix_list is also used to store 259 * entries where two entries have the same inum but different 260 * generations. 261 */ 262 struct list_head radix_list; 263 u64 ino; 264 u64 gen; 265 u64 parent_ino; 266 u64 parent_gen; 267 int ret; 268 int need_later_update; 269 int name_len; 270 char name[]; 271 }; 272 273 #define ADVANCE 1 274 #define ADVANCE_ONLY_NEXT -1 275 276 enum btrfs_compare_tree_result { 277 BTRFS_COMPARE_TREE_NEW, 278 BTRFS_COMPARE_TREE_DELETED, 279 BTRFS_COMPARE_TREE_CHANGED, 280 BTRFS_COMPARE_TREE_SAME, 281 }; 282 283 __cold 284 static void inconsistent_snapshot_error(struct send_ctx *sctx, 285 enum btrfs_compare_tree_result result, 286 const char *what) 287 { 288 const char *result_string; 289 290 switch (result) { 291 case BTRFS_COMPARE_TREE_NEW: 292 result_string = "new"; 293 break; 294 case BTRFS_COMPARE_TREE_DELETED: 295 result_string = "deleted"; 296 break; 297 case BTRFS_COMPARE_TREE_CHANGED: 298 result_string = "updated"; 299 break; 300 case BTRFS_COMPARE_TREE_SAME: 301 ASSERT(0); 302 result_string = "unchanged"; 303 break; 304 default: 305 ASSERT(0); 306 result_string = "unexpected"; 307 } 308 309 btrfs_err(sctx->send_root->fs_info, 310 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", 311 result_string, what, sctx->cmp_key->objectid, 312 sctx->send_root->root_key.objectid, 313 (sctx->parent_root ? 314 sctx->parent_root->root_key.objectid : 0)); 315 } 316 317 __maybe_unused 318 static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) 319 { 320 switch (sctx->proto) { 321 case 1: return cmd < __BTRFS_SEND_C_MAX_V1; 322 case 2: return cmd < __BTRFS_SEND_C_MAX_V2; 323 default: return false; 324 } 325 } 326 327 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 328 329 static struct waiting_dir_move * 330 get_waiting_dir_move(struct send_ctx *sctx, u64 ino); 331 332 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); 333 334 static int need_send_hole(struct send_ctx *sctx) 335 { 336 return (sctx->parent_root && !sctx->cur_inode_new && 337 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && 338 S_ISREG(sctx->cur_inode_mode)); 339 } 340 341 static void fs_path_reset(struct fs_path *p) 342 { 343 if (p->reversed) { 344 p->start = p->buf + p->buf_len - 1; 345 p->end = p->start; 346 *p->start = 0; 347 } else { 348 p->start = p->buf; 349 p->end = p->start; 350 *p->start = 0; 351 } 352 } 353 354 static struct fs_path *fs_path_alloc(void) 355 { 356 struct fs_path *p; 357 358 p = kmalloc(sizeof(*p), GFP_KERNEL); 359 if (!p) 360 return NULL; 361 p->reversed = 0; 362 p->buf = p->inline_buf; 363 p->buf_len = FS_PATH_INLINE_SIZE; 364 fs_path_reset(p); 365 return p; 366 } 367 368 static struct fs_path *fs_path_alloc_reversed(void) 369 { 370 struct fs_path *p; 371 372 p = fs_path_alloc(); 373 if (!p) 374 return NULL; 375 p->reversed = 1; 376 fs_path_reset(p); 377 return p; 378 } 379 380 static void fs_path_free(struct fs_path *p) 381 { 382 if (!p) 383 return; 384 if (p->buf != p->inline_buf) 385 kfree(p->buf); 386 kfree(p); 387 } 388 389 static int fs_path_len(struct fs_path *p) 390 { 391 return p->end - p->start; 392 } 393 394 static int fs_path_ensure_buf(struct fs_path *p, int len) 395 { 396 char *tmp_buf; 397 int path_len; 398 int old_buf_len; 399 400 len++; 401 402 if (p->buf_len >= len) 403 return 0; 404 405 if (len > PATH_MAX) { 406 WARN_ON(1); 407 return -ENOMEM; 408 } 409 410 path_len = p->end - p->start; 411 old_buf_len = p->buf_len; 412 413 /* 414 * First time the inline_buf does not suffice 415 */ 416 if (p->buf == p->inline_buf) { 417 tmp_buf = kmalloc(len, GFP_KERNEL); 418 if (tmp_buf) 419 memcpy(tmp_buf, p->buf, old_buf_len); 420 } else { 421 tmp_buf = krealloc(p->buf, len, GFP_KERNEL); 422 } 423 if (!tmp_buf) 424 return -ENOMEM; 425 p->buf = tmp_buf; 426 /* 427 * The real size of the buffer is bigger, this will let the fast path 428 * happen most of the time 429 */ 430 p->buf_len = ksize(p->buf); 431 432 if (p->reversed) { 433 tmp_buf = p->buf + old_buf_len - path_len - 1; 434 p->end = p->buf + p->buf_len - 1; 435 p->start = p->end - path_len; 436 memmove(p->start, tmp_buf, path_len + 1); 437 } else { 438 p->start = p->buf; 439 p->end = p->start + path_len; 440 } 441 return 0; 442 } 443 444 static int fs_path_prepare_for_add(struct fs_path *p, int name_len, 445 char **prepared) 446 { 447 int ret; 448 int new_len; 449 450 new_len = p->end - p->start + name_len; 451 if (p->start != p->end) 452 new_len++; 453 ret = fs_path_ensure_buf(p, new_len); 454 if (ret < 0) 455 goto out; 456 457 if (p->reversed) { 458 if (p->start != p->end) 459 *--p->start = '/'; 460 p->start -= name_len; 461 *prepared = p->start; 462 } else { 463 if (p->start != p->end) 464 *p->end++ = '/'; 465 *prepared = p->end; 466 p->end += name_len; 467 *p->end = 0; 468 } 469 470 out: 471 return ret; 472 } 473 474 static int fs_path_add(struct fs_path *p, const char *name, int name_len) 475 { 476 int ret; 477 char *prepared; 478 479 ret = fs_path_prepare_for_add(p, name_len, &prepared); 480 if (ret < 0) 481 goto out; 482 memcpy(prepared, name, name_len); 483 484 out: 485 return ret; 486 } 487 488 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 489 { 490 int ret; 491 char *prepared; 492 493 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); 494 if (ret < 0) 495 goto out; 496 memcpy(prepared, p2->start, p2->end - p2->start); 497 498 out: 499 return ret; 500 } 501 502 static int fs_path_add_from_extent_buffer(struct fs_path *p, 503 struct extent_buffer *eb, 504 unsigned long off, int len) 505 { 506 int ret; 507 char *prepared; 508 509 ret = fs_path_prepare_for_add(p, len, &prepared); 510 if (ret < 0) 511 goto out; 512 513 read_extent_buffer(eb, prepared, off, len); 514 515 out: 516 return ret; 517 } 518 519 static int fs_path_copy(struct fs_path *p, struct fs_path *from) 520 { 521 int ret; 522 523 p->reversed = from->reversed; 524 fs_path_reset(p); 525 526 ret = fs_path_add_path(p, from); 527 528 return ret; 529 } 530 531 532 static void fs_path_unreverse(struct fs_path *p) 533 { 534 char *tmp; 535 int len; 536 537 if (!p->reversed) 538 return; 539 540 tmp = p->start; 541 len = p->end - p->start; 542 p->start = p->buf; 543 p->end = p->start + len; 544 memmove(p->start, tmp, len + 1); 545 p->reversed = 0; 546 } 547 548 static struct btrfs_path *alloc_path_for_send(void) 549 { 550 struct btrfs_path *path; 551 552 path = btrfs_alloc_path(); 553 if (!path) 554 return NULL; 555 path->search_commit_root = 1; 556 path->skip_locking = 1; 557 path->need_commit_sem = 1; 558 return path; 559 } 560 561 static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) 562 { 563 int ret; 564 u32 pos = 0; 565 566 while (pos < len) { 567 ret = kernel_write(filp, buf + pos, len - pos, off); 568 /* TODO handle that correctly */ 569 /*if (ret == -ERESTARTSYS) { 570 continue; 571 }*/ 572 if (ret < 0) 573 return ret; 574 if (ret == 0) { 575 return -EIO; 576 } 577 pos += ret; 578 } 579 580 return 0; 581 } 582 583 static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) 584 { 585 struct btrfs_tlv_header *hdr; 586 int total_len = sizeof(*hdr) + len; 587 int left = sctx->send_max_size - sctx->send_size; 588 589 if (unlikely(left < total_len)) 590 return -EOVERFLOW; 591 592 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); 593 put_unaligned_le16(attr, &hdr->tlv_type); 594 put_unaligned_le16(len, &hdr->tlv_len); 595 memcpy(hdr + 1, data, len); 596 sctx->send_size += total_len; 597 598 return 0; 599 } 600 601 #define TLV_PUT_DEFINE_INT(bits) \ 602 static int tlv_put_u##bits(struct send_ctx *sctx, \ 603 u##bits attr, u##bits value) \ 604 { \ 605 __le##bits __tmp = cpu_to_le##bits(value); \ 606 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ 607 } 608 609 TLV_PUT_DEFINE_INT(64) 610 611 static int tlv_put_string(struct send_ctx *sctx, u16 attr, 612 const char *str, int len) 613 { 614 if (len == -1) 615 len = strlen(str); 616 return tlv_put(sctx, attr, str, len); 617 } 618 619 static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, 620 const u8 *uuid) 621 { 622 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); 623 } 624 625 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, 626 struct extent_buffer *eb, 627 struct btrfs_timespec *ts) 628 { 629 struct btrfs_timespec bts; 630 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); 631 return tlv_put(sctx, attr, &bts, sizeof(bts)); 632 } 633 634 635 #define TLV_PUT(sctx, attrtype, data, attrlen) \ 636 do { \ 637 ret = tlv_put(sctx, attrtype, data, attrlen); \ 638 if (ret < 0) \ 639 goto tlv_put_failure; \ 640 } while (0) 641 642 #define TLV_PUT_INT(sctx, attrtype, bits, value) \ 643 do { \ 644 ret = tlv_put_u##bits(sctx, attrtype, value); \ 645 if (ret < 0) \ 646 goto tlv_put_failure; \ 647 } while (0) 648 649 #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) 650 #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) 651 #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) 652 #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) 653 #define TLV_PUT_STRING(sctx, attrtype, str, len) \ 654 do { \ 655 ret = tlv_put_string(sctx, attrtype, str, len); \ 656 if (ret < 0) \ 657 goto tlv_put_failure; \ 658 } while (0) 659 #define TLV_PUT_PATH(sctx, attrtype, p) \ 660 do { \ 661 ret = tlv_put_string(sctx, attrtype, p->start, \ 662 p->end - p->start); \ 663 if (ret < 0) \ 664 goto tlv_put_failure; \ 665 } while(0) 666 #define TLV_PUT_UUID(sctx, attrtype, uuid) \ 667 do { \ 668 ret = tlv_put_uuid(sctx, attrtype, uuid); \ 669 if (ret < 0) \ 670 goto tlv_put_failure; \ 671 } while (0) 672 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ 673 do { \ 674 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ 675 if (ret < 0) \ 676 goto tlv_put_failure; \ 677 } while (0) 678 679 static int send_header(struct send_ctx *sctx) 680 { 681 struct btrfs_stream_header hdr; 682 683 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 684 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 685 686 return write_buf(sctx->send_filp, &hdr, sizeof(hdr), 687 &sctx->send_off); 688 } 689 690 /* 691 * For each command/item we want to send to userspace, we call this function. 692 */ 693 static int begin_cmd(struct send_ctx *sctx, int cmd) 694 { 695 struct btrfs_cmd_header *hdr; 696 697 if (WARN_ON(!sctx->send_buf)) 698 return -EINVAL; 699 700 BUG_ON(sctx->send_size); 701 702 sctx->send_size += sizeof(*hdr); 703 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 704 put_unaligned_le16(cmd, &hdr->cmd); 705 706 return 0; 707 } 708 709 static int send_cmd(struct send_ctx *sctx) 710 { 711 int ret; 712 struct btrfs_cmd_header *hdr; 713 u32 crc; 714 715 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 716 put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); 717 put_unaligned_le32(0, &hdr->crc); 718 719 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 720 put_unaligned_le32(crc, &hdr->crc); 721 722 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 723 &sctx->send_off); 724 725 sctx->total_send_size += sctx->send_size; 726 sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; 727 sctx->send_size = 0; 728 729 return ret; 730 } 731 732 /* 733 * Sends a move instruction to user space 734 */ 735 static int send_rename(struct send_ctx *sctx, 736 struct fs_path *from, struct fs_path *to) 737 { 738 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 739 int ret; 740 741 btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); 742 743 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); 744 if (ret < 0) 745 goto out; 746 747 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); 748 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); 749 750 ret = send_cmd(sctx); 751 752 tlv_put_failure: 753 out: 754 return ret; 755 } 756 757 /* 758 * Sends a link instruction to user space 759 */ 760 static int send_link(struct send_ctx *sctx, 761 struct fs_path *path, struct fs_path *lnk) 762 { 763 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 764 int ret; 765 766 btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); 767 768 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); 769 if (ret < 0) 770 goto out; 771 772 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 773 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); 774 775 ret = send_cmd(sctx); 776 777 tlv_put_failure: 778 out: 779 return ret; 780 } 781 782 /* 783 * Sends an unlink instruction to user space 784 */ 785 static int send_unlink(struct send_ctx *sctx, struct fs_path *path) 786 { 787 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 788 int ret; 789 790 btrfs_debug(fs_info, "send_unlink %s", path->start); 791 792 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); 793 if (ret < 0) 794 goto out; 795 796 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 797 798 ret = send_cmd(sctx); 799 800 tlv_put_failure: 801 out: 802 return ret; 803 } 804 805 /* 806 * Sends a rmdir instruction to user space 807 */ 808 static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) 809 { 810 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 811 int ret; 812 813 btrfs_debug(fs_info, "send_rmdir %s", path->start); 814 815 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); 816 if (ret < 0) 817 goto out; 818 819 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 820 821 ret = send_cmd(sctx); 822 823 tlv_put_failure: 824 out: 825 return ret; 826 } 827 828 /* 829 * Helper function to retrieve some fields from an inode item. 830 */ 831 static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, 832 u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, 833 u64 *gid, u64 *rdev) 834 { 835 int ret; 836 struct btrfs_inode_item *ii; 837 struct btrfs_key key; 838 839 key.objectid = ino; 840 key.type = BTRFS_INODE_ITEM_KEY; 841 key.offset = 0; 842 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 843 if (ret) { 844 if (ret > 0) 845 ret = -ENOENT; 846 return ret; 847 } 848 849 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 850 struct btrfs_inode_item); 851 if (size) 852 *size = btrfs_inode_size(path->nodes[0], ii); 853 if (gen) 854 *gen = btrfs_inode_generation(path->nodes[0], ii); 855 if (mode) 856 *mode = btrfs_inode_mode(path->nodes[0], ii); 857 if (uid) 858 *uid = btrfs_inode_uid(path->nodes[0], ii); 859 if (gid) 860 *gid = btrfs_inode_gid(path->nodes[0], ii); 861 if (rdev) 862 *rdev = btrfs_inode_rdev(path->nodes[0], ii); 863 864 return ret; 865 } 866 867 static int get_inode_info(struct btrfs_root *root, 868 u64 ino, u64 *size, u64 *gen, 869 u64 *mode, u64 *uid, u64 *gid, 870 u64 *rdev) 871 { 872 struct btrfs_path *path; 873 int ret; 874 875 path = alloc_path_for_send(); 876 if (!path) 877 return -ENOMEM; 878 ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, 879 rdev); 880 btrfs_free_path(path); 881 return ret; 882 } 883 884 typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, 885 struct fs_path *p, 886 void *ctx); 887 888 /* 889 * Helper function to iterate the entries in ONE btrfs_inode_ref or 890 * btrfs_inode_extref. 891 * The iterate callback may return a non zero value to stop iteration. This can 892 * be a negative value for error codes or 1 to simply stop it. 893 * 894 * path must point to the INODE_REF or INODE_EXTREF when called. 895 */ 896 static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, 897 struct btrfs_key *found_key, int resolve, 898 iterate_inode_ref_t iterate, void *ctx) 899 { 900 struct extent_buffer *eb = path->nodes[0]; 901 struct btrfs_item *item; 902 struct btrfs_inode_ref *iref; 903 struct btrfs_inode_extref *extref; 904 struct btrfs_path *tmp_path; 905 struct fs_path *p; 906 u32 cur = 0; 907 u32 total; 908 int slot = path->slots[0]; 909 u32 name_len; 910 char *start; 911 int ret = 0; 912 int num = 0; 913 int index; 914 u64 dir; 915 unsigned long name_off; 916 unsigned long elem_size; 917 unsigned long ptr; 918 919 p = fs_path_alloc_reversed(); 920 if (!p) 921 return -ENOMEM; 922 923 tmp_path = alloc_path_for_send(); 924 if (!tmp_path) { 925 fs_path_free(p); 926 return -ENOMEM; 927 } 928 929 930 if (found_key->type == BTRFS_INODE_REF_KEY) { 931 ptr = (unsigned long)btrfs_item_ptr(eb, slot, 932 struct btrfs_inode_ref); 933 item = btrfs_item_nr(slot); 934 total = btrfs_item_size(eb, item); 935 elem_size = sizeof(*iref); 936 } else { 937 ptr = btrfs_item_ptr_offset(eb, slot); 938 total = btrfs_item_size_nr(eb, slot); 939 elem_size = sizeof(*extref); 940 } 941 942 while (cur < total) { 943 fs_path_reset(p); 944 945 if (found_key->type == BTRFS_INODE_REF_KEY) { 946 iref = (struct btrfs_inode_ref *)(ptr + cur); 947 name_len = btrfs_inode_ref_name_len(eb, iref); 948 name_off = (unsigned long)(iref + 1); 949 index = btrfs_inode_ref_index(eb, iref); 950 dir = found_key->offset; 951 } else { 952 extref = (struct btrfs_inode_extref *)(ptr + cur); 953 name_len = btrfs_inode_extref_name_len(eb, extref); 954 name_off = (unsigned long)&extref->name; 955 index = btrfs_inode_extref_index(eb, extref); 956 dir = btrfs_inode_extref_parent(eb, extref); 957 } 958 959 if (resolve) { 960 start = btrfs_ref_to_path(root, tmp_path, name_len, 961 name_off, eb, dir, 962 p->buf, p->buf_len); 963 if (IS_ERR(start)) { 964 ret = PTR_ERR(start); 965 goto out; 966 } 967 if (start < p->buf) { 968 /* overflow , try again with larger buffer */ 969 ret = fs_path_ensure_buf(p, 970 p->buf_len + p->buf - start); 971 if (ret < 0) 972 goto out; 973 start = btrfs_ref_to_path(root, tmp_path, 974 name_len, name_off, 975 eb, dir, 976 p->buf, p->buf_len); 977 if (IS_ERR(start)) { 978 ret = PTR_ERR(start); 979 goto out; 980 } 981 BUG_ON(start < p->buf); 982 } 983 p->start = start; 984 } else { 985 ret = fs_path_add_from_extent_buffer(p, eb, name_off, 986 name_len); 987 if (ret < 0) 988 goto out; 989 } 990 991 cur += elem_size + name_len; 992 ret = iterate(num, dir, index, p, ctx); 993 if (ret) 994 goto out; 995 num++; 996 } 997 998 out: 999 btrfs_free_path(tmp_path); 1000 fs_path_free(p); 1001 return ret; 1002 } 1003 1004 typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, 1005 const char *name, int name_len, 1006 const char *data, int data_len, 1007 u8 type, void *ctx); 1008 1009 /* 1010 * Helper function to iterate the entries in ONE btrfs_dir_item. 1011 * The iterate callback may return a non zero value to stop iteration. This can 1012 * be a negative value for error codes or 1 to simply stop it. 1013 * 1014 * path must point to the dir item when called. 1015 */ 1016 static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, 1017 iterate_dir_item_t iterate, void *ctx) 1018 { 1019 int ret = 0; 1020 struct extent_buffer *eb; 1021 struct btrfs_item *item; 1022 struct btrfs_dir_item *di; 1023 struct btrfs_key di_key; 1024 char *buf = NULL; 1025 int buf_len; 1026 u32 name_len; 1027 u32 data_len; 1028 u32 cur; 1029 u32 len; 1030 u32 total; 1031 int slot; 1032 int num; 1033 u8 type; 1034 1035 /* 1036 * Start with a small buffer (1 page). If later we end up needing more 1037 * space, which can happen for xattrs on a fs with a leaf size greater 1038 * then the page size, attempt to increase the buffer. Typically xattr 1039 * values are small. 1040 */ 1041 buf_len = PATH_MAX; 1042 buf = kmalloc(buf_len, GFP_KERNEL); 1043 if (!buf) { 1044 ret = -ENOMEM; 1045 goto out; 1046 } 1047 1048 eb = path->nodes[0]; 1049 slot = path->slots[0]; 1050 item = btrfs_item_nr(slot); 1051 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 1052 cur = 0; 1053 len = 0; 1054 total = btrfs_item_size(eb, item); 1055 1056 num = 0; 1057 while (cur < total) { 1058 name_len = btrfs_dir_name_len(eb, di); 1059 data_len = btrfs_dir_data_len(eb, di); 1060 type = btrfs_dir_type(eb, di); 1061 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1062 1063 if (type == BTRFS_FT_XATTR) { 1064 if (name_len > XATTR_NAME_MAX) { 1065 ret = -ENAMETOOLONG; 1066 goto out; 1067 } 1068 if (name_len + data_len > 1069 BTRFS_MAX_XATTR_SIZE(root->fs_info)) { 1070 ret = -E2BIG; 1071 goto out; 1072 } 1073 } else { 1074 /* 1075 * Path too long 1076 */ 1077 if (name_len + data_len > PATH_MAX) { 1078 ret = -ENAMETOOLONG; 1079 goto out; 1080 } 1081 } 1082 1083 if (name_len + data_len > buf_len) { 1084 buf_len = name_len + data_len; 1085 if (is_vmalloc_addr(buf)) { 1086 vfree(buf); 1087 buf = NULL; 1088 } else { 1089 char *tmp = krealloc(buf, buf_len, 1090 GFP_KERNEL | __GFP_NOWARN); 1091 1092 if (!tmp) 1093 kfree(buf); 1094 buf = tmp; 1095 } 1096 if (!buf) { 1097 buf = kvmalloc(buf_len, GFP_KERNEL); 1098 if (!buf) { 1099 ret = -ENOMEM; 1100 goto out; 1101 } 1102 } 1103 } 1104 1105 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1106 name_len + data_len); 1107 1108 len = sizeof(*di) + name_len + data_len; 1109 di = (struct btrfs_dir_item *)((char *)di + len); 1110 cur += len; 1111 1112 ret = iterate(num, &di_key, buf, name_len, buf + name_len, 1113 data_len, type, ctx); 1114 if (ret < 0) 1115 goto out; 1116 if (ret) { 1117 ret = 0; 1118 goto out; 1119 } 1120 1121 num++; 1122 } 1123 1124 out: 1125 kvfree(buf); 1126 return ret; 1127 } 1128 1129 static int __copy_first_ref(int num, u64 dir, int index, 1130 struct fs_path *p, void *ctx) 1131 { 1132 int ret; 1133 struct fs_path *pt = ctx; 1134 1135 ret = fs_path_copy(pt, p); 1136 if (ret < 0) 1137 return ret; 1138 1139 /* we want the first only */ 1140 return 1; 1141 } 1142 1143 /* 1144 * Retrieve the first path of an inode. If an inode has more then one 1145 * ref/hardlink, this is ignored. 1146 */ 1147 static int get_inode_path(struct btrfs_root *root, 1148 u64 ino, struct fs_path *path) 1149 { 1150 int ret; 1151 struct btrfs_key key, found_key; 1152 struct btrfs_path *p; 1153 1154 p = alloc_path_for_send(); 1155 if (!p) 1156 return -ENOMEM; 1157 1158 fs_path_reset(path); 1159 1160 key.objectid = ino; 1161 key.type = BTRFS_INODE_REF_KEY; 1162 key.offset = 0; 1163 1164 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); 1165 if (ret < 0) 1166 goto out; 1167 if (ret) { 1168 ret = 1; 1169 goto out; 1170 } 1171 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); 1172 if (found_key.objectid != ino || 1173 (found_key.type != BTRFS_INODE_REF_KEY && 1174 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 1175 ret = -ENOENT; 1176 goto out; 1177 } 1178 1179 ret = iterate_inode_ref(root, p, &found_key, 1, 1180 __copy_first_ref, path); 1181 if (ret < 0) 1182 goto out; 1183 ret = 0; 1184 1185 out: 1186 btrfs_free_path(p); 1187 return ret; 1188 } 1189 1190 struct backref_ctx { 1191 struct send_ctx *sctx; 1192 1193 /* number of total found references */ 1194 u64 found; 1195 1196 /* 1197 * used for clones found in send_root. clones found behind cur_objectid 1198 * and cur_offset are not considered as allowed clones. 1199 */ 1200 u64 cur_objectid; 1201 u64 cur_offset; 1202 1203 /* may be truncated in case it's the last extent in a file */ 1204 u64 extent_len; 1205 1206 /* Just to check for bugs in backref resolving */ 1207 int found_itself; 1208 }; 1209 1210 static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1211 { 1212 u64 root = (u64)(uintptr_t)key; 1213 const struct clone_root *cr = elt; 1214 1215 if (root < cr->root->root_key.objectid) 1216 return -1; 1217 if (root > cr->root->root_key.objectid) 1218 return 1; 1219 return 0; 1220 } 1221 1222 static int __clone_root_cmp_sort(const void *e1, const void *e2) 1223 { 1224 const struct clone_root *cr1 = e1; 1225 const struct clone_root *cr2 = e2; 1226 1227 if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) 1228 return -1; 1229 if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) 1230 return 1; 1231 return 0; 1232 } 1233 1234 /* 1235 * Called for every backref that is found for the current extent. 1236 * Results are collected in sctx->clone_roots->ino/offset/found_refs 1237 */ 1238 static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1239 { 1240 struct backref_ctx *bctx = ctx_; 1241 struct clone_root *found; 1242 1243 /* First check if the root is in the list of accepted clone sources */ 1244 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, 1245 bctx->sctx->clone_roots_cnt, 1246 sizeof(struct clone_root), 1247 __clone_root_cmp_bsearch); 1248 if (!found) 1249 return 0; 1250 1251 if (found->root == bctx->sctx->send_root && 1252 ino == bctx->cur_objectid && 1253 offset == bctx->cur_offset) { 1254 bctx->found_itself = 1; 1255 } 1256 1257 /* 1258 * Make sure we don't consider clones from send_root that are 1259 * behind the current inode/offset. 1260 */ 1261 if (found->root == bctx->sctx->send_root) { 1262 /* 1263 * If the source inode was not yet processed we can't issue a 1264 * clone operation, as the source extent does not exist yet at 1265 * the destination of the stream. 1266 */ 1267 if (ino > bctx->cur_objectid) 1268 return 0; 1269 /* 1270 * We clone from the inode currently being sent as long as the 1271 * source extent is already processed, otherwise we could try 1272 * to clone from an extent that does not exist yet at the 1273 * destination of the stream. 1274 */ 1275 if (ino == bctx->cur_objectid && 1276 offset + bctx->extent_len > 1277 bctx->sctx->cur_inode_next_write_offset) 1278 return 0; 1279 } 1280 1281 bctx->found++; 1282 found->found_refs++; 1283 if (ino < found->ino) { 1284 found->ino = ino; 1285 found->offset = offset; 1286 } else if (found->ino == ino) { 1287 /* 1288 * same extent found more then once in the same file. 1289 */ 1290 if (found->offset > offset + bctx->extent_len) 1291 found->offset = offset; 1292 } 1293 1294 return 0; 1295 } 1296 1297 /* 1298 * Given an inode, offset and extent item, it finds a good clone for a clone 1299 * instruction. Returns -ENOENT when none could be found. The function makes 1300 * sure that the returned clone is usable at the point where sending is at the 1301 * moment. This means, that no clones are accepted which lie behind the current 1302 * inode+offset. 1303 * 1304 * path must point to the extent item when called. 1305 */ 1306 static int find_extent_clone(struct send_ctx *sctx, 1307 struct btrfs_path *path, 1308 u64 ino, u64 data_offset, 1309 u64 ino_size, 1310 struct clone_root **found) 1311 { 1312 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1313 int ret; 1314 int extent_type; 1315 u64 logical; 1316 u64 disk_byte; 1317 u64 num_bytes; 1318 u64 extent_item_pos; 1319 u64 flags = 0; 1320 struct btrfs_file_extent_item *fi; 1321 struct extent_buffer *eb = path->nodes[0]; 1322 struct backref_ctx backref_ctx = {0}; 1323 struct clone_root *cur_clone_root; 1324 struct btrfs_key found_key; 1325 struct btrfs_path *tmp_path; 1326 struct btrfs_extent_item *ei; 1327 int compressed; 1328 u32 i; 1329 1330 tmp_path = alloc_path_for_send(); 1331 if (!tmp_path) 1332 return -ENOMEM; 1333 1334 /* We only use this path under the commit sem */ 1335 tmp_path->need_commit_sem = 0; 1336 1337 if (data_offset >= ino_size) { 1338 /* 1339 * There may be extents that lie behind the file's size. 1340 * I at least had this in combination with snapshotting while 1341 * writing large files. 1342 */ 1343 ret = 0; 1344 goto out; 1345 } 1346 1347 fi = btrfs_item_ptr(eb, path->slots[0], 1348 struct btrfs_file_extent_item); 1349 extent_type = btrfs_file_extent_type(eb, fi); 1350 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1351 ret = -ENOENT; 1352 goto out; 1353 } 1354 compressed = btrfs_file_extent_compression(eb, fi); 1355 1356 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1357 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1358 if (disk_byte == 0) { 1359 ret = -ENOENT; 1360 goto out; 1361 } 1362 logical = disk_byte + btrfs_file_extent_offset(eb, fi); 1363 1364 down_read(&fs_info->commit_root_sem); 1365 ret = extent_from_logical(fs_info, disk_byte, tmp_path, 1366 &found_key, &flags); 1367 up_read(&fs_info->commit_root_sem); 1368 1369 if (ret < 0) 1370 goto out; 1371 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1372 ret = -EIO; 1373 goto out; 1374 } 1375 1376 ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], 1377 struct btrfs_extent_item); 1378 /* 1379 * Backreference walking (iterate_extent_inodes() below) is currently 1380 * too expensive when an extent has a large number of references, both 1381 * in time spent and used memory. So for now just fallback to write 1382 * operations instead of clone operations when an extent has more than 1383 * a certain amount of references. 1384 */ 1385 if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { 1386 ret = -ENOENT; 1387 goto out; 1388 } 1389 btrfs_release_path(tmp_path); 1390 1391 /* 1392 * Setup the clone roots. 1393 */ 1394 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1395 cur_clone_root = sctx->clone_roots + i; 1396 cur_clone_root->ino = (u64)-1; 1397 cur_clone_root->offset = 0; 1398 cur_clone_root->found_refs = 0; 1399 } 1400 1401 backref_ctx.sctx = sctx; 1402 backref_ctx.found = 0; 1403 backref_ctx.cur_objectid = ino; 1404 backref_ctx.cur_offset = data_offset; 1405 backref_ctx.found_itself = 0; 1406 backref_ctx.extent_len = num_bytes; 1407 1408 /* 1409 * The last extent of a file may be too large due to page alignment. 1410 * We need to adjust extent_len in this case so that the checks in 1411 * __iterate_backrefs work. 1412 */ 1413 if (data_offset + num_bytes >= ino_size) 1414 backref_ctx.extent_len = ino_size - data_offset; 1415 1416 /* 1417 * Now collect all backrefs. 1418 */ 1419 if (compressed == BTRFS_COMPRESS_NONE) 1420 extent_item_pos = logical - found_key.objectid; 1421 else 1422 extent_item_pos = 0; 1423 ret = iterate_extent_inodes(fs_info, found_key.objectid, 1424 extent_item_pos, 1, __iterate_backrefs, 1425 &backref_ctx, false); 1426 1427 if (ret < 0) 1428 goto out; 1429 1430 if (!backref_ctx.found_itself) { 1431 /* found a bug in backref code? */ 1432 ret = -EIO; 1433 btrfs_err(fs_info, 1434 "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", 1435 ino, data_offset, disk_byte, found_key.objectid); 1436 goto out; 1437 } 1438 1439 btrfs_debug(fs_info, 1440 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", 1441 data_offset, ino, num_bytes, logical); 1442 1443 if (!backref_ctx.found) 1444 btrfs_debug(fs_info, "no clones found"); 1445 1446 cur_clone_root = NULL; 1447 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1448 if (sctx->clone_roots[i].found_refs) { 1449 if (!cur_clone_root) 1450 cur_clone_root = sctx->clone_roots + i; 1451 else if (sctx->clone_roots[i].root == sctx->send_root) 1452 /* prefer clones from send_root over others */ 1453 cur_clone_root = sctx->clone_roots + i; 1454 } 1455 1456 } 1457 1458 if (cur_clone_root) { 1459 *found = cur_clone_root; 1460 ret = 0; 1461 } else { 1462 ret = -ENOENT; 1463 } 1464 1465 out: 1466 btrfs_free_path(tmp_path); 1467 return ret; 1468 } 1469 1470 static int read_symlink(struct btrfs_root *root, 1471 u64 ino, 1472 struct fs_path *dest) 1473 { 1474 int ret; 1475 struct btrfs_path *path; 1476 struct btrfs_key key; 1477 struct btrfs_file_extent_item *ei; 1478 u8 type; 1479 u8 compression; 1480 unsigned long off; 1481 int len; 1482 1483 path = alloc_path_for_send(); 1484 if (!path) 1485 return -ENOMEM; 1486 1487 key.objectid = ino; 1488 key.type = BTRFS_EXTENT_DATA_KEY; 1489 key.offset = 0; 1490 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1491 if (ret < 0) 1492 goto out; 1493 if (ret) { 1494 /* 1495 * An empty symlink inode. Can happen in rare error paths when 1496 * creating a symlink (transaction committed before the inode 1497 * eviction handler removed the symlink inode items and a crash 1498 * happened in between or the subvol was snapshoted in between). 1499 * Print an informative message to dmesg/syslog so that the user 1500 * can delete the symlink. 1501 */ 1502 btrfs_err(root->fs_info, 1503 "Found empty symlink inode %llu at root %llu", 1504 ino, root->root_key.objectid); 1505 ret = -EIO; 1506 goto out; 1507 } 1508 1509 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 1510 struct btrfs_file_extent_item); 1511 type = btrfs_file_extent_type(path->nodes[0], ei); 1512 compression = btrfs_file_extent_compression(path->nodes[0], ei); 1513 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); 1514 BUG_ON(compression); 1515 1516 off = btrfs_file_extent_inline_start(ei); 1517 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); 1518 1519 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1520 1521 out: 1522 btrfs_free_path(path); 1523 return ret; 1524 } 1525 1526 /* 1527 * Helper function to generate a file name that is unique in the root of 1528 * send_root and parent_root. This is used to generate names for orphan inodes. 1529 */ 1530 static int gen_unique_name(struct send_ctx *sctx, 1531 u64 ino, u64 gen, 1532 struct fs_path *dest) 1533 { 1534 int ret = 0; 1535 struct btrfs_path *path; 1536 struct btrfs_dir_item *di; 1537 char tmp[64]; 1538 int len; 1539 u64 idx = 0; 1540 1541 path = alloc_path_for_send(); 1542 if (!path) 1543 return -ENOMEM; 1544 1545 while (1) { 1546 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1547 ino, gen, idx); 1548 ASSERT(len < sizeof(tmp)); 1549 1550 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1551 path, BTRFS_FIRST_FREE_OBJECTID, 1552 tmp, strlen(tmp), 0); 1553 btrfs_release_path(path); 1554 if (IS_ERR(di)) { 1555 ret = PTR_ERR(di); 1556 goto out; 1557 } 1558 if (di) { 1559 /* not unique, try again */ 1560 idx++; 1561 continue; 1562 } 1563 1564 if (!sctx->parent_root) { 1565 /* unique */ 1566 ret = 0; 1567 break; 1568 } 1569 1570 di = btrfs_lookup_dir_item(NULL, sctx->parent_root, 1571 path, BTRFS_FIRST_FREE_OBJECTID, 1572 tmp, strlen(tmp), 0); 1573 btrfs_release_path(path); 1574 if (IS_ERR(di)) { 1575 ret = PTR_ERR(di); 1576 goto out; 1577 } 1578 if (di) { 1579 /* not unique, try again */ 1580 idx++; 1581 continue; 1582 } 1583 /* unique */ 1584 break; 1585 } 1586 1587 ret = fs_path_add(dest, tmp, strlen(tmp)); 1588 1589 out: 1590 btrfs_free_path(path); 1591 return ret; 1592 } 1593 1594 enum inode_state { 1595 inode_state_no_change, 1596 inode_state_will_create, 1597 inode_state_did_create, 1598 inode_state_will_delete, 1599 inode_state_did_delete, 1600 }; 1601 1602 static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) 1603 { 1604 int ret; 1605 int left_ret; 1606 int right_ret; 1607 u64 left_gen; 1608 u64 right_gen; 1609 1610 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, 1611 NULL, NULL); 1612 if (ret < 0 && ret != -ENOENT) 1613 goto out; 1614 left_ret = ret; 1615 1616 if (!sctx->parent_root) { 1617 right_ret = -ENOENT; 1618 } else { 1619 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, 1620 NULL, NULL, NULL, NULL); 1621 if (ret < 0 && ret != -ENOENT) 1622 goto out; 1623 right_ret = ret; 1624 } 1625 1626 if (!left_ret && !right_ret) { 1627 if (left_gen == gen && right_gen == gen) { 1628 ret = inode_state_no_change; 1629 } else if (left_gen == gen) { 1630 if (ino < sctx->send_progress) 1631 ret = inode_state_did_create; 1632 else 1633 ret = inode_state_will_create; 1634 } else if (right_gen == gen) { 1635 if (ino < sctx->send_progress) 1636 ret = inode_state_did_delete; 1637 else 1638 ret = inode_state_will_delete; 1639 } else { 1640 ret = -ENOENT; 1641 } 1642 } else if (!left_ret) { 1643 if (left_gen == gen) { 1644 if (ino < sctx->send_progress) 1645 ret = inode_state_did_create; 1646 else 1647 ret = inode_state_will_create; 1648 } else { 1649 ret = -ENOENT; 1650 } 1651 } else if (!right_ret) { 1652 if (right_gen == gen) { 1653 if (ino < sctx->send_progress) 1654 ret = inode_state_did_delete; 1655 else 1656 ret = inode_state_will_delete; 1657 } else { 1658 ret = -ENOENT; 1659 } 1660 } else { 1661 ret = -ENOENT; 1662 } 1663 1664 out: 1665 return ret; 1666 } 1667 1668 static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) 1669 { 1670 int ret; 1671 1672 if (ino == BTRFS_FIRST_FREE_OBJECTID) 1673 return 1; 1674 1675 ret = get_cur_inode_state(sctx, ino, gen); 1676 if (ret < 0) 1677 goto out; 1678 1679 if (ret == inode_state_no_change || 1680 ret == inode_state_did_create || 1681 ret == inode_state_will_delete) 1682 ret = 1; 1683 else 1684 ret = 0; 1685 1686 out: 1687 return ret; 1688 } 1689 1690 /* 1691 * Helper function to lookup a dir item in a dir. 1692 */ 1693 static int lookup_dir_item_inode(struct btrfs_root *root, 1694 u64 dir, const char *name, int name_len, 1695 u64 *found_inode, 1696 u8 *found_type) 1697 { 1698 int ret = 0; 1699 struct btrfs_dir_item *di; 1700 struct btrfs_key key; 1701 struct btrfs_path *path; 1702 1703 path = alloc_path_for_send(); 1704 if (!path) 1705 return -ENOMEM; 1706 1707 di = btrfs_lookup_dir_item(NULL, root, path, 1708 dir, name, name_len, 0); 1709 if (IS_ERR_OR_NULL(di)) { 1710 ret = di ? PTR_ERR(di) : -ENOENT; 1711 goto out; 1712 } 1713 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1714 if (key.type == BTRFS_ROOT_ITEM_KEY) { 1715 ret = -ENOENT; 1716 goto out; 1717 } 1718 *found_inode = key.objectid; 1719 *found_type = btrfs_dir_type(path->nodes[0], di); 1720 1721 out: 1722 btrfs_free_path(path); 1723 return ret; 1724 } 1725 1726 /* 1727 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1728 * generation of the parent dir and the name of the dir entry. 1729 */ 1730 static int get_first_ref(struct btrfs_root *root, u64 ino, 1731 u64 *dir, u64 *dir_gen, struct fs_path *name) 1732 { 1733 int ret; 1734 struct btrfs_key key; 1735 struct btrfs_key found_key; 1736 struct btrfs_path *path; 1737 int len; 1738 u64 parent_dir; 1739 1740 path = alloc_path_for_send(); 1741 if (!path) 1742 return -ENOMEM; 1743 1744 key.objectid = ino; 1745 key.type = BTRFS_INODE_REF_KEY; 1746 key.offset = 0; 1747 1748 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 1749 if (ret < 0) 1750 goto out; 1751 if (!ret) 1752 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1753 path->slots[0]); 1754 if (ret || found_key.objectid != ino || 1755 (found_key.type != BTRFS_INODE_REF_KEY && 1756 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 1757 ret = -ENOENT; 1758 goto out; 1759 } 1760 1761 if (found_key.type == BTRFS_INODE_REF_KEY) { 1762 struct btrfs_inode_ref *iref; 1763 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 1764 struct btrfs_inode_ref); 1765 len = btrfs_inode_ref_name_len(path->nodes[0], iref); 1766 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 1767 (unsigned long)(iref + 1), 1768 len); 1769 parent_dir = found_key.offset; 1770 } else { 1771 struct btrfs_inode_extref *extref; 1772 extref = btrfs_item_ptr(path->nodes[0], path->slots[0], 1773 struct btrfs_inode_extref); 1774 len = btrfs_inode_extref_name_len(path->nodes[0], extref); 1775 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 1776 (unsigned long)&extref->name, len); 1777 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); 1778 } 1779 if (ret < 0) 1780 goto out; 1781 btrfs_release_path(path); 1782 1783 if (dir_gen) { 1784 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, 1785 NULL, NULL, NULL); 1786 if (ret < 0) 1787 goto out; 1788 } 1789 1790 *dir = parent_dir; 1791 1792 out: 1793 btrfs_free_path(path); 1794 return ret; 1795 } 1796 1797 static int is_first_ref(struct btrfs_root *root, 1798 u64 ino, u64 dir, 1799 const char *name, int name_len) 1800 { 1801 int ret; 1802 struct fs_path *tmp_name; 1803 u64 tmp_dir; 1804 1805 tmp_name = fs_path_alloc(); 1806 if (!tmp_name) 1807 return -ENOMEM; 1808 1809 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); 1810 if (ret < 0) 1811 goto out; 1812 1813 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { 1814 ret = 0; 1815 goto out; 1816 } 1817 1818 ret = !memcmp(tmp_name->start, name, name_len); 1819 1820 out: 1821 fs_path_free(tmp_name); 1822 return ret; 1823 } 1824 1825 /* 1826 * Used by process_recorded_refs to determine if a new ref would overwrite an 1827 * already existing ref. In case it detects an overwrite, it returns the 1828 * inode/gen in who_ino/who_gen. 1829 * When an overwrite is detected, process_recorded_refs does proper orphanizing 1830 * to make sure later references to the overwritten inode are possible. 1831 * Orphanizing is however only required for the first ref of an inode. 1832 * process_recorded_refs does an additional is_first_ref check to see if 1833 * orphanizing is really required. 1834 */ 1835 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 1836 const char *name, int name_len, 1837 u64 *who_ino, u64 *who_gen, u64 *who_mode) 1838 { 1839 int ret = 0; 1840 u64 gen; 1841 u64 other_inode = 0; 1842 u8 other_type = 0; 1843 1844 if (!sctx->parent_root) 1845 goto out; 1846 1847 ret = is_inode_existent(sctx, dir, dir_gen); 1848 if (ret <= 0) 1849 goto out; 1850 1851 /* 1852 * If we have a parent root we need to verify that the parent dir was 1853 * not deleted and then re-created, if it was then we have no overwrite 1854 * and we can just unlink this entry. 1855 */ 1856 if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { 1857 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, 1858 NULL, NULL, NULL); 1859 if (ret < 0 && ret != -ENOENT) 1860 goto out; 1861 if (ret) { 1862 ret = 0; 1863 goto out; 1864 } 1865 if (gen != dir_gen) 1866 goto out; 1867 } 1868 1869 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, 1870 &other_inode, &other_type); 1871 if (ret < 0 && ret != -ENOENT) 1872 goto out; 1873 if (ret) { 1874 ret = 0; 1875 goto out; 1876 } 1877 1878 /* 1879 * Check if the overwritten ref was already processed. If yes, the ref 1880 * was already unlinked/moved, so we can safely assume that we will not 1881 * overwrite anything at this point in time. 1882 */ 1883 if (other_inode > sctx->send_progress || 1884 is_waiting_for_move(sctx, other_inode)) { 1885 ret = get_inode_info(sctx->parent_root, other_inode, NULL, 1886 who_gen, who_mode, NULL, NULL, NULL); 1887 if (ret < 0) 1888 goto out; 1889 1890 ret = 1; 1891 *who_ino = other_inode; 1892 } else { 1893 ret = 0; 1894 } 1895 1896 out: 1897 return ret; 1898 } 1899 1900 /* 1901 * Checks if the ref was overwritten by an already processed inode. This is 1902 * used by __get_cur_name_and_parent to find out if the ref was orphanized and 1903 * thus the orphan name needs be used. 1904 * process_recorded_refs also uses it to avoid unlinking of refs that were 1905 * overwritten. 1906 */ 1907 static int did_overwrite_ref(struct send_ctx *sctx, 1908 u64 dir, u64 dir_gen, 1909 u64 ino, u64 ino_gen, 1910 const char *name, int name_len) 1911 { 1912 int ret = 0; 1913 u64 gen; 1914 u64 ow_inode; 1915 u8 other_type; 1916 1917 if (!sctx->parent_root) 1918 goto out; 1919 1920 ret = is_inode_existent(sctx, dir, dir_gen); 1921 if (ret <= 0) 1922 goto out; 1923 1924 if (dir != BTRFS_FIRST_FREE_OBJECTID) { 1925 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, 1926 NULL, NULL, NULL); 1927 if (ret < 0 && ret != -ENOENT) 1928 goto out; 1929 if (ret) { 1930 ret = 0; 1931 goto out; 1932 } 1933 if (gen != dir_gen) 1934 goto out; 1935 } 1936 1937 /* check if the ref was overwritten by another ref */ 1938 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, 1939 &ow_inode, &other_type); 1940 if (ret < 0 && ret != -ENOENT) 1941 goto out; 1942 if (ret) { 1943 /* was never and will never be overwritten */ 1944 ret = 0; 1945 goto out; 1946 } 1947 1948 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, 1949 NULL, NULL); 1950 if (ret < 0) 1951 goto out; 1952 1953 if (ow_inode == ino && gen == ino_gen) { 1954 ret = 0; 1955 goto out; 1956 } 1957 1958 /* 1959 * We know that it is or will be overwritten. Check this now. 1960 * The current inode being processed might have been the one that caused 1961 * inode 'ino' to be orphanized, therefore check if ow_inode matches 1962 * the current inode being processed. 1963 */ 1964 if ((ow_inode < sctx->send_progress) || 1965 (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && 1966 gen == sctx->cur_inode_gen)) 1967 ret = 1; 1968 else 1969 ret = 0; 1970 1971 out: 1972 return ret; 1973 } 1974 1975 /* 1976 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode 1977 * that got overwritten. This is used by process_recorded_refs to determine 1978 * if it has to use the path as returned by get_cur_path or the orphan name. 1979 */ 1980 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 1981 { 1982 int ret = 0; 1983 struct fs_path *name = NULL; 1984 u64 dir; 1985 u64 dir_gen; 1986 1987 if (!sctx->parent_root) 1988 goto out; 1989 1990 name = fs_path_alloc(); 1991 if (!name) 1992 return -ENOMEM; 1993 1994 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); 1995 if (ret < 0) 1996 goto out; 1997 1998 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 1999 name->start, fs_path_len(name)); 2000 2001 out: 2002 fs_path_free(name); 2003 return ret; 2004 } 2005 2006 /* 2007 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, 2008 * so we need to do some special handling in case we have clashes. This function 2009 * takes care of this with the help of name_cache_entry::radix_list. 2010 * In case of error, nce is kfreed. 2011 */ 2012 static int name_cache_insert(struct send_ctx *sctx, 2013 struct name_cache_entry *nce) 2014 { 2015 int ret = 0; 2016 struct list_head *nce_head; 2017 2018 nce_head = radix_tree_lookup(&sctx->name_cache, 2019 (unsigned long)nce->ino); 2020 if (!nce_head) { 2021 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); 2022 if (!nce_head) { 2023 kfree(nce); 2024 return -ENOMEM; 2025 } 2026 INIT_LIST_HEAD(nce_head); 2027 2028 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 2029 if (ret < 0) { 2030 kfree(nce_head); 2031 kfree(nce); 2032 return ret; 2033 } 2034 } 2035 list_add_tail(&nce->radix_list, nce_head); 2036 list_add_tail(&nce->list, &sctx->name_cache_list); 2037 sctx->name_cache_size++; 2038 2039 return ret; 2040 } 2041 2042 static void name_cache_delete(struct send_ctx *sctx, 2043 struct name_cache_entry *nce) 2044 { 2045 struct list_head *nce_head; 2046 2047 nce_head = radix_tree_lookup(&sctx->name_cache, 2048 (unsigned long)nce->ino); 2049 if (!nce_head) { 2050 btrfs_err(sctx->send_root->fs_info, 2051 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", 2052 nce->ino, sctx->name_cache_size); 2053 } 2054 2055 list_del(&nce->radix_list); 2056 list_del(&nce->list); 2057 sctx->name_cache_size--; 2058 2059 /* 2060 * We may not get to the final release of nce_head if the lookup fails 2061 */ 2062 if (nce_head && list_empty(nce_head)) { 2063 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 2064 kfree(nce_head); 2065 } 2066 } 2067 2068 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 2069 u64 ino, u64 gen) 2070 { 2071 struct list_head *nce_head; 2072 struct name_cache_entry *cur; 2073 2074 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); 2075 if (!nce_head) 2076 return NULL; 2077 2078 list_for_each_entry(cur, nce_head, radix_list) { 2079 if (cur->ino == ino && cur->gen == gen) 2080 return cur; 2081 } 2082 return NULL; 2083 } 2084 2085 /* 2086 * Remove some entries from the beginning of name_cache_list. 2087 */ 2088 static void name_cache_clean_unused(struct send_ctx *sctx) 2089 { 2090 struct name_cache_entry *nce; 2091 2092 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) 2093 return; 2094 2095 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { 2096 nce = list_entry(sctx->name_cache_list.next, 2097 struct name_cache_entry, list); 2098 name_cache_delete(sctx, nce); 2099 kfree(nce); 2100 } 2101 } 2102 2103 static void name_cache_free(struct send_ctx *sctx) 2104 { 2105 struct name_cache_entry *nce; 2106 2107 while (!list_empty(&sctx->name_cache_list)) { 2108 nce = list_entry(sctx->name_cache_list.next, 2109 struct name_cache_entry, list); 2110 name_cache_delete(sctx, nce); 2111 kfree(nce); 2112 } 2113 } 2114 2115 /* 2116 * Used by get_cur_path for each ref up to the root. 2117 * Returns 0 if it succeeded. 2118 * Returns 1 if the inode is not existent or got overwritten. In that case, the 2119 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 2120 * is returned, parent_ino/parent_gen are not guaranteed to be valid. 2121 * Returns <0 in case of error. 2122 */ 2123 static int __get_cur_name_and_parent(struct send_ctx *sctx, 2124 u64 ino, u64 gen, 2125 u64 *parent_ino, 2126 u64 *parent_gen, 2127 struct fs_path *dest) 2128 { 2129 int ret; 2130 int nce_ret; 2131 struct name_cache_entry *nce = NULL; 2132 2133 /* 2134 * First check if we already did a call to this function with the same 2135 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2136 * return the cached result. 2137 */ 2138 nce = name_cache_search(sctx, ino, gen); 2139 if (nce) { 2140 if (ino < sctx->send_progress && nce->need_later_update) { 2141 name_cache_delete(sctx, nce); 2142 kfree(nce); 2143 nce = NULL; 2144 } else { 2145 /* 2146 * Removes the entry from the list and adds it back to 2147 * the end. This marks the entry as recently used so 2148 * that name_cache_clean_unused does not remove it. 2149 */ 2150 list_move_tail(&nce->list, &sctx->name_cache_list); 2151 2152 *parent_ino = nce->parent_ino; 2153 *parent_gen = nce->parent_gen; 2154 ret = fs_path_add(dest, nce->name, nce->name_len); 2155 if (ret < 0) 2156 goto out; 2157 ret = nce->ret; 2158 goto out; 2159 } 2160 } 2161 2162 /* 2163 * If the inode is not existent yet, add the orphan name and return 1. 2164 * This should only happen for the parent dir that we determine in 2165 * __record_new_ref 2166 */ 2167 ret = is_inode_existent(sctx, ino, gen); 2168 if (ret < 0) 2169 goto out; 2170 2171 if (!ret) { 2172 ret = gen_unique_name(sctx, ino, gen, dest); 2173 if (ret < 0) 2174 goto out; 2175 ret = 1; 2176 goto out_cache; 2177 } 2178 2179 /* 2180 * Depending on whether the inode was already processed or not, use 2181 * send_root or parent_root for ref lookup. 2182 */ 2183 if (ino < sctx->send_progress) 2184 ret = get_first_ref(sctx->send_root, ino, 2185 parent_ino, parent_gen, dest); 2186 else 2187 ret = get_first_ref(sctx->parent_root, ino, 2188 parent_ino, parent_gen, dest); 2189 if (ret < 0) 2190 goto out; 2191 2192 /* 2193 * Check if the ref was overwritten by an inode's ref that was processed 2194 * earlier. If yes, treat as orphan and return 1. 2195 */ 2196 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, 2197 dest->start, dest->end - dest->start); 2198 if (ret < 0) 2199 goto out; 2200 if (ret) { 2201 fs_path_reset(dest); 2202 ret = gen_unique_name(sctx, ino, gen, dest); 2203 if (ret < 0) 2204 goto out; 2205 ret = 1; 2206 } 2207 2208 out_cache: 2209 /* 2210 * Store the result of the lookup in the name cache. 2211 */ 2212 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); 2213 if (!nce) { 2214 ret = -ENOMEM; 2215 goto out; 2216 } 2217 2218 nce->ino = ino; 2219 nce->gen = gen; 2220 nce->parent_ino = *parent_ino; 2221 nce->parent_gen = *parent_gen; 2222 nce->name_len = fs_path_len(dest); 2223 nce->ret = ret; 2224 strcpy(nce->name, dest->start); 2225 2226 if (ino < sctx->send_progress) 2227 nce->need_later_update = 0; 2228 else 2229 nce->need_later_update = 1; 2230 2231 nce_ret = name_cache_insert(sctx, nce); 2232 if (nce_ret < 0) 2233 ret = nce_ret; 2234 name_cache_clean_unused(sctx); 2235 2236 out: 2237 return ret; 2238 } 2239 2240 /* 2241 * Magic happens here. This function returns the first ref to an inode as it 2242 * would look like while receiving the stream at this point in time. 2243 * We walk the path up to the root. For every inode in between, we check if it 2244 * was already processed/sent. If yes, we continue with the parent as found 2245 * in send_root. If not, we continue with the parent as found in parent_root. 2246 * If we encounter an inode that was deleted at this point in time, we use the 2247 * inodes "orphan" name instead of the real name and stop. Same with new inodes 2248 * that were not created yet and overwritten inodes/refs. 2249 * 2250 * When do we have orphan inodes: 2251 * 1. When an inode is freshly created and thus no valid refs are available yet 2252 * 2. When a directory lost all it's refs (deleted) but still has dir items 2253 * inside which were not processed yet (pending for move/delete). If anyone 2254 * tried to get the path to the dir items, it would get a path inside that 2255 * orphan directory. 2256 * 3. When an inode is moved around or gets new links, it may overwrite the ref 2257 * of an unprocessed inode. If in that case the first ref would be 2258 * overwritten, the overwritten inode gets "orphanized". Later when we 2259 * process this overwritten inode, it is restored at a new place by moving 2260 * the orphan inode. 2261 * 2262 * sctx->send_progress tells this function at which point in time receiving 2263 * would be. 2264 */ 2265 static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, 2266 struct fs_path *dest) 2267 { 2268 int ret = 0; 2269 struct fs_path *name = NULL; 2270 u64 parent_inode = 0; 2271 u64 parent_gen = 0; 2272 int stop = 0; 2273 2274 name = fs_path_alloc(); 2275 if (!name) { 2276 ret = -ENOMEM; 2277 goto out; 2278 } 2279 2280 dest->reversed = 1; 2281 fs_path_reset(dest); 2282 2283 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2284 struct waiting_dir_move *wdm; 2285 2286 fs_path_reset(name); 2287 2288 if (is_waiting_for_rm(sctx, ino, gen)) { 2289 ret = gen_unique_name(sctx, ino, gen, name); 2290 if (ret < 0) 2291 goto out; 2292 ret = fs_path_add_path(dest, name); 2293 break; 2294 } 2295 2296 wdm = get_waiting_dir_move(sctx, ino); 2297 if (wdm && wdm->orphanized) { 2298 ret = gen_unique_name(sctx, ino, gen, name); 2299 stop = 1; 2300 } else if (wdm) { 2301 ret = get_first_ref(sctx->parent_root, ino, 2302 &parent_inode, &parent_gen, name); 2303 } else { 2304 ret = __get_cur_name_and_parent(sctx, ino, gen, 2305 &parent_inode, 2306 &parent_gen, name); 2307 if (ret) 2308 stop = 1; 2309 } 2310 2311 if (ret < 0) 2312 goto out; 2313 2314 ret = fs_path_add_path(dest, name); 2315 if (ret < 0) 2316 goto out; 2317 2318 ino = parent_inode; 2319 gen = parent_gen; 2320 } 2321 2322 out: 2323 fs_path_free(name); 2324 if (!ret) 2325 fs_path_unreverse(dest); 2326 return ret; 2327 } 2328 2329 /* 2330 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace 2331 */ 2332 static int send_subvol_begin(struct send_ctx *sctx) 2333 { 2334 int ret; 2335 struct btrfs_root *send_root = sctx->send_root; 2336 struct btrfs_root *parent_root = sctx->parent_root; 2337 struct btrfs_path *path; 2338 struct btrfs_key key; 2339 struct btrfs_root_ref *ref; 2340 struct extent_buffer *leaf; 2341 char *name = NULL; 2342 int namelen; 2343 2344 path = btrfs_alloc_path(); 2345 if (!path) 2346 return -ENOMEM; 2347 2348 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); 2349 if (!name) { 2350 btrfs_free_path(path); 2351 return -ENOMEM; 2352 } 2353 2354 key.objectid = send_root->root_key.objectid; 2355 key.type = BTRFS_ROOT_BACKREF_KEY; 2356 key.offset = 0; 2357 2358 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, 2359 &key, path, 1, 0); 2360 if (ret < 0) 2361 goto out; 2362 if (ret) { 2363 ret = -ENOENT; 2364 goto out; 2365 } 2366 2367 leaf = path->nodes[0]; 2368 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2369 if (key.type != BTRFS_ROOT_BACKREF_KEY || 2370 key.objectid != send_root->root_key.objectid) { 2371 ret = -ENOENT; 2372 goto out; 2373 } 2374 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 2375 namelen = btrfs_root_ref_name_len(leaf, ref); 2376 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2377 btrfs_release_path(path); 2378 2379 if (parent_root) { 2380 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2381 if (ret < 0) 2382 goto out; 2383 } else { 2384 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); 2385 if (ret < 0) 2386 goto out; 2387 } 2388 2389 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); 2390 2391 if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) 2392 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2393 sctx->send_root->root_item.received_uuid); 2394 else 2395 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2396 sctx->send_root->root_item.uuid); 2397 2398 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2399 btrfs_root_ctransid(&sctx->send_root->root_item)); 2400 if (parent_root) { 2401 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) 2402 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2403 parent_root->root_item.received_uuid); 2404 else 2405 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2406 parent_root->root_item.uuid); 2407 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2408 btrfs_root_ctransid(&sctx->parent_root->root_item)); 2409 } 2410 2411 ret = send_cmd(sctx); 2412 2413 tlv_put_failure: 2414 out: 2415 btrfs_free_path(path); 2416 kfree(name); 2417 return ret; 2418 } 2419 2420 static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) 2421 { 2422 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2423 int ret = 0; 2424 struct fs_path *p; 2425 2426 btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); 2427 2428 p = fs_path_alloc(); 2429 if (!p) 2430 return -ENOMEM; 2431 2432 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); 2433 if (ret < 0) 2434 goto out; 2435 2436 ret = get_cur_path(sctx, ino, gen, p); 2437 if (ret < 0) 2438 goto out; 2439 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2440 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); 2441 2442 ret = send_cmd(sctx); 2443 2444 tlv_put_failure: 2445 out: 2446 fs_path_free(p); 2447 return ret; 2448 } 2449 2450 static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) 2451 { 2452 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2453 int ret = 0; 2454 struct fs_path *p; 2455 2456 btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); 2457 2458 p = fs_path_alloc(); 2459 if (!p) 2460 return -ENOMEM; 2461 2462 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); 2463 if (ret < 0) 2464 goto out; 2465 2466 ret = get_cur_path(sctx, ino, gen, p); 2467 if (ret < 0) 2468 goto out; 2469 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2470 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); 2471 2472 ret = send_cmd(sctx); 2473 2474 tlv_put_failure: 2475 out: 2476 fs_path_free(p); 2477 return ret; 2478 } 2479 2480 static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) 2481 { 2482 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2483 int ret = 0; 2484 struct fs_path *p; 2485 2486 btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", 2487 ino, uid, gid); 2488 2489 p = fs_path_alloc(); 2490 if (!p) 2491 return -ENOMEM; 2492 2493 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); 2494 if (ret < 0) 2495 goto out; 2496 2497 ret = get_cur_path(sctx, ino, gen, p); 2498 if (ret < 0) 2499 goto out; 2500 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2501 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); 2502 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); 2503 2504 ret = send_cmd(sctx); 2505 2506 tlv_put_failure: 2507 out: 2508 fs_path_free(p); 2509 return ret; 2510 } 2511 2512 static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) 2513 { 2514 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2515 int ret = 0; 2516 struct fs_path *p = NULL; 2517 struct btrfs_inode_item *ii; 2518 struct btrfs_path *path = NULL; 2519 struct extent_buffer *eb; 2520 struct btrfs_key key; 2521 int slot; 2522 2523 btrfs_debug(fs_info, "send_utimes %llu", ino); 2524 2525 p = fs_path_alloc(); 2526 if (!p) 2527 return -ENOMEM; 2528 2529 path = alloc_path_for_send(); 2530 if (!path) { 2531 ret = -ENOMEM; 2532 goto out; 2533 } 2534 2535 key.objectid = ino; 2536 key.type = BTRFS_INODE_ITEM_KEY; 2537 key.offset = 0; 2538 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2539 if (ret > 0) 2540 ret = -ENOENT; 2541 if (ret < 0) 2542 goto out; 2543 2544 eb = path->nodes[0]; 2545 slot = path->slots[0]; 2546 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2547 2548 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); 2549 if (ret < 0) 2550 goto out; 2551 2552 ret = get_cur_path(sctx, ino, gen, p); 2553 if (ret < 0) 2554 goto out; 2555 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2556 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); 2557 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); 2558 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); 2559 /* TODO Add otime support when the otime patches get into upstream */ 2560 2561 ret = send_cmd(sctx); 2562 2563 tlv_put_failure: 2564 out: 2565 fs_path_free(p); 2566 btrfs_free_path(path); 2567 return ret; 2568 } 2569 2570 /* 2571 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have 2572 * a valid path yet because we did not process the refs yet. So, the inode 2573 * is created as orphan. 2574 */ 2575 static int send_create_inode(struct send_ctx *sctx, u64 ino) 2576 { 2577 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 2578 int ret = 0; 2579 struct fs_path *p; 2580 int cmd; 2581 u64 gen; 2582 u64 mode; 2583 u64 rdev; 2584 2585 btrfs_debug(fs_info, "send_create_inode %llu", ino); 2586 2587 p = fs_path_alloc(); 2588 if (!p) 2589 return -ENOMEM; 2590 2591 if (ino != sctx->cur_ino) { 2592 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, 2593 NULL, NULL, &rdev); 2594 if (ret < 0) 2595 goto out; 2596 } else { 2597 gen = sctx->cur_inode_gen; 2598 mode = sctx->cur_inode_mode; 2599 rdev = sctx->cur_inode_rdev; 2600 } 2601 2602 if (S_ISREG(mode)) { 2603 cmd = BTRFS_SEND_C_MKFILE; 2604 } else if (S_ISDIR(mode)) { 2605 cmd = BTRFS_SEND_C_MKDIR; 2606 } else if (S_ISLNK(mode)) { 2607 cmd = BTRFS_SEND_C_SYMLINK; 2608 } else if (S_ISCHR(mode) || S_ISBLK(mode)) { 2609 cmd = BTRFS_SEND_C_MKNOD; 2610 } else if (S_ISFIFO(mode)) { 2611 cmd = BTRFS_SEND_C_MKFIFO; 2612 } else if (S_ISSOCK(mode)) { 2613 cmd = BTRFS_SEND_C_MKSOCK; 2614 } else { 2615 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", 2616 (int)(mode & S_IFMT)); 2617 ret = -EOPNOTSUPP; 2618 goto out; 2619 } 2620 2621 ret = begin_cmd(sctx, cmd); 2622 if (ret < 0) 2623 goto out; 2624 2625 ret = gen_unique_name(sctx, ino, gen, p); 2626 if (ret < 0) 2627 goto out; 2628 2629 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2630 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); 2631 2632 if (S_ISLNK(mode)) { 2633 fs_path_reset(p); 2634 ret = read_symlink(sctx->send_root, ino, p); 2635 if (ret < 0) 2636 goto out; 2637 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2638 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2639 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2640 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); 2641 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); 2642 } 2643 2644 ret = send_cmd(sctx); 2645 if (ret < 0) 2646 goto out; 2647 2648 2649 tlv_put_failure: 2650 out: 2651 fs_path_free(p); 2652 return ret; 2653 } 2654 2655 /* 2656 * We need some special handling for inodes that get processed before the parent 2657 * directory got created. See process_recorded_refs for details. 2658 * This function does the check if we already created the dir out of order. 2659 */ 2660 static int did_create_dir(struct send_ctx *sctx, u64 dir) 2661 { 2662 int ret = 0; 2663 struct btrfs_path *path = NULL; 2664 struct btrfs_key key; 2665 struct btrfs_key found_key; 2666 struct btrfs_key di_key; 2667 struct extent_buffer *eb; 2668 struct btrfs_dir_item *di; 2669 int slot; 2670 2671 path = alloc_path_for_send(); 2672 if (!path) { 2673 ret = -ENOMEM; 2674 goto out; 2675 } 2676 2677 key.objectid = dir; 2678 key.type = BTRFS_DIR_INDEX_KEY; 2679 key.offset = 0; 2680 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2681 if (ret < 0) 2682 goto out; 2683 2684 while (1) { 2685 eb = path->nodes[0]; 2686 slot = path->slots[0]; 2687 if (slot >= btrfs_header_nritems(eb)) { 2688 ret = btrfs_next_leaf(sctx->send_root, path); 2689 if (ret < 0) { 2690 goto out; 2691 } else if (ret > 0) { 2692 ret = 0; 2693 break; 2694 } 2695 continue; 2696 } 2697 2698 btrfs_item_key_to_cpu(eb, &found_key, slot); 2699 if (found_key.objectid != key.objectid || 2700 found_key.type != key.type) { 2701 ret = 0; 2702 goto out; 2703 } 2704 2705 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2706 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2707 2708 if (di_key.type != BTRFS_ROOT_ITEM_KEY && 2709 di_key.objectid < sctx->send_progress) { 2710 ret = 1; 2711 goto out; 2712 } 2713 2714 path->slots[0]++; 2715 } 2716 2717 out: 2718 btrfs_free_path(path); 2719 return ret; 2720 } 2721 2722 /* 2723 * Only creates the inode if it is: 2724 * 1. Not a directory 2725 * 2. Or a directory which was not created already due to out of order 2726 * directories. See did_create_dir and process_recorded_refs for details. 2727 */ 2728 static int send_create_inode_if_needed(struct send_ctx *sctx) 2729 { 2730 int ret; 2731 2732 if (S_ISDIR(sctx->cur_inode_mode)) { 2733 ret = did_create_dir(sctx, sctx->cur_ino); 2734 if (ret < 0) 2735 return ret; 2736 else if (ret > 0) 2737 return 0; 2738 } 2739 2740 return send_create_inode(sctx, sctx->cur_ino); 2741 } 2742 2743 struct recorded_ref { 2744 struct list_head list; 2745 char *name; 2746 struct fs_path *full_path; 2747 u64 dir; 2748 u64 dir_gen; 2749 int name_len; 2750 }; 2751 2752 static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) 2753 { 2754 ref->full_path = path; 2755 ref->name = (char *)kbasename(ref->full_path->start); 2756 ref->name_len = ref->full_path->end - ref->name; 2757 } 2758 2759 /* 2760 * We need to process new refs before deleted refs, but compare_tree gives us 2761 * everything mixed. So we first record all refs and later process them. 2762 * This function is a helper to record one ref. 2763 */ 2764 static int __record_ref(struct list_head *head, u64 dir, 2765 u64 dir_gen, struct fs_path *path) 2766 { 2767 struct recorded_ref *ref; 2768 2769 ref = kmalloc(sizeof(*ref), GFP_KERNEL); 2770 if (!ref) 2771 return -ENOMEM; 2772 2773 ref->dir = dir; 2774 ref->dir_gen = dir_gen; 2775 set_ref_path(ref, path); 2776 list_add_tail(&ref->list, head); 2777 return 0; 2778 } 2779 2780 static int dup_ref(struct recorded_ref *ref, struct list_head *list) 2781 { 2782 struct recorded_ref *new; 2783 2784 new = kmalloc(sizeof(*ref), GFP_KERNEL); 2785 if (!new) 2786 return -ENOMEM; 2787 2788 new->dir = ref->dir; 2789 new->dir_gen = ref->dir_gen; 2790 new->full_path = NULL; 2791 INIT_LIST_HEAD(&new->list); 2792 list_add_tail(&new->list, list); 2793 return 0; 2794 } 2795 2796 static void __free_recorded_refs(struct list_head *head) 2797 { 2798 struct recorded_ref *cur; 2799 2800 while (!list_empty(head)) { 2801 cur = list_entry(head->next, struct recorded_ref, list); 2802 fs_path_free(cur->full_path); 2803 list_del(&cur->list); 2804 kfree(cur); 2805 } 2806 } 2807 2808 static void free_recorded_refs(struct send_ctx *sctx) 2809 { 2810 __free_recorded_refs(&sctx->new_refs); 2811 __free_recorded_refs(&sctx->deleted_refs); 2812 } 2813 2814 /* 2815 * Renames/moves a file/dir to its orphan name. Used when the first 2816 * ref of an unprocessed inode gets overwritten and for all non empty 2817 * directories. 2818 */ 2819 static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, 2820 struct fs_path *path) 2821 { 2822 int ret; 2823 struct fs_path *orphan; 2824 2825 orphan = fs_path_alloc(); 2826 if (!orphan) 2827 return -ENOMEM; 2828 2829 ret = gen_unique_name(sctx, ino, gen, orphan); 2830 if (ret < 0) 2831 goto out; 2832 2833 ret = send_rename(sctx, path, orphan); 2834 2835 out: 2836 fs_path_free(orphan); 2837 return ret; 2838 } 2839 2840 static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, 2841 u64 dir_ino, u64 dir_gen) 2842 { 2843 struct rb_node **p = &sctx->orphan_dirs.rb_node; 2844 struct rb_node *parent = NULL; 2845 struct orphan_dir_info *entry, *odi; 2846 2847 while (*p) { 2848 parent = *p; 2849 entry = rb_entry(parent, struct orphan_dir_info, node); 2850 if (dir_ino < entry->ino) 2851 p = &(*p)->rb_left; 2852 else if (dir_ino > entry->ino) 2853 p = &(*p)->rb_right; 2854 else if (dir_gen < entry->gen) 2855 p = &(*p)->rb_left; 2856 else if (dir_gen > entry->gen) 2857 p = &(*p)->rb_right; 2858 else 2859 return entry; 2860 } 2861 2862 odi = kmalloc(sizeof(*odi), GFP_KERNEL); 2863 if (!odi) 2864 return ERR_PTR(-ENOMEM); 2865 odi->ino = dir_ino; 2866 odi->gen = dir_gen; 2867 odi->last_dir_index_offset = 0; 2868 2869 rb_link_node(&odi->node, parent, p); 2870 rb_insert_color(&odi->node, &sctx->orphan_dirs); 2871 return odi; 2872 } 2873 2874 static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx, 2875 u64 dir_ino, u64 gen) 2876 { 2877 struct rb_node *n = sctx->orphan_dirs.rb_node; 2878 struct orphan_dir_info *entry; 2879 2880 while (n) { 2881 entry = rb_entry(n, struct orphan_dir_info, node); 2882 if (dir_ino < entry->ino) 2883 n = n->rb_left; 2884 else if (dir_ino > entry->ino) 2885 n = n->rb_right; 2886 else if (gen < entry->gen) 2887 n = n->rb_left; 2888 else if (gen > entry->gen) 2889 n = n->rb_right; 2890 else 2891 return entry; 2892 } 2893 return NULL; 2894 } 2895 2896 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen) 2897 { 2898 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen); 2899 2900 return odi != NULL; 2901 } 2902 2903 static void free_orphan_dir_info(struct send_ctx *sctx, 2904 struct orphan_dir_info *odi) 2905 { 2906 if (!odi) 2907 return; 2908 rb_erase(&odi->node, &sctx->orphan_dirs); 2909 kfree(odi); 2910 } 2911 2912 /* 2913 * Returns 1 if a directory can be removed at this point in time. 2914 * We check this by iterating all dir items and checking if the inode behind 2915 * the dir item was already processed. 2916 */ 2917 static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, 2918 u64 send_progress) 2919 { 2920 int ret = 0; 2921 struct btrfs_root *root = sctx->parent_root; 2922 struct btrfs_path *path; 2923 struct btrfs_key key; 2924 struct btrfs_key found_key; 2925 struct btrfs_key loc; 2926 struct btrfs_dir_item *di; 2927 struct orphan_dir_info *odi = NULL; 2928 2929 /* 2930 * Don't try to rmdir the top/root subvolume dir. 2931 */ 2932 if (dir == BTRFS_FIRST_FREE_OBJECTID) 2933 return 0; 2934 2935 path = alloc_path_for_send(); 2936 if (!path) 2937 return -ENOMEM; 2938 2939 key.objectid = dir; 2940 key.type = BTRFS_DIR_INDEX_KEY; 2941 key.offset = 0; 2942 2943 odi = get_orphan_dir_info(sctx, dir, dir_gen); 2944 if (odi) 2945 key.offset = odi->last_dir_index_offset; 2946 2947 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2948 if (ret < 0) 2949 goto out; 2950 2951 while (1) { 2952 struct waiting_dir_move *dm; 2953 2954 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2955 ret = btrfs_next_leaf(root, path); 2956 if (ret < 0) 2957 goto out; 2958 else if (ret > 0) 2959 break; 2960 continue; 2961 } 2962 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2963 path->slots[0]); 2964 if (found_key.objectid != key.objectid || 2965 found_key.type != key.type) 2966 break; 2967 2968 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2969 struct btrfs_dir_item); 2970 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2971 2972 dm = get_waiting_dir_move(sctx, loc.objectid); 2973 if (dm) { 2974 odi = add_orphan_dir_info(sctx, dir, dir_gen); 2975 if (IS_ERR(odi)) { 2976 ret = PTR_ERR(odi); 2977 goto out; 2978 } 2979 odi->gen = dir_gen; 2980 odi->last_dir_index_offset = found_key.offset; 2981 dm->rmdir_ino = dir; 2982 dm->rmdir_gen = dir_gen; 2983 ret = 0; 2984 goto out; 2985 } 2986 2987 if (loc.objectid > send_progress) { 2988 odi = add_orphan_dir_info(sctx, dir, dir_gen); 2989 if (IS_ERR(odi)) { 2990 ret = PTR_ERR(odi); 2991 goto out; 2992 } 2993 odi->gen = dir_gen; 2994 odi->last_dir_index_offset = found_key.offset; 2995 ret = 0; 2996 goto out; 2997 } 2998 2999 path->slots[0]++; 3000 } 3001 free_orphan_dir_info(sctx, odi); 3002 3003 ret = 1; 3004 3005 out: 3006 btrfs_free_path(path); 3007 return ret; 3008 } 3009 3010 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 3011 { 3012 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); 3013 3014 return entry != NULL; 3015 } 3016 3017 static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) 3018 { 3019 struct rb_node **p = &sctx->waiting_dir_moves.rb_node; 3020 struct rb_node *parent = NULL; 3021 struct waiting_dir_move *entry, *dm; 3022 3023 dm = kmalloc(sizeof(*dm), GFP_KERNEL); 3024 if (!dm) 3025 return -ENOMEM; 3026 dm->ino = ino; 3027 dm->rmdir_ino = 0; 3028 dm->rmdir_gen = 0; 3029 dm->orphanized = orphanized; 3030 3031 while (*p) { 3032 parent = *p; 3033 entry = rb_entry(parent, struct waiting_dir_move, node); 3034 if (ino < entry->ino) { 3035 p = &(*p)->rb_left; 3036 } else if (ino > entry->ino) { 3037 p = &(*p)->rb_right; 3038 } else { 3039 kfree(dm); 3040 return -EEXIST; 3041 } 3042 } 3043 3044 rb_link_node(&dm->node, parent, p); 3045 rb_insert_color(&dm->node, &sctx->waiting_dir_moves); 3046 return 0; 3047 } 3048 3049 static struct waiting_dir_move * 3050 get_waiting_dir_move(struct send_ctx *sctx, u64 ino) 3051 { 3052 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 3053 struct waiting_dir_move *entry; 3054 3055 while (n) { 3056 entry = rb_entry(n, struct waiting_dir_move, node); 3057 if (ino < entry->ino) 3058 n = n->rb_left; 3059 else if (ino > entry->ino) 3060 n = n->rb_right; 3061 else 3062 return entry; 3063 } 3064 return NULL; 3065 } 3066 3067 static void free_waiting_dir_move(struct send_ctx *sctx, 3068 struct waiting_dir_move *dm) 3069 { 3070 if (!dm) 3071 return; 3072 rb_erase(&dm->node, &sctx->waiting_dir_moves); 3073 kfree(dm); 3074 } 3075 3076 static int add_pending_dir_move(struct send_ctx *sctx, 3077 u64 ino, 3078 u64 ino_gen, 3079 u64 parent_ino, 3080 struct list_head *new_refs, 3081 struct list_head *deleted_refs, 3082 const bool is_orphan) 3083 { 3084 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 3085 struct rb_node *parent = NULL; 3086 struct pending_dir_move *entry = NULL, *pm; 3087 struct recorded_ref *cur; 3088 int exists = 0; 3089 int ret; 3090 3091 pm = kmalloc(sizeof(*pm), GFP_KERNEL); 3092 if (!pm) 3093 return -ENOMEM; 3094 pm->parent_ino = parent_ino; 3095 pm->ino = ino; 3096 pm->gen = ino_gen; 3097 INIT_LIST_HEAD(&pm->list); 3098 INIT_LIST_HEAD(&pm->update_refs); 3099 RB_CLEAR_NODE(&pm->node); 3100 3101 while (*p) { 3102 parent = *p; 3103 entry = rb_entry(parent, struct pending_dir_move, node); 3104 if (parent_ino < entry->parent_ino) { 3105 p = &(*p)->rb_left; 3106 } else if (parent_ino > entry->parent_ino) { 3107 p = &(*p)->rb_right; 3108 } else { 3109 exists = 1; 3110 break; 3111 } 3112 } 3113 3114 list_for_each_entry(cur, deleted_refs, list) { 3115 ret = dup_ref(cur, &pm->update_refs); 3116 if (ret < 0) 3117 goto out; 3118 } 3119 list_for_each_entry(cur, new_refs, list) { 3120 ret = dup_ref(cur, &pm->update_refs); 3121 if (ret < 0) 3122 goto out; 3123 } 3124 3125 ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); 3126 if (ret) 3127 goto out; 3128 3129 if (exists) { 3130 list_add_tail(&pm->list, &entry->list); 3131 } else { 3132 rb_link_node(&pm->node, parent, p); 3133 rb_insert_color(&pm->node, &sctx->pending_dir_moves); 3134 } 3135 ret = 0; 3136 out: 3137 if (ret) { 3138 __free_recorded_refs(&pm->update_refs); 3139 kfree(pm); 3140 } 3141 return ret; 3142 } 3143 3144 static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, 3145 u64 parent_ino) 3146 { 3147 struct rb_node *n = sctx->pending_dir_moves.rb_node; 3148 struct pending_dir_move *entry; 3149 3150 while (n) { 3151 entry = rb_entry(n, struct pending_dir_move, node); 3152 if (parent_ino < entry->parent_ino) 3153 n = n->rb_left; 3154 else if (parent_ino > entry->parent_ino) 3155 n = n->rb_right; 3156 else 3157 return entry; 3158 } 3159 return NULL; 3160 } 3161 3162 static int path_loop(struct send_ctx *sctx, struct fs_path *name, 3163 u64 ino, u64 gen, u64 *ancestor_ino) 3164 { 3165 int ret = 0; 3166 u64 parent_inode = 0; 3167 u64 parent_gen = 0; 3168 u64 start_ino = ino; 3169 3170 *ancestor_ino = 0; 3171 while (ino != BTRFS_FIRST_FREE_OBJECTID) { 3172 fs_path_reset(name); 3173 3174 if (is_waiting_for_rm(sctx, ino, gen)) 3175 break; 3176 if (is_waiting_for_move(sctx, ino)) { 3177 if (*ancestor_ino == 0) 3178 *ancestor_ino = ino; 3179 ret = get_first_ref(sctx->parent_root, ino, 3180 &parent_inode, &parent_gen, name); 3181 } else { 3182 ret = __get_cur_name_and_parent(sctx, ino, gen, 3183 &parent_inode, 3184 &parent_gen, name); 3185 if (ret > 0) { 3186 ret = 0; 3187 break; 3188 } 3189 } 3190 if (ret < 0) 3191 break; 3192 if (parent_inode == start_ino) { 3193 ret = 1; 3194 if (*ancestor_ino == 0) 3195 *ancestor_ino = ino; 3196 break; 3197 } 3198 ino = parent_inode; 3199 gen = parent_gen; 3200 } 3201 return ret; 3202 } 3203 3204 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3205 { 3206 struct fs_path *from_path = NULL; 3207 struct fs_path *to_path = NULL; 3208 struct fs_path *name = NULL; 3209 u64 orig_progress = sctx->send_progress; 3210 struct recorded_ref *cur; 3211 u64 parent_ino, parent_gen; 3212 struct waiting_dir_move *dm = NULL; 3213 u64 rmdir_ino = 0; 3214 u64 rmdir_gen; 3215 u64 ancestor; 3216 bool is_orphan; 3217 int ret; 3218 3219 name = fs_path_alloc(); 3220 from_path = fs_path_alloc(); 3221 if (!name || !from_path) { 3222 ret = -ENOMEM; 3223 goto out; 3224 } 3225 3226 dm = get_waiting_dir_move(sctx, pm->ino); 3227 ASSERT(dm); 3228 rmdir_ino = dm->rmdir_ino; 3229 rmdir_gen = dm->rmdir_gen; 3230 is_orphan = dm->orphanized; 3231 free_waiting_dir_move(sctx, dm); 3232 3233 if (is_orphan) { 3234 ret = gen_unique_name(sctx, pm->ino, 3235 pm->gen, from_path); 3236 } else { 3237 ret = get_first_ref(sctx->parent_root, pm->ino, 3238 &parent_ino, &parent_gen, name); 3239 if (ret < 0) 3240 goto out; 3241 ret = get_cur_path(sctx, parent_ino, parent_gen, 3242 from_path); 3243 if (ret < 0) 3244 goto out; 3245 ret = fs_path_add_path(from_path, name); 3246 } 3247 if (ret < 0) 3248 goto out; 3249 3250 sctx->send_progress = sctx->cur_ino + 1; 3251 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); 3252 if (ret < 0) 3253 goto out; 3254 if (ret) { 3255 LIST_HEAD(deleted_refs); 3256 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); 3257 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, 3258 &pm->update_refs, &deleted_refs, 3259 is_orphan); 3260 if (ret < 0) 3261 goto out; 3262 if (rmdir_ino) { 3263 dm = get_waiting_dir_move(sctx, pm->ino); 3264 ASSERT(dm); 3265 dm->rmdir_ino = rmdir_ino; 3266 dm->rmdir_gen = rmdir_gen; 3267 } 3268 goto out; 3269 } 3270 fs_path_reset(name); 3271 to_path = name; 3272 name = NULL; 3273 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3274 if (ret < 0) 3275 goto out; 3276 3277 ret = send_rename(sctx, from_path, to_path); 3278 if (ret < 0) 3279 goto out; 3280 3281 if (rmdir_ino) { 3282 struct orphan_dir_info *odi; 3283 u64 gen; 3284 3285 odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen); 3286 if (!odi) { 3287 /* already deleted */ 3288 goto finish; 3289 } 3290 gen = odi->gen; 3291 3292 ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); 3293 if (ret < 0) 3294 goto out; 3295 if (!ret) 3296 goto finish; 3297 3298 name = fs_path_alloc(); 3299 if (!name) { 3300 ret = -ENOMEM; 3301 goto out; 3302 } 3303 ret = get_cur_path(sctx, rmdir_ino, gen, name); 3304 if (ret < 0) 3305 goto out; 3306 ret = send_rmdir(sctx, name); 3307 if (ret < 0) 3308 goto out; 3309 } 3310 3311 finish: 3312 ret = send_utimes(sctx, pm->ino, pm->gen); 3313 if (ret < 0) 3314 goto out; 3315 3316 /* 3317 * After rename/move, need to update the utimes of both new parent(s) 3318 * and old parent(s). 3319 */ 3320 list_for_each_entry(cur, &pm->update_refs, list) { 3321 /* 3322 * The parent inode might have been deleted in the send snapshot 3323 */ 3324 ret = get_inode_info(sctx->send_root, cur->dir, NULL, 3325 NULL, NULL, NULL, NULL, NULL); 3326 if (ret == -ENOENT) { 3327 ret = 0; 3328 continue; 3329 } 3330 if (ret < 0) 3331 goto out; 3332 3333 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3334 if (ret < 0) 3335 goto out; 3336 } 3337 3338 out: 3339 fs_path_free(name); 3340 fs_path_free(from_path); 3341 fs_path_free(to_path); 3342 sctx->send_progress = orig_progress; 3343 3344 return ret; 3345 } 3346 3347 static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) 3348 { 3349 if (!list_empty(&m->list)) 3350 list_del(&m->list); 3351 if (!RB_EMPTY_NODE(&m->node)) 3352 rb_erase(&m->node, &sctx->pending_dir_moves); 3353 __free_recorded_refs(&m->update_refs); 3354 kfree(m); 3355 } 3356 3357 static void tail_append_pending_moves(struct send_ctx *sctx, 3358 struct pending_dir_move *moves, 3359 struct list_head *stack) 3360 { 3361 if (list_empty(&moves->list)) { 3362 list_add_tail(&moves->list, stack); 3363 } else { 3364 LIST_HEAD(list); 3365 list_splice_init(&moves->list, &list); 3366 list_add_tail(&moves->list, stack); 3367 list_splice_tail(&list, stack); 3368 } 3369 if (!RB_EMPTY_NODE(&moves->node)) { 3370 rb_erase(&moves->node, &sctx->pending_dir_moves); 3371 RB_CLEAR_NODE(&moves->node); 3372 } 3373 } 3374 3375 static int apply_children_dir_moves(struct send_ctx *sctx) 3376 { 3377 struct pending_dir_move *pm; 3378 struct list_head stack; 3379 u64 parent_ino = sctx->cur_ino; 3380 int ret = 0; 3381 3382 pm = get_pending_dir_moves(sctx, parent_ino); 3383 if (!pm) 3384 return 0; 3385 3386 INIT_LIST_HEAD(&stack); 3387 tail_append_pending_moves(sctx, pm, &stack); 3388 3389 while (!list_empty(&stack)) { 3390 pm = list_first_entry(&stack, struct pending_dir_move, list); 3391 parent_ino = pm->ino; 3392 ret = apply_dir_move(sctx, pm); 3393 free_pending_move(sctx, pm); 3394 if (ret) 3395 goto out; 3396 pm = get_pending_dir_moves(sctx, parent_ino); 3397 if (pm) 3398 tail_append_pending_moves(sctx, pm, &stack); 3399 } 3400 return 0; 3401 3402 out: 3403 while (!list_empty(&stack)) { 3404 pm = list_first_entry(&stack, struct pending_dir_move, list); 3405 free_pending_move(sctx, pm); 3406 } 3407 return ret; 3408 } 3409 3410 /* 3411 * We might need to delay a directory rename even when no ancestor directory 3412 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was 3413 * renamed. This happens when we rename a directory to the old name (the name 3414 * in the parent root) of some other unrelated directory that got its rename 3415 * delayed due to some ancestor with higher number that got renamed. 3416 * 3417 * Example: 3418 * 3419 * Parent snapshot: 3420 * . (ino 256) 3421 * |---- a/ (ino 257) 3422 * | |---- file (ino 260) 3423 * | 3424 * |---- b/ (ino 258) 3425 * |---- c/ (ino 259) 3426 * 3427 * Send snapshot: 3428 * . (ino 256) 3429 * |---- a/ (ino 258) 3430 * |---- x/ (ino 259) 3431 * |---- y/ (ino 257) 3432 * |----- file (ino 260) 3433 * 3434 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 3435 * from 'a' to 'x/y' happening first, which in turn depends on the rename of 3436 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream 3437 * must issue is: 3438 * 3439 * 1 - rename 259 from 'c' to 'x' 3440 * 2 - rename 257 from 'a' to 'x/y' 3441 * 3 - rename 258 from 'b' to 'a' 3442 * 3443 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can 3444 * be done right away and < 0 on error. 3445 */ 3446 static int wait_for_dest_dir_move(struct send_ctx *sctx, 3447 struct recorded_ref *parent_ref, 3448 const bool is_orphan) 3449 { 3450 struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; 3451 struct btrfs_path *path; 3452 struct btrfs_key key; 3453 struct btrfs_key di_key; 3454 struct btrfs_dir_item *di; 3455 u64 left_gen; 3456 u64 right_gen; 3457 int ret = 0; 3458 struct waiting_dir_move *wdm; 3459 3460 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) 3461 return 0; 3462 3463 path = alloc_path_for_send(); 3464 if (!path) 3465 return -ENOMEM; 3466 3467 key.objectid = parent_ref->dir; 3468 key.type = BTRFS_DIR_ITEM_KEY; 3469 key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); 3470 3471 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); 3472 if (ret < 0) { 3473 goto out; 3474 } else if (ret > 0) { 3475 ret = 0; 3476 goto out; 3477 } 3478 3479 di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, 3480 parent_ref->name_len); 3481 if (!di) { 3482 ret = 0; 3483 goto out; 3484 } 3485 /* 3486 * di_key.objectid has the number of the inode that has a dentry in the 3487 * parent directory with the same name that sctx->cur_ino is being 3488 * renamed to. We need to check if that inode is in the send root as 3489 * well and if it is currently marked as an inode with a pending rename, 3490 * if it is, we need to delay the rename of sctx->cur_ino as well, so 3491 * that it happens after that other inode is renamed. 3492 */ 3493 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); 3494 if (di_key.type != BTRFS_INODE_ITEM_KEY) { 3495 ret = 0; 3496 goto out; 3497 } 3498 3499 ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, 3500 &left_gen, NULL, NULL, NULL, NULL); 3501 if (ret < 0) 3502 goto out; 3503 ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, 3504 &right_gen, NULL, NULL, NULL, NULL); 3505 if (ret < 0) { 3506 if (ret == -ENOENT) 3507 ret = 0; 3508 goto out; 3509 } 3510 3511 /* Different inode, no need to delay the rename of sctx->cur_ino */ 3512 if (right_gen != left_gen) { 3513 ret = 0; 3514 goto out; 3515 } 3516 3517 wdm = get_waiting_dir_move(sctx, di_key.objectid); 3518 if (wdm && !wdm->orphanized) { 3519 ret = add_pending_dir_move(sctx, 3520 sctx->cur_ino, 3521 sctx->cur_inode_gen, 3522 di_key.objectid, 3523 &sctx->new_refs, 3524 &sctx->deleted_refs, 3525 is_orphan); 3526 if (!ret) 3527 ret = 1; 3528 } 3529 out: 3530 btrfs_free_path(path); 3531 return ret; 3532 } 3533 3534 /* 3535 * Check if inode ino2, or any of its ancestors, is inode ino1. 3536 * Return 1 if true, 0 if false and < 0 on error. 3537 */ 3538 static int check_ino_in_path(struct btrfs_root *root, 3539 const u64 ino1, 3540 const u64 ino1_gen, 3541 const u64 ino2, 3542 const u64 ino2_gen, 3543 struct fs_path *fs_path) 3544 { 3545 u64 ino = ino2; 3546 3547 if (ino1 == ino2) 3548 return ino1_gen == ino2_gen; 3549 3550 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3551 u64 parent; 3552 u64 parent_gen; 3553 int ret; 3554 3555 fs_path_reset(fs_path); 3556 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); 3557 if (ret < 0) 3558 return ret; 3559 if (parent == ino1) 3560 return parent_gen == ino1_gen; 3561 ino = parent; 3562 } 3563 return 0; 3564 } 3565 3566 /* 3567 * Check if ino ino1 is an ancestor of inode ino2 in the given root for any 3568 * possible path (in case ino2 is not a directory and has multiple hard links). 3569 * Return 1 if true, 0 if false and < 0 on error. 3570 */ 3571 static int is_ancestor(struct btrfs_root *root, 3572 const u64 ino1, 3573 const u64 ino1_gen, 3574 const u64 ino2, 3575 struct fs_path *fs_path) 3576 { 3577 bool free_fs_path = false; 3578 int ret = 0; 3579 struct btrfs_path *path = NULL; 3580 struct btrfs_key key; 3581 3582 if (!fs_path) { 3583 fs_path = fs_path_alloc(); 3584 if (!fs_path) 3585 return -ENOMEM; 3586 free_fs_path = true; 3587 } 3588 3589 path = alloc_path_for_send(); 3590 if (!path) { 3591 ret = -ENOMEM; 3592 goto out; 3593 } 3594 3595 key.objectid = ino2; 3596 key.type = BTRFS_INODE_REF_KEY; 3597 key.offset = 0; 3598 3599 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3600 if (ret < 0) 3601 goto out; 3602 3603 while (true) { 3604 struct extent_buffer *leaf = path->nodes[0]; 3605 int slot = path->slots[0]; 3606 u32 cur_offset = 0; 3607 u32 item_size; 3608 3609 if (slot >= btrfs_header_nritems(leaf)) { 3610 ret = btrfs_next_leaf(root, path); 3611 if (ret < 0) 3612 goto out; 3613 if (ret > 0) 3614 break; 3615 continue; 3616 } 3617 3618 btrfs_item_key_to_cpu(leaf, &key, slot); 3619 if (key.objectid != ino2) 3620 break; 3621 if (key.type != BTRFS_INODE_REF_KEY && 3622 key.type != BTRFS_INODE_EXTREF_KEY) 3623 break; 3624 3625 item_size = btrfs_item_size_nr(leaf, slot); 3626 while (cur_offset < item_size) { 3627 u64 parent; 3628 u64 parent_gen; 3629 3630 if (key.type == BTRFS_INODE_EXTREF_KEY) { 3631 unsigned long ptr; 3632 struct btrfs_inode_extref *extref; 3633 3634 ptr = btrfs_item_ptr_offset(leaf, slot); 3635 extref = (struct btrfs_inode_extref *) 3636 (ptr + cur_offset); 3637 parent = btrfs_inode_extref_parent(leaf, 3638 extref); 3639 cur_offset += sizeof(*extref); 3640 cur_offset += btrfs_inode_extref_name_len(leaf, 3641 extref); 3642 } else { 3643 parent = key.offset; 3644 cur_offset = item_size; 3645 } 3646 3647 ret = get_inode_info(root, parent, NULL, &parent_gen, 3648 NULL, NULL, NULL, NULL); 3649 if (ret < 0) 3650 goto out; 3651 ret = check_ino_in_path(root, ino1, ino1_gen, 3652 parent, parent_gen, fs_path); 3653 if (ret) 3654 goto out; 3655 } 3656 path->slots[0]++; 3657 } 3658 ret = 0; 3659 out: 3660 btrfs_free_path(path); 3661 if (free_fs_path) 3662 fs_path_free(fs_path); 3663 return ret; 3664 } 3665 3666 static int wait_for_parent_move(struct send_ctx *sctx, 3667 struct recorded_ref *parent_ref, 3668 const bool is_orphan) 3669 { 3670 int ret = 0; 3671 u64 ino = parent_ref->dir; 3672 u64 ino_gen = parent_ref->dir_gen; 3673 u64 parent_ino_before, parent_ino_after; 3674 struct fs_path *path_before = NULL; 3675 struct fs_path *path_after = NULL; 3676 int len1, len2; 3677 3678 path_after = fs_path_alloc(); 3679 path_before = fs_path_alloc(); 3680 if (!path_after || !path_before) { 3681 ret = -ENOMEM; 3682 goto out; 3683 } 3684 3685 /* 3686 * Our current directory inode may not yet be renamed/moved because some 3687 * ancestor (immediate or not) has to be renamed/moved first. So find if 3688 * such ancestor exists and make sure our own rename/move happens after 3689 * that ancestor is processed to avoid path build infinite loops (done 3690 * at get_cur_path()). 3691 */ 3692 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3693 u64 parent_ino_after_gen; 3694 3695 if (is_waiting_for_move(sctx, ino)) { 3696 /* 3697 * If the current inode is an ancestor of ino in the 3698 * parent root, we need to delay the rename of the 3699 * current inode, otherwise don't delayed the rename 3700 * because we can end up with a circular dependency 3701 * of renames, resulting in some directories never 3702 * getting the respective rename operations issued in 3703 * the send stream or getting into infinite path build 3704 * loops. 3705 */ 3706 ret = is_ancestor(sctx->parent_root, 3707 sctx->cur_ino, sctx->cur_inode_gen, 3708 ino, path_before); 3709 if (ret) 3710 break; 3711 } 3712 3713 fs_path_reset(path_before); 3714 fs_path_reset(path_after); 3715 3716 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3717 &parent_ino_after_gen, path_after); 3718 if (ret < 0) 3719 goto out; 3720 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, 3721 NULL, path_before); 3722 if (ret < 0 && ret != -ENOENT) { 3723 goto out; 3724 } else if (ret == -ENOENT) { 3725 ret = 0; 3726 break; 3727 } 3728 3729 len1 = fs_path_len(path_before); 3730 len2 = fs_path_len(path_after); 3731 if (ino > sctx->cur_ino && 3732 (parent_ino_before != parent_ino_after || len1 != len2 || 3733 memcmp(path_before->start, path_after->start, len1))) { 3734 u64 parent_ino_gen; 3735 3736 ret = get_inode_info(sctx->parent_root, ino, NULL, 3737 &parent_ino_gen, NULL, NULL, NULL, 3738 NULL); 3739 if (ret < 0) 3740 goto out; 3741 if (ino_gen == parent_ino_gen) { 3742 ret = 1; 3743 break; 3744 } 3745 } 3746 ino = parent_ino_after; 3747 ino_gen = parent_ino_after_gen; 3748 } 3749 3750 out: 3751 fs_path_free(path_before); 3752 fs_path_free(path_after); 3753 3754 if (ret == 1) { 3755 ret = add_pending_dir_move(sctx, 3756 sctx->cur_ino, 3757 sctx->cur_inode_gen, 3758 ino, 3759 &sctx->new_refs, 3760 &sctx->deleted_refs, 3761 is_orphan); 3762 if (!ret) 3763 ret = 1; 3764 } 3765 3766 return ret; 3767 } 3768 3769 static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 3770 { 3771 int ret; 3772 struct fs_path *new_path; 3773 3774 /* 3775 * Our reference's name member points to its full_path member string, so 3776 * we use here a new path. 3777 */ 3778 new_path = fs_path_alloc(); 3779 if (!new_path) 3780 return -ENOMEM; 3781 3782 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); 3783 if (ret < 0) { 3784 fs_path_free(new_path); 3785 return ret; 3786 } 3787 ret = fs_path_add(new_path, ref->name, ref->name_len); 3788 if (ret < 0) { 3789 fs_path_free(new_path); 3790 return ret; 3791 } 3792 3793 fs_path_free(ref->full_path); 3794 set_ref_path(ref, new_path); 3795 3796 return 0; 3797 } 3798 3799 /* 3800 * When processing the new references for an inode we may orphanize an existing 3801 * directory inode because its old name conflicts with one of the new references 3802 * of the current inode. Later, when processing another new reference of our 3803 * inode, we might need to orphanize another inode, but the path we have in the 3804 * reference reflects the pre-orphanization name of the directory we previously 3805 * orphanized. For example: 3806 * 3807 * parent snapshot looks like: 3808 * 3809 * . (ino 256) 3810 * |----- f1 (ino 257) 3811 * |----- f2 (ino 258) 3812 * |----- d1/ (ino 259) 3813 * |----- d2/ (ino 260) 3814 * 3815 * send snapshot looks like: 3816 * 3817 * . (ino 256) 3818 * |----- d1 (ino 258) 3819 * |----- f2/ (ino 259) 3820 * |----- f2_link/ (ino 260) 3821 * | |----- f1 (ino 257) 3822 * | 3823 * |----- d2 (ino 258) 3824 * 3825 * When processing inode 257 we compute the name for inode 259 as "d1", and we 3826 * cache it in the name cache. Later when we start processing inode 258, when 3827 * collecting all its new references we set a full path of "d1/d2" for its new 3828 * reference with name "d2". When we start processing the new references we 3829 * start by processing the new reference with name "d1", and this results in 3830 * orphanizing inode 259, since its old reference causes a conflict. Then we 3831 * move on the next new reference, with name "d2", and we find out we must 3832 * orphanize inode 260, as its old reference conflicts with ours - but for the 3833 * orphanization we use a source path corresponding to the path we stored in the 3834 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the 3835 * receiver fail since the path component "d1/" no longer exists, it was renamed 3836 * to "o259-6-0/" when processing the previous new reference. So in this case we 3837 * must recompute the path in the new reference and use it for the new 3838 * orphanization operation. 3839 */ 3840 static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 3841 { 3842 char *name; 3843 int ret; 3844 3845 name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); 3846 if (!name) 3847 return -ENOMEM; 3848 3849 fs_path_reset(ref->full_path); 3850 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); 3851 if (ret < 0) 3852 goto out; 3853 3854 ret = fs_path_add(ref->full_path, name, ref->name_len); 3855 if (ret < 0) 3856 goto out; 3857 3858 /* Update the reference's base name pointer. */ 3859 set_ref_path(ref, ref->full_path); 3860 out: 3861 kfree(name); 3862 return ret; 3863 } 3864 3865 /* 3866 * This does all the move/link/unlink/rmdir magic. 3867 */ 3868 static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) 3869 { 3870 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 3871 int ret = 0; 3872 struct recorded_ref *cur; 3873 struct recorded_ref *cur2; 3874 struct list_head check_dirs; 3875 struct fs_path *valid_path = NULL; 3876 u64 ow_inode = 0; 3877 u64 ow_gen; 3878 u64 ow_mode; 3879 int did_overwrite = 0; 3880 int is_orphan = 0; 3881 u64 last_dir_ino_rm = 0; 3882 bool can_rename = true; 3883 bool orphanized_dir = false; 3884 bool orphanized_ancestor = false; 3885 3886 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); 3887 3888 /* 3889 * This should never happen as the root dir always has the same ref 3890 * which is always '..' 3891 */ 3892 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 3893 INIT_LIST_HEAD(&check_dirs); 3894 3895 valid_path = fs_path_alloc(); 3896 if (!valid_path) { 3897 ret = -ENOMEM; 3898 goto out; 3899 } 3900 3901 /* 3902 * First, check if the first ref of the current inode was overwritten 3903 * before. If yes, we know that the current inode was already orphanized 3904 * and thus use the orphan name. If not, we can use get_cur_path to 3905 * get the path of the first ref as it would like while receiving at 3906 * this point in time. 3907 * New inodes are always orphan at the beginning, so force to use the 3908 * orphan name in this case. 3909 * The first ref is stored in valid_path and will be updated if it 3910 * gets moved around. 3911 */ 3912 if (!sctx->cur_inode_new) { 3913 ret = did_overwrite_first_ref(sctx, sctx->cur_ino, 3914 sctx->cur_inode_gen); 3915 if (ret < 0) 3916 goto out; 3917 if (ret) 3918 did_overwrite = 1; 3919 } 3920 if (sctx->cur_inode_new || did_overwrite) { 3921 ret = gen_unique_name(sctx, sctx->cur_ino, 3922 sctx->cur_inode_gen, valid_path); 3923 if (ret < 0) 3924 goto out; 3925 is_orphan = 1; 3926 } else { 3927 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 3928 valid_path); 3929 if (ret < 0) 3930 goto out; 3931 } 3932 3933 /* 3934 * Before doing any rename and link operations, do a first pass on the 3935 * new references to orphanize any unprocessed inodes that may have a 3936 * reference that conflicts with one of the new references of the current 3937 * inode. This needs to happen first because a new reference may conflict 3938 * with the old reference of a parent directory, so we must make sure 3939 * that the path used for link and rename commands don't use an 3940 * orphanized name when an ancestor was not yet orphanized. 3941 * 3942 * Example: 3943 * 3944 * Parent snapshot: 3945 * 3946 * . (ino 256) 3947 * |----- testdir/ (ino 259) 3948 * | |----- a (ino 257) 3949 * | 3950 * |----- b (ino 258) 3951 * 3952 * Send snapshot: 3953 * 3954 * . (ino 256) 3955 * |----- testdir_2/ (ino 259) 3956 * | |----- a (ino 260) 3957 * | 3958 * |----- testdir (ino 257) 3959 * |----- b (ino 257) 3960 * |----- b2 (ino 258) 3961 * 3962 * Processing the new reference for inode 257 with name "b" may happen 3963 * before processing the new reference with name "testdir". If so, we 3964 * must make sure that by the time we send a link command to create the 3965 * hard link "b", inode 259 was already orphanized, since the generated 3966 * path in "valid_path" already contains the orphanized name for 259. 3967 * We are processing inode 257, so only later when processing 259 we do 3968 * the rename operation to change its temporary (orphanized) name to 3969 * "testdir_2". 3970 */ 3971 list_for_each_entry(cur, &sctx->new_refs, list) { 3972 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 3973 if (ret < 0) 3974 goto out; 3975 if (ret == inode_state_will_create) 3976 continue; 3977 3978 /* 3979 * Check if this new ref would overwrite the first ref of another 3980 * unprocessed inode. If yes, orphanize the overwritten inode. 3981 * If we find an overwritten ref that is not the first ref, 3982 * simply unlink it. 3983 */ 3984 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, 3985 cur->name, cur->name_len, 3986 &ow_inode, &ow_gen, &ow_mode); 3987 if (ret < 0) 3988 goto out; 3989 if (ret) { 3990 ret = is_first_ref(sctx->parent_root, 3991 ow_inode, cur->dir, cur->name, 3992 cur->name_len); 3993 if (ret < 0) 3994 goto out; 3995 if (ret) { 3996 struct name_cache_entry *nce; 3997 struct waiting_dir_move *wdm; 3998 3999 if (orphanized_dir) { 4000 ret = refresh_ref_path(sctx, cur); 4001 if (ret < 0) 4002 goto out; 4003 } 4004 4005 ret = orphanize_inode(sctx, ow_inode, ow_gen, 4006 cur->full_path); 4007 if (ret < 0) 4008 goto out; 4009 if (S_ISDIR(ow_mode)) 4010 orphanized_dir = true; 4011 4012 /* 4013 * If ow_inode has its rename operation delayed 4014 * make sure that its orphanized name is used in 4015 * the source path when performing its rename 4016 * operation. 4017 */ 4018 if (is_waiting_for_move(sctx, ow_inode)) { 4019 wdm = get_waiting_dir_move(sctx, 4020 ow_inode); 4021 ASSERT(wdm); 4022 wdm->orphanized = true; 4023 } 4024 4025 /* 4026 * Make sure we clear our orphanized inode's 4027 * name from the name cache. This is because the 4028 * inode ow_inode might be an ancestor of some 4029 * other inode that will be orphanized as well 4030 * later and has an inode number greater than 4031 * sctx->send_progress. We need to prevent 4032 * future name lookups from using the old name 4033 * and get instead the orphan name. 4034 */ 4035 nce = name_cache_search(sctx, ow_inode, ow_gen); 4036 if (nce) { 4037 name_cache_delete(sctx, nce); 4038 kfree(nce); 4039 } 4040 4041 /* 4042 * ow_inode might currently be an ancestor of 4043 * cur_ino, therefore compute valid_path (the 4044 * current path of cur_ino) again because it 4045 * might contain the pre-orphanization name of 4046 * ow_inode, which is no longer valid. 4047 */ 4048 ret = is_ancestor(sctx->parent_root, 4049 ow_inode, ow_gen, 4050 sctx->cur_ino, NULL); 4051 if (ret > 0) { 4052 orphanized_ancestor = true; 4053 fs_path_reset(valid_path); 4054 ret = get_cur_path(sctx, sctx->cur_ino, 4055 sctx->cur_inode_gen, 4056 valid_path); 4057 } 4058 if (ret < 0) 4059 goto out; 4060 } else { 4061 /* 4062 * If we previously orphanized a directory that 4063 * collided with a new reference that we already 4064 * processed, recompute the current path because 4065 * that directory may be part of the path. 4066 */ 4067 if (orphanized_dir) { 4068 ret = refresh_ref_path(sctx, cur); 4069 if (ret < 0) 4070 goto out; 4071 } 4072 ret = send_unlink(sctx, cur->full_path); 4073 if (ret < 0) 4074 goto out; 4075 } 4076 } 4077 4078 } 4079 4080 list_for_each_entry(cur, &sctx->new_refs, list) { 4081 /* 4082 * We may have refs where the parent directory does not exist 4083 * yet. This happens if the parent directories inum is higher 4084 * than the current inum. To handle this case, we create the 4085 * parent directory out of order. But we need to check if this 4086 * did already happen before due to other refs in the same dir. 4087 */ 4088 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 4089 if (ret < 0) 4090 goto out; 4091 if (ret == inode_state_will_create) { 4092 ret = 0; 4093 /* 4094 * First check if any of the current inodes refs did 4095 * already create the dir. 4096 */ 4097 list_for_each_entry(cur2, &sctx->new_refs, list) { 4098 if (cur == cur2) 4099 break; 4100 if (cur2->dir == cur->dir) { 4101 ret = 1; 4102 break; 4103 } 4104 } 4105 4106 /* 4107 * If that did not happen, check if a previous inode 4108 * did already create the dir. 4109 */ 4110 if (!ret) 4111 ret = did_create_dir(sctx, cur->dir); 4112 if (ret < 0) 4113 goto out; 4114 if (!ret) { 4115 ret = send_create_inode(sctx, cur->dir); 4116 if (ret < 0) 4117 goto out; 4118 } 4119 } 4120 4121 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { 4122 ret = wait_for_dest_dir_move(sctx, cur, is_orphan); 4123 if (ret < 0) 4124 goto out; 4125 if (ret == 1) { 4126 can_rename = false; 4127 *pending_move = 1; 4128 } 4129 } 4130 4131 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && 4132 can_rename) { 4133 ret = wait_for_parent_move(sctx, cur, is_orphan); 4134 if (ret < 0) 4135 goto out; 4136 if (ret == 1) { 4137 can_rename = false; 4138 *pending_move = 1; 4139 } 4140 } 4141 4142 /* 4143 * link/move the ref to the new place. If we have an orphan 4144 * inode, move it and update valid_path. If not, link or move 4145 * it depending on the inode mode. 4146 */ 4147 if (is_orphan && can_rename) { 4148 ret = send_rename(sctx, valid_path, cur->full_path); 4149 if (ret < 0) 4150 goto out; 4151 is_orphan = 0; 4152 ret = fs_path_copy(valid_path, cur->full_path); 4153 if (ret < 0) 4154 goto out; 4155 } else if (can_rename) { 4156 if (S_ISDIR(sctx->cur_inode_mode)) { 4157 /* 4158 * Dirs can't be linked, so move it. For moved 4159 * dirs, we always have one new and one deleted 4160 * ref. The deleted ref is ignored later. 4161 */ 4162 ret = send_rename(sctx, valid_path, 4163 cur->full_path); 4164 if (!ret) 4165 ret = fs_path_copy(valid_path, 4166 cur->full_path); 4167 if (ret < 0) 4168 goto out; 4169 } else { 4170 /* 4171 * We might have previously orphanized an inode 4172 * which is an ancestor of our current inode, 4173 * so our reference's full path, which was 4174 * computed before any such orphanizations, must 4175 * be updated. 4176 */ 4177 if (orphanized_dir) { 4178 ret = update_ref_path(sctx, cur); 4179 if (ret < 0) 4180 goto out; 4181 } 4182 ret = send_link(sctx, cur->full_path, 4183 valid_path); 4184 if (ret < 0) 4185 goto out; 4186 } 4187 } 4188 ret = dup_ref(cur, &check_dirs); 4189 if (ret < 0) 4190 goto out; 4191 } 4192 4193 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { 4194 /* 4195 * Check if we can already rmdir the directory. If not, 4196 * orphanize it. For every dir item inside that gets deleted 4197 * later, we do this check again and rmdir it then if possible. 4198 * See the use of check_dirs for more details. 4199 */ 4200 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, 4201 sctx->cur_ino); 4202 if (ret < 0) 4203 goto out; 4204 if (ret) { 4205 ret = send_rmdir(sctx, valid_path); 4206 if (ret < 0) 4207 goto out; 4208 } else if (!is_orphan) { 4209 ret = orphanize_inode(sctx, sctx->cur_ino, 4210 sctx->cur_inode_gen, valid_path); 4211 if (ret < 0) 4212 goto out; 4213 is_orphan = 1; 4214 } 4215 4216 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4217 ret = dup_ref(cur, &check_dirs); 4218 if (ret < 0) 4219 goto out; 4220 } 4221 } else if (S_ISDIR(sctx->cur_inode_mode) && 4222 !list_empty(&sctx->deleted_refs)) { 4223 /* 4224 * We have a moved dir. Add the old parent to check_dirs 4225 */ 4226 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, 4227 list); 4228 ret = dup_ref(cur, &check_dirs); 4229 if (ret < 0) 4230 goto out; 4231 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 4232 /* 4233 * We have a non dir inode. Go through all deleted refs and 4234 * unlink them if they were not already overwritten by other 4235 * inodes. 4236 */ 4237 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4238 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, 4239 sctx->cur_ino, sctx->cur_inode_gen, 4240 cur->name, cur->name_len); 4241 if (ret < 0) 4242 goto out; 4243 if (!ret) { 4244 /* 4245 * If we orphanized any ancestor before, we need 4246 * to recompute the full path for deleted names, 4247 * since any such path was computed before we 4248 * processed any references and orphanized any 4249 * ancestor inode. 4250 */ 4251 if (orphanized_ancestor) { 4252 ret = update_ref_path(sctx, cur); 4253 if (ret < 0) 4254 goto out; 4255 } 4256 ret = send_unlink(sctx, cur->full_path); 4257 if (ret < 0) 4258 goto out; 4259 } 4260 ret = dup_ref(cur, &check_dirs); 4261 if (ret < 0) 4262 goto out; 4263 } 4264 /* 4265 * If the inode is still orphan, unlink the orphan. This may 4266 * happen when a previous inode did overwrite the first ref 4267 * of this inode and no new refs were added for the current 4268 * inode. Unlinking does not mean that the inode is deleted in 4269 * all cases. There may still be links to this inode in other 4270 * places. 4271 */ 4272 if (is_orphan) { 4273 ret = send_unlink(sctx, valid_path); 4274 if (ret < 0) 4275 goto out; 4276 } 4277 } 4278 4279 /* 4280 * We did collect all parent dirs where cur_inode was once located. We 4281 * now go through all these dirs and check if they are pending for 4282 * deletion and if it's finally possible to perform the rmdir now. 4283 * We also update the inode stats of the parent dirs here. 4284 */ 4285 list_for_each_entry(cur, &check_dirs, list) { 4286 /* 4287 * In case we had refs into dirs that were not processed yet, 4288 * we don't need to do the utime and rmdir logic for these dirs. 4289 * The dir will be processed later. 4290 */ 4291 if (cur->dir > sctx->cur_ino) 4292 continue; 4293 4294 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); 4295 if (ret < 0) 4296 goto out; 4297 4298 if (ret == inode_state_did_create || 4299 ret == inode_state_no_change) { 4300 /* TODO delayed utimes */ 4301 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 4302 if (ret < 0) 4303 goto out; 4304 } else if (ret == inode_state_did_delete && 4305 cur->dir != last_dir_ino_rm) { 4306 ret = can_rmdir(sctx, cur->dir, cur->dir_gen, 4307 sctx->cur_ino); 4308 if (ret < 0) 4309 goto out; 4310 if (ret) { 4311 ret = get_cur_path(sctx, cur->dir, 4312 cur->dir_gen, valid_path); 4313 if (ret < 0) 4314 goto out; 4315 ret = send_rmdir(sctx, valid_path); 4316 if (ret < 0) 4317 goto out; 4318 last_dir_ino_rm = cur->dir; 4319 } 4320 } 4321 } 4322 4323 ret = 0; 4324 4325 out: 4326 __free_recorded_refs(&check_dirs); 4327 free_recorded_refs(sctx); 4328 fs_path_free(valid_path); 4329 return ret; 4330 } 4331 4332 static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, 4333 void *ctx, struct list_head *refs) 4334 { 4335 int ret = 0; 4336 struct send_ctx *sctx = ctx; 4337 struct fs_path *p; 4338 u64 gen; 4339 4340 p = fs_path_alloc(); 4341 if (!p) 4342 return -ENOMEM; 4343 4344 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, 4345 NULL, NULL); 4346 if (ret < 0) 4347 goto out; 4348 4349 ret = get_cur_path(sctx, dir, gen, p); 4350 if (ret < 0) 4351 goto out; 4352 ret = fs_path_add_path(p, name); 4353 if (ret < 0) 4354 goto out; 4355 4356 ret = __record_ref(refs, dir, gen, p); 4357 4358 out: 4359 if (ret) 4360 fs_path_free(p); 4361 return ret; 4362 } 4363 4364 static int __record_new_ref(int num, u64 dir, int index, 4365 struct fs_path *name, 4366 void *ctx) 4367 { 4368 struct send_ctx *sctx = ctx; 4369 return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); 4370 } 4371 4372 4373 static int __record_deleted_ref(int num, u64 dir, int index, 4374 struct fs_path *name, 4375 void *ctx) 4376 { 4377 struct send_ctx *sctx = ctx; 4378 return record_ref(sctx->parent_root, dir, name, ctx, 4379 &sctx->deleted_refs); 4380 } 4381 4382 static int record_new_ref(struct send_ctx *sctx) 4383 { 4384 int ret; 4385 4386 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4387 sctx->cmp_key, 0, __record_new_ref, sctx); 4388 if (ret < 0) 4389 goto out; 4390 ret = 0; 4391 4392 out: 4393 return ret; 4394 } 4395 4396 static int record_deleted_ref(struct send_ctx *sctx) 4397 { 4398 int ret; 4399 4400 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4401 sctx->cmp_key, 0, __record_deleted_ref, sctx); 4402 if (ret < 0) 4403 goto out; 4404 ret = 0; 4405 4406 out: 4407 return ret; 4408 } 4409 4410 struct find_ref_ctx { 4411 u64 dir; 4412 u64 dir_gen; 4413 struct btrfs_root *root; 4414 struct fs_path *name; 4415 int found_idx; 4416 }; 4417 4418 static int __find_iref(int num, u64 dir, int index, 4419 struct fs_path *name, 4420 void *ctx_) 4421 { 4422 struct find_ref_ctx *ctx = ctx_; 4423 u64 dir_gen; 4424 int ret; 4425 4426 if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && 4427 strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { 4428 /* 4429 * To avoid doing extra lookups we'll only do this if everything 4430 * else matches. 4431 */ 4432 ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, 4433 NULL, NULL, NULL); 4434 if (ret) 4435 return ret; 4436 if (dir_gen != ctx->dir_gen) 4437 return 0; 4438 ctx->found_idx = num; 4439 return 1; 4440 } 4441 return 0; 4442 } 4443 4444 static int find_iref(struct btrfs_root *root, 4445 struct btrfs_path *path, 4446 struct btrfs_key *key, 4447 u64 dir, u64 dir_gen, struct fs_path *name) 4448 { 4449 int ret; 4450 struct find_ref_ctx ctx; 4451 4452 ctx.dir = dir; 4453 ctx.name = name; 4454 ctx.dir_gen = dir_gen; 4455 ctx.found_idx = -1; 4456 ctx.root = root; 4457 4458 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); 4459 if (ret < 0) 4460 return ret; 4461 4462 if (ctx.found_idx == -1) 4463 return -ENOENT; 4464 4465 return ctx.found_idx; 4466 } 4467 4468 static int __record_changed_new_ref(int num, u64 dir, int index, 4469 struct fs_path *name, 4470 void *ctx) 4471 { 4472 u64 dir_gen; 4473 int ret; 4474 struct send_ctx *sctx = ctx; 4475 4476 ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, 4477 NULL, NULL, NULL); 4478 if (ret) 4479 return ret; 4480 4481 ret = find_iref(sctx->parent_root, sctx->right_path, 4482 sctx->cmp_key, dir, dir_gen, name); 4483 if (ret == -ENOENT) 4484 ret = __record_new_ref(num, dir, index, name, sctx); 4485 else if (ret > 0) 4486 ret = 0; 4487 4488 return ret; 4489 } 4490 4491 static int __record_changed_deleted_ref(int num, u64 dir, int index, 4492 struct fs_path *name, 4493 void *ctx) 4494 { 4495 u64 dir_gen; 4496 int ret; 4497 struct send_ctx *sctx = ctx; 4498 4499 ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, 4500 NULL, NULL, NULL); 4501 if (ret) 4502 return ret; 4503 4504 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, 4505 dir, dir_gen, name); 4506 if (ret == -ENOENT) 4507 ret = __record_deleted_ref(num, dir, index, name, sctx); 4508 else if (ret > 0) 4509 ret = 0; 4510 4511 return ret; 4512 } 4513 4514 static int record_changed_ref(struct send_ctx *sctx) 4515 { 4516 int ret = 0; 4517 4518 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4519 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 4520 if (ret < 0) 4521 goto out; 4522 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4523 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 4524 if (ret < 0) 4525 goto out; 4526 ret = 0; 4527 4528 out: 4529 return ret; 4530 } 4531 4532 /* 4533 * Record and process all refs at once. Needed when an inode changes the 4534 * generation number, which means that it was deleted and recreated. 4535 */ 4536 static int process_all_refs(struct send_ctx *sctx, 4537 enum btrfs_compare_tree_result cmd) 4538 { 4539 int ret; 4540 struct btrfs_root *root; 4541 struct btrfs_path *path; 4542 struct btrfs_key key; 4543 struct btrfs_key found_key; 4544 struct extent_buffer *eb; 4545 int slot; 4546 iterate_inode_ref_t cb; 4547 int pending_move = 0; 4548 4549 path = alloc_path_for_send(); 4550 if (!path) 4551 return -ENOMEM; 4552 4553 if (cmd == BTRFS_COMPARE_TREE_NEW) { 4554 root = sctx->send_root; 4555 cb = __record_new_ref; 4556 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { 4557 root = sctx->parent_root; 4558 cb = __record_deleted_ref; 4559 } else { 4560 btrfs_err(sctx->send_root->fs_info, 4561 "Wrong command %d in process_all_refs", cmd); 4562 ret = -EINVAL; 4563 goto out; 4564 } 4565 4566 key.objectid = sctx->cmp_key->objectid; 4567 key.type = BTRFS_INODE_REF_KEY; 4568 key.offset = 0; 4569 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4570 if (ret < 0) 4571 goto out; 4572 4573 while (1) { 4574 eb = path->nodes[0]; 4575 slot = path->slots[0]; 4576 if (slot >= btrfs_header_nritems(eb)) { 4577 ret = btrfs_next_leaf(root, path); 4578 if (ret < 0) 4579 goto out; 4580 else if (ret > 0) 4581 break; 4582 continue; 4583 } 4584 4585 btrfs_item_key_to_cpu(eb, &found_key, slot); 4586 4587 if (found_key.objectid != key.objectid || 4588 (found_key.type != BTRFS_INODE_REF_KEY && 4589 found_key.type != BTRFS_INODE_EXTREF_KEY)) 4590 break; 4591 4592 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 4593 if (ret < 0) 4594 goto out; 4595 4596 path->slots[0]++; 4597 } 4598 btrfs_release_path(path); 4599 4600 /* 4601 * We don't actually care about pending_move as we are simply 4602 * re-creating this inode and will be rename'ing it into place once we 4603 * rename the parent directory. 4604 */ 4605 ret = process_recorded_refs(sctx, &pending_move); 4606 out: 4607 btrfs_free_path(path); 4608 return ret; 4609 } 4610 4611 static int send_set_xattr(struct send_ctx *sctx, 4612 struct fs_path *path, 4613 const char *name, int name_len, 4614 const char *data, int data_len) 4615 { 4616 int ret = 0; 4617 4618 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); 4619 if (ret < 0) 4620 goto out; 4621 4622 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 4623 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); 4624 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); 4625 4626 ret = send_cmd(sctx); 4627 4628 tlv_put_failure: 4629 out: 4630 return ret; 4631 } 4632 4633 static int send_remove_xattr(struct send_ctx *sctx, 4634 struct fs_path *path, 4635 const char *name, int name_len) 4636 { 4637 int ret = 0; 4638 4639 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); 4640 if (ret < 0) 4641 goto out; 4642 4643 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 4644 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); 4645 4646 ret = send_cmd(sctx); 4647 4648 tlv_put_failure: 4649 out: 4650 return ret; 4651 } 4652 4653 static int __process_new_xattr(int num, struct btrfs_key *di_key, 4654 const char *name, int name_len, 4655 const char *data, int data_len, 4656 u8 type, void *ctx) 4657 { 4658 int ret; 4659 struct send_ctx *sctx = ctx; 4660 struct fs_path *p; 4661 struct posix_acl_xattr_header dummy_acl; 4662 4663 /* Capabilities are emitted by finish_inode_if_needed */ 4664 if (!strncmp(name, XATTR_NAME_CAPS, name_len)) 4665 return 0; 4666 4667 p = fs_path_alloc(); 4668 if (!p) 4669 return -ENOMEM; 4670 4671 /* 4672 * This hack is needed because empty acls are stored as zero byte 4673 * data in xattrs. Problem with that is, that receiving these zero byte 4674 * acls will fail later. To fix this, we send a dummy acl list that 4675 * only contains the version number and no entries. 4676 */ 4677 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || 4678 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { 4679 if (data_len == 0) { 4680 dummy_acl.a_version = 4681 cpu_to_le32(POSIX_ACL_XATTR_VERSION); 4682 data = (char *)&dummy_acl; 4683 data_len = sizeof(dummy_acl); 4684 } 4685 } 4686 4687 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 4688 if (ret < 0) 4689 goto out; 4690 4691 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 4692 4693 out: 4694 fs_path_free(p); 4695 return ret; 4696 } 4697 4698 static int __process_deleted_xattr(int num, struct btrfs_key *di_key, 4699 const char *name, int name_len, 4700 const char *data, int data_len, 4701 u8 type, void *ctx) 4702 { 4703 int ret; 4704 struct send_ctx *sctx = ctx; 4705 struct fs_path *p; 4706 4707 p = fs_path_alloc(); 4708 if (!p) 4709 return -ENOMEM; 4710 4711 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 4712 if (ret < 0) 4713 goto out; 4714 4715 ret = send_remove_xattr(sctx, p, name, name_len); 4716 4717 out: 4718 fs_path_free(p); 4719 return ret; 4720 } 4721 4722 static int process_new_xattr(struct send_ctx *sctx) 4723 { 4724 int ret = 0; 4725 4726 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4727 __process_new_xattr, sctx); 4728 4729 return ret; 4730 } 4731 4732 static int process_deleted_xattr(struct send_ctx *sctx) 4733 { 4734 return iterate_dir_item(sctx->parent_root, sctx->right_path, 4735 __process_deleted_xattr, sctx); 4736 } 4737 4738 struct find_xattr_ctx { 4739 const char *name; 4740 int name_len; 4741 int found_idx; 4742 char *found_data; 4743 int found_data_len; 4744 }; 4745 4746 static int __find_xattr(int num, struct btrfs_key *di_key, 4747 const char *name, int name_len, 4748 const char *data, int data_len, 4749 u8 type, void *vctx) 4750 { 4751 struct find_xattr_ctx *ctx = vctx; 4752 4753 if (name_len == ctx->name_len && 4754 strncmp(name, ctx->name, name_len) == 0) { 4755 ctx->found_idx = num; 4756 ctx->found_data_len = data_len; 4757 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); 4758 if (!ctx->found_data) 4759 return -ENOMEM; 4760 return 1; 4761 } 4762 return 0; 4763 } 4764 4765 static int find_xattr(struct btrfs_root *root, 4766 struct btrfs_path *path, 4767 struct btrfs_key *key, 4768 const char *name, int name_len, 4769 char **data, int *data_len) 4770 { 4771 int ret; 4772 struct find_xattr_ctx ctx; 4773 4774 ctx.name = name; 4775 ctx.name_len = name_len; 4776 ctx.found_idx = -1; 4777 ctx.found_data = NULL; 4778 ctx.found_data_len = 0; 4779 4780 ret = iterate_dir_item(root, path, __find_xattr, &ctx); 4781 if (ret < 0) 4782 return ret; 4783 4784 if (ctx.found_idx == -1) 4785 return -ENOENT; 4786 if (data) { 4787 *data = ctx.found_data; 4788 *data_len = ctx.found_data_len; 4789 } else { 4790 kfree(ctx.found_data); 4791 } 4792 return ctx.found_idx; 4793 } 4794 4795 4796 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, 4797 const char *name, int name_len, 4798 const char *data, int data_len, 4799 u8 type, void *ctx) 4800 { 4801 int ret; 4802 struct send_ctx *sctx = ctx; 4803 char *found_data = NULL; 4804 int found_data_len = 0; 4805 4806 ret = find_xattr(sctx->parent_root, sctx->right_path, 4807 sctx->cmp_key, name, name_len, &found_data, 4808 &found_data_len); 4809 if (ret == -ENOENT) { 4810 ret = __process_new_xattr(num, di_key, name, name_len, data, 4811 data_len, type, ctx); 4812 } else if (ret >= 0) { 4813 if (data_len != found_data_len || 4814 memcmp(data, found_data, data_len)) { 4815 ret = __process_new_xattr(num, di_key, name, name_len, 4816 data, data_len, type, ctx); 4817 } else { 4818 ret = 0; 4819 } 4820 } 4821 4822 kfree(found_data); 4823 return ret; 4824 } 4825 4826 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, 4827 const char *name, int name_len, 4828 const char *data, int data_len, 4829 u8 type, void *ctx) 4830 { 4831 int ret; 4832 struct send_ctx *sctx = ctx; 4833 4834 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, 4835 name, name_len, NULL, NULL); 4836 if (ret == -ENOENT) 4837 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 4838 data_len, type, ctx); 4839 else if (ret >= 0) 4840 ret = 0; 4841 4842 return ret; 4843 } 4844 4845 static int process_changed_xattr(struct send_ctx *sctx) 4846 { 4847 int ret = 0; 4848 4849 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4850 __process_changed_new_xattr, sctx); 4851 if (ret < 0) 4852 goto out; 4853 ret = iterate_dir_item(sctx->parent_root, sctx->right_path, 4854 __process_changed_deleted_xattr, sctx); 4855 4856 out: 4857 return ret; 4858 } 4859 4860 static int process_all_new_xattrs(struct send_ctx *sctx) 4861 { 4862 int ret; 4863 struct btrfs_root *root; 4864 struct btrfs_path *path; 4865 struct btrfs_key key; 4866 struct btrfs_key found_key; 4867 struct extent_buffer *eb; 4868 int slot; 4869 4870 path = alloc_path_for_send(); 4871 if (!path) 4872 return -ENOMEM; 4873 4874 root = sctx->send_root; 4875 4876 key.objectid = sctx->cmp_key->objectid; 4877 key.type = BTRFS_XATTR_ITEM_KEY; 4878 key.offset = 0; 4879 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4880 if (ret < 0) 4881 goto out; 4882 4883 while (1) { 4884 eb = path->nodes[0]; 4885 slot = path->slots[0]; 4886 if (slot >= btrfs_header_nritems(eb)) { 4887 ret = btrfs_next_leaf(root, path); 4888 if (ret < 0) { 4889 goto out; 4890 } else if (ret > 0) { 4891 ret = 0; 4892 break; 4893 } 4894 continue; 4895 } 4896 4897 btrfs_item_key_to_cpu(eb, &found_key, slot); 4898 if (found_key.objectid != key.objectid || 4899 found_key.type != key.type) { 4900 ret = 0; 4901 goto out; 4902 } 4903 4904 ret = iterate_dir_item(root, path, __process_new_xattr, sctx); 4905 if (ret < 0) 4906 goto out; 4907 4908 path->slots[0]++; 4909 } 4910 4911 out: 4912 btrfs_free_path(path); 4913 return ret; 4914 } 4915 4916 static inline u64 max_send_read_size(const struct send_ctx *sctx) 4917 { 4918 return sctx->send_max_size - SZ_16K; 4919 } 4920 4921 static int put_data_header(struct send_ctx *sctx, u32 len) 4922 { 4923 struct btrfs_tlv_header *hdr; 4924 4925 if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) 4926 return -EOVERFLOW; 4927 hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); 4928 put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); 4929 put_unaligned_le16(len, &hdr->tlv_len); 4930 sctx->send_size += sizeof(*hdr); 4931 return 0; 4932 } 4933 4934 static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) 4935 { 4936 struct btrfs_root *root = sctx->send_root; 4937 struct btrfs_fs_info *fs_info = root->fs_info; 4938 struct inode *inode; 4939 struct page *page; 4940 pgoff_t index = offset >> PAGE_SHIFT; 4941 pgoff_t last_index; 4942 unsigned pg_offset = offset_in_page(offset); 4943 int ret; 4944 4945 ret = put_data_header(sctx, len); 4946 if (ret) 4947 return ret; 4948 4949 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); 4950 if (IS_ERR(inode)) 4951 return PTR_ERR(inode); 4952 4953 last_index = (offset + len - 1) >> PAGE_SHIFT; 4954 4955 /* initial readahead */ 4956 memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 4957 file_ra_state_init(&sctx->ra, inode->i_mapping); 4958 4959 while (index <= last_index) { 4960 unsigned cur_len = min_t(unsigned, len, 4961 PAGE_SIZE - pg_offset); 4962 4963 page = find_lock_page(inode->i_mapping, index); 4964 if (!page) { 4965 page_cache_sync_readahead(inode->i_mapping, &sctx->ra, 4966 NULL, index, last_index + 1 - index); 4967 4968 page = find_or_create_page(inode->i_mapping, index, 4969 GFP_KERNEL); 4970 if (!page) { 4971 ret = -ENOMEM; 4972 break; 4973 } 4974 } 4975 4976 if (PageReadahead(page)) { 4977 page_cache_async_readahead(inode->i_mapping, &sctx->ra, 4978 NULL, page, index, last_index + 1 - index); 4979 } 4980 4981 if (!PageUptodate(page)) { 4982 btrfs_readpage(NULL, page); 4983 lock_page(page); 4984 if (!PageUptodate(page)) { 4985 unlock_page(page); 4986 put_page(page); 4987 ret = -EIO; 4988 break; 4989 } 4990 } 4991 4992 memcpy_from_page(sctx->send_buf + sctx->send_size, page, 4993 pg_offset, cur_len); 4994 unlock_page(page); 4995 put_page(page); 4996 index++; 4997 pg_offset = 0; 4998 len -= cur_len; 4999 sctx->send_size += cur_len; 5000 } 5001 iput(inode); 5002 return ret; 5003 } 5004 5005 /* 5006 * Read some bytes from the current inode/file and send a write command to 5007 * user space. 5008 */ 5009 static int send_write(struct send_ctx *sctx, u64 offset, u32 len) 5010 { 5011 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 5012 int ret = 0; 5013 struct fs_path *p; 5014 5015 p = fs_path_alloc(); 5016 if (!p) 5017 return -ENOMEM; 5018 5019 btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); 5020 5021 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5022 if (ret < 0) 5023 goto out; 5024 5025 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5026 if (ret < 0) 5027 goto out; 5028 5029 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5030 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5031 ret = put_file_data(sctx, offset, len); 5032 if (ret < 0) 5033 goto out; 5034 5035 ret = send_cmd(sctx); 5036 5037 tlv_put_failure: 5038 out: 5039 fs_path_free(p); 5040 return ret; 5041 } 5042 5043 /* 5044 * Send a clone command to user space. 5045 */ 5046 static int send_clone(struct send_ctx *sctx, 5047 u64 offset, u32 len, 5048 struct clone_root *clone_root) 5049 { 5050 int ret = 0; 5051 struct fs_path *p; 5052 u64 gen; 5053 5054 btrfs_debug(sctx->send_root->fs_info, 5055 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", 5056 offset, len, clone_root->root->root_key.objectid, 5057 clone_root->ino, clone_root->offset); 5058 5059 p = fs_path_alloc(); 5060 if (!p) 5061 return -ENOMEM; 5062 5063 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); 5064 if (ret < 0) 5065 goto out; 5066 5067 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5068 if (ret < 0) 5069 goto out; 5070 5071 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5072 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 5073 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5074 5075 if (clone_root->root == sctx->send_root) { 5076 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, 5077 &gen, NULL, NULL, NULL, NULL); 5078 if (ret < 0) 5079 goto out; 5080 ret = get_cur_path(sctx, clone_root->ino, gen, p); 5081 } else { 5082 ret = get_inode_path(clone_root->root, clone_root->ino, p); 5083 } 5084 if (ret < 0) 5085 goto out; 5086 5087 /* 5088 * If the parent we're using has a received_uuid set then use that as 5089 * our clone source as that is what we will look for when doing a 5090 * receive. 5091 * 5092 * This covers the case that we create a snapshot off of a received 5093 * subvolume and then use that as the parent and try to receive on a 5094 * different host. 5095 */ 5096 if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) 5097 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5098 clone_root->root->root_item.received_uuid); 5099 else 5100 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5101 clone_root->root->root_item.uuid); 5102 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 5103 btrfs_root_ctransid(&clone_root->root->root_item)); 5104 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 5105 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 5106 clone_root->offset); 5107 5108 ret = send_cmd(sctx); 5109 5110 tlv_put_failure: 5111 out: 5112 fs_path_free(p); 5113 return ret; 5114 } 5115 5116 /* 5117 * Send an update extent command to user space. 5118 */ 5119 static int send_update_extent(struct send_ctx *sctx, 5120 u64 offset, u32 len) 5121 { 5122 int ret = 0; 5123 struct fs_path *p; 5124 5125 p = fs_path_alloc(); 5126 if (!p) 5127 return -ENOMEM; 5128 5129 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); 5130 if (ret < 0) 5131 goto out; 5132 5133 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5134 if (ret < 0) 5135 goto out; 5136 5137 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5138 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5139 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); 5140 5141 ret = send_cmd(sctx); 5142 5143 tlv_put_failure: 5144 out: 5145 fs_path_free(p); 5146 return ret; 5147 } 5148 5149 static int send_hole(struct send_ctx *sctx, u64 end) 5150 { 5151 struct fs_path *p = NULL; 5152 u64 read_size = max_send_read_size(sctx); 5153 u64 offset = sctx->cur_inode_last_extent; 5154 int ret = 0; 5155 5156 /* 5157 * A hole that starts at EOF or beyond it. Since we do not yet support 5158 * fallocate (for extent preallocation and hole punching), sending a 5159 * write of zeroes starting at EOF or beyond would later require issuing 5160 * a truncate operation which would undo the write and achieve nothing. 5161 */ 5162 if (offset >= sctx->cur_inode_size) 5163 return 0; 5164 5165 /* 5166 * Don't go beyond the inode's i_size due to prealloc extents that start 5167 * after the i_size. 5168 */ 5169 end = min_t(u64, end, sctx->cur_inode_size); 5170 5171 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5172 return send_update_extent(sctx, offset, end - offset); 5173 5174 p = fs_path_alloc(); 5175 if (!p) 5176 return -ENOMEM; 5177 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 5178 if (ret < 0) 5179 goto tlv_put_failure; 5180 while (offset < end) { 5181 u64 len = min(end - offset, read_size); 5182 5183 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5184 if (ret < 0) 5185 break; 5186 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5187 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5188 ret = put_data_header(sctx, len); 5189 if (ret < 0) 5190 break; 5191 memset(sctx->send_buf + sctx->send_size, 0, len); 5192 sctx->send_size += len; 5193 ret = send_cmd(sctx); 5194 if (ret < 0) 5195 break; 5196 offset += len; 5197 } 5198 sctx->cur_inode_next_write_offset = offset; 5199 tlv_put_failure: 5200 fs_path_free(p); 5201 return ret; 5202 } 5203 5204 static int send_extent_data(struct send_ctx *sctx, 5205 const u64 offset, 5206 const u64 len) 5207 { 5208 u64 read_size = max_send_read_size(sctx); 5209 u64 sent = 0; 5210 5211 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5212 return send_update_extent(sctx, offset, len); 5213 5214 while (sent < len) { 5215 u64 size = min(len - sent, read_size); 5216 int ret; 5217 5218 ret = send_write(sctx, offset + sent, size); 5219 if (ret < 0) 5220 return ret; 5221 sent += size; 5222 } 5223 return 0; 5224 } 5225 5226 /* 5227 * Search for a capability xattr related to sctx->cur_ino. If the capability is 5228 * found, call send_set_xattr function to emit it. 5229 * 5230 * Return 0 if there isn't a capability, or when the capability was emitted 5231 * successfully, or < 0 if an error occurred. 5232 */ 5233 static int send_capabilities(struct send_ctx *sctx) 5234 { 5235 struct fs_path *fspath = NULL; 5236 struct btrfs_path *path; 5237 struct btrfs_dir_item *di; 5238 struct extent_buffer *leaf; 5239 unsigned long data_ptr; 5240 char *buf = NULL; 5241 int buf_len; 5242 int ret = 0; 5243 5244 path = alloc_path_for_send(); 5245 if (!path) 5246 return -ENOMEM; 5247 5248 di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, 5249 XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); 5250 if (!di) { 5251 /* There is no xattr for this inode */ 5252 goto out; 5253 } else if (IS_ERR(di)) { 5254 ret = PTR_ERR(di); 5255 goto out; 5256 } 5257 5258 leaf = path->nodes[0]; 5259 buf_len = btrfs_dir_data_len(leaf, di); 5260 5261 fspath = fs_path_alloc(); 5262 buf = kmalloc(buf_len, GFP_KERNEL); 5263 if (!fspath || !buf) { 5264 ret = -ENOMEM; 5265 goto out; 5266 } 5267 5268 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); 5269 if (ret < 0) 5270 goto out; 5271 5272 data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); 5273 read_extent_buffer(leaf, buf, data_ptr, buf_len); 5274 5275 ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, 5276 strlen(XATTR_NAME_CAPS), buf, buf_len); 5277 out: 5278 kfree(buf); 5279 fs_path_free(fspath); 5280 btrfs_free_path(path); 5281 return ret; 5282 } 5283 5284 static int clone_range(struct send_ctx *sctx, 5285 struct clone_root *clone_root, 5286 const u64 disk_byte, 5287 u64 data_offset, 5288 u64 offset, 5289 u64 len) 5290 { 5291 struct btrfs_path *path; 5292 struct btrfs_key key; 5293 int ret; 5294 u64 clone_src_i_size = 0; 5295 5296 /* 5297 * Prevent cloning from a zero offset with a length matching the sector 5298 * size because in some scenarios this will make the receiver fail. 5299 * 5300 * For example, if in the source filesystem the extent at offset 0 5301 * has a length of sectorsize and it was written using direct IO, then 5302 * it can never be an inline extent (even if compression is enabled). 5303 * Then this extent can be cloned in the original filesystem to a non 5304 * zero file offset, but it may not be possible to clone in the 5305 * destination filesystem because it can be inlined due to compression 5306 * on the destination filesystem (as the receiver's write operations are 5307 * always done using buffered IO). The same happens when the original 5308 * filesystem does not have compression enabled but the destination 5309 * filesystem has. 5310 */ 5311 if (clone_root->offset == 0 && 5312 len == sctx->send_root->fs_info->sectorsize) 5313 return send_extent_data(sctx, offset, len); 5314 5315 path = alloc_path_for_send(); 5316 if (!path) 5317 return -ENOMEM; 5318 5319 /* 5320 * There are inodes that have extents that lie behind its i_size. Don't 5321 * accept clones from these extents. 5322 */ 5323 ret = __get_inode_info(clone_root->root, path, clone_root->ino, 5324 &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); 5325 btrfs_release_path(path); 5326 if (ret < 0) 5327 goto out; 5328 5329 /* 5330 * We can't send a clone operation for the entire range if we find 5331 * extent items in the respective range in the source file that 5332 * refer to different extents or if we find holes. 5333 * So check for that and do a mix of clone and regular write/copy 5334 * operations if needed. 5335 * 5336 * Example: 5337 * 5338 * mkfs.btrfs -f /dev/sda 5339 * mount /dev/sda /mnt 5340 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo 5341 * cp --reflink=always /mnt/foo /mnt/bar 5342 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo 5343 * btrfs subvolume snapshot -r /mnt /mnt/snap 5344 * 5345 * If when we send the snapshot and we are processing file bar (which 5346 * has a higher inode number than foo) we blindly send a clone operation 5347 * for the [0, 100K[ range from foo to bar, the receiver ends up getting 5348 * a file bar that matches the content of file foo - iow, doesn't match 5349 * the content from bar in the original filesystem. 5350 */ 5351 key.objectid = clone_root->ino; 5352 key.type = BTRFS_EXTENT_DATA_KEY; 5353 key.offset = clone_root->offset; 5354 ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); 5355 if (ret < 0) 5356 goto out; 5357 if (ret > 0 && path->slots[0] > 0) { 5358 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 5359 if (key.objectid == clone_root->ino && 5360 key.type == BTRFS_EXTENT_DATA_KEY) 5361 path->slots[0]--; 5362 } 5363 5364 while (true) { 5365 struct extent_buffer *leaf = path->nodes[0]; 5366 int slot = path->slots[0]; 5367 struct btrfs_file_extent_item *ei; 5368 u8 type; 5369 u64 ext_len; 5370 u64 clone_len; 5371 u64 clone_data_offset; 5372 5373 if (slot >= btrfs_header_nritems(leaf)) { 5374 ret = btrfs_next_leaf(clone_root->root, path); 5375 if (ret < 0) 5376 goto out; 5377 else if (ret > 0) 5378 break; 5379 continue; 5380 } 5381 5382 btrfs_item_key_to_cpu(leaf, &key, slot); 5383 5384 /* 5385 * We might have an implicit trailing hole (NO_HOLES feature 5386 * enabled). We deal with it after leaving this loop. 5387 */ 5388 if (key.objectid != clone_root->ino || 5389 key.type != BTRFS_EXTENT_DATA_KEY) 5390 break; 5391 5392 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5393 type = btrfs_file_extent_type(leaf, ei); 5394 if (type == BTRFS_FILE_EXTENT_INLINE) { 5395 ext_len = btrfs_file_extent_ram_bytes(leaf, ei); 5396 ext_len = PAGE_ALIGN(ext_len); 5397 } else { 5398 ext_len = btrfs_file_extent_num_bytes(leaf, ei); 5399 } 5400 5401 if (key.offset + ext_len <= clone_root->offset) 5402 goto next; 5403 5404 if (key.offset > clone_root->offset) { 5405 /* Implicit hole, NO_HOLES feature enabled. */ 5406 u64 hole_len = key.offset - clone_root->offset; 5407 5408 if (hole_len > len) 5409 hole_len = len; 5410 ret = send_extent_data(sctx, offset, hole_len); 5411 if (ret < 0) 5412 goto out; 5413 5414 len -= hole_len; 5415 if (len == 0) 5416 break; 5417 offset += hole_len; 5418 clone_root->offset += hole_len; 5419 data_offset += hole_len; 5420 } 5421 5422 if (key.offset >= clone_root->offset + len) 5423 break; 5424 5425 if (key.offset >= clone_src_i_size) 5426 break; 5427 5428 if (key.offset + ext_len > clone_src_i_size) 5429 ext_len = clone_src_i_size - key.offset; 5430 5431 clone_data_offset = btrfs_file_extent_offset(leaf, ei); 5432 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { 5433 clone_root->offset = key.offset; 5434 if (clone_data_offset < data_offset && 5435 clone_data_offset + ext_len > data_offset) { 5436 u64 extent_offset; 5437 5438 extent_offset = data_offset - clone_data_offset; 5439 ext_len -= extent_offset; 5440 clone_data_offset += extent_offset; 5441 clone_root->offset += extent_offset; 5442 } 5443 } 5444 5445 clone_len = min_t(u64, ext_len, len); 5446 5447 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && 5448 clone_data_offset == data_offset) { 5449 const u64 src_end = clone_root->offset + clone_len; 5450 const u64 sectorsize = SZ_64K; 5451 5452 /* 5453 * We can't clone the last block, when its size is not 5454 * sector size aligned, into the middle of a file. If we 5455 * do so, the receiver will get a failure (-EINVAL) when 5456 * trying to clone or will silently corrupt the data in 5457 * the destination file if it's on a kernel without the 5458 * fix introduced by commit ac765f83f1397646 5459 * ("Btrfs: fix data corruption due to cloning of eof 5460 * block). 5461 * 5462 * So issue a clone of the aligned down range plus a 5463 * regular write for the eof block, if we hit that case. 5464 * 5465 * Also, we use the maximum possible sector size, 64K, 5466 * because we don't know what's the sector size of the 5467 * filesystem that receives the stream, so we have to 5468 * assume the largest possible sector size. 5469 */ 5470 if (src_end == clone_src_i_size && 5471 !IS_ALIGNED(src_end, sectorsize) && 5472 offset + clone_len < sctx->cur_inode_size) { 5473 u64 slen; 5474 5475 slen = ALIGN_DOWN(src_end - clone_root->offset, 5476 sectorsize); 5477 if (slen > 0) { 5478 ret = send_clone(sctx, offset, slen, 5479 clone_root); 5480 if (ret < 0) 5481 goto out; 5482 } 5483 ret = send_extent_data(sctx, offset + slen, 5484 clone_len - slen); 5485 } else { 5486 ret = send_clone(sctx, offset, clone_len, 5487 clone_root); 5488 } 5489 } else { 5490 ret = send_extent_data(sctx, offset, clone_len); 5491 } 5492 5493 if (ret < 0) 5494 goto out; 5495 5496 len -= clone_len; 5497 if (len == 0) 5498 break; 5499 offset += clone_len; 5500 clone_root->offset += clone_len; 5501 5502 /* 5503 * If we are cloning from the file we are currently processing, 5504 * and using the send root as the clone root, we must stop once 5505 * the current clone offset reaches the current eof of the file 5506 * at the receiver, otherwise we would issue an invalid clone 5507 * operation (source range going beyond eof) and cause the 5508 * receiver to fail. So if we reach the current eof, bail out 5509 * and fallback to a regular write. 5510 */ 5511 if (clone_root->root == sctx->send_root && 5512 clone_root->ino == sctx->cur_ino && 5513 clone_root->offset >= sctx->cur_inode_next_write_offset) 5514 break; 5515 5516 data_offset += clone_len; 5517 next: 5518 path->slots[0]++; 5519 } 5520 5521 if (len > 0) 5522 ret = send_extent_data(sctx, offset, len); 5523 else 5524 ret = 0; 5525 out: 5526 btrfs_free_path(path); 5527 return ret; 5528 } 5529 5530 static int send_write_or_clone(struct send_ctx *sctx, 5531 struct btrfs_path *path, 5532 struct btrfs_key *key, 5533 struct clone_root *clone_root) 5534 { 5535 int ret = 0; 5536 u64 offset = key->offset; 5537 u64 end; 5538 u64 bs = sctx->send_root->fs_info->sb->s_blocksize; 5539 5540 end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); 5541 if (offset >= end) 5542 return 0; 5543 5544 if (clone_root && IS_ALIGNED(end, bs)) { 5545 struct btrfs_file_extent_item *ei; 5546 u64 disk_byte; 5547 u64 data_offset; 5548 5549 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 5550 struct btrfs_file_extent_item); 5551 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); 5552 data_offset = btrfs_file_extent_offset(path->nodes[0], ei); 5553 ret = clone_range(sctx, clone_root, disk_byte, data_offset, 5554 offset, end - offset); 5555 } else { 5556 ret = send_extent_data(sctx, offset, end - offset); 5557 } 5558 sctx->cur_inode_next_write_offset = end; 5559 return ret; 5560 } 5561 5562 static int is_extent_unchanged(struct send_ctx *sctx, 5563 struct btrfs_path *left_path, 5564 struct btrfs_key *ekey) 5565 { 5566 int ret = 0; 5567 struct btrfs_key key; 5568 struct btrfs_path *path = NULL; 5569 struct extent_buffer *eb; 5570 int slot; 5571 struct btrfs_key found_key; 5572 struct btrfs_file_extent_item *ei; 5573 u64 left_disknr; 5574 u64 right_disknr; 5575 u64 left_offset; 5576 u64 right_offset; 5577 u64 left_offset_fixed; 5578 u64 left_len; 5579 u64 right_len; 5580 u64 left_gen; 5581 u64 right_gen; 5582 u8 left_type; 5583 u8 right_type; 5584 5585 path = alloc_path_for_send(); 5586 if (!path) 5587 return -ENOMEM; 5588 5589 eb = left_path->nodes[0]; 5590 slot = left_path->slots[0]; 5591 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 5592 left_type = btrfs_file_extent_type(eb, ei); 5593 5594 if (left_type != BTRFS_FILE_EXTENT_REG) { 5595 ret = 0; 5596 goto out; 5597 } 5598 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 5599 left_len = btrfs_file_extent_num_bytes(eb, ei); 5600 left_offset = btrfs_file_extent_offset(eb, ei); 5601 left_gen = btrfs_file_extent_generation(eb, ei); 5602 5603 /* 5604 * Following comments will refer to these graphics. L is the left 5605 * extents which we are checking at the moment. 1-8 are the right 5606 * extents that we iterate. 5607 * 5608 * |-----L-----| 5609 * |-1-|-2a-|-3-|-4-|-5-|-6-| 5610 * 5611 * |-----L-----| 5612 * |--1--|-2b-|...(same as above) 5613 * 5614 * Alternative situation. Happens on files where extents got split. 5615 * |-----L-----| 5616 * |-----------7-----------|-6-| 5617 * 5618 * Alternative situation. Happens on files which got larger. 5619 * |-----L-----| 5620 * |-8-| 5621 * Nothing follows after 8. 5622 */ 5623 5624 key.objectid = ekey->objectid; 5625 key.type = BTRFS_EXTENT_DATA_KEY; 5626 key.offset = ekey->offset; 5627 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); 5628 if (ret < 0) 5629 goto out; 5630 if (ret) { 5631 ret = 0; 5632 goto out; 5633 } 5634 5635 /* 5636 * Handle special case where the right side has no extents at all. 5637 */ 5638 eb = path->nodes[0]; 5639 slot = path->slots[0]; 5640 btrfs_item_key_to_cpu(eb, &found_key, slot); 5641 if (found_key.objectid != key.objectid || 5642 found_key.type != key.type) { 5643 /* If we're a hole then just pretend nothing changed */ 5644 ret = (left_disknr) ? 0 : 1; 5645 goto out; 5646 } 5647 5648 /* 5649 * We're now on 2a, 2b or 7. 5650 */ 5651 key = found_key; 5652 while (key.offset < ekey->offset + left_len) { 5653 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 5654 right_type = btrfs_file_extent_type(eb, ei); 5655 if (right_type != BTRFS_FILE_EXTENT_REG && 5656 right_type != BTRFS_FILE_EXTENT_INLINE) { 5657 ret = 0; 5658 goto out; 5659 } 5660 5661 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 5662 right_len = btrfs_file_extent_ram_bytes(eb, ei); 5663 right_len = PAGE_ALIGN(right_len); 5664 } else { 5665 right_len = btrfs_file_extent_num_bytes(eb, ei); 5666 } 5667 5668 /* 5669 * Are we at extent 8? If yes, we know the extent is changed. 5670 * This may only happen on the first iteration. 5671 */ 5672 if (found_key.offset + right_len <= ekey->offset) { 5673 /* If we're a hole just pretend nothing changed */ 5674 ret = (left_disknr) ? 0 : 1; 5675 goto out; 5676 } 5677 5678 /* 5679 * We just wanted to see if when we have an inline extent, what 5680 * follows it is a regular extent (wanted to check the above 5681 * condition for inline extents too). This should normally not 5682 * happen but it's possible for example when we have an inline 5683 * compressed extent representing data with a size matching 5684 * the page size (currently the same as sector size). 5685 */ 5686 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 5687 ret = 0; 5688 goto out; 5689 } 5690 5691 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 5692 right_offset = btrfs_file_extent_offset(eb, ei); 5693 right_gen = btrfs_file_extent_generation(eb, ei); 5694 5695 left_offset_fixed = left_offset; 5696 if (key.offset < ekey->offset) { 5697 /* Fix the right offset for 2a and 7. */ 5698 right_offset += ekey->offset - key.offset; 5699 } else { 5700 /* Fix the left offset for all behind 2a and 2b */ 5701 left_offset_fixed += key.offset - ekey->offset; 5702 } 5703 5704 /* 5705 * Check if we have the same extent. 5706 */ 5707 if (left_disknr != right_disknr || 5708 left_offset_fixed != right_offset || 5709 left_gen != right_gen) { 5710 ret = 0; 5711 goto out; 5712 } 5713 5714 /* 5715 * Go to the next extent. 5716 */ 5717 ret = btrfs_next_item(sctx->parent_root, path); 5718 if (ret < 0) 5719 goto out; 5720 if (!ret) { 5721 eb = path->nodes[0]; 5722 slot = path->slots[0]; 5723 btrfs_item_key_to_cpu(eb, &found_key, slot); 5724 } 5725 if (ret || found_key.objectid != key.objectid || 5726 found_key.type != key.type) { 5727 key.offset += right_len; 5728 break; 5729 } 5730 if (found_key.offset != key.offset + right_len) { 5731 ret = 0; 5732 goto out; 5733 } 5734 key = found_key; 5735 } 5736 5737 /* 5738 * We're now behind the left extent (treat as unchanged) or at the end 5739 * of the right side (treat as changed). 5740 */ 5741 if (key.offset >= ekey->offset + left_len) 5742 ret = 1; 5743 else 5744 ret = 0; 5745 5746 5747 out: 5748 btrfs_free_path(path); 5749 return ret; 5750 } 5751 5752 static int get_last_extent(struct send_ctx *sctx, u64 offset) 5753 { 5754 struct btrfs_path *path; 5755 struct btrfs_root *root = sctx->send_root; 5756 struct btrfs_key key; 5757 int ret; 5758 5759 path = alloc_path_for_send(); 5760 if (!path) 5761 return -ENOMEM; 5762 5763 sctx->cur_inode_last_extent = 0; 5764 5765 key.objectid = sctx->cur_ino; 5766 key.type = BTRFS_EXTENT_DATA_KEY; 5767 key.offset = offset; 5768 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); 5769 if (ret < 0) 5770 goto out; 5771 ret = 0; 5772 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 5773 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) 5774 goto out; 5775 5776 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 5777 out: 5778 btrfs_free_path(path); 5779 return ret; 5780 } 5781 5782 static int range_is_hole_in_parent(struct send_ctx *sctx, 5783 const u64 start, 5784 const u64 end) 5785 { 5786 struct btrfs_path *path; 5787 struct btrfs_key key; 5788 struct btrfs_root *root = sctx->parent_root; 5789 u64 search_start = start; 5790 int ret; 5791 5792 path = alloc_path_for_send(); 5793 if (!path) 5794 return -ENOMEM; 5795 5796 key.objectid = sctx->cur_ino; 5797 key.type = BTRFS_EXTENT_DATA_KEY; 5798 key.offset = search_start; 5799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5800 if (ret < 0) 5801 goto out; 5802 if (ret > 0 && path->slots[0] > 0) 5803 path->slots[0]--; 5804 5805 while (search_start < end) { 5806 struct extent_buffer *leaf = path->nodes[0]; 5807 int slot = path->slots[0]; 5808 struct btrfs_file_extent_item *fi; 5809 u64 extent_end; 5810 5811 if (slot >= btrfs_header_nritems(leaf)) { 5812 ret = btrfs_next_leaf(root, path); 5813 if (ret < 0) 5814 goto out; 5815 else if (ret > 0) 5816 break; 5817 continue; 5818 } 5819 5820 btrfs_item_key_to_cpu(leaf, &key, slot); 5821 if (key.objectid < sctx->cur_ino || 5822 key.type < BTRFS_EXTENT_DATA_KEY) 5823 goto next; 5824 if (key.objectid > sctx->cur_ino || 5825 key.type > BTRFS_EXTENT_DATA_KEY || 5826 key.offset >= end) 5827 break; 5828 5829 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5830 extent_end = btrfs_file_extent_end(path); 5831 if (extent_end <= start) 5832 goto next; 5833 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { 5834 search_start = extent_end; 5835 goto next; 5836 } 5837 ret = 0; 5838 goto out; 5839 next: 5840 path->slots[0]++; 5841 } 5842 ret = 1; 5843 out: 5844 btrfs_free_path(path); 5845 return ret; 5846 } 5847 5848 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, 5849 struct btrfs_key *key) 5850 { 5851 int ret = 0; 5852 5853 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) 5854 return 0; 5855 5856 if (sctx->cur_inode_last_extent == (u64)-1) { 5857 ret = get_last_extent(sctx, key->offset - 1); 5858 if (ret) 5859 return ret; 5860 } 5861 5862 if (path->slots[0] == 0 && 5863 sctx->cur_inode_last_extent < key->offset) { 5864 /* 5865 * We might have skipped entire leafs that contained only 5866 * file extent items for our current inode. These leafs have 5867 * a generation number smaller (older) than the one in the 5868 * current leaf and the leaf our last extent came from, and 5869 * are located between these 2 leafs. 5870 */ 5871 ret = get_last_extent(sctx, key->offset - 1); 5872 if (ret) 5873 return ret; 5874 } 5875 5876 if (sctx->cur_inode_last_extent < key->offset) { 5877 ret = range_is_hole_in_parent(sctx, 5878 sctx->cur_inode_last_extent, 5879 key->offset); 5880 if (ret < 0) 5881 return ret; 5882 else if (ret == 0) 5883 ret = send_hole(sctx, key->offset); 5884 else 5885 ret = 0; 5886 } 5887 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 5888 return ret; 5889 } 5890 5891 static int process_extent(struct send_ctx *sctx, 5892 struct btrfs_path *path, 5893 struct btrfs_key *key) 5894 { 5895 struct clone_root *found_clone = NULL; 5896 int ret = 0; 5897 5898 if (S_ISLNK(sctx->cur_inode_mode)) 5899 return 0; 5900 5901 if (sctx->parent_root && !sctx->cur_inode_new) { 5902 ret = is_extent_unchanged(sctx, path, key); 5903 if (ret < 0) 5904 goto out; 5905 if (ret) { 5906 ret = 0; 5907 goto out_hole; 5908 } 5909 } else { 5910 struct btrfs_file_extent_item *ei; 5911 u8 type; 5912 5913 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 5914 struct btrfs_file_extent_item); 5915 type = btrfs_file_extent_type(path->nodes[0], ei); 5916 if (type == BTRFS_FILE_EXTENT_PREALLOC || 5917 type == BTRFS_FILE_EXTENT_REG) { 5918 /* 5919 * The send spec does not have a prealloc command yet, 5920 * so just leave a hole for prealloc'ed extents until 5921 * we have enough commands queued up to justify rev'ing 5922 * the send spec. 5923 */ 5924 if (type == BTRFS_FILE_EXTENT_PREALLOC) { 5925 ret = 0; 5926 goto out; 5927 } 5928 5929 /* Have a hole, just skip it. */ 5930 if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { 5931 ret = 0; 5932 goto out; 5933 } 5934 } 5935 } 5936 5937 ret = find_extent_clone(sctx, path, key->objectid, key->offset, 5938 sctx->cur_inode_size, &found_clone); 5939 if (ret != -ENOENT && ret < 0) 5940 goto out; 5941 5942 ret = send_write_or_clone(sctx, path, key, found_clone); 5943 if (ret) 5944 goto out; 5945 out_hole: 5946 ret = maybe_send_hole(sctx, path, key); 5947 out: 5948 return ret; 5949 } 5950 5951 static int process_all_extents(struct send_ctx *sctx) 5952 { 5953 int ret; 5954 struct btrfs_root *root; 5955 struct btrfs_path *path; 5956 struct btrfs_key key; 5957 struct btrfs_key found_key; 5958 struct extent_buffer *eb; 5959 int slot; 5960 5961 root = sctx->send_root; 5962 path = alloc_path_for_send(); 5963 if (!path) 5964 return -ENOMEM; 5965 5966 key.objectid = sctx->cmp_key->objectid; 5967 key.type = BTRFS_EXTENT_DATA_KEY; 5968 key.offset = 0; 5969 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5970 if (ret < 0) 5971 goto out; 5972 5973 while (1) { 5974 eb = path->nodes[0]; 5975 slot = path->slots[0]; 5976 5977 if (slot >= btrfs_header_nritems(eb)) { 5978 ret = btrfs_next_leaf(root, path); 5979 if (ret < 0) { 5980 goto out; 5981 } else if (ret > 0) { 5982 ret = 0; 5983 break; 5984 } 5985 continue; 5986 } 5987 5988 btrfs_item_key_to_cpu(eb, &found_key, slot); 5989 5990 if (found_key.objectid != key.objectid || 5991 found_key.type != key.type) { 5992 ret = 0; 5993 goto out; 5994 } 5995 5996 ret = process_extent(sctx, path, &found_key); 5997 if (ret < 0) 5998 goto out; 5999 6000 path->slots[0]++; 6001 } 6002 6003 out: 6004 btrfs_free_path(path); 6005 return ret; 6006 } 6007 6008 static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, 6009 int *pending_move, 6010 int *refs_processed) 6011 { 6012 int ret = 0; 6013 6014 if (sctx->cur_ino == 0) 6015 goto out; 6016 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && 6017 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) 6018 goto out; 6019 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 6020 goto out; 6021 6022 ret = process_recorded_refs(sctx, pending_move); 6023 if (ret < 0) 6024 goto out; 6025 6026 *refs_processed = 1; 6027 out: 6028 return ret; 6029 } 6030 6031 static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) 6032 { 6033 int ret = 0; 6034 u64 left_mode; 6035 u64 left_uid; 6036 u64 left_gid; 6037 u64 right_mode; 6038 u64 right_uid; 6039 u64 right_gid; 6040 int need_chmod = 0; 6041 int need_chown = 0; 6042 int need_truncate = 1; 6043 int pending_move = 0; 6044 int refs_processed = 0; 6045 6046 if (sctx->ignore_cur_inode) 6047 return 0; 6048 6049 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, 6050 &refs_processed); 6051 if (ret < 0) 6052 goto out; 6053 6054 /* 6055 * We have processed the refs and thus need to advance send_progress. 6056 * Now, calls to get_cur_xxx will take the updated refs of the current 6057 * inode into account. 6058 * 6059 * On the other hand, if our current inode is a directory and couldn't 6060 * be moved/renamed because its parent was renamed/moved too and it has 6061 * a higher inode number, we can only move/rename our current inode 6062 * after we moved/renamed its parent. Therefore in this case operate on 6063 * the old path (pre move/rename) of our current inode, and the 6064 * move/rename will be performed later. 6065 */ 6066 if (refs_processed && !pending_move) 6067 sctx->send_progress = sctx->cur_ino + 1; 6068 6069 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) 6070 goto out; 6071 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) 6072 goto out; 6073 6074 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, 6075 &left_mode, &left_uid, &left_gid, NULL); 6076 if (ret < 0) 6077 goto out; 6078 6079 if (!sctx->parent_root || sctx->cur_inode_new) { 6080 need_chown = 1; 6081 if (!S_ISLNK(sctx->cur_inode_mode)) 6082 need_chmod = 1; 6083 if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) 6084 need_truncate = 0; 6085 } else { 6086 u64 old_size; 6087 6088 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, 6089 &old_size, NULL, &right_mode, &right_uid, 6090 &right_gid, NULL); 6091 if (ret < 0) 6092 goto out; 6093 6094 if (left_uid != right_uid || left_gid != right_gid) 6095 need_chown = 1; 6096 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) 6097 need_chmod = 1; 6098 if ((old_size == sctx->cur_inode_size) || 6099 (sctx->cur_inode_size > old_size && 6100 sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) 6101 need_truncate = 0; 6102 } 6103 6104 if (S_ISREG(sctx->cur_inode_mode)) { 6105 if (need_send_hole(sctx)) { 6106 if (sctx->cur_inode_last_extent == (u64)-1 || 6107 sctx->cur_inode_last_extent < 6108 sctx->cur_inode_size) { 6109 ret = get_last_extent(sctx, (u64)-1); 6110 if (ret) 6111 goto out; 6112 } 6113 if (sctx->cur_inode_last_extent < 6114 sctx->cur_inode_size) { 6115 ret = send_hole(sctx, sctx->cur_inode_size); 6116 if (ret) 6117 goto out; 6118 } 6119 } 6120 if (need_truncate) { 6121 ret = send_truncate(sctx, sctx->cur_ino, 6122 sctx->cur_inode_gen, 6123 sctx->cur_inode_size); 6124 if (ret < 0) 6125 goto out; 6126 } 6127 } 6128 6129 if (need_chown) { 6130 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6131 left_uid, left_gid); 6132 if (ret < 0) 6133 goto out; 6134 } 6135 if (need_chmod) { 6136 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6137 left_mode); 6138 if (ret < 0) 6139 goto out; 6140 } 6141 6142 ret = send_capabilities(sctx); 6143 if (ret < 0) 6144 goto out; 6145 6146 /* 6147 * If other directory inodes depended on our current directory 6148 * inode's move/rename, now do their move/rename operations. 6149 */ 6150 if (!is_waiting_for_move(sctx, sctx->cur_ino)) { 6151 ret = apply_children_dir_moves(sctx); 6152 if (ret) 6153 goto out; 6154 /* 6155 * Need to send that every time, no matter if it actually 6156 * changed between the two trees as we have done changes to 6157 * the inode before. If our inode is a directory and it's 6158 * waiting to be moved/renamed, we will send its utimes when 6159 * it's moved/renamed, therefore we don't need to do it here. 6160 */ 6161 sctx->send_progress = sctx->cur_ino + 1; 6162 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 6163 if (ret < 0) 6164 goto out; 6165 } 6166 6167 out: 6168 return ret; 6169 } 6170 6171 struct parent_paths_ctx { 6172 struct list_head *refs; 6173 struct send_ctx *sctx; 6174 }; 6175 6176 static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, 6177 void *ctx) 6178 { 6179 struct parent_paths_ctx *ppctx = ctx; 6180 6181 return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, 6182 ppctx->refs); 6183 } 6184 6185 /* 6186 * Issue unlink operations for all paths of the current inode found in the 6187 * parent snapshot. 6188 */ 6189 static int btrfs_unlink_all_paths(struct send_ctx *sctx) 6190 { 6191 LIST_HEAD(deleted_refs); 6192 struct btrfs_path *path; 6193 struct btrfs_key key; 6194 struct parent_paths_ctx ctx; 6195 int ret; 6196 6197 path = alloc_path_for_send(); 6198 if (!path) 6199 return -ENOMEM; 6200 6201 key.objectid = sctx->cur_ino; 6202 key.type = BTRFS_INODE_REF_KEY; 6203 key.offset = 0; 6204 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); 6205 if (ret < 0) 6206 goto out; 6207 6208 ctx.refs = &deleted_refs; 6209 ctx.sctx = sctx; 6210 6211 while (true) { 6212 struct extent_buffer *eb = path->nodes[0]; 6213 int slot = path->slots[0]; 6214 6215 if (slot >= btrfs_header_nritems(eb)) { 6216 ret = btrfs_next_leaf(sctx->parent_root, path); 6217 if (ret < 0) 6218 goto out; 6219 else if (ret > 0) 6220 break; 6221 continue; 6222 } 6223 6224 btrfs_item_key_to_cpu(eb, &key, slot); 6225 if (key.objectid != sctx->cur_ino) 6226 break; 6227 if (key.type != BTRFS_INODE_REF_KEY && 6228 key.type != BTRFS_INODE_EXTREF_KEY) 6229 break; 6230 6231 ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, 6232 record_parent_ref, &ctx); 6233 if (ret < 0) 6234 goto out; 6235 6236 path->slots[0]++; 6237 } 6238 6239 while (!list_empty(&deleted_refs)) { 6240 struct recorded_ref *ref; 6241 6242 ref = list_first_entry(&deleted_refs, struct recorded_ref, list); 6243 ret = send_unlink(sctx, ref->full_path); 6244 if (ret < 0) 6245 goto out; 6246 fs_path_free(ref->full_path); 6247 list_del(&ref->list); 6248 kfree(ref); 6249 } 6250 ret = 0; 6251 out: 6252 btrfs_free_path(path); 6253 if (ret) 6254 __free_recorded_refs(&deleted_refs); 6255 return ret; 6256 } 6257 6258 static int changed_inode(struct send_ctx *sctx, 6259 enum btrfs_compare_tree_result result) 6260 { 6261 int ret = 0; 6262 struct btrfs_key *key = sctx->cmp_key; 6263 struct btrfs_inode_item *left_ii = NULL; 6264 struct btrfs_inode_item *right_ii = NULL; 6265 u64 left_gen = 0; 6266 u64 right_gen = 0; 6267 6268 sctx->cur_ino = key->objectid; 6269 sctx->cur_inode_new_gen = 0; 6270 sctx->cur_inode_last_extent = (u64)-1; 6271 sctx->cur_inode_next_write_offset = 0; 6272 sctx->ignore_cur_inode = false; 6273 6274 /* 6275 * Set send_progress to current inode. This will tell all get_cur_xxx 6276 * functions that the current inode's refs are not updated yet. Later, 6277 * when process_recorded_refs is finished, it is set to cur_ino + 1. 6278 */ 6279 sctx->send_progress = sctx->cur_ino; 6280 6281 if (result == BTRFS_COMPARE_TREE_NEW || 6282 result == BTRFS_COMPARE_TREE_CHANGED) { 6283 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], 6284 sctx->left_path->slots[0], 6285 struct btrfs_inode_item); 6286 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], 6287 left_ii); 6288 } else { 6289 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6290 sctx->right_path->slots[0], 6291 struct btrfs_inode_item); 6292 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6293 right_ii); 6294 } 6295 if (result == BTRFS_COMPARE_TREE_CHANGED) { 6296 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6297 sctx->right_path->slots[0], 6298 struct btrfs_inode_item); 6299 6300 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6301 right_ii); 6302 6303 /* 6304 * The cur_ino = root dir case is special here. We can't treat 6305 * the inode as deleted+reused because it would generate a 6306 * stream that tries to delete/mkdir the root dir. 6307 */ 6308 if (left_gen != right_gen && 6309 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6310 sctx->cur_inode_new_gen = 1; 6311 } 6312 6313 /* 6314 * Normally we do not find inodes with a link count of zero (orphans) 6315 * because the most common case is to create a snapshot and use it 6316 * for a send operation. However other less common use cases involve 6317 * using a subvolume and send it after turning it to RO mode just 6318 * after deleting all hard links of a file while holding an open 6319 * file descriptor against it or turning a RO snapshot into RW mode, 6320 * keep an open file descriptor against a file, delete it and then 6321 * turn the snapshot back to RO mode before using it for a send 6322 * operation. So if we find such cases, ignore the inode and all its 6323 * items completely if it's a new inode, or if it's a changed inode 6324 * make sure all its previous paths (from the parent snapshot) are all 6325 * unlinked and all other the inode items are ignored. 6326 */ 6327 if (result == BTRFS_COMPARE_TREE_NEW || 6328 result == BTRFS_COMPARE_TREE_CHANGED) { 6329 u32 nlinks; 6330 6331 nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); 6332 if (nlinks == 0) { 6333 sctx->ignore_cur_inode = true; 6334 if (result == BTRFS_COMPARE_TREE_CHANGED) 6335 ret = btrfs_unlink_all_paths(sctx); 6336 goto out; 6337 } 6338 } 6339 6340 if (result == BTRFS_COMPARE_TREE_NEW) { 6341 sctx->cur_inode_gen = left_gen; 6342 sctx->cur_inode_new = 1; 6343 sctx->cur_inode_deleted = 0; 6344 sctx->cur_inode_size = btrfs_inode_size( 6345 sctx->left_path->nodes[0], left_ii); 6346 sctx->cur_inode_mode = btrfs_inode_mode( 6347 sctx->left_path->nodes[0], left_ii); 6348 sctx->cur_inode_rdev = btrfs_inode_rdev( 6349 sctx->left_path->nodes[0], left_ii); 6350 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6351 ret = send_create_inode_if_needed(sctx); 6352 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 6353 sctx->cur_inode_gen = right_gen; 6354 sctx->cur_inode_new = 0; 6355 sctx->cur_inode_deleted = 1; 6356 sctx->cur_inode_size = btrfs_inode_size( 6357 sctx->right_path->nodes[0], right_ii); 6358 sctx->cur_inode_mode = btrfs_inode_mode( 6359 sctx->right_path->nodes[0], right_ii); 6360 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 6361 /* 6362 * We need to do some special handling in case the inode was 6363 * reported as changed with a changed generation number. This 6364 * means that the original inode was deleted and new inode 6365 * reused the same inum. So we have to treat the old inode as 6366 * deleted and the new one as new. 6367 */ 6368 if (sctx->cur_inode_new_gen) { 6369 /* 6370 * First, process the inode as if it was deleted. 6371 */ 6372 sctx->cur_inode_gen = right_gen; 6373 sctx->cur_inode_new = 0; 6374 sctx->cur_inode_deleted = 1; 6375 sctx->cur_inode_size = btrfs_inode_size( 6376 sctx->right_path->nodes[0], right_ii); 6377 sctx->cur_inode_mode = btrfs_inode_mode( 6378 sctx->right_path->nodes[0], right_ii); 6379 ret = process_all_refs(sctx, 6380 BTRFS_COMPARE_TREE_DELETED); 6381 if (ret < 0) 6382 goto out; 6383 6384 /* 6385 * Now process the inode as if it was new. 6386 */ 6387 sctx->cur_inode_gen = left_gen; 6388 sctx->cur_inode_new = 1; 6389 sctx->cur_inode_deleted = 0; 6390 sctx->cur_inode_size = btrfs_inode_size( 6391 sctx->left_path->nodes[0], left_ii); 6392 sctx->cur_inode_mode = btrfs_inode_mode( 6393 sctx->left_path->nodes[0], left_ii); 6394 sctx->cur_inode_rdev = btrfs_inode_rdev( 6395 sctx->left_path->nodes[0], left_ii); 6396 ret = send_create_inode_if_needed(sctx); 6397 if (ret < 0) 6398 goto out; 6399 6400 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 6401 if (ret < 0) 6402 goto out; 6403 /* 6404 * Advance send_progress now as we did not get into 6405 * process_recorded_refs_if_needed in the new_gen case. 6406 */ 6407 sctx->send_progress = sctx->cur_ino + 1; 6408 6409 /* 6410 * Now process all extents and xattrs of the inode as if 6411 * they were all new. 6412 */ 6413 ret = process_all_extents(sctx); 6414 if (ret < 0) 6415 goto out; 6416 ret = process_all_new_xattrs(sctx); 6417 if (ret < 0) 6418 goto out; 6419 } else { 6420 sctx->cur_inode_gen = left_gen; 6421 sctx->cur_inode_new = 0; 6422 sctx->cur_inode_new_gen = 0; 6423 sctx->cur_inode_deleted = 0; 6424 sctx->cur_inode_size = btrfs_inode_size( 6425 sctx->left_path->nodes[0], left_ii); 6426 sctx->cur_inode_mode = btrfs_inode_mode( 6427 sctx->left_path->nodes[0], left_ii); 6428 } 6429 } 6430 6431 out: 6432 return ret; 6433 } 6434 6435 /* 6436 * We have to process new refs before deleted refs, but compare_trees gives us 6437 * the new and deleted refs mixed. To fix this, we record the new/deleted refs 6438 * first and later process them in process_recorded_refs. 6439 * For the cur_inode_new_gen case, we skip recording completely because 6440 * changed_inode did already initiate processing of refs. The reason for this is 6441 * that in this case, compare_tree actually compares the refs of 2 different 6442 * inodes. To fix this, process_all_refs is used in changed_inode to handle all 6443 * refs of the right tree as deleted and all refs of the left tree as new. 6444 */ 6445 static int changed_ref(struct send_ctx *sctx, 6446 enum btrfs_compare_tree_result result) 6447 { 6448 int ret = 0; 6449 6450 if (sctx->cur_ino != sctx->cmp_key->objectid) { 6451 inconsistent_snapshot_error(sctx, result, "reference"); 6452 return -EIO; 6453 } 6454 6455 if (!sctx->cur_inode_new_gen && 6456 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { 6457 if (result == BTRFS_COMPARE_TREE_NEW) 6458 ret = record_new_ref(sctx); 6459 else if (result == BTRFS_COMPARE_TREE_DELETED) 6460 ret = record_deleted_ref(sctx); 6461 else if (result == BTRFS_COMPARE_TREE_CHANGED) 6462 ret = record_changed_ref(sctx); 6463 } 6464 6465 return ret; 6466 } 6467 6468 /* 6469 * Process new/deleted/changed xattrs. We skip processing in the 6470 * cur_inode_new_gen case because changed_inode did already initiate processing 6471 * of xattrs. The reason is the same as in changed_ref 6472 */ 6473 static int changed_xattr(struct send_ctx *sctx, 6474 enum btrfs_compare_tree_result result) 6475 { 6476 int ret = 0; 6477 6478 if (sctx->cur_ino != sctx->cmp_key->objectid) { 6479 inconsistent_snapshot_error(sctx, result, "xattr"); 6480 return -EIO; 6481 } 6482 6483 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 6484 if (result == BTRFS_COMPARE_TREE_NEW) 6485 ret = process_new_xattr(sctx); 6486 else if (result == BTRFS_COMPARE_TREE_DELETED) 6487 ret = process_deleted_xattr(sctx); 6488 else if (result == BTRFS_COMPARE_TREE_CHANGED) 6489 ret = process_changed_xattr(sctx); 6490 } 6491 6492 return ret; 6493 } 6494 6495 /* 6496 * Process new/deleted/changed extents. We skip processing in the 6497 * cur_inode_new_gen case because changed_inode did already initiate processing 6498 * of extents. The reason is the same as in changed_ref 6499 */ 6500 static int changed_extent(struct send_ctx *sctx, 6501 enum btrfs_compare_tree_result result) 6502 { 6503 int ret = 0; 6504 6505 /* 6506 * We have found an extent item that changed without the inode item 6507 * having changed. This can happen either after relocation (where the 6508 * disk_bytenr of an extent item is replaced at 6509 * relocation.c:replace_file_extents()) or after deduplication into a 6510 * file in both the parent and send snapshots (where an extent item can 6511 * get modified or replaced with a new one). Note that deduplication 6512 * updates the inode item, but it only changes the iversion (sequence 6513 * field in the inode item) of the inode, so if a file is deduplicated 6514 * the same amount of times in both the parent and send snapshots, its 6515 * iversion becomes the same in both snapshots, whence the inode item is 6516 * the same on both snapshots. 6517 */ 6518 if (sctx->cur_ino != sctx->cmp_key->objectid) 6519 return 0; 6520 6521 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 6522 if (result != BTRFS_COMPARE_TREE_DELETED) 6523 ret = process_extent(sctx, sctx->left_path, 6524 sctx->cmp_key); 6525 } 6526 6527 return ret; 6528 } 6529 6530 static int dir_changed(struct send_ctx *sctx, u64 dir) 6531 { 6532 u64 orig_gen, new_gen; 6533 int ret; 6534 6535 ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, 6536 NULL, NULL); 6537 if (ret) 6538 return ret; 6539 6540 ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, 6541 NULL, NULL, NULL); 6542 if (ret) 6543 return ret; 6544 6545 return (orig_gen != new_gen) ? 1 : 0; 6546 } 6547 6548 static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, 6549 struct btrfs_key *key) 6550 { 6551 struct btrfs_inode_extref *extref; 6552 struct extent_buffer *leaf; 6553 u64 dirid = 0, last_dirid = 0; 6554 unsigned long ptr; 6555 u32 item_size; 6556 u32 cur_offset = 0; 6557 int ref_name_len; 6558 int ret = 0; 6559 6560 /* Easy case, just check this one dirid */ 6561 if (key->type == BTRFS_INODE_REF_KEY) { 6562 dirid = key->offset; 6563 6564 ret = dir_changed(sctx, dirid); 6565 goto out; 6566 } 6567 6568 leaf = path->nodes[0]; 6569 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 6570 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 6571 while (cur_offset < item_size) { 6572 extref = (struct btrfs_inode_extref *)(ptr + 6573 cur_offset); 6574 dirid = btrfs_inode_extref_parent(leaf, extref); 6575 ref_name_len = btrfs_inode_extref_name_len(leaf, extref); 6576 cur_offset += ref_name_len + sizeof(*extref); 6577 if (dirid == last_dirid) 6578 continue; 6579 ret = dir_changed(sctx, dirid); 6580 if (ret) 6581 break; 6582 last_dirid = dirid; 6583 } 6584 out: 6585 return ret; 6586 } 6587 6588 /* 6589 * Updates compare related fields in sctx and simply forwards to the actual 6590 * changed_xxx functions. 6591 */ 6592 static int changed_cb(struct btrfs_path *left_path, 6593 struct btrfs_path *right_path, 6594 struct btrfs_key *key, 6595 enum btrfs_compare_tree_result result, 6596 struct send_ctx *sctx) 6597 { 6598 int ret = 0; 6599 6600 if (result == BTRFS_COMPARE_TREE_SAME) { 6601 if (key->type == BTRFS_INODE_REF_KEY || 6602 key->type == BTRFS_INODE_EXTREF_KEY) { 6603 ret = compare_refs(sctx, left_path, key); 6604 if (!ret) 6605 return 0; 6606 if (ret < 0) 6607 return ret; 6608 } else if (key->type == BTRFS_EXTENT_DATA_KEY) { 6609 return maybe_send_hole(sctx, left_path, key); 6610 } else { 6611 return 0; 6612 } 6613 result = BTRFS_COMPARE_TREE_CHANGED; 6614 ret = 0; 6615 } 6616 6617 sctx->left_path = left_path; 6618 sctx->right_path = right_path; 6619 sctx->cmp_key = key; 6620 6621 ret = finish_inode_if_needed(sctx, 0); 6622 if (ret < 0) 6623 goto out; 6624 6625 /* Ignore non-FS objects */ 6626 if (key->objectid == BTRFS_FREE_INO_OBJECTID || 6627 key->objectid == BTRFS_FREE_SPACE_OBJECTID) 6628 goto out; 6629 6630 if (key->type == BTRFS_INODE_ITEM_KEY) { 6631 ret = changed_inode(sctx, result); 6632 } else if (!sctx->ignore_cur_inode) { 6633 if (key->type == BTRFS_INODE_REF_KEY || 6634 key->type == BTRFS_INODE_EXTREF_KEY) 6635 ret = changed_ref(sctx, result); 6636 else if (key->type == BTRFS_XATTR_ITEM_KEY) 6637 ret = changed_xattr(sctx, result); 6638 else if (key->type == BTRFS_EXTENT_DATA_KEY) 6639 ret = changed_extent(sctx, result); 6640 } 6641 6642 out: 6643 return ret; 6644 } 6645 6646 static int full_send_tree(struct send_ctx *sctx) 6647 { 6648 int ret; 6649 struct btrfs_root *send_root = sctx->send_root; 6650 struct btrfs_key key; 6651 struct btrfs_path *path; 6652 struct extent_buffer *eb; 6653 int slot; 6654 6655 path = alloc_path_for_send(); 6656 if (!path) 6657 return -ENOMEM; 6658 path->reada = READA_FORWARD_ALWAYS; 6659 6660 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 6661 key.type = BTRFS_INODE_ITEM_KEY; 6662 key.offset = 0; 6663 6664 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 6665 if (ret < 0) 6666 goto out; 6667 if (ret) 6668 goto out_finish; 6669 6670 while (1) { 6671 eb = path->nodes[0]; 6672 slot = path->slots[0]; 6673 btrfs_item_key_to_cpu(eb, &key, slot); 6674 6675 ret = changed_cb(path, NULL, &key, 6676 BTRFS_COMPARE_TREE_NEW, sctx); 6677 if (ret < 0) 6678 goto out; 6679 6680 ret = btrfs_next_item(send_root, path); 6681 if (ret < 0) 6682 goto out; 6683 if (ret) { 6684 ret = 0; 6685 break; 6686 } 6687 } 6688 6689 out_finish: 6690 ret = finish_inode_if_needed(sctx, 1); 6691 6692 out: 6693 btrfs_free_path(path); 6694 return ret; 6695 } 6696 6697 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) 6698 { 6699 struct extent_buffer *eb; 6700 struct extent_buffer *parent = path->nodes[*level]; 6701 int slot = path->slots[*level]; 6702 const int nritems = btrfs_header_nritems(parent); 6703 u64 reada_max; 6704 u64 reada_done = 0; 6705 6706 BUG_ON(*level == 0); 6707 eb = btrfs_read_node_slot(parent, slot); 6708 if (IS_ERR(eb)) 6709 return PTR_ERR(eb); 6710 6711 /* 6712 * Trigger readahead for the next leaves we will process, so that it is 6713 * very likely that when we need them they are already in memory and we 6714 * will not block on disk IO. For nodes we only do readahead for one, 6715 * since the time window between processing nodes is typically larger. 6716 */ 6717 reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); 6718 6719 for (slot++; slot < nritems && reada_done < reada_max; slot++) { 6720 if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { 6721 btrfs_readahead_node_child(parent, slot); 6722 reada_done += eb->fs_info->nodesize; 6723 } 6724 } 6725 6726 path->nodes[*level - 1] = eb; 6727 path->slots[*level - 1] = 0; 6728 (*level)--; 6729 return 0; 6730 } 6731 6732 static int tree_move_next_or_upnext(struct btrfs_path *path, 6733 int *level, int root_level) 6734 { 6735 int ret = 0; 6736 int nritems; 6737 nritems = btrfs_header_nritems(path->nodes[*level]); 6738 6739 path->slots[*level]++; 6740 6741 while (path->slots[*level] >= nritems) { 6742 if (*level == root_level) 6743 return -1; 6744 6745 /* move upnext */ 6746 path->slots[*level] = 0; 6747 free_extent_buffer(path->nodes[*level]); 6748 path->nodes[*level] = NULL; 6749 (*level)++; 6750 path->slots[*level]++; 6751 6752 nritems = btrfs_header_nritems(path->nodes[*level]); 6753 ret = 1; 6754 } 6755 return ret; 6756 } 6757 6758 /* 6759 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 6760 * or down. 6761 */ 6762 static int tree_advance(struct btrfs_path *path, 6763 int *level, int root_level, 6764 int allow_down, 6765 struct btrfs_key *key, 6766 u64 reada_min_gen) 6767 { 6768 int ret; 6769 6770 if (*level == 0 || !allow_down) { 6771 ret = tree_move_next_or_upnext(path, level, root_level); 6772 } else { 6773 ret = tree_move_down(path, level, reada_min_gen); 6774 } 6775 if (ret >= 0) { 6776 if (*level == 0) 6777 btrfs_item_key_to_cpu(path->nodes[*level], key, 6778 path->slots[*level]); 6779 else 6780 btrfs_node_key_to_cpu(path->nodes[*level], key, 6781 path->slots[*level]); 6782 } 6783 return ret; 6784 } 6785 6786 static int tree_compare_item(struct btrfs_path *left_path, 6787 struct btrfs_path *right_path, 6788 char *tmp_buf) 6789 { 6790 int cmp; 6791 int len1, len2; 6792 unsigned long off1, off2; 6793 6794 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 6795 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 6796 if (len1 != len2) 6797 return 1; 6798 6799 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 6800 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 6801 right_path->slots[0]); 6802 6803 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 6804 6805 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 6806 if (cmp) 6807 return 1; 6808 return 0; 6809 } 6810 6811 /* 6812 * This function compares two trees and calls the provided callback for 6813 * every changed/new/deleted item it finds. 6814 * If shared tree blocks are encountered, whole subtrees are skipped, making 6815 * the compare pretty fast on snapshotted subvolumes. 6816 * 6817 * This currently works on commit roots only. As commit roots are read only, 6818 * we don't do any locking. The commit roots are protected with transactions. 6819 * Transactions are ended and rejoined when a commit is tried in between. 6820 * 6821 * This function checks for modifications done to the trees while comparing. 6822 * If it detects a change, it aborts immediately. 6823 */ 6824 static int btrfs_compare_trees(struct btrfs_root *left_root, 6825 struct btrfs_root *right_root, struct send_ctx *sctx) 6826 { 6827 struct btrfs_fs_info *fs_info = left_root->fs_info; 6828 int ret; 6829 int cmp; 6830 struct btrfs_path *left_path = NULL; 6831 struct btrfs_path *right_path = NULL; 6832 struct btrfs_key left_key; 6833 struct btrfs_key right_key; 6834 char *tmp_buf = NULL; 6835 int left_root_level; 6836 int right_root_level; 6837 int left_level; 6838 int right_level; 6839 int left_end_reached; 6840 int right_end_reached; 6841 int advance_left; 6842 int advance_right; 6843 u64 left_blockptr; 6844 u64 right_blockptr; 6845 u64 left_gen; 6846 u64 right_gen; 6847 u64 reada_min_gen; 6848 6849 left_path = btrfs_alloc_path(); 6850 if (!left_path) { 6851 ret = -ENOMEM; 6852 goto out; 6853 } 6854 right_path = btrfs_alloc_path(); 6855 if (!right_path) { 6856 ret = -ENOMEM; 6857 goto out; 6858 } 6859 6860 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 6861 if (!tmp_buf) { 6862 ret = -ENOMEM; 6863 goto out; 6864 } 6865 6866 left_path->search_commit_root = 1; 6867 left_path->skip_locking = 1; 6868 right_path->search_commit_root = 1; 6869 right_path->skip_locking = 1; 6870 6871 /* 6872 * Strategy: Go to the first items of both trees. Then do 6873 * 6874 * If both trees are at level 0 6875 * Compare keys of current items 6876 * If left < right treat left item as new, advance left tree 6877 * and repeat 6878 * If left > right treat right item as deleted, advance right tree 6879 * and repeat 6880 * If left == right do deep compare of items, treat as changed if 6881 * needed, advance both trees and repeat 6882 * If both trees are at the same level but not at level 0 6883 * Compare keys of current nodes/leafs 6884 * If left < right advance left tree and repeat 6885 * If left > right advance right tree and repeat 6886 * If left == right compare blockptrs of the next nodes/leafs 6887 * If they match advance both trees but stay at the same level 6888 * and repeat 6889 * If they don't match advance both trees while allowing to go 6890 * deeper and repeat 6891 * If tree levels are different 6892 * Advance the tree that needs it and repeat 6893 * 6894 * Advancing a tree means: 6895 * If we are at level 0, try to go to the next slot. If that's not 6896 * possible, go one level up and repeat. Stop when we found a level 6897 * where we could go to the next slot. We may at this point be on a 6898 * node or a leaf. 6899 * 6900 * If we are not at level 0 and not on shared tree blocks, go one 6901 * level deeper. 6902 * 6903 * If we are not at level 0 and on shared tree blocks, go one slot to 6904 * the right if possible or go up and right. 6905 */ 6906 6907 down_read(&fs_info->commit_root_sem); 6908 left_level = btrfs_header_level(left_root->commit_root); 6909 left_root_level = left_level; 6910 left_path->nodes[left_level] = 6911 btrfs_clone_extent_buffer(left_root->commit_root); 6912 if (!left_path->nodes[left_level]) { 6913 up_read(&fs_info->commit_root_sem); 6914 ret = -ENOMEM; 6915 goto out; 6916 } 6917 6918 right_level = btrfs_header_level(right_root->commit_root); 6919 right_root_level = right_level; 6920 right_path->nodes[right_level] = 6921 btrfs_clone_extent_buffer(right_root->commit_root); 6922 if (!right_path->nodes[right_level]) { 6923 up_read(&fs_info->commit_root_sem); 6924 ret = -ENOMEM; 6925 goto out; 6926 } 6927 /* 6928 * Our right root is the parent root, while the left root is the "send" 6929 * root. We know that all new nodes/leaves in the left root must have 6930 * a generation greater than the right root's generation, so we trigger 6931 * readahead for those nodes and leaves of the left root, as we know we 6932 * will need to read them at some point. 6933 */ 6934 reada_min_gen = btrfs_header_generation(right_root->commit_root); 6935 up_read(&fs_info->commit_root_sem); 6936 6937 if (left_level == 0) 6938 btrfs_item_key_to_cpu(left_path->nodes[left_level], 6939 &left_key, left_path->slots[left_level]); 6940 else 6941 btrfs_node_key_to_cpu(left_path->nodes[left_level], 6942 &left_key, left_path->slots[left_level]); 6943 if (right_level == 0) 6944 btrfs_item_key_to_cpu(right_path->nodes[right_level], 6945 &right_key, right_path->slots[right_level]); 6946 else 6947 btrfs_node_key_to_cpu(right_path->nodes[right_level], 6948 &right_key, right_path->slots[right_level]); 6949 6950 left_end_reached = right_end_reached = 0; 6951 advance_left = advance_right = 0; 6952 6953 while (1) { 6954 cond_resched(); 6955 if (advance_left && !left_end_reached) { 6956 ret = tree_advance(left_path, &left_level, 6957 left_root_level, 6958 advance_left != ADVANCE_ONLY_NEXT, 6959 &left_key, reada_min_gen); 6960 if (ret == -1) 6961 left_end_reached = ADVANCE; 6962 else if (ret < 0) 6963 goto out; 6964 advance_left = 0; 6965 } 6966 if (advance_right && !right_end_reached) { 6967 ret = tree_advance(right_path, &right_level, 6968 right_root_level, 6969 advance_right != ADVANCE_ONLY_NEXT, 6970 &right_key, reada_min_gen); 6971 if (ret == -1) 6972 right_end_reached = ADVANCE; 6973 else if (ret < 0) 6974 goto out; 6975 advance_right = 0; 6976 } 6977 6978 if (left_end_reached && right_end_reached) { 6979 ret = 0; 6980 goto out; 6981 } else if (left_end_reached) { 6982 if (right_level == 0) { 6983 ret = changed_cb(left_path, right_path, 6984 &right_key, 6985 BTRFS_COMPARE_TREE_DELETED, 6986 sctx); 6987 if (ret < 0) 6988 goto out; 6989 } 6990 advance_right = ADVANCE; 6991 continue; 6992 } else if (right_end_reached) { 6993 if (left_level == 0) { 6994 ret = changed_cb(left_path, right_path, 6995 &left_key, 6996 BTRFS_COMPARE_TREE_NEW, 6997 sctx); 6998 if (ret < 0) 6999 goto out; 7000 } 7001 advance_left = ADVANCE; 7002 continue; 7003 } 7004 7005 if (left_level == 0 && right_level == 0) { 7006 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 7007 if (cmp < 0) { 7008 ret = changed_cb(left_path, right_path, 7009 &left_key, 7010 BTRFS_COMPARE_TREE_NEW, 7011 sctx); 7012 if (ret < 0) 7013 goto out; 7014 advance_left = ADVANCE; 7015 } else if (cmp > 0) { 7016 ret = changed_cb(left_path, right_path, 7017 &right_key, 7018 BTRFS_COMPARE_TREE_DELETED, 7019 sctx); 7020 if (ret < 0) 7021 goto out; 7022 advance_right = ADVANCE; 7023 } else { 7024 enum btrfs_compare_tree_result result; 7025 7026 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 7027 ret = tree_compare_item(left_path, right_path, 7028 tmp_buf); 7029 if (ret) 7030 result = BTRFS_COMPARE_TREE_CHANGED; 7031 else 7032 result = BTRFS_COMPARE_TREE_SAME; 7033 ret = changed_cb(left_path, right_path, 7034 &left_key, result, sctx); 7035 if (ret < 0) 7036 goto out; 7037 advance_left = ADVANCE; 7038 advance_right = ADVANCE; 7039 } 7040 } else if (left_level == right_level) { 7041 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 7042 if (cmp < 0) { 7043 advance_left = ADVANCE; 7044 } else if (cmp > 0) { 7045 advance_right = ADVANCE; 7046 } else { 7047 left_blockptr = btrfs_node_blockptr( 7048 left_path->nodes[left_level], 7049 left_path->slots[left_level]); 7050 right_blockptr = btrfs_node_blockptr( 7051 right_path->nodes[right_level], 7052 right_path->slots[right_level]); 7053 left_gen = btrfs_node_ptr_generation( 7054 left_path->nodes[left_level], 7055 left_path->slots[left_level]); 7056 right_gen = btrfs_node_ptr_generation( 7057 right_path->nodes[right_level], 7058 right_path->slots[right_level]); 7059 if (left_blockptr == right_blockptr && 7060 left_gen == right_gen) { 7061 /* 7062 * As we're on a shared block, don't 7063 * allow to go deeper. 7064 */ 7065 advance_left = ADVANCE_ONLY_NEXT; 7066 advance_right = ADVANCE_ONLY_NEXT; 7067 } else { 7068 advance_left = ADVANCE; 7069 advance_right = ADVANCE; 7070 } 7071 } 7072 } else if (left_level < right_level) { 7073 advance_right = ADVANCE; 7074 } else { 7075 advance_left = ADVANCE; 7076 } 7077 } 7078 7079 out: 7080 btrfs_free_path(left_path); 7081 btrfs_free_path(right_path); 7082 kvfree(tmp_buf); 7083 return ret; 7084 } 7085 7086 static int send_subvol(struct send_ctx *sctx) 7087 { 7088 int ret; 7089 7090 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { 7091 ret = send_header(sctx); 7092 if (ret < 0) 7093 goto out; 7094 } 7095 7096 ret = send_subvol_begin(sctx); 7097 if (ret < 0) 7098 goto out; 7099 7100 if (sctx->parent_root) { 7101 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); 7102 if (ret < 0) 7103 goto out; 7104 ret = finish_inode_if_needed(sctx, 1); 7105 if (ret < 0) 7106 goto out; 7107 } else { 7108 ret = full_send_tree(sctx); 7109 if (ret < 0) 7110 goto out; 7111 } 7112 7113 out: 7114 free_recorded_refs(sctx); 7115 return ret; 7116 } 7117 7118 /* 7119 * If orphan cleanup did remove any orphans from a root, it means the tree 7120 * was modified and therefore the commit root is not the same as the current 7121 * root anymore. This is a problem, because send uses the commit root and 7122 * therefore can see inode items that don't exist in the current root anymore, 7123 * and for example make calls to btrfs_iget, which will do tree lookups based 7124 * on the current root and not on the commit root. Those lookups will fail, 7125 * returning a -ESTALE error, and making send fail with that error. So make 7126 * sure a send does not see any orphans we have just removed, and that it will 7127 * see the same inodes regardless of whether a transaction commit happened 7128 * before it started (meaning that the commit root will be the same as the 7129 * current root) or not. 7130 */ 7131 static int ensure_commit_roots_uptodate(struct send_ctx *sctx) 7132 { 7133 int i; 7134 struct btrfs_trans_handle *trans = NULL; 7135 7136 again: 7137 if (sctx->parent_root && 7138 sctx->parent_root->node != sctx->parent_root->commit_root) 7139 goto commit_trans; 7140 7141 for (i = 0; i < sctx->clone_roots_cnt; i++) 7142 if (sctx->clone_roots[i].root->node != 7143 sctx->clone_roots[i].root->commit_root) 7144 goto commit_trans; 7145 7146 if (trans) 7147 return btrfs_end_transaction(trans); 7148 7149 return 0; 7150 7151 commit_trans: 7152 /* Use any root, all fs roots will get their commit roots updated. */ 7153 if (!trans) { 7154 trans = btrfs_join_transaction(sctx->send_root); 7155 if (IS_ERR(trans)) 7156 return PTR_ERR(trans); 7157 goto again; 7158 } 7159 7160 return btrfs_commit_transaction(trans); 7161 } 7162 7163 /* 7164 * Make sure any existing dellaloc is flushed for any root used by a send 7165 * operation so that we do not miss any data and we do not race with writeback 7166 * finishing and changing a tree while send is using the tree. This could 7167 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and 7168 * a send operation then uses the subvolume. 7169 * After flushing delalloc ensure_commit_roots_uptodate() must be called. 7170 */ 7171 static int flush_delalloc_roots(struct send_ctx *sctx) 7172 { 7173 struct btrfs_root *root = sctx->parent_root; 7174 int ret; 7175 int i; 7176 7177 if (root) { 7178 ret = btrfs_start_delalloc_snapshot(root, false); 7179 if (ret) 7180 return ret; 7181 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); 7182 } 7183 7184 for (i = 0; i < sctx->clone_roots_cnt; i++) { 7185 root = sctx->clone_roots[i].root; 7186 ret = btrfs_start_delalloc_snapshot(root, false); 7187 if (ret) 7188 return ret; 7189 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); 7190 } 7191 7192 return 0; 7193 } 7194 7195 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 7196 { 7197 spin_lock(&root->root_item_lock); 7198 root->send_in_progress--; 7199 /* 7200 * Not much left to do, we don't know why it's unbalanced and 7201 * can't blindly reset it to 0. 7202 */ 7203 if (root->send_in_progress < 0) 7204 btrfs_err(root->fs_info, 7205 "send_in_progress unbalanced %d root %llu", 7206 root->send_in_progress, root->root_key.objectid); 7207 spin_unlock(&root->root_item_lock); 7208 } 7209 7210 static void dedupe_in_progress_warn(const struct btrfs_root *root) 7211 { 7212 btrfs_warn_rl(root->fs_info, 7213 "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", 7214 root->root_key.objectid, root->dedupe_in_progress); 7215 } 7216 7217 long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) 7218 { 7219 int ret = 0; 7220 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; 7221 struct btrfs_fs_info *fs_info = send_root->fs_info; 7222 struct btrfs_root *clone_root; 7223 struct send_ctx *sctx = NULL; 7224 u32 i; 7225 u64 *clone_sources_tmp = NULL; 7226 int clone_sources_to_rollback = 0; 7227 size_t alloc_size; 7228 int sort_clone_roots = 0; 7229 7230 if (!capable(CAP_SYS_ADMIN)) 7231 return -EPERM; 7232 7233 /* 7234 * The subvolume must remain read-only during send, protect against 7235 * making it RW. This also protects against deletion. 7236 */ 7237 spin_lock(&send_root->root_item_lock); 7238 if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { 7239 dedupe_in_progress_warn(send_root); 7240 spin_unlock(&send_root->root_item_lock); 7241 return -EAGAIN; 7242 } 7243 send_root->send_in_progress++; 7244 spin_unlock(&send_root->root_item_lock); 7245 7246 /* 7247 * Userspace tools do the checks and warn the user if it's 7248 * not RO. 7249 */ 7250 if (!btrfs_root_readonly(send_root)) { 7251 ret = -EPERM; 7252 goto out; 7253 } 7254 7255 /* 7256 * Check that we don't overflow at later allocations, we request 7257 * clone_sources_count + 1 items, and compare to unsigned long inside 7258 * access_ok. 7259 */ 7260 if (arg->clone_sources_count > 7261 ULONG_MAX / sizeof(struct clone_root) - 1) { 7262 ret = -EINVAL; 7263 goto out; 7264 } 7265 7266 if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { 7267 ret = -EINVAL; 7268 goto out; 7269 } 7270 7271 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL); 7272 if (!sctx) { 7273 ret = -ENOMEM; 7274 goto out; 7275 } 7276 7277 INIT_LIST_HEAD(&sctx->new_refs); 7278 INIT_LIST_HEAD(&sctx->deleted_refs); 7279 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); 7280 INIT_LIST_HEAD(&sctx->name_cache_list); 7281 7282 sctx->flags = arg->flags; 7283 7284 if (arg->flags & BTRFS_SEND_FLAG_VERSION) { 7285 if (arg->version > BTRFS_SEND_STREAM_VERSION) { 7286 ret = -EPROTO; 7287 goto out; 7288 } 7289 /* Zero means "use the highest version" */ 7290 sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; 7291 } else { 7292 sctx->proto = 1; 7293 } 7294 7295 sctx->send_filp = fget(arg->send_fd); 7296 if (!sctx->send_filp) { 7297 ret = -EBADF; 7298 goto out; 7299 } 7300 7301 sctx->send_root = send_root; 7302 /* 7303 * Unlikely but possible, if the subvolume is marked for deletion but 7304 * is slow to remove the directory entry, send can still be started 7305 */ 7306 if (btrfs_root_dead(sctx->send_root)) { 7307 ret = -EPERM; 7308 goto out; 7309 } 7310 7311 sctx->clone_roots_cnt = arg->clone_sources_count; 7312 7313 sctx->send_max_size = BTRFS_SEND_BUF_SIZE; 7314 sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); 7315 if (!sctx->send_buf) { 7316 ret = -ENOMEM; 7317 goto out; 7318 } 7319 7320 sctx->pending_dir_moves = RB_ROOT; 7321 sctx->waiting_dir_moves = RB_ROOT; 7322 sctx->orphan_dirs = RB_ROOT; 7323 7324 sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), 7325 arg->clone_sources_count + 1, 7326 GFP_KERNEL); 7327 if (!sctx->clone_roots) { 7328 ret = -ENOMEM; 7329 goto out; 7330 } 7331 7332 alloc_size = array_size(sizeof(*arg->clone_sources), 7333 arg->clone_sources_count); 7334 7335 if (arg->clone_sources_count) { 7336 clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); 7337 if (!clone_sources_tmp) { 7338 ret = -ENOMEM; 7339 goto out; 7340 } 7341 7342 ret = copy_from_user(clone_sources_tmp, arg->clone_sources, 7343 alloc_size); 7344 if (ret) { 7345 ret = -EFAULT; 7346 goto out; 7347 } 7348 7349 for (i = 0; i < arg->clone_sources_count; i++) { 7350 clone_root = btrfs_get_fs_root(fs_info, 7351 clone_sources_tmp[i], true); 7352 if (IS_ERR(clone_root)) { 7353 ret = PTR_ERR(clone_root); 7354 goto out; 7355 } 7356 spin_lock(&clone_root->root_item_lock); 7357 if (!btrfs_root_readonly(clone_root) || 7358 btrfs_root_dead(clone_root)) { 7359 spin_unlock(&clone_root->root_item_lock); 7360 btrfs_put_root(clone_root); 7361 ret = -EPERM; 7362 goto out; 7363 } 7364 if (clone_root->dedupe_in_progress) { 7365 dedupe_in_progress_warn(clone_root); 7366 spin_unlock(&clone_root->root_item_lock); 7367 btrfs_put_root(clone_root); 7368 ret = -EAGAIN; 7369 goto out; 7370 } 7371 clone_root->send_in_progress++; 7372 spin_unlock(&clone_root->root_item_lock); 7373 7374 sctx->clone_roots[i].root = clone_root; 7375 clone_sources_to_rollback = i + 1; 7376 } 7377 kvfree(clone_sources_tmp); 7378 clone_sources_tmp = NULL; 7379 } 7380 7381 if (arg->parent_root) { 7382 sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, 7383 true); 7384 if (IS_ERR(sctx->parent_root)) { 7385 ret = PTR_ERR(sctx->parent_root); 7386 goto out; 7387 } 7388 7389 spin_lock(&sctx->parent_root->root_item_lock); 7390 sctx->parent_root->send_in_progress++; 7391 if (!btrfs_root_readonly(sctx->parent_root) || 7392 btrfs_root_dead(sctx->parent_root)) { 7393 spin_unlock(&sctx->parent_root->root_item_lock); 7394 ret = -EPERM; 7395 goto out; 7396 } 7397 if (sctx->parent_root->dedupe_in_progress) { 7398 dedupe_in_progress_warn(sctx->parent_root); 7399 spin_unlock(&sctx->parent_root->root_item_lock); 7400 ret = -EAGAIN; 7401 goto out; 7402 } 7403 spin_unlock(&sctx->parent_root->root_item_lock); 7404 } 7405 7406 /* 7407 * Clones from send_root are allowed, but only if the clone source 7408 * is behind the current send position. This is checked while searching 7409 * for possible clone sources. 7410 */ 7411 sctx->clone_roots[sctx->clone_roots_cnt++].root = 7412 btrfs_grab_root(sctx->send_root); 7413 7414 /* We do a bsearch later */ 7415 sort(sctx->clone_roots, sctx->clone_roots_cnt, 7416 sizeof(*sctx->clone_roots), __clone_root_cmp_sort, 7417 NULL); 7418 sort_clone_roots = 1; 7419 7420 ret = flush_delalloc_roots(sctx); 7421 if (ret) 7422 goto out; 7423 7424 ret = ensure_commit_roots_uptodate(sctx); 7425 if (ret) 7426 goto out; 7427 7428 spin_lock(&fs_info->send_reloc_lock); 7429 if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 7430 spin_unlock(&fs_info->send_reloc_lock); 7431 btrfs_warn_rl(fs_info, 7432 "cannot run send because a relocation operation is in progress"); 7433 ret = -EAGAIN; 7434 goto out; 7435 } 7436 fs_info->send_in_progress++; 7437 spin_unlock(&fs_info->send_reloc_lock); 7438 7439 ret = send_subvol(sctx); 7440 spin_lock(&fs_info->send_reloc_lock); 7441 fs_info->send_in_progress--; 7442 spin_unlock(&fs_info->send_reloc_lock); 7443 if (ret < 0) 7444 goto out; 7445 7446 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { 7447 ret = begin_cmd(sctx, BTRFS_SEND_C_END); 7448 if (ret < 0) 7449 goto out; 7450 ret = send_cmd(sctx); 7451 if (ret < 0) 7452 goto out; 7453 } 7454 7455 out: 7456 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); 7457 while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { 7458 struct rb_node *n; 7459 struct pending_dir_move *pm; 7460 7461 n = rb_first(&sctx->pending_dir_moves); 7462 pm = rb_entry(n, struct pending_dir_move, node); 7463 while (!list_empty(&pm->list)) { 7464 struct pending_dir_move *pm2; 7465 7466 pm2 = list_first_entry(&pm->list, 7467 struct pending_dir_move, list); 7468 free_pending_move(sctx, pm2); 7469 } 7470 free_pending_move(sctx, pm); 7471 } 7472 7473 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); 7474 while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { 7475 struct rb_node *n; 7476 struct waiting_dir_move *dm; 7477 7478 n = rb_first(&sctx->waiting_dir_moves); 7479 dm = rb_entry(n, struct waiting_dir_move, node); 7480 rb_erase(&dm->node, &sctx->waiting_dir_moves); 7481 kfree(dm); 7482 } 7483 7484 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); 7485 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { 7486 struct rb_node *n; 7487 struct orphan_dir_info *odi; 7488 7489 n = rb_first(&sctx->orphan_dirs); 7490 odi = rb_entry(n, struct orphan_dir_info, node); 7491 free_orphan_dir_info(sctx, odi); 7492 } 7493 7494 if (sort_clone_roots) { 7495 for (i = 0; i < sctx->clone_roots_cnt; i++) { 7496 btrfs_root_dec_send_in_progress( 7497 sctx->clone_roots[i].root); 7498 btrfs_put_root(sctx->clone_roots[i].root); 7499 } 7500 } else { 7501 for (i = 0; sctx && i < clone_sources_to_rollback; i++) { 7502 btrfs_root_dec_send_in_progress( 7503 sctx->clone_roots[i].root); 7504 btrfs_put_root(sctx->clone_roots[i].root); 7505 } 7506 7507 btrfs_root_dec_send_in_progress(send_root); 7508 } 7509 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { 7510 btrfs_root_dec_send_in_progress(sctx->parent_root); 7511 btrfs_put_root(sctx->parent_root); 7512 } 7513 7514 kvfree(clone_sources_tmp); 7515 7516 if (sctx) { 7517 if (sctx->send_filp) 7518 fput(sctx->send_filp); 7519 7520 kvfree(sctx->clone_roots); 7521 kvfree(sctx->send_buf); 7522 7523 name_cache_free(sctx); 7524 7525 kfree(sctx); 7526 } 7527 7528 return ret; 7529 } 7530