1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2012 Alexander Block. All rights reserved.
4 */
5
6 #include <linux/bsearch.h>
7 #include <linux/fs.h>
8 #include <linux/file.h>
9 #include <linux/sort.h>
10 #include <linux/mount.h>
11 #include <linux/xattr.h>
12 #include <linux/posix_acl_xattr.h>
13 #include <linux/radix-tree.h>
14 #include <linux/vmalloc.h>
15 #include <linux/string.h>
16 #include <linux/compat.h>
17 #include <linux/crc32c.h>
18 #include <linux/fsverity.h>
19
20 #include "send.h"
21 #include "ctree.h"
22 #include "backref.h"
23 #include "locking.h"
24 #include "disk-io.h"
25 #include "btrfs_inode.h"
26 #include "transaction.h"
27 #include "compression.h"
28 #include "xattr.h"
29 #include "print-tree.h"
30 #include "accessors.h"
31 #include "dir-item.h"
32 #include "file-item.h"
33 #include "ioctl.h"
34 #include "verity.h"
35 #include "lru_cache.h"
36
37 /*
38 * Maximum number of references an extent can have in order for us to attempt to
39 * issue clone operations instead of write operations. This currently exists to
40 * avoid hitting limitations of the backreference walking code (taking a lot of
41 * time and using too much memory for extents with a large number of references).
42 */
43 #define SEND_MAX_EXTENT_REFS 1024
44
45 /*
46 * A fs_path is a helper to dynamically build path names with unknown size.
47 * It reallocates the internal buffer on demand.
48 * It allows fast adding of path elements on the right side (normal path) and
49 * fast adding to the left side (reversed path). A reversed path can also be
50 * unreversed if needed.
51 */
52 struct fs_path {
53 union {
54 struct {
55 char *start;
56 char *end;
57
58 char *buf;
59 unsigned short buf_len:15;
60 unsigned short reversed:1;
61 char inline_buf[];
62 };
63 /*
64 * Average path length does not exceed 200 bytes, so we'll have
65 * better packing in the slab and a higher chance of satisfying
66 * an allocation later during send.
67 */
68 char pad[256];
69 };
70 };
71 #define FS_PATH_INLINE_SIZE \
72 (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
73
74
75 /* reused for each extent */
76 struct clone_root {
77 struct btrfs_root *root;
78 u64 ino;
79 u64 offset;
80 u64 num_bytes;
81 bool found_ref;
82 };
83
84 #define SEND_MAX_NAME_CACHE_SIZE 256
85
86 /*
87 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
88 * This makes the size of a cache entry exactly 192 bytes on x86_64, which
89 * can be satisfied from the kmalloc-192 slab, without wasting any space.
90 * The most common case is to have a single root for cloning, which corresponds
91 * to the send root. Having the user specify more than 16 clone roots is not
92 * common, and in such rare cases we simply don't use caching if the number of
93 * cloning roots that lead down to a leaf is more than 17.
94 */
95 #define SEND_MAX_BACKREF_CACHE_ROOTS 17
96
97 /*
98 * Max number of entries in the cache.
99 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
100 * maple tree's internal nodes, is 24K.
101 */
102 #define SEND_MAX_BACKREF_CACHE_SIZE 128
103
104 /*
105 * A backref cache entry maps a leaf to a list of IDs of roots from which the
106 * leaf is accessible and we can use for clone operations.
107 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
108 * x86_64).
109 */
110 struct backref_cache_entry {
111 struct btrfs_lru_cache_entry entry;
112 u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
113 /* Number of valid elements in the root_ids array. */
114 int num_roots;
115 };
116
117 /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
118 static_assert(offsetof(struct backref_cache_entry, entry) == 0);
119
120 /*
121 * Max number of entries in the cache that stores directories that were already
122 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
123 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
124 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
125 */
126 #define SEND_MAX_DIR_CREATED_CACHE_SIZE 64
127
128 /*
129 * Max number of entries in the cache that stores directories whose utimes
130 * update was delayed. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
131 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
132 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
133 */
134 #define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64
135
136 struct send_ctx {
137 struct file *send_filp;
138 loff_t send_off;
139 char *send_buf;
140 u32 send_size;
141 u32 send_max_size;
142 /*
143 * Whether BTRFS_SEND_A_DATA attribute was already added to current
144 * command (since protocol v2, data must be the last attribute).
145 */
146 bool put_data;
147 struct page **send_buf_pages;
148 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
149 /* Protocol version compatibility requested */
150 u32 proto;
151
152 struct btrfs_root *send_root;
153 struct btrfs_root *parent_root;
154 struct clone_root *clone_roots;
155 int clone_roots_cnt;
156
157 /* current state of the compare_tree call */
158 struct btrfs_path *left_path;
159 struct btrfs_path *right_path;
160 struct btrfs_key *cmp_key;
161
162 /*
163 * Keep track of the generation of the last transaction that was used
164 * for relocating a block group. This is periodically checked in order
165 * to detect if a relocation happened since the last check, so that we
166 * don't operate on stale extent buffers for nodes (level >= 1) or on
167 * stale disk_bytenr values of file extent items.
168 */
169 u64 last_reloc_trans;
170
171 /*
172 * Info about the currently processed inode. In case of deleted inodes,
173 * these are the values from the deleted inode.
174 */
175 u64 cur_ino;
176 u64 cur_inode_gen;
177 u64 cur_inode_size;
178 u64 cur_inode_mode;
179 u64 cur_inode_rdev;
180 u64 cur_inode_last_extent;
181 u64 cur_inode_next_write_offset;
182 bool cur_inode_new;
183 bool cur_inode_new_gen;
184 bool cur_inode_deleted;
185 bool ignore_cur_inode;
186 bool cur_inode_needs_verity;
187 void *verity_descriptor;
188
189 u64 send_progress;
190
191 struct list_head new_refs;
192 struct list_head deleted_refs;
193
194 struct btrfs_lru_cache name_cache;
195
196 /*
197 * The inode we are currently processing. It's not NULL only when we
198 * need to issue write commands for data extents from this inode.
199 */
200 struct inode *cur_inode;
201 struct file_ra_state ra;
202 u64 page_cache_clear_start;
203 bool clean_page_cache;
204
205 /*
206 * We process inodes in increasing order of their numbers, so if before an
207 * incremental send we reverse the parent/child relationship of
208 * directories such that a directory with a lower inode number was
209 * the parent of a directory with a higher inode number, and the one
210 * becoming the new parent got renamed too, we can't rename/move the
211 * directory with lower inode number when we finish processing it - we
212 * must process the directory with higher inode number first, then
213 * rename/move it and then rename/move the directory with lower inode
214 * number. Example follows.
215 *
216 * Tree state when the first send was performed:
217 *
218 * .
219 * |-- a (ino 257)
220 *     |-- b (ino 258)
221 *         |
222 *         |
223 *         |-- c (ino 259)
224 *         |   |-- d (ino 260)
225 *         |
226 *         |-- c2 (ino 261)
227 *
228 * Tree state when the second (incremental) send is performed:
229 *
230 * .
231 * |-- a (ino 257)
232 *     |-- b (ino 258)
233 *         |-- c2 (ino 261)
234 *             |-- d2 (ino 260)
235 *                 |-- cc (ino 259)
236 *
237 * The sequence of steps that led to the second state was:
238 *
239 * mv /a/b/c/d /a/b/c2/d2
240 * mv /a/b/c /a/b/c2/d2/cc
241 *
242 * "c" has lower inode number, but we can't move it (2nd mv operation)
243 * before we move "d", which has higher inode number.
244 *
245 * So we just memorize which move/rename operations must be performed
246 * later when their respective parent is processed and moved/renamed.
247 */
248
249 /* Indexed by parent directory inode number. */
250 struct rb_root pending_dir_moves;
251
252 /*
253 * Reverse index, indexed by the inode number of a directory that
254 * is waiting for the move/rename of its immediate parent before its
255 * own move/rename can be performed.
256 */
257 struct rb_root waiting_dir_moves;
258
259 /*
260 * A directory that is going to be rm'ed might have a child directory
261 * which is in the pending directory moves index above. In this case,
262 * the directory can only be removed after the move/rename of its child
263 * is performed. Example:
264 *
265 * Parent snapshot:
266 *
267 * . (ino 256)
268 * |-- a/ (ino 257)
269 *     |-- b/ (ino 258)
270 *         |-- c/ (ino 259)
271 *         |   |-- x/ (ino 260)
272 *         |
273 *         |-- y/ (ino 261)
274 *
275 * Send snapshot:
276 *
277 * . (ino 256)
278 * |-- a/ (ino 257)
279 *     |-- b/ (ino 258)
280 *         |-- YY/ (ino 261)
281 *              |-- x/ (ino 260)
282 *
283 * Sequence of steps that lead to the send snapshot:
284 * rm -f /a/b/c/foo.txt
285 * mv /a/b/y /a/b/YY
286 * mv /a/b/c/x /a/b/YY
287 * rmdir /a/b/c
288 *
289 * When the child is processed, its move/rename is delayed until its
290 * parent is processed (as explained above), but all other operations
291 * like update utimes, chown, chgrp, etc, are performed and the paths
292 * that it uses for those operations must use the orphanized name of
293 * its parent (the directory we're going to rm later), so we need to
294 * memorize that name.
295 *
296 * Indexed by the inode number of the directory to be deleted.
297 */
298 struct rb_root orphan_dirs;
299
300 struct rb_root rbtree_new_refs;
301 struct rb_root rbtree_deleted_refs;
302
303 struct btrfs_lru_cache backref_cache;
304 u64 backref_cache_last_reloc_trans;
305
306 struct btrfs_lru_cache dir_created_cache;
307 struct btrfs_lru_cache dir_utimes_cache;
308 };
309
310 struct pending_dir_move {
311 struct rb_node node;
312 struct list_head list;
313 u64 parent_ino;
314 u64 ino;
315 u64 gen;
316 struct list_head update_refs;
317 };
318
319 struct waiting_dir_move {
320 struct rb_node node;
321 u64 ino;
322 /*
323 * There might be some directory that could not be removed because it
324 * was waiting for this directory inode to be moved first. Therefore
325 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
326 */
327 u64 rmdir_ino;
328 u64 rmdir_gen;
329 bool orphanized;
330 };
331
332 struct orphan_dir_info {
333 struct rb_node node;
334 u64 ino;
335 u64 gen;
336 u64 last_dir_index_offset;
337 u64 dir_high_seq_ino;
338 };
339
340 struct name_cache_entry {
341 /*
342 * The key in the entry is an inode number, and the generation matches
343 * the inode's generation.
344 */
345 struct btrfs_lru_cache_entry entry;
346 u64 parent_ino;
347 u64 parent_gen;
348 int ret;
349 int need_later_update;
350 int name_len;
351 char name[];
352 };
353
354 /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
355 static_assert(offsetof(struct name_cache_entry, entry) == 0);
356
357 #define ADVANCE 1
358 #define ADVANCE_ONLY_NEXT -1
359
360 enum btrfs_compare_tree_result {
361 BTRFS_COMPARE_TREE_NEW,
362 BTRFS_COMPARE_TREE_DELETED,
363 BTRFS_COMPARE_TREE_CHANGED,
364 BTRFS_COMPARE_TREE_SAME,
365 };
366
367 __cold
368 static void inconsistent_snapshot_error(struct send_ctx *sctx,
369 enum btrfs_compare_tree_result result,
370 const char *what)
371 {
372 const char *result_string;
373
374 switch (result) {
375 case BTRFS_COMPARE_TREE_NEW:
376 result_string = "new";
377 break;
378 case BTRFS_COMPARE_TREE_DELETED:
379 result_string = "deleted";
380 break;
381 case BTRFS_COMPARE_TREE_CHANGED:
382 result_string = "updated";
383 break;
384 case BTRFS_COMPARE_TREE_SAME:
385 ASSERT(0);
386 result_string = "unchanged";
387 break;
388 default:
389 ASSERT(0);
390 result_string = "unexpected";
391 }
392
393 btrfs_err(sctx->send_root->fs_info,
394 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
395 result_string, what, sctx->cmp_key->objectid,
396 sctx->send_root->root_key.objectid,
397 (sctx->parent_root ?
398 sctx->parent_root->root_key.objectid : 0));
399 }
400
401 __maybe_unused
402 static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
403 {
404 switch (sctx->proto) {
405 case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
406 case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
407 case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
408 default: return false;
409 }
410 }
411
412 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
413
414 static struct waiting_dir_move *
415 get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
416
417 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
418
419 static int need_send_hole(struct send_ctx *sctx)
420 {
421 return (sctx->parent_root && !sctx->cur_inode_new &&
422 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
423 S_ISREG(sctx->cur_inode_mode));
424 }
425
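/*
 * Reset a path to an empty string. For a normal path the string grows from
 * the start of the buffer, while for a reversed path components are added at
 * the front, so the terminating NUL byte is placed at the end of the buffer.
 */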
426 static void fs_path_reset(struct fs_path *p)
427 {
428 if (p->reversed) {
429 p->start = p->buf + p->buf_len - 1;
430 p->end = p->start;
431 *p->start = 0;
432 } else {
433 p->start = p->buf;
434 p->end = p->start;
435 *p->start = 0;
436 }
437 }
438
439 static struct fs_path *fs_path_alloc(void)
440 {
441 struct fs_path *p;
442
443 p = kmalloc(sizeof(*p), GFP_KERNEL);
444 if (!p)
445 return NULL;
446 p->reversed = 0;
447 p->buf = p->inline_buf;
448 p->buf_len = FS_PATH_INLINE_SIZE;
449 fs_path_reset(p);
450 return p;
451 }
452
453 static struct fs_path *fs_path_alloc_reversed(void)
454 {
455 struct fs_path *p;
456
457 p = fs_path_alloc();
458 if (!p)
459 return NULL;
460 p->reversed = 1;
461 fs_path_reset(p);
462 return p;
463 }
464
465 static void fs_path_free(struct fs_path *p)
466 {
467 if (!p)
468 return;
469 if (p->buf != p->inline_buf)
470 kfree(p->buf);
471 kfree(p);
472 }
473
474 static int fs_path_len(struct fs_path *p)
475 {
476 return p->end - p->start;
477 }
478
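/*
 * Make sure the path buffer can hold @len bytes plus the terminating NUL
 * byte. When the inline buffer is no longer enough, switch to (or grow) a
 * heap allocation rounded up to the next kmalloc bucket size. For reversed
 * paths the current contents are moved to the end of the new buffer.
 */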
479 static int fs_path_ensure_buf(struct fs_path *p, int len)
480 {
481 char *tmp_buf;
482 int path_len;
483 int old_buf_len;
484
485 len++;
486
487 if (p->buf_len >= len)
488 return 0;
489
490 if (WARN_ON(len > PATH_MAX))
491 return -ENAMETOOLONG;
492
493 path_len = p->end - p->start;
494 old_buf_len = p->buf_len;
495
496 /*
497 * Allocate to the next largest kmalloc bucket size, to let
498 * the fast path happen most of the time.
499 */
500 len = kmalloc_size_roundup(len);
501 /*
502 * First time the inline_buf does not suffice
503 */
504 if (p->buf == p->inline_buf) {
505 tmp_buf = kmalloc(len, GFP_KERNEL);
506 if (tmp_buf)
507 memcpy(tmp_buf, p->buf, old_buf_len);
508 } else {
509 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
510 }
511 if (!tmp_buf)
512 return -ENOMEM;
513 p->buf = tmp_buf;
514 p->buf_len = len;
515
516 if (p->reversed) {
517 tmp_buf = p->buf + old_buf_len - path_len - 1;
518 p->end = p->buf + p->buf_len - 1;
519 p->start = p->end - path_len;
520 memmove(p->start, tmp_buf, path_len + 1);
521 } else {
522 p->start = p->buf;
523 p->end = p->start + path_len;
524 }
525 return 0;
526 }
527
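/*
 * Reserve room for a new component of @name_len bytes, adding a '/' separator
 * if the path is not empty. On return, *prepared points to where the caller
 * must copy the component's name (at the front for reversed paths, at the end
 * otherwise).
 */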
528 static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
529 char **prepared)
530 {
531 int ret;
532 int new_len;
533
534 new_len = p->end - p->start + name_len;
535 if (p->start != p->end)
536 new_len++;
537 ret = fs_path_ensure_buf(p, new_len);
538 if (ret < 0)
539 goto out;
540
541 if (p->reversed) {
542 if (p->start != p->end)
543 *--p->start = '/';
544 p->start -= name_len;
545 *prepared = p->start;
546 } else {
547 if (p->start != p->end)
548 *p->end++ = '/';
549 *prepared = p->end;
550 p->end += name_len;
551 *p->end = 0;
552 }
553
554 out:
555 return ret;
556 }
557
558 static int fs_path_add(struct fs_path *p, const char *name, int name_len)
559 {
560 int ret;
561 char *prepared;
562
563 ret = fs_path_prepare_for_add(p, name_len, &prepared);
564 if (ret < 0)
565 goto out;
566 memcpy(prepared, name, name_len);
567
568 out:
569 return ret;
570 }
571
572 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
573 {
574 int ret;
575 char *prepared;
576
577 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
578 if (ret < 0)
579 goto out;
580 memcpy(prepared, p2->start, p2->end - p2->start);
581
582 out:
583 return ret;
584 }
585
586 static int fs_path_add_from_extent_buffer(struct fs_path *p,
587 struct extent_buffer *eb,
588 unsigned long off, int len)
589 {
590 int ret;
591 char *prepared;
592
593 ret = fs_path_prepare_for_add(p, len, &prepared);
594 if (ret < 0)
595 goto out;
596
597 read_extent_buffer(eb, prepared, off, len);
598
599 out:
600 return ret;
601 }
602
603 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
604 {
605 p->reversed = from->reversed;
606 fs_path_reset(p);
607
608 return fs_path_add_path(p, from);
609 }
610
611 static void fs_path_unreverse(struct fs_path *p)
612 {
613 char *tmp;
614 int len;
615
616 if (!p->reversed)
617 return;
618
619 tmp = p->start;
620 len = p->end - p->start;
621 p->start = p->buf;
622 p->end = p->start + len;
623 memmove(p->start, tmp, len + 1);
624 p->reversed = 0;
625 }
626
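/*
 * Allocate a btrfs_path set up for send: searches are done on the commit
 * roots, without tree locking and while holding fs_info->commit_root_sem
 * (need_commit_sem).
 */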
627 static struct btrfs_path *alloc_path_for_send(void)
628 {
629 struct btrfs_path *path;
630
631 path = btrfs_alloc_path();
632 if (!path)
633 return NULL;
634 path->search_commit_root = 1;
635 path->skip_locking = 1;
636 path->need_commit_sem = 1;
637 return path;
638 }
639
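/*
 * Write @len bytes from @buf to the send file, looping over kernel_write()
 * until everything is written. A write that makes no progress is treated as
 * an I/O error.
 */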
640 static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
641 {
642 int ret;
643 u32 pos = 0;
644
645 while (pos < len) {
646 ret = kernel_write(filp, buf + pos, len - pos, off);
647 if (ret < 0)
648 return ret;
649 if (ret == 0)
650 return -EIO;
651 pos += ret;
652 }
653
654 return 0;
655 }
656
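/*
 * Append a TLV attribute (header plus @len bytes of data) to the command
 * being built in the send buffer. Fails with -EOVERFLOW if the attribute does
 * not fit in the remaining buffer space.
 */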
657 static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
658 {
659 struct btrfs_tlv_header *hdr;
660 int total_len = sizeof(*hdr) + len;
661 int left = sctx->send_max_size - sctx->send_size;
662
663 if (WARN_ON_ONCE(sctx->put_data))
664 return -EINVAL;
665
666 if (unlikely(left < total_len))
667 return -EOVERFLOW;
668
669 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
670 put_unaligned_le16(attr, &hdr->tlv_type);
671 put_unaligned_le16(len, &hdr->tlv_len);
672 memcpy(hdr + 1, data, len);
673 sctx->send_size += total_len;
674
675 return 0;
676 }
677
678 #define TLV_PUT_DEFINE_INT(bits) \
679 static int tlv_put_u##bits(struct send_ctx *sctx, \
680 u##bits attr, u##bits value) \
681 { \
682 __le##bits __tmp = cpu_to_le##bits(value); \
683 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
684 }
685
686 TLV_PUT_DEFINE_INT(8)
687 TLV_PUT_DEFINE_INT(32)
688 TLV_PUT_DEFINE_INT(64)
689
690 static int tlv_put_string(struct send_ctx *sctx, u16 attr,
691 const char *str, int len)
692 {
693 if (len == -1)
694 len = strlen(str);
695 return tlv_put(sctx, attr, str, len);
696 }
697
698 static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
699 const u8 *uuid)
700 {
701 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
702 }
703
704 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
705 struct extent_buffer *eb,
706 struct btrfs_timespec *ts)
707 {
708 struct btrfs_timespec bts;
709 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
710 return tlv_put(sctx, attr, &bts, sizeof(bts));
711 }
712
713
714 #define TLV_PUT(sctx, attrtype, data, attrlen) \
715 do { \
716 ret = tlv_put(sctx, attrtype, data, attrlen); \
717 if (ret < 0) \
718 goto tlv_put_failure; \
719 } while (0)
720
721 #define TLV_PUT_INT(sctx, attrtype, bits, value) \
722 do { \
723 ret = tlv_put_u##bits(sctx, attrtype, value); \
724 if (ret < 0) \
725 goto tlv_put_failure; \
726 } while (0)
727
728 #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
729 #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
730 #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
731 #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
732 #define TLV_PUT_STRING(sctx, attrtype, str, len) \
733 do { \
734 ret = tlv_put_string(sctx, attrtype, str, len); \
735 if (ret < 0) \
736 goto tlv_put_failure; \
737 } while (0)
738 #define TLV_PUT_PATH(sctx, attrtype, p) \
739 do { \
740 ret = tlv_put_string(sctx, attrtype, p->start, \
741 p->end - p->start); \
742 if (ret < 0) \
743 goto tlv_put_failure; \
744 } while(0)
745 #define TLV_PUT_UUID(sctx, attrtype, uuid) \
746 do { \
747 ret = tlv_put_uuid(sctx, attrtype, uuid); \
748 if (ret < 0) \
749 goto tlv_put_failure; \
750 } while (0)
751 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
752 do { \
753 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
754 if (ret < 0) \
755 goto tlv_put_failure; \
756 } while (0)
757
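/* Write the stream header (magic plus protocol version) to the send file. */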
758 static int send_header(struct send_ctx *sctx)
759 {
760 struct btrfs_stream_header hdr;
761
762 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
763 hdr.version = cpu_to_le32(sctx->proto);
764 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
765 &sctx->send_off);
766 }
767
768 /*
769 * For each command/item we want to send to userspace, we call this function.
770 */
771 static int begin_cmd(struct send_ctx *sctx, int cmd)
772 {
773 struct btrfs_cmd_header *hdr;
774
775 if (WARN_ON(!sctx->send_buf))
776 return -EINVAL;
777
778 if (unlikely(sctx->send_size != 0)) {
779 btrfs_err(sctx->send_root->fs_info,
780 "send: command header buffer not empty cmd %d offset %llu",
781 cmd, sctx->send_off);
782 return -EINVAL;
783 }
784
785 sctx->send_size += sizeof(*hdr);
786 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
787 put_unaligned_le16(cmd, &hdr->cmd);
788
789 return 0;
790 }
791
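/*
 * Finish the current command: fill in the length and crc32c checksum in the
 * command header and write the whole command buffer to the send file.
 */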
792 static int send_cmd(struct send_ctx *sctx)
793 {
794 int ret;
795 struct btrfs_cmd_header *hdr;
796 u32 crc;
797
798 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
799 put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
800 put_unaligned_le32(0, &hdr->crc);
801
802 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
803 put_unaligned_le32(crc, &hdr->crc);
804
805 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
806 &sctx->send_off);
807
808 sctx->send_size = 0;
809 sctx->put_data = false;
810
811 return ret;
812 }
813
814 /*
815 * Sends a move instruction to user space
816 */
817 static int send_rename(struct send_ctx *sctx,
818 struct fs_path *from, struct fs_path *to)
819 {
820 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
821 int ret;
822
823 btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
824
825 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
826 if (ret < 0)
827 goto out;
828
829 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
830 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
831
832 ret = send_cmd(sctx);
833
834 tlv_put_failure:
835 out:
836 return ret;
837 }
838
839 /*
840 * Sends a link instruction to user space
841 */
842 static int send_link(struct send_ctx *sctx,
843 struct fs_path *path, struct fs_path *lnk)
844 {
845 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
846 int ret;
847
848 btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
849
850 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
851 if (ret < 0)
852 goto out;
853
854 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
855 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
856
857 ret = send_cmd(sctx);
858
859 tlv_put_failure:
860 out:
861 return ret;
862 }
863
864 /*
865 * Sends an unlink instruction to user space
866 */
867 static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
868 {
869 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
870 int ret;
871
872 btrfs_debug(fs_info, "send_unlink %s", path->start);
873
874 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
875 if (ret < 0)
876 goto out;
877
878 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
879
880 ret = send_cmd(sctx);
881
882 tlv_put_failure:
883 out:
884 return ret;
885 }
886
887 /*
888 * Sends a rmdir instruction to user space
889 */
890 static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
891 {
892 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
893 int ret;
894
895 btrfs_debug(fs_info, "send_rmdir %s", path->start);
896
897 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
898 if (ret < 0)
899 goto out;
900
901 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
902
903 ret = send_cmd(sctx);
904
905 tlv_put_failure:
906 out:
907 return ret;
908 }
909
910 struct btrfs_inode_info {
911 u64 size;
912 u64 gen;
913 u64 mode;
914 u64 uid;
915 u64 gid;
916 u64 rdev;
917 u64 fileattr;
918 u64 nlink;
919 };
920
921 /*
922 * Helper function to retrieve some fields from an inode item.
923 */
924 static int get_inode_info(struct btrfs_root *root, u64 ino,
925 struct btrfs_inode_info *info)
926 {
927 int ret;
928 struct btrfs_path *path;
929 struct btrfs_inode_item *ii;
930 struct btrfs_key key;
931
932 path = alloc_path_for_send();
933 if (!path)
934 return -ENOMEM;
935
936 key.objectid = ino;
937 key.type = BTRFS_INODE_ITEM_KEY;
938 key.offset = 0;
939 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
940 if (ret) {
941 if (ret > 0)
942 ret = -ENOENT;
943 goto out;
944 }
945
946 if (!info)
947 goto out;
948
949 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
950 struct btrfs_inode_item);
951 info->size = btrfs_inode_size(path->nodes[0], ii);
952 info->gen = btrfs_inode_generation(path->nodes[0], ii);
953 info->mode = btrfs_inode_mode(path->nodes[0], ii);
954 info->uid = btrfs_inode_uid(path->nodes[0], ii);
955 info->gid = btrfs_inode_gid(path->nodes[0], ii);
956 info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
957 info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
958 /*
959 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
960 * otherwise logically split to 32/32 parts.
961 */
962 info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
963
964 out:
965 btrfs_free_path(path);
966 return ret;
967 }
968
969 static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
970 {
971 int ret;
972 struct btrfs_inode_info info = { 0 };
973
974 ASSERT(gen);
975
976 ret = get_inode_info(root, ino, &info);
977 *gen = info.gen;
978 return ret;
979 }
980
981 typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
982 struct fs_path *p,
983 void *ctx);
984
985 /*
986 * Helper function to iterate the entries in ONE btrfs_inode_ref or
987 * btrfs_inode_extref.
988 * The iterate callback may return a non zero value to stop iteration. This can
989 * be a negative value for error codes or 1 to simply stop it.
990 *
991 * path must point to the INODE_REF or INODE_EXTREF when called.
992 */
993 static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
994 struct btrfs_key *found_key, int resolve,
995 iterate_inode_ref_t iterate, void *ctx)
996 {
997 struct extent_buffer *eb = path->nodes[0];
998 struct btrfs_inode_ref *iref;
999 struct btrfs_inode_extref *extref;
1000 struct btrfs_path *tmp_path;
1001 struct fs_path *p;
1002 u32 cur = 0;
1003 u32 total;
1004 int slot = path->slots[0];
1005 u32 name_len;
1006 char *start;
1007 int ret = 0;
1008 int num = 0;
1009 int index;
1010 u64 dir;
1011 unsigned long name_off;
1012 unsigned long elem_size;
1013 unsigned long ptr;
1014
1015 p = fs_path_alloc_reversed();
1016 if (!p)
1017 return -ENOMEM;
1018
1019 tmp_path = alloc_path_for_send();
1020 if (!tmp_path) {
1021 fs_path_free(p);
1022 return -ENOMEM;
1023 }
1024
1025
1026 if (found_key->type == BTRFS_INODE_REF_KEY) {
1027 ptr = (unsigned long)btrfs_item_ptr(eb, slot,
1028 struct btrfs_inode_ref);
1029 total = btrfs_item_size(eb, slot);
1030 elem_size = sizeof(*iref);
1031 } else {
1032 ptr = btrfs_item_ptr_offset(eb, slot);
1033 total = btrfs_item_size(eb, slot);
1034 elem_size = sizeof(*extref);
1035 }
1036
1037 while (cur < total) {
1038 fs_path_reset(p);
1039
1040 if (found_key->type == BTRFS_INODE_REF_KEY) {
1041 iref = (struct btrfs_inode_ref *)(ptr + cur);
1042 name_len = btrfs_inode_ref_name_len(eb, iref);
1043 name_off = (unsigned long)(iref + 1);
1044 index = btrfs_inode_ref_index(eb, iref);
1045 dir = found_key->offset;
1046 } else {
1047 extref = (struct btrfs_inode_extref *)(ptr + cur);
1048 name_len = btrfs_inode_extref_name_len(eb, extref);
1049 name_off = (unsigned long)&extref->name;
1050 index = btrfs_inode_extref_index(eb, extref);
1051 dir = btrfs_inode_extref_parent(eb, extref);
1052 }
1053
1054 if (resolve) {
1055 start = btrfs_ref_to_path(root, tmp_path, name_len,
1056 name_off, eb, dir,
1057 p->buf, p->buf_len);
1058 if (IS_ERR(start)) {
1059 ret = PTR_ERR(start);
1060 goto out;
1061 }
1062 if (start < p->buf) {
1063 /* overflow, try again with a larger buffer */
1064 ret = fs_path_ensure_buf(p,
1065 p->buf_len + p->buf - start);
1066 if (ret < 0)
1067 goto out;
1068 start = btrfs_ref_to_path(root, tmp_path,
1069 name_len, name_off,
1070 eb, dir,
1071 p->buf, p->buf_len);
1072 if (IS_ERR(start)) {
1073 ret = PTR_ERR(start);
1074 goto out;
1075 }
1076 if (unlikely(start < p->buf)) {
1077 btrfs_err(root->fs_info,
1078 "send: path ref buffer underflow for key (%llu %u %llu)",
1079 found_key->objectid,
1080 found_key->type,
1081 found_key->offset);
1082 ret = -EINVAL;
1083 goto out;
1084 }
1085 }
1086 p->start = start;
1087 } else {
1088 ret = fs_path_add_from_extent_buffer(p, eb, name_off,
1089 name_len);
1090 if (ret < 0)
1091 goto out;
1092 }
1093
1094 cur += elem_size + name_len;
1095 ret = iterate(num, dir, index, p, ctx);
1096 if (ret)
1097 goto out;
1098 num++;
1099 }
1100
1101 out:
1102 btrfs_free_path(tmp_path);
1103 fs_path_free(p);
1104 return ret;
1105 }
1106
1107 typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
1108 const char *name, int name_len,
1109 const char *data, int data_len,
1110 void *ctx);
1111
1112 /*
1113 * Helper function to iterate the entries in ONE btrfs_dir_item.
1114 * The iterate callback may return a non zero value to stop iteration. This can
1115 * be a negative value for error codes or 1 to simply stop it.
1116 *
1117 * path must point to the dir item when called.
1118 */
1119 static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1120 iterate_dir_item_t iterate, void *ctx)
1121 {
1122 int ret = 0;
1123 struct extent_buffer *eb;
1124 struct btrfs_dir_item *di;
1125 struct btrfs_key di_key;
1126 char *buf = NULL;
1127 int buf_len;
1128 u32 name_len;
1129 u32 data_len;
1130 u32 cur;
1131 u32 len;
1132 u32 total;
1133 int slot;
1134 int num;
1135
1136 /*
1137 * Start with a small buffer (1 page). If later we end up needing more
1138 * space, which can happen for xattrs on a fs with a leaf size greater
1139 * than the page size, attempt to increase the buffer. Typically xattr
1140 * values are small.
1141 */
1142 buf_len = PATH_MAX;
1143 buf = kmalloc(buf_len, GFP_KERNEL);
1144 if (!buf) {
1145 ret = -ENOMEM;
1146 goto out;
1147 }
1148
1149 eb = path->nodes[0];
1150 slot = path->slots[0];
1151 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1152 cur = 0;
1153 len = 0;
1154 total = btrfs_item_size(eb, slot);
1155
1156 num = 0;
1157 while (cur < total) {
1158 name_len = btrfs_dir_name_len(eb, di);
1159 data_len = btrfs_dir_data_len(eb, di);
1160 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1161
1162 if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
1163 if (name_len > XATTR_NAME_MAX) {
1164 ret = -ENAMETOOLONG;
1165 goto out;
1166 }
1167 if (name_len + data_len >
1168 BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
1169 ret = -E2BIG;
1170 goto out;
1171 }
1172 } else {
1173 /*
1174 * Path too long
1175 */
1176 if (name_len + data_len > PATH_MAX) {
1177 ret = -ENAMETOOLONG;
1178 goto out;
1179 }
1180 }
1181
1182 if (name_len + data_len > buf_len) {
1183 buf_len = name_len + data_len;
1184 if (is_vmalloc_addr(buf)) {
1185 vfree(buf);
1186 buf = NULL;
1187 } else {
1188 char *tmp = krealloc(buf, buf_len,
1189 GFP_KERNEL | __GFP_NOWARN);
1190
1191 if (!tmp)
1192 kfree(buf);
1193 buf = tmp;
1194 }
1195 if (!buf) {
1196 buf = kvmalloc(buf_len, GFP_KERNEL);
1197 if (!buf) {
1198 ret = -ENOMEM;
1199 goto out;
1200 }
1201 }
1202 }
1203
1204 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1205 name_len + data_len);
1206
1207 len = sizeof(*di) + name_len + data_len;
1208 di = (struct btrfs_dir_item *)((char *)di + len);
1209 cur += len;
1210
1211 ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1212 data_len, ctx);
1213 if (ret < 0)
1214 goto out;
1215 if (ret) {
1216 ret = 0;
1217 goto out;
1218 }
1219
1220 num++;
1221 }
1222
1223 out:
1224 kvfree(buf);
1225 return ret;
1226 }
1227
1228 static int __copy_first_ref(int num, u64 dir, int index,
1229 struct fs_path *p, void *ctx)
1230 {
1231 int ret;
1232 struct fs_path *pt = ctx;
1233
1234 ret = fs_path_copy(pt, p);
1235 if (ret < 0)
1236 return ret;
1237
1238 /* we want the first only */
1239 return 1;
1240 }
1241
1242 /*
1243 * Retrieve the first path of an inode. If an inode has more than one
1244 * ref/hardlink, this is ignored.
1245 */
1246 static int get_inode_path(struct btrfs_root *root,
1247 u64 ino, struct fs_path *path)
1248 {
1249 int ret;
1250 struct btrfs_key key, found_key;
1251 struct btrfs_path *p;
1252
1253 p = alloc_path_for_send();
1254 if (!p)
1255 return -ENOMEM;
1256
1257 fs_path_reset(path);
1258
1259 key.objectid = ino;
1260 key.type = BTRFS_INODE_REF_KEY;
1261 key.offset = 0;
1262
1263 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
1264 if (ret < 0)
1265 goto out;
1266 if (ret) {
1267 ret = 1;
1268 goto out;
1269 }
1270 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
1271 if (found_key.objectid != ino ||
1272 (found_key.type != BTRFS_INODE_REF_KEY &&
1273 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1274 ret = -ENOENT;
1275 goto out;
1276 }
1277
1278 ret = iterate_inode_ref(root, p, &found_key, 1,
1279 __copy_first_ref, path);
1280 if (ret < 0)
1281 goto out;
1282 ret = 0;
1283
1284 out:
1285 btrfs_free_path(p);
1286 return ret;
1287 }
1288
1289 struct backref_ctx {
1290 struct send_ctx *sctx;
1291
1292 /* number of total found references */
1293 u64 found;
1294
1295 /*
1296 * Used for clones found in send_root. Clones found behind cur_objectid
1297 * and cur_offset are not considered allowed clones.
1298 */
1299 u64 cur_objectid;
1300 u64 cur_offset;
1301
1302 /* may be truncated in case it's the last extent in a file */
1303 u64 extent_len;
1304
1305 /* The bytenr the file extent item we are processing refers to. */
1306 u64 bytenr;
1307 /* The owner (root id) of the data backref for the current extent. */
1308 u64 backref_owner;
1309 /* The offset of the data backref for the current extent. */
1310 u64 backref_offset;
1311 };
1312
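/* bsearch() comparator to find a clone root by root id in sctx->clone_roots. */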
1313 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1314 {
1315 u64 root = (u64)(uintptr_t)key;
1316 const struct clone_root *cr = elt;
1317
1318 if (root < cr->root->root_key.objectid)
1319 return -1;
1320 if (root > cr->root->root_key.objectid)
1321 return 1;
1322 return 0;
1323 }
1324
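/* sort() comparator to order the clone_roots array by root id. */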
1325 static int __clone_root_cmp_sort(const void *e1, const void *e2)
1326 {
1327 const struct clone_root *cr1 = e1;
1328 const struct clone_root *cr2 = e2;
1329
1330 if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
1331 return -1;
1332 if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
1333 return 1;
1334 return 0;
1335 }
1336
1337 /*
1338 * Called for every backref that is found for the current extent.
1339 * Results are collected in sctx->clone_roots->ino/offset.
1340 */
1341 static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
1342 void *ctx_)
1343 {
1344 struct backref_ctx *bctx = ctx_;
1345 struct clone_root *clone_root;
1346
1347 /* First check if the root is in the list of accepted clone sources */
1348 clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
1349 bctx->sctx->clone_roots_cnt,
1350 sizeof(struct clone_root),
1351 __clone_root_cmp_bsearch);
1352 if (!clone_root)
1353 return 0;
1354
1355 /* This is our own reference, bail out as we can't clone from it. */
1356 if (clone_root->root == bctx->sctx->send_root &&
1357 ino == bctx->cur_objectid &&
1358 offset == bctx->cur_offset)
1359 return 0;
1360
1361 /*
1362 * Make sure we don't consider clones from send_root that are
1363 * behind the current inode/offset.
1364 */
1365 if (clone_root->root == bctx->sctx->send_root) {
1366 /*
1367 * If the source inode was not yet processed we can't issue a
1368 * clone operation, as the source extent does not exist yet at
1369 * the destination of the stream.
1370 */
1371 if (ino > bctx->cur_objectid)
1372 return 0;
1373 /*
1374 * We clone from the inode currently being sent as long as the
1375 * source extent is already processed, otherwise we could try
1376 * to clone from an extent that does not exist yet at the
1377 * destination of the stream.
1378 */
1379 if (ino == bctx->cur_objectid &&
1380 offset + bctx->extent_len >
1381 bctx->sctx->cur_inode_next_write_offset)
1382 return 0;
1383 }
1384
1385 bctx->found++;
1386 clone_root->found_ref = true;
1387
1388 /*
1389 * If the given backref refers to a file extent item with a larger
1390 * number of bytes than what we found before, use the new one so that
1391 * we clone more optimally and end up doing less writes and getting
1392 * less exclusive, non-shared extents at the destination.
1393 */
1394 if (num_bytes > clone_root->num_bytes) {
1395 clone_root->ino = ino;
1396 clone_root->offset = offset;
1397 clone_root->num_bytes = num_bytes;
1398
1399 /*
1400 * Found a perfect candidate, so there's no need to continue
1401 * backref walking.
1402 */
1403 if (num_bytes >= bctx->extent_len)
1404 return BTRFS_ITERATE_EXTENT_INODES_STOP;
1405 }
1406
1407 return 0;
1408 }
1409
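/*
 * Lookup the cached list of root IDs that have a path to a given leaf.
 * Used as the cache_lookup callback for backref walking. The cache is
 * invalidated if a block group relocation happened since it was filled.
 */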
1410 static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
1411 const u64 **root_ids_ret, int *root_count_ret)
1412 {
1413 struct backref_ctx *bctx = ctx;
1414 struct send_ctx *sctx = bctx->sctx;
1415 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1416 const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
1417 struct btrfs_lru_cache_entry *raw_entry;
1418 struct backref_cache_entry *entry;
1419
1420 if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
1421 return false;
1422
1423 /*
1424 * If relocation happened since we first filled the cache, then we must
1425 * empty the cache and can not use it, because even though we operate on
1426 * read-only roots, their leaves and nodes may have been reallocated and
1427 * now be used for different nodes/leaves of the same tree or some other
1428 * tree.
1429 *
1430 * We are called from iterate_extent_inodes() while either holding a
1431 * transaction handle or holding fs_info->commit_root_sem, so no need
1432 * to take any lock here.
1433 */
1434 if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
1435 btrfs_lru_cache_clear(&sctx->backref_cache);
1436 return false;
1437 }
1438
1439 raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0);
1440 if (!raw_entry)
1441 return false;
1442
1443 entry = container_of(raw_entry, struct backref_cache_entry, entry);
1444 *root_ids_ret = entry->root_ids;
1445 *root_count_ret = entry->num_roots;
1446
1447 return true;
1448 }
1449
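/*
 * Cache the list of roots that have a path to a leaf, keeping only the roots
 * that are valid clone sources. Used as the cache_store callback for backref
 * walking. Caching is best effort, so allocation failures are ignored.
 */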
1450 static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
1451 void *ctx)
1452 {
1453 struct backref_ctx *bctx = ctx;
1454 struct send_ctx *sctx = bctx->sctx;
1455 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1456 struct backref_cache_entry *new_entry;
1457 struct ulist_iterator uiter;
1458 struct ulist_node *node;
1459 int ret;
1460
1461 /*
1462 * We're called while holding a transaction handle or while holding
1463 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1464 * NOFS allocation.
1465 */
1466 new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
1467 /* No worries, cache is optional. */
1468 if (!new_entry)
1469 return;
1470
1471 new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
1472 new_entry->entry.gen = 0;
1473 new_entry->num_roots = 0;
1474 ULIST_ITER_INIT(&uiter);
1475 while ((node = ulist_next(root_ids, &uiter)) != NULL) {
1476 const u64 root_id = node->val;
1477 struct clone_root *root;
1478
1479 root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
1480 sctx->clone_roots_cnt, sizeof(struct clone_root),
1481 __clone_root_cmp_bsearch);
1482 if (!root)
1483 continue;
1484
1485 /* Too many roots, just exit, no worries as caching is optional. */
1486 if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
1487 kfree(new_entry);
1488 return;
1489 }
1490
1491 new_entry->root_ids[new_entry->num_roots] = root_id;
1492 new_entry->num_roots++;
1493 }
1494
1495 /*
1496 * We may have not added any roots to the new cache entry, which means
1497 * none of the roots is part of the list of roots from which we are
1498 * allowed to clone. Cache the new entry as it's still useful to avoid
1499 * backref walking to determine which roots have a path to the leaf.
1500 *
1501 * Also use GFP_NOFS because we're called while holding a transaction
1502 * handle or while holding fs_info->commit_root_sem.
1503 */
1504 ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry,
1505 GFP_NOFS);
1506 ASSERT(ret == 0 || ret == -ENOMEM);
1507 if (ret) {
1508 /* Caching is optional, no worries. */
1509 kfree(new_entry);
1510 return;
1511 }
1512
1513 /*
1514 * We are called from iterate_extent_inodes() while either holding a
1515 * transaction handle or holding fs_info->commit_root_sem, so no need
1516 * to take any lock here.
1517 */
1518 if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
1519 sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
1520 }
1521
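/*
 * Callback invoked by backref walking for the extent item of the data extent
 * we are processing. Returning -ENOENT makes find_extent_clone() fall back to
 * write commands, which we do when the extent is only referenced by ourselves
 * or when it has too many references for backref walking to be cheap.
 */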
1522 static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
1523 const struct extent_buffer *leaf, void *ctx)
1524 {
1525 const u64 refs = btrfs_extent_refs(leaf, ei);
1526 const struct backref_ctx *bctx = ctx;
1527 const struct send_ctx *sctx = bctx->sctx;
1528
1529 if (bytenr == bctx->bytenr) {
1530 const u64 flags = btrfs_extent_flags(leaf, ei);
1531
1532 if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
1533 return -EUCLEAN;
1534
1535 /*
1536 * If we have only one reference and only the send root as a
1537 * clone source - meaning no clone roots were given in the
1538 * struct btrfs_ioctl_send_args passed to the send ioctl - then
1539 * it's our reference and there's no point in doing backref
1540 * walking which is expensive, so exit early.
1541 */
1542 if (refs == 1 && sctx->clone_roots_cnt == 1)
1543 return -ENOENT;
1544 }
1545
1546 /*
1547 * Backreference walking (iterate_extent_inodes() below) is currently
1548 * too expensive when an extent has a large number of references, both
1549 * in time spent and used memory. So for now just fallback to write
1550 * operations instead of clone operations when an extent has more than
1551 * a certain amount of references.
1552 */
1553 if (refs > SEND_MAX_EXTENT_REFS)
1554 return -ENOENT;
1555
1556 return 0;
1557 }
1558
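/*
 * Tell the backref walking code to skip the data backref that points to the
 * extent at the inode and offset we are currently processing, since we can
 * not clone from ourselves at that range.
 */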
1559 static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
1560 {
1561 const struct backref_ctx *bctx = ctx;
1562
1563 if (ino == bctx->cur_objectid &&
1564 root == bctx->backref_owner &&
1565 offset == bctx->backref_offset)
1566 return true;
1567
1568 return false;
1569 }
1570
1571 /*
1572 * Given an inode, offset and extent item, it finds a good clone for a clone
1573 * instruction. Returns -ENOENT when none could be found. The function makes
1574 * sure that the returned clone is usable at the point where sending is at the
1575 * moment. This means, that no clones are accepted which lie behind the current
1576 * inode+offset.
1577 *
1578 * path must point to the extent item when called.
1579 */
1580 static int find_extent_clone(struct send_ctx *sctx,
1581 struct btrfs_path *path,
1582 u64 ino, u64 data_offset,
1583 u64 ino_size,
1584 struct clone_root **found)
1585 {
1586 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1587 int ret;
1588 int extent_type;
1589 u64 logical;
1590 u64 disk_byte;
1591 u64 num_bytes;
1592 struct btrfs_file_extent_item *fi;
1593 struct extent_buffer *eb = path->nodes[0];
1594 struct backref_ctx backref_ctx = { 0 };
1595 struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
1596 struct clone_root *cur_clone_root;
1597 int compressed;
1598 u32 i;
1599
1600 /*
1601 * With fallocate we can get prealloc extents beyond the inode's i_size,
1602 * so we don't do anything here because clone operations can not clone
1603 * to a range beyond i_size without increasing the i_size of the
1604 * destination inode.
1605 */
1606 if (data_offset >= ino_size)
1607 return 0;
1608
1609 fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
1610 extent_type = btrfs_file_extent_type(eb, fi);
1611 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1612 return -ENOENT;
1613
1614 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1615 if (disk_byte == 0)
1616 return -ENOENT;
1617
1618 compressed = btrfs_file_extent_compression(eb, fi);
1619 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1620 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1621
1622 /*
1623 * Setup the clone roots.
1624 */
1625 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1626 cur_clone_root = sctx->clone_roots + i;
1627 cur_clone_root->ino = (u64)-1;
1628 cur_clone_root->offset = 0;
1629 cur_clone_root->num_bytes = 0;
1630 cur_clone_root->found_ref = false;
1631 }
1632
1633 backref_ctx.sctx = sctx;
1634 backref_ctx.cur_objectid = ino;
1635 backref_ctx.cur_offset = data_offset;
1636 backref_ctx.bytenr = disk_byte;
1637 /*
1638 * Use the header owner and not the send root's id, because in case of a
1639 * snapshot we can have shared subtrees.
1640 */
1641 backref_ctx.backref_owner = btrfs_header_owner(eb);
1642 backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
1643
1644 /*
1645 * The last extent of a file may be too large due to page alignment.
1646 * We need to adjust extent_len in this case so that the checks in
1647 * iterate_backrefs() work.
1648 */
1649 if (data_offset + num_bytes >= ino_size)
1650 backref_ctx.extent_len = ino_size - data_offset;
1651 else
1652 backref_ctx.extent_len = num_bytes;
1653
1654 /*
1655 * Now collect all backrefs.
1656 */
1657 backref_walk_ctx.bytenr = disk_byte;
1658 if (compressed == BTRFS_COMPRESS_NONE)
1659 backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
1660 backref_walk_ctx.fs_info = fs_info;
1661 backref_walk_ctx.cache_lookup = lookup_backref_cache;
1662 backref_walk_ctx.cache_store = store_backref_cache;
1663 backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
1664 backref_walk_ctx.check_extent_item = check_extent_item;
1665 backref_walk_ctx.user_ctx = &backref_ctx;
1666
1667 /*
1668 * If we have a single clone root, then it's the send root and we can tell
1669 * the backref walking code to skip our own backref and not resolve it,
1670 * since we can not use it for cloning - the source and destination
1671 * ranges can't overlap and in case the leaf is shared through a subtree
1672 * due to snapshots, we can't use those other roots since they are not
1673 * in the list of clone roots.
1674 */
1675 if (sctx->clone_roots_cnt == 1)
1676 backref_walk_ctx.skip_data_ref = skip_self_data_ref;
1677
1678 ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
1679 &backref_ctx);
1680 if (ret < 0)
1681 return ret;
1682
1683 down_read(&fs_info->commit_root_sem);
1684 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
1685 /*
1686 * A transaction commit for a transaction in which block group
1687 * relocation was done just happened.
1688 * The disk_bytenr of the file extent item we processed is
1689 * possibly stale, referring to the extent's location before
1690 * relocation. So act as if we haven't found any clone sources
1691 * and fallback to write commands, which will read the correct
1692 * data from the new extent location. Otherwise we will fail
1693 * below because we haven't found our own back reference or we
1694 * could be getting incorrect sources in case the old extent
1695 * was already reallocated after the relocation.
1696 */
1697 up_read(&fs_info->commit_root_sem);
1698 return -ENOENT;
1699 }
1700 up_read(&fs_info->commit_root_sem);
1701
1702 btrfs_debug(fs_info,
1703 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1704 data_offset, ino, num_bytes, logical);
1705
1706 if (!backref_ctx.found) {
1707 btrfs_debug(fs_info, "no clones found");
1708 return -ENOENT;
1709 }
1710
1711 cur_clone_root = NULL;
1712 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1713 struct clone_root *clone_root = &sctx->clone_roots[i];
1714
1715 if (!clone_root->found_ref)
1716 continue;
1717
1718 /*
1719 * Choose the root from which we can clone more bytes, to
1720 * minimize write operations and therefore have more extent
1721 * sharing at the destination (the same as in the source).
1722 */
1723 if (!cur_clone_root ||
1724 clone_root->num_bytes > cur_clone_root->num_bytes) {
1725 cur_clone_root = clone_root;
1726
1727 /*
1728 * We found an optimal clone candidate (any inode from
1729 * any root is fine), so we're done.
1730 */
1731 if (clone_root->num_bytes >= backref_ctx.extent_len)
1732 break;
1733 }
1734 }
1735
1736 if (cur_clone_root) {
1737 *found = cur_clone_root;
1738 ret = 0;
1739 } else {
1740 ret = -ENOENT;
1741 }
1742
1743 return ret;
1744 }
1745
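/*
 * Read the target of a symlink into @dest. The target is stored as an inline
 * and uncompressed file extent item at file offset 0.
 */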
1746 static int read_symlink(struct btrfs_root *root,
1747 u64 ino,
1748 struct fs_path *dest)
1749 {
1750 int ret;
1751 struct btrfs_path *path;
1752 struct btrfs_key key;
1753 struct btrfs_file_extent_item *ei;
1754 u8 type;
1755 u8 compression;
1756 unsigned long off;
1757 int len;
1758
1759 path = alloc_path_for_send();
1760 if (!path)
1761 return -ENOMEM;
1762
1763 key.objectid = ino;
1764 key.type = BTRFS_EXTENT_DATA_KEY;
1765 key.offset = 0;
1766 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1767 if (ret < 0)
1768 goto out;
1769 if (ret) {
1770 /*
1771 * An empty symlink inode. Can happen in rare error paths when
1772 * creating a symlink (transaction committed before the inode
1773 * eviction handler removed the symlink inode items and a crash
1774 * happened in between or the subvol was snapshotted in between).
1775 * Print an informative message to dmesg/syslog so that the user
1776 * can delete the symlink.
1777 */
1778 btrfs_err(root->fs_info,
1779 "Found empty symlink inode %llu at root %llu",
1780 ino, root->root_key.objectid);
1781 ret = -EIO;
1782 goto out;
1783 }
1784
1785 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1786 struct btrfs_file_extent_item);
1787 type = btrfs_file_extent_type(path->nodes[0], ei);
1788 if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
1789 ret = -EUCLEAN;
1790 btrfs_crit(root->fs_info,
1791 "send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1792 ino, btrfs_root_id(root), type);
1793 goto out;
1794 }
1795 compression = btrfs_file_extent_compression(path->nodes[0], ei);
1796 if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
1797 ret = -EUCLEAN;
1798 btrfs_crit(root->fs_info,
1799 "send: found symlink extent with compression, ino %llu root %llu compression type %d",
1800 ino, btrfs_root_id(root), compression);
1801 goto out;
1802 }
1803
1804 off = btrfs_file_extent_inline_start(ei);
1805 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
1806
1807 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1808
1809 out:
1810 btrfs_free_path(path);
1811 return ret;
1812 }
1813
1814 /*
1815 * Helper function to generate a file name that is unique in the root of
1816 * send_root and parent_root. This is used to generate names for orphan inodes.
1817 */
1818 static int gen_unique_name(struct send_ctx *sctx,
1819 u64 ino, u64 gen,
1820 struct fs_path *dest)
1821 {
1822 int ret = 0;
1823 struct btrfs_path *path;
1824 struct btrfs_dir_item *di;
1825 char tmp[64];
1826 int len;
1827 u64 idx = 0;
1828
1829 path = alloc_path_for_send();
1830 if (!path)
1831 return -ENOMEM;
1832
1833 while (1) {
1834 struct fscrypt_str tmp_name;
1835
1836 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1837 ino, gen, idx);
1838 ASSERT(len < sizeof(tmp));
1839 tmp_name.name = tmp;
1840 tmp_name.len = strlen(tmp);
1841
1842 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1843 path, BTRFS_FIRST_FREE_OBJECTID,
1844 &tmp_name, 0);
1845 btrfs_release_path(path);
1846 if (IS_ERR(di)) {
1847 ret = PTR_ERR(di);
1848 goto out;
1849 }
1850 if (di) {
1851 /* not unique, try again */
1852 idx++;
1853 continue;
1854 }
1855
1856 if (!sctx->parent_root) {
1857 /* unique */
1858 ret = 0;
1859 break;
1860 }
1861
1862 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1863 path, BTRFS_FIRST_FREE_OBJECTID,
1864 &tmp_name, 0);
1865 btrfs_release_path(path);
1866 if (IS_ERR(di)) {
1867 ret = PTR_ERR(di);
1868 goto out;
1869 }
1870 if (di) {
1871 /* not unique, try again */
1872 idx++;
1873 continue;
1874 }
1875 /* unique */
1876 break;
1877 }
1878
1879 ret = fs_path_add(dest, tmp, strlen(tmp));
1880
1881 out:
1882 btrfs_free_path(path);
1883 return ret;
1884 }
1885
1886 enum inode_state {
1887 inode_state_no_change,
1888 inode_state_will_create,
1889 inode_state_did_create,
1890 inode_state_will_delete,
1891 inode_state_did_delete,
1892 };
1893
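/*
 * Determine the state of an inode with number @ino and generation @gen by
 * looking it up in both the send and parent roots and comparing against the
 * current send progress. Returns one of the inode_state_* values, or -ENOENT
 * if an inode with a matching generation exists in neither root.
 */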
1894 static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
1895 u64 *send_gen, u64 *parent_gen)
1896 {
1897 int ret;
1898 int left_ret;
1899 int right_ret;
1900 u64 left_gen;
1901 u64 right_gen = 0;
1902 struct btrfs_inode_info info;
1903
1904 ret = get_inode_info(sctx->send_root, ino, &info);
1905 if (ret < 0 && ret != -ENOENT)
1906 goto out;
1907 left_ret = (info.nlink == 0) ? -ENOENT : ret;
1908 left_gen = info.gen;
1909 if (send_gen)
1910 *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen);
1911
1912 if (!sctx->parent_root) {
1913 right_ret = -ENOENT;
1914 } else {
1915 ret = get_inode_info(sctx->parent_root, ino, &info);
1916 if (ret < 0 && ret != -ENOENT)
1917 goto out;
1918 right_ret = (info.nlink == 0) ? -ENOENT : ret;
1919 right_gen = info.gen;
1920 if (parent_gen)
1921 *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen);
1922 }
1923
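/*
 * left_* refers to the send root and right_* to the parent root. Combined
 * with sctx->send_progress this tells whether, from the receiver's point of
 * view, the inode was already created/deleted or still will be.
 */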
1924 if (!left_ret && !right_ret) {
1925 if (left_gen == gen && right_gen == gen) {
1926 ret = inode_state_no_change;
1927 } else if (left_gen == gen) {
1928 if (ino < sctx->send_progress)
1929 ret = inode_state_did_create;
1930 else
1931 ret = inode_state_will_create;
1932 } else if (right_gen == gen) {
1933 if (ino < sctx->send_progress)
1934 ret = inode_state_did_delete;
1935 else
1936 ret = inode_state_will_delete;
1937 } else {
1938 ret = -ENOENT;
1939 }
1940 } else if (!left_ret) {
1941 if (left_gen == gen) {
1942 if (ino < sctx->send_progress)
1943 ret = inode_state_did_create;
1944 else
1945 ret = inode_state_will_create;
1946 } else {
1947 ret = -ENOENT;
1948 }
1949 } else if (!right_ret) {
1950 if (right_gen == gen) {
1951 if (ino < sctx->send_progress)
1952 ret = inode_state_did_delete;
1953 else
1954 ret = inode_state_will_delete;
1955 } else {
1956 ret = -ENOENT;
1957 }
1958 } else {
1959 ret = -ENOENT;
1960 }
1961
1962 out:
1963 return ret;
1964 }
1965
1966 static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
1967 u64 *send_gen, u64 *parent_gen)
1968 {
1969 int ret;
1970
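/* The subvolume's root directory (inode 256) always exists. */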
1971 if (ino == BTRFS_FIRST_FREE_OBJECTID)
1972 return 1;
1973
1974 ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
1975 if (ret < 0)
1976 goto out;
1977
1978 if (ret == inode_state_no_change ||
1979 ret == inode_state_did_create ||
1980 ret == inode_state_will_delete)
1981 ret = 1;
1982 else
1983 ret = 0;
1984
1985 out:
1986 return ret;
1987 }
1988
1989 /*
1990 * Helper function to lookup a dir item in a dir.
1991 */
1992 static int lookup_dir_item_inode(struct btrfs_root *root,
1993 u64 dir, const char *name, int name_len,
1994 u64 *found_inode)
1995 {
1996 int ret = 0;
1997 struct btrfs_dir_item *di;
1998 struct btrfs_key key;
1999 struct btrfs_path *path;
2000 struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
2001
2002 path = alloc_path_for_send();
2003 if (!path)
2004 return -ENOMEM;
2005
2006 di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
2007 if (IS_ERR_OR_NULL(di)) {
2008 ret = di ? PTR_ERR(di) : -ENOENT;
2009 goto out;
2010 }
2011 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
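/* The entry points to a subvolume (root item), not to an inode in this root. */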
2012 if (key.type == BTRFS_ROOT_ITEM_KEY) {
2013 ret = -ENOENT;
2014 goto out;
2015 }
2016 *found_inode = key.objectid;
2017
2018 out:
2019 btrfs_free_path(path);
2020 return ret;
2021 }
2022
2023 /*
2024 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
2025 * generation of the parent dir and the name of the dir entry.
2026 */
2027 static int get_first_ref(struct btrfs_root *root, u64 ino,
2028 u64 *dir, u64 *dir_gen, struct fs_path *name)
2029 {
2030 int ret;
2031 struct btrfs_key key;
2032 struct btrfs_key found_key;
2033 struct btrfs_path *path;
2034 int len;
2035 u64 parent_dir;
2036
2037 path = alloc_path_for_send();
2038 if (!path)
2039 return -ENOMEM;
2040
2041 key.objectid = ino;
2042 key.type = BTRFS_INODE_REF_KEY;
2043 key.offset = 0;
2044
2045 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
2046 if (ret < 0)
2047 goto out;
2048 if (!ret)
2049 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2050 path->slots[0]);
2051 if (ret || found_key.objectid != ino ||
2052 (found_key.type != BTRFS_INODE_REF_KEY &&
2053 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
2054 ret = -ENOENT;
2055 goto out;
2056 }
2057
2058 if (found_key.type == BTRFS_INODE_REF_KEY) {
2059 struct btrfs_inode_ref *iref;
2060 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
2061 struct btrfs_inode_ref);
2062 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
2063 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
2064 (unsigned long)(iref + 1),
2065 len);
2066 parent_dir = found_key.offset;
2067 } else {
2068 struct btrfs_inode_extref *extref;
2069 extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
2070 struct btrfs_inode_extref);
2071 len = btrfs_inode_extref_name_len(path->nodes[0], extref);
2072 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
2073 (unsigned long)&extref->name, len);
2074 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
2075 }
2076 if (ret < 0)
2077 goto out;
2078 btrfs_release_path(path);
2079
2080 if (dir_gen) {
2081 ret = get_inode_gen(root, parent_dir, dir_gen);
2082 if (ret < 0)
2083 goto out;
2084 }
2085
2086 *dir = parent_dir;
2087
2088 out:
2089 btrfs_free_path(path);
2090 return ret;
2091 }
2092
2093 static int is_first_ref(struct btrfs_root *root,
2094 u64 ino, u64 dir,
2095 const char *name, int name_len)
2096 {
2097 int ret;
2098 struct fs_path *tmp_name;
2099 u64 tmp_dir;
2100
2101 tmp_name = fs_path_alloc();
2102 if (!tmp_name)
2103 return -ENOMEM;
2104
2105 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
2106 if (ret < 0)
2107 goto out;
2108
2109 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
2110 ret = 0;
2111 goto out;
2112 }
2113
2114 ret = !memcmp(tmp_name->start, name, name_len);
2115
2116 out:
2117 fs_path_free(tmp_name);
2118 return ret;
2119 }
2120
2121 /*
2122 * Used by process_recorded_refs to determine if a new ref would overwrite an
2123 * already existing ref. In case it detects an overwrite, it returns the
2124 * inode/gen in who_ino/who_gen.
2125 * When an overwrite is detected, process_recorded_refs does proper orphanizing
2126 * to make sure later references to the overwritten inode are possible.
2127 * Orphanizing is however only required for the first ref of an inode.
2128 * process_recorded_refs does an additional is_first_ref check to see if
2129 * orphanizing is really required.
2130 */
2131 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2132 const char *name, int name_len,
2133 u64 *who_ino, u64 *who_gen, u64 *who_mode)
2134 {
2135 int ret;
2136 u64 parent_root_dir_gen;
2137 u64 other_inode = 0;
2138 struct btrfs_inode_info info;
2139
2140 if (!sctx->parent_root)
2141 return 0;
2142
2143 ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen);
2144 if (ret <= 0)
2145 return 0;
2146
2147 /*
2148 * If we have a parent root we need to verify that the parent dir was
2149 * not deleted and then re-created; if it was, then we have no overwrite
2150 * and we can just unlink this entry.
2151 *
2152 * @parent_root_dir_gen was set to 0 if the inode does not exist in the
2153 * parent root.
2154 */
2155 if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
2156 parent_root_dir_gen != dir_gen)
2157 return 0;
2158
2159 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
2160 &other_inode);
2161 if (ret == -ENOENT)
2162 return 0;
2163 else if (ret < 0)
2164 return ret;
2165
2166 /*
2167 * Check if the overwritten ref was already processed. If yes, the ref
2168 * was already unlinked/moved, so we can safely assume that we will not
2169 * overwrite anything at this point in time.
2170 */
2171 if (other_inode > sctx->send_progress ||
2172 is_waiting_for_move(sctx, other_inode)) {
2173 ret = get_inode_info(sctx->parent_root, other_inode, &info);
2174 if (ret < 0)
2175 return ret;
2176
2177 *who_ino = other_inode;
2178 *who_gen = info.gen;
2179 *who_mode = info.mode;
2180 return 1;
2181 }
2182
2183 return 0;
2184 }
2185
2186 /*
2187 * Checks if the ref was overwritten by an already processed inode. This is
2188 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
2189 * thus the orphan name needs to be used.
2190 * process_recorded_refs also uses it to avoid unlinking of refs that were
2191 * overwritten.
2192 */
2193 static int did_overwrite_ref(struct send_ctx *sctx,
2194 u64 dir, u64 dir_gen,
2195 u64 ino, u64 ino_gen,
2196 const char *name, int name_len)
2197 {
2198 int ret;
2199 u64 ow_inode;
2200 u64 ow_gen = 0;
2201 u64 send_root_dir_gen;
2202
2203 if (!sctx->parent_root)
2204 return 0;
2205
2206 ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL);
2207 if (ret <= 0)
2208 return ret;
2209
2210 /*
2211 * @send_root_dir_gen was set to 0 if the inode does not exist in the
2212 * send root.
2213 */
2214 if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
2215 return 0;
2216
2217 /* check if the ref was overwritten by another ref */
2218 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
2219 &ow_inode);
2220 if (ret == -ENOENT) {
2221 /* was never and will never be overwritten */
2222 return 0;
2223 } else if (ret < 0) {
2224 return ret;
2225 }
2226
2227 if (ow_inode == ino) {
2228 ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
2229 if (ret < 0)
2230 return ret;
2231
2232 /* It's the same inode, so no overwrite happened. */
2233 if (ow_gen == ino_gen)
2234 return 0;
2235 }
2236
2237 /*
2238 * We know that it is or will be overwritten. Check this now.
2239 * The current inode being processed might have been the one that caused
2240 * inode 'ino' to be orphanized, therefore check if ow_inode matches
2241 * the current inode being processed.
2242 */
2243 if (ow_inode < sctx->send_progress)
2244 return 1;
2245
2246 if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
2247 if (ow_gen == 0) {
2248 ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
2249 if (ret < 0)
2250 return ret;
2251 }
2252 if (ow_gen == sctx->cur_inode_gen)
2253 return 1;
2254 }
2255
2256 return 0;
2257 }
2258
2259 /*
2260 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2261 * that got overwritten. This is used by process_recorded_refs to determine
2262 * if it has to use the path as returned by get_cur_path or the orphan name.
2263 */
2264 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2265 {
2266 int ret = 0;
2267 struct fs_path *name = NULL;
2268 u64 dir;
2269 u64 dir_gen;
2270
2271 if (!sctx->parent_root)
2272 goto out;
2273
2274 name = fs_path_alloc();
2275 if (!name)
2276 return -ENOMEM;
2277
2278 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
2279 if (ret < 0)
2280 goto out;
2281
2282 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
2283 name->start, fs_path_len(name));
2284
2285 out:
2286 fs_path_free(name);
2287 return ret;
2288 }
2289
2290 static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
2291 u64 ino, u64 gen)
2292 {
2293 struct btrfs_lru_cache_entry *entry;
2294
2295 entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen);
2296 if (!entry)
2297 return NULL;
2298
2299 return container_of(entry, struct name_cache_entry, entry);
2300 }
2301
2302 /*
2303 * Used by get_cur_path for each ref up to the root.
2304 * Returns 0 if it succeeded.
2305 * Returns 1 if the inode does not exist or got overwritten. In that case, the
2306 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2307 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2308 * Returns <0 in case of error.
2309 */
2310 static int __get_cur_name_and_parent(struct send_ctx *sctx,
2311 u64 ino, u64 gen,
2312 u64 *parent_ino,
2313 u64 *parent_gen,
2314 struct fs_path *dest)
2315 {
2316 int ret;
2317 int nce_ret;
2318 struct name_cache_entry *nce;
2319
2320 /*
2321 * First check if we already did a call to this function with the same
2322 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2323 * return the cached result.
2324 */
2325 nce = name_cache_search(sctx, ino, gen);
2326 if (nce) {
2327 if (ino < sctx->send_progress && nce->need_later_update) {
2328 btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry);
2329 nce = NULL;
2330 } else {
2331 *parent_ino = nce->parent_ino;
2332 *parent_gen = nce->parent_gen;
2333 ret = fs_path_add(dest, nce->name, nce->name_len);
2334 if (ret < 0)
2335 goto out;
2336 ret = nce->ret;
2337 goto out;
2338 }
2339 }
2340
2341 /*
2342 * If the inode does not exist yet, add the orphan name and return 1.
2343 * This should only happen for the parent dir that we determine in
2344 * record_new_ref_if_needed().
2345 */
2346 ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
2347 if (ret < 0)
2348 goto out;
2349
2350 if (!ret) {
2351 ret = gen_unique_name(sctx, ino, gen, dest);
2352 if (ret < 0)
2353 goto out;
2354 ret = 1;
2355 goto out_cache;
2356 }
2357
2358 /*
2359 * Depending on whether the inode was already processed or not, use
2360 * send_root or parent_root for ref lookup.
2361 */
2362 if (ino < sctx->send_progress)
2363 ret = get_first_ref(sctx->send_root, ino,
2364 parent_ino, parent_gen, dest);
2365 else
2366 ret = get_first_ref(sctx->parent_root, ino,
2367 parent_ino, parent_gen, dest);
2368 if (ret < 0)
2369 goto out;
2370
2371 /*
2372 * Check if the ref was overwritten by an inode's ref that was processed
2373 * earlier. If yes, treat as orphan and return 1.
2374 */
2375 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
2376 dest->start, dest->end - dest->start);
2377 if (ret < 0)
2378 goto out;
2379 if (ret) {
2380 fs_path_reset(dest);
2381 ret = gen_unique_name(sctx, ino, gen, dest);
2382 if (ret < 0)
2383 goto out;
2384 ret = 1;
2385 }
2386
2387 out_cache:
2388 /*
2389 * Store the result of the lookup in the name cache.
2390 */
2391 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2392 if (!nce) {
2393 ret = -ENOMEM;
2394 goto out;
2395 }
2396
2397 nce->entry.key = ino;
2398 nce->entry.gen = gen;
2399 nce->parent_ino = *parent_ino;
2400 nce->parent_gen = *parent_gen;
2401 nce->name_len = fs_path_len(dest);
2402 nce->ret = ret;
2403 strcpy(nce->name, dest->start);
2404
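/*
 * Names resolved for inodes we have not processed yet may still change once
 * the inode gets processed, so mark the entry to be revalidated on a later
 * lookup (see the check at the top of this function).
 */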
2405 if (ino < sctx->send_progress)
2406 nce->need_later_update = 0;
2407 else
2408 nce->need_later_update = 1;
2409
2410 nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
2411 if (nce_ret < 0) {
2412 kfree(nce);
2413 ret = nce_ret;
2414 }
2415
2416 out:
2417 return ret;
2418 }
2419
2420 /*
2421 * Magic happens here. This function returns the first ref to an inode as it
2422 * would look like while receiving the stream at this point in time.
2423 * We walk the path up to the root. For every inode in between, we check if it
2424 * was already processed/sent. If yes, we continue with the parent as found
2425 * in send_root. If not, we continue with the parent as found in parent_root.
2426 * If we encounter an inode that was deleted at this point in time, we use the
2427 * inodes "orphan" name instead of the real name and stop. Same with new inodes
2428 * that were not created yet and overwritten inodes/refs.
2429 *
2430 * When do we have orphan inodes:
2431 * 1. When an inode is freshly created and thus no valid refs are available yet
2432 * 2. When a directory lost all its refs (deleted) but still has dir items
2433 * inside which were not processed yet (pending for move/delete). If anyone
2434 * tried to get the path to the dir items, it would get a path inside that
2435 * orphan directory.
2436 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2437 * of an unprocessed inode. If in that case the first ref would be
2438 * overwritten, the overwritten inode gets "orphanized". Later when we
2439 * process this overwritten inode, it is restored at a new place by moving
2440 * the orphan inode.
2441 *
2442 * sctx->send_progress tells this function at which point in time receiving
2443 * would be.
2444 */
2445 static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2446 struct fs_path *dest)
2447 {
2448 int ret = 0;
2449 struct fs_path *name = NULL;
2450 u64 parent_inode = 0;
2451 u64 parent_gen = 0;
2452 int stop = 0;
2453
2454 name = fs_path_alloc();
2455 if (!name) {
2456 ret = -ENOMEM;
2457 goto out;
2458 }
2459
2460 dest->reversed = 1;
2461 fs_path_reset(dest);
2462
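/*
 * The path is built from the inode up towards the subvolume root, so
 * components are appended in reverse order and the result is unreversed at
 * the end.
 */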
2463 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2464 struct waiting_dir_move *wdm;
2465
2466 fs_path_reset(name);
2467
2468 if (is_waiting_for_rm(sctx, ino, gen)) {
2469 ret = gen_unique_name(sctx, ino, gen, name);
2470 if (ret < 0)
2471 goto out;
2472 ret = fs_path_add_path(dest, name);
2473 break;
2474 }
2475
2476 wdm = get_waiting_dir_move(sctx, ino);
2477 if (wdm && wdm->orphanized) {
2478 ret = gen_unique_name(sctx, ino, gen, name);
2479 stop = 1;
2480 } else if (wdm) {
2481 ret = get_first_ref(sctx->parent_root, ino,
2482 &parent_inode, &parent_gen, name);
2483 } else {
2484 ret = __get_cur_name_and_parent(sctx, ino, gen,
2485 &parent_inode,
2486 &parent_gen, name);
2487 if (ret)
2488 stop = 1;
2489 }
2490
2491 if (ret < 0)
2492 goto out;
2493
2494 ret = fs_path_add_path(dest, name);
2495 if (ret < 0)
2496 goto out;
2497
2498 ino = parent_inode;
2499 gen = parent_gen;
2500 }
2501
2502 out:
2503 fs_path_free(name);
2504 if (!ret)
2505 fs_path_unreverse(dest);
2506 return ret;
2507 }
2508
2509 /*
2510 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2511 */
2512 static int send_subvol_begin(struct send_ctx *sctx)
2513 {
2514 int ret;
2515 struct btrfs_root *send_root = sctx->send_root;
2516 struct btrfs_root *parent_root = sctx->parent_root;
2517 struct btrfs_path *path;
2518 struct btrfs_key key;
2519 struct btrfs_root_ref *ref;
2520 struct extent_buffer *leaf;
2521 char *name = NULL;
2522 int namelen;
2523
2524 path = btrfs_alloc_path();
2525 if (!path)
2526 return -ENOMEM;
2527
2528 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2529 if (!name) {
2530 btrfs_free_path(path);
2531 return -ENOMEM;
2532 }
2533
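/*
 * The subvolume's name is stored in its ROOT_BACKREF item in the tree of
 * tree roots, keyed by the subvolume's root id.
 */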
2534 key.objectid = send_root->root_key.objectid;
2535 key.type = BTRFS_ROOT_BACKREF_KEY;
2536 key.offset = 0;
2537
2538 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2539 &key, path, 1, 0);
2540 if (ret < 0)
2541 goto out;
2542 if (ret) {
2543 ret = -ENOENT;
2544 goto out;
2545 }
2546
2547 leaf = path->nodes[0];
2548 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2549 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2550 key.objectid != send_root->root_key.objectid) {
2551 ret = -ENOENT;
2552 goto out;
2553 }
2554 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2555 namelen = btrfs_root_ref_name_len(leaf, ref);
2556 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2557 btrfs_release_path(path);
2558
2559 if (parent_root) {
2560 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2561 if (ret < 0)
2562 goto out;
2563 } else {
2564 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2565 if (ret < 0)
2566 goto out;
2567 }
2568
2569 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2570
2571 if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
2572 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2573 sctx->send_root->root_item.received_uuid);
2574 else
2575 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2576 sctx->send_root->root_item.uuid);
2577
2578 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2579 btrfs_root_ctransid(&sctx->send_root->root_item));
2580 if (parent_root) {
2581 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
2582 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2583 parent_root->root_item.received_uuid);
2584 else
2585 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2586 parent_root->root_item.uuid);
2587 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2588 btrfs_root_ctransid(&sctx->parent_root->root_item));
2589 }
2590
2591 ret = send_cmd(sctx);
2592
2593 tlv_put_failure:
2594 out:
2595 btrfs_free_path(path);
2596 kfree(name);
2597 return ret;
2598 }
2599
2600 static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2601 {
2602 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2603 int ret = 0;
2604 struct fs_path *p;
2605
2606 btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2607
2608 p = fs_path_alloc();
2609 if (!p)
2610 return -ENOMEM;
2611
2612 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2613 if (ret < 0)
2614 goto out;
2615
2616 ret = get_cur_path(sctx, ino, gen, p);
2617 if (ret < 0)
2618 goto out;
2619 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2620 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2621
2622 ret = send_cmd(sctx);
2623
2624 tlv_put_failure:
2625 out:
2626 fs_path_free(p);
2627 return ret;
2628 }
2629
2630 static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2631 {
2632 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2633 int ret = 0;
2634 struct fs_path *p;
2635
2636 btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2637
2638 p = fs_path_alloc();
2639 if (!p)
2640 return -ENOMEM;
2641
2642 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2643 if (ret < 0)
2644 goto out;
2645
2646 ret = get_cur_path(sctx, ino, gen, p);
2647 if (ret < 0)
2648 goto out;
2649 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2650 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2651
2652 ret = send_cmd(sctx);
2653
2654 tlv_put_failure:
2655 out:
2656 fs_path_free(p);
2657 return ret;
2658 }
2659
2660 static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
2661 {
2662 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2663 int ret = 0;
2664 struct fs_path *p;
2665
2666 if (sctx->proto < 2)
2667 return 0;
2668
2669 btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
2670
2671 p = fs_path_alloc();
2672 if (!p)
2673 return -ENOMEM;
2674
2675 ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
2676 if (ret < 0)
2677 goto out;
2678
2679 ret = get_cur_path(sctx, ino, gen, p);
2680 if (ret < 0)
2681 goto out;
2682 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2683 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
2684
2685 ret = send_cmd(sctx);
2686
2687 tlv_put_failure:
2688 out:
2689 fs_path_free(p);
2690 return ret;
2691 }
2692
2693 static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2694 {
2695 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2696 int ret = 0;
2697 struct fs_path *p;
2698
2699 btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2700 ino, uid, gid);
2701
2702 p = fs_path_alloc();
2703 if (!p)
2704 return -ENOMEM;
2705
2706 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2707 if (ret < 0)
2708 goto out;
2709
2710 ret = get_cur_path(sctx, ino, gen, p);
2711 if (ret < 0)
2712 goto out;
2713 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2714 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2715 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2716
2717 ret = send_cmd(sctx);
2718
2719 tlv_put_failure:
2720 out:
2721 fs_path_free(p);
2722 return ret;
2723 }
2724
2725 static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2726 {
2727 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2728 int ret = 0;
2729 struct fs_path *p = NULL;
2730 struct btrfs_inode_item *ii;
2731 struct btrfs_path *path = NULL;
2732 struct extent_buffer *eb;
2733 struct btrfs_key key;
2734 int slot;
2735
2736 btrfs_debug(fs_info, "send_utimes %llu", ino);
2737
2738 p = fs_path_alloc();
2739 if (!p)
2740 return -ENOMEM;
2741
2742 path = alloc_path_for_send();
2743 if (!path) {
2744 ret = -ENOMEM;
2745 goto out;
2746 }
2747
2748 key.objectid = ino;
2749 key.type = BTRFS_INODE_ITEM_KEY;
2750 key.offset = 0;
2751 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2752 if (ret > 0)
2753 ret = -ENOENT;
2754 if (ret < 0)
2755 goto out;
2756
2757 eb = path->nodes[0];
2758 slot = path->slots[0];
2759 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2760
2761 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2762 if (ret < 0)
2763 goto out;
2764
2765 ret = get_cur_path(sctx, ino, gen, p);
2766 if (ret < 0)
2767 goto out;
2768 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2769 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2770 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2771 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2772 if (sctx->proto >= 2)
2773 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
2774
2775 ret = send_cmd(sctx);
2776
2777 tlv_put_failure:
2778 out:
2779 fs_path_free(p);
2780 btrfs_free_path(path);
2781 return ret;
2782 }
2783
2784 /*
2785 * If the cache is full, we can't remove entries from it and do a call to
2786 * send_utimes() for each respective inode, because we might be finishing
2787 * processing an inode that is a directory and it just got renamed, and existing
2788 * entries in the cache may refer to inodes that have the directory in their
2789 * full path - in which case we would generate outdated paths (pre-rename)
2790 * for the inodes that the cache entries point to. Instead of pruning the
2791 * cache when inserting, do it after we finish processing each inode at
2792 * finish_inode_if_needed().
2793 */
2794 static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
2795 {
2796 struct btrfs_lru_cache_entry *entry;
2797 int ret;
2798
2799 entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen);
2800 if (entry != NULL)
2801 return 0;
2802
2803 /* Caching is optional, don't fail if we can't allocate memory. */
2804 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2805 if (!entry)
2806 return send_utimes(sctx, dir, gen);
2807
2808 entry->key = dir;
2809 entry->gen = gen;
2810
2811 ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL);
2812 ASSERT(ret != -EEXIST);
2813 if (ret) {
2814 kfree(entry);
2815 return send_utimes(sctx, dir, gen);
2816 }
2817
2818 return 0;
2819 }
2820
2821 static int trim_dir_utimes_cache(struct send_ctx *sctx)
2822 {
2823 while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
2824 SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
2825 struct btrfs_lru_cache_entry *lru;
2826 int ret;
2827
2828 lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache);
2829 ASSERT(lru != NULL);
2830
2831 ret = send_utimes(sctx, lru->key, lru->gen);
2832 if (ret)
2833 return ret;
2834
2835 btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru);
2836 }
2837
2838 return 0;
2839 }
2840
2841 /*
2842 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2843 * a valid path yet because the refs were not processed. So, the inode
2844 * is created as orphan.
2845 */
2846 static int send_create_inode(struct send_ctx *sctx, u64 ino)
2847 {
2848 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2849 int ret = 0;
2850 struct fs_path *p;
2851 int cmd;
2852 struct btrfs_inode_info info;
2853 u64 gen;
2854 u64 mode;
2855 u64 rdev;
2856
2857 btrfs_debug(fs_info, "send_create_inode %llu", ino);
2858
2859 p = fs_path_alloc();
2860 if (!p)
2861 return -ENOMEM;
2862
2863 if (ino != sctx->cur_ino) {
2864 ret = get_inode_info(sctx->send_root, ino, &info);
2865 if (ret < 0)
2866 goto out;
2867 gen = info.gen;
2868 mode = info.mode;
2869 rdev = info.rdev;
2870 } else {
2871 gen = sctx->cur_inode_gen;
2872 mode = sctx->cur_inode_mode;
2873 rdev = sctx->cur_inode_rdev;
2874 }
2875
2876 if (S_ISREG(mode)) {
2877 cmd = BTRFS_SEND_C_MKFILE;
2878 } else if (S_ISDIR(mode)) {
2879 cmd = BTRFS_SEND_C_MKDIR;
2880 } else if (S_ISLNK(mode)) {
2881 cmd = BTRFS_SEND_C_SYMLINK;
2882 } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2883 cmd = BTRFS_SEND_C_MKNOD;
2884 } else if (S_ISFIFO(mode)) {
2885 cmd = BTRFS_SEND_C_MKFIFO;
2886 } else if (S_ISSOCK(mode)) {
2887 cmd = BTRFS_SEND_C_MKSOCK;
2888 } else {
2889 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2890 (int)(mode & S_IFMT));
2891 ret = -EOPNOTSUPP;
2892 goto out;
2893 }
2894
2895 ret = begin_cmd(sctx, cmd);
2896 if (ret < 0)
2897 goto out;
2898
2899 ret = gen_unique_name(sctx, ino, gen, p);
2900 if (ret < 0)
2901 goto out;
2902
2903 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2904 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2905
2906 if (S_ISLNK(mode)) {
2907 fs_path_reset(p);
2908 ret = read_symlink(sctx->send_root, ino, p);
2909 if (ret < 0)
2910 goto out;
2911 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2912 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2913 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2914 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2915 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2916 }
2917
2918 ret = send_cmd(sctx);
2919 if (ret < 0)
2920 goto out;
2921
2922
2923 tlv_put_failure:
2924 out:
2925 fs_path_free(p);
2926 return ret;
2927 }
2928
2929 static void cache_dir_created(struct send_ctx *sctx, u64 dir)
2930 {
2931 struct btrfs_lru_cache_entry *entry;
2932 int ret;
2933
2934 /* Caching is optional, ignore any failures. */
2935 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2936 if (!entry)
2937 return;
2938
2939 entry->key = dir;
2940 entry->gen = 0;
2941 ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL);
2942 if (ret < 0)
2943 kfree(entry);
2944 }
2945
2946 /*
2947 * We need some special handling for inodes that get processed before the parent
2948 * directory got created. See process_recorded_refs for details.
2949 * This function checks if we already created the dir out of order.
2950 */
2951 static int did_create_dir(struct send_ctx *sctx, u64 dir)
2952 {
2953 int ret = 0;
2954 int iter_ret = 0;
2955 struct btrfs_path *path = NULL;
2956 struct btrfs_key key;
2957 struct btrfs_key found_key;
2958 struct btrfs_key di_key;
2959 struct btrfs_dir_item *di;
2960
2961 if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0))
2962 return 1;
2963
2964 path = alloc_path_for_send();
2965 if (!path)
2966 return -ENOMEM;
2967
2968 key.objectid = dir;
2969 key.type = BTRFS_DIR_INDEX_KEY;
2970 key.offset = 0;
2971
2972 btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
2973 struct extent_buffer *eb = path->nodes[0];
2974
2975 if (found_key.objectid != key.objectid ||
2976 found_key.type != key.type) {
2977 ret = 0;
2978 break;
2979 }
2980
2981 di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
2982 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2983
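/*
 * A dir index entry pointing to an inode that was already processed (and is
 * not a subvolume root) means this directory was created earlier, out of
 * order.
 */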
2984 if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2985 di_key.objectid < sctx->send_progress) {
2986 ret = 1;
2987 cache_dir_created(sctx, dir);
2988 break;
2989 }
2990 }
2991 /* Catch error found during iteration */
2992 if (iter_ret < 0)
2993 ret = iter_ret;
2994
2995 btrfs_free_path(path);
2996 return ret;
2997 }
2998
2999 /*
3000 * Only creates the inode if it is:
3001 * 1. Not a directory
3002 * 2. Or a directory which was not created already due to out of order
3003 * directories. See did_create_dir and process_recorded_refs for details.
3004 */
3005 static int send_create_inode_if_needed(struct send_ctx *sctx)
3006 {
3007 int ret;
3008
3009 if (S_ISDIR(sctx->cur_inode_mode)) {
3010 ret = did_create_dir(sctx, sctx->cur_ino);
3011 if (ret < 0)
3012 return ret;
3013 else if (ret > 0)
3014 return 0;
3015 }
3016
3017 ret = send_create_inode(sctx, sctx->cur_ino);
3018
3019 if (ret == 0 && S_ISDIR(sctx->cur_inode_mode))
3020 cache_dir_created(sctx, sctx->cur_ino);
3021
3022 return ret;
3023 }
3024
3025 struct recorded_ref {
3026 struct list_head list;
3027 char *name;
3028 struct fs_path *full_path;
3029 u64 dir;
3030 u64 dir_gen;
3031 int name_len;
3032 struct rb_node node;
3033 struct rb_root *root;
3034 };
3035
3036 static struct recorded_ref *recorded_ref_alloc(void)
3037 {
3038 struct recorded_ref *ref;
3039
3040 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
3041 if (!ref)
3042 return NULL;
3043 RB_CLEAR_NODE(&ref->node);
3044 INIT_LIST_HEAD(&ref->list);
3045 return ref;
3046 }
3047
3048 static void recorded_ref_free(struct recorded_ref *ref)
3049 {
3050 if (!ref)
3051 return;
3052 if (!RB_EMPTY_NODE(&ref->node))
3053 rb_erase(&ref->node, ref->root);
3054 list_del(&ref->list);
3055 fs_path_free(ref->full_path);
3056 kfree(ref);
3057 }
3058
3059 static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
3060 {
3061 ref->full_path = path;
3062 ref->name = (char *)kbasename(ref->full_path->start);
3063 ref->name_len = ref->full_path->end - ref->name;
3064 }
3065
3066 static int dup_ref(struct recorded_ref *ref, struct list_head *list)
3067 {
3068 struct recorded_ref *new;
3069
3070 new = recorded_ref_alloc();
3071 if (!new)
3072 return -ENOMEM;
3073
3074 new->dir = ref->dir;
3075 new->dir_gen = ref->dir_gen;
3076 list_add_tail(&new->list, list);
3077 return 0;
3078 }
3079
3080 static void __free_recorded_refs(struct list_head *head)
3081 {
3082 struct recorded_ref *cur;
3083
3084 while (!list_empty(head)) {
3085 cur = list_entry(head->next, struct recorded_ref, list);
3086 recorded_ref_free(cur);
3087 }
3088 }
3089
3090 static void free_recorded_refs(struct send_ctx *sctx)
3091 {
3092 __free_recorded_refs(&sctx->new_refs);
3093 __free_recorded_refs(&sctx->deleted_refs);
3094 }
3095
3096 /*
3097 * Renames/moves a file/dir to its orphan name. Used when the first
3098 * ref of an unprocessed inode gets overwritten and for all non-empty
3099 * directories.
3100 */
3101 static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
3102 struct fs_path *path)
3103 {
3104 int ret;
3105 struct fs_path *orphan;
3106
3107 orphan = fs_path_alloc();
3108 if (!orphan)
3109 return -ENOMEM;
3110
3111 ret = gen_unique_name(sctx, ino, gen, orphan);
3112 if (ret < 0)
3113 goto out;
3114
3115 ret = send_rename(sctx, path, orphan);
3116
3117 out:
3118 fs_path_free(orphan);
3119 return ret;
3120 }
3121
3122 static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
3123 u64 dir_ino, u64 dir_gen)
3124 {
3125 struct rb_node **p = &sctx->orphan_dirs.rb_node;
3126 struct rb_node *parent = NULL;
3127 struct orphan_dir_info *entry, *odi;
3128
3129 while (*p) {
3130 parent = *p;
3131 entry = rb_entry(parent, struct orphan_dir_info, node);
3132 if (dir_ino < entry->ino)
3133 p = &(*p)->rb_left;
3134 else if (dir_ino > entry->ino)
3135 p = &(*p)->rb_right;
3136 else if (dir_gen < entry->gen)
3137 p = &(*p)->rb_left;
3138 else if (dir_gen > entry->gen)
3139 p = &(*p)->rb_right;
3140 else
3141 return entry;
3142 }
3143
3144 odi = kmalloc(sizeof(*odi), GFP_KERNEL);
3145 if (!odi)
3146 return ERR_PTR(-ENOMEM);
3147 odi->ino = dir_ino;
3148 odi->gen = dir_gen;
3149 odi->last_dir_index_offset = 0;
3150 odi->dir_high_seq_ino = 0;
3151
3152 rb_link_node(&odi->node, parent, p);
3153 rb_insert_color(&odi->node, &sctx->orphan_dirs);
3154 return odi;
3155 }
3156
3157 static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
3158 u64 dir_ino, u64 gen)
3159 {
3160 struct rb_node *n = sctx->orphan_dirs.rb_node;
3161 struct orphan_dir_info *entry;
3162
3163 while (n) {
3164 entry = rb_entry(n, struct orphan_dir_info, node);
3165 if (dir_ino < entry->ino)
3166 n = n->rb_left;
3167 else if (dir_ino > entry->ino)
3168 n = n->rb_right;
3169 else if (gen < entry->gen)
3170 n = n->rb_left;
3171 else if (gen > entry->gen)
3172 n = n->rb_right;
3173 else
3174 return entry;
3175 }
3176 return NULL;
3177 }
3178
3179 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
3180 {
3181 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
3182
3183 return odi != NULL;
3184 }
3185
3186 static void free_orphan_dir_info(struct send_ctx *sctx,
3187 struct orphan_dir_info *odi)
3188 {
3189 if (!odi)
3190 return;
3191 rb_erase(&odi->node, &sctx->orphan_dirs);
3192 kfree(odi);
3193 }
3194
3195 /*
3196 * Returns 1 if a directory can be removed at this point in time.
3197 * We check this by iterating all dir items and checking if the inode behind
3198 * the dir item was already processed.
3199 */
3200 static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
3201 {
3202 int ret = 0;
3203 int iter_ret = 0;
3204 struct btrfs_root *root = sctx->parent_root;
3205 struct btrfs_path *path;
3206 struct btrfs_key key;
3207 struct btrfs_key found_key;
3208 struct btrfs_key loc;
3209 struct btrfs_dir_item *di;
3210 struct orphan_dir_info *odi = NULL;
3211 u64 dir_high_seq_ino = 0;
3212 u64 last_dir_index_offset = 0;
3213
3214 /*
3215 * Don't try to rmdir the top/root subvolume dir.
3216 */
3217 if (dir == BTRFS_FIRST_FREE_OBJECTID)
3218 return 0;
3219
3220 odi = get_orphan_dir_info(sctx, dir, dir_gen);
3221 if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
3222 return 0;
3223
3224 path = alloc_path_for_send();
3225 if (!path)
3226 return -ENOMEM;
3227
3228 if (!odi) {
3229 /*
3230 * Find the inode number associated with the last dir index
3231 * entry. This is very likely the inode with the highest number
3232 * of all inodes that have an entry in the directory. We can
3233 * then use it to avoid future calls to can_rmdir(), when
3234 * processing inodes with a lower number, from having to search
3235 * the parent root b+tree for dir index keys.
3236 */
3237 key.objectid = dir;
3238 key.type = BTRFS_DIR_INDEX_KEY;
3239 key.offset = (u64)-1;
3240
3241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3242 if (ret < 0) {
3243 goto out;
3244 } else if (ret > 0) {
3245 /* Can't happen, the root is never empty. */
3246 ASSERT(path->slots[0] > 0);
3247 if (WARN_ON(path->slots[0] == 0)) {
3248 ret = -EUCLEAN;
3249 goto out;
3250 }
3251 path->slots[0]--;
3252 }
3253
3254 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3255 if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) {
3256 /* No index keys, dir can be removed. */
3257 ret = 1;
3258 goto out;
3259 }
3260
3261 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
3262 struct btrfs_dir_item);
3263 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
3264 dir_high_seq_ino = loc.objectid;
3265 if (sctx->cur_ino < dir_high_seq_ino) {
3266 ret = 0;
3267 goto out;
3268 }
3269
3270 btrfs_release_path(path);
3271 }
3272
3273 key.objectid = dir;
3274 key.type = BTRFS_DIR_INDEX_KEY;
3275 key.offset = (odi ? odi->last_dir_index_offset : 0);
3276
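/*
 * Iterate the remaining dir index items, resuming at the offset recorded by
 * a previous call (if any). The directory can only be removed once every
 * child inode has been processed and none is waiting for a move.
 */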
3277 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
3278 struct waiting_dir_move *dm;
3279
3280 if (found_key.objectid != key.objectid ||
3281 found_key.type != key.type)
3282 break;
3283
3284 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
3285 struct btrfs_dir_item);
3286 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
3287
3288 dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
3289 last_dir_index_offset = found_key.offset;
3290
3291 dm = get_waiting_dir_move(sctx, loc.objectid);
3292 if (dm) {
3293 dm->rmdir_ino = dir;
3294 dm->rmdir_gen = dir_gen;
3295 ret = 0;
3296 goto out;
3297 }
3298
3299 if (loc.objectid > sctx->cur_ino) {
3300 ret = 0;
3301 goto out;
3302 }
3303 }
3304 if (iter_ret < 0) {
3305 ret = iter_ret;
3306 goto out;
3307 }
3308 free_orphan_dir_info(sctx, odi);
3309
3310 ret = 1;
3311
3312 out:
3313 btrfs_free_path(path);
3314
3315 if (ret)
3316 return ret;
3317
3318 if (!odi) {
3319 odi = add_orphan_dir_info(sctx, dir, dir_gen);
3320 if (IS_ERR(odi))
3321 return PTR_ERR(odi);
3322
3323 odi->gen = dir_gen;
3324 }
3325
3326 odi->last_dir_index_offset = last_dir_index_offset;
3327 odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
3328
3329 return 0;
3330 }
3331
3332 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3333 {
3334 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3335
3336 return entry != NULL;
3337 }
3338
3339 static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3340 {
3341 struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3342 struct rb_node *parent = NULL;
3343 struct waiting_dir_move *entry, *dm;
3344
3345 dm = kmalloc(sizeof(*dm), GFP_KERNEL);
3346 if (!dm)
3347 return -ENOMEM;
3348 dm->ino = ino;
3349 dm->rmdir_ino = 0;
3350 dm->rmdir_gen = 0;
3351 dm->orphanized = orphanized;
3352
3353 while (*p) {
3354 parent = *p;
3355 entry = rb_entry(parent, struct waiting_dir_move, node);
3356 if (ino < entry->ino) {
3357 p = &(*p)->rb_left;
3358 } else if (ino > entry->ino) {
3359 p = &(*p)->rb_right;
3360 } else {
3361 kfree(dm);
3362 return -EEXIST;
3363 }
3364 }
3365
3366 rb_link_node(&dm->node, parent, p);
3367 rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3368 return 0;
3369 }
3370
3371 static struct waiting_dir_move *
3372 get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3373 {
3374 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3375 struct waiting_dir_move *entry;
3376
3377 while (n) {
3378 entry = rb_entry(n, struct waiting_dir_move, node);
3379 if (ino < entry->ino)
3380 n = n->rb_left;
3381 else if (ino > entry->ino)
3382 n = n->rb_right;
3383 else
3384 return entry;
3385 }
3386 return NULL;
3387 }
3388
3389 static void free_waiting_dir_move(struct send_ctx *sctx,
3390 struct waiting_dir_move *dm)
3391 {
3392 if (!dm)
3393 return;
3394 rb_erase(&dm->node, &sctx->waiting_dir_moves);
3395 kfree(dm);
3396 }
3397
3398 static int add_pending_dir_move(struct send_ctx *sctx,
3399 u64 ino,
3400 u64 ino_gen,
3401 u64 parent_ino,
3402 struct list_head *new_refs,
3403 struct list_head *deleted_refs,
3404 const bool is_orphan)
3405 {
3406 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3407 struct rb_node *parent = NULL;
3408 struct pending_dir_move *entry = NULL, *pm;
3409 struct recorded_ref *cur;
3410 int exists = 0;
3411 int ret;
3412
3413 pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3414 if (!pm)
3415 return -ENOMEM;
3416 pm->parent_ino = parent_ino;
3417 pm->ino = ino;
3418 pm->gen = ino_gen;
3419 INIT_LIST_HEAD(&pm->list);
3420 INIT_LIST_HEAD(&pm->update_refs);
3421 RB_CLEAR_NODE(&pm->node);
3422
3423 while (*p) {
3424 parent = *p;
3425 entry = rb_entry(parent, struct pending_dir_move, node);
3426 if (parent_ino < entry->parent_ino) {
3427 p = &(*p)->rb_left;
3428 } else if (parent_ino > entry->parent_ino) {
3429 p = &(*p)->rb_right;
3430 } else {
3431 exists = 1;
3432 break;
3433 }
3434 }
3435
3436 list_for_each_entry(cur, deleted_refs, list) {
3437 ret = dup_ref(cur, &pm->update_refs);
3438 if (ret < 0)
3439 goto out;
3440 }
3441 list_for_each_entry(cur, new_refs, list) {
3442 ret = dup_ref(cur, &pm->update_refs);
3443 if (ret < 0)
3444 goto out;
3445 }
3446
3447 ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
3448 if (ret)
3449 goto out;
3450
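/*
 * Chain this move onto an existing node waiting for the same ancestor, or
 * insert a new node into the tree of pending dir moves.
 */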
3451 if (exists) {
3452 list_add_tail(&pm->list, &entry->list);
3453 } else {
3454 rb_link_node(&pm->node, parent, p);
3455 rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3456 }
3457 ret = 0;
3458 out:
3459 if (ret) {
3460 __free_recorded_refs(&pm->update_refs);
3461 kfree(pm);
3462 }
3463 return ret;
3464 }
3465
3466 static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3467 u64 parent_ino)
3468 {
3469 struct rb_node *n = sctx->pending_dir_moves.rb_node;
3470 struct pending_dir_move *entry;
3471
3472 while (n) {
3473 entry = rb_entry(n, struct pending_dir_move, node);
3474 if (parent_ino < entry->parent_ino)
3475 n = n->rb_left;
3476 else if (parent_ino > entry->parent_ino)
3477 n = n->rb_right;
3478 else
3479 return entry;
3480 }
3481 return NULL;
3482 }
3483
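/*
 * Walk up from @ino, as the path will look on the receiving side, and check
 * if it loops back to @ino itself, which can happen while ancestors still
 * have pending renames. Returns 1 if a loop was found (with *ancestor_ino
 * set), 0 if not, and < 0 on error.
 */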
3484 static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3485 u64 ino, u64 gen, u64 *ancestor_ino)
3486 {
3487 int ret = 0;
3488 u64 parent_inode = 0;
3489 u64 parent_gen = 0;
3490 u64 start_ino = ino;
3491
3492 *ancestor_ino = 0;
3493 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3494 fs_path_reset(name);
3495
3496 if (is_waiting_for_rm(sctx, ino, gen))
3497 break;
3498 if (is_waiting_for_move(sctx, ino)) {
3499 if (*ancestor_ino == 0)
3500 *ancestor_ino = ino;
3501 ret = get_first_ref(sctx->parent_root, ino,
3502 &parent_inode, &parent_gen, name);
3503 } else {
3504 ret = __get_cur_name_and_parent(sctx, ino, gen,
3505 &parent_inode,
3506 &parent_gen, name);
3507 if (ret > 0) {
3508 ret = 0;
3509 break;
3510 }
3511 }
3512 if (ret < 0)
3513 break;
3514 if (parent_inode == start_ino) {
3515 ret = 1;
3516 if (*ancestor_ino == 0)
3517 *ancestor_ino = ino;
3518 break;
3519 }
3520 ino = parent_inode;
3521 gen = parent_gen;
3522 }
3523 return ret;
3524 }
3525
3526 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3527 {
3528 struct fs_path *from_path = NULL;
3529 struct fs_path *to_path = NULL;
3530 struct fs_path *name = NULL;
3531 u64 orig_progress = sctx->send_progress;
3532 struct recorded_ref *cur;
3533 u64 parent_ino, parent_gen;
3534 struct waiting_dir_move *dm = NULL;
3535 u64 rmdir_ino = 0;
3536 u64 rmdir_gen;
3537 u64 ancestor;
3538 bool is_orphan;
3539 int ret;
3540
3541 name = fs_path_alloc();
3542 from_path = fs_path_alloc();
3543 if (!name || !from_path) {
3544 ret = -ENOMEM;
3545 goto out;
3546 }
3547
3548 dm = get_waiting_dir_move(sctx, pm->ino);
3549 ASSERT(dm);
3550 rmdir_ino = dm->rmdir_ino;
3551 rmdir_gen = dm->rmdir_gen;
3552 is_orphan = dm->orphanized;
3553 free_waiting_dir_move(sctx, dm);
3554
3555 if (is_orphan) {
3556 ret = gen_unique_name(sctx, pm->ino,
3557 pm->gen, from_path);
3558 } else {
3559 ret = get_first_ref(sctx->parent_root, pm->ino,
3560 &parent_ino, &parent_gen, name);
3561 if (ret < 0)
3562 goto out;
3563 ret = get_cur_path(sctx, parent_ino, parent_gen,
3564 from_path);
3565 if (ret < 0)
3566 goto out;
3567 ret = fs_path_add_path(from_path, name);
3568 }
3569 if (ret < 0)
3570 goto out;
3571
3572 sctx->send_progress = sctx->cur_ino + 1;
3573 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3574 if (ret < 0)
3575 goto out;
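/*
 * A loop through an ancestor that still has a pending rename means this
 * move cannot be done yet; re-queue it to happen after that ancestor is
 * renamed.
 */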
3576 if (ret) {
3577 LIST_HEAD(deleted_refs);
3578 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3579 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3580 &pm->update_refs, &deleted_refs,
3581 is_orphan);
3582 if (ret < 0)
3583 goto out;
3584 if (rmdir_ino) {
3585 dm = get_waiting_dir_move(sctx, pm->ino);
3586 ASSERT(dm);
3587 dm->rmdir_ino = rmdir_ino;
3588 dm->rmdir_gen = rmdir_gen;
3589 }
3590 goto out;
3591 }
3592 fs_path_reset(name);
3593 to_path = name;
3594 name = NULL;
3595 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3596 if (ret < 0)
3597 goto out;
3598
3599 ret = send_rename(sctx, from_path, to_path);
3600 if (ret < 0)
3601 goto out;
3602
3603 if (rmdir_ino) {
3604 struct orphan_dir_info *odi;
3605 u64 gen;
3606
3607 odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
3608 if (!odi) {
3609 /* already deleted */
3610 goto finish;
3611 }
3612 gen = odi->gen;
3613
3614 ret = can_rmdir(sctx, rmdir_ino, gen);
3615 if (ret < 0)
3616 goto out;
3617 if (!ret)
3618 goto finish;
3619
3620 name = fs_path_alloc();
3621 if (!name) {
3622 ret = -ENOMEM;
3623 goto out;
3624 }
3625 ret = get_cur_path(sctx, rmdir_ino, gen, name);
3626 if (ret < 0)
3627 goto out;
3628 ret = send_rmdir(sctx, name);
3629 if (ret < 0)
3630 goto out;
3631 }
3632
3633 finish:
3634 ret = cache_dir_utimes(sctx, pm->ino, pm->gen);
3635 if (ret < 0)
3636 goto out;
3637
3638 /*
3639 * After rename/move, need to update the utimes of both new parent(s)
3640 * and old parent(s).
3641 */
3642 list_for_each_entry(cur, &pm->update_refs, list) {
3643 /*
3644 * The parent inode might have been deleted in the send snapshot
3645 */
3646 ret = get_inode_info(sctx->send_root, cur->dir, NULL);
3647 if (ret == -ENOENT) {
3648 ret = 0;
3649 continue;
3650 }
3651 if (ret < 0)
3652 goto out;
3653
3654 ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
3655 if (ret < 0)
3656 goto out;
3657 }
3658
3659 out:
3660 fs_path_free(name);
3661 fs_path_free(from_path);
3662 fs_path_free(to_path);
3663 sctx->send_progress = orig_progress;
3664
3665 return ret;
3666 }
3667
3668 static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
3669 {
3670 if (!list_empty(&m->list))
3671 list_del(&m->list);
3672 if (!RB_EMPTY_NODE(&m->node))
3673 rb_erase(&m->node, &sctx->pending_dir_moves);
3674 __free_recorded_refs(&m->update_refs);
3675 kfree(m);
3676 }
3677
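/*
 * Append @moves and any moves chained to it (waiting on the same ancestor)
 * to the tail of @stack and detach the node from the tree of pending dir
 * moves.
 */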
3678 static void tail_append_pending_moves(struct send_ctx *sctx,
3679 struct pending_dir_move *moves,
3680 struct list_head *stack)
3681 {
3682 if (list_empty(&moves->list)) {
3683 list_add_tail(&moves->list, stack);
3684 } else {
3685 LIST_HEAD(list);
3686 list_splice_init(&moves->list, &list);
3687 list_add_tail(&moves->list, stack);
3688 list_splice_tail(&list, stack);
3689 }
3690 if (!RB_EMPTY_NODE(&moves->node)) {
3691 rb_erase(&moves->node, &sctx->pending_dir_moves);
3692 RB_CLEAR_NODE(&moves->node);
3693 }
3694 }
3695
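/*
 * Apply all pending directory moves that were waiting for the inode we just
 * finished processing, plus any moves that become unblocked as a result,
 * using an explicit stack instead of recursion.
 */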
3696 static int apply_children_dir_moves(struct send_ctx *sctx)
3697 {
3698 struct pending_dir_move *pm;
3699 LIST_HEAD(stack);
3700 u64 parent_ino = sctx->cur_ino;
3701 int ret = 0;
3702
3703 pm = get_pending_dir_moves(sctx, parent_ino);
3704 if (!pm)
3705 return 0;
3706
3707 tail_append_pending_moves(sctx, pm, &stack);
3708
3709 while (!list_empty(&stack)) {
3710 pm = list_first_entry(&stack, struct pending_dir_move, list);
3711 parent_ino = pm->ino;
3712 ret = apply_dir_move(sctx, pm);
3713 free_pending_move(sctx, pm);
3714 if (ret)
3715 goto out;
3716 pm = get_pending_dir_moves(sctx, parent_ino);
3717 if (pm)
3718 tail_append_pending_moves(sctx, pm, &stack);
3719 }
3720 return 0;
3721
3722 out:
3723 while (!list_empty(&stack)) {
3724 pm = list_first_entry(&stack, struct pending_dir_move, list);
3725 free_pending_move(sctx, pm);
3726 }
3727 return ret;
3728 }
3729
3730 /*
3731 * We might need to delay a directory rename even when no ancestor directory
3732 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3733 * renamed. This happens when we rename a directory to the old name (the name
3734 * in the parent root) of some other unrelated directory that got its rename
3735 * delayed due to some ancestor with higher number that got renamed.
3736 *
3737 * Example:
3738 *
3739 * Parent snapshot:
3740 * . (ino 256)
3741 * |---- a/ (ino 257)
3742 * | |---- file (ino 260)
3743 * |
3744 * |---- b/ (ino 258)
3745 * |---- c/ (ino 259)
3746 *
3747 * Send snapshot:
3748 * . (ino 256)
3749 * |---- a/ (ino 258)
3750 * |---- x/ (ino 259)
3751 * |---- y/ (ino 257)
3752 * |----- file (ino 260)
3753 *
3754 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3755 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
3756 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3757 * must issue is:
3758 *
3759 * 1 - rename 259 from 'c' to 'x'
3760 * 2 - rename 257 from 'a' to 'x/y'
3761 * 3 - rename 258 from 'b' to 'a'
3762 *
3763 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3764 * be done right away and < 0 on error.
3765 */
3766 static int wait_for_dest_dir_move(struct send_ctx *sctx,
3767 struct recorded_ref *parent_ref,
3768 const bool is_orphan)
3769 {
3770 struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3771 struct btrfs_path *path;
3772 struct btrfs_key key;
3773 struct btrfs_key di_key;
3774 struct btrfs_dir_item *di;
3775 u64 left_gen;
3776 u64 right_gen;
3777 int ret = 0;
3778 struct waiting_dir_move *wdm;
3779
3780 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3781 return 0;
3782
3783 path = alloc_path_for_send();
3784 if (!path)
3785 return -ENOMEM;
3786
3787 key.objectid = parent_ref->dir;
3788 key.type = BTRFS_DIR_ITEM_KEY;
3789 key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
3790
3791 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
3792 if (ret < 0) {
3793 goto out;
3794 } else if (ret > 0) {
3795 ret = 0;
3796 goto out;
3797 }
3798
3799 di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
3800 parent_ref->name_len);
3801 if (!di) {
3802 ret = 0;
3803 goto out;
3804 }
3805 /*
3806 * di_key.objectid has the number of the inode that has a dentry in the
3807 * parent directory with the same name that sctx->cur_ino is being
3808 * renamed to. We need to check if that inode is in the send root as
3809 * well and if it is currently marked as an inode with a pending rename,
3810 * if it is, we need to delay the rename of sctx->cur_ino as well, so
3811 * that it happens after that other inode is renamed.
3812 */
3813 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
3814 if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3815 ret = 0;
3816 goto out;
3817 }
3818
3819 ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
3820 if (ret < 0)
3821 goto out;
3822 ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
3823 if (ret < 0) {
3824 if (ret == -ENOENT)
3825 ret = 0;
3826 goto out;
3827 }
3828
3829 /* Different inode, no need to delay the rename of sctx->cur_ino */
3830 if (right_gen != left_gen) {
3831 ret = 0;
3832 goto out;
3833 }
3834
3835 wdm = get_waiting_dir_move(sctx, di_key.objectid);
3836 if (wdm && !wdm->orphanized) {
3837 ret = add_pending_dir_move(sctx,
3838 sctx->cur_ino,
3839 sctx->cur_inode_gen,
3840 di_key.objectid,
3841 &sctx->new_refs,
3842 &sctx->deleted_refs,
3843 is_orphan);
3844 if (!ret)
3845 ret = 1;
3846 }
3847 out:
3848 btrfs_free_path(path);
3849 return ret;
3850 }
3851
3852 /*
3853 * Check if inode ino2, or any of its ancestors, is inode ino1.
3854 * Return 1 if true, 0 if false and < 0 on error.
3855 */
3856 static int check_ino_in_path(struct btrfs_root *root,
3857 const u64 ino1,
3858 const u64 ino1_gen,
3859 const u64 ino2,
3860 const u64 ino2_gen,
3861 struct fs_path *fs_path)
3862 {
3863 u64 ino = ino2;
3864
3865 if (ino1 == ino2)
3866 return ino1_gen == ino2_gen;
3867
3868 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3869 u64 parent;
3870 u64 parent_gen;
3871 int ret;
3872
3873 fs_path_reset(fs_path);
3874 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
3875 if (ret < 0)
3876 return ret;
3877 if (parent == ino1)
3878 return parent_gen == ino1_gen;
3879 ino = parent;
3880 }
3881 return 0;
3882 }
3883
3884 /*
3885 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3886 * possible path (in case ino2 is not a directory and has multiple hard links).
3887 * Return 1 if true, 0 if false and < 0 on error.
3888 */
3889 static int is_ancestor(struct btrfs_root *root,
3890 const u64 ino1,
3891 const u64 ino1_gen,
3892 const u64 ino2,
3893 struct fs_path *fs_path)
3894 {
3895 bool free_fs_path = false;
3896 int ret = 0;
3897 int iter_ret = 0;
3898 struct btrfs_path *path = NULL;
3899 struct btrfs_key key;
3900
3901 if (!fs_path) {
3902 fs_path = fs_path_alloc();
3903 if (!fs_path)
3904 return -ENOMEM;
3905 free_fs_path = true;
3906 }
3907
3908 path = alloc_path_for_send();
3909 if (!path) {
3910 ret = -ENOMEM;
3911 goto out;
3912 }
3913
3914 key.objectid = ino2;
3915 key.type = BTRFS_INODE_REF_KEY;
3916 key.offset = 0;
3917
3918 btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
3919 struct extent_buffer *leaf = path->nodes[0];
3920 int slot = path->slots[0];
3921 u32 cur_offset = 0;
3922 u32 item_size;
3923
3924 if (key.objectid != ino2)
3925 break;
3926 if (key.type != BTRFS_INODE_REF_KEY &&
3927 key.type != BTRFS_INODE_EXTREF_KEY)
3928 break;
3929
3930 item_size = btrfs_item_size(leaf, slot);
3931 while (cur_offset < item_size) {
3932 u64 parent;
3933 u64 parent_gen;
3934
3935 if (key.type == BTRFS_INODE_EXTREF_KEY) {
3936 unsigned long ptr;
3937 struct btrfs_inode_extref *extref;
3938
3939 ptr = btrfs_item_ptr_offset(leaf, slot);
3940 extref = (struct btrfs_inode_extref *)
3941 (ptr + cur_offset);
3942 parent = btrfs_inode_extref_parent(leaf,
3943 extref);
3944 cur_offset += sizeof(*extref);
3945 cur_offset += btrfs_inode_extref_name_len(leaf,
3946 extref);
3947 } else {
3948 parent = key.offset;
3949 cur_offset = item_size;
3950 }
3951
3952 ret = get_inode_gen(root, parent, &parent_gen);
3953 if (ret < 0)
3954 goto out;
3955 ret = check_ino_in_path(root, ino1, ino1_gen,
3956 parent, parent_gen, fs_path);
3957 if (ret)
3958 goto out;
3959 }
3960 }
3961 ret = 0;
3962 if (iter_ret < 0)
3963 ret = iter_ret;
3964
3965 out:
3966 btrfs_free_path(path);
3967 if (free_fs_path)
3968 fs_path_free(fs_path);
3969 return ret;
3970 }
3971
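/*
 * Check whether the rename/move of the current inode must be delayed until an
 * ancestor directory is renamed/moved first, to avoid circular rename
 * dependencies or infinite path build loops. Returns 1 when a pending
 * directory move was recorded for the current inode (rename delayed), 0 when
 * the rename can be performed now, and < 0 on error.
 */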
3972 static int wait_for_parent_move(struct send_ctx *sctx,
3973 struct recorded_ref *parent_ref,
3974 const bool is_orphan)
3975 {
3976 int ret = 0;
3977 u64 ino = parent_ref->dir;
3978 u64 ino_gen = parent_ref->dir_gen;
3979 u64 parent_ino_before, parent_ino_after;
3980 struct fs_path *path_before = NULL;
3981 struct fs_path *path_after = NULL;
3982 int len1, len2;
3983
3984 path_after = fs_path_alloc();
3985 path_before = fs_path_alloc();
3986 if (!path_after || !path_before) {
3987 ret = -ENOMEM;
3988 goto out;
3989 }
3990
3991 /*
3992 * Our current directory inode may not yet be renamed/moved because some
3993 * ancestor (immediate or not) has to be renamed/moved first. So find out if
3994 * such an ancestor exists and make sure our own rename/move happens after
3995 * that ancestor is processed to avoid path build infinite loops (done
3996 * at get_cur_path()).
3997 */
3998 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3999 u64 parent_ino_after_gen;
4000
4001 if (is_waiting_for_move(sctx, ino)) {
4002 /*
4003 * If the current inode is an ancestor of ino in the
4004 * parent root, we need to delay the rename of the
4005 * current inode, otherwise don't delay the rename
4006 * because we can end up with a circular dependency
4007 * of renames, resulting in some directories never
4008 * getting the respective rename operations issued in
4009 * the send stream or getting into infinite path build
4010 * loops.
4011 */
4012 ret = is_ancestor(sctx->parent_root,
4013 sctx->cur_ino, sctx->cur_inode_gen,
4014 ino, path_before);
4015 if (ret)
4016 break;
4017 }
4018
4019 fs_path_reset(path_before);
4020 fs_path_reset(path_after);
4021
4022 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
4023 &parent_ino_after_gen, path_after);
4024 if (ret < 0)
4025 goto out;
4026 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
4027 NULL, path_before);
4028 if (ret < 0 && ret != -ENOENT) {
4029 goto out;
4030 } else if (ret == -ENOENT) {
4031 ret = 0;
4032 break;
4033 }
4034
4035 len1 = fs_path_len(path_before);
4036 len2 = fs_path_len(path_after);
4037 if (ino > sctx->cur_ino &&
4038 (parent_ino_before != parent_ino_after || len1 != len2 ||
4039 memcmp(path_before->start, path_after->start, len1))) {
4040 u64 parent_ino_gen;
4041
4042 ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
4043 if (ret < 0)
4044 goto out;
4045 if (ino_gen == parent_ino_gen) {
4046 ret = 1;
4047 break;
4048 }
4049 }
4050 ino = parent_ino_after;
4051 ino_gen = parent_ino_after_gen;
4052 }
4053
4054 out:
4055 fs_path_free(path_before);
4056 fs_path_free(path_after);
4057
4058 if (ret == 1) {
4059 ret = add_pending_dir_move(sctx,
4060 sctx->cur_ino,
4061 sctx->cur_inode_gen,
4062 ino,
4063 &sctx->new_refs,
4064 &sctx->deleted_refs,
4065 is_orphan);
4066 if (!ret)
4067 ret = 1;
4068 }
4069
4070 return ret;
4071 }
4072
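/*
 * Recompute a reference's full path after an ancestor directory was
 * orphanized. A new fs_path is allocated because the reference's name member
 * points into its current full_path string.
 */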
4073 static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
4074 {
4075 int ret;
4076 struct fs_path *new_path;
4077
4078 /*
4079 * Our reference's name member points to its full_path member string, so
4080 * we use a new path here.
4081 */
4082 new_path = fs_path_alloc();
4083 if (!new_path)
4084 return -ENOMEM;
4085
4086 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
4087 if (ret < 0) {
4088 fs_path_free(new_path);
4089 return ret;
4090 }
4091 ret = fs_path_add(new_path, ref->name, ref->name_len);
4092 if (ret < 0) {
4093 fs_path_free(new_path);
4094 return ret;
4095 }
4096
4097 fs_path_free(ref->full_path);
4098 set_ref_path(ref, new_path);
4099
4100 return 0;
4101 }
4102
4103 /*
4104 * When processing the new references for an inode we may orphanize an existing
4105 * directory inode because its old name conflicts with one of the new references
4106 * of the current inode. Later, when processing another new reference of our
4107 * inode, we might need to orphanize another inode, but the path we have in the
4108 * reference reflects the pre-orphanization name of the directory we previously
4109 * orphanized. For example:
4110 *
4111 * parent snapshot looks like:
4112 *
4113 * . (ino 256)
4114 * |----- f1 (ino 257)
4115 * |----- f2 (ino 258)
4116 * |----- d1/ (ino 259)
4117 * |       |----- d2/ (ino 260)
4118 *
4119 * send snapshot looks like:
4120 *
4121 * . (ino 256)
4122 * |----- d1 (ino 258)
4123 * |----- f2/ (ino 259)
4124 * |       |----- f2_link/ (ino 260)
4125 * |       |       |----- f1 (ino 257)
4126 * |       |
4127 * |       |----- d2 (ino 258)
4128 *
4129 * When processing inode 257 we compute the name for inode 259 as "d1", and we
4130 * cache it in the name cache. Later when we start processing inode 258, when
4131 * collecting all its new references we set a full path of "d1/d2" for its new
4132 * reference with name "d2". When we start processing the new references we
4133 * start by processing the new reference with name "d1", and this results in
4134 * orphanizing inode 259, since its old reference causes a conflict. Then we
4135 * move on the next new reference, with name "d2", and we find out we must
4136 * orphanize inode 260, as its old reference conflicts with ours - but for the
4137 * orphanization we use a source path corresponding to the path we stored in the
4138 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4139 * receiver fail since the path component "d1/" no longer exists, it was renamed
4140 * to "o259-6-0/" when processing the previous new reference. So in this case we
4141 * must recompute the path in the new reference and use it for the new
4142 * orphanization operation.
4143 */
4144 static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
4145 {
4146 char *name;
4147 int ret;
4148
4149 name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
4150 if (!name)
4151 return -ENOMEM;
4152
4153 fs_path_reset(ref->full_path);
4154 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
4155 if (ret < 0)
4156 goto out;
4157
4158 ret = fs_path_add(ref->full_path, name, ref->name_len);
4159 if (ret < 0)
4160 goto out;
4161
4162 /* Update the reference's base name pointer. */
4163 set_ref_path(ref, ref->full_path);
4164 out:
4165 kfree(name);
4166 return ret;
4167 }
4168
4169 /*
4170 * This does all the move/link/unlink/rmdir magic.
4171 */
4172 static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
4173 {
4174 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4175 int ret = 0;
4176 struct recorded_ref *cur;
4177 struct recorded_ref *cur2;
4178 LIST_HEAD(check_dirs);
4179 struct fs_path *valid_path = NULL;
4180 u64 ow_inode = 0;
4181 u64 ow_gen;
4182 u64 ow_mode;
4183 int did_overwrite = 0;
4184 int is_orphan = 0;
4185 u64 last_dir_ino_rm = 0;
4186 bool can_rename = true;
4187 bool orphanized_dir = false;
4188 bool orphanized_ancestor = false;
4189
4190 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
4191
4192 /*
4193 * This should never happen as the root dir always has the same ref
4194 * which is always '..'
4195 */
4196 if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
4197 btrfs_err(fs_info,
4198 "send: unexpected inode %llu in process_recorded_refs()",
4199 sctx->cur_ino);
4200 ret = -EINVAL;
4201 goto out;
4202 }
4203
4204 valid_path = fs_path_alloc();
4205 if (!valid_path) {
4206 ret = -ENOMEM;
4207 goto out;
4208 }
4209
4210 /*
4211 * First, check if the first ref of the current inode was overwritten
4212 * before. If yes, we know that the current inode was already orphanized
4213 * and thus use the orphan name. If not, we can use get_cur_path to
4214 * get the path of the first ref as it would look like while receiving at
4215 * this point in time.
4216 * New inodes are always orphan at the beginning, so force to use the
4217 * orphan name in this case.
4218 * The first ref is stored in valid_path and will be updated if it
4219 * gets moved around.
4220 */
4221 if (!sctx->cur_inode_new) {
4222 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
4223 sctx->cur_inode_gen);
4224 if (ret < 0)
4225 goto out;
4226 if (ret)
4227 did_overwrite = 1;
4228 }
4229 if (sctx->cur_inode_new || did_overwrite) {
4230 ret = gen_unique_name(sctx, sctx->cur_ino,
4231 sctx->cur_inode_gen, valid_path);
4232 if (ret < 0)
4233 goto out;
4234 is_orphan = 1;
4235 } else {
4236 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4237 valid_path);
4238 if (ret < 0)
4239 goto out;
4240 }
4241
4242 /*
4243 * Before doing any rename and link operations, do a first pass on the
4244 * new references to orphanize any unprocessed inodes that may have a
4245 * reference that conflicts with one of the new references of the current
4246 * inode. This needs to happen first because a new reference may conflict
4247 * with the old reference of a parent directory, so we must make sure
4248 * that the paths used for link and rename commands don't use an
4249 * orphanized name when an ancestor was not yet orphanized.
4250 *
4251 * Example:
4252 *
4253 * Parent snapshot:
4254 *
4255 * . (ino 256)
4256 * |----- testdir/ (ino 259)
4257 * | |----- a (ino 257)
4258 * |
4259 * |----- b (ino 258)
4260 *
4261 * Send snapshot:
4262 *
4263 * . (ino 256)
4264 * |----- testdir_2/ (ino 259)
4265 * | |----- a (ino 260)
4266 * |
4267 * |----- testdir (ino 257)
4268 * |----- b (ino 257)
4269 * |----- b2 (ino 258)
4270 *
4271 * Processing the new reference for inode 257 with name "b" may happen
4272 * before processing the new reference with name "testdir". If so, we
4273 * must make sure that by the time we send a link command to create the
4274 * hard link "b", inode 259 was already orphanized, since the generated
4275 * path in "valid_path" already contains the orphanized name for 259.
4276 * We are processing inode 257, so only later when processing 259 we do
4277 * the rename operation to change its temporary (orphanized) name to
4278 * "testdir_2".
4279 */
4280 list_for_each_entry(cur, &sctx->new_refs, list) {
4281 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
4282 if (ret < 0)
4283 goto out;
4284 if (ret == inode_state_will_create)
4285 continue;
4286
4287 /*
4288 * Check if this new ref would overwrite the first ref of another
4289 * unprocessed inode. If yes, orphanize the overwritten inode.
4290 * If we find an overwritten ref that is not the first ref,
4291 * simply unlink it.
4292 */
4293 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4294 cur->name, cur->name_len,
4295 &ow_inode, &ow_gen, &ow_mode);
4296 if (ret < 0)
4297 goto out;
4298 if (ret) {
4299 ret = is_first_ref(sctx->parent_root,
4300 ow_inode, cur->dir, cur->name,
4301 cur->name_len);
4302 if (ret < 0)
4303 goto out;
4304 if (ret) {
4305 struct name_cache_entry *nce;
4306 struct waiting_dir_move *wdm;
4307
4308 if (orphanized_dir) {
4309 ret = refresh_ref_path(sctx, cur);
4310 if (ret < 0)
4311 goto out;
4312 }
4313
4314 ret = orphanize_inode(sctx, ow_inode, ow_gen,
4315 cur->full_path);
4316 if (ret < 0)
4317 goto out;
4318 if (S_ISDIR(ow_mode))
4319 orphanized_dir = true;
4320
4321 /*
4322 * If ow_inode has its rename operation delayed
4323 * make sure that its orphanized name is used in
4324 * the source path when performing its rename
4325 * operation.
4326 */
4327 wdm = get_waiting_dir_move(sctx, ow_inode);
4328 if (wdm)
4329 wdm->orphanized = true;
4330
4331 /*
4332 * Make sure we clear our orphanized inode's
4333 * name from the name cache. This is because the
4334 * inode ow_inode might be an ancestor of some
4335 * other inode that will be orphanized as well
4336 * later and has an inode number greater than
4337 * sctx->send_progress. We need to prevent
4338 * future name lookups from using the old name
4339 * and get instead the orphan name.
4340 */
4341 nce = name_cache_search(sctx, ow_inode, ow_gen);
4342 if (nce)
4343 btrfs_lru_cache_remove(&sctx->name_cache,
4344 &nce->entry);
4345
4346 /*
4347 * ow_inode might currently be an ancestor of
4348 * cur_ino, therefore compute valid_path (the
4349 * current path of cur_ino) again because it
4350 * might contain the pre-orphanization name of
4351 * ow_inode, which is no longer valid.
4352 */
4353 ret = is_ancestor(sctx->parent_root,
4354 ow_inode, ow_gen,
4355 sctx->cur_ino, NULL);
4356 if (ret > 0) {
4357 orphanized_ancestor = true;
4358 fs_path_reset(valid_path);
4359 ret = get_cur_path(sctx, sctx->cur_ino,
4360 sctx->cur_inode_gen,
4361 valid_path);
4362 }
4363 if (ret < 0)
4364 goto out;
4365 } else {
4366 /*
4367 * If we previously orphanized a directory that
4368 * collided with a new reference that we already
4369 * processed, recompute the current path because
4370 * that directory may be part of the path.
4371 */
4372 if (orphanized_dir) {
4373 ret = refresh_ref_path(sctx, cur);
4374 if (ret < 0)
4375 goto out;
4376 }
4377 ret = send_unlink(sctx, cur->full_path);
4378 if (ret < 0)
4379 goto out;
4380 }
4381 }
4382
4383 }
4384
4385 list_for_each_entry(cur, &sctx->new_refs, list) {
4386 /*
4387 * We may have refs where the parent directory does not exist
4388 * yet. This happens if the parent directory's inum is higher
4389 * than the current inum. To handle this case, we create the
4390 * parent directory out of order. But we need to check if this
4391 * did already happen before due to other refs in the same dir.
4392 */
4393 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
4394 if (ret < 0)
4395 goto out;
4396 if (ret == inode_state_will_create) {
4397 ret = 0;
4398 /*
4399 * First check if any of the current inodes refs did
4400 * already create the dir.
4401 */
4402 list_for_each_entry(cur2, &sctx->new_refs, list) {
4403 if (cur == cur2)
4404 break;
4405 if (cur2->dir == cur->dir) {
4406 ret = 1;
4407 break;
4408 }
4409 }
4410
4411 /*
4412 * If that did not happen, check if a previous inode
4413 * did already create the dir.
4414 */
4415 if (!ret)
4416 ret = did_create_dir(sctx, cur->dir);
4417 if (ret < 0)
4418 goto out;
4419 if (!ret) {
4420 ret = send_create_inode(sctx, cur->dir);
4421 if (ret < 0)
4422 goto out;
4423 cache_dir_created(sctx, cur->dir);
4424 }
4425 }
4426
4427 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4428 ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
4429 if (ret < 0)
4430 goto out;
4431 if (ret == 1) {
4432 can_rename = false;
4433 *pending_move = 1;
4434 }
4435 }
4436
4437 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4438 can_rename) {
4439 ret = wait_for_parent_move(sctx, cur, is_orphan);
4440 if (ret < 0)
4441 goto out;
4442 if (ret == 1) {
4443 can_rename = false;
4444 *pending_move = 1;
4445 }
4446 }
4447
4448 /*
4449 * link/move the ref to the new place. If we have an orphan
4450 * inode, move it and update valid_path. If not, link or move
4451 * it depending on the inode mode.
4452 */
4453 if (is_orphan && can_rename) {
4454 ret = send_rename(sctx, valid_path, cur->full_path);
4455 if (ret < 0)
4456 goto out;
4457 is_orphan = 0;
4458 ret = fs_path_copy(valid_path, cur->full_path);
4459 if (ret < 0)
4460 goto out;
4461 } else if (can_rename) {
4462 if (S_ISDIR(sctx->cur_inode_mode)) {
4463 /*
4464 * Dirs can't be linked, so move it. For moved
4465 * dirs, we always have one new and one deleted
4466 * ref. The deleted ref is ignored later.
4467 */
4468 ret = send_rename(sctx, valid_path,
4469 cur->full_path);
4470 if (!ret)
4471 ret = fs_path_copy(valid_path,
4472 cur->full_path);
4473 if (ret < 0)
4474 goto out;
4475 } else {
4476 /*
4477 * We might have previously orphanized an inode
4478 * which is an ancestor of our current inode,
4479 * so our reference's full path, which was
4480 * computed before any such orphanizations, must
4481 * be updated.
4482 */
4483 if (orphanized_dir) {
4484 ret = update_ref_path(sctx, cur);
4485 if (ret < 0)
4486 goto out;
4487 }
4488 ret = send_link(sctx, cur->full_path,
4489 valid_path);
4490 if (ret < 0)
4491 goto out;
4492 }
4493 }
4494 ret = dup_ref(cur, &check_dirs);
4495 if (ret < 0)
4496 goto out;
4497 }
4498
4499 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4500 /*
4501 * Check if we can already rmdir the directory. If not,
4502 * orphanize it. For every dir item inside that gets deleted
4503 * later, we do this check again and rmdir it then if possible.
4504 * See the use of check_dirs for more details.
4505 */
4506 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4507 if (ret < 0)
4508 goto out;
4509 if (ret) {
4510 ret = send_rmdir(sctx, valid_path);
4511 if (ret < 0)
4512 goto out;
4513 } else if (!is_orphan) {
4514 ret = orphanize_inode(sctx, sctx->cur_ino,
4515 sctx->cur_inode_gen, valid_path);
4516 if (ret < 0)
4517 goto out;
4518 is_orphan = 1;
4519 }
4520
4521 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4522 ret = dup_ref(cur, &check_dirs);
4523 if (ret < 0)
4524 goto out;
4525 }
4526 } else if (S_ISDIR(sctx->cur_inode_mode) &&
4527 !list_empty(&sctx->deleted_refs)) {
4528 /*
4529 * We have a moved dir. Add the old parent to check_dirs
4530 */
4531 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4532 list);
4533 ret = dup_ref(cur, &check_dirs);
4534 if (ret < 0)
4535 goto out;
4536 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
4537 /*
4538 * We have a non dir inode. Go through all deleted refs and
4539 * unlink them if they were not already overwritten by other
4540 * inodes.
4541 */
4542 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4543 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4544 sctx->cur_ino, sctx->cur_inode_gen,
4545 cur->name, cur->name_len);
4546 if (ret < 0)
4547 goto out;
4548 if (!ret) {
4549 /*
4550 * If we orphanized any ancestor before, we need
4551 * to recompute the full path for deleted names,
4552 * since any such path was computed before we
4553 * processed any references and orphanized any
4554 * ancestor inode.
4555 */
4556 if (orphanized_ancestor) {
4557 ret = update_ref_path(sctx, cur);
4558 if (ret < 0)
4559 goto out;
4560 }
4561 ret = send_unlink(sctx, cur->full_path);
4562 if (ret < 0)
4563 goto out;
4564 }
4565 ret = dup_ref(cur, &check_dirs);
4566 if (ret < 0)
4567 goto out;
4568 }
4569 /*
4570 * If the inode is still orphan, unlink the orphan. This may
4571 * happen when a previous inode did overwrite the first ref
4572 * of this inode and no new refs were added for the current
4573 * inode. Unlinking does not mean that the inode is deleted in
4574 * all cases. There may still be links to this inode in other
4575 * places.
4576 */
4577 if (is_orphan) {
4578 ret = send_unlink(sctx, valid_path);
4579 if (ret < 0)
4580 goto out;
4581 }
4582 }
4583
4584 /*
4585 * We did collect all parent dirs where cur_inode was once located. We
4586 * now go through all these dirs and check if they are pending for
4587 * deletion and if it's finally possible to perform the rmdir now.
4588 * We also update the inode stats of the parent dirs here.
4589 */
4590 list_for_each_entry(cur, &check_dirs, list) {
4591 /*
4592 * In case we had refs into dirs that were not processed yet,
4593 * we don't need to do the utime and rmdir logic for these dirs.
4594 * The dir will be processed later.
4595 */
4596 if (cur->dir > sctx->cur_ino)
4597 continue;
4598
4599 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
4600 if (ret < 0)
4601 goto out;
4602
4603 if (ret == inode_state_did_create ||
4604 ret == inode_state_no_change) {
4605 ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
4606 if (ret < 0)
4607 goto out;
4608 } else if (ret == inode_state_did_delete &&
4609 cur->dir != last_dir_ino_rm) {
4610 ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
4611 if (ret < 0)
4612 goto out;
4613 if (ret) {
4614 ret = get_cur_path(sctx, cur->dir,
4615 cur->dir_gen, valid_path);
4616 if (ret < 0)
4617 goto out;
4618 ret = send_rmdir(sctx, valid_path);
4619 if (ret < 0)
4620 goto out;
4621 last_dir_ino_rm = cur->dir;
4622 }
4623 }
4624 }
4625
4626 ret = 0;
4627
4628 out:
4629 __free_recorded_refs(&check_dirs);
4630 free_recorded_refs(sctx);
4631 fs_path_free(valid_path);
4632 return ret;
4633 }
4634
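/*
 * Comparator for the rbtrees of recorded references: orders entries by
 * directory inode, directory generation, name length and finally name.
 */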
4635 static int rbtree_ref_comp(const void *k, const struct rb_node *node)
4636 {
4637 const struct recorded_ref *data = k;
4638 const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
4639 int result;
4640
4641 if (data->dir > ref->dir)
4642 return 1;
4643 if (data->dir < ref->dir)
4644 return -1;
4645 if (data->dir_gen > ref->dir_gen)
4646 return 1;
4647 if (data->dir_gen < ref->dir_gen)
4648 return -1;
4649 if (data->name_len > ref->name_len)
4650 return 1;
4651 if (data->name_len < ref->name_len)
4652 return -1;
4653 result = strcmp(data->name, ref->name);
4654 if (result > 0)
4655 return 1;
4656 if (result < 0)
4657 return -1;
4658 return 0;
4659 }
4660
4661 static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
4662 {
4663 const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
4664
4665 return rbtree_ref_comp(entry, parent) < 0;
4666 }
4667
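/*
 * Record a new or deleted reference: build its full path from the current
 * path of the parent directory plus the name, then add it to the given list
 * and rbtree. On failure all allocations made here are released.
 */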
4668 static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
4669 struct fs_path *name, u64 dir, u64 dir_gen,
4670 struct send_ctx *sctx)
4671 {
4672 int ret = 0;
4673 struct fs_path *path = NULL;
4674 struct recorded_ref *ref = NULL;
4675
4676 path = fs_path_alloc();
4677 if (!path) {
4678 ret = -ENOMEM;
4679 goto out;
4680 }
4681
4682 ref = recorded_ref_alloc();
4683 if (!ref) {
4684 ret = -ENOMEM;
4685 goto out;
4686 }
4687
4688 ret = get_cur_path(sctx, dir, dir_gen, path);
4689 if (ret < 0)
4690 goto out;
4691 ret = fs_path_add_path(path, name);
4692 if (ret < 0)
4693 goto out;
4694
4695 ref->dir = dir;
4696 ref->dir_gen = dir_gen;
4697 set_ref_path(ref, path);
4698 list_add_tail(&ref->list, refs);
4699 rb_add(&ref->node, root, rbtree_ref_less);
4700 ref->root = root;
4701 out:
4702 if (ret) {
4703 if (path && (!ref || !ref->full_path))
4704 fs_path_free(path);
4705 recorded_ref_free(ref);
4706 }
4707 return ret;
4708 }
4709
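/*
 * iterate_inode_ref() callback for refs in the send snapshot. If an identical
 * reference (same dir, generation and name) was already recorded as deleted,
 * the reference is unchanged, so drop the deleted entry; otherwise record it
 * as a new reference.
 */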
4710 static int record_new_ref_if_needed(int num, u64 dir, int index,
4711 struct fs_path *name, void *ctx)
4712 {
4713 int ret = 0;
4714 struct send_ctx *sctx = ctx;
4715 struct rb_node *node = NULL;
4716 struct recorded_ref data;
4717 struct recorded_ref *ref;
4718 u64 dir_gen;
4719
4720 ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
4721 if (ret < 0)
4722 goto out;
4723
4724 data.dir = dir;
4725 data.dir_gen = dir_gen;
4726 set_ref_path(&data, name);
4727 node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
4728 if (node) {
4729 ref = rb_entry(node, struct recorded_ref, node);
4730 recorded_ref_free(ref);
4731 } else {
4732 ret = record_ref_in_tree(&sctx->rbtree_new_refs,
4733 &sctx->new_refs, name, dir, dir_gen,
4734 sctx);
4735 }
4736 out:
4737 return ret;
4738 }
4739
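/*
 * iterate_inode_ref() callback for refs in the parent snapshot. If an
 * identical reference was already recorded as new, the reference is
 * unchanged, so drop the new entry; otherwise record it as a deleted
 * reference.
 */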
4740 static int record_deleted_ref_if_needed(int num, u64 dir, int index,
4741 struct fs_path *name, void *ctx)
4742 {
4743 int ret = 0;
4744 struct send_ctx *sctx = ctx;
4745 struct rb_node *node = NULL;
4746 struct recorded_ref data;
4747 struct recorded_ref *ref;
4748 u64 dir_gen;
4749
4750 ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
4751 if (ret < 0)
4752 goto out;
4753
4754 data.dir = dir;
4755 data.dir_gen = dir_gen;
4756 set_ref_path(&data, name);
4757 node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
4758 if (node) {
4759 ref = rb_entry(node, struct recorded_ref, node);
4760 recorded_ref_free(ref);
4761 } else {
4762 ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
4763 &sctx->deleted_refs, name, dir,
4764 dir_gen, sctx);
4765 }
4766 out:
4767 return ret;
4768 }
4769
4770 static int record_new_ref(struct send_ctx *sctx)
4771 {
4772 int ret;
4773
4774 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4775 sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
4776 if (ret < 0)
4777 goto out;
4778 ret = 0;
4779
4780 out:
4781 return ret;
4782 }
4783
4784 static int record_deleted_ref(struct send_ctx *sctx)
4785 {
4786 int ret;
4787
4788 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4789 sctx->cmp_key, 0, record_deleted_ref_if_needed,
4790 sctx);
4791 if (ret < 0)
4792 goto out;
4793 ret = 0;
4794
4795 out:
4796 return ret;
4797 }
4798
4799 static int record_changed_ref(struct send_ctx *sctx)
4800 {
4801 int ret = 0;
4802
4803 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4804 sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
4805 if (ret < 0)
4806 goto out;
4807 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4808 sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
4809 if (ret < 0)
4810 goto out;
4811 ret = 0;
4812
4813 out:
4814 return ret;
4815 }
4816
4817 /*
4818 * Record and process all refs at once. Needed when an inode changes the
4819 * generation number, which means that it was deleted and recreated.
4820 */
4821 static int process_all_refs(struct send_ctx *sctx,
4822 enum btrfs_compare_tree_result cmd)
4823 {
4824 int ret = 0;
4825 int iter_ret = 0;
4826 struct btrfs_root *root;
4827 struct btrfs_path *path;
4828 struct btrfs_key key;
4829 struct btrfs_key found_key;
4830 iterate_inode_ref_t cb;
4831 int pending_move = 0;
4832
4833 path = alloc_path_for_send();
4834 if (!path)
4835 return -ENOMEM;
4836
4837 if (cmd == BTRFS_COMPARE_TREE_NEW) {
4838 root = sctx->send_root;
4839 cb = record_new_ref_if_needed;
4840 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4841 root = sctx->parent_root;
4842 cb = record_deleted_ref_if_needed;
4843 } else {
4844 btrfs_err(sctx->send_root->fs_info,
4845 "Wrong command %d in process_all_refs", cmd);
4846 ret = -EINVAL;
4847 goto out;
4848 }
4849
4850 key.objectid = sctx->cmp_key->objectid;
4851 key.type = BTRFS_INODE_REF_KEY;
4852 key.offset = 0;
4853 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4854 if (found_key.objectid != key.objectid ||
4855 (found_key.type != BTRFS_INODE_REF_KEY &&
4856 found_key.type != BTRFS_INODE_EXTREF_KEY))
4857 break;
4858
4859 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
4860 if (ret < 0)
4861 goto out;
4862 }
4863 /* Catch error found during iteration */
4864 if (iter_ret < 0) {
4865 ret = iter_ret;
4866 goto out;
4867 }
4868 btrfs_release_path(path);
4869
4870 /*
4871 * We don't actually care about pending_move as we are simply
4872 * re-creating this inode and will be rename'ing it into place once we
4873 * rename the parent directory.
4874 */
4875 ret = process_recorded_refs(sctx, &pending_move);
4876 out:
4877 btrfs_free_path(path);
4878 return ret;
4879 }
4880
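/* Emit a SET_XATTR command for the given path, xattr name and value. */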
4881 static int send_set_xattr(struct send_ctx *sctx,
4882 struct fs_path *path,
4883 const char *name, int name_len,
4884 const char *data, int data_len)
4885 {
4886 int ret = 0;
4887
4888 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
4889 if (ret < 0)
4890 goto out;
4891
4892 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4893 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4894 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4895
4896 ret = send_cmd(sctx);
4897
4898 tlv_put_failure:
4899 out:
4900 return ret;
4901 }
4902
4903 static int send_remove_xattr(struct send_ctx *sctx,
4904 struct fs_path *path,
4905 const char *name, int name_len)
4906 {
4907 int ret = 0;
4908
4909 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
4910 if (ret < 0)
4911 goto out;
4912
4913 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4914 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4915
4916 ret = send_cmd(sctx);
4917
4918 tlv_put_failure:
4919 out:
4920 return ret;
4921 }
4922
4923 static int __process_new_xattr(int num, struct btrfs_key *di_key,
4924 const char *name, int name_len, const char *data,
4925 int data_len, void *ctx)
4926 {
4927 int ret;
4928 struct send_ctx *sctx = ctx;
4929 struct fs_path *p;
4930 struct posix_acl_xattr_header dummy_acl;
4931
4932 /* Capabilities are emitted by finish_inode_if_needed */
4933 if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4934 return 0;
4935
4936 p = fs_path_alloc();
4937 if (!p)
4938 return -ENOMEM;
4939
4940 /*
4941 * This hack is needed because empty acls are stored as zero byte
4942 * data in xattrs. The problem with that is that receiving these zero byte
4943 * acls will fail later. To fix this, we send a dummy acl list that
4944 * only contains the version number and no entries.
4945 */
4946 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
4947 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4948 if (data_len == 0) {
4949 dummy_acl.a_version =
4950 cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4951 data = (char *)&dummy_acl;
4952 data_len = sizeof(dummy_acl);
4953 }
4954 }
4955
4956 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4957 if (ret < 0)
4958 goto out;
4959
4960 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
4961
4962 out:
4963 fs_path_free(p);
4964 return ret;
4965 }
4966
4967 static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4968 const char *name, int name_len,
4969 const char *data, int data_len, void *ctx)
4970 {
4971 int ret;
4972 struct send_ctx *sctx = ctx;
4973 struct fs_path *p;
4974
4975 p = fs_path_alloc();
4976 if (!p)
4977 return -ENOMEM;
4978
4979 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4980 if (ret < 0)
4981 goto out;
4982
4983 ret = send_remove_xattr(sctx, p, name, name_len);
4984
4985 out:
4986 fs_path_free(p);
4987 return ret;
4988 }
4989
4990 static int process_new_xattr(struct send_ctx *sctx)
4991 {
4992 int ret = 0;
4993
4994 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4995 __process_new_xattr, sctx);
4996
4997 return ret;
4998 }
4999
5000 static int process_deleted_xattr(struct send_ctx *sctx)
5001 {
5002 return iterate_dir_item(sctx->parent_root, sctx->right_path,
5003 __process_deleted_xattr, sctx);
5004 }
5005
5006 struct find_xattr_ctx {
5007 const char *name;
5008 int name_len;
5009 int found_idx;
5010 char *found_data;
5011 int found_data_len;
5012 };
5013
5014 static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
5015 int name_len, const char *data, int data_len, void *vctx)
5016 {
5017 struct find_xattr_ctx *ctx = vctx;
5018
5019 if (name_len == ctx->name_len &&
5020 strncmp(name, ctx->name, name_len) == 0) {
5021 ctx->found_idx = num;
5022 ctx->found_data_len = data_len;
5023 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
5024 if (!ctx->found_data)
5025 return -ENOMEM;
5026 return 1;
5027 }
5028 return 0;
5029 }
5030
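/*
 * Look up an xattr with the given name among the dir items of the given key.
 * Returns the index of the matching xattr (copying its value into @data and
 * its size into @data_len when @data is not NULL), -ENOENT if there is no
 * match, or < 0 on error.
 */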
5031 static int find_xattr(struct btrfs_root *root,
5032 struct btrfs_path *path,
5033 struct btrfs_key *key,
5034 const char *name, int name_len,
5035 char **data, int *data_len)
5036 {
5037 int ret;
5038 struct find_xattr_ctx ctx;
5039
5040 ctx.name = name;
5041 ctx.name_len = name_len;
5042 ctx.found_idx = -1;
5043 ctx.found_data = NULL;
5044 ctx.found_data_len = 0;
5045
5046 ret = iterate_dir_item(root, path, __find_xattr, &ctx);
5047 if (ret < 0)
5048 return ret;
5049
5050 if (ctx.found_idx == -1)
5051 return -ENOENT;
5052 if (data) {
5053 *data = ctx.found_data;
5054 *data_len = ctx.found_data_len;
5055 } else {
5056 kfree(ctx.found_data);
5057 }
5058 return ctx.found_idx;
5059 }
5060
5061
5062 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
5063 const char *name, int name_len,
5064 const char *data, int data_len,
5065 void *ctx)
5066 {
5067 int ret;
5068 struct send_ctx *sctx = ctx;
5069 char *found_data = NULL;
5070 int found_data_len = 0;
5071
5072 ret = find_xattr(sctx->parent_root, sctx->right_path,
5073 sctx->cmp_key, name, name_len, &found_data,
5074 &found_data_len);
5075 if (ret == -ENOENT) {
5076 ret = __process_new_xattr(num, di_key, name, name_len, data,
5077 data_len, ctx);
5078 } else if (ret >= 0) {
5079 if (data_len != found_data_len ||
5080 memcmp(data, found_data, data_len)) {
5081 ret = __process_new_xattr(num, di_key, name, name_len,
5082 data, data_len, ctx);
5083 } else {
5084 ret = 0;
5085 }
5086 }
5087
5088 kfree(found_data);
5089 return ret;
5090 }
5091
5092 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
5093 const char *name, int name_len,
5094 const char *data, int data_len,
5095 void *ctx)
5096 {
5097 int ret;
5098 struct send_ctx *sctx = ctx;
5099
5100 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
5101 name, name_len, NULL, NULL);
5102 if (ret == -ENOENT)
5103 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
5104 data_len, ctx);
5105 else if (ret >= 0)
5106 ret = 0;
5107
5108 return ret;
5109 }
5110
5111 static int process_changed_xattr(struct send_ctx *sctx)
5112 {
5113 int ret = 0;
5114
5115 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
5116 __process_changed_new_xattr, sctx);
5117 if (ret < 0)
5118 goto out;
5119 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
5120 __process_changed_deleted_xattr, sctx);
5121
5122 out:
5123 return ret;
5124 }
5125
5126 static int process_all_new_xattrs(struct send_ctx *sctx)
5127 {
5128 int ret = 0;
5129 int iter_ret = 0;
5130 struct btrfs_root *root;
5131 struct btrfs_path *path;
5132 struct btrfs_key key;
5133 struct btrfs_key found_key;
5134
5135 path = alloc_path_for_send();
5136 if (!path)
5137 return -ENOMEM;
5138
5139 root = sctx->send_root;
5140
5141 key.objectid = sctx->cmp_key->objectid;
5142 key.type = BTRFS_XATTR_ITEM_KEY;
5143 key.offset = 0;
5144 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
5145 if (found_key.objectid != key.objectid ||
5146 found_key.type != key.type) {
5147 ret = 0;
5148 break;
5149 }
5150
5151 ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
5152 if (ret < 0)
5153 break;
5154 }
5155 /* Catch error found during iteration */
5156 if (iter_ret < 0)
5157 ret = iter_ret;
5158
5159 btrfs_free_path(path);
5160 return ret;
5161 }
5162
5163 static int send_verity(struct send_ctx *sctx, struct fs_path *path,
5164 struct fsverity_descriptor *desc)
5165 {
5166 int ret;
5167
5168 ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
5169 if (ret < 0)
5170 goto out;
5171
5172 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
5173 TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
5174 le8_to_cpu(desc->hash_algorithm));
5175 TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
5176 1U << le8_to_cpu(desc->log_blocksize));
5177 TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
5178 le8_to_cpu(desc->salt_size));
5179 TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
5180 le32_to_cpu(desc->sig_size));
5181
5182 ret = send_cmd(sctx);
5183
5184 tlv_put_failure:
5185 out:
5186 return ret;
5187 }
5188
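/*
 * Read the fs-verity descriptor of the current inode and emit an
 * ENABLE_VERITY command describing it.
 */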
5189 static int process_verity(struct send_ctx *sctx)
5190 {
5191 int ret = 0;
5192 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5193 struct inode *inode;
5194 struct fs_path *p;
5195
5196 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
5197 if (IS_ERR(inode))
5198 return PTR_ERR(inode);
5199
5200 ret = btrfs_get_verity_descriptor(inode, NULL, 0);
5201 if (ret < 0)
5202 goto iput;
5203
5204 if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
5205 ret = -EMSGSIZE;
5206 goto iput;
5207 }
5208 if (!sctx->verity_descriptor) {
5209 sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
5210 GFP_KERNEL);
5211 if (!sctx->verity_descriptor) {
5212 ret = -ENOMEM;
5213 goto iput;
5214 }
5215 }
5216
5217 ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
5218 if (ret < 0)
5219 goto iput;
5220
5221 p = fs_path_alloc();
5222 if (!p) {
5223 ret = -ENOMEM;
5224 goto iput;
5225 }
5226 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5227 if (ret < 0)
5228 goto free_path;
5229
5230 ret = send_verity(sctx, p, sctx->verity_descriptor);
5231 if (ret < 0)
5232 goto free_path;
5233
5234 free_path:
5235 fs_path_free(p);
5236 iput:
5237 iput(inode);
5238 return ret;
5239 }
5240
5241 static inline u64 max_send_read_size(const struct send_ctx *sctx)
5242 {
5243 return sctx->send_max_size - SZ_16K;
5244 }
5245
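/*
 * Append the header of a data attribute to the send buffer. For protocol v1
 * the header carries an explicit length, while for v2+ only the attribute
 * type is written since the data extends to the end of the command. Fails
 * with -EOVERFLOW if the header plus data would not fit in the send buffer.
 */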
5246 static int put_data_header(struct send_ctx *sctx, u32 len)
5247 {
5248 if (WARN_ON_ONCE(sctx->put_data))
5249 return -EINVAL;
5250 sctx->put_data = true;
5251 if (sctx->proto >= 2) {
5252 /*
5253 * Since v2, the data attribute header doesn't include a length,
5254 * the data implicitly extends to the end of the command.
5255 */
5256 if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
5257 return -EOVERFLOW;
5258 put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
5259 sctx->send_size += sizeof(__le16);
5260 } else {
5261 struct btrfs_tlv_header *hdr;
5262
5263 if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
5264 return -EOVERFLOW;
5265 hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
5266 put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
5267 put_unaligned_le16(len, &hdr->tlv_len);
5268 sctx->send_size += sizeof(*hdr);
5269 }
5270 return 0;
5271 }
5272
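/*
 * Copy @len bytes of file data starting at @offset from the inode's page
 * cache into the send buffer, reading pages (with readahead) when they are
 * not already up to date.
 */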
5273 static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
5274 {
5275 struct btrfs_root *root = sctx->send_root;
5276 struct btrfs_fs_info *fs_info = root->fs_info;
5277 struct page *page;
5278 pgoff_t index = offset >> PAGE_SHIFT;
5279 pgoff_t last_index;
5280 unsigned pg_offset = offset_in_page(offset);
5281 int ret;
5282
5283 ret = put_data_header(sctx, len);
5284 if (ret)
5285 return ret;
5286
5287 last_index = (offset + len - 1) >> PAGE_SHIFT;
5288
5289 while (index <= last_index) {
5290 unsigned cur_len = min_t(unsigned, len,
5291 PAGE_SIZE - pg_offset);
5292
5293 page = find_lock_page(sctx->cur_inode->i_mapping, index);
5294 if (!page) {
5295 page_cache_sync_readahead(sctx->cur_inode->i_mapping,
5296 &sctx->ra, NULL, index,
5297 last_index + 1 - index);
5298
5299 page = find_or_create_page(sctx->cur_inode->i_mapping,
5300 index, GFP_KERNEL);
5301 if (!page) {
5302 ret = -ENOMEM;
5303 break;
5304 }
5305 }
5306
5307 if (PageReadahead(page))
5308 page_cache_async_readahead(sctx->cur_inode->i_mapping,
5309 &sctx->ra, NULL, page_folio(page),
5310 index, last_index + 1 - index);
5311
5312 if (!PageUptodate(page)) {
5313 btrfs_read_folio(NULL, page_folio(page));
5314 lock_page(page);
5315 if (!PageUptodate(page)) {
5316 unlock_page(page);
5317 btrfs_err(fs_info,
5318 "send: IO error at offset %llu for inode %llu root %llu",
5319 page_offset(page), sctx->cur_ino,
5320 sctx->send_root->root_key.objectid);
5321 put_page(page);
5322 ret = -EIO;
5323 break;
5324 }
5325 }
5326
5327 memcpy_from_page(sctx->send_buf + sctx->send_size, page,
5328 pg_offset, cur_len);
5329 unlock_page(page);
5330 put_page(page);
5331 index++;
5332 pg_offset = 0;
5333 len -= cur_len;
5334 sctx->send_size += cur_len;
5335 }
5336
5337 return ret;
5338 }
5339
5340 /*
5341 * Read some bytes from the current inode/file and send a write command to
5342 * user space.
5343 */
5344 static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5345 {
5346 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5347 int ret = 0;
5348 struct fs_path *p;
5349
5350 p = fs_path_alloc();
5351 if (!p)
5352 return -ENOMEM;
5353
5354 btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5355
5356 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5357 if (ret < 0)
5358 goto out;
5359
5360 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5361 if (ret < 0)
5362 goto out;
5363
5364 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5365 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5366 ret = put_file_data(sctx, offset, len);
5367 if (ret < 0)
5368 goto out;
5369
5370 ret = send_cmd(sctx);
5371
5372 tlv_put_failure:
5373 out:
5374 fs_path_free(p);
5375 return ret;
5376 }
5377
5378 /*
5379 * Send a clone command to user space.
5380 */
5381 static int send_clone(struct send_ctx *sctx,
5382 u64 offset, u32 len,
5383 struct clone_root *clone_root)
5384 {
5385 int ret = 0;
5386 struct fs_path *p;
5387 u64 gen;
5388
5389 btrfs_debug(sctx->send_root->fs_info,
5390 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5391 offset, len, clone_root->root->root_key.objectid,
5392 clone_root->ino, clone_root->offset);
5393
5394 p = fs_path_alloc();
5395 if (!p)
5396 return -ENOMEM;
5397
5398 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
5399 if (ret < 0)
5400 goto out;
5401
5402 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5403 if (ret < 0)
5404 goto out;
5405
5406 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5407 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5408 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5409
5410 if (clone_root->root == sctx->send_root) {
5411 ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
5412 if (ret < 0)
5413 goto out;
5414 ret = get_cur_path(sctx, clone_root->ino, gen, p);
5415 } else {
5416 ret = get_inode_path(clone_root->root, clone_root->ino, p);
5417 }
5418 if (ret < 0)
5419 goto out;
5420
5421 /*
5422 * If the parent we're using has a received_uuid set then use that as
5423 * our clone source as that is what we will look for when doing a
5424 * receive.
5425 *
5426 * This covers the case that we create a snapshot off of a received
5427 * subvolume and then use that as the parent and try to receive on a
5428 * different host.
5429 */
5430 if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
5431 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5432 clone_root->root->root_item.received_uuid);
5433 else
5434 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5435 clone_root->root->root_item.uuid);
5436 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5437 btrfs_root_ctransid(&clone_root->root->root_item));
5438 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5439 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5440 clone_root->offset);
5441
5442 ret = send_cmd(sctx);
5443
5444 tlv_put_failure:
5445 out:
5446 fs_path_free(p);
5447 return ret;
5448 }
5449
5450 /*
5451 * Send an update extent command to user space.
5452 */
5453 static int send_update_extent(struct send_ctx *sctx,
5454 u64 offset, u32 len)
5455 {
5456 int ret = 0;
5457 struct fs_path *p;
5458
5459 p = fs_path_alloc();
5460 if (!p)
5461 return -ENOMEM;
5462
5463 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
5464 if (ret < 0)
5465 goto out;
5466
5467 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5468 if (ret < 0)
5469 goto out;
5470
5471 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5472 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5473 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5474
5475 ret = send_cmd(sctx);
5476
5477 tlv_put_failure:
5478 out:
5479 fs_path_free(p);
5480 return ret;
5481 }
5482
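/*
 * Emit write commands of zeroes covering the hole from the end of the last
 * processed extent up to @end (capped at the inode's size), or a single
 * update extent command when the stream carries no file data.
 */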
5483 static int send_hole(struct send_ctx *sctx, u64 end)
5484 {
5485 struct fs_path *p = NULL;
5486 u64 read_size = max_send_read_size(sctx);
5487 u64 offset = sctx->cur_inode_last_extent;
5488 int ret = 0;
5489
5490 /*
5491 * A hole that starts at EOF or beyond it. Since we do not yet support
5492 * fallocate (for extent preallocation and hole punching), sending a
5493 * write of zeroes starting at EOF or beyond would later require issuing
5494 * a truncate operation which would undo the write and achieve nothing.
5495 */
5496 if (offset >= sctx->cur_inode_size)
5497 return 0;
5498
5499 /*
5500 * Don't go beyond the inode's i_size due to prealloc extents that start
5501 * after the i_size.
5502 */
5503 end = min_t(u64, end, sctx->cur_inode_size);
5504
5505 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5506 return send_update_extent(sctx, offset, end - offset);
5507
5508 p = fs_path_alloc();
5509 if (!p)
5510 return -ENOMEM;
5511 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5512 if (ret < 0)
5513 goto tlv_put_failure;
5514 while (offset < end) {
5515 u64 len = min(end - offset, read_size);
5516
5517 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5518 if (ret < 0)
5519 break;
5520 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5521 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5522 ret = put_data_header(sctx, len);
5523 if (ret < 0)
5524 break;
5525 memset(sctx->send_buf + sctx->send_size, 0, len);
5526 sctx->send_size += len;
5527 ret = send_cmd(sctx);
5528 if (ret < 0)
5529 break;
5530 offset += len;
5531 }
5532 sctx->cur_inode_next_write_offset = offset;
5533 tlv_put_failure:
5534 fs_path_free(p);
5535 return ret;
5536 }
5537
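/*
 * Send an inline extent as an encoded write: the extent's on-disk
 * (compressed) data is copied verbatim from the leaf into the send buffer,
 * together with the attributes needed to decode it on the receiving side.
 */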
5538 static int send_encoded_inline_extent(struct send_ctx *sctx,
5539 struct btrfs_path *path, u64 offset,
5540 u64 len)
5541 {
5542 struct btrfs_root *root = sctx->send_root;
5543 struct btrfs_fs_info *fs_info = root->fs_info;
5544 struct inode *inode;
5545 struct fs_path *fspath;
5546 struct extent_buffer *leaf = path->nodes[0];
5547 struct btrfs_key key;
5548 struct btrfs_file_extent_item *ei;
5549 u64 ram_bytes;
5550 size_t inline_size;
5551 int ret;
5552
5553 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
5554 if (IS_ERR(inode))
5555 return PTR_ERR(inode);
5556
5557 fspath = fs_path_alloc();
5558 if (!fspath) {
5559 ret = -ENOMEM;
5560 goto out;
5561 }
5562
5563 ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
5564 if (ret < 0)
5565 goto out;
5566
5567 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5568 if (ret < 0)
5569 goto out;
5570
5571 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5572 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
5573 ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
5574 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
5575
5576 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5577 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5578 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5579 min(key.offset + ram_bytes - offset, len));
5580 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
5581 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
5582 ret = btrfs_encoded_io_compression_from_extent(fs_info,
5583 btrfs_file_extent_compression(leaf, ei));
5584 if (ret < 0)
5585 goto out;
5586 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5587
5588 ret = put_data_header(sctx, inline_size);
5589 if (ret < 0)
5590 goto out;
5591 read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
5592 btrfs_file_extent_inline_start(ei), inline_size);
5593 sctx->send_size += inline_size;
5594
5595 ret = send_cmd(sctx);
5596
5597 tlv_put_failure:
5598 out:
5599 fs_path_free(fspath);
5600 iput(inode);
5601 return ret;
5602 }
5603
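/*
 * Send a regular (compressed) extent as an encoded write. The on-disk extent
 * data is read directly into the send buffer, page aligned after the command
 * attributes, and the command plus data are written out with a manually
 * computed length and checksum in the command header.
 */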
5604 static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
5605 u64 offset, u64 len)
5606 {
5607 struct btrfs_root *root = sctx->send_root;
5608 struct btrfs_fs_info *fs_info = root->fs_info;
5609 struct inode *inode;
5610 struct fs_path *fspath;
5611 struct extent_buffer *leaf = path->nodes[0];
5612 struct btrfs_key key;
5613 struct btrfs_file_extent_item *ei;
5614 u64 disk_bytenr, disk_num_bytes;
5615 u32 data_offset;
5616 struct btrfs_cmd_header *hdr;
5617 u32 crc;
5618 int ret;
5619
5620 inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
5621 if (IS_ERR(inode))
5622 return PTR_ERR(inode);
5623
5624 fspath = fs_path_alloc();
5625 if (!fspath) {
5626 ret = -ENOMEM;
5627 goto out;
5628 }
5629
5630 ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
5631 if (ret < 0)
5632 goto out;
5633
5634 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5635 if (ret < 0)
5636 goto out;
5637
5638 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5639 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
5640 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
5641 disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
5642
5643 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5644 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5645 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5646 min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
5647 len));
5648 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
5649 btrfs_file_extent_ram_bytes(leaf, ei));
5650 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
5651 offset - key.offset + btrfs_file_extent_offset(leaf, ei));
5652 ret = btrfs_encoded_io_compression_from_extent(fs_info,
5653 btrfs_file_extent_compression(leaf, ei));
5654 if (ret < 0)
5655 goto out;
5656 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5657 TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
5658
5659 ret = put_data_header(sctx, disk_num_bytes);
5660 if (ret < 0)
5661 goto out;
5662
5663 /*
5664 * We want to do I/O directly into the send buffer, so get the next page
5665 * boundary in the send buffer. This means that there may be a gap
5666 * between the beginning of the command and the file data.
5667 */
5668 data_offset = PAGE_ALIGN(sctx->send_size);
5669 if (data_offset > sctx->send_max_size ||
5670 sctx->send_max_size - data_offset < disk_num_bytes) {
5671 ret = -EOVERFLOW;
5672 goto out;
5673 }
5674
5675 /*
5676 * Note that send_buf is a mapping of send_buf_pages, so this is really
5677 * reading into send_buf.
5678 */
5679 ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
5680 disk_bytenr, disk_num_bytes,
5681 sctx->send_buf_pages +
5682 (data_offset >> PAGE_SHIFT));
5683 if (ret)
5684 goto out;
5685
5686 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
5687 hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
5688 hdr->crc = 0;
5689 crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
5690 crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
5691 hdr->crc = cpu_to_le32(crc);
5692
5693 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
5694 &sctx->send_off);
5695 if (!ret) {
5696 ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
5697 disk_num_bytes, &sctx->send_off);
5698 }
5699 sctx->send_size = 0;
5700 sctx->put_data = false;
5701
5702 tlv_put_failure:
5703 out:
5704 fs_path_free(fspath);
5705 iput(inode);
5706 return ret;
5707 }
5708
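/*
 * Send the data of a file extent range: as an encoded write when the stream
 * allows compressed data and the compressed extent is not larger than the
 * range being sent, otherwise as a series of regular write commands. When the
 * inode's page cache was empty before we started, the sent range is evicted
 * from the page cache afterwards to avoid polluting it.
 */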
5709 static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
5710 const u64 offset, const u64 len)
5711 {
5712 const u64 end = offset + len;
5713 struct extent_buffer *leaf = path->nodes[0];
5714 struct btrfs_file_extent_item *ei;
5715 u64 read_size = max_send_read_size(sctx);
5716 u64 sent = 0;
5717
5718 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5719 return send_update_extent(sctx, offset, len);
5720
5721 ei = btrfs_item_ptr(leaf, path->slots[0],
5722 struct btrfs_file_extent_item);
5723 if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5724 btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
5725 bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
5726 BTRFS_FILE_EXTENT_INLINE);
5727
5728 /*
5729 * Send the compressed extent unless the compressed data is
5730 * larger than the decompressed data. This can happen if we're
5731 * not sending the entire extent, either because it has been
5732 * partially overwritten/truncated or because this is a part of
5733 * the extent that we couldn't clone in clone_range().
5734 */
5735 if (is_inline &&
5736 btrfs_file_extent_inline_item_len(leaf,
5737 path->slots[0]) <= len) {
5738 return send_encoded_inline_extent(sctx, path, offset,
5739 len);
5740 } else if (!is_inline &&
5741 btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
5742 return send_encoded_extent(sctx, path, offset, len);
5743 }
5744 }
5745
5746 if (sctx->cur_inode == NULL) {
5747 struct btrfs_root *root = sctx->send_root;
5748
5749 sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
5750 if (IS_ERR(sctx->cur_inode)) {
5751 int err = PTR_ERR(sctx->cur_inode);
5752
5753 sctx->cur_inode = NULL;
5754 return err;
5755 }
5756 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
5757 file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
5758
5759 /*
5760 * It's very likely there are no pages from this inode in the page
5761 * cache, so after reading extents and sending their data, we clean
5762 * the page cache to avoid trashing the page cache (adding pressure
5763 * to the page cache and forcing eviction of other data more useful
5764 * for applications).
5765 *
5766 * We decide if we should clean the page cache simply by checking
5767 * if the inode's mapping nrpages is 0 when we first open it, and
5768 * not by using something like filemap_range_has_page() before
5769 * reading an extent because when we ask the readahead code to
5770 * read a given file range, it may (and almost always does) read
5771 * pages from beyond that range (see the documentation for
5772 * page_cache_sync_readahead()), so it would not be reliable,
5773 * because after reading the first extent future calls to
5774 * filemap_range_has_page() would return true because the readahead
5775 * on the previous extent resulted in reading pages of the current
5776 * extent as well.
5777 */
5778 sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
5779 sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5780 }
5781
5782 while (sent < len) {
5783 u64 size = min(len - sent, read_size);
5784 int ret;
5785
5786 ret = send_write(sctx, offset + sent, size);
5787 if (ret < 0)
5788 return ret;
5789 sent += size;
5790 }
5791
5792 if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
5793 /*
5794 * Always operate only on ranges that are a multiple of the page
5795 * size. This is not only to prevent zeroing parts of a page in
5796 * the case of subpage sector size, but also to guarantee we evict
5797 * pages, as passing a range that is smaller than page size does
5798 * not evict the respective page (only zeroes part of its content).
5799 *
5800 * Always start from the end offset of the last range cleared.
5801 * This is because the readahead code may (and very often does)
5802 * read pages beyond the range we request for readahead. So if
5803 * we have an extent layout like this:
5804 *
5805 * [ extent A ] [ extent B ] [ extent C ]
5806 *
5807 * When we ask page_cache_sync_readahead() to read extent A, it
5808 * may also trigger reads for pages of extent B. If we are doing
5809 * an incremental send and extent B has not changed between the
5810 * parent and send snapshots, some or all of its pages may end
5811 * up being read and placed in the page cache. So when truncating
5812 * the page cache we always start from the end offset of the
5813 * previously processed extent up to the end of the current
5814 * extent.
5815 */
5816 truncate_inode_pages_range(&sctx->cur_inode->i_data,
5817 sctx->page_cache_clear_start,
5818 end - 1);
5819 sctx->page_cache_clear_start = end;
5820 }
5821
5822 return 0;
5823 }
5824
5825 /*
5826 * Search for a capability xattr related to sctx->cur_ino. If the capability is
5827 * found, call send_set_xattr function to emit it.
5828 *
5829 * Return 0 if there isn't a capability, or when the capability was emitted
5830 * successfully, or < 0 if an error occurred.
5831 */
5832 static int send_capabilities(struct send_ctx *sctx)
5833 {
5834 struct fs_path *fspath = NULL;
5835 struct btrfs_path *path;
5836 struct btrfs_dir_item *di;
5837 struct extent_buffer *leaf;
5838 unsigned long data_ptr;
5839 char *buf = NULL;
5840 int buf_len;
5841 int ret = 0;
5842
5843 path = alloc_path_for_send();
5844 if (!path)
5845 return -ENOMEM;
5846
5847 di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
5848 XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
5849 if (!di) {
5850 /* There is no xattr for this inode */
5851 goto out;
5852 } else if (IS_ERR(di)) {
5853 ret = PTR_ERR(di);
5854 goto out;
5855 }
5856
5857 leaf = path->nodes[0];
5858 buf_len = btrfs_dir_data_len(leaf, di);
5859
5860 fspath = fs_path_alloc();
5861 buf = kmalloc(buf_len, GFP_KERNEL);
5862 if (!fspath || !buf) {
5863 ret = -ENOMEM;
5864 goto out;
5865 }
5866
5867 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5868 if (ret < 0)
5869 goto out;
5870
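/* The capability value is stored right after the dir item header and the xattr name. */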
5871 data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
5872 read_extent_buffer(leaf, buf, data_ptr, buf_len);
5873
5874 ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
5875 strlen(XATTR_NAME_CAPS), buf, buf_len);
5876 out:
5877 kfree(buf);
5878 fs_path_free(fspath);
5879 btrfs_free_path(path);
5880 return ret;
5881 }
5882
5883 static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
5884 struct clone_root *clone_root, const u64 disk_byte,
5885 u64 data_offset, u64 offset, u64 len)
5886 {
5887 struct btrfs_path *path;
5888 struct btrfs_key key;
5889 int ret;
5890 struct btrfs_inode_info info;
5891 u64 clone_src_i_size = 0;
5892
5893 /*
5894 * Prevent cloning from a zero offset with a length matching the sector
5895 * size because in some scenarios this will make the receiver fail.
5896 *
5897 * For example, if in the source filesystem the extent at offset 0
5898 * has a length of sectorsize and it was written using direct IO, then
5899 * it can never be an inline extent (even if compression is enabled).
5900 * Then this extent can be cloned in the original filesystem to a non
5901 * zero file offset, but it may not be possible to clone in the
5902 * destination filesystem because it can be inlined due to compression
5903 * on the destination filesystem (as the receiver's write operations are
5904 * always done using buffered IO). The same happens when the original
5905 * filesystem does not have compression enabled but the destination
5906 * filesystem has.
5907 */
5908 if (clone_root->offset == 0 &&
5909 len == sctx->send_root->fs_info->sectorsize)
5910 return send_extent_data(sctx, dst_path, offset, len);
5911
5912 path = alloc_path_for_send();
5913 if (!path)
5914 return -ENOMEM;
5915
5916 /*
5917 * There are inodes that have extents that lie behind their i_size. Don't
5918 * accept clones from these extents.
5919 */
5920 ret = get_inode_info(clone_root->root, clone_root->ino, &info);
5921 btrfs_release_path(path);
5922 if (ret < 0)
5923 goto out;
5924 clone_src_i_size = info.size;
5925
5926 /*
5927 * We can't send a clone operation for the entire range if we find
5928 * extent items in the respective range in the source file that
5929 * refer to different extents or if we find holes.
5930 * So check for that and do a mix of clone and regular write/copy
5931 * operations if needed.
5932 *
5933 * Example:
5934 *
5935 * mkfs.btrfs -f /dev/sda
5936 * mount /dev/sda /mnt
5937 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5938 * cp --reflink=always /mnt/foo /mnt/bar
5939 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5940 * btrfs subvolume snapshot -r /mnt /mnt/snap
5941 *
5942 * If when we send the snapshot and we are processing file bar (which
5943 * has a higher inode number than foo) we blindly send a clone operation
5944 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
5945 * a file bar that matches the content of file foo - iow, doesn't match
5946 * the content from bar in the original filesystem.
5947 */
5948 key.objectid = clone_root->ino;
5949 key.type = BTRFS_EXTENT_DATA_KEY;
5950 key.offset = clone_root->offset;
5951 ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
5952 if (ret < 0)
5953 goto out;
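/*
 * If there is no exact key match, the previous item of the same inode may
 * be an extent that still covers clone_root->offset, so step back one slot
 * and let the loop below skip it if it ends before that offset.
 */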
5954 if (ret > 0 && path->slots[0] > 0) {
5955 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
5956 if (key.objectid == clone_root->ino &&
5957 key.type == BTRFS_EXTENT_DATA_KEY)
5958 path->slots[0]--;
5959 }
5960
5961 while (true) {
5962 struct extent_buffer *leaf = path->nodes[0];
5963 int slot = path->slots[0];
5964 struct btrfs_file_extent_item *ei;
5965 u8 type;
5966 u64 ext_len;
5967 u64 clone_len;
5968 u64 clone_data_offset;
5969 bool crossed_src_i_size = false;
5970
5971 if (slot >= btrfs_header_nritems(leaf)) {
5972 ret = btrfs_next_leaf(clone_root->root, path);
5973 if (ret < 0)
5974 goto out;
5975 else if (ret > 0)
5976 break;
5977 continue;
5978 }
5979
5980 btrfs_item_key_to_cpu(leaf, &key, slot);
5981
5982 /*
5983 * We might have an implicit trailing hole (NO_HOLES feature
5984 * enabled). We deal with it after leaving this loop.
5985 */
5986 if (key.objectid != clone_root->ino ||
5987 key.type != BTRFS_EXTENT_DATA_KEY)
5988 break;
5989
5990 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5991 type = btrfs_file_extent_type(leaf, ei);
5992 if (type == BTRFS_FILE_EXTENT_INLINE) {
5993 ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
5994 ext_len = PAGE_ALIGN(ext_len);
5995 } else {
5996 ext_len = btrfs_file_extent_num_bytes(leaf, ei);
5997 }
5998
5999 if (key.offset + ext_len <= clone_root->offset)
6000 goto next;
6001
6002 if (key.offset > clone_root->offset) {
6003 /* Implicit hole, NO_HOLES feature enabled. */
6004 u64 hole_len = key.offset - clone_root->offset;
6005
6006 if (hole_len > len)
6007 hole_len = len;
6008 ret = send_extent_data(sctx, dst_path, offset,
6009 hole_len);
6010 if (ret < 0)
6011 goto out;
6012
6013 len -= hole_len;
6014 if (len == 0)
6015 break;
6016 offset += hole_len;
6017 clone_root->offset += hole_len;
6018 data_offset += hole_len;
6019 }
6020
6021 if (key.offset >= clone_root->offset + len)
6022 break;
6023
6024 if (key.offset >= clone_src_i_size)
6025 break;
6026
6027 if (key.offset + ext_len > clone_src_i_size) {
6028 ext_len = clone_src_i_size - key.offset;
6029 crossed_src_i_size = true;
6030 }
6031
6032 clone_data_offset = btrfs_file_extent_offset(leaf, ei);
6033 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
6034 clone_root->offset = key.offset;
6035 if (clone_data_offset < data_offset &&
6036 clone_data_offset + ext_len > data_offset) {
6037 u64 extent_offset;
6038
6039 extent_offset = data_offset - clone_data_offset;
6040 ext_len -= extent_offset;
6041 clone_data_offset += extent_offset;
6042 clone_root->offset += extent_offset;
6043 }
6044 }
6045
6046 clone_len = min_t(u64, ext_len, len);
6047
6048 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
6049 clone_data_offset == data_offset) {
6050 const u64 src_end = clone_root->offset + clone_len;
6051 const u64 sectorsize = SZ_64K;
6052
6053 /*
6054 * We can't clone the last block, when its size is not
6055 * sector size aligned, into the middle of a file. If we
6056 * do so, the receiver will get a failure (-EINVAL) when
6057 * trying to clone or will silently corrupt the data in
6058 * the destination file if it's on a kernel without the
6059 * fix introduced by commit ac765f83f1397646
6060 * ("Btrfs: fix data corruption due to cloning of eof
6061 * block").
6062 *
6063 * So issue a clone of the aligned down range plus a
6064 * regular write for the eof block, if we hit that case.
6065 *
6066 * Also, we use the maximum possible sector size, 64K,
6067 * because we don't know what's the sector size of the
6068 * filesystem that receives the stream, so we have to
6069 * assume the largest possible sector size.
6070 */
6071 if (src_end == clone_src_i_size &&
6072 !IS_ALIGNED(src_end, sectorsize) &&
6073 offset + clone_len < sctx->cur_inode_size) {
6074 u64 slen;
6075
6076 slen = ALIGN_DOWN(src_end - clone_root->offset,
6077 sectorsize);
6078 if (slen > 0) {
6079 ret = send_clone(sctx, offset, slen,
6080 clone_root);
6081 if (ret < 0)
6082 goto out;
6083 }
6084 ret = send_extent_data(sctx, dst_path,
6085 offset + slen,
6086 clone_len - slen);
6087 } else {
6088 ret = send_clone(sctx, offset, clone_len,
6089 clone_root);
6090 }
6091 } else if (crossed_src_i_size && clone_len < len) {
6092 /*
6093 * If we are at i_size of the clone source inode and we
6094 * can not clone from it, terminate the loop. This is
6095 * to avoid sending two write operations, one with a
6096 * length matching clone_len and the final one after
6097 * this loop with a length of len - clone_len.
6098 *
6099 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6100 * was passed to the send ioctl), this helps avoid
6101 * sending an encoded write for an offset that is not
6102 * sector size aligned, in case the i_size of the source
6103 * inode is not sector size aligned. That will make the
6104 * receiver fallback to decompression of the data and
6105 * writing it using regular buffered IO, therefore while
6106 * not incorrect, it's not optimal due to decompression and
6107 * possible re-compression at the receiver.
6108 */
6109 break;
6110 } else {
6111 ret = send_extent_data(sctx, dst_path, offset,
6112 clone_len);
6113 }
6114
6115 if (ret < 0)
6116 goto out;
6117
6118 len -= clone_len;
6119 if (len == 0)
6120 break;
6121 offset += clone_len;
6122 clone_root->offset += clone_len;
6123
6124 /*
6125 * If we are cloning from the file we are currently processing,
6126 * and using the send root as the clone root, we must stop once
6127 * the current clone offset reaches the current eof of the file
6128 * at the receiver, otherwise we would issue an invalid clone
6129 * operation (source range going beyond eof) and cause the
6130 * receiver to fail. So if we reach the current eof, bail out
6131 * and fallback to a regular write.
6132 */
6133 if (clone_root->root == sctx->send_root &&
6134 clone_root->ino == sctx->cur_ino &&
6135 clone_root->offset >= sctx->cur_inode_next_write_offset)
6136 break;
6137
6138 data_offset += clone_len;
6139 next:
6140 path->slots[0]++;
6141 }
6142
6143 if (len > 0)
6144 ret = send_extent_data(sctx, dst_path, offset, len);
6145 else
6146 ret = 0;
6147 out:
6148 btrfs_free_path(path);
6149 return ret;
6150 }
6151
6152 static int send_write_or_clone(struct send_ctx *sctx,
6153 struct btrfs_path *path,
6154 struct btrfs_key *key,
6155 struct clone_root *clone_root)
6156 {
6157 int ret = 0;
6158 u64 offset = key->offset;
6159 u64 end;
6160 u64 bs = sctx->send_root->fs_info->sectorsize;
6161 struct btrfs_file_extent_item *ei;
6162 u64 disk_byte;
6163 u64 data_offset;
6164 u64 num_bytes;
6165 struct btrfs_inode_info info = { 0 };
6166
6167 end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
6168 if (offset >= end)
6169 return 0;
6170
6171 num_bytes = end - offset;
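/*
 * Decide between cloning and plain writes: without a clone source we always
 * write. With one, we can clone if the range end is block aligned or, when
 * it is not, if it coincides with the i_size of both this inode and the
 * clone source (checked below).
 */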
6172
6173 if (!clone_root)
6174 goto write_data;
6175
6176 if (IS_ALIGNED(end, bs))
6177 goto clone_data;
6178
6179 /*
6180 * If the extent end is not aligned, we can clone if the extent ends at
6181 * the i_size of the inode and the clone range ends at the i_size of the
6182 * source inode, otherwise the clone operation fails with -EINVAL.
6183 */
6184 if (end != sctx->cur_inode_size)
6185 goto write_data;
6186
6187 ret = get_inode_info(clone_root->root, clone_root->ino, &info);
6188 if (ret < 0)
6189 return ret;
6190
6191 if (clone_root->offset + num_bytes == info.size) {
6192 /*
6193 * The final size of our file matches the end offset, but it may
6194 * be that its current size is larger, so we have to truncate it
6195 * to any value between the start offset of the range and the
6196 * final i_size, otherwise the clone operation is invalid
6197 * because it's unaligned and it ends before the current EOF.
6198 * We do this truncate to the final i_size when we finish
6199 * processing the inode, but it's too late by then. And here we
6200 * truncate to the start offset of the range because it's always
6201 * sector size aligned while if it were the final i_size it
6202 * would result in dirtying part of a page, filling part of a
6203 * page with zeroes and then having the clone operation at the
6204 * receiver trigger IO and wait for it due to the dirty page.
6205 */
6206 if (sctx->parent_root != NULL) {
6207 ret = send_truncate(sctx, sctx->cur_ino,
6208 sctx->cur_inode_gen, offset);
6209 if (ret < 0)
6210 return ret;
6211 }
6212 goto clone_data;
6213 }
6214
6215 write_data:
6216 ret = send_extent_data(sctx, path, offset, num_bytes);
6217 sctx->cur_inode_next_write_offset = end;
6218 return ret;
6219
6220 clone_data:
6221 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
6222 struct btrfs_file_extent_item);
6223 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
6224 data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
6225 ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
6226 num_bytes);
6227 sctx->cur_inode_next_write_offset = end;
6228 return ret;
6229 }
6230
6231 static int is_extent_unchanged(struct send_ctx *sctx,
6232 struct btrfs_path *left_path,
6233 struct btrfs_key *ekey)
6234 {
6235 int ret = 0;
6236 struct btrfs_key key;
6237 struct btrfs_path *path = NULL;
6238 struct extent_buffer *eb;
6239 int slot;
6240 struct btrfs_key found_key;
6241 struct btrfs_file_extent_item *ei;
6242 u64 left_disknr;
6243 u64 right_disknr;
6244 u64 left_offset;
6245 u64 right_offset;
6246 u64 left_offset_fixed;
6247 u64 left_len;
6248 u64 right_len;
6249 u64 left_gen;
6250 u64 right_gen;
6251 u8 left_type;
6252 u8 right_type;
6253
6254 path = alloc_path_for_send();
6255 if (!path)
6256 return -ENOMEM;
6257
6258 eb = left_path->nodes[0];
6259 slot = left_path->slots[0];
6260 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6261 left_type = btrfs_file_extent_type(eb, ei);
6262
6263 if (left_type != BTRFS_FILE_EXTENT_REG) {
6264 ret = 0;
6265 goto out;
6266 }
6267 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
6268 left_len = btrfs_file_extent_num_bytes(eb, ei);
6269 left_offset = btrfs_file_extent_offset(eb, ei);
6270 left_gen = btrfs_file_extent_generation(eb, ei);
6271
6272 /*
6273 * Following comments will refer to these graphics. L is the left
6274 * extents which we are checking at the moment. 1-8 are the right
6275 * extents that we iterate.
6276 *
6277 * |-----L-----|
6278 * |-1-|-2a-|-3-|-4-|-5-|-6-|
6279 *
6280 * |-----L-----|
6281 * |--1--|-2b-|...(same as above)
6282 *
6283 * Alternative situation. Happens on files where extents got split.
6284 * |-----L-----|
6285 * |-----------7-----------|-6-|
6286 *
6287 * Alternative situation. Happens on files which got larger.
6288 * |-----L-----|
6289 * |-8-|
6290 * Nothing follows after 8.
6291 */
6292
6293 key.objectid = ekey->objectid;
6294 key.type = BTRFS_EXTENT_DATA_KEY;
6295 key.offset = ekey->offset;
6296 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
6297 if (ret < 0)
6298 goto out;
6299 if (ret) {
6300 ret = 0;
6301 goto out;
6302 }
6303
6304 /*
6305 * Handle special case where the right side has no extents at all.
6306 */
6307 eb = path->nodes[0];
6308 slot = path->slots[0];
6309 btrfs_item_key_to_cpu(eb, &found_key, slot);
6310 if (found_key.objectid != key.objectid ||
6311 found_key.type != key.type) {
6312 /* If we're a hole then just pretend nothing changed */
6313 ret = (left_disknr) ? 0 : 1;
6314 goto out;
6315 }
6316
6317 /*
6318 * We're now on 2a, 2b or 7.
6319 */
6320 key = found_key;
6321 while (key.offset < ekey->offset + left_len) {
6322 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6323 right_type = btrfs_file_extent_type(eb, ei);
6324 if (right_type != BTRFS_FILE_EXTENT_REG &&
6325 right_type != BTRFS_FILE_EXTENT_INLINE) {
6326 ret = 0;
6327 goto out;
6328 }
6329
6330 if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6331 right_len = btrfs_file_extent_ram_bytes(eb, ei);
6332 right_len = PAGE_ALIGN(right_len);
6333 } else {
6334 right_len = btrfs_file_extent_num_bytes(eb, ei);
6335 }
6336
6337 /*
6338 * Are we at extent 8? If yes, we know the extent is changed.
6339 * This may only happen on the first iteration.
6340 */
6341 if (found_key.offset + right_len <= ekey->offset) {
6342 /* If we're a hole just pretend nothing changed */
6343 ret = (left_disknr) ? 0 : 1;
6344 goto out;
6345 }
6346
6347 /*
6348 * We only got here to check that, when we have an inline extent,
6349 * what follows it is a regular extent (i.e. to apply the above
6350 * condition to inline extents too). This should normally not
6351 * happen but it's possible for example when we have an inline
6352 * compressed extent representing data with a size matching
6353 * the page size (currently the same as sector size).
6354 */
6355 if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6356 ret = 0;
6357 goto out;
6358 }
6359
6360 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
6361 right_offset = btrfs_file_extent_offset(eb, ei);
6362 right_gen = btrfs_file_extent_generation(eb, ei);
6363
6364 left_offset_fixed = left_offset;
6365 if (key.offset < ekey->offset) {
6366 /* Fix the right offset for 2a and 7. */
6367 right_offset += ekey->offset - key.offset;
6368 } else {
6369 /* Fix the left offset for all behind 2a and 2b */
6370 left_offset_fixed += key.offset - ekey->offset;
6371 }
6372
6373 /*
6374 * Check if we have the same extent.
6375 */
6376 if (left_disknr != right_disknr ||
6377 left_offset_fixed != right_offset ||
6378 left_gen != right_gen) {
6379 ret = 0;
6380 goto out;
6381 }
6382
6383 /*
6384 * Go to the next extent.
6385 */
6386 ret = btrfs_next_item(sctx->parent_root, path);
6387 if (ret < 0)
6388 goto out;
6389 if (!ret) {
6390 eb = path->nodes[0];
6391 slot = path->slots[0];
6392 btrfs_item_key_to_cpu(eb, &found_key, slot);
6393 }
6394 if (ret || found_key.objectid != key.objectid ||
6395 found_key.type != key.type) {
6396 key.offset += right_len;
6397 break;
6398 }
6399 if (found_key.offset != key.offset + right_len) {
6400 ret = 0;
6401 goto out;
6402 }
6403 key = found_key;
6404 }
6405
6406 /*
6407 * We're now behind the left extent (treat as unchanged) or at the end
6408 * of the right side (treat as changed).
6409 */
6410 if (key.offset >= ekey->offset + left_len)
6411 ret = 1;
6412 else
6413 ret = 0;
6414
6415
6416 out:
6417 btrfs_free_path(path);
6418 return ret;
6419 }
6420
6421 static int get_last_extent(struct send_ctx *sctx, u64 offset)
6422 {
6423 struct btrfs_path *path;
6424 struct btrfs_root *root = sctx->send_root;
6425 struct btrfs_key key;
6426 int ret;
6427
6428 path = alloc_path_for_send();
6429 if (!path)
6430 return -ENOMEM;
6431
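/*
 * Default to 0; set below to the end offset of the extent item found at or
 * before @offset, if it belongs to the current inode. Used for detecting
 * holes between extents.
 */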
6432 sctx->cur_inode_last_extent = 0;
6433
6434 key.objectid = sctx->cur_ino;
6435 key.type = BTRFS_EXTENT_DATA_KEY;
6436 key.offset = offset;
6437 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
6438 if (ret < 0)
6439 goto out;
6440 ret = 0;
6441 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
6442 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
6443 goto out;
6444
6445 sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6446 out:
6447 btrfs_free_path(path);
6448 return ret;
6449 }
6450
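/*
 * Check if the range [start, end) of the current inode is a hole in the parent
 * snapshot. Returns 1 if it is entirely a hole (explicit or implicit), 0 if any
 * extent with a real disk location overlaps it, and < 0 on error.
 */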
6451 static int range_is_hole_in_parent(struct send_ctx *sctx,
6452 const u64 start,
6453 const u64 end)
6454 {
6455 struct btrfs_path *path;
6456 struct btrfs_key key;
6457 struct btrfs_root *root = sctx->parent_root;
6458 u64 search_start = start;
6459 int ret;
6460
6461 path = alloc_path_for_send();
6462 if (!path)
6463 return -ENOMEM;
6464
6465 key.objectid = sctx->cur_ino;
6466 key.type = BTRFS_EXTENT_DATA_KEY;
6467 key.offset = search_start;
6468 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6469 if (ret < 0)
6470 goto out;
6471 if (ret > 0 && path->slots[0] > 0)
6472 path->slots[0]--;
6473
6474 while (search_start < end) {
6475 struct extent_buffer *leaf = path->nodes[0];
6476 int slot = path->slots[0];
6477 struct btrfs_file_extent_item *fi;
6478 u64 extent_end;
6479
6480 if (slot >= btrfs_header_nritems(leaf)) {
6481 ret = btrfs_next_leaf(root, path);
6482 if (ret < 0)
6483 goto out;
6484 else if (ret > 0)
6485 break;
6486 continue;
6487 }
6488
6489 btrfs_item_key_to_cpu(leaf, &key, slot);
6490 if (key.objectid < sctx->cur_ino ||
6491 key.type < BTRFS_EXTENT_DATA_KEY)
6492 goto next;
6493 if (key.objectid > sctx->cur_ino ||
6494 key.type > BTRFS_EXTENT_DATA_KEY ||
6495 key.offset >= end)
6496 break;
6497
6498 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6499 extent_end = btrfs_file_extent_end(path);
6500 if (extent_end <= start)
6501 goto next;
6502 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
6503 search_start = extent_end;
6504 goto next;
6505 }
6506 ret = 0;
6507 goto out;
6508 next:
6509 path->slots[0]++;
6510 }
6511 ret = 1;
6512 out:
6513 btrfs_free_path(path);
6514 return ret;
6515 }
6516
6517 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
6518 struct btrfs_key *key)
6519 {
6520 int ret = 0;
6521
6522 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
6523 return 0;
6524
6525 if (sctx->cur_inode_last_extent == (u64)-1) {
6526 ret = get_last_extent(sctx, key->offset - 1);
6527 if (ret)
6528 return ret;
6529 }
6530
6531 if (path->slots[0] == 0 &&
6532 sctx->cur_inode_last_extent < key->offset) {
6533 /*
6534 * We might have skipped entire leafs that contained only
6535 * file extent items for our current inode. These leafs have
6536 * a generation number smaller (older) than the one in the
6537 * current leaf and the leaf our last extent came from, and
6538 * are located between these 2 leafs.
6539 */
6540 ret = get_last_extent(sctx, key->offset - 1);
6541 if (ret)
6542 return ret;
6543 }
6544
6545 if (sctx->cur_inode_last_extent < key->offset) {
6546 ret = range_is_hole_in_parent(sctx,
6547 sctx->cur_inode_last_extent,
6548 key->offset);
6549 if (ret < 0)
6550 return ret;
6551 else if (ret == 0)
6552 ret = send_hole(sctx, key->offset);
6553 else
6554 ret = 0;
6555 }
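/* Remember where the extent at @path ends, for the next hole check. */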
6556 sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6557 return ret;
6558 }
6559
6560 static int process_extent(struct send_ctx *sctx,
6561 struct btrfs_path *path,
6562 struct btrfs_key *key)
6563 {
6564 struct clone_root *found_clone = NULL;
6565 int ret = 0;
6566
6567 if (S_ISLNK(sctx->cur_inode_mode))
6568 return 0;
6569
6570 if (sctx->parent_root && !sctx->cur_inode_new) {
6571 ret = is_extent_unchanged(sctx, path, key);
6572 if (ret < 0)
6573 goto out;
6574 if (ret) {
6575 ret = 0;
6576 goto out_hole;
6577 }
6578 } else {
6579 struct btrfs_file_extent_item *ei;
6580 u8 type;
6581
6582 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
6583 struct btrfs_file_extent_item);
6584 type = btrfs_file_extent_type(path->nodes[0], ei);
6585 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
6586 type == BTRFS_FILE_EXTENT_REG) {
6587 /*
6588 * The send spec does not have a prealloc command yet,
6589 * so just leave a hole for prealloc'ed extents until
6590 * we have enough commands queued up to justify rev'ing
6591 * the send spec.
6592 */
6593 if (type == BTRFS_FILE_EXTENT_PREALLOC) {
6594 ret = 0;
6595 goto out;
6596 }
6597
6598 /* Have a hole, just skip it. */
6599 if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
6600 ret = 0;
6601 goto out;
6602 }
6603 }
6604 }
6605
6606 ret = find_extent_clone(sctx, path, key->objectid, key->offset,
6607 sctx->cur_inode_size, &found_clone);
6608 if (ret != -ENOENT && ret < 0)
6609 goto out;
6610
6611 ret = send_write_or_clone(sctx, path, key, found_clone);
6612 if (ret)
6613 goto out;
6614 out_hole:
6615 ret = maybe_send_hole(sctx, path, key);
6616 out:
6617 return ret;
6618 }
6619
6620 static int process_all_extents(struct send_ctx *sctx)
6621 {
6622 int ret = 0;
6623 int iter_ret = 0;
6624 struct btrfs_root *root;
6625 struct btrfs_path *path;
6626 struct btrfs_key key;
6627 struct btrfs_key found_key;
6628
6629 root = sctx->send_root;
6630 path = alloc_path_for_send();
6631 if (!path)
6632 return -ENOMEM;
6633
6634 key.objectid = sctx->cmp_key->objectid;
6635 key.type = BTRFS_EXTENT_DATA_KEY;
6636 key.offset = 0;
6637 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
6638 if (found_key.objectid != key.objectid ||
6639 found_key.type != key.type) {
6640 ret = 0;
6641 break;
6642 }
6643
6644 ret = process_extent(sctx, path, &found_key);
6645 if (ret < 0)
6646 break;
6647 }
6648 /* Catch error found during iteration */
6649 if (iter_ret < 0)
6650 ret = iter_ret;
6651
6652 btrfs_free_path(path);
6653 return ret;
6654 }
6655
6656 static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
6657 int *pending_move,
6658 int *refs_processed)
6659 {
6660 int ret = 0;
6661
6662 if (sctx->cur_ino == 0)
6663 goto out;
6664 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6665 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6666 goto out;
6667 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
6668 goto out;
6669
6670 ret = process_recorded_refs(sctx, pending_move);
6671 if (ret < 0)
6672 goto out;
6673
6674 *refs_processed = 1;
6675 out:
6676 return ret;
6677 }
6678
6679 static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
6680 {
6681 int ret = 0;
6682 struct btrfs_inode_info info;
6683 u64 left_mode;
6684 u64 left_uid;
6685 u64 left_gid;
6686 u64 left_fileattr;
6687 u64 right_mode;
6688 u64 right_uid;
6689 u64 right_gid;
6690 u64 right_fileattr;
6691 int need_chmod = 0;
6692 int need_chown = 0;
6693 bool need_fileattr = false;
6694 int need_truncate = 1;
6695 int pending_move = 0;
6696 int refs_processed = 0;
6697
6698 if (sctx->ignore_cur_inode)
6699 return 0;
6700
6701 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
6702 &refs_processed);
6703 if (ret < 0)
6704 goto out;
6705
6706 /*
6707 * We have processed the refs and thus need to advance send_progress.
6708 * Now, calls to get_cur_xxx will take the updated refs of the current
6709 * inode into account.
6710 *
6711 * On the other hand, if our current inode is a directory and couldn't
6712 * be moved/renamed because its parent was renamed/moved too and it has
6713 * a higher inode number, we can only move/rename our current inode
6714 * after we moved/renamed its parent. Therefore in this case operate on
6715 * the old path (pre move/rename) of our current inode, and the
6716 * move/rename will be performed later.
6717 */
6718 if (refs_processed && !pending_move)
6719 sctx->send_progress = sctx->cur_ino + 1;
6720
6721 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
6722 goto out;
6723 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6724 goto out;
6725 ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
6726 if (ret < 0)
6727 goto out;
6728 left_mode = info.mode;
6729 left_uid = info.uid;
6730 left_gid = info.gid;
6731 left_fileattr = info.fileattr;
6732
6733 if (!sctx->parent_root || sctx->cur_inode_new) {
6734 need_chown = 1;
6735 if (!S_ISLNK(sctx->cur_inode_mode))
6736 need_chmod = 1;
6737 if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6738 need_truncate = 0;
6739 } else {
6740 u64 old_size;
6741
6742 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
6743 if (ret < 0)
6744 goto out;
6745 old_size = info.size;
6746 right_mode = info.mode;
6747 right_uid = info.uid;
6748 right_gid = info.gid;
6749 right_fileattr = info.fileattr;
6750
6751 if (left_uid != right_uid || left_gid != right_gid)
6752 need_chown = 1;
6753 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6754 need_chmod = 1;
6755 if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
6756 need_fileattr = true;
6757 if ((old_size == sctx->cur_inode_size) ||
6758 (sctx->cur_inode_size > old_size &&
6759 sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6760 need_truncate = 0;
6761 }
6762
6763 if (S_ISREG(sctx->cur_inode_mode)) {
6764 if (need_send_hole(sctx)) {
6765 if (sctx->cur_inode_last_extent == (u64)-1 ||
6766 sctx->cur_inode_last_extent <
6767 sctx->cur_inode_size) {
6768 ret = get_last_extent(sctx, (u64)-1);
6769 if (ret)
6770 goto out;
6771 }
6772 if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
6773 ret = range_is_hole_in_parent(sctx,
6774 sctx->cur_inode_last_extent,
6775 sctx->cur_inode_size);
6776 if (ret < 0) {
6777 goto out;
6778 } else if (ret == 0) {
6779 ret = send_hole(sctx, sctx->cur_inode_size);
6780 if (ret < 0)
6781 goto out;
6782 } else {
6783 /* Range is already a hole, skip. */
6784 ret = 0;
6785 }
6786 }
6787 }
6788 if (need_truncate) {
6789 ret = send_truncate(sctx, sctx->cur_ino,
6790 sctx->cur_inode_gen,
6791 sctx->cur_inode_size);
6792 if (ret < 0)
6793 goto out;
6794 }
6795 }
6796
6797 if (need_chown) {
6798 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6799 left_uid, left_gid);
6800 if (ret < 0)
6801 goto out;
6802 }
6803 if (need_chmod) {
6804 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6805 left_mode);
6806 if (ret < 0)
6807 goto out;
6808 }
6809 if (need_fileattr) {
6810 ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6811 left_fileattr);
6812 if (ret < 0)
6813 goto out;
6814 }
6815
6816 if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
6817 && sctx->cur_inode_needs_verity) {
6818 ret = process_verity(sctx);
6819 if (ret < 0)
6820 goto out;
6821 }
6822
6823 ret = send_capabilities(sctx);
6824 if (ret < 0)
6825 goto out;
6826
6827 /*
6828 * If other directory inodes depended on our current directory
6829 * inode's move/rename, now do their move/rename operations.
6830 */
6831 if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
6832 ret = apply_children_dir_moves(sctx);
6833 if (ret)
6834 goto out;
6835 /*
6836 * Need to send that every time, no matter if it actually
6837 * changed between the two trees as we have done changes to
6838 * the inode before. If our inode is a directory and it's
6839 * waiting to be moved/renamed, we will send its utimes when
6840 * it's moved/renamed, therefore we don't need to do it here.
6841 */
6842 sctx->send_progress = sctx->cur_ino + 1;
6843
6844 /*
6845 * If the current inode is a non-empty directory, delay issuing
6846 * the utimes command for it, as it's very likely we have inodes
6847 * with a higher number inside it. We want to issue the utimes
6848 * command only after adding all dentries to it.
6849 */
6850 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
6851 ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
6852 else
6853 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
6854
6855 if (ret < 0)
6856 goto out;
6857 }
6858
6859 out:
6860 if (!ret)
6861 ret = trim_dir_utimes_cache(sctx);
6862
6863 return ret;
6864 }
6865
6866 static void close_current_inode(struct send_ctx *sctx)
6867 {
6868 u64 i_size;
6869
6870 if (sctx->cur_inode == NULL)
6871 return;
6872
6873 i_size = i_size_read(sctx->cur_inode);
6874
6875 /*
6876 * If we are doing an incremental send, we may have extents between the
6877 * last processed extent and the i_size that have not been processed
6878 * because they haven't changed but we may have read some of their pages
6879 * through readahead, see the comments at send_extent_data().
6880 */
6881 if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
6882 truncate_inode_pages_range(&sctx->cur_inode->i_data,
6883 sctx->page_cache_clear_start,
6884 round_up(i_size, PAGE_SIZE) - 1);
6885
6886 iput(sctx->cur_inode);
6887 sctx->cur_inode = NULL;
6888 }
6889
6890 static int changed_inode(struct send_ctx *sctx,
6891 enum btrfs_compare_tree_result result)
6892 {
6893 int ret = 0;
6894 struct btrfs_key *key = sctx->cmp_key;
6895 struct btrfs_inode_item *left_ii = NULL;
6896 struct btrfs_inode_item *right_ii = NULL;
6897 u64 left_gen = 0;
6898 u64 right_gen = 0;
6899
6900 close_current_inode(sctx);
6901
6902 sctx->cur_ino = key->objectid;
6903 sctx->cur_inode_new_gen = false;
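/* (u64)-1 means not known yet; computed lazily by get_last_extent() when needed. */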
6904 sctx->cur_inode_last_extent = (u64)-1;
6905 sctx->cur_inode_next_write_offset = 0;
6906 sctx->ignore_cur_inode = false;
6907
6908 /*
6909 * Set send_progress to current inode. This will tell all get_cur_xxx
6910 * functions that the current inode's refs are not updated yet. Later,
6911 * when process_recorded_refs is finished, it is set to cur_ino + 1.
6912 */
6913 sctx->send_progress = sctx->cur_ino;
6914
6915 if (result == BTRFS_COMPARE_TREE_NEW ||
6916 result == BTRFS_COMPARE_TREE_CHANGED) {
6917 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
6918 sctx->left_path->slots[0],
6919 struct btrfs_inode_item);
6920 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
6921 left_ii);
6922 } else {
6923 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6924 sctx->right_path->slots[0],
6925 struct btrfs_inode_item);
6926 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6927 right_ii);
6928 }
6929 if (result == BTRFS_COMPARE_TREE_CHANGED) {
6930 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6931 sctx->right_path->slots[0],
6932 struct btrfs_inode_item);
6933
6934 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6935 right_ii);
6936
6937 /*
6938 * The cur_ino = root dir case is special here. We can't treat
6939 * the inode as deleted+reused because it would generate a
6940 * stream that tries to delete/mkdir the root dir.
6941 */
6942 if (left_gen != right_gen &&
6943 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6944 sctx->cur_inode_new_gen = true;
6945 }
6946
6947 /*
6948 * Normally we do not find inodes with a link count of zero (orphans)
6949 * because the most common case is to create a snapshot and use it
6950 * for a send operation. However other less common use cases involve
6951 * using a subvolume and send it after turning it to RO mode just
6952 * after deleting all hard links of a file while holding an open
6953 * file descriptor against it or turning a RO snapshot into RW mode,
6954 * keep an open file descriptor against a file, delete it and then
6955 * turn the snapshot back to RO mode before using it for a send
6956 * operation. The former is what the receiver operation does.
6957 * Therefore, if we want to send these snapshots soon after they're
6958 * received, we need to handle orphan inodes as well. Moreover, orphans
6959 * can appear not only in the send snapshot but also in the parent
6960 * snapshot. Here are several cases:
6961 *
6962 * Case 1: BTRFS_COMPARE_TREE_NEW
6963 * | send snapshot | action
6964 * --------------------------------
6965 * nlink | 0 | ignore
6966 *
6967 * Case 2: BTRFS_COMPARE_TREE_DELETED
6968 * | parent snapshot | action
6969 * ----------------------------------
6970 * nlink | 0 | as usual
6971 * Note: No unlinks will be sent because there're no paths for it.
6972 *
6973 * Case 3: BTRFS_COMPARE_TREE_CHANGED
6974 * | | parent snapshot | send snapshot | action
6975 * -----------------------------------------------------------------------
6976 * subcase 1 | nlink | 0 | 0 | ignore
6977 * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
6978 * subcase 3 | nlink | 0 | >0 | new_gen(creation)
6979 *
6980 */
6981 if (result == BTRFS_COMPARE_TREE_NEW) {
6982 if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
6983 sctx->ignore_cur_inode = true;
6984 goto out;
6985 }
6986 sctx->cur_inode_gen = left_gen;
6987 sctx->cur_inode_new = true;
6988 sctx->cur_inode_deleted = false;
6989 sctx->cur_inode_size = btrfs_inode_size(
6990 sctx->left_path->nodes[0], left_ii);
6991 sctx->cur_inode_mode = btrfs_inode_mode(
6992 sctx->left_path->nodes[0], left_ii);
6993 sctx->cur_inode_rdev = btrfs_inode_rdev(
6994 sctx->left_path->nodes[0], left_ii);
6995 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6996 ret = send_create_inode_if_needed(sctx);
6997 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
6998 sctx->cur_inode_gen = right_gen;
6999 sctx->cur_inode_new = false;
7000 sctx->cur_inode_deleted = true;
7001 sctx->cur_inode_size = btrfs_inode_size(
7002 sctx->right_path->nodes[0], right_ii);
7003 sctx->cur_inode_mode = btrfs_inode_mode(
7004 sctx->right_path->nodes[0], right_ii);
7005 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
7006 u32 new_nlinks, old_nlinks;
7007
7008 new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
7009 old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
7010 if (new_nlinks == 0 && old_nlinks == 0) {
7011 sctx->ignore_cur_inode = true;
7012 goto out;
7013 } else if (new_nlinks == 0 || old_nlinks == 0) {
7014 sctx->cur_inode_new_gen = true;
7015 }
7016 /*
7017 * We need to do some special handling in case the inode was
7018 * reported as changed with a changed generation number. This
7019 * means that the original inode was deleted and new inode
7020 * reused the same inum. So we have to treat the old inode as
7021 * deleted and the new one as new.
7022 */
7023 if (sctx->cur_inode_new_gen) {
7024 /*
7025 * First, process the inode as if it was deleted.
7026 */
7027 if (old_nlinks > 0) {
7028 sctx->cur_inode_gen = right_gen;
7029 sctx->cur_inode_new = false;
7030 sctx->cur_inode_deleted = true;
7031 sctx->cur_inode_size = btrfs_inode_size(
7032 sctx->right_path->nodes[0], right_ii);
7033 sctx->cur_inode_mode = btrfs_inode_mode(
7034 sctx->right_path->nodes[0], right_ii);
7035 ret = process_all_refs(sctx,
7036 BTRFS_COMPARE_TREE_DELETED);
7037 if (ret < 0)
7038 goto out;
7039 }
7040
7041 /*
7042 * Now process the inode as if it was new.
7043 */
7044 if (new_nlinks > 0) {
7045 sctx->cur_inode_gen = left_gen;
7046 sctx->cur_inode_new = true;
7047 sctx->cur_inode_deleted = false;
7048 sctx->cur_inode_size = btrfs_inode_size(
7049 sctx->left_path->nodes[0],
7050 left_ii);
7051 sctx->cur_inode_mode = btrfs_inode_mode(
7052 sctx->left_path->nodes[0],
7053 left_ii);
7054 sctx->cur_inode_rdev = btrfs_inode_rdev(
7055 sctx->left_path->nodes[0],
7056 left_ii);
7057 ret = send_create_inode_if_needed(sctx);
7058 if (ret < 0)
7059 goto out;
7060
7061 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
7062 if (ret < 0)
7063 goto out;
7064 /*
7065 * Advance send_progress now as we did not get
7066 * into process_recorded_refs_if_needed in the
7067 * new_gen case.
7068 */
7069 sctx->send_progress = sctx->cur_ino + 1;
7070
7071 /*
7072 * Now process all extents and xattrs of the
7073 * inode as if they were all new.
7074 */
7075 ret = process_all_extents(sctx);
7076 if (ret < 0)
7077 goto out;
7078 ret = process_all_new_xattrs(sctx);
7079 if (ret < 0)
7080 goto out;
7081 }
7082 } else {
7083 sctx->cur_inode_gen = left_gen;
7084 sctx->cur_inode_new = false;
7085 sctx->cur_inode_new_gen = false;
7086 sctx->cur_inode_deleted = false;
7087 sctx->cur_inode_size = btrfs_inode_size(
7088 sctx->left_path->nodes[0], left_ii);
7089 sctx->cur_inode_mode = btrfs_inode_mode(
7090 sctx->left_path->nodes[0], left_ii);
7091 }
7092 }
7093
7094 out:
7095 return ret;
7096 }
7097
7098 /*
7099 * We have to process new refs before deleted refs, but compare_trees gives us
7100 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
7101 * first and later process them in process_recorded_refs.
7102 * For the cur_inode_new_gen case, we skip recording completely because
7103 * changed_inode already initiated processing of refs. The reason for this is
7104 * that in this case, compare_tree actually compares the refs of 2 different
7105 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
7106 * refs of the right tree as deleted and all refs of the left tree as new.
7107 */
7108 static int changed_ref(struct send_ctx *sctx,
7109 enum btrfs_compare_tree_result result)
7110 {
7111 int ret = 0;
7112
7113 if (sctx->cur_ino != sctx->cmp_key->objectid) {
7114 inconsistent_snapshot_error(sctx, result, "reference");
7115 return -EIO;
7116 }
7117
7118 if (!sctx->cur_inode_new_gen &&
7119 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
7120 if (result == BTRFS_COMPARE_TREE_NEW)
7121 ret = record_new_ref(sctx);
7122 else if (result == BTRFS_COMPARE_TREE_DELETED)
7123 ret = record_deleted_ref(sctx);
7124 else if (result == BTRFS_COMPARE_TREE_CHANGED)
7125 ret = record_changed_ref(sctx);
7126 }
7127
7128 return ret;
7129 }
7130
7131 /*
7132 * Process new/deleted/changed xattrs. We skip processing in the
7133 * cur_inode_new_gen case because changed_inode already initiated processing
7134 * of xattrs. The reason is the same as in changed_ref
7135 */
7136 static int changed_xattr(struct send_ctx *sctx,
7137 enum btrfs_compare_tree_result result)
7138 {
7139 int ret = 0;
7140
7141 if (sctx->cur_ino != sctx->cmp_key->objectid) {
7142 inconsistent_snapshot_error(sctx, result, "xattr");
7143 return -EIO;
7144 }
7145
7146 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7147 if (result == BTRFS_COMPARE_TREE_NEW)
7148 ret = process_new_xattr(sctx);
7149 else if (result == BTRFS_COMPARE_TREE_DELETED)
7150 ret = process_deleted_xattr(sctx);
7151 else if (result == BTRFS_COMPARE_TREE_CHANGED)
7152 ret = process_changed_xattr(sctx);
7153 }
7154
7155 return ret;
7156 }
7157
7158 /*
7159 * Process new/deleted/changed extents. We skip processing in the
7160 * cur_inode_new_gen case because changed_inode already initiated processing
7161 * of extents. The reason is the same as in changed_ref
7162 */
7163 static int changed_extent(struct send_ctx *sctx,
7164 enum btrfs_compare_tree_result result)
7165 {
7166 int ret = 0;
7167
7168 /*
7169 * We have found an extent item that changed without the inode item
7170 * having changed. This can happen either after relocation (where the
7171 * disk_bytenr of an extent item is replaced at
7172 * relocation.c:replace_file_extents()) or after deduplication into a
7173 * file in both the parent and send snapshots (where an extent item can
7174 * get modified or replaced with a new one). Note that deduplication
7175 * updates the inode item, but it only changes the iversion (sequence
7176 * field in the inode item) of the inode, so if a file is deduplicated
7177 * the same amount of times in both the parent and send snapshots, its
7178 * iversion becomes the same in both snapshots, whence the inode item is
7179 * the same on both snapshots.
7180 */
7181 if (sctx->cur_ino != sctx->cmp_key->objectid)
7182 return 0;
7183
7184 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7185 if (result != BTRFS_COMPARE_TREE_DELETED)
7186 ret = process_extent(sctx, sctx->left_path,
7187 sctx->cmp_key);
7188 }
7189
7190 return ret;
7191 }
7192
7193 static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
7194 {
7195 int ret = 0;
7196
7197 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7198 if (result == BTRFS_COMPARE_TREE_NEW)
7199 sctx->cur_inode_needs_verity = true;
7200 }
7201 return ret;
7202 }
7203
7204 static int dir_changed(struct send_ctx *sctx, u64 dir)
7205 {
7206 u64 orig_gen, new_gen;
7207 int ret;
7208
7209 ret = get_inode_gen(sctx->send_root, dir, &new_gen);
7210 if (ret)
7211 return ret;
7212
7213 ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
7214 if (ret)
7215 return ret;
7216
7217 return (orig_gen != new_gen) ? 1 : 0;
7218 }
7219
7220 static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
7221 struct btrfs_key *key)
7222 {
7223 struct btrfs_inode_extref *extref;
7224 struct extent_buffer *leaf;
7225 u64 dirid = 0, last_dirid = 0;
7226 unsigned long ptr;
7227 u32 item_size;
7228 u32 cur_offset = 0;
7229 int ref_name_len;
7230 int ret = 0;
7231
7232 /* Easy case, just check this one dirid */
7233 if (key->type == BTRFS_INODE_REF_KEY) {
7234 dirid = key->offset;
7235
7236 ret = dir_changed(sctx, dirid);
7237 goto out;
7238 }
7239
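/*
 * INODE_EXTREF items pack multiple (parent dir, name) entries into a single
 * item, so walk them all and check each distinct parent directory.
 */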
7240 leaf = path->nodes[0];
7241 item_size = btrfs_item_size(leaf, path->slots[0]);
7242 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
7243 while (cur_offset < item_size) {
7244 extref = (struct btrfs_inode_extref *)(ptr +
7245 cur_offset);
7246 dirid = btrfs_inode_extref_parent(leaf, extref);
7247 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
7248 cur_offset += ref_name_len + sizeof(*extref);
7249 if (dirid == last_dirid)
7250 continue;
7251 ret = dir_changed(sctx, dirid);
7252 if (ret)
7253 break;
7254 last_dirid = dirid;
7255 }
7256 out:
7257 return ret;
7258 }
7259
7260 /*
7261 * Updates compare related fields in sctx and simply forwards to the actual
7262 * changed_xxx functions.
7263 */
7264 static int changed_cb(struct btrfs_path *left_path,
7265 struct btrfs_path *right_path,
7266 struct btrfs_key *key,
7267 enum btrfs_compare_tree_result result,
7268 struct send_ctx *sctx)
7269 {
7270 int ret = 0;
7271
7272 /*
7273 * We can not hold the commit root semaphore here. This is because in
7274 * the case of sending and receiving to the same filesystem, using a
7275 * pipe, could result in a deadlock:
7276 *
7277 * 1) The task running send blocks on the pipe because it's full;
7278 *
7279 * 2) The task running receive, which is the only consumer of the pipe,
7280 * is waiting for a transaction commit (for example due to a space
7281 * reservation when doing a write or triggering a transaction commit
7282 * when creating a subvolume);
7283 *
7284 * 3) The transaction is waiting to write lock the commit root semaphore,
7285 * but can not acquire it since it's being held at 1).
7286 *
7287 * Down this call chain we write to the pipe through kernel_write().
7288 * The same type of problem can also happen when sending to a file that
7289 * is stored in the same filesystem - when reserving space for a write
7290 * into the file, we can trigger a transaction commit.
7291 *
7292 * Our caller has supplied us with clones of leaves from the send and
7293 * parent roots, so we're safe here from a concurrent relocation and
7294 * further reallocation of metadata extents while we are here. Below we
7295 * also assert that the leaves are clones.
7296 */
7297 lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
7298
7299 /*
7300 * We always have a send root, so left_path is never NULL. We will not
7301 * have a leaf when we have reached the end of the send root but have
7302 * not yet reached the end of the parent root.
7303 */
7304 if (left_path->nodes[0])
7305 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7306 &left_path->nodes[0]->bflags));
7307 /*
7308 * When doing a full send we don't have a parent root, so right_path is
7309 * NULL. When doing an incremental send, we may have reached the end of
7310 * the parent root already, so we don't have a leaf at right_path.
7311 */
7312 if (right_path && right_path->nodes[0])
7313 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7314 &right_path->nodes[0]->bflags));
7315
7316 if (result == BTRFS_COMPARE_TREE_SAME) {
7317 if (key->type == BTRFS_INODE_REF_KEY ||
7318 key->type == BTRFS_INODE_EXTREF_KEY) {
7319 ret = compare_refs(sctx, left_path, key);
7320 if (!ret)
7321 return 0;
7322 if (ret < 0)
7323 return ret;
7324 } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
7325 return maybe_send_hole(sctx, left_path, key);
7326 } else {
7327 return 0;
7328 }
7329 result = BTRFS_COMPARE_TREE_CHANGED;
7330 ret = 0;
7331 }
7332
7333 sctx->left_path = left_path;
7334 sctx->right_path = right_path;
7335 sctx->cmp_key = key;
7336
7337 ret = finish_inode_if_needed(sctx, 0);
7338 if (ret < 0)
7339 goto out;
7340
7341 /* Ignore non-FS objects */
7342 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
7343 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
7344 goto out;
7345
7346 if (key->type == BTRFS_INODE_ITEM_KEY) {
7347 ret = changed_inode(sctx, result);
7348 } else if (!sctx->ignore_cur_inode) {
7349 if (key->type == BTRFS_INODE_REF_KEY ||
7350 key->type == BTRFS_INODE_EXTREF_KEY)
7351 ret = changed_ref(sctx, result);
7352 else if (key->type == BTRFS_XATTR_ITEM_KEY)
7353 ret = changed_xattr(sctx, result);
7354 else if (key->type == BTRFS_EXTENT_DATA_KEY)
7355 ret = changed_extent(sctx, result);
7356 else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
7357 key->offset == 0)
7358 ret = changed_verity(sctx, result);
7359 }
7360
7361 out:
7362 return ret;
7363 }
7364
7365 static int search_key_again(const struct send_ctx *sctx,
7366 struct btrfs_root *root,
7367 struct btrfs_path *path,
7368 const struct btrfs_key *key)
7369 {
7370 int ret;
7371
7372 if (!path->need_commit_sem)
7373 lockdep_assert_held_read(&root->fs_info->commit_root_sem);
7374
7375 /*
7376 * Roots used for send operations are readonly and no one can add,
7377 * update or remove keys from them, so we should be able to find our
7378 * key again. The only exception is deduplication, which can operate on
7379 * readonly roots and add, update or remove keys to/from them - but at
7380 * the moment we don't allow it to run in parallel with send.
7381 */
7382 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7383 ASSERT(ret <= 0);
7384 if (ret > 0) {
7385 btrfs_print_tree(path->nodes[path->lowest_level], false);
7386 btrfs_err(root->fs_info,
7387 "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
7388 key->objectid, key->type, key->offset,
7389 (root == sctx->parent_root ? "parent" : "send"),
7390 root->root_key.objectid, path->lowest_level,
7391 path->slots[path->lowest_level]);
7392 return -EUCLEAN;
7393 }
7394
7395 return ret;
7396 }
7397
7398 static int full_send_tree(struct send_ctx *sctx)
7399 {
7400 int ret;
7401 struct btrfs_root *send_root = sctx->send_root;
7402 struct btrfs_key key;
7403 struct btrfs_fs_info *fs_info = send_root->fs_info;
7404 struct btrfs_path *path;
7405
7406 path = alloc_path_for_send();
7407 if (!path)
7408 return -ENOMEM;
7409 path->reada = READA_FORWARD_ALWAYS;
7410
7411 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7412 key.type = BTRFS_INODE_ITEM_KEY;
7413 key.offset = 0;
7414
7415 down_read(&fs_info->commit_root_sem);
7416 sctx->last_reloc_trans = fs_info->last_reloc_trans;
7417 up_read(&fs_info->commit_root_sem);
7418
7419 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
7420 if (ret < 0)
7421 goto out;
7422 if (ret)
7423 goto out_finish;
7424
7425 while (1) {
7426 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
7427
7428 ret = changed_cb(path, NULL, &key,
7429 BTRFS_COMPARE_TREE_NEW, sctx);
7430 if (ret < 0)
7431 goto out;
7432
7433 down_read(&fs_info->commit_root_sem);
7434 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7435 sctx->last_reloc_trans = fs_info->last_reloc_trans;
7436 up_read(&fs_info->commit_root_sem);
7437 /*
7438 * A transaction used for relocating a block group was
7439 * committed or is about to finish its commit. Release
7440 * our path (leaf) and restart the search, so that we
7441 * avoid operating on any file extent items that are
7442 * stale, with a disk_bytenr that reflects a pre
7443 * relocation value. This way we avoid as much as
7444 * possible to fallback to regular writes when checking
7445 * if we can clone file ranges.
7446 */
7447 btrfs_release_path(path);
7448 ret = search_key_again(sctx, send_root, path, &key);
7449 if (ret < 0)
7450 goto out;
7451 } else {
7452 up_read(&fs_info->commit_root_sem);
7453 }
7454
7455 ret = btrfs_next_item(send_root, path);
7456 if (ret < 0)
7457 goto out;
7458 if (ret) {
7459 ret = 0;
7460 break;
7461 }
7462 }
7463
7464 out_finish:
7465 ret = finish_inode_if_needed(sctx, 1);
7466
7467 out:
7468 btrfs_free_path(path);
7469 return ret;
7470 }
7471
7472 static int replace_node_with_clone(struct btrfs_path *path, int level)
7473 {
7474 struct extent_buffer *clone;
7475
7476 clone = btrfs_clone_extent_buffer(path->nodes[level]);
7477 if (!clone)
7478 return -ENOMEM;
7479
7480 free_extent_buffer(path->nodes[level]);
7481 path->nodes[level] = clone;
7482
7483 return 0;
7484 }
7485
7486 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
7487 {
7488 struct extent_buffer *eb;
7489 struct extent_buffer *parent = path->nodes[*level];
7490 int slot = path->slots[*level];
7491 const int nritems = btrfs_header_nritems(parent);
7492 u64 reada_max;
7493 u64 reada_done = 0;
7494
7495 lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
7496 ASSERT(*level != 0);
7497
7498 eb = btrfs_read_node_slot(parent, slot);
7499 if (IS_ERR(eb))
7500 return PTR_ERR(eb);
7501
7502 /*
7503 * Trigger readahead for the next leaves we will process, so that it is
7504 * very likely that when we need them they are already in memory and we
7505 * will not block on disk IO. For nodes we only do readahead for one,
7506 * since the time window between processing nodes is typically larger.
7507 */
7508 reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
7509
7510 for (slot++; slot < nritems && reada_done < reada_max; slot++) {
7511 if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
7512 btrfs_readahead_node_child(parent, slot);
7513 reada_done += eb->fs_info->nodesize;
7514 }
7515 }
7516
7517 path->nodes[*level - 1] = eb;
7518 path->slots[*level - 1] = 0;
7519 (*level)--;
7520
7521 if (*level == 0)
7522 return replace_node_with_clone(path, 0);
7523
7524 return 0;
7525 }
7526
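/*
 * Move to the next slot at the current level, walking up the tree as long as
 * the current node is exhausted. Returns 1 if we had to move up, -1 if we
 * reached the end of the root node, and 0 otherwise.
 */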
static int tree_move_next_or_upnext(struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;
	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level) {
			path->slots[*level] = nritems - 1;
			return -1;
		}

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}

/*
 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
 * or down.
 */
static int tree_advance(struct btrfs_path *path,
			int *level, int root_level,
			int allow_down,
			struct btrfs_key *key,
			u64 reada_min_gen)
{
	int ret;

	if (*level == 0 || !allow_down) {
		ret = tree_move_next_or_upnext(path, level, root_level);
	} else {
		ret = tree_move_down(path, level, reada_min_gen);
	}

	/*
	 * Even if we have reached the end of a tree, ret is -1, update the key
	 * anyway, so that in case we need to restart due to a block group
	 * relocation, we can assert that the last key of the root node still
	 * exists in the tree.
	 */
	if (*level == 0)
		btrfs_item_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);
	else
		btrfs_node_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);

	return ret;
}

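/*
 * Compare the items at the current slots of two paths. Returns 0 if the items
 * have the same size and content, and 1 otherwise.
 */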
static int tree_compare_item(struct btrfs_path *left_path,
			     struct btrfs_path *right_path,
			     char *tmp_buf)
{
	int cmp;
	int len1, len2;
	unsigned long off1, off2;

	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
	if (len1 != len2)
		return 1;

	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
				     right_path->slots[0]);

	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);

	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
	if (cmp)
		return 1;
	return 0;
}

/*
 * A transaction used for relocating a block group was committed or is about to
 * finish its commit. Release our paths and restart the search, so that we are
 * not using stale extent buffers:
 *
 * 1) For levels > 0, we are only holding references of extent buffers, without
 *    any locks on them, which does not prevent them from having been relocated
 *    and reallocated after the last time we released the commit root semaphore.
 *    The exception are the root nodes, for which we always have a clone, see
 *    the comment at btrfs_compare_trees();
 *
 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
 *    we are safe from the concurrent relocation and reallocation. However they
 *    can have file extent items with a pre relocation disk_bytenr value, so we
 *    restart the search from the current commit roots and clone the new leaves
 *    so that we get the post relocation disk_bytenr values. Not doing so could
 *    make us clone the wrong data in case there are new extents using the old
 *    disk_bytenr that happen to be shared.
 */
static int restart_after_relocation(struct btrfs_path *left_path,
				    struct btrfs_path *right_path,
				    const struct btrfs_key *left_key,
				    const struct btrfs_key *right_key,
				    int left_level,
				    int right_level,
				    const struct send_ctx *sctx)
{
	int root_level;
	int ret;

	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);

	btrfs_release_path(left_path);
	btrfs_release_path(right_path);

	/*
	 * Since keys can not be added or removed to/from our roots because they
	 * are readonly and we do not allow deduplication to run in parallel
	 * (which can add, remove or change keys), the layout of the trees should
	 * not change.
	 */
	left_path->lowest_level = left_level;
	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
	if (ret < 0)
		return ret;

	right_path->lowest_level = right_level;
	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
	if (ret < 0)
		return ret;

	/*
	 * If the lowest level nodes are leaves, clone them so that they can be
	 * safely used by changed_cb() while not under the protection of the
	 * commit root semaphore, even if relocation and reallocation happens in
	 * parallel.
	 */
	if (left_level == 0) {
		ret = replace_node_with_clone(left_path, 0);
		if (ret < 0)
			return ret;
	}

	if (right_level == 0) {
		ret = replace_node_with_clone(right_path, 0);
		if (ret < 0)
			return ret;
	}

	/*
	 * Now clone the root nodes (unless they happen to be the leaves we have
	 * already cloned). This is to protect against concurrent snapshotting of
	 * the send and parent roots (see the comment at btrfs_compare_trees()).
	 */
	root_level = btrfs_header_level(sctx->send_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(left_path, root_level);
		if (ret < 0)
			return ret;
	}

	root_level = btrfs_header_level(sctx->parent_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(right_path, root_level);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * This function compares two trees and calls the provided callback for
 * every changed/new/deleted item it finds.
 * If shared tree blocks are encountered, whole subtrees are skipped, making
 * the compare pretty fast on snapshotted subvolumes.
 *
 * This currently works on commit roots only. As commit roots are read only,
 * we don't do any locking. The commit roots are protected with transactions.
 * Transactions are ended and rejoined when a commit is tried in between.
 *
 * This function checks for modifications done to the trees while comparing.
 * If it detects a change, it aborts immediately.
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			       struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				      &left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				      &left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				      &right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				      &right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;

	while (1) {
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					   left_root_level,
					   advance_left != ADVANCE_ONLY_NEXT,
					   &left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					   right_root_level,
					   advance_right != ADVANCE_ONLY_NEXT,
					   &right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			if (right_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						 &right_key,
						 BTRFS_COMPARE_TREE_DELETED,
						 sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						 &left_key,
						 BTRFS_COMPARE_TREE_NEW,
						 sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						 &left_key,
						 BTRFS_COMPARE_TREE_NEW,
						 sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						 &right_key,
						 BTRFS_COMPARE_TREE_DELETED,
						 sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}

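/*
 * Emit the send stream for a subvolume: the stream header (unless omitted),
 * the subvolume begin command, and then either an incremental compare against
 * the parent root or a full send of the whole tree.
 */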
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}

/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	int i;
	struct btrfs_trans_handle *trans = NULL;

again:
	if (sctx->parent_root &&
	    sctx->parent_root->node != sctx->parent_root->commit_root)
		goto commit_trans;

	for (i = 0; i < sctx->clone_roots_cnt; i++)
		if (sctx->clone_roots[i].root->node !=
		    sctx->clone_roots[i].root->commit_root)
			goto commit_trans;

	if (trans)
		return btrfs_end_transaction(trans);

	return 0;

commit_trans:
	/* Use any root, all fs roots will get their commit roots updated. */
	if (!trans) {
		trans = btrfs_join_transaction(sctx->send_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		goto again;
	}

	return btrfs_commit_transaction(trans);
}

/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
	}

	return 0;
}

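/*
 * Drop one send_in_progress reference from a root, warning if the counter
 * becomes unbalanced (negative).
 */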
static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, root->root_key.objectid);
	spin_unlock(&root->root_item_lock);
}

static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      root->root_key.objectid, root->dedupe_in_progress);
}

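/*
 * Entry point for the send ioctl: validates the arguments, sets up the send
 * context (send buffer, clone sources, parent root and caches), generates the
 * send stream and finally releases all acquired roots and resources.
 */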
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_root *send_root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}
	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);

	/*
	 * Userspace tools do the checks and warn the user if it's
	 * not RO.
	 */
	if (!btrfs_root_readonly(send_root)) {
		ret = -EPERM;
		goto out;
	}

	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);

	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	/*
	 * Unlikely but possible, if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started
	 */
	if (btrfs_root_dead(sctx->send_root)) {
		ret = -EPERM;
		goto out;
	}

	sctx->clone_roots_cnt = arg->clone_sources_count;

	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

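	/*
	 * Allocate one extra slot beyond the user supplied clone sources, as
	 * the send root itself is added as the last clone source further below.
	 */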
	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_size = array_size(sizeof(*arg->clone_sources),
				arg->clone_sources_count);

	if (arg->clone_sources_count) {
		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
		if (!clone_sources_tmp) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
				     alloc_size);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		for (i = 0; i < arg->clone_sources_count; i++) {
			clone_root = btrfs_get_fs_root(fs_info,
						       clone_sources_tmp[i], true);
			if (IS_ERR(clone_root)) {
				ret = PTR_ERR(clone_root);
				goto out;
			}
			spin_lock(&clone_root->root_item_lock);
			if (!btrfs_root_readonly(clone_root) ||
			    btrfs_root_dead(clone_root)) {
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EPERM;
				goto out;
			}
			if (clone_root->dedupe_in_progress) {
				dedupe_in_progress_warn(clone_root);
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EAGAIN;
				goto out;
			}
			clone_root->send_in_progress++;
			spin_unlock(&clone_root->root_item_lock);

			sctx->clone_roots[i].root = clone_root;
			clone_sources_to_rollback = i + 1;
		}
		kvfree(clone_sources_tmp);
		clone_sources_tmp = NULL;
	}

	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
		    btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}

	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
	     sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
	     NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;

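	/*
	 * Send any directory utimes updates that are still cached, after all
	 * other changes for the subvolume have been emitted.
	 */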
	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}

out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}

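	/*
	 * If the clone roots were already sorted, the send root is the last
	 * entry of the clone_roots array and is released together with it.
	 * Otherwise only roll back the clone sources we managed to acquire
	 * and drop the send root's reference separately.
	 */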
	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);

		kfree(sctx);
	}

	return ret;
}