// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"

/*
 * backref_node, mapping_node and tree_block start with this common header
 */
struct tree_entry {
	struct rb_node rb_node;
	u64 bytenr;
};

/*
 * represent a tree block in the backref cache
 */
struct backref_node {
	struct rb_node rb_node;
	u64 bytenr;

	u64 new_bytenr;
	/* objectid of tree block owner, may not be up to date */
	u64 owner;
	/* link to pending, changed or detached list */
	struct list_head list;
	/* list of upper level blocks that reference this block */
	struct list_head upper;
	/* list of child blocks in the cache */
	struct list_head lower;
	/* NULL if this node is not a tree root */
	struct btrfs_root *root;
	/* extent buffer obtained by COWing the block */
	struct extent_buffer *eb;
	/* level of tree block */
	unsigned int level:8;
	/* is the block in a non-reference counted tree */
	unsigned int cowonly:1;
	/* 1 if no child node is in the cache */
	unsigned int lowest:1;
	/* is the extent buffer locked */
	unsigned int locked:1;
	/* has the block been processed */
	unsigned int processed:1;
	/* have backrefs of this block been checked */
	unsigned int checked:1;
	/*
	 * 1 if the corresponding block has been COWed but some upper
	 * level block pointers may not point to the new location
	 */
	unsigned int pending:1;
	/*
	 * 1 if the backref node isn't connected to any other
	 * backref node.
	 */
	unsigned int detached:1;
};

/*
 * represent a block pointer in the backref cache
 */
struct backref_edge {
	struct list_head list[2];
	struct backref_node *node[2];
};

#define LOWER	0
#define UPPER	1
#define RELOCATION_RESERVED_NODES	256
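
/*
 * Illustrative sketch (editor's addition, not used by this file): a
 * backref_edge links a child ("lower") block to a parent ("upper")
 * block and is threaded onto a list in each node, indexed by the
 * LOWER/UPPER constants above.  The wiring below mirrors the pattern
 * used in build_backref_tree():
 *
 *	struct backref_edge *edge = alloc_backref_edge(cache);
 *
 *	if (!edge)
 *		return -ENOMEM;
 *	edge->node[LOWER] = cur;
 *	edge->node[UPPER] = upper;
 *	list_add_tail(&edge->list[LOWER], &cur->upper);
 *	list_add_tail(&edge->list[UPPER], &upper->lower);
 */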

struct backref_cache {
	/* red black tree of all backref nodes in the cache */
	struct rb_root rb_root;
	/* for passing backref nodes to btrfs_reloc_cow_block */
	struct backref_node *path[BTRFS_MAX_LEVEL];
	/*
	 * list of blocks that have been COWed but some block
	 * pointers in upper level blocks may not reflect the
	 * new location
	 */
	struct list_head pending[BTRFS_MAX_LEVEL];
	/* list of backref nodes with no child node */
	struct list_head leaves;
	/* list of blocks that have been COWed in the current transaction */
	struct list_head changed;
	/* list of detached backref nodes */
	struct list_head detached;

	u64 last_trans;

	int nr_nodes;
	int nr_edges;
};

/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct rb_node rb_node;
	u64 bytenr;
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represent a tree block to process
 */
struct tree_block {
	struct rb_node rb_node;
	u64 bytenr;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group_cache *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node);
static void __mark_block_processed(struct reloc_control *rc,
				   struct backref_node *node);

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

static void backref_cache_init(struct backref_cache *cache)
{
	int i;

	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
}

static void backref_cache_cleanup(struct backref_cache *cache)
{
	struct backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct backref_node, lower);
		remove_backref_node(cache, node);
	}

	cache->last_trans = 0;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		ASSERT(list_empty(&cache->pending[i]));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}

static struct backref_node *alloc_backref_node(struct backref_cache *cache)
{
	struct backref_node *node;

	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (node) {
		INIT_LIST_HEAD(&node->list);
		INIT_LIST_HEAD(&node->upper);
		INIT_LIST_HEAD(&node->lower);
		RB_CLEAR_NODE(&node->rb_node);
		cache->nr_nodes++;
	}
	return node;
}

static void free_backref_node(struct backref_cache *cache,
			      struct backref_node *node)
{
	if (node) {
		cache->nr_nodes--;
		kfree(node);
	}
}

static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
{
	struct backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

static void free_backref_edge(struct backref_cache *cache,
			      struct backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			p = &(*p)->rb_left;
		else if (bytenr > entry->bytenr)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
{
	struct rb_node *n = root->rb_node;
	struct tree_entry *entry;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return n;
	}
	return NULL;
}
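
/*
 * Illustrative sketch (editor's addition): because backref_node,
 * mapping_node and tree_block all begin with the tree_entry header
 * (rb_node + bytenr), the two helpers above can index any of them by
 * bytenr.  A hypothetical caller inserting a tree_block looks like:
 *
 *	struct tree_block *block = kzalloc(sizeof(*block), GFP_NOFS);
 *	struct rb_node *dup;
 *
 *	if (!block)
 *		return -ENOMEM;
 *	block->bytenr = bytenr;
 *	dup = tree_insert(&blocks_root, block->bytenr, &block->rb_node);
 *	if (dup)	// a node with this bytenr already exists
 *		kfree(block);
 *
 * and a lookup is tree_search(&blocks_root, bytenr) followed by
 * rb_entry() to get back to the containing structure.
 */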

static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = NULL;
	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
					      rb_node);
	if (bnode->root)
		fs_info = bnode->root->fs_info;
	btrfs_panic(fs_info, errno,
		    "Inconsistency in backref cache found at offset %llu",
		    bytenr);
}

/*
 * walk up backref nodes until reaching the node that represents a
 * tree root
 */
static struct backref_node *walk_up_backref(struct backref_node *node,
					    struct backref_edge *edges[],
					    int *index)
{
	struct backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct backref_node *walk_down_backref(struct backref_edge *edges[],
					      int *index)
{
	struct backref_edge *edge;
	struct backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
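
/*
 * Illustrative sketch (editor's addition): together the two walkers
 * above enumerate every path from a cached block up to a tree root.
 * A hypothetical driver loop, following the pattern the relocation
 * code uses elsewhere:
 *
 *	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	struct backref_node *next = node;
 *	int index = 0;
 *
 *	while (next) {
 *		struct backref_node *root_node;
 *
 *		root_node = walk_up_backref(next, edges, &index);
 *		// root_node->root is a b-tree that references "node"
 *		// via the chain recorded in edges[0 .. index - 1]
 *		next = walk_down_backref(edges, &index);
 *	}
 */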

static void unlock_node_buffer(struct backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

static void drop_node_buffer(struct backref_node *node)
{
	if (node->eb) {
		unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}

static void drop_backref_node(struct backref_cache *tree,
			      struct backref_node *node)
{
	BUG_ON(!list_empty(&node->upper));

	drop_node_buffer(node);
	list_del(&node->list);
	list_del(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	free_backref_node(tree, node);
}

/*
 * remove a backref node from the backref cache
 */
static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node)
{
	struct backref_node *upper;
	struct backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		free_backref_edge(cache, edge);

		if (RB_EMPTY_NODE(&upper->rb_node)) {
			BUG_ON(!list_empty(&node->upper));
			drop_backref_node(cache, node);
			node = upper;
			node->lowest = 1;
			continue;
		}
		/*
		 * add the node to the leaf node list if no other
		 * child blocks are cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	drop_backref_node(cache, node);
}

static void update_backref_node(struct backref_cache *cache,
				struct backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;

	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, bytenr);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct backref_cache *cache)
{
	struct backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookups. a transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors while processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}
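
/*
 * Illustrative sketch (editor's addition, hypothetical caller): code
 * that keeps a backref cache across transactions is expected to call
 * update_backref_cache() once it holds a handle on a new transaction;
 * a return value of 1 means cached bytenrs were rehashed after a
 * commit:
 *
 *	trans = btrfs_start_transaction(rc->extent_root, 0);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	if (update_backref_cache(trans, &rc->backref_cache)) {
 *		// the cache was refreshed for the new transaction;
 *		// stale detached nodes have been dropped
 *	}
 */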

static int should_ignore_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return 0;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
	    root->fs_info->running_transaction->transid - 1)
		return 0;
	/*
	 * if there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
					  u64 bytenr)
{
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return root;
}

static int is_cowonly_root(u64 root_objectid)
{
	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
		return 1;
	return 0;
}

static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
					u64 root_objectid)
{
	struct btrfs_key key;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	if (is_cowonly_root(root_objectid))
		key.offset = 0;
	else
		key.offset = (u64)-1;

	return btrfs_get_fs_root(fs_info, &key, false);
}

static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
			unsigned long *ptr, unsigned long *end)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	u32 item_size;

	btrfs_item_key_to_cpu(leaf, &key, slot);

	item_size = btrfs_item_size_nr(leaf, slot);
	if (item_size < sizeof(*ei)) {
		btrfs_print_v0_err(leaf->fs_info);
		btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
		return 1;
	}
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
		  BTRFS_EXTENT_FLAG_TREE_BLOCK));

	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
	    item_size <= sizeof(*ei) + sizeof(*bi)) {
		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
		return 1;
	}
	if (key.type == BTRFS_METADATA_ITEM_KEY &&
	    item_size <= sizeof(*ei)) {
		WARN_ON(item_size < sizeof(*ei));
		return 1;
	}

	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		*ptr = (unsigned long)(bi + 1);
	} else {
		*ptr = (unsigned long)(ei + 1);
	}
	*end = (unsigned long)ei + item_size;
	return 0;
}
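
/*
 * Illustrative sketch (editor's addition): find_inline_backref()
 * yields a [ptr, end) byte range of inline references inside an
 * EXTENT_ITEM or METADATA_ITEM.  A minimal consumer steps through it
 * the same way build_backref_tree() does below:
 *
 *	unsigned long ptr, end;
 *
 *	if (find_inline_backref(leaf, slot, &ptr, &end))
 *		return;		// no usable inline refs
 *	while (ptr < end) {
 *		struct btrfs_extent_inline_ref *iref;
 *		int type;
 *
 *		iref = (struct btrfs_extent_inline_ref *)ptr;
 *		type = btrfs_get_extent_inline_ref_type(leaf, iref,
 *						BTRFS_REF_TYPE_BLOCK);
 *		if (type == BTRFS_REF_TYPE_INVALID)
 *			break;	// corrupted item
 *		// btrfs_extent_inline_ref_offset(leaf, iref) is the
 *		// parent bytenr or root objectid, depending on type
 *		ptr += btrfs_extent_inline_ref_size(type);
 *	}
 */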

/*
 * build backref tree for a given tree block. root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond
 * to roots of b-trees that reference the tree block.
 *
 * the basic idea of this function is to check backrefs of a given block
 * to find upper level blocks that reference the block, and then check
 * backrefs of these upper level blocks recursively. the recursion stops
 * when a tree root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs
 * for all upper level blocks that directly/indirectly reference the
 * block are also cached.
 */
static noinline_for_stack
struct backref_node *build_backref_tree(struct reloc_control *rc,
					struct btrfs_key *node_key,
					int level, u64 bytenr)
{
	struct backref_cache *cache = &rc->backref_cache;
	struct btrfs_path *path1; /* For searching extent root */
	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct backref_node *cur;
	struct backref_node *upper;
	struct backref_node *lower;
	struct backref_node *node = NULL;
	struct backref_node *exist = NULL;
	struct backref_edge *edge;
	struct rb_node *rb_node;
	struct btrfs_key key;
	unsigned long end;
	unsigned long ptr;
	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
	LIST_HEAD(useless);
	int cowonly;
	int ret;
	int err = 0;
	bool need_check = true;

	path1 = btrfs_alloc_path();
	path2 = btrfs_alloc_path();
	if (!path1 || !path2) {
		err = -ENOMEM;
		goto out;
	}
	path1->reada = READA_FORWARD;
	path2->reada = READA_FORWARD;

	node = alloc_backref_node(cache);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->bytenr = bytenr;
	node->level = level;
	node->lowest = 1;
	cur = node;
again:
	end = 0;
	ptr = 0;
	key.objectid = cur->bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;

	path1->search_commit_root = 1;
	path1->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
				0, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	ASSERT(ret);
	ASSERT(path1->slots[0]);

	path1->slots[0]--;

	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * the backref was added previously when processing
		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * add the upper level block to the pending list if we
		 * need to check its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &list);
	} else {
		exist = NULL;
	}

	while (1) {
		cond_resched();
		eb = path1->nodes[0];

		if (ptr >= end) {
			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
				ret = btrfs_next_leaf(rc->extent_root, path1);
				if (ret < 0) {
					err = ret;
					goto out;
				}
				if (ret > 0)
					break;
				eb = path1->nodes[0];
			}

			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
			if (key.objectid != cur->bytenr) {
				WARN_ON(exist);
				break;
			}

			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
			    key.type == BTRFS_METADATA_ITEM_KEY) {
				ret = find_inline_backref(eb, path1->slots[0],
							  &ptr, &end);
				if (ret)
					goto next;
			}
		}

		if (ptr < end) {
			/* update key for inline backref */
			struct btrfs_extent_inline_ref *iref;
			int type;
			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				err = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);

			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref.
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			goto next;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			if (key.objectid == key.offset) {
				/*
				 * Only root blocks of reloc trees use a
				 * backref pointing to themselves.
				 */
				root = find_reloc_root(rc, cur->bytenr);
				ASSERT(root);
				cur->root = root;
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				err = -ENOMEM;
				goto out;
			}
			rb_node = tree_search(&cache->rb_root, key.offset);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = key.offset;
				upper->level = cur->level + 1;
				/*
				 * backrefs for the upper level block aren't
				 * cached, add the block to the pending list
				 */
				list_add_tail(&edge->list[UPPER], &list);
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
			list_add_tail(&edge->list[LOWER], &cur->upper);
			edge->node[LOWER] = cur;
			edge->node[UPPER] = upper;

			goto next;
		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
			err = -EINVAL;
			btrfs_print_v0_err(rc->extent_root->fs_info);
			btrfs_handle_fs_error(rc->extent_root->fs_info, err,
					      NULL);
			goto out;
		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
			goto next;
		}

		/*
		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, the inline ref offset
		 * means the root objectid. We need to search the tree to get
		 * its parent bytenr.
		 */
		root = read_fs_root(rc->extent_root->fs_info, key.offset);
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
			goto out;
		}

		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
			cur->cowonly = 1;

		if (btrfs_root_level(&root->root_item) == cur->level) {
			/* tree root */
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       cur->bytenr);
			if (should_ignore_root(root))
				list_add(&cur->list, &useless);
			else
				cur->root = root;
			break;
		}

		level = cur->level + 1;

		/* Search the tree to find parent blocks referring to the block. */
		path2->search_commit_root = 1;
		path2->skip_locking = 1;
		path2->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
		path2->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0 && path2->slots[level] > 0)
			path2->slots[level]--;

		eb = path2->nodes[level];
		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
		    cur->bytenr) {
			btrfs_err(root->fs_info,
	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
				  cur->bytenr, level - 1,
				  root->root_key.objectid,
				  node_key->objectid, node_key->type,
				  node_key->offset);
			err = -ENOENT;
			goto out;
		}
		lower = cur;
		need_check = true;

		/* Add all nodes and edges in the path */
		for (; level < BTRFS_MAX_LEVEL; level++) {
			if (!path2->nodes[level]) {
				ASSERT(btrfs_root_bytenr(&root->root_item) ==
				       lower->bytenr);
				if (should_ignore_root(root))
					list_add(&lower->list, &useless);
				else
					lower->root = root;
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				err = -ENOMEM;
				goto out;
			}

			eb = path2->nodes[level];
			rb_node = tree_search(&cache->rb_root, eb->start);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = eb->start;
				upper->owner = btrfs_header_owner(eb);
				upper->level = lower->level + 1;
				if (!test_bit(BTRFS_ROOT_REF_COWS,
					      &root->state))
					upper->cowonly = 1;

				/*
				 * if we know the block isn't shared
				 * we can avoid checking its backrefs.
				 */
				if (btrfs_block_can_be_shared(root, eb))
					upper->checked = 0;
				else
					upper->checked = 1;

				/*
				 * add the block to the pending list if we
				 * need to check its backrefs. we only do this
				 * once while walking up a tree as we will
				 * catch anything else later on.
				 */
				if (!upper->checked && need_check) {
					need_check = false;
					list_add_tail(&edge->list[UPPER],
						      &list);
				} else {
					if (upper->checked)
						need_check = true;
					INIT_LIST_HEAD(&edge->list[UPPER]);
				}
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
				if (!upper->owner)
					upper->owner = btrfs_header_owner(eb);
			}
			list_add_tail(&edge->list[LOWER], &lower->upper);
			edge->node[LOWER] = lower;
			edge->node[UPPER] = upper;

			if (rb_node)
				break;
			lower = upper;
			upper = NULL;
		}
		btrfs_release_path(path2);
next:
		if (ptr < end) {
			ptr += btrfs_extent_inline_ref_size(key.type);
			if (ptr >= end) {
				WARN_ON(ptr > end);
				ptr = 0;
				end = 0;
			}
		}
		if (ptr >= end)
			path1->slots[0]++;
	}
	btrfs_release_path(path1);

	cur->checked = 1;
	WARN_ON(exist);

	/* the pending list isn't empty, take the first block to process */
	if (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		cur = edge->node[UPPER];
		goto again;
	}

	/*
	 * everything went well, connect backref nodes and insert backref
	 * nodes into the cache.
	 */
	ASSERT(node->checked);
	cowonly = node->cowonly;
	if (!cowonly) {
		rb_node = tree_insert(&cache->rb_root, node->bytenr,
				      &node->rb_node);
		if (rb_node)
			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
		list_add_tail(&node->lower, &cache->leaves);
	}

	list_for_each_entry(edge, &node->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &list);

	while (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);
			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
			continue;
		}

		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		if (!upper->checked) {
			/*
			 * Still want to blow up for developers since this is a
			 * logic bug.
			 */
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}
		if (cowonly != upper->cowonly) {
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}

		if (!cowonly) {
			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
					      &upper->rb_node);
			if (rb_node)
				backref_tree_panic(rb_node, -EEXIST,
						   upper->bytenr);
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &list);
	}
	/*
	 * process useless backref nodes. backref nodes for tree leaves
	 * are deleted from the cache. backref nodes for upper level
	 * tree blocks are left in the cache to avoid unnecessary backref
	 * lookups.
	 */
	while (!list_empty(&useless)) {
		upper = list_entry(useless.next, struct backref_node, list);
		list_del_init(&upper->list);
		ASSERT(list_empty(&upper->upper));
		if (upper == node)
			node = NULL;
		if (upper->lowest) {
			list_del_init(&upper->lower);
			upper->lowest = 0;
		}
		while (!list_empty(&upper->lower)) {
			edge = list_entry(upper->lower.next,
					  struct backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);

			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
		}
		__mark_block_processed(rc, upper);
		if (upper->level > 0) {
			list_add(&upper->list, &cache->detached);
			upper->detached = 1;
		} else {
			rb_erase(&upper->rb_node, &cache->rb_root);
			free_backref_node(cache, upper);
		}
	}
out:
	btrfs_free_path(path1);
	btrfs_free_path(path2);
	if (err) {
		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
		}
		while (!list_empty(&list)) {
			edge = list_first_entry(&list, struct backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			upper = edge->node[UPPER];
			free_backref_edge(cache, edge);

			/*
			 * Lower is no longer linked to any upper backref nodes
			 * and isn't in the cache, so we can free it ourselves.
			 */
			if (list_empty(&lower->upper) &&
			    RB_EMPTY_NODE(&lower->rb_node))
				list_add(&lower->list, &useless);

			if (!RB_EMPTY_NODE(&upper->rb_node))
				continue;

			/* Add this node's upper edges to the list to process */
			list_for_each_entry(edge, &upper->upper, list[LOWER])
				list_add_tail(&edge->list[UPPER], &list);
			if (list_empty(&upper->upper))
				list_add(&upper->list, &useless);
		}

		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
			if (lower == node)
				node = NULL;
			free_backref_node(cache, lower);
		}

		free_backref_node(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	return node;
}
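
/*
 * Illustrative sketch (editor's addition, hypothetical caller): a
 * consumer resolves a block to its backref node like this, treating an
 * ERR_PTR return as fatal for the current pass:
 *
 *	struct backref_node *node;
 *
 *	node = build_backref_tree(rc, &block->key, block->level,
 *				  block->bytenr);
 *	if (IS_ERR(node))
 *		return PTR_ERR(node);
 *	// node is now cached; walk node->upper edges to reach the
 *	// roots that reference this block
 */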

/*
 * helper to add a backref node for the newly created snapshot.
 * the backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct backref_cache *cache = &rc->backref_cache;
	struct backref_node *node = NULL;
	struct backref_node *new_node;
	struct backref_edge *edge;
	struct backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = tree_search(&cache->rb_root,
				      reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = alloc_backref_node(cache);
	if (!new_node)
		return -ENOMEM;

	new_node->bytenr = dest->node->start;
	new_node->level = node->level;
	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = dest;

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = alloc_backref_edge(cache);
			if (!new_edge)
				goto fail;

			new_edge->node[UPPER] = new_node;
			new_edge->node[LOWER] = edge->node[LOWER];
			list_add_tail(&new_edge->list[UPPER],
				      &new_node->lower);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
			      &new_node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		free_backref_edge(cache, new_edge);
	}
	free_backref_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->node->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
				      root->node->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		if (!node)
			return;
		BUG_ON((struct btrfs_root *)node->data != root);
	}

	spin_lock(&fs_info->trans_lock);
	list_del_init(&root->root_list);
	spin_unlock(&fs_info->trans_lock);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
			      root->node->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = new_bytenr;
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, and all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}

/*
 * create a reloc tree for a given fs tree. the reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	if (!rc || !rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = reloc_root;
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
	    !root->reloc_root)
		goto out;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/* root->reloc_root will stay until the current relocation finishes */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	BUG_ON(ret);

out:
	return 0;
}

/*
 * helper to find the first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

static int in_block_group(u64 bytenr,
			  struct btrfs_block_group_cache *block_group)
{
	if (bytenr >= block_group->key.objectid &&
	    bytenr < block_group->key.objectid + block_group->key.offset)
		return 1;
	return 0;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
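
/*
 * Worked example (editor's addition, illustrative numbers): the data
 * relocation inode maps the block group's logical address space onto
 * its file offsets, with index_cnt holding the block group's start.
 * If the block group starts at 1 GiB (index_cnt = 1073741824) and an
 * extent lives at bytenr = 1073872896, then
 *
 *	file offset = bytenr - index_cnt = 131072 (128 KiB)
 *
 * so get_new_location() looks up the file extent at offset 128 KiB in
 * the relocation inode; its disk_bytenr is the extent's new location.
 */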
1610 
1611 /*
1612  * update file extent items in the tree leaf to point to
1613  * the new locations.
1614  */
1615 static noinline_for_stack
1616 int replace_file_extents(struct btrfs_trans_handle *trans,
1617 			 struct reloc_control *rc,
1618 			 struct btrfs_root *root,
1619 			 struct extent_buffer *leaf)
1620 {
1621 	struct btrfs_fs_info *fs_info = root->fs_info;
1622 	struct btrfs_key key;
1623 	struct btrfs_file_extent_item *fi;
1624 	struct inode *inode = NULL;
1625 	u64 parent;
1626 	u64 bytenr;
1627 	u64 new_bytenr = 0;
1628 	u64 num_bytes;
1629 	u64 end;
1630 	u32 nritems;
1631 	u32 i;
1632 	int ret = 0;
1633 	int first = 1;
1634 	int dirty = 0;
1635 
1636 	if (rc->stage != UPDATE_DATA_PTRS)
1637 		return 0;
1638 
1639 	/* reloc trees always use full backref */
1640 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1641 		parent = leaf->start;
1642 	else
1643 		parent = 0;
1644 
1645 	nritems = btrfs_header_nritems(leaf);
1646 	for (i = 0; i < nritems; i++) {
1647 		struct btrfs_ref ref = { 0 };
1648 
1649 		cond_resched();
1650 		btrfs_item_key_to_cpu(leaf, &key, i);
1651 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1652 			continue;
1653 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1654 		if (btrfs_file_extent_type(leaf, fi) ==
1655 		    BTRFS_FILE_EXTENT_INLINE)
1656 			continue;
1657 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1658 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1659 		if (bytenr == 0)
1660 			continue;
1661 		if (!in_block_group(bytenr, rc->block_group))
1662 			continue;
1663 
1664 		/*
1665 		 * if we are modifying block in fs tree, wait for readpage
1666 		 * to complete and drop the extent cache
1667 		 */
1668 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1669 			if (first) {
1670 				inode = find_next_inode(root, key.objectid);
1671 				first = 0;
1672 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1673 				btrfs_add_delayed_iput(inode);
1674 				inode = find_next_inode(root, key.objectid);
1675 			}
1676 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1677 				end = key.offset +
1678 				      btrfs_file_extent_num_bytes(leaf, fi);
1679 				WARN_ON(!IS_ALIGNED(key.offset,
1680 						    fs_info->sectorsize));
1681 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1682 				end--;
1683 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1684 						      key.offset, end);
1685 				if (!ret)
1686 					continue;
1687 
1688 				btrfs_drop_extent_cache(BTRFS_I(inode),
1689 						key.offset,	end, 1);
1690 				unlock_extent(&BTRFS_I(inode)->io_tree,
1691 					      key.offset, end);
1692 			}
1693 		}
1694 
1695 		ret = get_new_location(rc->data_inode, &new_bytenr,
1696 				       bytenr, num_bytes);
1697 		if (ret) {
1698 			/*
1699 			 * Don't have to abort since we've not changed anything
1700 			 * in the file extent yet.
1701 			 */
1702 			break;
1703 		}
1704 
1705 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1706 		dirty = 1;
1707 
1708 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1709 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1710 				       num_bytes, parent);
1711 		ref.real_root = root->root_key.objectid;
1712 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1713 				    key.objectid, key.offset);
1714 		ret = btrfs_inc_extent_ref(trans, &ref);
1715 		if (ret) {
1716 			btrfs_abort_transaction(trans, ret);
1717 			break;
1718 		}
1719 
1720 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1721 				       num_bytes, parent);
1722 		ref.real_root = root->root_key.objectid;
1723 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1724 				    key.objectid, key.offset);
1725 		ret = btrfs_free_extent(trans, &ref);
1726 		if (ret) {
1727 			btrfs_abort_transaction(trans, ret);
1728 			break;
1729 		}
1730 	}
1731 	if (dirty)
1732 		btrfs_mark_buffer_dirty(leaf);
1733 	if (inode)
1734 		btrfs_add_delayed_iput(inode);
1735 	return ret;
1736 }
1737 
1738 static noinline_for_stack
1739 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1740 		     struct btrfs_path *path, int level)
1741 {
1742 	struct btrfs_disk_key key1;
1743 	struct btrfs_disk_key key2;
1744 	btrfs_node_key(eb, &key1, slot);
1745 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1746 	return memcmp(&key1, &key2, sizeof(key1));
1747 }
1748 
1749 /*
1750  * try to replace tree blocks in fs tree with the new blocks
1751  * in reloc tree. tree blocks haven't been modified since the
1752  * reloc tree was create can be replaced.
1753  *
1754  * if a block was replaced, level of the block + 1 is returned.
1755  * if no block got replaced, 0 is returned. if there are other
1756  * errors, a negative error number is returned.
1757  */
1758 static noinline_for_stack
1759 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1760 		 struct btrfs_root *dest, struct btrfs_root *src,
1761 		 struct btrfs_path *path, struct btrfs_key *next_key,
1762 		 int lowest_level, int max_level)
1763 {
1764 	struct btrfs_fs_info *fs_info = dest->fs_info;
1765 	struct extent_buffer *eb;
1766 	struct extent_buffer *parent;
1767 	struct btrfs_ref ref = { 0 };
1768 	struct btrfs_key key;
1769 	u64 old_bytenr;
1770 	u64 new_bytenr;
1771 	u64 old_ptr_gen;
1772 	u64 new_ptr_gen;
1773 	u64 last_snapshot;
1774 	u32 blocksize;
1775 	int cow = 0;
1776 	int level;
1777 	int ret;
1778 	int slot;
1779 
1780 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1781 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1782 
1783 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1784 again:
1785 	slot = path->slots[lowest_level];
1786 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1787 
1788 	eb = btrfs_lock_root_node(dest);
1789 	btrfs_set_lock_blocking_write(eb);
1790 	level = btrfs_header_level(eb);
1791 
1792 	if (level < lowest_level) {
1793 		btrfs_tree_unlock(eb);
1794 		free_extent_buffer(eb);
1795 		return 0;
1796 	}
1797 
1798 	if (cow) {
1799 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1800 		BUG_ON(ret);
1801 	}
1802 	btrfs_set_lock_blocking_write(eb);
1803 
1804 	if (next_key) {
1805 		next_key->objectid = (u64)-1;
1806 		next_key->type = (u8)-1;
1807 		next_key->offset = (u64)-1;
1808 	}
1809 
1810 	parent = eb;
1811 	while (1) {
1812 		struct btrfs_key first_key;
1813 
1814 		level = btrfs_header_level(parent);
1815 		BUG_ON(level < lowest_level);
1816 
1817 		ret = btrfs_bin_search(parent, &key, level, &slot);
1818 		if (ret < 0)
1819 			break;
1820 		if (ret && slot > 0)
1821 			slot--;
1822 
1823 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1824 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1825 
1826 		old_bytenr = btrfs_node_blockptr(parent, slot);
1827 		blocksize = fs_info->nodesize;
1828 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1829 		btrfs_node_key_to_cpu(parent, &first_key, slot);
1830 
1831 		if (level <= max_level) {
1832 			eb = path->nodes[level];
1833 			new_bytenr = btrfs_node_blockptr(eb,
1834 							path->slots[level]);
1835 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1836 							path->slots[level]);
1837 		} else {
1838 			new_bytenr = 0;
1839 			new_ptr_gen = 0;
1840 		}
1841 
1842 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1843 			ret = level;
1844 			break;
1845 		}
1846 
1847 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1848 		    memcmp_node_keys(parent, slot, path, level)) {
1849 			if (level <= lowest_level) {
1850 				ret = 0;
1851 				break;
1852 			}
1853 
1854 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1855 					     level - 1, &first_key);
1856 			if (IS_ERR(eb)) {
1857 				ret = PTR_ERR(eb);
1858 				break;
1859 			} else if (!extent_buffer_uptodate(eb)) {
1860 				ret = -EIO;
1861 				free_extent_buffer(eb);
1862 				break;
1863 			}
1864 			btrfs_tree_lock(eb);
1865 			if (cow) {
1866 				ret = btrfs_cow_block(trans, dest, eb, parent,
1867 						      slot, &eb);
1868 				BUG_ON(ret);
1869 			}
1870 			btrfs_set_lock_blocking_write(eb);
1871 
1872 			btrfs_tree_unlock(parent);
1873 			free_extent_buffer(parent);
1874 
1875 			parent = eb;
1876 			continue;
1877 		}
1878 
1879 		if (!cow) {
1880 			btrfs_tree_unlock(parent);
1881 			free_extent_buffer(parent);
1882 			cow = 1;
1883 			goto again;
1884 		}
1885 
1886 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1887 				      path->slots[level]);
1888 		btrfs_release_path(path);
1889 
1890 		path->lowest_level = level;
1891 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1892 		path->lowest_level = 0;
1893 		BUG_ON(ret);
1894 
1895 		/*
1896 		 * Info qgroup to trace both subtrees.
1897 		 *
1898 		 * We must trace both trees.
1899 		 * 1) Tree reloc subtree
1900 		 *    If not traced, we will leak data numbers
1901 		 * 2) Fs subtree
1902 		 *    If not traced, we will double count old data
1903 		 *
1904 		 * We don't scan the subtree right now, but only record
1905 		 * the swapped tree blocks.
1906 		 * The real subtree rescan is delayed until we have new
1907 		 * CoW on the subtree root node before transaction commit.
1908 		 */
1909 		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1910 				rc->block_group, parent, slot,
1911 				path->nodes[level], path->slots[level],
1912 				last_snapshot);
1913 		if (ret < 0)
1914 			break;
1915 		/*
1916 		 * swap blocks in fs tree and reloc tree.
1917 		 */
1918 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1919 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1920 		btrfs_mark_buffer_dirty(parent);
1921 
1922 		btrfs_set_node_blockptr(path->nodes[level],
1923 					path->slots[level], old_bytenr);
1924 		btrfs_set_node_ptr_generation(path->nodes[level],
1925 					      path->slots[level], old_ptr_gen);
1926 		btrfs_mark_buffer_dirty(path->nodes[level]);
1927 
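		/*
		 * Update the backrefs to match the swap: add a shared ref
		 * for the old block from the reloc tree node and a keyed
		 * ref for the new block from the fs tree, then drop the
		 * two refs that described the pre-swap layout.
		 */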
1928 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1929 				       blocksize, path->nodes[level]->start);
1930 		ref.skip_qgroup = true;
1931 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1932 		ret = btrfs_inc_extent_ref(trans, &ref);
1933 		BUG_ON(ret);
1934 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1935 				       blocksize, 0);
1936 		ref.skip_qgroup = true;
1937 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1938 		ret = btrfs_inc_extent_ref(trans, &ref);
1939 		BUG_ON(ret);
1940 
1941 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1942 				       blocksize, path->nodes[level]->start);
1943 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1944 		ref.skip_qgroup = true;
1945 		ret = btrfs_free_extent(trans, &ref);
1946 		BUG_ON(ret);
1947 
1948 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1949 				       blocksize, 0);
1950 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1951 		ref.skip_qgroup = true;
1952 		ret = btrfs_free_extent(trans, &ref);
1953 		BUG_ON(ret);
1954 
1955 		btrfs_unlock_up_safe(path, 0);
1956 
1957 		ret = level;
1958 		break;
1959 	}
1960 	btrfs_tree_unlock(parent);
1961 	free_extent_buffer(parent);
1962 	return ret;
1963 }
1964 
1965 /*
1966  * helper to find the next relocated block in the reloc tree
1967  */
1968 static noinline_for_stack
1969 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1970 		       int *level)
1971 {
1972 	struct extent_buffer *eb;
1973 	int i;
1974 	u64 last_snapshot;
1975 	u32 nritems;
1976 
1977 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1978 
1979 	for (i = 0; i < *level; i++) {
1980 		free_extent_buffer(path->nodes[i]);
1981 		path->nodes[i] = NULL;
1982 	}
1983 
1984 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1985 		eb = path->nodes[i];
1986 		nritems = btrfs_header_nritems(eb);
1987 		while (path->slots[i] + 1 < nritems) {
1988 			path->slots[i]++;
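			/*
			 * pointers with generation <= last_snapshot were
			 * not changed by relocation, skip them
			 */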
1989 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1990 			    last_snapshot)
1991 				continue;
1992 
1993 			*level = i;
1994 			return 0;
1995 		}
1996 		free_extent_buffer(path->nodes[i]);
1997 		path->nodes[i] = NULL;
1998 	}
1999 	return 1;
2000 }
2001 
2002 /*
2003  * walk down the reloc tree to find the relocated block at the lowest level
2004  */
2005 static noinline_for_stack
2006 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2007 			 int *level)
2008 {
2009 	struct btrfs_fs_info *fs_info = root->fs_info;
2010 	struct extent_buffer *eb = NULL;
2011 	int i;
2012 	u64 bytenr;
2013 	u64 ptr_gen = 0;
2014 	u64 last_snapshot;
2015 	u32 nritems;
2016 
2017 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2018 
2019 	for (i = *level; i > 0; i--) {
2020 		struct btrfs_key first_key;
2021 
2022 		eb = path->nodes[i];
2023 		nritems = btrfs_header_nritems(eb);
2024 		while (path->slots[i] < nritems) {
2025 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2026 			if (ptr_gen > last_snapshot)
2027 				break;
2028 			path->slots[i]++;
2029 		}
2030 		if (path->slots[i] >= nritems) {
2031 			if (i == *level)
2032 				break;
2033 			*level = i + 1;
2034 			return 0;
2035 		}
2036 		if (i == 1) {
2037 			*level = i;
2038 			return 0;
2039 		}
2040 
2041 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2042 		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2043 		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2044 				     &first_key);
2045 		if (IS_ERR(eb)) {
2046 			return PTR_ERR(eb);
2047 		} else if (!extent_buffer_uptodate(eb)) {
2048 			free_extent_buffer(eb);
2049 			return -EIO;
2050 		}
2051 		BUG_ON(btrfs_header_level(eb) != i - 1);
2052 		path->nodes[i - 1] = eb;
2053 		path->slots[i - 1] = 0;
2054 	}
2055 	return 1;
2056 }
2057 
2058 /*
2059  * invalidate the extent cache for file extents whose keys are in the
2060  * range [min_key, max_key)
2061  */
2062 static int invalidate_extent_cache(struct btrfs_root *root,
2063 				   struct btrfs_key *min_key,
2064 				   struct btrfs_key *max_key)
2065 {
2066 	struct btrfs_fs_info *fs_info = root->fs_info;
2067 	struct inode *inode = NULL;
2068 	u64 objectid;
2069 	u64 start, end;
2070 	u64 ino;
2071 
2072 	objectid = min_key->objectid;
2073 	while (1) {
2074 		cond_resched();
2075 		iput(inode);
2076 
2077 		if (objectid > max_key->objectid)
2078 			break;
2079 
2080 		inode = find_next_inode(root, objectid);
2081 		if (!inode)
2082 			break;
2083 		ino = btrfs_ino(BTRFS_I(inode));
2084 
2085 		if (ino > max_key->objectid) {
2086 			iput(inode);
2087 			break;
2088 		}
2089 
2090 		objectid = ino + 1;
2091 		if (!S_ISREG(inode->i_mode))
2092 			continue;
2093 
2094 		if (unlikely(min_key->objectid == ino)) {
2095 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2096 				continue;
2097 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2098 				start = 0;
2099 			else {
2100 				start = min_key->offset;
2101 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2102 			}
2103 		} else {
2104 			start = 0;
2105 		}
2106 
2107 		if (unlikely(max_key->objectid == ino)) {
2108 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2109 				continue;
2110 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2111 				end = (u64)-1;
2112 			} else {
2113 				if (max_key->offset == 0)
2114 					continue;
2115 				end = max_key->offset;
2116 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2117 				end--;
2118 			}
2119 		} else {
2120 			end = (u64)-1;
2121 		}
2122 
2123 		/* lock_extent() waits for any in-flight readpage to complete */
2124 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2125 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2126 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2127 	}
2128 	return 0;
2129 }
2130 
2131 static int find_next_key(struct btrfs_path *path, int level,
2132 			 struct btrfs_key *key)
2134 {
2135 	while (level < BTRFS_MAX_LEVEL) {
2136 		if (!path->nodes[level])
2137 			break;
2138 		if (path->slots[level] + 1 <
2139 		    btrfs_header_nritems(path->nodes[level])) {
2140 			btrfs_node_key_to_cpu(path->nodes[level], key,
2141 					      path->slots[level] + 1);
2142 			return 0;
2143 		}
2144 		level++;
2145 	}
2146 	return 1;
2147 }
2148 
2149 /*
2150  * Insert current subvolume into reloc_control::dirty_subvol_roots
2151  */
2152 static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2153 				struct reloc_control *rc,
2154 				struct btrfs_root *root)
2155 {
2156 	struct btrfs_root *reloc_root = root->reloc_root;
2157 	struct btrfs_root_item *reloc_root_item;
2158 
2159 	/* @root must be a subvolume tree root with a valid reloc tree */
2160 	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2161 	ASSERT(reloc_root);
2162 
2163 	reloc_root_item = &reloc_root->root_item;
2164 	memset(&reloc_root_item->drop_progress, 0,
2165 		sizeof(reloc_root_item->drop_progress));
2166 	reloc_root_item->drop_level = 0;
2167 	btrfs_set_root_refs(reloc_root_item, 0);
2168 	btrfs_update_reloc_root(trans, root);
2169 
2170 	if (list_empty(&root->reloc_dirty_list)) {
2171 		btrfs_grab_fs_root(root);
2172 		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2173 	}
2174 }
2175 
2176 static int clean_dirty_subvols(struct reloc_control *rc)
2177 {
2178 	struct btrfs_root *root;
2179 	struct btrfs_root *next;
2180 	int ret = 0;
2181 	int ret2;
2182 
2183 	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2184 				 reloc_dirty_list) {
2185 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2186 			/* Merged subvolume, cleanup its reloc root */
2187 			struct btrfs_root *reloc_root = root->reloc_root;
2188 
2189 			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
2190 			list_del_init(&root->reloc_dirty_list);
2191 			root->reloc_root = NULL;
2192 			if (reloc_root) {
2194 				ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
2195 				if (ret2 < 0 && !ret)
2196 					ret = ret2;
2197 			}
2198 			btrfs_put_fs_root(root);
2199 		} else {
2200 			/* Orphan reloc tree, just clean it up */
2201 			ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
2202 			if (ret2 < 0 && !ret)
2203 				ret = ret2;
2204 		}
2205 	}
2206 	return ret;
2207 }
2208 
2209 /*
2210  * merge the relocated tree blocks in the reloc tree with the
2211  * corresponding fs tree.
2212  */
2213 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2214 					       struct btrfs_root *root)
2215 {
2216 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2217 	struct btrfs_key key;
2218 	struct btrfs_key next_key;
2219 	struct btrfs_trans_handle *trans = NULL;
2220 	struct btrfs_root *reloc_root;
2221 	struct btrfs_root_item *root_item;
2222 	struct btrfs_path *path;
2223 	struct extent_buffer *leaf;
2224 	int level;
2225 	int max_level;
2226 	int replaced = 0;
2227 	int ret;
2228 	int err = 0;
2229 	u32 min_reserved;
2230 
2231 	path = btrfs_alloc_path();
2232 	if (!path)
2233 		return -ENOMEM;
2234 	path->reada = READA_FORWARD;
2235 
2236 	reloc_root = root->reloc_root;
2237 	root_item = &reloc_root->root_item;
2238 
2239 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2240 		level = btrfs_root_level(root_item);
2241 		extent_buffer_get(reloc_root->node);
2242 		path->nodes[level] = reloc_root->node;
2243 		path->slots[level] = 0;
2244 	} else {
2245 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2246 
2247 		level = root_item->drop_level;
2248 		BUG_ON(level == 0);
2249 		path->lowest_level = level;
2250 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2251 		path->lowest_level = 0;
2252 		if (ret < 0) {
2253 			btrfs_free_path(path);
2254 			return ret;
2255 		}
2256 
2257 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2258 				      path->slots[level]);
2259 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2260 
2261 		btrfs_unlock_up_safe(path, 0);
2262 	}
2263 
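	/*
	 * reserve enough space to COW a path of tree blocks in both
	 * the reloc tree and the fs tree, hence the factor of two
	 */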
2264 	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2265 	memset(&next_key, 0, sizeof(next_key));
2266 
2267 	while (1) {
2268 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2269 					     BTRFS_RESERVE_FLUSH_ALL);
2270 		if (ret) {
2271 			err = ret;
2272 			goto out;
2273 		}
2274 		trans = btrfs_start_transaction(root, 0);
2275 		if (IS_ERR(trans)) {
2276 			err = PTR_ERR(trans);
2277 			trans = NULL;
2278 			goto out;
2279 		}
2280 		trans->block_rsv = rc->block_rsv;
2281 
2282 		replaced = 0;
2283 		max_level = level;
2284 
2285 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2286 		if (ret < 0) {
2287 			err = ret;
2288 			goto out;
2289 		}
2290 		if (ret > 0)
2291 			break;
2292 
2293 		if (!find_next_key(path, level, &key) &&
2294 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2295 			ret = 0;
2296 		} else {
2297 			ret = replace_path(trans, rc, root, reloc_root, path,
2298 					   &next_key, level, max_level);
2299 		}
2300 		if (ret < 0) {
2301 			err = ret;
2302 			goto out;
2303 		}
2304 
2305 		if (ret > 0) {
2306 			level = ret;
2307 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2308 					      path->slots[level]);
2309 			replaced = 1;
2310 		}
2311 
2312 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2313 		if (ret > 0)
2314 			break;
2315 
2316 		BUG_ON(level == 0);
2317 		/*
2318 		 * save the merging progress in the drop_progress.
2319 		 * this is OK since root refs == 1 in this case.
2320 		 */
2321 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2322 			       path->slots[level]);
2323 		root_item->drop_level = level;
2324 
2325 		btrfs_end_transaction_throttle(trans);
2326 		trans = NULL;
2327 
2328 		btrfs_btree_balance_dirty(fs_info);
2329 
2330 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2331 			invalidate_extent_cache(root, &key, &next_key);
2332 	}
2333 
2334 	/*
2335 	 * handle the case where only one block in the fs tree needs to be
2336 	 * relocated and that block is the tree root.
2337 	 */
2338 	leaf = btrfs_lock_root_node(root);
2339 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2340 	btrfs_tree_unlock(leaf);
2341 	free_extent_buffer(leaf);
2342 	if (ret < 0)
2343 		err = ret;
2344 out:
2345 	btrfs_free_path(path);
2346 
2347 	if (err == 0)
2348 		insert_dirty_subvol(trans, rc, root);
2349 
2350 	if (trans)
2351 		btrfs_end_transaction_throttle(trans);
2352 
2353 	btrfs_btree_balance_dirty(fs_info);
2354 
2355 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2356 		invalidate_extent_cache(root, &key, &next_key);
2357 
2358 	return err;
2359 }
2360 
2361 static noinline_for_stack
2362 int prepare_to_merge(struct reloc_control *rc, int err)
2363 {
2364 	struct btrfs_root *root = rc->extent_root;
2365 	struct btrfs_fs_info *fs_info = root->fs_info;
2366 	struct btrfs_root *reloc_root;
2367 	struct btrfs_trans_handle *trans;
2368 	LIST_HEAD(reloc_roots);
2369 	u64 num_bytes = 0;
2370 	int ret;
2371 
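	/*
	 * grow the merging reservation: a full-height path for each of
	 * the two trees, plus twice the metadata bytes relocated so far
	 */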
2372 	mutex_lock(&fs_info->reloc_mutex);
2373 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2374 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2375 	mutex_unlock(&fs_info->reloc_mutex);
2376 
2377 again:
2378 	if (!err) {
2379 		num_bytes = rc->merging_rsv_size;
2380 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2381 					  BTRFS_RESERVE_FLUSH_ALL);
2382 		if (ret)
2383 			err = ret;
2384 	}
2385 
2386 	trans = btrfs_join_transaction(rc->extent_root);
2387 	if (IS_ERR(trans)) {
2388 		if (!err)
2389 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2390 						num_bytes);
2391 		return PTR_ERR(trans);
2392 	}
2393 
2394 	if (!err) {
2395 		if (num_bytes != rc->merging_rsv_size) {
2396 			btrfs_end_transaction(trans);
2397 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2398 						num_bytes);
2399 			goto again;
2400 		}
2401 	}
2402 
2403 	rc->merge_reloc_tree = 1;
2404 
2405 	while (!list_empty(&rc->reloc_roots)) {
2406 		reloc_root = list_entry(rc->reloc_roots.next,
2407 					struct btrfs_root, root_list);
2408 		list_del_init(&reloc_root->root_list);
2409 
2410 		root = read_fs_root(fs_info, reloc_root->root_key.offset);
2411 		BUG_ON(IS_ERR(root));
2412 		BUG_ON(root->reloc_root != reloc_root);
2413 
2414 		/*
2415 		 * set the reference count to 1, so btrfs_recover_relocation
2416 		 * knows it should resume merging
2417 		 */
2418 		if (!err)
2419 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2420 		btrfs_update_reloc_root(trans, root);
2421 
2422 		list_add(&reloc_root->root_list, &reloc_roots);
2423 	}
2424 
2425 	list_splice(&reloc_roots, &rc->reloc_roots);
2426 
2427 	if (!err)
2428 		btrfs_commit_transaction(trans);
2429 	else
2430 		btrfs_end_transaction(trans);
2431 	return err;
2432 }
2433 
2434 static noinline_for_stack
2435 void free_reloc_roots(struct list_head *list)
2436 {
2437 	struct btrfs_root *reloc_root;
2438 
2439 	while (!list_empty(list)) {
2440 		reloc_root = list_entry(list->next, struct btrfs_root,
2441 					root_list);
2442 		__del_reloc_root(reloc_root);
2443 		free_extent_buffer(reloc_root->node);
2444 		free_extent_buffer(reloc_root->commit_root);
2445 		reloc_root->node = NULL;
2446 		reloc_root->commit_root = NULL;
2447 	}
2448 }
2449 
2450 static noinline_for_stack
2451 void merge_reloc_roots(struct reloc_control *rc)
2452 {
2453 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2454 	struct btrfs_root *root;
2455 	struct btrfs_root *reloc_root;
2456 	LIST_HEAD(reloc_roots);
2457 	int found = 0;
2458 	int ret = 0;
2459 again:
2460 	root = rc->extent_root;
2461 
2462 	/*
2463 	 * this serializes us with btrfs_record_root_in_transaction,
2464 	 * we have to make sure nobody is in the middle of
2465 	 * adding their roots to the list while we are
2466 	 * doing this splice
2467 	 */
2468 	mutex_lock(&fs_info->reloc_mutex);
2469 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2470 	mutex_unlock(&fs_info->reloc_mutex);
2471 
2472 	while (!list_empty(&reloc_roots)) {
2473 		found = 1;
2474 		reloc_root = list_entry(reloc_roots.next,
2475 					struct btrfs_root, root_list);
2476 
2477 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2478 			root = read_fs_root(fs_info,
2479 					    reloc_root->root_key.offset);
2480 			BUG_ON(IS_ERR(root));
2481 			BUG_ON(root->reloc_root != reloc_root);
2482 
2483 			ret = merge_reloc_root(rc, root);
2484 			if (ret) {
2485 				if (list_empty(&reloc_root->root_list))
2486 					list_add_tail(&reloc_root->root_list,
2487 						      &reloc_roots);
2488 				goto out;
2489 			}
2490 		} else {
2491 			list_del_init(&reloc_root->root_list);
2492 			/* Don't forget to queue this reloc root for cleanup */
2493 			list_add_tail(&reloc_root->reloc_dirty_list,
2494 				      &rc->dirty_subvol_roots);
2495 		}
2496 	}
2497 
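	/*
	 * merging a reloc root may add new reloc roots behind our back,
	 * repeat the splice until no more roots show up
	 */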
2498 	if (found) {
2499 		found = 0;
2500 		goto again;
2501 	}
2502 out:
2503 	if (ret) {
2504 		btrfs_handle_fs_error(fs_info, ret, NULL);
2505 		if (!list_empty(&reloc_roots))
2506 			free_reloc_roots(&reloc_roots);
2507 
2508 		/* new reloc root may be added */
2509 		mutex_lock(&fs_info->reloc_mutex);
2510 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2511 		mutex_unlock(&fs_info->reloc_mutex);
2512 		if (!list_empty(&reloc_roots))
2513 			free_reloc_roots(&reloc_roots);
2514 	}
2515 
2516 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2517 }
2518 
2519 static void free_block_list(struct rb_root *blocks)
2520 {
2521 	struct tree_block *block;
2522 	struct rb_node *rb_node;
2523 	while ((rb_node = rb_first(blocks))) {
2524 		block = rb_entry(rb_node, struct tree_block, rb_node);
2525 		rb_erase(rb_node, blocks);
2526 		kfree(block);
2527 	}
2528 }
2529 
2530 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2531 				      struct btrfs_root *reloc_root)
2532 {
2533 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2534 	struct btrfs_root *root;
2535 
2536 	if (reloc_root->last_trans == trans->transid)
2537 		return 0;
2538 
2539 	root = read_fs_root(fs_info, reloc_root->root_key.offset);
2540 	BUG_ON(IS_ERR(root));
2541 	BUG_ON(root->reloc_root != reloc_root);
2542 
2543 	return btrfs_record_root_in_trans(trans, root);
2544 }
2545 
2546 static noinline_for_stack
2547 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2548 				     struct reloc_control *rc,
2549 				     struct backref_node *node,
2550 				     struct backref_edge *edges[])
2551 {
2552 	struct backref_node *next;
2553 	struct btrfs_root *root;
2554 	int index = 0;
2555 
2556 	next = node;
2557 	while (1) {
2558 		cond_resched();
2559 		next = walk_up_backref(next, edges, &index);
2560 		root = next->root;
2561 		BUG_ON(!root);
2562 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2563 
2564 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2565 			record_reloc_root_in_trans(trans, root);
2566 			break;
2567 		}
2568 
2569 		btrfs_record_root_in_trans(trans, root);
2570 		root = root->reloc_root;
2571 
2572 		if (next->new_bytenr != root->node->start) {
2573 			BUG_ON(next->new_bytenr);
2574 			BUG_ON(!list_empty(&next->list));
2575 			next->new_bytenr = root->node->start;
2576 			next->root = root;
2577 			list_add_tail(&next->list,
2578 				      &rc->backref_cache.changed);
2579 			__mark_block_processed(rc, next);
2580 			break;
2581 		}
2582 
2583 		WARN_ON(1);
2584 		root = NULL;
2585 		next = walk_down_backref(edges, &index);
2586 		if (!next || next->level <= node->level)
2587 			break;
2588 	}
2589 	if (!root)
2590 		return NULL;
2591 
2592 	next = node;
2593 	/* setup backref node path for btrfs_reloc_cow_block */
2594 	while (1) {
2595 		rc->backref_cache.path[next->level] = next;
2596 		if (--index < 0)
2597 			break;
2598 		next = edges[index]->node[UPPER];
2599 	}
2600 	return root;
2601 }
2602 
2603 /*
2604  * select a tree root for relocation. return NULL if the block is
2605  * reference counted; we should use do_relocation() in that case.
2606  * return a tree root pointer if the block isn't reference counted,
2607  * or -ENOENT if the block is the root of a reloc tree.
2608  */
2609 static noinline_for_stack
2610 struct btrfs_root *select_one_root(struct backref_node *node)
2611 {
2612 	struct backref_node *next;
2613 	struct btrfs_root *root;
2614 	struct btrfs_root *fs_root = NULL;
2615 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2616 	int index = 0;
2617 
2618 	next = node;
2619 	while (1) {
2620 		cond_resched();
2621 		next = walk_up_backref(next, edges, &index);
2622 		root = next->root;
2623 		BUG_ON(!root);
2624 
2625 		/* no other choice for a non-reference counted tree */
2626 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2627 			return root;
2628 
2629 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2630 			fs_root = root;
2631 
2632 		if (next != node)
2633 			return NULL;
2634 
2635 		next = walk_down_backref(edges, &index);
2636 		if (!next || next->level <= node->level)
2637 			break;
2638 	}
2639 
2640 	if (!fs_root)
2641 		return ERR_PTR(-ENOENT);
2642 	return fs_root;
2643 }
2644 
2645 static noinline_for_stack
2646 u64 calcu_metadata_size(struct reloc_control *rc,
2647 			struct backref_node *node, int reserve)
2648 {
2649 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2650 	struct backref_node *next = node;
2651 	struct backref_edge *edge;
2652 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2653 	u64 num_bytes = 0;
2654 	int index = 0;
2655 
2656 	BUG_ON(reserve && node->processed);
2657 
2658 	while (next) {
2659 		cond_resched();
2660 		while (1) {
2661 			if (next->processed && (reserve || next != node))
2662 				break;
2663 
2664 			num_bytes += fs_info->nodesize;
2665 
2666 			if (list_empty(&next->upper))
2667 				break;
2668 
2669 			edge = list_entry(next->upper.next,
2670 					  struct backref_edge, list[LOWER]);
2671 			edges[index++] = edge;
2672 			next = edge->node[UPPER];
2673 		}
2674 		next = walk_down_backref(edges, &index);
2675 	}
2676 	return num_bytes;
2677 }
2678 
2679 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2680 				  struct reloc_control *rc,
2681 				  struct backref_node *node)
2682 {
2683 	struct btrfs_root *root = rc->extent_root;
2684 	struct btrfs_fs_info *fs_info = root->fs_info;
2685 	u64 num_bytes;
2686 	int ret;
2687 	u64 tmp;
2688 
2689 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2690 
2691 	trans->block_rsv = rc->block_rsv;
2692 	rc->reserved_bytes += num_bytes;
2693 
2694 	/*
2695 	 * We are under a transaction here so we can only do limited flushing.
2696 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2697 	 * transaction and try to refill when we can flush all the things.
2698 	 */
2699 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2700 				BTRFS_RESERVE_FLUSH_LIMIT);
2701 	if (ret) {
2702 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2703 		while (tmp <= rc->reserved_bytes)
2704 			tmp <<= 1;
2705 		/*
2706 		 * only one thread can access block_rsv at this point,
2707 		 * so we don't need to hold a lock to protect it.
2708 		 * we expand the reservation size here to allow enough
2709 		 * space for relocation, and we will return early in
2710 		 * the enospc case.
2711 		 */
2712 		rc->block_rsv->size = tmp + fs_info->nodesize *
2713 				      RELOCATION_RESERVED_NODES;
2714 		return -EAGAIN;
2715 	}
2716 
2717 	return 0;
2718 }
2719 
2720 /*
2721  * relocate a tree block, and then update pointers in upper level
2722  * blocks that reference the block to point to the new location.
2723  *
2724  * if called by link_to_upper, the block has already been relocated.
2725  * in that case this function just updates pointers.
2726  */
2727 static int do_relocation(struct btrfs_trans_handle *trans,
2728 			 struct reloc_control *rc,
2729 			 struct backref_node *node,
2730 			 struct btrfs_key *key,
2731 			 struct btrfs_path *path, int lowest)
2732 {
2733 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2734 	struct backref_node *upper;
2735 	struct backref_edge *edge;
2736 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2737 	struct btrfs_root *root;
2738 	struct extent_buffer *eb;
2739 	u32 blocksize;
2740 	u64 bytenr;
2741 	u64 generation;
2742 	int slot;
2743 	int ret;
2744 	int err = 0;
2745 
2746 	BUG_ON(lowest && node->eb);
2747 
2748 	path->lowest_level = node->level + 1;
2749 	rc->backref_cache.path[node->level] = node;
2750 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2751 		struct btrfs_key first_key;
2752 		struct btrfs_ref ref = { 0 };
2753 
2754 		cond_resched();
2755 
2756 		upper = edge->node[UPPER];
2757 		root = select_reloc_root(trans, rc, upper, edges);
2758 		BUG_ON(!root);
2759 
2760 		if (upper->eb && !upper->locked) {
2761 			if (!lowest) {
2762 				ret = btrfs_bin_search(upper->eb, key,
2763 						       upper->level, &slot);
2764 				if (ret < 0) {
2765 					err = ret;
2766 					goto next;
2767 				}
2768 				BUG_ON(ret);
2769 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2770 				if (node->eb->start == bytenr)
2771 					goto next;
2772 			}
2773 			drop_node_buffer(upper);
2774 		}
2775 
2776 		if (!upper->eb) {
2777 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2778 			if (ret) {
2779 				if (ret < 0)
2780 					err = ret;
2781 				else
2782 					err = -ENOENT;
2783 
2784 				btrfs_release_path(path);
2785 				break;
2786 			}
2787 
2788 			if (!upper->eb) {
2789 				upper->eb = path->nodes[upper->level];
2790 				path->nodes[upper->level] = NULL;
2791 			} else {
2792 				BUG_ON(upper->eb != path->nodes[upper->level]);
2793 			}
2794 
2795 			upper->locked = 1;
2796 			path->locks[upper->level] = 0;
2797 
2798 			slot = path->slots[upper->level];
2799 			btrfs_release_path(path);
2800 		} else {
2801 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2802 					       &slot);
2803 			if (ret < 0) {
2804 				err = ret;
2805 				goto next;
2806 			}
2807 			BUG_ON(ret);
2808 		}
2809 
2810 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2811 		if (lowest) {
2812 			if (bytenr != node->bytenr) {
2813 				btrfs_err(root->fs_info,
2814 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2815 					  bytenr, node->bytenr, slot,
2816 					  upper->eb->start);
2817 				err = -EIO;
2818 				goto next;
2819 			}
2820 		} else {
2821 			if (node->eb->start == bytenr)
2822 				goto next;
2823 		}
2824 
2825 		blocksize = root->fs_info->nodesize;
2826 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2827 		btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2828 		eb = read_tree_block(fs_info, bytenr, generation,
2829 				     upper->level - 1, &first_key);
2830 		if (IS_ERR(eb)) {
2831 			err = PTR_ERR(eb);
2832 			goto next;
2833 		} else if (!extent_buffer_uptodate(eb)) {
2834 			free_extent_buffer(eb);
2835 			err = -EIO;
2836 			goto next;
2837 		}
2838 		btrfs_tree_lock(eb);
2839 		btrfs_set_lock_blocking_write(eb);
2840 
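		/*
		 * if the block hasn't been relocated yet (node->eb unset),
		 * COW it into place.  otherwise only repoint the upper
		 * level block: take a ref on the new block for the new
		 * parent and drop the old subtree.
		 */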
2841 		if (!node->eb) {
2842 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2843 					      slot, &eb);
2844 			btrfs_tree_unlock(eb);
2845 			free_extent_buffer(eb);
2846 			if (ret < 0) {
2847 				err = ret;
2848 				goto next;
2849 			}
2850 			BUG_ON(node->eb != eb);
2851 		} else {
2852 			btrfs_set_node_blockptr(upper->eb, slot,
2853 						node->eb->start);
2854 			btrfs_set_node_ptr_generation(upper->eb, slot,
2855 						      trans->transid);
2856 			btrfs_mark_buffer_dirty(upper->eb);
2857 
2858 			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2859 					       node->eb->start, blocksize,
2860 					       upper->eb->start);
2861 			ref.real_root = root->root_key.objectid;
2862 			btrfs_init_tree_ref(&ref, node->level,
2863 					    btrfs_header_owner(upper->eb));
2864 			ret = btrfs_inc_extent_ref(trans, &ref);
2865 			BUG_ON(ret);
2866 
2867 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2868 			BUG_ON(ret);
2869 		}
2870 next:
2871 		if (!upper->pending)
2872 			drop_node_buffer(upper);
2873 		else
2874 			unlock_node_buffer(upper);
2875 		if (err)
2876 			break;
2877 	}
2878 
2879 	if (!err && node->pending) {
2880 		drop_node_buffer(node);
2881 		list_move_tail(&node->list, &rc->backref_cache.changed);
2882 		node->pending = 0;
2883 	}
2884 
2885 	path->lowest_level = 0;
2886 	BUG_ON(err == -ENOSPC);
2887 	return err;
2888 }
2889 
2890 static int link_to_upper(struct btrfs_trans_handle *trans,
2891 			 struct reloc_control *rc,
2892 			 struct backref_node *node,
2893 			 struct btrfs_path *path)
2894 {
2895 	struct btrfs_key key;
2896 
2897 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2898 	return do_relocation(trans, rc, node, &key, path, 0);
2899 }
2900 
2901 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2902 				struct reloc_control *rc,
2903 				struct btrfs_path *path, int err)
2904 {
2905 	LIST_HEAD(list);
2906 	struct backref_cache *cache = &rc->backref_cache;
2907 	struct backref_node *node;
2908 	int level;
2909 	int ret;
2910 
2911 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2912 		while (!list_empty(&cache->pending[level])) {
2913 			node = list_entry(cache->pending[level].next,
2914 					  struct backref_node, list);
2915 			list_move_tail(&node->list, &list);
2916 			BUG_ON(!node->pending);
2917 
2918 			if (!err) {
2919 				ret = link_to_upper(trans, rc, node, path);
2920 				if (ret < 0)
2921 					err = ret;
2922 			}
2923 		}
2924 		list_splice_init(&list, &cache->pending[level]);
2925 	}
2926 	return err;
2927 }
2928 
2929 static void mark_block_processed(struct reloc_control *rc,
2930 				 u64 bytenr, u32 blocksize)
2931 {
2932 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2933 			EXTENT_DIRTY);
2934 }
2935 
2936 static void __mark_block_processed(struct reloc_control *rc,
2937 				   struct backref_node *node)
2938 {
2939 	u32 blocksize;
2940 	if (node->level == 0 ||
2941 	    in_block_group(node->bytenr, rc->block_group)) {
2942 		blocksize = rc->extent_root->fs_info->nodesize;
2943 		mark_block_processed(rc, node->bytenr, blocksize);
2944 	}
2945 	node->processed = 1;
2946 }
2947 
2948 /*
2949  * mark a block and all blocks that directly/indirectly reference it
2950  * as processed.
2951  */
2952 static void update_processed_blocks(struct reloc_control *rc,
2953 				    struct backref_node *node)
2954 {
2955 	struct backref_node *next = node;
2956 	struct backref_edge *edge;
2957 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2958 	int index = 0;
2959 
2960 	while (next) {
2961 		cond_resched();
2962 		while (1) {
2963 			if (next->processed)
2964 				break;
2965 
2966 			__mark_block_processed(rc, next);
2967 
2968 			if (list_empty(&next->upper))
2969 				break;
2970 
2971 			edge = list_entry(next->upper.next,
2972 					  struct backref_edge, list[LOWER]);
2973 			edges[index++] = edge;
2974 			next = edge->node[UPPER];
2975 		}
2976 		next = walk_down_backref(edges, &index);
2977 	}
2978 }
2979 
2980 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2981 {
2982 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2983 
2984 	if (test_range_bit(&rc->processed_blocks, bytenr,
2985 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2986 		return 1;
2987 	return 0;
2988 }
2989 
2990 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2991 			      struct tree_block *block)
2992 {
2993 	struct extent_buffer *eb;
2994 
2995 	BUG_ON(block->key_ready);
2996 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
2997 			     block->level, NULL);
2998 	if (IS_ERR(eb)) {
2999 		return PTR_ERR(eb);
3000 	} else if (!extent_buffer_uptodate(eb)) {
3001 		free_extent_buffer(eb);
3002 		return -EIO;
3003 	}
3004 	if (block->level == 0)
3005 		btrfs_item_key_to_cpu(eb, &block->key, 0);
3006 	else
3007 		btrfs_node_key_to_cpu(eb, &block->key, 0);
3008 	free_extent_buffer(eb);
3009 	block->key_ready = 1;
3010 	return 0;
3011 }
3012 
3013 /*
3014  * helper function to relocate a tree block
3015  */
3016 static int relocate_tree_block(struct btrfs_trans_handle *trans,
3017 				struct reloc_control *rc,
3018 				struct backref_node *node,
3019 				struct btrfs_key *key,
3020 				struct btrfs_path *path)
3021 {
3022 	struct btrfs_root *root;
3023 	int ret = 0;
3024 
3025 	if (!node)
3026 		return 0;
3027 
3028 	BUG_ON(node->processed);
3029 	root = select_one_root(node);
3030 	if (root == ERR_PTR(-ENOENT)) {
3031 		update_processed_blocks(rc, node);
3032 		goto out;
3033 	}
3034 
3035 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3036 		ret = reserve_metadata_space(trans, rc, node);
3037 		if (ret)
3038 			goto out;
3039 	}
3040 
3041 	if (root) {
3042 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3043 			BUG_ON(node->new_bytenr);
3044 			BUG_ON(!list_empty(&node->list));
3045 			btrfs_record_root_in_trans(trans, root);
3046 			root = root->reloc_root;
3047 			node->new_bytenr = root->node->start;
3048 			node->root = root;
3049 			list_add_tail(&node->list, &rc->backref_cache.changed);
3050 		} else {
3051 			path->lowest_level = node->level;
3052 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3053 			btrfs_release_path(path);
3054 			if (ret > 0)
3055 				ret = 0;
3056 		}
3057 		if (!ret)
3058 			update_processed_blocks(rc, node);
3059 	} else {
3060 		ret = do_relocation(trans, rc, node, key, path, 1);
3061 	}
3062 out:
3063 	if (ret || node->level == 0 || node->cowonly)
3064 		remove_backref_node(&rc->backref_cache, node);
3065 	return ret;
3066 }
3067 
3068 /*
3069  * relocate a list of blocks
3070  */
3071 static noinline_for_stack
3072 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3073 			 struct reloc_control *rc, struct rb_root *blocks)
3074 {
3075 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3076 	struct backref_node *node;
3077 	struct btrfs_path *path;
3078 	struct tree_block *block;
3079 	struct tree_block *next;
3080 	int ret;
3081 	int err = 0;
3082 
3083 	path = btrfs_alloc_path();
3084 	if (!path) {
3085 		err = -ENOMEM;
3086 		goto out_free_blocks;
3087 	}
3088 
3089 	/* Kick in readahead for tree blocks with missing keys */
3090 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3091 		if (!block->key_ready)
3092 			readahead_tree_block(fs_info, block->bytenr);
3093 	}
3094 
3095 	/* Get first keys */
3096 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3097 		if (!block->key_ready) {
3098 			err = get_tree_block_key(fs_info, block);
3099 			if (err)
3100 				goto out_free_path;
3101 		}
3102 	}
3103 
3104 	/* Do tree relocation */
3105 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3106 		node = build_backref_tree(rc, &block->key,
3107 					  block->level, block->bytenr);
3108 		if (IS_ERR(node)) {
3109 			err = PTR_ERR(node);
3110 			goto out;
3111 		}
3112 
3113 		ret = relocate_tree_block(trans, rc, node, &block->key,
3114 					  path);
3115 		if (ret < 0) {
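			/*
			 * only swallow -EAGAIN if some earlier block was
			 * already relocated; for the very first block no
			 * progress was made and the error must be returned
			 */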
3116 			if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
3117 				err = ret;
3118 			goto out;
3119 		}
3120 	}
3121 out:
3122 	err = finish_pending_nodes(trans, rc, path, err);
3123 
3124 out_free_path:
3125 	btrfs_free_path(path);
3126 out_free_blocks:
3127 	free_block_list(blocks);
3128 	return err;
3129 }
3130 
3131 static noinline_for_stack
3132 int prealloc_file_extent_cluster(struct inode *inode,
3133 				 struct file_extent_cluster *cluster)
3134 {
3135 	u64 alloc_hint = 0;
3136 	u64 start;
3137 	u64 end;
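	/*
	 * for the data relocation inode, index_cnt holds the start
	 * bytenr of the block group being relocated
	 */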
3138 	u64 offset = BTRFS_I(inode)->index_cnt;
3139 	u64 num_bytes;
3140 	int nr = 0;
3141 	int ret = 0;
3142 	u64 prealloc_start = cluster->start - offset;
3143 	u64 prealloc_end = cluster->end - offset;
3144 	u64 cur_offset;
3145 	struct extent_changeset *data_reserved = NULL;
3146 
3147 	BUG_ON(cluster->start != cluster->boundary[0]);
3148 	inode_lock(inode);
3149 
3150 	ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3151 					  prealloc_end + 1 - prealloc_start);
3152 	if (ret)
3153 		goto out;
3154 
3155 	cur_offset = prealloc_start;
3156 	while (nr < cluster->nr) {
3157 		start = cluster->boundary[nr] - offset;
3158 		if (nr + 1 < cluster->nr)
3159 			end = cluster->boundary[nr + 1] - 1 - offset;
3160 		else
3161 			end = cluster->end - offset;
3162 
3163 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3164 		num_bytes = end + 1 - start;
3165 		if (cur_offset < start)
3166 			btrfs_free_reserved_data_space(inode, data_reserved,
3167 					cur_offset, start - cur_offset);
3168 		ret = btrfs_prealloc_file_range(inode, 0, start,
3169 						num_bytes, num_bytes,
3170 						end + 1, &alloc_hint);
3171 		cur_offset = end + 1;
3172 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3173 		if (ret)
3174 			break;
3175 		nr++;
3176 	}
3177 	if (cur_offset < prealloc_end)
3178 		btrfs_free_reserved_data_space(inode, data_reserved,
3179 				cur_offset, prealloc_end + 1 - cur_offset);
3180 out:
3181 	inode_unlock(inode);
3182 	extent_changeset_free(data_reserved);
3183 	return ret;
3184 }
3185 
3186 static noinline_for_stack
3187 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3188 			 u64 block_start)
3189 {
3190 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3191 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3192 	struct extent_map *em;
3193 	int ret = 0;
3194 
3195 	em = alloc_extent_map();
3196 	if (!em)
3197 		return -ENOMEM;
3198 
3199 	em->start = start;
3200 	em->len = end + 1 - start;
3201 	em->block_len = em->len;
3202 	em->block_start = block_start;
3203 	em->bdev = fs_info->fs_devices->latest_bdev;
3204 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3205 
3206 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
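	/*
	 * if an old extent map overlaps the new range, drop the cached
	 * extents and retry the insertion
	 */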
3207 	while (1) {
3208 		write_lock(&em_tree->lock);
3209 		ret = add_extent_mapping(em_tree, em, 0);
3210 		write_unlock(&em_tree->lock);
3211 		if (ret != -EEXIST) {
3212 			free_extent_map(em);
3213 			break;
3214 		}
3215 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3216 	}
3217 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3218 	return ret;
3219 }
3220 
3221 static int relocate_file_extent_cluster(struct inode *inode,
3222 					struct file_extent_cluster *cluster)
3223 {
3224 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3225 	u64 page_start;
3226 	u64 page_end;
3227 	u64 offset = BTRFS_I(inode)->index_cnt;
3228 	unsigned long index;
3229 	unsigned long last_index;
3230 	struct page *page;
3231 	struct file_ra_state *ra;
3232 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3233 	int nr = 0;
3234 	int ret = 0;
3235 
3236 	if (!cluster->nr)
3237 		return 0;
3238 
3239 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3240 	if (!ra)
3241 		return -ENOMEM;
3242 
3243 	ret = prealloc_file_extent_cluster(inode, cluster);
3244 	if (ret)
3245 		goto out;
3246 
3247 	file_ra_state_init(ra, inode->i_mapping);
3248 
3249 	ret = setup_extent_mapping(inode, cluster->start - offset,
3250 				   cluster->end - offset, cluster->start);
3251 	if (ret)
3252 		goto out;
3253 
3254 	index = (cluster->start - offset) >> PAGE_SHIFT;
3255 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3256 	while (index <= last_index) {
3257 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3258 				PAGE_SIZE);
3259 		if (ret)
3260 			goto out;
3261 
3262 		page = find_lock_page(inode->i_mapping, index);
3263 		if (!page) {
3264 			page_cache_sync_readahead(inode->i_mapping,
3265 						  ra, NULL, index,
3266 						  last_index + 1 - index);
3267 			page = find_or_create_page(inode->i_mapping, index,
3268 						   mask);
3269 			if (!page) {
3270 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3271 							PAGE_SIZE, true);
3272 				ret = -ENOMEM;
3273 				goto out;
3274 			}
3275 		}
3276 
3277 		if (PageReadahead(page)) {
3278 			page_cache_async_readahead(inode->i_mapping,
3279 						   ra, NULL, page, index,
3280 						   last_index + 1 - index);
3281 		}
3282 
3283 		if (!PageUptodate(page)) {
3284 			btrfs_readpage(NULL, page);
3285 			lock_page(page);
3286 			if (!PageUptodate(page)) {
3287 				unlock_page(page);
3288 				put_page(page);
3289 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3290 							PAGE_SIZE, true);
3291 				btrfs_delalloc_release_extents(BTRFS_I(inode),
3292 							       PAGE_SIZE, true);
3293 				ret = -EIO;
3294 				goto out;
3295 			}
3296 		}
3297 
3298 		page_start = page_offset(page);
3299 		page_end = page_start + PAGE_SIZE - 1;
3300 
3301 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3302 
3303 		set_page_extent_mapped(page);
3304 
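		/*
		 * mark the first page of each extent in the cluster so
		 * writeback keeps the original extent boundaries instead
		 * of merging the whole cluster into one extent
		 */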
3305 		if (nr < cluster->nr &&
3306 		    page_start + offset == cluster->boundary[nr]) {
3307 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3308 					page_start, page_end,
3309 					EXTENT_BOUNDARY);
3310 			nr++;
3311 		}
3312 
3313 		ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
3314 						NULL, 0);
3315 		if (ret) {
3316 			unlock_page(page);
3317 			put_page(page);
3318 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
3319 							 PAGE_SIZE, true);
3320 			btrfs_delalloc_release_extents(BTRFS_I(inode),
3321 						       PAGE_SIZE, true);
3322 
3323 			clear_extent_bits(&BTRFS_I(inode)->io_tree,
3324 					  page_start, page_end,
3325 					  EXTENT_LOCKED | EXTENT_BOUNDARY);
3326 			goto out;
3328 		}
3329 		set_page_dirty(page);
3330 
3331 		unlock_extent(&BTRFS_I(inode)->io_tree,
3332 			      page_start, page_end);
3333 		unlock_page(page);
3334 		put_page(page);
3335 
3336 		index++;
3337 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
3338 					       false);
3339 		balance_dirty_pages_ratelimited(inode->i_mapping);
3340 		btrfs_throttle(fs_info);
3341 	}
3342 	WARN_ON(nr != cluster->nr);
3343 out:
3344 	kfree(ra);
3345 	return ret;
3346 }
3347 
3348 static noinline_for_stack
3349 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3350 			 struct file_extent_cluster *cluster)
3351 {
3352 	int ret;
3353 
3354 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3355 		ret = relocate_file_extent_cluster(inode, cluster);
3356 		if (ret)
3357 			return ret;
3358 		cluster->nr = 0;
3359 	}
3360 
3361 	if (!cluster->nr)
3362 		cluster->start = extent_key->objectid;
3363 	else
3364 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3365 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3366 	cluster->boundary[cluster->nr] = extent_key->objectid;
3367 	cluster->nr++;
3368 
3369 	if (cluster->nr >= MAX_EXTENTS) {
3370 		ret = relocate_file_extent_cluster(inode, cluster);
3371 		if (ret)
3372 			return ret;
3373 		cluster->nr = 0;
3374 	}
3375 	return 0;
3376 }
3377 
3378 /*
3379  * helper to add a tree block to the list.
3380  * the major work is getting the generation and level of the block
3381  */
3382 static int add_tree_block(struct reloc_control *rc,
3383 			  struct btrfs_key *extent_key,
3384 			  struct btrfs_path *path,
3385 			  struct rb_root *blocks)
3386 {
3387 	struct extent_buffer *eb;
3388 	struct btrfs_extent_item *ei;
3389 	struct btrfs_tree_block_info *bi;
3390 	struct tree_block *block;
3391 	struct rb_node *rb_node;
3392 	u32 item_size;
3393 	int level = -1;
3394 	u64 generation;
3395 
3396 	eb = path->nodes[0];
3397 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3398 
3399 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3400 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3401 		ei = btrfs_item_ptr(eb, path->slots[0],
3402 				struct btrfs_extent_item);
3403 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3404 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3405 			level = btrfs_tree_block_level(eb, bi);
3406 		} else {
3407 			level = (int)extent_key->offset;
3408 		}
3409 		generation = btrfs_extent_generation(eb, ei);
3410 	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3411 		btrfs_print_v0_err(eb->fs_info);
3412 		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3413 		return -EINVAL;
3414 	} else {
3415 		BUG();
3416 	}
3417 
3418 	btrfs_release_path(path);
3419 
3420 	BUG_ON(level == -1);
3421 
3422 	block = kmalloc(sizeof(*block), GFP_NOFS);
3423 	if (!block)
3424 		return -ENOMEM;
3425 
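	/*
	 * block->key is not the real key yet: the generation is stashed
	 * in key.offset so get_tree_block_key() can read and verify the
	 * block later
	 */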
3426 	block->bytenr = extent_key->objectid;
3427 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3428 	block->key.offset = generation;
3429 	block->level = level;
3430 	block->key_ready = 0;
3431 
3432 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3433 	if (rb_node)
3434 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3435 
3436 	return 0;
3437 }
3438 
3439 /*
3440  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3441  */
3442 static int __add_tree_block(struct reloc_control *rc,
3443 			    u64 bytenr, u32 blocksize,
3444 			    struct rb_root *blocks)
3445 {
3446 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3447 	struct btrfs_path *path;
3448 	struct btrfs_key key;
3449 	int ret;
3450 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3451 
3452 	if (tree_block_processed(bytenr, rc))
3453 		return 0;
3454 
3455 	if (tree_search(blocks, bytenr))
3456 		return 0;
3457 
3458 	path = btrfs_alloc_path();
3459 	if (!path)
3460 		return -ENOMEM;
3461 again:
3462 	key.objectid = bytenr;
3463 	if (skinny) {
3464 		key.type = BTRFS_METADATA_ITEM_KEY;
3465 		key.offset = (u64)-1;
3466 	} else {
3467 		key.type = BTRFS_EXTENT_ITEM_KEY;
3468 		key.offset = blocksize;
3469 	}
3470 
3471 	path->search_commit_root = 1;
3472 	path->skip_locking = 1;
3473 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3474 	if (ret < 0)
3475 		goto out;
3476 
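	/*
	 * a skinny metadata lookup with offset (u64)-1 lands just past
	 * the item, step back one slot and check the key; if the item
	 * is missing, retry with the full extent item key
	 */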
3477 	if (ret > 0 && skinny) {
3478 		if (path->slots[0]) {
3479 			path->slots[0]--;
3480 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3481 					      path->slots[0]);
3482 			if (key.objectid == bytenr &&
3483 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3484 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3485 			      key.offset == blocksize)))
3486 				ret = 0;
3487 		}
3488 
3489 		if (ret) {
3490 			skinny = false;
3491 			btrfs_release_path(path);
3492 			goto again;
3493 		}
3494 	}
3495 	if (ret) {
3496 		ASSERT(ret == 1);
3497 		btrfs_print_leaf(path->nodes[0]);
3498 		btrfs_err(fs_info,
3499 	     "tree block extent item (%llu) is not found in extent tree",
3500 		     bytenr);
3501 		WARN_ON(1);
3502 		ret = -EINVAL;
3503 		goto out;
3504 	}
3505 
3506 	ret = add_tree_block(rc, &key, path, blocks);
3507 out:
3508 	btrfs_free_path(path);
3509 	return ret;
3510 }
3511 
3512 /*
3513  * helper to check if the block uses full backrefs for the pointers in it
3514  */
3515 static int block_use_full_backref(struct reloc_control *rc,
3516 				  struct extent_buffer *eb)
3517 {
3518 	u64 flags;
3519 	int ret;
3520 
3521 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3522 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3523 		return 1;
3524 
3525 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3526 				       eb->start, btrfs_header_level(eb), 1,
3527 				       NULL, &flags);
3528 	BUG_ON(ret);
3529 
3530 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3531 		ret = 1;
3532 	else
3533 		ret = 0;
3534 	return ret;
3535 }
3536 
3537 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3538 				    struct btrfs_block_group_cache *block_group,
3539 				    struct inode *inode,
3540 				    u64 ino)
3541 {
3542 	struct btrfs_key key;
3543 	struct btrfs_root *root = fs_info->tree_root;
3544 	struct btrfs_trans_handle *trans;
3545 	int ret = 0;
3546 
3547 	if (inode)
3548 		goto truncate;
3549 
3550 	key.objectid = ino;
3551 	key.type = BTRFS_INODE_ITEM_KEY;
3552 	key.offset = 0;
3553 
3554 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3555 	if (IS_ERR(inode))
3556 		return -ENOENT;
3557 
3558 truncate:
3559 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3560 						 &fs_info->global_block_rsv);
3561 	if (ret)
3562 		goto out;
3563 
3564 	trans = btrfs_join_transaction(root);
3565 	if (IS_ERR(trans)) {
3566 		ret = PTR_ERR(trans);
3567 		goto out;
3568 	}
3569 
3570 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3571 
3572 	btrfs_end_transaction(trans);
3573 	btrfs_btree_balance_dirty(fs_info);
3574 out:
3575 	iput(inode);
3576 	return ret;
3577 }
3578 
3579 /*
3580  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3581  * this function scans the fs tree to find blocks that reference the data extent
3582  */
3583 static int find_data_references(struct reloc_control *rc,
3584 				struct btrfs_key *extent_key,
3585 				struct extent_buffer *leaf,
3586 				struct btrfs_extent_data_ref *ref,
3587 				struct rb_root *blocks)
3588 {
3589 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3590 	struct btrfs_path *path;
3591 	struct tree_block *block;
3592 	struct btrfs_root *root;
3593 	struct btrfs_file_extent_item *fi;
3594 	struct rb_node *rb_node;
3595 	struct btrfs_key key;
3596 	u64 ref_root;
3597 	u64 ref_objectid;
3598 	u64 ref_offset;
3599 	u32 ref_count;
3600 	u32 nritems;
3601 	int err = 0;
3602 	int added = 0;
3603 	int counted;
3604 	int ret;
3605 
3606 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3607 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3608 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3609 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3610 
3611 	/*
3612 	 * This is an extent belonging to the free space cache, let's just delete
3613 	 * it and redo the search.
3614 	 */
3615 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3616 		ret = delete_block_group_cache(fs_info, rc->block_group,
3617 					       NULL, ref_objectid);
3618 		if (ret != -ENOENT)
3619 			return ret;
3620 		ret = 0;
3621 	}
3622 
3623 	path = btrfs_alloc_path();
3624 	if (!path)
3625 		return -ENOMEM;
3626 	path->reada = READA_FORWARD;
3627 
3628 	root = read_fs_root(fs_info, ref_root);
3629 	if (IS_ERR(root)) {
3630 		err = PTR_ERR(root);
3631 		goto out;
3632 	}
3633 
3634 	key.objectid = ref_objectid;
3635 	key.type = BTRFS_EXTENT_DATA_KEY;
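	/*
	 * an absurdly large ref_offset is presumably the result of an
	 * underflow, start the search from offset 0 in that case
	 */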
3636 	if (ref_offset > ((u64)-1 << 32))
3637 		key.offset = 0;
3638 	else
3639 		key.offset = ref_offset;
3640 
3641 	path->search_commit_root = 1;
3642 	path->skip_locking = 1;
3643 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3644 	if (ret < 0) {
3645 		err = ret;
3646 		goto out;
3647 	}
3648 
3649 	leaf = path->nodes[0];
3650 	nritems = btrfs_header_nritems(leaf);
3651 	/*
3652 	 * references from tree blocks that use full backrefs
3653 	 * are not counted against ref_count
3654 	 */
3655 	if (block_use_full_backref(rc, leaf))
3656 		counted = 0;
3657 	else
3658 		counted = 1;
3659 	rb_node = tree_search(blocks, leaf->start);
3660 	if (rb_node) {
3661 		if (counted)
3662 			added = 1;
3663 		else
3664 			path->slots[0] = nritems;
3665 	}
3666 
3667 	while (ref_count > 0) {
3668 		while (path->slots[0] >= nritems) {
3669 			ret = btrfs_next_leaf(root, path);
3670 			if (ret < 0) {
3671 				err = ret;
3672 				goto out;
3673 			}
3674 			if (WARN_ON(ret > 0))
3675 				goto out;
3676 
3677 			leaf = path->nodes[0];
3678 			nritems = btrfs_header_nritems(leaf);
3679 			added = 0;
3680 
3681 			if (block_use_full_backref(rc, leaf))
3682 				counted = 0;
3683 			else
3684 				counted = 1;
3685 			rb_node = tree_search(blocks, leaf->start);
3686 			if (rb_node) {
3687 				if (counted)
3688 					added = 1;
3689 				else
3690 					path->slots[0] = nritems;
3691 			}
3692 		}
3693 
3694 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3695 		if (WARN_ON(key.objectid != ref_objectid ||
3696 		    key.type != BTRFS_EXTENT_DATA_KEY))
3697 			break;
3698 
3699 		fi = btrfs_item_ptr(leaf, path->slots[0],
3700 				    struct btrfs_file_extent_item);
3701 
3702 		if (btrfs_file_extent_type(leaf, fi) ==
3703 		    BTRFS_FILE_EXTENT_INLINE)
3704 			goto next;
3705 
3706 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3707 		    extent_key->objectid)
3708 			goto next;
3709 
3710 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3711 		if (key.offset != ref_offset)
3712 			goto next;
3713 
3714 		if (counted)
3715 			ref_count--;
3716 		if (added)
3717 			goto next;
3718 
3719 		if (!tree_block_processed(leaf->start, rc)) {
3720 			block = kmalloc(sizeof(*block), GFP_NOFS);
3721 			if (!block) {
3722 				err = -ENOMEM;
3723 				break;
3724 			}
3725 			block->bytenr = leaf->start;
3726 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3727 			block->level = 0;
3728 			block->key_ready = 1;
3729 			rb_node = tree_insert(blocks, block->bytenr,
3730 					      &block->rb_node);
3731 			if (rb_node)
3732 				backref_tree_panic(rb_node, -EEXIST,
3733 						   block->bytenr);
3734 		}
3735 		if (counted)
3736 			added = 1;
3737 		else
3738 			path->slots[0] = nritems;
3739 next:
3740 		path->slots[0]++;
3742 	}
3743 out:
3744 	btrfs_free_path(path);
3745 	return err;
3746 }
3747 
3748 /*
3749  * helper to find all tree blocks that reference a given data extent
3750  */
3751 static noinline_for_stack
3752 int add_data_references(struct reloc_control *rc,
3753 			struct btrfs_key *extent_key,
3754 			struct btrfs_path *path,
3755 			struct rb_root *blocks)
3756 {
3757 	struct btrfs_key key;
3758 	struct extent_buffer *eb;
3759 	struct btrfs_extent_data_ref *dref;
3760 	struct btrfs_extent_inline_ref *iref;
3761 	unsigned long ptr;
3762 	unsigned long end;
3763 	u32 blocksize = rc->extent_root->fs_info->nodesize;
3764 	int ret = 0;
3765 	int err = 0;
3766 
3767 	eb = path->nodes[0];
3768 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3769 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3770 	ptr += sizeof(struct btrfs_extent_item);
3771 
3772 	while (ptr < end) {
3773 		iref = (struct btrfs_extent_inline_ref *)ptr;
3774 		key.type = btrfs_get_extent_inline_ref_type(eb, iref,
3775 							BTRFS_REF_TYPE_DATA);
3776 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3777 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3778 			ret = __add_tree_block(rc, key.offset, blocksize,
3779 					       blocks);
3780 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3781 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3782 			ret = find_data_references(rc, extent_key,
3783 						   eb, dref, blocks);
3784 		} else {
3785 			ret = -EUCLEAN;
3786 			btrfs_err(rc->extent_root->fs_info,
3787 		     "extent %llu slot %d has an invalid inline ref type",
3788 			     eb->start, path->slots[0]);
3789 		}
3790 		if (ret) {
3791 			err = ret;
3792 			goto out;
3793 		}
3794 		ptr += btrfs_extent_inline_ref_size(key.type);
3795 	}
3796 	WARN_ON(ptr > end);
3797 
3798 	while (1) {
3799 		cond_resched();
3800 		eb = path->nodes[0];
3801 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3802 			ret = btrfs_next_leaf(rc->extent_root, path);
3803 			if (ret < 0) {
3804 				err = ret;
3805 				break;
3806 			}
3807 			if (ret > 0)
3808 				break;
3809 			eb = path->nodes[0];
3810 		}
3811 
3812 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3813 		if (key.objectid != extent_key->objectid)
3814 			break;
3815 
3816 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3817 			ret = __add_tree_block(rc, key.offset, blocksize,
3818 					       blocks);
3819 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3820 			dref = btrfs_item_ptr(eb, path->slots[0],
3821 					      struct btrfs_extent_data_ref);
3822 			ret = find_data_references(rc, extent_key,
3823 						   eb, dref, blocks);
3824 		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
3825 			btrfs_print_v0_err(eb->fs_info);
3826 			btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3827 			ret = -EINVAL;
3828 		} else {
3829 			ret = 0;
3830 		}
3831 		if (ret) {
3832 			err = ret;
3833 			break;
3834 		}
3835 		path->slots[0]++;
3836 	}
3837 out:
3838 	btrfs_release_path(path);
3839 	if (err)
3840 		free_block_list(blocks);
3841 	return err;
3842 }
3843 
3844 /*
3845  * helper to find the next unprocessed extent
3846  */
3847 static noinline_for_stack
3848 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3849 		     struct btrfs_key *extent_key)
3850 {
3851 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3852 	struct btrfs_key key;
3853 	struct extent_buffer *leaf;
3854 	u64 start, end, last;
3855 	int ret;
3856 
3857 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3858 	while (1) {
3859 		cond_resched();
3860 		if (rc->search_start >= last) {
3861 			ret = 1;
3862 			break;
3863 		}
3864 
3865 		key.objectid = rc->search_start;
3866 		key.type = BTRFS_EXTENT_ITEM_KEY;
3867 		key.offset = 0;
3868 
3869 		path->search_commit_root = 1;
3870 		path->skip_locking = 1;
3871 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3872 					0, 0);
3873 		if (ret < 0)
3874 			break;
3875 next:
3876 		leaf = path->nodes[0];
3877 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3878 			ret = btrfs_next_leaf(rc->extent_root, path);
3879 			if (ret != 0)
3880 				break;
3881 			leaf = path->nodes[0];
3882 		}
3883 
3884 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3885 		if (key.objectid >= last) {
3886 			ret = 1;
3887 			break;
3888 		}
3889 
3890 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3891 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3892 			path->slots[0]++;
3893 			goto next;
3894 		}
3895 
3896 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3897 		    key.objectid + key.offset <= rc->search_start) {
3898 			path->slots[0]++;
3899 			goto next;
3900 		}
3901 
3902 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3903 		    key.objectid + fs_info->nodesize <=
3904 		    rc->search_start) {
3905 			path->slots[0]++;
3906 			goto next;
3907 		}
3908 
3909 		ret = find_first_extent_bit(&rc->processed_blocks,
3910 					    key.objectid, &start, &end,
3911 					    EXTENT_DIRTY, NULL);
3912 
3913 		if (ret == 0 && start <= key.objectid) {
3914 			btrfs_release_path(path);
3915 			rc->search_start = end + 1;
3916 		} else {
3917 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3918 				rc->search_start = key.objectid + key.offset;
3919 			else
3920 				rc->search_start = key.objectid +
3921 					fs_info->nodesize;
3922 			memcpy(extent_key, &key, sizeof(key));
3923 			return 0;
3924 		}
3925 	}
3926 	btrfs_release_path(path);
3927 	return ret;
3928 }
3929 
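/*
 * publish the reloc control in fs_info so hooks such as
 * btrfs_reloc_cow_block() can see the relocation in progress
 */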
3930 static void set_reloc_control(struct reloc_control *rc)
3931 {
3932 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3933 
3934 	mutex_lock(&fs_info->reloc_mutex);
3935 	fs_info->reloc_ctl = rc;
3936 	mutex_unlock(&fs_info->reloc_mutex);
3937 }
3938 
3939 static void unset_reloc_control(struct reloc_control *rc)
3940 {
3941 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3942 
3943 	mutex_lock(&fs_info->reloc_mutex);
3944 	fs_info->reloc_ctl = NULL;
3945 	mutex_unlock(&fs_info->reloc_mutex);
3946 }
3947 
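/*
 * sanity check the extent item flags: exactly one of DATA and TREE_BLOCK
 * must be set, and FULL_BACKREF is only valid for tree blocks.
 * Returns 1 on an invalid combination, 0 otherwise.
 */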
3948 static int check_extent_flags(u64 flags)
3949 {
3950 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3951 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3952 		return 1;
3953 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3954 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3955 		return 1;
3956 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3957 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3958 		return 1;
3959 	return 0;
3960 }
3961 
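/*
 * set up everything needed before the relocation loop: allocate and fill
 * the temporary block reserve, reset the per-pass state, install the
 * reloc control and commit the current transaction
 */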
3962 static noinline_for_stack
3963 int prepare_to_relocate(struct reloc_control *rc)
3964 {
3965 	struct btrfs_trans_handle *trans;
3966 	int ret;
3967 
3968 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3969 					      BTRFS_BLOCK_RSV_TEMP);
3970 	if (!rc->block_rsv)
3971 		return -ENOMEM;
3972 
3973 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3974 	rc->search_start = rc->block_group->key.objectid;
3975 	rc->extents_found = 0;
3976 	rc->nodes_relocated = 0;
3977 	rc->merging_rsv_size = 0;
3978 	rc->reserved_bytes = 0;
3979 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3980 			      RELOCATION_RESERVED_NODES;
3981 	ret = btrfs_block_rsv_refill(rc->extent_root,
3982 				     rc->block_rsv, rc->block_rsv->size,
3983 				     BTRFS_RESERVE_FLUSH_ALL);
3984 	if (ret)
3985 		return ret;
3986 
3987 	rc->create_reloc_tree = 1;
3988 	set_reloc_control(rc);
3989 
3990 	trans = btrfs_join_transaction(rc->extent_root);
3991 	if (IS_ERR(trans)) {
3992 		unset_reloc_control(rc);
3993 		/*
3994 		 * extent tree is not a ref_cow tree and has no reloc_root to
3995 		 * cleanup.  And callers are responsible to free the above
3996 		 * block rsv.
3997 		 */
3998 		return PTR_ERR(trans);
3999 	}
4000 	btrfs_commit_transaction(trans);
4001 	return 0;
4002 }
4003 
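/*
 * the main relocation loop: walk the extent tree within the target block
 * group, collect the tree blocks and data extent references to process,
 * relocate them in batches, then merge the reloc trees back into the fs
 * trees and commit to get rid of pinned extents
 */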
4004 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4005 {
4006 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
4007 	struct rb_root blocks = RB_ROOT;
4008 	struct btrfs_key key;
4009 	struct btrfs_trans_handle *trans = NULL;
4010 	struct btrfs_path *path;
4011 	struct btrfs_extent_item *ei;
4012 	u64 flags;
4013 	u32 item_size;
4014 	int ret;
4015 	int err = 0;
4016 	int progress = 0;
4017 
4018 	path = btrfs_alloc_path();
4019 	if (!path)
4020 		return -ENOMEM;
4021 	path->reada = READA_FORWARD;
4022 
4023 	ret = prepare_to_relocate(rc);
4024 	if (ret) {
4025 		err = ret;
4026 		goto out_free;
4027 	}
4028 
4029 	while (1) {
4030 		rc->reserved_bytes = 0;
4031 		ret = btrfs_block_rsv_refill(rc->extent_root,
4032 					rc->block_rsv, rc->block_rsv->size,
4033 					BTRFS_RESERVE_FLUSH_ALL);
4034 		if (ret) {
4035 			err = ret;
4036 			break;
4037 		}
4038 		progress++;
4039 		trans = btrfs_start_transaction(rc->extent_root, 0);
4040 		if (IS_ERR(trans)) {
4041 			err = PTR_ERR(trans);
4042 			trans = NULL;
4043 			break;
4044 		}
4045 restart:
4046 		if (update_backref_cache(trans, &rc->backref_cache)) {
4047 			btrfs_end_transaction(trans);
4048 			trans = NULL;
4049 			continue;
4050 		}
4051 
4052 		ret = find_next_extent(rc, path, &key);
4053 		if (ret < 0)
4054 			err = ret;
4055 		if (ret != 0)
4056 			break;
4057 
4058 		rc->extents_found++;
4059 
4060 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4061 				    struct btrfs_extent_item);
4062 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4063 		if (item_size >= sizeof(*ei)) {
4064 			flags = btrfs_extent_flags(path->nodes[0], ei);
4065 			ret = check_extent_flags(flags);
4066 			BUG_ON(ret);
4067 		} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
4068 			err = -EINVAL;
4069 			btrfs_print_v0_err(trans->fs_info);
4070 			btrfs_abort_transaction(trans, err);
4071 			break;
4072 		} else {
4073 			BUG();
4074 		}
4075 
4076 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4077 			ret = add_tree_block(rc, &key, path, &blocks);
4078 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4079 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4080 			ret = add_data_references(rc, &key, path, &blocks);
4081 		} else {
4082 			btrfs_release_path(path);
4083 			ret = 0;
4084 		}
4085 		if (ret < 0) {
4086 			err = ret;
4087 			break;
4088 		}
4089 
4090 		if (!RB_EMPTY_ROOT(&blocks)) {
4091 			ret = relocate_tree_blocks(trans, rc, &blocks);
4092 			if (ret < 0) {
4093 				/*
4094 				 * if we fail to relocate tree blocks, force the
4095 				 * backref cache to be updated at transaction commit.
4096 				 */
4097 				rc->backref_cache.last_trans = trans->transid - 1;
4098 
4099 				if (ret != -EAGAIN) {
4100 					err = ret;
4101 					break;
4102 				}
4103 				rc->extents_found--;
4104 				rc->search_start = key.objectid;
4105 			}
4106 		}
4107 
4108 		btrfs_end_transaction_throttle(trans);
4109 		btrfs_btree_balance_dirty(fs_info);
4110 		trans = NULL;
4111 
4112 		if (rc->stage == MOVE_DATA_EXTENTS &&
4113 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4114 			rc->found_file_extent = 1;
4115 			ret = relocate_data_extent(rc->data_inode,
4116 						   &key, &rc->cluster);
4117 			if (ret < 0) {
4118 				err = ret;
4119 				break;
4120 			}
4121 		}
4122 	}
4123 	if (trans && progress && err == -ENOSPC) {
4124 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
4125 		if (ret == 1) {
4126 			err = 0;
4127 			progress = 0;
4128 			goto restart;
4129 		}
4130 	}
4131 
4132 	btrfs_release_path(path);
4133 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4134 
4135 	if (trans) {
4136 		btrfs_end_transaction_throttle(trans);
4137 		btrfs_btree_balance_dirty(fs_info);
4138 	}
4139 
4140 	if (!err) {
4141 		ret = relocate_file_extent_cluster(rc->data_inode,
4142 						   &rc->cluster);
4143 		if (ret < 0)
4144 			err = ret;
4145 	}
4146 
4147 	rc->create_reloc_tree = 0;
4148 	set_reloc_control(rc);
4149 
4150 	backref_cache_cleanup(&rc->backref_cache);
4151 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4152 
4153 	err = prepare_to_merge(rc, err);
4154 
4155 	merge_reloc_roots(rc);
4156 
4157 	rc->merge_reloc_tree = 0;
4158 	unset_reloc_control(rc);
4159 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4160 
4161 	/* get rid of pinned extents */
4162 	trans = btrfs_join_transaction(rc->extent_root);
4163 	if (IS_ERR(trans)) {
4164 		err = PTR_ERR(trans);
4165 		goto out_free;
4166 	}
4167 	btrfs_commit_transaction(trans);
4168 	ret = clean_dirty_subvols(rc);
4169 	if (ret < 0 && !err)
4170 		err = ret;
4171 out_free:
4172 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
4173 	btrfs_free_path(path);
4174 	return err;
4175 }
4176 
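/*
 * insert a bare inode item for the data relocation inode: a zero-size
 * regular file with a zero link count and NOCOMPRESS|PREALLOC flags
 */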
4177 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4178 				 struct btrfs_root *root, u64 objectid)
4179 {
4180 	struct btrfs_path *path;
4181 	struct btrfs_inode_item *item;
4182 	struct extent_buffer *leaf;
4183 	int ret;
4184 
4185 	path = btrfs_alloc_path();
4186 	if (!path)
4187 		return -ENOMEM;
4188 
4189 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4190 	if (ret)
4191 		goto out;
4192 
4193 	leaf = path->nodes[0];
4194 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4195 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4196 	btrfs_set_inode_generation(leaf, item, 1);
4197 	btrfs_set_inode_size(leaf, item, 0);
4198 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4199 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4200 					  BTRFS_INODE_PREALLOC);
4201 	btrfs_mark_buffer_dirty(leaf);
4202 out:
4203 	btrfs_free_path(path);
4204 	return ret;
4205 }
4206 
4207 /*
4208  * helper to create the inode used for data relocation.
4209  * the inode lives in the data relocation tree and its link count is 0.
4210  */
4211 static noinline_for_stack
4212 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4213 				 struct btrfs_block_group_cache *group)
4214 {
4215 	struct inode *inode = NULL;
4216 	struct btrfs_trans_handle *trans;
4217 	struct btrfs_root *root;
4218 	struct btrfs_key key;
4219 	u64 objectid;
4220 	int err = 0;
4221 
4222 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4223 	if (IS_ERR(root))
4224 		return ERR_CAST(root);
4225 
4226 	trans = btrfs_start_transaction(root, 6);
4227 	if (IS_ERR(trans))
4228 		return ERR_CAST(trans);
4229 
4230 	err = btrfs_find_free_objectid(root, &objectid);
4231 	if (err)
4232 		goto out;
4233 
4234 	err = __insert_orphan_inode(trans, root, objectid);
4235 	BUG_ON(err);
4236 
4237 	key.objectid = objectid;
4238 	key.type = BTRFS_INODE_ITEM_KEY;
4239 	key.offset = 0;
4240 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4241 	BUG_ON(IS_ERR(inode));
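	/* stash the block group start; used by btrfs_reloc_clone_csums() */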
4242 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4243 
4244 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4245 out:
4246 	btrfs_end_transaction(trans);
4247 	btrfs_btree_balance_dirty(fs_info);
4248 	if (err) {
4249 		if (inode)
4250 			iput(inode);
4251 		inode = ERR_PTR(err);
4252 	}
4253 	return inode;
4254 }
4255 
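/*
 * allocate and initialize a reloc_control: empty root lists, backref
 * cache, reloc root mapping tree and the processed-blocks io tree
 */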
4256 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4257 {
4258 	struct reloc_control *rc;
4259 
4260 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4261 	if (!rc)
4262 		return NULL;
4263 
4264 	INIT_LIST_HEAD(&rc->reloc_roots);
4265 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4266 	backref_cache_init(&rc->backref_cache);
4267 	mapping_tree_init(&rc->reloc_root_tree);
4268 	extent_io_tree_init(fs_info, &rc->processed_blocks,
4269 			    IO_TREE_RELOC_BLOCKS, NULL);
4270 	return rc;
4271 }
4272 
4273 /*
4274  * Print the block group being relocated
4275  */
4276 static void describe_relocation(struct btrfs_fs_info *fs_info,
4277 				struct btrfs_block_group_cache *block_group)
4278 {
4279 	char buf[128] = {'\0'};
4280 
4281 	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4282 
4283 	btrfs_info(fs_info,
4284 		   "relocating block group %llu flags %s",
4285 		   block_group->key.objectid, buf);
4286 }
4287 
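/*
 * A sketch of the usual call path: the balance ioctl iterates chunks and
 * calls btrfs_relocate_chunk(), which invokes btrfs_relocate_block_group()
 * below for the block group backing each chunk.
 */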
4288 /*
4289  * function to relocate all extents in a block group.
4290  */
4291 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4292 {
4293 	struct btrfs_block_group_cache *bg;
4294 	struct btrfs_root *extent_root = fs_info->extent_root;
4295 	struct reloc_control *rc;
4296 	struct inode *inode;
4297 	struct btrfs_path *path;
4298 	int ret;
4299 	int rw = 0;
4300 	int err = 0;
4301 
4302 	bg = btrfs_lookup_block_group(fs_info, group_start);
4303 	if (!bg)
4304 		return -ENOENT;
4305 
4306 	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4307 		btrfs_put_block_group(bg);
4308 		return -ETXTBSY;
4309 	}
4310 
4311 	rc = alloc_reloc_control(fs_info);
4312 	if (!rc) {
4313 		btrfs_put_block_group(bg);
4314 		return -ENOMEM;
4315 	}
4316 
4317 	rc->extent_root = extent_root;
4318 	rc->block_group = bg;
4319 
4320 	ret = btrfs_inc_block_group_ro(rc->block_group);
4321 	if (ret) {
4322 		err = ret;
4323 		goto out;
4324 	}
4325 	rw = 1;
4326 
4327 	path = btrfs_alloc_path();
4328 	if (!path) {
4329 		err = -ENOMEM;
4330 		goto out;
4331 	}
4332 
4333 	inode = lookup_free_space_inode(rc->block_group, path);
4334 	btrfs_free_path(path);
4335 
4336 	if (!IS_ERR(inode))
4337 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4338 	else
4339 		ret = PTR_ERR(inode);
4340 
4341 	if (ret && ret != -ENOENT) {
4342 		err = ret;
4343 		goto out;
4344 	}
4345 
4346 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4347 	if (IS_ERR(rc->data_inode)) {
4348 		err = PTR_ERR(rc->data_inode);
4349 		rc->data_inode = NULL;
4350 		goto out;
4351 	}
4352 
4353 	describe_relocation(fs_info, rc->block_group);
4354 
4355 	btrfs_wait_block_group_reservations(rc->block_group);
4356 	btrfs_wait_nocow_writers(rc->block_group);
4357 	btrfs_wait_ordered_roots(fs_info, U64_MAX,
4358 				 rc->block_group->key.objectid,
4359 				 rc->block_group->key.offset);
4360 
4361 	while (1) {
4362 		mutex_lock(&fs_info->cleaner_mutex);
4363 		ret = relocate_block_group(rc);
4364 		mutex_unlock(&fs_info->cleaner_mutex);
4365 		if (ret < 0)
4366 			err = ret;
4367 
4368 		/*
4369 		 * We may have gotten ENOSPC after we already dirtied some
4370 		 * extents.  If writeout happens while we're relocating a
4371 		 * different block group we could end up hitting the
4372 		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4373 		 * btrfs_reloc_cow_block.  Make sure we write everything out
4374 		 * properly so we don't trip over this problem, and then break
4375 		 * out of the loop if we hit an error.
4376 		 */
4377 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4378 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4379 						       (u64)-1);
4380 			if (ret)
4381 				err = ret;
4382 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4383 						 0, -1);
4384 			rc->stage = UPDATE_DATA_PTRS;
4385 		}
4386 
4387 		if (err < 0)
4388 			goto out;
4389 
4390 		if (rc->extents_found == 0)
4391 			break;
4392 
4393 		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4394 
4395 	}
4396 
4397 	WARN_ON(rc->block_group->pinned > 0);
4398 	WARN_ON(rc->block_group->reserved > 0);
4399 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4400 out:
4401 	if (err && rw)
4402 		btrfs_dec_block_group_ro(rc->block_group);
4403 	iput(rc->data_inode);
4404 	btrfs_put_block_group(rc->block_group);
4405 	kfree(rc);
4406 	return err;
4407 }
4408 
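/*
 * mark an orphaned reloc root as garbage: reset its drop progress and set
 * its root_refs to 0 so it gets deleted instead of merged
 */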
4409 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4410 {
4411 	struct btrfs_fs_info *fs_info = root->fs_info;
4412 	struct btrfs_trans_handle *trans;
4413 	int ret, err;
4414 
4415 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4416 	if (IS_ERR(trans))
4417 		return PTR_ERR(trans);
4418 
4419 	memset(&root->root_item.drop_progress, 0,
4420 		sizeof(root->root_item.drop_progress));
4421 	root->root_item.drop_level = 0;
4422 	btrfs_set_root_refs(&root->root_item, 0);
4423 	ret = btrfs_update_root(trans, fs_info->tree_root,
4424 				&root->root_key, &root->root_item);
4425 
4426 	err = btrfs_end_transaction(trans);
4427 	if (err)
4428 		return err;
4429 	return ret;
4430 }
4431 
4432 /*
4433  * recover a relocation interrupted by a system crash.
4434  *
4435  * this function resumes merging reloc trees with their corresponding fs
4436  * trees, which is important for preserving the sharing of tree blocks.
4437  */
4438 int btrfs_recover_relocation(struct btrfs_root *root)
4439 {
4440 	struct btrfs_fs_info *fs_info = root->fs_info;
4441 	LIST_HEAD(reloc_roots);
4442 	struct btrfs_key key;
4443 	struct btrfs_root *fs_root;
4444 	struct btrfs_root *reloc_root;
4445 	struct btrfs_path *path;
4446 	struct extent_buffer *leaf;
4447 	struct reloc_control *rc = NULL;
4448 	struct btrfs_trans_handle *trans;
4449 	int ret;
4450 	int err = 0;
4451 
4452 	path = btrfs_alloc_path();
4453 	if (!path)
4454 		return -ENOMEM;
4455 	path->reada = READA_BACK;
4456 
4457 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4458 	key.type = BTRFS_ROOT_ITEM_KEY;
4459 	key.offset = (u64)-1;
4460 
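	/* walk all reloc tree root items in the tree root, highest key first */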
4461 	while (1) {
4462 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4463 					path, 0, 0);
4464 		if (ret < 0) {
4465 			err = ret;
4466 			goto out;
4467 		}
4468 		if (ret > 0) {
4469 			if (path->slots[0] == 0)
4470 				break;
4471 			path->slots[0]--;
4472 		}
4473 		leaf = path->nodes[0];
4474 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4475 		btrfs_release_path(path);
4476 
4477 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4478 		    key.type != BTRFS_ROOT_ITEM_KEY)
4479 			break;
4480 
4481 		reloc_root = btrfs_read_fs_root(root, &key);
4482 		if (IS_ERR(reloc_root)) {
4483 			err = PTR_ERR(reloc_root);
4484 			goto out;
4485 		}
4486 
4487 		list_add(&reloc_root->root_list, &reloc_roots);
4488 
4489 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4490 			fs_root = read_fs_root(fs_info,
4491 					       reloc_root->root_key.offset);
4492 			if (IS_ERR(fs_root)) {
4493 				ret = PTR_ERR(fs_root);
4494 				if (ret != -ENOENT) {
4495 					err = ret;
4496 					goto out;
4497 				}
4498 				ret = mark_garbage_root(reloc_root);
4499 				if (ret < 0) {
4500 					err = ret;
4501 					goto out;
4502 				}
4503 			}
4504 		}
4505 
4506 		if (key.offset == 0)
4507 			break;
4508 
4509 		key.offset--;
4510 	}
4511 	btrfs_release_path(path);
4512 
4513 	if (list_empty(&reloc_roots))
4514 		goto out;
4515 
4516 	rc = alloc_reloc_control(fs_info);
4517 	if (!rc) {
4518 		err = -ENOMEM;
4519 		goto out;
4520 	}
4521 
4522 	rc->extent_root = fs_info->extent_root;
4523 
4524 	set_reloc_control(rc);
4525 
4526 	trans = btrfs_join_transaction(rc->extent_root);
4527 	if (IS_ERR(trans)) {
4528 		unset_reloc_control(rc);
4529 		err = PTR_ERR(trans);
4530 		goto out_free;
4531 	}
4532 
4533 	rc->merge_reloc_tree = 1;
4534 
4535 	while (!list_empty(&reloc_roots)) {
4536 		reloc_root = list_entry(reloc_roots.next,
4537 					struct btrfs_root, root_list);
4538 		list_del(&reloc_root->root_list);
4539 
4540 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4541 			list_add_tail(&reloc_root->root_list,
4542 				      &rc->reloc_roots);
4543 			continue;
4544 		}
4545 
4546 		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4547 		if (IS_ERR(fs_root)) {
4548 			err = PTR_ERR(fs_root);
4549 			goto out_free;
4550 		}
4551 
4552 		err = __add_reloc_root(reloc_root);
4553 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4554 		fs_root->reloc_root = reloc_root;
4555 	}
4556 
4557 	err = btrfs_commit_transaction(trans);
4558 	if (err)
4559 		goto out_free;
4560 
4561 	merge_reloc_roots(rc);
4562 
4563 	unset_reloc_control(rc);
4564 
4565 	trans = btrfs_join_transaction(rc->extent_root);
4566 	if (IS_ERR(trans)) {
4567 		err = PTR_ERR(trans);
4568 		goto out_free;
4569 	}
4570 	err = btrfs_commit_transaction(trans);
4571 
4572 	ret = clean_dirty_subvols(rc);
4573 	if (ret < 0 && !err)
4574 		err = ret;
4575 out_free:
4576 	kfree(rc);
4577 out:
4578 	if (!list_empty(&reloc_roots))
4579 		free_reloc_roots(&reloc_roots);
4580 
4581 	btrfs_free_path(path);
4582 
4583 	if (err == 0) {
4584 		/* clean up the orphan inode in the data relocation tree */
4585 		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4586 		if (IS_ERR(fs_root))
4587 			err = PTR_ERR(fs_root);
4588 		else
4589 			err = btrfs_orphan_cleanup(fs_root);
4590 	}
4591 	return err;
4592 }
4593 
4594 /*
4595  * helper to add ordered checksums for data relocation.
4596  *
4597  * cloning the existing checksums properly handles nodatasum extents and
4598  * also saves the CPU time of recalculating them.
4599  */
4600 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4601 {
4602 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4603 	struct btrfs_ordered_sum *sums;
4604 	struct btrfs_ordered_extent *ordered;
4605 	int ret;
4606 	u64 disk_bytenr;
4607 	u64 new_bytenr;
4608 	LIST_HEAD(list);
4609 
4610 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4611 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4612 
4613 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4614 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4615 				       disk_bytenr + len - 1, &list, 0);
4616 	if (ret)
4617 		goto out;
4618 
4619 	while (!list_empty(&list)) {
4620 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4621 		list_del_init(&sums->list);
4622 
4623 		/*
4624 		 * We need to offset new_bytenr based on where the csum sits.
4625 		 * We must do this because we read in whole prealloc extents,
4626 		 * but we may have written to, say, the middle of the prealloc
4627 		 * extent, so we have to make sure the csum goes with the
4628 		 * right disk offset.
4629 		 *
4630 		 * We can do this because the data reloc inode refers strictly
4631 		 * to the on-disk bytes, so unlike regular inodes we don't have
4632 		 * to worry about disk_len vs. real len, since it is all disk
4633 		 * length.
4634 		 */
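		/*
		 * Illustrative numbers: a prealloc extent at disk_bytenr 1M
		 * relocated to ordered->start 5M moves a csum recorded at
		 * bytenr 1M+64K to 5M+64K.
		 */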
4635 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4636 		sums->bytenr = new_bytenr;
4637 
4638 		btrfs_add_ordered_sum(ordered, sums);
4639 	}
4640 out:
4641 	btrfs_put_ordered_extent(ordered);
4642 	return ret;
4643 }
4644 
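/*
 * COW hook for relocation: keep the backref cache and the reloc root in
 * sync with the new block location, and during the UPDATE_DATA_PTRS stage
 * rewrite the file extent pointers in cowed leaves
 */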
4645 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4646 			  struct btrfs_root *root, struct extent_buffer *buf,
4647 			  struct extent_buffer *cow)
4648 {
4649 	struct btrfs_fs_info *fs_info = root->fs_info;
4650 	struct reloc_control *rc;
4651 	struct backref_node *node;
4652 	int first_cow = 0;
4653 	int level;
4654 	int ret = 0;
4655 
4656 	rc = fs_info->reloc_ctl;
4657 	if (!rc)
4658 		return 0;
4659 
4660 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4661 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4662 
4663 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4664 		if (buf == root->node)
4665 			__update_reloc_root(root, cow->start);
4666 	}
4667 
4668 	level = btrfs_header_level(buf);
4669 	if (btrfs_header_generation(buf) <=
4670 	    btrfs_root_last_snapshot(&root->root_item))
4671 		first_cow = 1;
4672 
4673 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4674 	    rc->create_reloc_tree) {
4675 		WARN_ON(!first_cow && level == 0);
4676 
4677 		node = rc->backref_cache.path[level];
4678 		BUG_ON(node->bytenr != buf->start &&
4679 		       node->new_bytenr != buf->start);
4680 
4681 		drop_node_buffer(node);
4682 		extent_buffer_get(cow);
4683 		node->eb = cow;
4684 		node->new_bytenr = cow->start;
4685 
4686 		if (!node->pending) {
4687 			list_move_tail(&node->list,
4688 				       &rc->backref_cache.pending[level]);
4689 			node->pending = 1;
4690 		}
4691 
4692 		if (first_cow)
4693 			__mark_block_processed(rc, node);
4694 
4695 		if (first_cow && level > 0)
4696 			rc->nodes_relocated += buf->len;
4697 	}
4698 
4699 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4700 		ret = replace_file_extents(trans, rc, root, cow);
4701 	return ret;
4702 }
4703 
4704 /*
4705  * called before creating a snapshot. it calculates the metadata
4706  * reservation required for relocating tree blocks in the snapshot.
4707  */
4708 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4709 			      u64 *bytes_to_reserve)
4710 {
4711 	struct btrfs_root *root = pending->root;
4712 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4713 
4714 	if (!root->reloc_root || !rc)
4715 		return;
4716 
4717 	if (!rc->merge_reloc_tree)
4718 		return;
4719 
4720 	root = root->reloc_root;
4721 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4722 	/*
4723 	 * relocation is in the tree merging stage. in the worst case,
4724 	 * the space used by merging a reloc tree is twice the size of
4725 	 * the relocated tree nodes: half for cowing the reloc tree and
4726 	 * half for cowing the fs tree. the space used by cowing the
4727 	 * reloc tree is freed once the tree is dropped, but if we
4728 	 * create a snapshot, cowing the fs tree may use more space
4729 	 * than it frees. that is why we need to reserve some extra
4730 	 * space here.
4731 	 */
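	/*
	 * Illustrative numbers: with 64M of relocated tree nodes, merging
	 * may COW up to 128M; the extra 64M for cowing the fs tree is what
	 * gets added to the snapshot reservation below.
	 */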
4732 	*bytes_to_reserve += rc->nodes_relocated;
4733 }
4734 
4735 /*
4736  * called after the snapshot is created. migrate the block reservation
4737  * and create a reloc root for the newly created snapshot.
4738  */
4739 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4740 			       struct btrfs_pending_snapshot *pending)
4741 {
4742 	struct btrfs_root *root = pending->root;
4743 	struct btrfs_root *reloc_root;
4744 	struct btrfs_root *new_root;
4745 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4746 	int ret;
4747 
4748 	if (!root->reloc_root || !rc)
4749 		return 0;
4750 
4751 	rc = root->fs_info->reloc_ctl;
4752 	rc->merging_rsv_size += rc->nodes_relocated;
4753 
4754 	if (rc->merge_reloc_tree) {
4755 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4756 					      rc->block_rsv,
4757 					      rc->nodes_relocated, true);
4758 		if (ret)
4759 			return ret;
4760 	}
4761 
4762 	new_root = pending->snap;
4763 	reloc_root = create_reloc_root(trans, root->reloc_root,
4764 				       new_root->root_key.objectid);
4765 	if (IS_ERR(reloc_root))
4766 		return PTR_ERR(reloc_root);
4767 
4768 	ret = __add_reloc_root(reloc_root);
4769 	BUG_ON(ret < 0);
4770 	new_root->reloc_root = reloc_root;
4771 
4772 	if (rc->create_reloc_tree)
4773 		ret = clone_backref_node(trans, rc, root, reloc_root);
4774 	return ret;
4775 }
4776