xref: /openbmc/linux/fs/btrfs/relocation.c (revision b0e55fef624e511e060fa05e4ca96cae6d902f04)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include "ctree.h"
13 #include "disk-io.h"
14 #include "transaction.h"
15 #include "volumes.h"
16 #include "locking.h"
17 #include "btrfs_inode.h"
18 #include "async-thread.h"
19 #include "free-space-cache.h"
20 #include "inode-map.h"
21 #include "qgroup.h"
22 #include "print-tree.h"
23 #include "delalloc-space.h"
24 #include "block-group.h"
25 
26 /*
27  * backref_node, mapping_node and tree_block start with this
28  */
29 struct tree_entry {
30 	struct rb_node rb_node;
31 	u64 bytenr;
32 };
33 
34 /*
35  * present a tree block in the backref cache
36  */
37 struct backref_node {
38 	struct rb_node rb_node;
39 	u64 bytenr;
40 
41 	u64 new_bytenr;
42 	/* objectid of tree block owner, can be not uptodate */
43 	u64 owner;
44 	/* link to pending, changed or detached list */
45 	struct list_head list;
46 	/* list of upper level blocks reference this block */
47 	struct list_head upper;
48 	/* list of child blocks in the cache */
49 	struct list_head lower;
50 	/* NULL if this node is not tree root */
51 	struct btrfs_root *root;
52 	/* extent buffer got by COW the block */
53 	struct extent_buffer *eb;
54 	/* level of tree block */
55 	unsigned int level:8;
56 	/* is the block in non-reference counted tree */
57 	unsigned int cowonly:1;
58 	/* 1 if no child node in the cache */
59 	unsigned int lowest:1;
60 	/* is the extent buffer locked */
61 	unsigned int locked:1;
62 	/* has the block been processed */
63 	unsigned int processed:1;
64 	/* have backrefs of this block been checked */
65 	unsigned int checked:1;
66 	/*
67 	 * 1 if corresponding block has been cowed but some upper
68 	 * level block pointers may not point to the new location
69 	 */
70 	unsigned int pending:1;
71 	/*
72 	 * 1 if the backref node isn't connected to any other
73 	 * backref node.
74 	 */
75 	unsigned int detached:1;
76 };
77 
78 /*
79  * present a block pointer in the backref cache
80  */
81 struct backref_edge {
82 	struct list_head list[2];
83 	struct backref_node *node[2];
84 };
85 
86 #define LOWER	0
87 #define UPPER	1
88 #define RELOCATION_RESERVED_NODES	256
89 
90 struct backref_cache {
91 	/* red black tree of all backref nodes in the cache */
92 	struct rb_root rb_root;
93 	/* for passing backref nodes to btrfs_reloc_cow_block */
94 	struct backref_node *path[BTRFS_MAX_LEVEL];
95 	/*
96 	 * list of blocks that have been cowed but some block
97 	 * pointers in upper level blocks may not reflect the
98 	 * new location
99 	 */
100 	struct list_head pending[BTRFS_MAX_LEVEL];
101 	/* list of backref nodes with no child node */
102 	struct list_head leaves;
103 	/* list of blocks that have been cowed in current transaction */
104 	struct list_head changed;
105 	/* list of detached backref node. */
106 	struct list_head detached;
107 
108 	u64 last_trans;
109 
110 	int nr_nodes;
111 	int nr_edges;
112 };
113 
114 /*
115  * map address of tree root to tree
116  */
117 struct mapping_node {
118 	struct rb_node rb_node;
119 	u64 bytenr;
120 	void *data;
121 };
122 
123 struct mapping_tree {
124 	struct rb_root rb_root;
125 	spinlock_t lock;
126 };
127 
128 /*
129  * present a tree block to process
130  */
131 struct tree_block {
132 	struct rb_node rb_node;
133 	u64 bytenr;
134 	struct btrfs_key key;
135 	unsigned int level:8;
136 	unsigned int key_ready:1;
137 };
138 
139 #define MAX_EXTENTS 128
140 
141 struct file_extent_cluster {
142 	u64 start;
143 	u64 end;
144 	u64 boundary[MAX_EXTENTS];
145 	unsigned int nr;
146 };
147 
148 struct reloc_control {
149 	/* block group to relocate */
150 	struct btrfs_block_group *block_group;
151 	/* extent tree */
152 	struct btrfs_root *extent_root;
153 	/* inode for moving data */
154 	struct inode *data_inode;
155 
156 	struct btrfs_block_rsv *block_rsv;
157 
158 	struct backref_cache backref_cache;
159 
160 	struct file_extent_cluster cluster;
161 	/* tree blocks have been processed */
162 	struct extent_io_tree processed_blocks;
163 	/* map start of tree root to corresponding reloc tree */
164 	struct mapping_tree reloc_root_tree;
165 	/* list of reloc trees */
166 	struct list_head reloc_roots;
167 	/* list of subvolume trees that get relocated */
168 	struct list_head dirty_subvol_roots;
169 	/* size of metadata reservation for merging reloc trees */
170 	u64 merging_rsv_size;
171 	/* size of relocated tree nodes */
172 	u64 nodes_relocated;
173 	/* reserved size for block group relocation*/
174 	u64 reserved_bytes;
175 
176 	u64 search_start;
177 	u64 extents_found;
178 
179 	unsigned int stage:8;
180 	unsigned int create_reloc_tree:1;
181 	unsigned int merge_reloc_tree:1;
182 	unsigned int found_file_extent:1;
183 };
184 
185 /* stages of data relocation */
186 #define MOVE_DATA_EXTENTS	0
187 #define UPDATE_DATA_PTRS	1
188 
189 static void remove_backref_node(struct backref_cache *cache,
190 				struct backref_node *node);
191 static void __mark_block_processed(struct reloc_control *rc,
192 				   struct backref_node *node);
193 
194 static void mapping_tree_init(struct mapping_tree *tree)
195 {
196 	tree->rb_root = RB_ROOT;
197 	spin_lock_init(&tree->lock);
198 }
199 
200 static void backref_cache_init(struct backref_cache *cache)
201 {
202 	int i;
203 	cache->rb_root = RB_ROOT;
204 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
205 		INIT_LIST_HEAD(&cache->pending[i]);
206 	INIT_LIST_HEAD(&cache->changed);
207 	INIT_LIST_HEAD(&cache->detached);
208 	INIT_LIST_HEAD(&cache->leaves);
209 }
210 
211 static void backref_cache_cleanup(struct backref_cache *cache)
212 {
213 	struct backref_node *node;
214 	int i;
215 
216 	while (!list_empty(&cache->detached)) {
217 		node = list_entry(cache->detached.next,
218 				  struct backref_node, list);
219 		remove_backref_node(cache, node);
220 	}
221 
222 	while (!list_empty(&cache->leaves)) {
223 		node = list_entry(cache->leaves.next,
224 				  struct backref_node, lower);
225 		remove_backref_node(cache, node);
226 	}
227 
228 	cache->last_trans = 0;
229 
230 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
231 		ASSERT(list_empty(&cache->pending[i]));
232 	ASSERT(list_empty(&cache->changed));
233 	ASSERT(list_empty(&cache->detached));
234 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
235 	ASSERT(!cache->nr_nodes);
236 	ASSERT(!cache->nr_edges);
237 }
238 
239 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
240 {
241 	struct backref_node *node;
242 
243 	node = kzalloc(sizeof(*node), GFP_NOFS);
244 	if (node) {
245 		INIT_LIST_HEAD(&node->list);
246 		INIT_LIST_HEAD(&node->upper);
247 		INIT_LIST_HEAD(&node->lower);
248 		RB_CLEAR_NODE(&node->rb_node);
249 		cache->nr_nodes++;
250 	}
251 	return node;
252 }
253 
254 static void free_backref_node(struct backref_cache *cache,
255 			      struct backref_node *node)
256 {
257 	if (node) {
258 		cache->nr_nodes--;
259 		kfree(node);
260 	}
261 }
262 
263 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
264 {
265 	struct backref_edge *edge;
266 
267 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
268 	if (edge)
269 		cache->nr_edges++;
270 	return edge;
271 }
272 
273 static void free_backref_edge(struct backref_cache *cache,
274 			      struct backref_edge *edge)
275 {
276 	if (edge) {
277 		cache->nr_edges--;
278 		kfree(edge);
279 	}
280 }
281 
282 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
283 				   struct rb_node *node)
284 {
285 	struct rb_node **p = &root->rb_node;
286 	struct rb_node *parent = NULL;
287 	struct tree_entry *entry;
288 
289 	while (*p) {
290 		parent = *p;
291 		entry = rb_entry(parent, struct tree_entry, rb_node);
292 
293 		if (bytenr < entry->bytenr)
294 			p = &(*p)->rb_left;
295 		else if (bytenr > entry->bytenr)
296 			p = &(*p)->rb_right;
297 		else
298 			return parent;
299 	}
300 
301 	rb_link_node(node, parent, p);
302 	rb_insert_color(node, root);
303 	return NULL;
304 }
305 
306 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
307 {
308 	struct rb_node *n = root->rb_node;
309 	struct tree_entry *entry;
310 
311 	while (n) {
312 		entry = rb_entry(n, struct tree_entry, rb_node);
313 
314 		if (bytenr < entry->bytenr)
315 			n = n->rb_left;
316 		else if (bytenr > entry->bytenr)
317 			n = n->rb_right;
318 		else
319 			return n;
320 	}
321 	return NULL;
322 }
323 
324 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
325 {
326 
327 	struct btrfs_fs_info *fs_info = NULL;
328 	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
329 					      rb_node);
330 	if (bnode->root)
331 		fs_info = bnode->root->fs_info;
332 	btrfs_panic(fs_info, errno,
333 		    "Inconsistency in backref cache found at offset %llu",
334 		    bytenr);
335 }
336 
337 /*
338  * walk up backref nodes until reach node presents tree root
339  */
340 static struct backref_node *walk_up_backref(struct backref_node *node,
341 					    struct backref_edge *edges[],
342 					    int *index)
343 {
344 	struct backref_edge *edge;
345 	int idx = *index;
346 
347 	while (!list_empty(&node->upper)) {
348 		edge = list_entry(node->upper.next,
349 				  struct backref_edge, list[LOWER]);
350 		edges[idx++] = edge;
351 		node = edge->node[UPPER];
352 	}
353 	BUG_ON(node->detached);
354 	*index = idx;
355 	return node;
356 }
357 
358 /*
359  * walk down backref nodes to find start of next reference path
360  */
361 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
362 					      int *index)
363 {
364 	struct backref_edge *edge;
365 	struct backref_node *lower;
366 	int idx = *index;
367 
368 	while (idx > 0) {
369 		edge = edges[idx - 1];
370 		lower = edge->node[LOWER];
371 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
372 			idx--;
373 			continue;
374 		}
375 		edge = list_entry(edge->list[LOWER].next,
376 				  struct backref_edge, list[LOWER]);
377 		edges[idx - 1] = edge;
378 		*index = idx;
379 		return edge->node[UPPER];
380 	}
381 	*index = 0;
382 	return NULL;
383 }
384 
385 static void unlock_node_buffer(struct backref_node *node)
386 {
387 	if (node->locked) {
388 		btrfs_tree_unlock(node->eb);
389 		node->locked = 0;
390 	}
391 }
392 
393 static void drop_node_buffer(struct backref_node *node)
394 {
395 	if (node->eb) {
396 		unlock_node_buffer(node);
397 		free_extent_buffer(node->eb);
398 		node->eb = NULL;
399 	}
400 }
401 
402 static void drop_backref_node(struct backref_cache *tree,
403 			      struct backref_node *node)
404 {
405 	BUG_ON(!list_empty(&node->upper));
406 
407 	drop_node_buffer(node);
408 	list_del(&node->list);
409 	list_del(&node->lower);
410 	if (!RB_EMPTY_NODE(&node->rb_node))
411 		rb_erase(&node->rb_node, &tree->rb_root);
412 	free_backref_node(tree, node);
413 }
414 
415 /*
416  * remove a backref node from the backref cache
417  */
418 static void remove_backref_node(struct backref_cache *cache,
419 				struct backref_node *node)
420 {
421 	struct backref_node *upper;
422 	struct backref_edge *edge;
423 
424 	if (!node)
425 		return;
426 
427 	BUG_ON(!node->lowest && !node->detached);
428 	while (!list_empty(&node->upper)) {
429 		edge = list_entry(node->upper.next, struct backref_edge,
430 				  list[LOWER]);
431 		upper = edge->node[UPPER];
432 		list_del(&edge->list[LOWER]);
433 		list_del(&edge->list[UPPER]);
434 		free_backref_edge(cache, edge);
435 
436 		if (RB_EMPTY_NODE(&upper->rb_node)) {
437 			BUG_ON(!list_empty(&node->upper));
438 			drop_backref_node(cache, node);
439 			node = upper;
440 			node->lowest = 1;
441 			continue;
442 		}
443 		/*
444 		 * add the node to leaf node list if no other
445 		 * child block cached.
446 		 */
447 		if (list_empty(&upper->lower)) {
448 			list_add_tail(&upper->lower, &cache->leaves);
449 			upper->lowest = 1;
450 		}
451 	}
452 
453 	drop_backref_node(cache, node);
454 }
455 
456 static void update_backref_node(struct backref_cache *cache,
457 				struct backref_node *node, u64 bytenr)
458 {
459 	struct rb_node *rb_node;
460 	rb_erase(&node->rb_node, &cache->rb_root);
461 	node->bytenr = bytenr;
462 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
463 	if (rb_node)
464 		backref_tree_panic(rb_node, -EEXIST, bytenr);
465 }
466 
467 /*
468  * update backref cache after a transaction commit
469  */
470 static int update_backref_cache(struct btrfs_trans_handle *trans,
471 				struct backref_cache *cache)
472 {
473 	struct backref_node *node;
474 	int level = 0;
475 
476 	if (cache->last_trans == 0) {
477 		cache->last_trans = trans->transid;
478 		return 0;
479 	}
480 
481 	if (cache->last_trans == trans->transid)
482 		return 0;
483 
484 	/*
485 	 * detached nodes are used to avoid unnecessary backref
486 	 * lookup. transaction commit changes the extent tree.
487 	 * so the detached nodes are no longer useful.
488 	 */
489 	while (!list_empty(&cache->detached)) {
490 		node = list_entry(cache->detached.next,
491 				  struct backref_node, list);
492 		remove_backref_node(cache, node);
493 	}
494 
495 	while (!list_empty(&cache->changed)) {
496 		node = list_entry(cache->changed.next,
497 				  struct backref_node, list);
498 		list_del_init(&node->list);
499 		BUG_ON(node->pending);
500 		update_backref_node(cache, node, node->new_bytenr);
501 	}
502 
503 	/*
504 	 * some nodes can be left in the pending list if there were
505 	 * errors during processing the pending nodes.
506 	 */
507 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
508 		list_for_each_entry(node, &cache->pending[level], list) {
509 			BUG_ON(!node->pending);
510 			if (node->bytenr == node->new_bytenr)
511 				continue;
512 			update_backref_node(cache, node, node->new_bytenr);
513 		}
514 	}
515 
516 	cache->last_trans = 0;
517 	return 1;
518 }
519 
520 
521 static int should_ignore_root(struct btrfs_root *root)
522 {
523 	struct btrfs_root *reloc_root;
524 
525 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
526 		return 0;
527 
528 	reloc_root = root->reloc_root;
529 	if (!reloc_root)
530 		return 0;
531 
532 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
533 	    root->fs_info->running_transaction->transid - 1)
534 		return 0;
535 	/*
536 	 * if there is reloc tree and it was created in previous
537 	 * transaction backref lookup can find the reloc tree,
538 	 * so backref node for the fs tree root is useless for
539 	 * relocation.
540 	 */
541 	return 1;
542 }
543 /*
544  * find reloc tree by address of tree root
545  */
546 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
547 					  u64 bytenr)
548 {
549 	struct rb_node *rb_node;
550 	struct mapping_node *node;
551 	struct btrfs_root *root = NULL;
552 
553 	spin_lock(&rc->reloc_root_tree.lock);
554 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
555 	if (rb_node) {
556 		node = rb_entry(rb_node, struct mapping_node, rb_node);
557 		root = (struct btrfs_root *)node->data;
558 	}
559 	spin_unlock(&rc->reloc_root_tree.lock);
560 	return root;
561 }
562 
563 static int is_cowonly_root(u64 root_objectid)
564 {
565 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
566 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
567 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
568 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
569 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
570 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
571 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
572 	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
573 	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
574 		return 1;
575 	return 0;
576 }
577 
578 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
579 					u64 root_objectid)
580 {
581 	struct btrfs_key key;
582 
583 	key.objectid = root_objectid;
584 	key.type = BTRFS_ROOT_ITEM_KEY;
585 	if (is_cowonly_root(root_objectid))
586 		key.offset = 0;
587 	else
588 		key.offset = (u64)-1;
589 
590 	return btrfs_get_fs_root(fs_info, &key, false);
591 }
592 
593 static noinline_for_stack
594 int find_inline_backref(struct extent_buffer *leaf, int slot,
595 			unsigned long *ptr, unsigned long *end)
596 {
597 	struct btrfs_key key;
598 	struct btrfs_extent_item *ei;
599 	struct btrfs_tree_block_info *bi;
600 	u32 item_size;
601 
602 	btrfs_item_key_to_cpu(leaf, &key, slot);
603 
604 	item_size = btrfs_item_size_nr(leaf, slot);
605 	if (item_size < sizeof(*ei)) {
606 		btrfs_print_v0_err(leaf->fs_info);
607 		btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
608 		return 1;
609 	}
610 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
611 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
612 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
613 
614 	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
615 	    item_size <= sizeof(*ei) + sizeof(*bi)) {
616 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
617 		return 1;
618 	}
619 	if (key.type == BTRFS_METADATA_ITEM_KEY &&
620 	    item_size <= sizeof(*ei)) {
621 		WARN_ON(item_size < sizeof(*ei));
622 		return 1;
623 	}
624 
625 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
626 		bi = (struct btrfs_tree_block_info *)(ei + 1);
627 		*ptr = (unsigned long)(bi + 1);
628 	} else {
629 		*ptr = (unsigned long)(ei + 1);
630 	}
631 	*end = (unsigned long)ei + item_size;
632 	return 0;
633 }
634 
635 /*
636  * build backref tree for a given tree block. root of the backref tree
637  * corresponds the tree block, leaves of the backref tree correspond
638  * roots of b-trees that reference the tree block.
639  *
640  * the basic idea of this function is check backrefs of a given block
641  * to find upper level blocks that reference the block, and then check
642  * backrefs of these upper level blocks recursively. the recursion stop
643  * when tree root is reached or backrefs for the block is cached.
644  *
645  * NOTE: if we find backrefs for a block are cached, we know backrefs
646  * for all upper level blocks that directly/indirectly reference the
647  * block are also cached.
648  */
649 static noinline_for_stack
650 struct backref_node *build_backref_tree(struct reloc_control *rc,
651 					struct btrfs_key *node_key,
652 					int level, u64 bytenr)
653 {
654 	struct backref_cache *cache = &rc->backref_cache;
655 	struct btrfs_path *path1; /* For searching extent root */
656 	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
657 	struct extent_buffer *eb;
658 	struct btrfs_root *root;
659 	struct backref_node *cur;
660 	struct backref_node *upper;
661 	struct backref_node *lower;
662 	struct backref_node *node = NULL;
663 	struct backref_node *exist = NULL;
664 	struct backref_edge *edge;
665 	struct rb_node *rb_node;
666 	struct btrfs_key key;
667 	unsigned long end;
668 	unsigned long ptr;
669 	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
670 	LIST_HEAD(useless);
671 	int cowonly;
672 	int ret;
673 	int err = 0;
674 	bool need_check = true;
675 
676 	path1 = btrfs_alloc_path();
677 	path2 = btrfs_alloc_path();
678 	if (!path1 || !path2) {
679 		err = -ENOMEM;
680 		goto out;
681 	}
682 	path1->reada = READA_FORWARD;
683 	path2->reada = READA_FORWARD;
684 
685 	node = alloc_backref_node(cache);
686 	if (!node) {
687 		err = -ENOMEM;
688 		goto out;
689 	}
690 
691 	node->bytenr = bytenr;
692 	node->level = level;
693 	node->lowest = 1;
694 	cur = node;
695 again:
696 	end = 0;
697 	ptr = 0;
698 	key.objectid = cur->bytenr;
699 	key.type = BTRFS_METADATA_ITEM_KEY;
700 	key.offset = (u64)-1;
701 
702 	path1->search_commit_root = 1;
703 	path1->skip_locking = 1;
704 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
705 				0, 0);
706 	if (ret < 0) {
707 		err = ret;
708 		goto out;
709 	}
710 	ASSERT(ret);
711 	ASSERT(path1->slots[0]);
712 
713 	path1->slots[0]--;
714 
715 	WARN_ON(cur->checked);
716 	if (!list_empty(&cur->upper)) {
717 		/*
718 		 * the backref was added previously when processing
719 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
720 		 */
721 		ASSERT(list_is_singular(&cur->upper));
722 		edge = list_entry(cur->upper.next, struct backref_edge,
723 				  list[LOWER]);
724 		ASSERT(list_empty(&edge->list[UPPER]));
725 		exist = edge->node[UPPER];
726 		/*
727 		 * add the upper level block to pending list if we need
728 		 * check its backrefs
729 		 */
730 		if (!exist->checked)
731 			list_add_tail(&edge->list[UPPER], &list);
732 	} else {
733 		exist = NULL;
734 	}
735 
736 	while (1) {
737 		cond_resched();
738 		eb = path1->nodes[0];
739 
740 		if (ptr >= end) {
741 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
742 				ret = btrfs_next_leaf(rc->extent_root, path1);
743 				if (ret < 0) {
744 					err = ret;
745 					goto out;
746 				}
747 				if (ret > 0)
748 					break;
749 				eb = path1->nodes[0];
750 			}
751 
752 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
753 			if (key.objectid != cur->bytenr) {
754 				WARN_ON(exist);
755 				break;
756 			}
757 
758 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
759 			    key.type == BTRFS_METADATA_ITEM_KEY) {
760 				ret = find_inline_backref(eb, path1->slots[0],
761 							  &ptr, &end);
762 				if (ret)
763 					goto next;
764 			}
765 		}
766 
767 		if (ptr < end) {
768 			/* update key for inline back ref */
769 			struct btrfs_extent_inline_ref *iref;
770 			int type;
771 			iref = (struct btrfs_extent_inline_ref *)ptr;
772 			type = btrfs_get_extent_inline_ref_type(eb, iref,
773 							BTRFS_REF_TYPE_BLOCK);
774 			if (type == BTRFS_REF_TYPE_INVALID) {
775 				err = -EUCLEAN;
776 				goto out;
777 			}
778 			key.type = type;
779 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
780 
781 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
782 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
783 		}
784 
785 		/*
786 		 * Parent node found and matches current inline ref, no need to
787 		 * rebuild this node for this inline ref.
788 		 */
789 		if (exist &&
790 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
791 		      exist->owner == key.offset) ||
792 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
793 		      exist->bytenr == key.offset))) {
794 			exist = NULL;
795 			goto next;
796 		}
797 
798 		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
799 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
800 			if (key.objectid == key.offset) {
801 				/*
802 				 * Only root blocks of reloc trees use backref
803 				 * pointing to itself.
804 				 */
805 				root = find_reloc_root(rc, cur->bytenr);
806 				ASSERT(root);
807 				cur->root = root;
808 				break;
809 			}
810 
811 			edge = alloc_backref_edge(cache);
812 			if (!edge) {
813 				err = -ENOMEM;
814 				goto out;
815 			}
816 			rb_node = tree_search(&cache->rb_root, key.offset);
817 			if (!rb_node) {
818 				upper = alloc_backref_node(cache);
819 				if (!upper) {
820 					free_backref_edge(cache, edge);
821 					err = -ENOMEM;
822 					goto out;
823 				}
824 				upper->bytenr = key.offset;
825 				upper->level = cur->level + 1;
826 				/*
827 				 *  backrefs for the upper level block isn't
828 				 *  cached, add the block to pending list
829 				 */
830 				list_add_tail(&edge->list[UPPER], &list);
831 			} else {
832 				upper = rb_entry(rb_node, struct backref_node,
833 						 rb_node);
834 				ASSERT(upper->checked);
835 				INIT_LIST_HEAD(&edge->list[UPPER]);
836 			}
837 			list_add_tail(&edge->list[LOWER], &cur->upper);
838 			edge->node[LOWER] = cur;
839 			edge->node[UPPER] = upper;
840 
841 			goto next;
842 		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
843 			err = -EINVAL;
844 			btrfs_print_v0_err(rc->extent_root->fs_info);
845 			btrfs_handle_fs_error(rc->extent_root->fs_info, err,
846 					      NULL);
847 			goto out;
848 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
849 			goto next;
850 		}
851 
852 		/*
853 		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
854 		 * means the root objectid. We need to search the tree to get
855 		 * its parent bytenr.
856 		 */
857 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
858 		if (IS_ERR(root)) {
859 			err = PTR_ERR(root);
860 			goto out;
861 		}
862 
863 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
864 			cur->cowonly = 1;
865 
866 		if (btrfs_root_level(&root->root_item) == cur->level) {
867 			/* tree root */
868 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
869 			       cur->bytenr);
870 			if (should_ignore_root(root))
871 				list_add(&cur->list, &useless);
872 			else
873 				cur->root = root;
874 			break;
875 		}
876 
877 		level = cur->level + 1;
878 
879 		/* Search the tree to find parent blocks referring the block. */
880 		path2->search_commit_root = 1;
881 		path2->skip_locking = 1;
882 		path2->lowest_level = level;
883 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
884 		path2->lowest_level = 0;
885 		if (ret < 0) {
886 			err = ret;
887 			goto out;
888 		}
889 		if (ret > 0 && path2->slots[level] > 0)
890 			path2->slots[level]--;
891 
892 		eb = path2->nodes[level];
893 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
894 		    cur->bytenr) {
895 			btrfs_err(root->fs_info,
896 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
897 				  cur->bytenr, level - 1,
898 				  root->root_key.objectid,
899 				  node_key->objectid, node_key->type,
900 				  node_key->offset);
901 			err = -ENOENT;
902 			goto out;
903 		}
904 		lower = cur;
905 		need_check = true;
906 
907 		/* Add all nodes and edges in the path */
908 		for (; level < BTRFS_MAX_LEVEL; level++) {
909 			if (!path2->nodes[level]) {
910 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
911 				       lower->bytenr);
912 				if (should_ignore_root(root))
913 					list_add(&lower->list, &useless);
914 				else
915 					lower->root = root;
916 				break;
917 			}
918 
919 			edge = alloc_backref_edge(cache);
920 			if (!edge) {
921 				err = -ENOMEM;
922 				goto out;
923 			}
924 
925 			eb = path2->nodes[level];
926 			rb_node = tree_search(&cache->rb_root, eb->start);
927 			if (!rb_node) {
928 				upper = alloc_backref_node(cache);
929 				if (!upper) {
930 					free_backref_edge(cache, edge);
931 					err = -ENOMEM;
932 					goto out;
933 				}
934 				upper->bytenr = eb->start;
935 				upper->owner = btrfs_header_owner(eb);
936 				upper->level = lower->level + 1;
937 				if (!test_bit(BTRFS_ROOT_REF_COWS,
938 					      &root->state))
939 					upper->cowonly = 1;
940 
941 				/*
942 				 * if we know the block isn't shared
943 				 * we can void checking its backrefs.
944 				 */
945 				if (btrfs_block_can_be_shared(root, eb))
946 					upper->checked = 0;
947 				else
948 					upper->checked = 1;
949 
950 				/*
951 				 * add the block to pending list if we
952 				 * need check its backrefs, we only do this once
953 				 * while walking up a tree as we will catch
954 				 * anything else later on.
955 				 */
956 				if (!upper->checked && need_check) {
957 					need_check = false;
958 					list_add_tail(&edge->list[UPPER],
959 						      &list);
960 				} else {
961 					if (upper->checked)
962 						need_check = true;
963 					INIT_LIST_HEAD(&edge->list[UPPER]);
964 				}
965 			} else {
966 				upper = rb_entry(rb_node, struct backref_node,
967 						 rb_node);
968 				ASSERT(upper->checked);
969 				INIT_LIST_HEAD(&edge->list[UPPER]);
970 				if (!upper->owner)
971 					upper->owner = btrfs_header_owner(eb);
972 			}
973 			list_add_tail(&edge->list[LOWER], &lower->upper);
974 			edge->node[LOWER] = lower;
975 			edge->node[UPPER] = upper;
976 
977 			if (rb_node)
978 				break;
979 			lower = upper;
980 			upper = NULL;
981 		}
982 		btrfs_release_path(path2);
983 next:
984 		if (ptr < end) {
985 			ptr += btrfs_extent_inline_ref_size(key.type);
986 			if (ptr >= end) {
987 				WARN_ON(ptr > end);
988 				ptr = 0;
989 				end = 0;
990 			}
991 		}
992 		if (ptr >= end)
993 			path1->slots[0]++;
994 	}
995 	btrfs_release_path(path1);
996 
997 	cur->checked = 1;
998 	WARN_ON(exist);
999 
1000 	/* the pending list isn't empty, take the first block to process */
1001 	if (!list_empty(&list)) {
1002 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1003 		list_del_init(&edge->list[UPPER]);
1004 		cur = edge->node[UPPER];
1005 		goto again;
1006 	}
1007 
1008 	/*
1009 	 * everything goes well, connect backref nodes and insert backref nodes
1010 	 * into the cache.
1011 	 */
1012 	ASSERT(node->checked);
1013 	cowonly = node->cowonly;
1014 	if (!cowonly) {
1015 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1016 				      &node->rb_node);
1017 		if (rb_node)
1018 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1019 		list_add_tail(&node->lower, &cache->leaves);
1020 	}
1021 
1022 	list_for_each_entry(edge, &node->upper, list[LOWER])
1023 		list_add_tail(&edge->list[UPPER], &list);
1024 
1025 	while (!list_empty(&list)) {
1026 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1027 		list_del_init(&edge->list[UPPER]);
1028 		upper = edge->node[UPPER];
1029 		if (upper->detached) {
1030 			list_del(&edge->list[LOWER]);
1031 			lower = edge->node[LOWER];
1032 			free_backref_edge(cache, edge);
1033 			if (list_empty(&lower->upper))
1034 				list_add(&lower->list, &useless);
1035 			continue;
1036 		}
1037 
1038 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1039 			if (upper->lowest) {
1040 				list_del_init(&upper->lower);
1041 				upper->lowest = 0;
1042 			}
1043 
1044 			list_add_tail(&edge->list[UPPER], &upper->lower);
1045 			continue;
1046 		}
1047 
1048 		if (!upper->checked) {
1049 			/*
1050 			 * Still want to blow up for developers since this is a
1051 			 * logic bug.
1052 			 */
1053 			ASSERT(0);
1054 			err = -EINVAL;
1055 			goto out;
1056 		}
1057 		if (cowonly != upper->cowonly) {
1058 			ASSERT(0);
1059 			err = -EINVAL;
1060 			goto out;
1061 		}
1062 
1063 		if (!cowonly) {
1064 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1065 					      &upper->rb_node);
1066 			if (rb_node)
1067 				backref_tree_panic(rb_node, -EEXIST,
1068 						   upper->bytenr);
1069 		}
1070 
1071 		list_add_tail(&edge->list[UPPER], &upper->lower);
1072 
1073 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1074 			list_add_tail(&edge->list[UPPER], &list);
1075 	}
1076 	/*
1077 	 * process useless backref nodes. backref nodes for tree leaves
1078 	 * are deleted from the cache. backref nodes for upper level
1079 	 * tree blocks are left in the cache to avoid unnecessary backref
1080 	 * lookup.
1081 	 */
1082 	while (!list_empty(&useless)) {
1083 		upper = list_entry(useless.next, struct backref_node, list);
1084 		list_del_init(&upper->list);
1085 		ASSERT(list_empty(&upper->upper));
1086 		if (upper == node)
1087 			node = NULL;
1088 		if (upper->lowest) {
1089 			list_del_init(&upper->lower);
1090 			upper->lowest = 0;
1091 		}
1092 		while (!list_empty(&upper->lower)) {
1093 			edge = list_entry(upper->lower.next,
1094 					  struct backref_edge, list[UPPER]);
1095 			list_del(&edge->list[UPPER]);
1096 			list_del(&edge->list[LOWER]);
1097 			lower = edge->node[LOWER];
1098 			free_backref_edge(cache, edge);
1099 
1100 			if (list_empty(&lower->upper))
1101 				list_add(&lower->list, &useless);
1102 		}
1103 		__mark_block_processed(rc, upper);
1104 		if (upper->level > 0) {
1105 			list_add(&upper->list, &cache->detached);
1106 			upper->detached = 1;
1107 		} else {
1108 			rb_erase(&upper->rb_node, &cache->rb_root);
1109 			free_backref_node(cache, upper);
1110 		}
1111 	}
1112 out:
1113 	btrfs_free_path(path1);
1114 	btrfs_free_path(path2);
1115 	if (err) {
1116 		while (!list_empty(&useless)) {
1117 			lower = list_entry(useless.next,
1118 					   struct backref_node, list);
1119 			list_del_init(&lower->list);
1120 		}
1121 		while (!list_empty(&list)) {
1122 			edge = list_first_entry(&list, struct backref_edge,
1123 						list[UPPER]);
1124 			list_del(&edge->list[UPPER]);
1125 			list_del(&edge->list[LOWER]);
1126 			lower = edge->node[LOWER];
1127 			upper = edge->node[UPPER];
1128 			free_backref_edge(cache, edge);
1129 
1130 			/*
1131 			 * Lower is no longer linked to any upper backref nodes
1132 			 * and isn't in the cache, we can free it ourselves.
1133 			 */
1134 			if (list_empty(&lower->upper) &&
1135 			    RB_EMPTY_NODE(&lower->rb_node))
1136 				list_add(&lower->list, &useless);
1137 
1138 			if (!RB_EMPTY_NODE(&upper->rb_node))
1139 				continue;
1140 
1141 			/* Add this guy's upper edges to the list to process */
1142 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1143 				list_add_tail(&edge->list[UPPER], &list);
1144 			if (list_empty(&upper->upper))
1145 				list_add(&upper->list, &useless);
1146 		}
1147 
1148 		while (!list_empty(&useless)) {
1149 			lower = list_entry(useless.next,
1150 					   struct backref_node, list);
1151 			list_del_init(&lower->list);
1152 			if (lower == node)
1153 				node = NULL;
1154 			free_backref_node(cache, lower);
1155 		}
1156 
1157 		free_backref_node(cache, node);
1158 		return ERR_PTR(err);
1159 	}
1160 	ASSERT(!node || !node->detached);
1161 	return node;
1162 }
1163 
1164 /*
1165  * helper to add backref node for the newly created snapshot.
1166  * the backref node is created by cloning backref node that
1167  * corresponds to root of source tree
1168  */
1169 static int clone_backref_node(struct btrfs_trans_handle *trans,
1170 			      struct reloc_control *rc,
1171 			      struct btrfs_root *src,
1172 			      struct btrfs_root *dest)
1173 {
1174 	struct btrfs_root *reloc_root = src->reloc_root;
1175 	struct backref_cache *cache = &rc->backref_cache;
1176 	struct backref_node *node = NULL;
1177 	struct backref_node *new_node;
1178 	struct backref_edge *edge;
1179 	struct backref_edge *new_edge;
1180 	struct rb_node *rb_node;
1181 
1182 	if (cache->last_trans > 0)
1183 		update_backref_cache(trans, cache);
1184 
1185 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1186 	if (rb_node) {
1187 		node = rb_entry(rb_node, struct backref_node, rb_node);
1188 		if (node->detached)
1189 			node = NULL;
1190 		else
1191 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1192 	}
1193 
1194 	if (!node) {
1195 		rb_node = tree_search(&cache->rb_root,
1196 				      reloc_root->commit_root->start);
1197 		if (rb_node) {
1198 			node = rb_entry(rb_node, struct backref_node,
1199 					rb_node);
1200 			BUG_ON(node->detached);
1201 		}
1202 	}
1203 
1204 	if (!node)
1205 		return 0;
1206 
1207 	new_node = alloc_backref_node(cache);
1208 	if (!new_node)
1209 		return -ENOMEM;
1210 
1211 	new_node->bytenr = dest->node->start;
1212 	new_node->level = node->level;
1213 	new_node->lowest = node->lowest;
1214 	new_node->checked = 1;
1215 	new_node->root = dest;
1216 
1217 	if (!node->lowest) {
1218 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1219 			new_edge = alloc_backref_edge(cache);
1220 			if (!new_edge)
1221 				goto fail;
1222 
1223 			new_edge->node[UPPER] = new_node;
1224 			new_edge->node[LOWER] = edge->node[LOWER];
1225 			list_add_tail(&new_edge->list[UPPER],
1226 				      &new_node->lower);
1227 		}
1228 	} else {
1229 		list_add_tail(&new_node->lower, &cache->leaves);
1230 	}
1231 
1232 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1233 			      &new_node->rb_node);
1234 	if (rb_node)
1235 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1236 
1237 	if (!new_node->lowest) {
1238 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1239 			list_add_tail(&new_edge->list[LOWER],
1240 				      &new_edge->node[LOWER]->upper);
1241 		}
1242 	}
1243 	return 0;
1244 fail:
1245 	while (!list_empty(&new_node->lower)) {
1246 		new_edge = list_entry(new_node->lower.next,
1247 				      struct backref_edge, list[UPPER]);
1248 		list_del(&new_edge->list[UPPER]);
1249 		free_backref_edge(cache, new_edge);
1250 	}
1251 	free_backref_node(cache, new_node);
1252 	return -ENOMEM;
1253 }
1254 
1255 /*
1256  * helper to add 'address of tree root -> reloc tree' mapping
1257  */
1258 static int __must_check __add_reloc_root(struct btrfs_root *root)
1259 {
1260 	struct btrfs_fs_info *fs_info = root->fs_info;
1261 	struct rb_node *rb_node;
1262 	struct mapping_node *node;
1263 	struct reloc_control *rc = fs_info->reloc_ctl;
1264 
1265 	node = kmalloc(sizeof(*node), GFP_NOFS);
1266 	if (!node)
1267 		return -ENOMEM;
1268 
1269 	node->bytenr = root->node->start;
1270 	node->data = root;
1271 
1272 	spin_lock(&rc->reloc_root_tree.lock);
1273 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1274 			      node->bytenr, &node->rb_node);
1275 	spin_unlock(&rc->reloc_root_tree.lock);
1276 	if (rb_node) {
1277 		btrfs_panic(fs_info, -EEXIST,
1278 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1279 			    node->bytenr);
1280 	}
1281 
1282 	list_add_tail(&root->root_list, &rc->reloc_roots);
1283 	return 0;
1284 }
1285 
1286 /*
1287  * helper to delete the 'address of tree root -> reloc tree'
1288  * mapping
1289  */
1290 static void __del_reloc_root(struct btrfs_root *root)
1291 {
1292 	struct btrfs_fs_info *fs_info = root->fs_info;
1293 	struct rb_node *rb_node;
1294 	struct mapping_node *node = NULL;
1295 	struct reloc_control *rc = fs_info->reloc_ctl;
1296 
1297 	if (rc && root->node) {
1298 		spin_lock(&rc->reloc_root_tree.lock);
1299 		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1300 				      root->node->start);
1301 		if (rb_node) {
1302 			node = rb_entry(rb_node, struct mapping_node, rb_node);
1303 			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1304 		}
1305 		spin_unlock(&rc->reloc_root_tree.lock);
1306 		if (!node)
1307 			return;
1308 		BUG_ON((struct btrfs_root *)node->data != root);
1309 	}
1310 
1311 	spin_lock(&fs_info->trans_lock);
1312 	list_del_init(&root->root_list);
1313 	spin_unlock(&fs_info->trans_lock);
1314 	kfree(node);
1315 }
1316 
1317 /*
1318  * helper to update the 'address of tree root -> reloc tree'
1319  * mapping
1320  */
1321 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1322 {
1323 	struct btrfs_fs_info *fs_info = root->fs_info;
1324 	struct rb_node *rb_node;
1325 	struct mapping_node *node = NULL;
1326 	struct reloc_control *rc = fs_info->reloc_ctl;
1327 
1328 	spin_lock(&rc->reloc_root_tree.lock);
1329 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1330 			      root->node->start);
1331 	if (rb_node) {
1332 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1333 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1334 	}
1335 	spin_unlock(&rc->reloc_root_tree.lock);
1336 
1337 	if (!node)
1338 		return 0;
1339 	BUG_ON((struct btrfs_root *)node->data != root);
1340 
1341 	spin_lock(&rc->reloc_root_tree.lock);
1342 	node->bytenr = new_bytenr;
1343 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1344 			      node->bytenr, &node->rb_node);
1345 	spin_unlock(&rc->reloc_root_tree.lock);
1346 	if (rb_node)
1347 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1348 	return 0;
1349 }
1350 
1351 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1352 					struct btrfs_root *root, u64 objectid)
1353 {
1354 	struct btrfs_fs_info *fs_info = root->fs_info;
1355 	struct btrfs_root *reloc_root;
1356 	struct extent_buffer *eb;
1357 	struct btrfs_root_item *root_item;
1358 	struct btrfs_key root_key;
1359 	int ret;
1360 
1361 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1362 	BUG_ON(!root_item);
1363 
1364 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1365 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1366 	root_key.offset = objectid;
1367 
1368 	if (root->root_key.objectid == objectid) {
1369 		u64 commit_root_gen;
1370 
1371 		/* called by btrfs_init_reloc_root */
1372 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1373 				      BTRFS_TREE_RELOC_OBJECTID);
1374 		BUG_ON(ret);
1375 		/*
1376 		 * Set the last_snapshot field to the generation of the commit
1377 		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1378 		 * correctly (returns true) when the relocation root is created
1379 		 * either inside the critical section of a transaction commit
1380 		 * (through transaction.c:qgroup_account_snapshot()) and when
1381 		 * it's created before the transaction commit is started.
1382 		 */
1383 		commit_root_gen = btrfs_header_generation(root->commit_root);
1384 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1385 	} else {
1386 		/*
1387 		 * called by btrfs_reloc_post_snapshot_hook.
1388 		 * the source tree is a reloc tree, all tree blocks
1389 		 * modified after it was created have RELOC flag
1390 		 * set in their headers. so it's OK to not update
1391 		 * the 'last_snapshot'.
1392 		 */
1393 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1394 				      BTRFS_TREE_RELOC_OBJECTID);
1395 		BUG_ON(ret);
1396 	}
1397 
1398 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1399 	btrfs_set_root_bytenr(root_item, eb->start);
1400 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1401 	btrfs_set_root_generation(root_item, trans->transid);
1402 
1403 	if (root->root_key.objectid == objectid) {
1404 		btrfs_set_root_refs(root_item, 0);
1405 		memset(&root_item->drop_progress, 0,
1406 		       sizeof(struct btrfs_disk_key));
1407 		root_item->drop_level = 0;
1408 	}
1409 
1410 	btrfs_tree_unlock(eb);
1411 	free_extent_buffer(eb);
1412 
1413 	ret = btrfs_insert_root(trans, fs_info->tree_root,
1414 				&root_key, root_item);
1415 	BUG_ON(ret);
1416 	kfree(root_item);
1417 
1418 	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1419 	BUG_ON(IS_ERR(reloc_root));
1420 	reloc_root->last_trans = trans->transid;
1421 	return reloc_root;
1422 }
1423 
1424 /*
1425  * create reloc tree for a given fs tree. reloc tree is just a
1426  * snapshot of the fs tree with special root objectid.
1427  */
1428 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1429 			  struct btrfs_root *root)
1430 {
1431 	struct btrfs_fs_info *fs_info = root->fs_info;
1432 	struct btrfs_root *reloc_root;
1433 	struct reloc_control *rc = fs_info->reloc_ctl;
1434 	struct btrfs_block_rsv *rsv;
1435 	int clear_rsv = 0;
1436 	int ret;
1437 
1438 	/*
1439 	 * The subvolume has reloc tree but the swap is finished, no need to
1440 	 * create/update the dead reloc tree
1441 	 */
1442 	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
1443 		return 0;
1444 
1445 	if (root->reloc_root) {
1446 		reloc_root = root->reloc_root;
1447 		reloc_root->last_trans = trans->transid;
1448 		return 0;
1449 	}
1450 
1451 	if (!rc || !rc->create_reloc_tree ||
1452 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1453 		return 0;
1454 
1455 	if (!trans->reloc_reserved) {
1456 		rsv = trans->block_rsv;
1457 		trans->block_rsv = rc->block_rsv;
1458 		clear_rsv = 1;
1459 	}
1460 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1461 	if (clear_rsv)
1462 		trans->block_rsv = rsv;
1463 
1464 	ret = __add_reloc_root(reloc_root);
1465 	BUG_ON(ret < 0);
1466 	root->reloc_root = reloc_root;
1467 	return 0;
1468 }
1469 
1470 /*
1471  * update root item of reloc tree
1472  */
1473 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1474 			    struct btrfs_root *root)
1475 {
1476 	struct btrfs_fs_info *fs_info = root->fs_info;
1477 	struct btrfs_root *reloc_root;
1478 	struct btrfs_root_item *root_item;
1479 	int ret;
1480 
1481 	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
1482 	    !root->reloc_root)
1483 		goto out;
1484 
1485 	reloc_root = root->reloc_root;
1486 	root_item = &reloc_root->root_item;
1487 
1488 	/* root->reloc_root will stay until current relocation finished */
1489 	if (fs_info->reloc_ctl->merge_reloc_tree &&
1490 	    btrfs_root_refs(root_item) == 0) {
1491 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1492 		__del_reloc_root(reloc_root);
1493 	}
1494 
1495 	if (reloc_root->commit_root != reloc_root->node) {
1496 		btrfs_set_root_node(root_item, reloc_root->node);
1497 		free_extent_buffer(reloc_root->commit_root);
1498 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1499 	}
1500 
1501 	ret = btrfs_update_root(trans, fs_info->tree_root,
1502 				&reloc_root->root_key, root_item);
1503 	BUG_ON(ret);
1504 
1505 out:
1506 	return 0;
1507 }
1508 
1509 /*
1510  * helper to find first cached inode with inode number >= objectid
1511  * in a subvolume
1512  */
1513 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1514 {
1515 	struct rb_node *node;
1516 	struct rb_node *prev;
1517 	struct btrfs_inode *entry;
1518 	struct inode *inode;
1519 
1520 	spin_lock(&root->inode_lock);
1521 again:
1522 	node = root->inode_tree.rb_node;
1523 	prev = NULL;
1524 	while (node) {
1525 		prev = node;
1526 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1527 
1528 		if (objectid < btrfs_ino(entry))
1529 			node = node->rb_left;
1530 		else if (objectid > btrfs_ino(entry))
1531 			node = node->rb_right;
1532 		else
1533 			break;
1534 	}
1535 	if (!node) {
1536 		while (prev) {
1537 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1538 			if (objectid <= btrfs_ino(entry)) {
1539 				node = prev;
1540 				break;
1541 			}
1542 			prev = rb_next(prev);
1543 		}
1544 	}
1545 	while (node) {
1546 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1547 		inode = igrab(&entry->vfs_inode);
1548 		if (inode) {
1549 			spin_unlock(&root->inode_lock);
1550 			return inode;
1551 		}
1552 
1553 		objectid = btrfs_ino(entry) + 1;
1554 		if (cond_resched_lock(&root->inode_lock))
1555 			goto again;
1556 
1557 		node = rb_next(node);
1558 	}
1559 	spin_unlock(&root->inode_lock);
1560 	return NULL;
1561 }
1562 
1563 static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group)
1564 {
1565 	if (bytenr >= block_group->start &&
1566 	    bytenr < block_group->start + block_group->length)
1567 		return 1;
1568 	return 0;
1569 }
1570 
1571 /*
1572  * get new location of data
1573  */
1574 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1575 			    u64 bytenr, u64 num_bytes)
1576 {
1577 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1578 	struct btrfs_path *path;
1579 	struct btrfs_file_extent_item *fi;
1580 	struct extent_buffer *leaf;
1581 	int ret;
1582 
1583 	path = btrfs_alloc_path();
1584 	if (!path)
1585 		return -ENOMEM;
1586 
1587 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1588 	ret = btrfs_lookup_file_extent(NULL, root, path,
1589 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1590 	if (ret < 0)
1591 		goto out;
1592 	if (ret > 0) {
1593 		ret = -ENOENT;
1594 		goto out;
1595 	}
1596 
1597 	leaf = path->nodes[0];
1598 	fi = btrfs_item_ptr(leaf, path->slots[0],
1599 			    struct btrfs_file_extent_item);
1600 
1601 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1602 	       btrfs_file_extent_compression(leaf, fi) ||
1603 	       btrfs_file_extent_encryption(leaf, fi) ||
1604 	       btrfs_file_extent_other_encoding(leaf, fi));
1605 
1606 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1607 		ret = -EINVAL;
1608 		goto out;
1609 	}
1610 
1611 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1612 	ret = 0;
1613 out:
1614 	btrfs_free_path(path);
1615 	return ret;
1616 }
1617 
1618 /*
1619  * update file extent items in the tree leaf to point to
1620  * the new locations.
1621  */
1622 static noinline_for_stack
1623 int replace_file_extents(struct btrfs_trans_handle *trans,
1624 			 struct reloc_control *rc,
1625 			 struct btrfs_root *root,
1626 			 struct extent_buffer *leaf)
1627 {
1628 	struct btrfs_fs_info *fs_info = root->fs_info;
1629 	struct btrfs_key key;
1630 	struct btrfs_file_extent_item *fi;
1631 	struct inode *inode = NULL;
1632 	u64 parent;
1633 	u64 bytenr;
1634 	u64 new_bytenr = 0;
1635 	u64 num_bytes;
1636 	u64 end;
1637 	u32 nritems;
1638 	u32 i;
1639 	int ret = 0;
1640 	int first = 1;
1641 	int dirty = 0;
1642 
1643 	if (rc->stage != UPDATE_DATA_PTRS)
1644 		return 0;
1645 
1646 	/* reloc trees always use full backref */
1647 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1648 		parent = leaf->start;
1649 	else
1650 		parent = 0;
1651 
1652 	nritems = btrfs_header_nritems(leaf);
1653 	for (i = 0; i < nritems; i++) {
1654 		struct btrfs_ref ref = { 0 };
1655 
1656 		cond_resched();
1657 		btrfs_item_key_to_cpu(leaf, &key, i);
1658 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1659 			continue;
1660 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1661 		if (btrfs_file_extent_type(leaf, fi) ==
1662 		    BTRFS_FILE_EXTENT_INLINE)
1663 			continue;
1664 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1665 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1666 		if (bytenr == 0)
1667 			continue;
1668 		if (!in_block_group(bytenr, rc->block_group))
1669 			continue;
1670 
1671 		/*
1672 		 * if we are modifying block in fs tree, wait for readpage
1673 		 * to complete and drop the extent cache
1674 		 */
1675 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1676 			if (first) {
1677 				inode = find_next_inode(root, key.objectid);
1678 				first = 0;
1679 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1680 				btrfs_add_delayed_iput(inode);
1681 				inode = find_next_inode(root, key.objectid);
1682 			}
1683 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1684 				end = key.offset +
1685 				      btrfs_file_extent_num_bytes(leaf, fi);
1686 				WARN_ON(!IS_ALIGNED(key.offset,
1687 						    fs_info->sectorsize));
1688 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1689 				end--;
1690 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1691 						      key.offset, end);
1692 				if (!ret)
1693 					continue;
1694 
1695 				btrfs_drop_extent_cache(BTRFS_I(inode),
1696 						key.offset,	end, 1);
1697 				unlock_extent(&BTRFS_I(inode)->io_tree,
1698 					      key.offset, end);
1699 			}
1700 		}
1701 
1702 		ret = get_new_location(rc->data_inode, &new_bytenr,
1703 				       bytenr, num_bytes);
1704 		if (ret) {
1705 			/*
1706 			 * Don't have to abort since we've not changed anything
1707 			 * in the file extent yet.
1708 			 */
1709 			break;
1710 		}
1711 
1712 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1713 		dirty = 1;
1714 
1715 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1716 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1717 				       num_bytes, parent);
1718 		ref.real_root = root->root_key.objectid;
1719 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1720 				    key.objectid, key.offset);
1721 		ret = btrfs_inc_extent_ref(trans, &ref);
1722 		if (ret) {
1723 			btrfs_abort_transaction(trans, ret);
1724 			break;
1725 		}
1726 
1727 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1728 				       num_bytes, parent);
1729 		ref.real_root = root->root_key.objectid;
1730 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1731 				    key.objectid, key.offset);
1732 		ret = btrfs_free_extent(trans, &ref);
1733 		if (ret) {
1734 			btrfs_abort_transaction(trans, ret);
1735 			break;
1736 		}
1737 	}
1738 	if (dirty)
1739 		btrfs_mark_buffer_dirty(leaf);
1740 	if (inode)
1741 		btrfs_add_delayed_iput(inode);
1742 	return ret;
1743 }
1744 
1745 static noinline_for_stack
1746 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1747 		     struct btrfs_path *path, int level)
1748 {
1749 	struct btrfs_disk_key key1;
1750 	struct btrfs_disk_key key2;
1751 	btrfs_node_key(eb, &key1, slot);
1752 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1753 	return memcmp(&key1, &key2, sizeof(key1));
1754 }
1755 
1756 /*
1757  * try to replace tree blocks in fs tree with the new blocks
1758  * in reloc tree. tree blocks haven't been modified since the
1759  * reloc tree was create can be replaced.
1760  *
1761  * if a block was replaced, level of the block + 1 is returned.
1762  * if no block got replaced, 0 is returned. if there are other
1763  * errors, a negative error number is returned.
1764  */
1765 static noinline_for_stack
1766 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1767 		 struct btrfs_root *dest, struct btrfs_root *src,
1768 		 struct btrfs_path *path, struct btrfs_key *next_key,
1769 		 int lowest_level, int max_level)
1770 {
1771 	struct btrfs_fs_info *fs_info = dest->fs_info;
1772 	struct extent_buffer *eb;
1773 	struct extent_buffer *parent;
1774 	struct btrfs_ref ref = { 0 };
1775 	struct btrfs_key key;
1776 	u64 old_bytenr;
1777 	u64 new_bytenr;
1778 	u64 old_ptr_gen;
1779 	u64 new_ptr_gen;
1780 	u64 last_snapshot;
1781 	u32 blocksize;
1782 	int cow = 0;
1783 	int level;
1784 	int ret;
1785 	int slot;
1786 
1787 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1788 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1789 
1790 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1791 again:
1792 	slot = path->slots[lowest_level];
1793 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1794 
1795 	eb = btrfs_lock_root_node(dest);
1796 	btrfs_set_lock_blocking_write(eb);
1797 	level = btrfs_header_level(eb);
1798 
1799 	if (level < lowest_level) {
1800 		btrfs_tree_unlock(eb);
1801 		free_extent_buffer(eb);
1802 		return 0;
1803 	}
1804 
1805 	if (cow) {
1806 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1807 		BUG_ON(ret);
1808 	}
1809 	btrfs_set_lock_blocking_write(eb);
1810 
1811 	if (next_key) {
1812 		next_key->objectid = (u64)-1;
1813 		next_key->type = (u8)-1;
1814 		next_key->offset = (u64)-1;
1815 	}
1816 
1817 	parent = eb;
1818 	while (1) {
1819 		struct btrfs_key first_key;
1820 
1821 		level = btrfs_header_level(parent);
1822 		BUG_ON(level < lowest_level);
1823 
1824 		ret = btrfs_bin_search(parent, &key, level, &slot);
1825 		if (ret < 0)
1826 			break;
1827 		if (ret && slot > 0)
1828 			slot--;
1829 
1830 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1831 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1832 
1833 		old_bytenr = btrfs_node_blockptr(parent, slot);
1834 		blocksize = fs_info->nodesize;
1835 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1836 		btrfs_node_key_to_cpu(parent, &first_key, slot);
1837 
1838 		if (level <= max_level) {
1839 			eb = path->nodes[level];
1840 			new_bytenr = btrfs_node_blockptr(eb,
1841 							path->slots[level]);
1842 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1843 							path->slots[level]);
1844 		} else {
1845 			new_bytenr = 0;
1846 			new_ptr_gen = 0;
1847 		}
1848 
1849 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1850 			ret = level;
1851 			break;
1852 		}
1853 
1854 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1855 		    memcmp_node_keys(parent, slot, path, level)) {
1856 			if (level <= lowest_level) {
1857 				ret = 0;
1858 				break;
1859 			}
1860 
1861 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1862 					     level - 1, &first_key);
1863 			if (IS_ERR(eb)) {
1864 				ret = PTR_ERR(eb);
1865 				break;
1866 			} else if (!extent_buffer_uptodate(eb)) {
1867 				ret = -EIO;
1868 				free_extent_buffer(eb);
1869 				break;
1870 			}
1871 			btrfs_tree_lock(eb);
1872 			if (cow) {
1873 				ret = btrfs_cow_block(trans, dest, eb, parent,
1874 						      slot, &eb);
1875 				BUG_ON(ret);
1876 			}
1877 			btrfs_set_lock_blocking_write(eb);
1878 
1879 			btrfs_tree_unlock(parent);
1880 			free_extent_buffer(parent);
1881 
1882 			parent = eb;
1883 			continue;
1884 		}
1885 
1886 		if (!cow) {
1887 			btrfs_tree_unlock(parent);
1888 			free_extent_buffer(parent);
1889 			cow = 1;
1890 			goto again;
1891 		}
1892 
1893 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1894 				      path->slots[level]);
1895 		btrfs_release_path(path);
1896 
1897 		path->lowest_level = level;
1898 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1899 		path->lowest_level = 0;
1900 		BUG_ON(ret);
1901 
1902 		/*
1903 		 * Info qgroup to trace both subtrees.
1904 		 *
1905 		 * We must trace both trees.
1906 		 * 1) Tree reloc subtree
1907 		 *    If not traced, we will leak data numbers
1908 		 * 2) Fs subtree
1909 		 *    If not traced, we will double count old data
1910 		 *
1911 		 * We don't scan the subtree right now, but only record
1912 		 * the swapped tree blocks.
1913 		 * The real subtree rescan is delayed until we have new
1914 		 * CoW on the subtree root node before transaction commit.
1915 		 */
1916 		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1917 				rc->block_group, parent, slot,
1918 				path->nodes[level], path->slots[level],
1919 				last_snapshot);
1920 		if (ret < 0)
1921 			break;
1922 		/*
1923 		 * swap blocks in fs tree and reloc tree.
1924 		 */
1925 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1926 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1927 		btrfs_mark_buffer_dirty(parent);
1928 
1929 		btrfs_set_node_blockptr(path->nodes[level],
1930 					path->slots[level], old_bytenr);
1931 		btrfs_set_node_ptr_generation(path->nodes[level],
1932 					      path->slots[level], old_ptr_gen);
1933 		btrfs_mark_buffer_dirty(path->nodes[level]);
1934 
1935 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1936 				       blocksize, path->nodes[level]->start);
1937 		ref.skip_qgroup = true;
1938 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1939 		ret = btrfs_inc_extent_ref(trans, &ref);
1940 		BUG_ON(ret);
1941 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1942 				       blocksize, 0);
1943 		ref.skip_qgroup = true;
1944 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1945 		ret = btrfs_inc_extent_ref(trans, &ref);
1946 		BUG_ON(ret);
1947 
1948 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1949 				       blocksize, path->nodes[level]->start);
1950 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1951 		ref.skip_qgroup = true;
1952 		ret = btrfs_free_extent(trans, &ref);
1953 		BUG_ON(ret);
1954 
1955 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1956 				       blocksize, 0);
1957 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1958 		ref.skip_qgroup = true;
1959 		ret = btrfs_free_extent(trans, &ref);
1960 		BUG_ON(ret);
1961 
1962 		btrfs_unlock_up_safe(path, 0);
1963 
1964 		ret = level;
1965 		break;
1966 	}
1967 	btrfs_tree_unlock(parent);
1968 	free_extent_buffer(parent);
1969 	return ret;
1970 }
1971 
1972 /*
1973  * helper to find next relocated block in reloc tree
1974  */
1975 static noinline_for_stack
1976 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1977 		       int *level)
1978 {
1979 	struct extent_buffer *eb;
1980 	int i;
1981 	u64 last_snapshot;
1982 	u32 nritems;
1983 
1984 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1985 
1986 	for (i = 0; i < *level; i++) {
1987 		free_extent_buffer(path->nodes[i]);
1988 		path->nodes[i] = NULL;
1989 	}
1990 
1991 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1992 		eb = path->nodes[i];
1993 		nritems = btrfs_header_nritems(eb);
1994 		while (path->slots[i] + 1 < nritems) {
1995 			path->slots[i]++;
1996 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1997 			    last_snapshot)
1998 				continue;
1999 
2000 			*level = i;
2001 			return 0;
2002 		}
2003 		free_extent_buffer(path->nodes[i]);
2004 		path->nodes[i] = NULL;
2005 	}
2006 	return 1;
2007 }
2008 
2009 /*
2010  * walk down reloc tree to find relocated block of lowest level
2011  */
2012 static noinline_for_stack
2013 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2014 			 int *level)
2015 {
2016 	struct btrfs_fs_info *fs_info = root->fs_info;
2017 	struct extent_buffer *eb = NULL;
2018 	int i;
2019 	u64 bytenr;
2020 	u64 ptr_gen = 0;
2021 	u64 last_snapshot;
2022 	u32 nritems;
2023 
2024 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2025 
2026 	for (i = *level; i > 0; i--) {
2027 		struct btrfs_key first_key;
2028 
2029 		eb = path->nodes[i];
2030 		nritems = btrfs_header_nritems(eb);
2031 		while (path->slots[i] < nritems) {
2032 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2033 			if (ptr_gen > last_snapshot)
2034 				break;
2035 			path->slots[i]++;
2036 		}
2037 		if (path->slots[i] >= nritems) {
2038 			if (i == *level)
2039 				break;
2040 			*level = i + 1;
2041 			return 0;
2042 		}
2043 		if (i == 1) {
2044 			*level = i;
2045 			return 0;
2046 		}
2047 
2048 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2049 		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2050 		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2051 				     &first_key);
2052 		if (IS_ERR(eb)) {
2053 			return PTR_ERR(eb);
2054 		} else if (!extent_buffer_uptodate(eb)) {
2055 			free_extent_buffer(eb);
2056 			return -EIO;
2057 		}
2058 		BUG_ON(btrfs_header_level(eb) != i - 1);
2059 		path->nodes[i - 1] = eb;
2060 		path->slots[i - 1] = 0;
2061 	}
2062 	return 1;
2063 }
2064 
2065 /*
2066  * invalidate extent cache for file extents whose key in range of
2067  * [min_key, max_key)
2068  */
2069 static int invalidate_extent_cache(struct btrfs_root *root,
2070 				   struct btrfs_key *min_key,
2071 				   struct btrfs_key *max_key)
2072 {
2073 	struct btrfs_fs_info *fs_info = root->fs_info;
2074 	struct inode *inode = NULL;
2075 	u64 objectid;
2076 	u64 start, end;
2077 	u64 ino;
2078 
2079 	objectid = min_key->objectid;
2080 	while (1) {
2081 		cond_resched();
2082 		iput(inode);
2083 
2084 		if (objectid > max_key->objectid)
2085 			break;
2086 
2087 		inode = find_next_inode(root, objectid);
2088 		if (!inode)
2089 			break;
2090 		ino = btrfs_ino(BTRFS_I(inode));
2091 
2092 		if (ino > max_key->objectid) {
2093 			iput(inode);
2094 			break;
2095 		}
2096 
2097 		objectid = ino + 1;
2098 		if (!S_ISREG(inode->i_mode))
2099 			continue;
2100 
2101 		if (unlikely(min_key->objectid == ino)) {
2102 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2103 				continue;
2104 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2105 				start = 0;
2106 			else {
2107 				start = min_key->offset;
2108 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2109 			}
2110 		} else {
2111 			start = 0;
2112 		}
2113 
2114 		if (unlikely(max_key->objectid == ino)) {
2115 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2116 				continue;
2117 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2118 				end = (u64)-1;
2119 			} else {
2120 				if (max_key->offset == 0)
2121 					continue;
2122 				end = max_key->offset;
2123 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2124 				end--;
2125 			}
2126 		} else {
2127 			end = (u64)-1;
2128 		}
2129 
2130 		/* the lock_extent waits for readpage to complete */
2131 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2132 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2133 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2134 	}
2135 	return 0;
2136 }
2137 
2138 static int find_next_key(struct btrfs_path *path, int level,
2139 			 struct btrfs_key *key)
2140 
2141 {
2142 	while (level < BTRFS_MAX_LEVEL) {
2143 		if (!path->nodes[level])
2144 			break;
2145 		if (path->slots[level] + 1 <
2146 		    btrfs_header_nritems(path->nodes[level])) {
2147 			btrfs_node_key_to_cpu(path->nodes[level], key,
2148 					      path->slots[level] + 1);
2149 			return 0;
2150 		}
2151 		level++;
2152 	}
2153 	return 1;
2154 }
2155 
2156 /*
2157  * Insert current subvolume into reloc_control::dirty_subvol_roots
2158  */
2159 static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2160 				struct reloc_control *rc,
2161 				struct btrfs_root *root)
2162 {
2163 	struct btrfs_root *reloc_root = root->reloc_root;
2164 	struct btrfs_root_item *reloc_root_item;
2165 
2166 	/* @root must be a subvolume tree root with a valid reloc tree */
2167 	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2168 	ASSERT(reloc_root);
2169 
2170 	reloc_root_item = &reloc_root->root_item;
2171 	memset(&reloc_root_item->drop_progress, 0,
2172 		sizeof(reloc_root_item->drop_progress));
2173 	reloc_root_item->drop_level = 0;
2174 	btrfs_set_root_refs(reloc_root_item, 0);
2175 	btrfs_update_reloc_root(trans, root);
2176 
2177 	if (list_empty(&root->reloc_dirty_list)) {
2178 		btrfs_grab_fs_root(root);
2179 		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2180 	}
2181 }
2182 
2183 static int clean_dirty_subvols(struct reloc_control *rc)
2184 {
2185 	struct btrfs_root *root;
2186 	struct btrfs_root *next;
2187 	int ret = 0;
2188 	int ret2;
2189 
2190 	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2191 				 reloc_dirty_list) {
2192 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2193 			/* Merged subvolume, cleanup its reloc root */
2194 			struct btrfs_root *reloc_root = root->reloc_root;
2195 
2196 			list_del_init(&root->reloc_dirty_list);
2197 			root->reloc_root = NULL;
2198 			if (reloc_root) {
2199 
2200 				ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
2201 				if (ret2 < 0 && !ret)
2202 					ret = ret2;
2203 			}
2204 			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
2205 			btrfs_put_fs_root(root);
2206 		} else {
2207 			/* Orphan reloc tree, just clean it up */
2208 			ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
2209 			if (ret2 < 0 && !ret)
2210 				ret = ret2;
2211 		}
2212 	}
2213 	return ret;
2214 }
2215 
2216 /*
2217  * merge the relocated tree blocks in reloc tree with corresponding
2218  * fs tree.
2219  */
2220 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2221 					       struct btrfs_root *root)
2222 {
2223 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2224 	struct btrfs_key key;
2225 	struct btrfs_key next_key;
2226 	struct btrfs_trans_handle *trans = NULL;
2227 	struct btrfs_root *reloc_root;
2228 	struct btrfs_root_item *root_item;
2229 	struct btrfs_path *path;
2230 	struct extent_buffer *leaf;
2231 	int level;
2232 	int max_level;
2233 	int replaced = 0;
2234 	int ret;
2235 	int err = 0;
2236 	u32 min_reserved;
2237 
2238 	path = btrfs_alloc_path();
2239 	if (!path)
2240 		return -ENOMEM;
2241 	path->reada = READA_FORWARD;
2242 
2243 	reloc_root = root->reloc_root;
2244 	root_item = &reloc_root->root_item;
2245 
2246 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2247 		level = btrfs_root_level(root_item);
2248 		atomic_inc(&reloc_root->node->refs);
2249 		path->nodes[level] = reloc_root->node;
2250 		path->slots[level] = 0;
2251 	} else {
2252 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2253 
2254 		level = root_item->drop_level;
2255 		BUG_ON(level == 0);
2256 		path->lowest_level = level;
2257 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2258 		path->lowest_level = 0;
2259 		if (ret < 0) {
2260 			btrfs_free_path(path);
2261 			return ret;
2262 		}
2263 
2264 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2265 				      path->slots[level]);
2266 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2267 
2268 		btrfs_unlock_up_safe(path, 0);
2269 	}
2270 
2271 	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2272 	memset(&next_key, 0, sizeof(next_key));
2273 
2274 	while (1) {
2275 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2276 					     BTRFS_RESERVE_FLUSH_ALL);
2277 		if (ret) {
2278 			err = ret;
2279 			goto out;
2280 		}
2281 		trans = btrfs_start_transaction(root, 0);
2282 		if (IS_ERR(trans)) {
2283 			err = PTR_ERR(trans);
2284 			trans = NULL;
2285 			goto out;
2286 		}
2287 		trans->block_rsv = rc->block_rsv;
2288 
2289 		replaced = 0;
2290 		max_level = level;
2291 
2292 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2293 		if (ret < 0) {
2294 			err = ret;
2295 			goto out;
2296 		}
2297 		if (ret > 0)
2298 			break;
2299 
2300 		if (!find_next_key(path, level, &key) &&
2301 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2302 			ret = 0;
2303 		} else {
2304 			ret = replace_path(trans, rc, root, reloc_root, path,
2305 					   &next_key, level, max_level);
2306 		}
2307 		if (ret < 0) {
2308 			err = ret;
2309 			goto out;
2310 		}
2311 
2312 		if (ret > 0) {
2313 			level = ret;
2314 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2315 					      path->slots[level]);
2316 			replaced = 1;
2317 		}
2318 
2319 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2320 		if (ret > 0)
2321 			break;
2322 
2323 		BUG_ON(level == 0);
2324 		/*
2325 		 * save the merging progress in the drop_progress.
2326 		 * this is OK since root refs == 1 in this case.
2327 		 */
2328 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2329 			       path->slots[level]);
2330 		root_item->drop_level = level;
2331 
2332 		btrfs_end_transaction_throttle(trans);
2333 		trans = NULL;
2334 
2335 		btrfs_btree_balance_dirty(fs_info);
2336 
2337 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2338 			invalidate_extent_cache(root, &key, &next_key);
2339 	}
2340 
2341 	/*
2342 	 * handle the case only one block in the fs tree need to be
2343 	 * relocated and the block is tree root.
2344 	 */
2345 	leaf = btrfs_lock_root_node(root);
2346 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2347 	btrfs_tree_unlock(leaf);
2348 	free_extent_buffer(leaf);
2349 	if (ret < 0)
2350 		err = ret;
2351 out:
2352 	btrfs_free_path(path);
2353 
2354 	if (err == 0)
2355 		insert_dirty_subvol(trans, rc, root);
2356 
2357 	if (trans)
2358 		btrfs_end_transaction_throttle(trans);
2359 
2360 	btrfs_btree_balance_dirty(fs_info);
2361 
2362 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2363 		invalidate_extent_cache(root, &key, &next_key);
2364 
2365 	return err;
2366 }
2367 
2368 static noinline_for_stack
2369 int prepare_to_merge(struct reloc_control *rc, int err)
2370 {
2371 	struct btrfs_root *root = rc->extent_root;
2372 	struct btrfs_fs_info *fs_info = root->fs_info;
2373 	struct btrfs_root *reloc_root;
2374 	struct btrfs_trans_handle *trans;
2375 	LIST_HEAD(reloc_roots);
2376 	u64 num_bytes = 0;
2377 	int ret;
2378 
2379 	mutex_lock(&fs_info->reloc_mutex);
2380 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2381 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2382 	mutex_unlock(&fs_info->reloc_mutex);
2383 
2384 again:
2385 	if (!err) {
2386 		num_bytes = rc->merging_rsv_size;
2387 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2388 					  BTRFS_RESERVE_FLUSH_ALL);
2389 		if (ret)
2390 			err = ret;
2391 	}
2392 
2393 	trans = btrfs_join_transaction(rc->extent_root);
2394 	if (IS_ERR(trans)) {
2395 		if (!err)
2396 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2397 						num_bytes);
2398 		return PTR_ERR(trans);
2399 	}
2400 
2401 	if (!err) {
2402 		if (num_bytes != rc->merging_rsv_size) {
2403 			btrfs_end_transaction(trans);
2404 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2405 						num_bytes);
2406 			goto again;
2407 		}
2408 	}
2409 
2410 	rc->merge_reloc_tree = 1;
2411 
2412 	while (!list_empty(&rc->reloc_roots)) {
2413 		reloc_root = list_entry(rc->reloc_roots.next,
2414 					struct btrfs_root, root_list);
2415 		list_del_init(&reloc_root->root_list);
2416 
2417 		root = read_fs_root(fs_info, reloc_root->root_key.offset);
2418 		BUG_ON(IS_ERR(root));
2419 		BUG_ON(root->reloc_root != reloc_root);
2420 
2421 		/*
2422 		 * set reference count to 1, so btrfs_recover_relocation
2423 		 * knows it should resumes merging
2424 		 */
2425 		if (!err)
2426 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2427 		btrfs_update_reloc_root(trans, root);
2428 
2429 		list_add(&reloc_root->root_list, &reloc_roots);
2430 	}
2431 
2432 	list_splice(&reloc_roots, &rc->reloc_roots);
2433 
2434 	if (!err)
2435 		btrfs_commit_transaction(trans);
2436 	else
2437 		btrfs_end_transaction(trans);
2438 	return err;
2439 }
2440 
2441 static noinline_for_stack
2442 void free_reloc_roots(struct list_head *list)
2443 {
2444 	struct btrfs_root *reloc_root;
2445 
2446 	while (!list_empty(list)) {
2447 		reloc_root = list_entry(list->next, struct btrfs_root,
2448 					root_list);
2449 		__del_reloc_root(reloc_root);
2450 		free_extent_buffer(reloc_root->node);
2451 		free_extent_buffer(reloc_root->commit_root);
2452 		reloc_root->node = NULL;
2453 		reloc_root->commit_root = NULL;
2454 	}
2455 }
2456 
2457 static noinline_for_stack
2458 void merge_reloc_roots(struct reloc_control *rc)
2459 {
2460 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2461 	struct btrfs_root *root;
2462 	struct btrfs_root *reloc_root;
2463 	LIST_HEAD(reloc_roots);
2464 	int found = 0;
2465 	int ret = 0;
2466 again:
2467 	root = rc->extent_root;
2468 
2469 	/*
2470 	 * this serializes us with btrfs_record_root_in_transaction,
2471 	 * we have to make sure nobody is in the middle of
2472 	 * adding their roots to the list while we are
2473 	 * doing this splice
2474 	 */
2475 	mutex_lock(&fs_info->reloc_mutex);
2476 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2477 	mutex_unlock(&fs_info->reloc_mutex);
2478 
2479 	while (!list_empty(&reloc_roots)) {
2480 		found = 1;
2481 		reloc_root = list_entry(reloc_roots.next,
2482 					struct btrfs_root, root_list);
2483 
2484 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2485 			root = read_fs_root(fs_info,
2486 					    reloc_root->root_key.offset);
2487 			BUG_ON(IS_ERR(root));
2488 			BUG_ON(root->reloc_root != reloc_root);
2489 
2490 			ret = merge_reloc_root(rc, root);
2491 			if (ret) {
2492 				if (list_empty(&reloc_root->root_list))
2493 					list_add_tail(&reloc_root->root_list,
2494 						      &reloc_roots);
2495 				goto out;
2496 			}
2497 		} else {
2498 			list_del_init(&reloc_root->root_list);
2499 			/* Don't forget to queue this reloc root for cleanup */
2500 			list_add_tail(&reloc_root->reloc_dirty_list,
2501 				      &rc->dirty_subvol_roots);
2502 		}
2503 	}
2504 
2505 	if (found) {
2506 		found = 0;
2507 		goto again;
2508 	}
2509 out:
2510 	if (ret) {
2511 		btrfs_handle_fs_error(fs_info, ret, NULL);
2512 		if (!list_empty(&reloc_roots))
2513 			free_reloc_roots(&reloc_roots);
2514 
2515 		/* new reloc root may be added */
2516 		mutex_lock(&fs_info->reloc_mutex);
2517 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2518 		mutex_unlock(&fs_info->reloc_mutex);
2519 		if (!list_empty(&reloc_roots))
2520 			free_reloc_roots(&reloc_roots);
2521 	}
2522 
2523 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2524 }
2525 
2526 static void free_block_list(struct rb_root *blocks)
2527 {
2528 	struct tree_block *block;
2529 	struct rb_node *rb_node;
2530 	while ((rb_node = rb_first(blocks))) {
2531 		block = rb_entry(rb_node, struct tree_block, rb_node);
2532 		rb_erase(rb_node, blocks);
2533 		kfree(block);
2534 	}
2535 }
2536 
2537 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2538 				      struct btrfs_root *reloc_root)
2539 {
2540 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2541 	struct btrfs_root *root;
2542 
2543 	if (reloc_root->last_trans == trans->transid)
2544 		return 0;
2545 
2546 	root = read_fs_root(fs_info, reloc_root->root_key.offset);
2547 	BUG_ON(IS_ERR(root));
2548 	BUG_ON(root->reloc_root != reloc_root);
2549 
2550 	return btrfs_record_root_in_trans(trans, root);
2551 }
2552 
2553 static noinline_for_stack
2554 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2555 				     struct reloc_control *rc,
2556 				     struct backref_node *node,
2557 				     struct backref_edge *edges[])
2558 {
2559 	struct backref_node *next;
2560 	struct btrfs_root *root;
2561 	int index = 0;
2562 
2563 	next = node;
2564 	while (1) {
2565 		cond_resched();
2566 		next = walk_up_backref(next, edges, &index);
2567 		root = next->root;
2568 		BUG_ON(!root);
2569 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2570 
2571 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2572 			record_reloc_root_in_trans(trans, root);
2573 			break;
2574 		}
2575 
2576 		btrfs_record_root_in_trans(trans, root);
2577 		root = root->reloc_root;
2578 
2579 		if (next->new_bytenr != root->node->start) {
2580 			BUG_ON(next->new_bytenr);
2581 			BUG_ON(!list_empty(&next->list));
2582 			next->new_bytenr = root->node->start;
2583 			next->root = root;
2584 			list_add_tail(&next->list,
2585 				      &rc->backref_cache.changed);
2586 			__mark_block_processed(rc, next);
2587 			break;
2588 		}
2589 
2590 		WARN_ON(1);
2591 		root = NULL;
2592 		next = walk_down_backref(edges, &index);
2593 		if (!next || next->level <= node->level)
2594 			break;
2595 	}
2596 	if (!root)
2597 		return NULL;
2598 
2599 	next = node;
2600 	/* setup backref node path for btrfs_reloc_cow_block */
2601 	while (1) {
2602 		rc->backref_cache.path[next->level] = next;
2603 		if (--index < 0)
2604 			break;
2605 		next = edges[index]->node[UPPER];
2606 	}
2607 	return root;
2608 }
2609 
2610 /*
2611  * select a tree root for relocation. return NULL if the block
2612  * is reference counted. we should use do_relocation() in this
2613  * case. return a tree root pointer if the block isn't reference
2614  * counted. return -ENOENT if the block is root of reloc tree.
2615  */
2616 static noinline_for_stack
2617 struct btrfs_root *select_one_root(struct backref_node *node)
2618 {
2619 	struct backref_node *next;
2620 	struct btrfs_root *root;
2621 	struct btrfs_root *fs_root = NULL;
2622 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2623 	int index = 0;
2624 
2625 	next = node;
2626 	while (1) {
2627 		cond_resched();
2628 		next = walk_up_backref(next, edges, &index);
2629 		root = next->root;
2630 		BUG_ON(!root);
2631 
2632 		/* no other choice for non-references counted tree */
2633 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2634 			return root;
2635 
2636 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2637 			fs_root = root;
2638 
2639 		if (next != node)
2640 			return NULL;
2641 
2642 		next = walk_down_backref(edges, &index);
2643 		if (!next || next->level <= node->level)
2644 			break;
2645 	}
2646 
2647 	if (!fs_root)
2648 		return ERR_PTR(-ENOENT);
2649 	return fs_root;
2650 }
2651 
2652 static noinline_for_stack
2653 u64 calcu_metadata_size(struct reloc_control *rc,
2654 			struct backref_node *node, int reserve)
2655 {
2656 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2657 	struct backref_node *next = node;
2658 	struct backref_edge *edge;
2659 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2660 	u64 num_bytes = 0;
2661 	int index = 0;
2662 
2663 	BUG_ON(reserve && node->processed);
2664 
2665 	while (next) {
2666 		cond_resched();
2667 		while (1) {
2668 			if (next->processed && (reserve || next != node))
2669 				break;
2670 
2671 			num_bytes += fs_info->nodesize;
2672 
2673 			if (list_empty(&next->upper))
2674 				break;
2675 
2676 			edge = list_entry(next->upper.next,
2677 					  struct backref_edge, list[LOWER]);
2678 			edges[index++] = edge;
2679 			next = edge->node[UPPER];
2680 		}
2681 		next = walk_down_backref(edges, &index);
2682 	}
2683 	return num_bytes;
2684 }
2685 
2686 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2687 				  struct reloc_control *rc,
2688 				  struct backref_node *node)
2689 {
2690 	struct btrfs_root *root = rc->extent_root;
2691 	struct btrfs_fs_info *fs_info = root->fs_info;
2692 	u64 num_bytes;
2693 	int ret;
2694 	u64 tmp;
2695 
2696 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2697 
2698 	trans->block_rsv = rc->block_rsv;
2699 	rc->reserved_bytes += num_bytes;
2700 
2701 	/*
2702 	 * We are under a transaction here so we can only do limited flushing.
2703 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2704 	 * transaction and try to refill when we can flush all the things.
2705 	 */
2706 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2707 				BTRFS_RESERVE_FLUSH_LIMIT);
2708 	if (ret) {
2709 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2710 		while (tmp <= rc->reserved_bytes)
2711 			tmp <<= 1;
2712 		/*
2713 		 * only one thread can access block_rsv at this point,
2714 		 * so we don't need hold lock to protect block_rsv.
2715 		 * we expand more reservation size here to allow enough
2716 		 * space for relocation and we will return earlier in
2717 		 * enospc case.
2718 		 */
2719 		rc->block_rsv->size = tmp + fs_info->nodesize *
2720 				      RELOCATION_RESERVED_NODES;
2721 		return -EAGAIN;
2722 	}
2723 
2724 	return 0;
2725 }
2726 
2727 /*
2728  * relocate a block tree, and then update pointers in upper level
2729  * blocks that reference the block to point to the new location.
2730  *
2731  * if called by link_to_upper, the block has already been relocated.
2732  * in that case this function just updates pointers.
2733  */
2734 static int do_relocation(struct btrfs_trans_handle *trans,
2735 			 struct reloc_control *rc,
2736 			 struct backref_node *node,
2737 			 struct btrfs_key *key,
2738 			 struct btrfs_path *path, int lowest)
2739 {
2740 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2741 	struct backref_node *upper;
2742 	struct backref_edge *edge;
2743 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2744 	struct btrfs_root *root;
2745 	struct extent_buffer *eb;
2746 	u32 blocksize;
2747 	u64 bytenr;
2748 	u64 generation;
2749 	int slot;
2750 	int ret;
2751 	int err = 0;
2752 
2753 	BUG_ON(lowest && node->eb);
2754 
2755 	path->lowest_level = node->level + 1;
2756 	rc->backref_cache.path[node->level] = node;
2757 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2758 		struct btrfs_key first_key;
2759 		struct btrfs_ref ref = { 0 };
2760 
2761 		cond_resched();
2762 
2763 		upper = edge->node[UPPER];
2764 		root = select_reloc_root(trans, rc, upper, edges);
2765 		BUG_ON(!root);
2766 
2767 		if (upper->eb && !upper->locked) {
2768 			if (!lowest) {
2769 				ret = btrfs_bin_search(upper->eb, key,
2770 						       upper->level, &slot);
2771 				if (ret < 0) {
2772 					err = ret;
2773 					goto next;
2774 				}
2775 				BUG_ON(ret);
2776 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2777 				if (node->eb->start == bytenr)
2778 					goto next;
2779 			}
2780 			drop_node_buffer(upper);
2781 		}
2782 
2783 		if (!upper->eb) {
2784 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2785 			if (ret) {
2786 				if (ret < 0)
2787 					err = ret;
2788 				else
2789 					err = -ENOENT;
2790 
2791 				btrfs_release_path(path);
2792 				break;
2793 			}
2794 
2795 			if (!upper->eb) {
2796 				upper->eb = path->nodes[upper->level];
2797 				path->nodes[upper->level] = NULL;
2798 			} else {
2799 				BUG_ON(upper->eb != path->nodes[upper->level]);
2800 			}
2801 
2802 			upper->locked = 1;
2803 			path->locks[upper->level] = 0;
2804 
2805 			slot = path->slots[upper->level];
2806 			btrfs_release_path(path);
2807 		} else {
2808 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2809 					       &slot);
2810 			if (ret < 0) {
2811 				err = ret;
2812 				goto next;
2813 			}
2814 			BUG_ON(ret);
2815 		}
2816 
2817 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2818 		if (lowest) {
2819 			if (bytenr != node->bytenr) {
2820 				btrfs_err(root->fs_info,
2821 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2822 					  bytenr, node->bytenr, slot,
2823 					  upper->eb->start);
2824 				err = -EIO;
2825 				goto next;
2826 			}
2827 		} else {
2828 			if (node->eb->start == bytenr)
2829 				goto next;
2830 		}
2831 
2832 		blocksize = root->fs_info->nodesize;
2833 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2834 		btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2835 		eb = read_tree_block(fs_info, bytenr, generation,
2836 				     upper->level - 1, &first_key);
2837 		if (IS_ERR(eb)) {
2838 			err = PTR_ERR(eb);
2839 			goto next;
2840 		} else if (!extent_buffer_uptodate(eb)) {
2841 			free_extent_buffer(eb);
2842 			err = -EIO;
2843 			goto next;
2844 		}
2845 		btrfs_tree_lock(eb);
2846 		btrfs_set_lock_blocking_write(eb);
2847 
2848 		if (!node->eb) {
2849 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2850 					      slot, &eb);
2851 			btrfs_tree_unlock(eb);
2852 			free_extent_buffer(eb);
2853 			if (ret < 0) {
2854 				err = ret;
2855 				goto next;
2856 			}
2857 			BUG_ON(node->eb != eb);
2858 		} else {
2859 			btrfs_set_node_blockptr(upper->eb, slot,
2860 						node->eb->start);
2861 			btrfs_set_node_ptr_generation(upper->eb, slot,
2862 						      trans->transid);
2863 			btrfs_mark_buffer_dirty(upper->eb);
2864 
2865 			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2866 					       node->eb->start, blocksize,
2867 					       upper->eb->start);
2868 			ref.real_root = root->root_key.objectid;
2869 			btrfs_init_tree_ref(&ref, node->level,
2870 					    btrfs_header_owner(upper->eb));
2871 			ret = btrfs_inc_extent_ref(trans, &ref);
2872 			BUG_ON(ret);
2873 
2874 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2875 			BUG_ON(ret);
2876 		}
2877 next:
2878 		if (!upper->pending)
2879 			drop_node_buffer(upper);
2880 		else
2881 			unlock_node_buffer(upper);
2882 		if (err)
2883 			break;
2884 	}
2885 
2886 	if (!err && node->pending) {
2887 		drop_node_buffer(node);
2888 		list_move_tail(&node->list, &rc->backref_cache.changed);
2889 		node->pending = 0;
2890 	}
2891 
2892 	path->lowest_level = 0;
2893 	BUG_ON(err == -ENOSPC);
2894 	return err;
2895 }
2896 
2897 static int link_to_upper(struct btrfs_trans_handle *trans,
2898 			 struct reloc_control *rc,
2899 			 struct backref_node *node,
2900 			 struct btrfs_path *path)
2901 {
2902 	struct btrfs_key key;
2903 
2904 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2905 	return do_relocation(trans, rc, node, &key, path, 0);
2906 }
2907 
2908 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2909 				struct reloc_control *rc,
2910 				struct btrfs_path *path, int err)
2911 {
2912 	LIST_HEAD(list);
2913 	struct backref_cache *cache = &rc->backref_cache;
2914 	struct backref_node *node;
2915 	int level;
2916 	int ret;
2917 
2918 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2919 		while (!list_empty(&cache->pending[level])) {
2920 			node = list_entry(cache->pending[level].next,
2921 					  struct backref_node, list);
2922 			list_move_tail(&node->list, &list);
2923 			BUG_ON(!node->pending);
2924 
2925 			if (!err) {
2926 				ret = link_to_upper(trans, rc, node, path);
2927 				if (ret < 0)
2928 					err = ret;
2929 			}
2930 		}
2931 		list_splice_init(&list, &cache->pending[level]);
2932 	}
2933 	return err;
2934 }
2935 
2936 static void mark_block_processed(struct reloc_control *rc,
2937 				 u64 bytenr, u32 blocksize)
2938 {
2939 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2940 			EXTENT_DIRTY);
2941 }
2942 
2943 static void __mark_block_processed(struct reloc_control *rc,
2944 				   struct backref_node *node)
2945 {
2946 	u32 blocksize;
2947 	if (node->level == 0 ||
2948 	    in_block_group(node->bytenr, rc->block_group)) {
2949 		blocksize = rc->extent_root->fs_info->nodesize;
2950 		mark_block_processed(rc, node->bytenr, blocksize);
2951 	}
2952 	node->processed = 1;
2953 }
2954 
2955 /*
2956  * mark a block and all blocks directly/indirectly reference the block
2957  * as processed.
2958  */
2959 static void update_processed_blocks(struct reloc_control *rc,
2960 				    struct backref_node *node)
2961 {
2962 	struct backref_node *next = node;
2963 	struct backref_edge *edge;
2964 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2965 	int index = 0;
2966 
2967 	while (next) {
2968 		cond_resched();
2969 		while (1) {
2970 			if (next->processed)
2971 				break;
2972 
2973 			__mark_block_processed(rc, next);
2974 
2975 			if (list_empty(&next->upper))
2976 				break;
2977 
2978 			edge = list_entry(next->upper.next,
2979 					  struct backref_edge, list[LOWER]);
2980 			edges[index++] = edge;
2981 			next = edge->node[UPPER];
2982 		}
2983 		next = walk_down_backref(edges, &index);
2984 	}
2985 }
2986 
2987 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2988 {
2989 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2990 
2991 	if (test_range_bit(&rc->processed_blocks, bytenr,
2992 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2993 		return 1;
2994 	return 0;
2995 }
2996 
2997 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2998 			      struct tree_block *block)
2999 {
3000 	struct extent_buffer *eb;
3001 
3002 	BUG_ON(block->key_ready);
3003 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
3004 			     block->level, NULL);
3005 	if (IS_ERR(eb)) {
3006 		return PTR_ERR(eb);
3007 	} else if (!extent_buffer_uptodate(eb)) {
3008 		free_extent_buffer(eb);
3009 		return -EIO;
3010 	}
3011 	if (block->level == 0)
3012 		btrfs_item_key_to_cpu(eb, &block->key, 0);
3013 	else
3014 		btrfs_node_key_to_cpu(eb, &block->key, 0);
3015 	free_extent_buffer(eb);
3016 	block->key_ready = 1;
3017 	return 0;
3018 }
3019 
3020 /*
3021  * helper function to relocate a tree block
3022  */
3023 static int relocate_tree_block(struct btrfs_trans_handle *trans,
3024 				struct reloc_control *rc,
3025 				struct backref_node *node,
3026 				struct btrfs_key *key,
3027 				struct btrfs_path *path)
3028 {
3029 	struct btrfs_root *root;
3030 	int ret = 0;
3031 
3032 	if (!node)
3033 		return 0;
3034 
3035 	BUG_ON(node->processed);
3036 	root = select_one_root(node);
3037 	if (root == ERR_PTR(-ENOENT)) {
3038 		update_processed_blocks(rc, node);
3039 		goto out;
3040 	}
3041 
3042 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3043 		ret = reserve_metadata_space(trans, rc, node);
3044 		if (ret)
3045 			goto out;
3046 	}
3047 
3048 	if (root) {
3049 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3050 			BUG_ON(node->new_bytenr);
3051 			BUG_ON(!list_empty(&node->list));
3052 			btrfs_record_root_in_trans(trans, root);
3053 			root = root->reloc_root;
3054 			node->new_bytenr = root->node->start;
3055 			node->root = root;
3056 			list_add_tail(&node->list, &rc->backref_cache.changed);
3057 		} else {
3058 			path->lowest_level = node->level;
3059 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3060 			btrfs_release_path(path);
3061 			if (ret > 0)
3062 				ret = 0;
3063 		}
3064 		if (!ret)
3065 			update_processed_blocks(rc, node);
3066 	} else {
3067 		ret = do_relocation(trans, rc, node, key, path, 1);
3068 	}
3069 out:
3070 	if (ret || node->level == 0 || node->cowonly)
3071 		remove_backref_node(&rc->backref_cache, node);
3072 	return ret;
3073 }
3074 
3075 /*
3076  * relocate a list of blocks
3077  */
3078 static noinline_for_stack
3079 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3080 			 struct reloc_control *rc, struct rb_root *blocks)
3081 {
3082 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3083 	struct backref_node *node;
3084 	struct btrfs_path *path;
3085 	struct tree_block *block;
3086 	struct tree_block *next;
3087 	int ret;
3088 	int err = 0;
3089 
3090 	path = btrfs_alloc_path();
3091 	if (!path) {
3092 		err = -ENOMEM;
3093 		goto out_free_blocks;
3094 	}
3095 
3096 	/* Kick in readahead for tree blocks with missing keys */
3097 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3098 		if (!block->key_ready)
3099 			readahead_tree_block(fs_info, block->bytenr);
3100 	}
3101 
3102 	/* Get first keys */
3103 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3104 		if (!block->key_ready) {
3105 			err = get_tree_block_key(fs_info, block);
3106 			if (err)
3107 				goto out_free_path;
3108 		}
3109 	}
3110 
3111 	/* Do tree relocation */
3112 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3113 		node = build_backref_tree(rc, &block->key,
3114 					  block->level, block->bytenr);
3115 		if (IS_ERR(node)) {
3116 			err = PTR_ERR(node);
3117 			goto out;
3118 		}
3119 
3120 		ret = relocate_tree_block(trans, rc, node, &block->key,
3121 					  path);
3122 		if (ret < 0) {
3123 			if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
3124 				err = ret;
3125 			goto out;
3126 		}
3127 	}
3128 out:
3129 	err = finish_pending_nodes(trans, rc, path, err);
3130 
3131 out_free_path:
3132 	btrfs_free_path(path);
3133 out_free_blocks:
3134 	free_block_list(blocks);
3135 	return err;
3136 }
3137 
3138 static noinline_for_stack
3139 int prealloc_file_extent_cluster(struct inode *inode,
3140 				 struct file_extent_cluster *cluster)
3141 {
3142 	u64 alloc_hint = 0;
3143 	u64 start;
3144 	u64 end;
3145 	u64 offset = BTRFS_I(inode)->index_cnt;
3146 	u64 num_bytes;
3147 	int nr = 0;
3148 	int ret = 0;
3149 	u64 prealloc_start = cluster->start - offset;
3150 	u64 prealloc_end = cluster->end - offset;
3151 	u64 cur_offset;
3152 	struct extent_changeset *data_reserved = NULL;
3153 
3154 	BUG_ON(cluster->start != cluster->boundary[0]);
3155 	inode_lock(inode);
3156 
3157 	ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3158 					  prealloc_end + 1 - prealloc_start);
3159 	if (ret)
3160 		goto out;
3161 
3162 	cur_offset = prealloc_start;
3163 	while (nr < cluster->nr) {
3164 		start = cluster->boundary[nr] - offset;
3165 		if (nr + 1 < cluster->nr)
3166 			end = cluster->boundary[nr + 1] - 1 - offset;
3167 		else
3168 			end = cluster->end - offset;
3169 
3170 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3171 		num_bytes = end + 1 - start;
3172 		if (cur_offset < start)
3173 			btrfs_free_reserved_data_space(inode, data_reserved,
3174 					cur_offset, start - cur_offset);
3175 		ret = btrfs_prealloc_file_range(inode, 0, start,
3176 						num_bytes, num_bytes,
3177 						end + 1, &alloc_hint);
3178 		cur_offset = end + 1;
3179 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3180 		if (ret)
3181 			break;
3182 		nr++;
3183 	}
3184 	if (cur_offset < prealloc_end)
3185 		btrfs_free_reserved_data_space(inode, data_reserved,
3186 				cur_offset, prealloc_end + 1 - cur_offset);
3187 out:
3188 	inode_unlock(inode);
3189 	extent_changeset_free(data_reserved);
3190 	return ret;
3191 }
3192 
3193 static noinline_for_stack
3194 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3195 			 u64 block_start)
3196 {
3197 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3198 	struct extent_map *em;
3199 	int ret = 0;
3200 
3201 	em = alloc_extent_map();
3202 	if (!em)
3203 		return -ENOMEM;
3204 
3205 	em->start = start;
3206 	em->len = end + 1 - start;
3207 	em->block_len = em->len;
3208 	em->block_start = block_start;
3209 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3210 
3211 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3212 	while (1) {
3213 		write_lock(&em_tree->lock);
3214 		ret = add_extent_mapping(em_tree, em, 0);
3215 		write_unlock(&em_tree->lock);
3216 		if (ret != -EEXIST) {
3217 			free_extent_map(em);
3218 			break;
3219 		}
3220 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3221 	}
3222 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3223 	return ret;
3224 }
3225 
3226 static int relocate_file_extent_cluster(struct inode *inode,
3227 					struct file_extent_cluster *cluster)
3228 {
3229 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3230 	u64 page_start;
3231 	u64 page_end;
3232 	u64 offset = BTRFS_I(inode)->index_cnt;
3233 	unsigned long index;
3234 	unsigned long last_index;
3235 	struct page *page;
3236 	struct file_ra_state *ra;
3237 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3238 	int nr = 0;
3239 	int ret = 0;
3240 
3241 	if (!cluster->nr)
3242 		return 0;
3243 
3244 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3245 	if (!ra)
3246 		return -ENOMEM;
3247 
3248 	ret = prealloc_file_extent_cluster(inode, cluster);
3249 	if (ret)
3250 		goto out;
3251 
3252 	file_ra_state_init(ra, inode->i_mapping);
3253 
3254 	ret = setup_extent_mapping(inode, cluster->start - offset,
3255 				   cluster->end - offset, cluster->start);
3256 	if (ret)
3257 		goto out;
3258 
3259 	index = (cluster->start - offset) >> PAGE_SHIFT;
3260 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3261 	while (index <= last_index) {
3262 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3263 				PAGE_SIZE);
3264 		if (ret)
3265 			goto out;
3266 
3267 		page = find_lock_page(inode->i_mapping, index);
3268 		if (!page) {
3269 			page_cache_sync_readahead(inode->i_mapping,
3270 						  ra, NULL, index,
3271 						  last_index + 1 - index);
3272 			page = find_or_create_page(inode->i_mapping, index,
3273 						   mask);
3274 			if (!page) {
3275 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3276 							PAGE_SIZE, true);
3277 				btrfs_delalloc_release_extents(BTRFS_I(inode),
3278 							PAGE_SIZE);
3279 				ret = -ENOMEM;
3280 				goto out;
3281 			}
3282 		}
3283 
3284 		if (PageReadahead(page)) {
3285 			page_cache_async_readahead(inode->i_mapping,
3286 						   ra, NULL, page, index,
3287 						   last_index + 1 - index);
3288 		}
3289 
3290 		if (!PageUptodate(page)) {
3291 			btrfs_readpage(NULL, page);
3292 			lock_page(page);
3293 			if (!PageUptodate(page)) {
3294 				unlock_page(page);
3295 				put_page(page);
3296 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3297 							PAGE_SIZE, true);
3298 				btrfs_delalloc_release_extents(BTRFS_I(inode),
3299 							       PAGE_SIZE);
3300 				ret = -EIO;
3301 				goto out;
3302 			}
3303 		}
3304 
3305 		page_start = page_offset(page);
3306 		page_end = page_start + PAGE_SIZE - 1;
3307 
3308 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3309 
3310 		set_page_extent_mapped(page);
3311 
3312 		if (nr < cluster->nr &&
3313 		    page_start + offset == cluster->boundary[nr]) {
3314 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3315 					page_start, page_end,
3316 					EXTENT_BOUNDARY);
3317 			nr++;
3318 		}
3319 
3320 		ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
3321 						NULL);
3322 		if (ret) {
3323 			unlock_page(page);
3324 			put_page(page);
3325 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
3326 							 PAGE_SIZE, true);
3327 			btrfs_delalloc_release_extents(BTRFS_I(inode),
3328 			                               PAGE_SIZE);
3329 
3330 			clear_extent_bits(&BTRFS_I(inode)->io_tree,
3331 					  page_start, page_end,
3332 					  EXTENT_LOCKED | EXTENT_BOUNDARY);
3333 			goto out;
3334 
3335 		}
3336 		set_page_dirty(page);
3337 
3338 		unlock_extent(&BTRFS_I(inode)->io_tree,
3339 			      page_start, page_end);
3340 		unlock_page(page);
3341 		put_page(page);
3342 
3343 		index++;
3344 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
3345 		balance_dirty_pages_ratelimited(inode->i_mapping);
3346 		btrfs_throttle(fs_info);
3347 	}
3348 	WARN_ON(nr != cluster->nr);
3349 out:
3350 	kfree(ra);
3351 	return ret;
3352 }
3353 
3354 static noinline_for_stack
3355 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3356 			 struct file_extent_cluster *cluster)
3357 {
3358 	int ret;
3359 
3360 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3361 		ret = relocate_file_extent_cluster(inode, cluster);
3362 		if (ret)
3363 			return ret;
3364 		cluster->nr = 0;
3365 	}
3366 
3367 	if (!cluster->nr)
3368 		cluster->start = extent_key->objectid;
3369 	else
3370 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3371 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3372 	cluster->boundary[cluster->nr] = extent_key->objectid;
3373 	cluster->nr++;
3374 
3375 	if (cluster->nr >= MAX_EXTENTS) {
3376 		ret = relocate_file_extent_cluster(inode, cluster);
3377 		if (ret)
3378 			return ret;
3379 		cluster->nr = 0;
3380 	}
3381 	return 0;
3382 }
3383 
3384 /*
3385  * helper to add a tree block to the list.
3386  * the major work is getting the generation and level of the block
3387  */
3388 static int add_tree_block(struct reloc_control *rc,
3389 			  struct btrfs_key *extent_key,
3390 			  struct btrfs_path *path,
3391 			  struct rb_root *blocks)
3392 {
3393 	struct extent_buffer *eb;
3394 	struct btrfs_extent_item *ei;
3395 	struct btrfs_tree_block_info *bi;
3396 	struct tree_block *block;
3397 	struct rb_node *rb_node;
3398 	u32 item_size;
3399 	int level = -1;
3400 	u64 generation;
3401 
3402 	eb =  path->nodes[0];
3403 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3404 
3405 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3406 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3407 		ei = btrfs_item_ptr(eb, path->slots[0],
3408 				struct btrfs_extent_item);
3409 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3410 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3411 			level = btrfs_tree_block_level(eb, bi);
3412 		} else {
3413 			level = (int)extent_key->offset;
3414 		}
3415 		generation = btrfs_extent_generation(eb, ei);
3416 	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3417 		btrfs_print_v0_err(eb->fs_info);
3418 		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3419 		return -EINVAL;
3420 	} else {
3421 		BUG();
3422 	}
3423 
3424 	btrfs_release_path(path);
3425 
3426 	BUG_ON(level == -1);
3427 
3428 	block = kmalloc(sizeof(*block), GFP_NOFS);
3429 	if (!block)
3430 		return -ENOMEM;
3431 
3432 	block->bytenr = extent_key->objectid;
3433 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3434 	block->key.offset = generation;
3435 	block->level = level;
3436 	block->key_ready = 0;
3437 
3438 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3439 	if (rb_node)
3440 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3441 
3442 	return 0;
3443 }
3444 
3445 /*
3446  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3447  */
3448 static int __add_tree_block(struct reloc_control *rc,
3449 			    u64 bytenr, u32 blocksize,
3450 			    struct rb_root *blocks)
3451 {
3452 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3453 	struct btrfs_path *path;
3454 	struct btrfs_key key;
3455 	int ret;
3456 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3457 
3458 	if (tree_block_processed(bytenr, rc))
3459 		return 0;
3460 
3461 	if (tree_search(blocks, bytenr))
3462 		return 0;
3463 
3464 	path = btrfs_alloc_path();
3465 	if (!path)
3466 		return -ENOMEM;
3467 again:
3468 	key.objectid = bytenr;
3469 	if (skinny) {
3470 		key.type = BTRFS_METADATA_ITEM_KEY;
3471 		key.offset = (u64)-1;
3472 	} else {
3473 		key.type = BTRFS_EXTENT_ITEM_KEY;
3474 		key.offset = blocksize;
3475 	}
3476 
3477 	path->search_commit_root = 1;
3478 	path->skip_locking = 1;
3479 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3480 	if (ret < 0)
3481 		goto out;
3482 
3483 	if (ret > 0 && skinny) {
3484 		if (path->slots[0]) {
3485 			path->slots[0]--;
3486 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3487 					      path->slots[0]);
3488 			if (key.objectid == bytenr &&
3489 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3490 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3491 			      key.offset == blocksize)))
3492 				ret = 0;
3493 		}
3494 
3495 		if (ret) {
3496 			skinny = false;
3497 			btrfs_release_path(path);
3498 			goto again;
3499 		}
3500 	}
3501 	if (ret) {
3502 		ASSERT(ret == 1);
3503 		btrfs_print_leaf(path->nodes[0]);
3504 		btrfs_err(fs_info,
3505 	     "tree block extent item (%llu) is not found in extent tree",
3506 		     bytenr);
3507 		WARN_ON(1);
3508 		ret = -EINVAL;
3509 		goto out;
3510 	}
3511 
3512 	ret = add_tree_block(rc, &key, path, blocks);
3513 out:
3514 	btrfs_free_path(path);
3515 	return ret;
3516 }
3517 
3518 /*
3519  * helper to check if the block use full backrefs for pointers in it
3520  */
3521 static int block_use_full_backref(struct reloc_control *rc,
3522 				  struct extent_buffer *eb)
3523 {
3524 	u64 flags;
3525 	int ret;
3526 
3527 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3528 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3529 		return 1;
3530 
3531 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3532 				       eb->start, btrfs_header_level(eb), 1,
3533 				       NULL, &flags);
3534 	BUG_ON(ret);
3535 
3536 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3537 		ret = 1;
3538 	else
3539 		ret = 0;
3540 	return ret;
3541 }
3542 
3543 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3544 				    struct btrfs_block_group *block_group,
3545 				    struct inode *inode,
3546 				    u64 ino)
3547 {
3548 	struct btrfs_key key;
3549 	struct btrfs_root *root = fs_info->tree_root;
3550 	struct btrfs_trans_handle *trans;
3551 	int ret = 0;
3552 
3553 	if (inode)
3554 		goto truncate;
3555 
3556 	key.objectid = ino;
3557 	key.type = BTRFS_INODE_ITEM_KEY;
3558 	key.offset = 0;
3559 
3560 	inode = btrfs_iget(fs_info->sb, &key, root);
3561 	if (IS_ERR(inode))
3562 		return -ENOENT;
3563 
3564 truncate:
3565 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3566 						 &fs_info->global_block_rsv);
3567 	if (ret)
3568 		goto out;
3569 
3570 	trans = btrfs_join_transaction(root);
3571 	if (IS_ERR(trans)) {
3572 		ret = PTR_ERR(trans);
3573 		goto out;
3574 	}
3575 
3576 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3577 
3578 	btrfs_end_transaction(trans);
3579 	btrfs_btree_balance_dirty(fs_info);
3580 out:
3581 	iput(inode);
3582 	return ret;
3583 }
3584 
3585 /*
3586  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3587  * this function scans fs tree to find blocks reference the data extent
3588  */
3589 static int find_data_references(struct reloc_control *rc,
3590 				struct btrfs_key *extent_key,
3591 				struct extent_buffer *leaf,
3592 				struct btrfs_extent_data_ref *ref,
3593 				struct rb_root *blocks)
3594 {
3595 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3596 	struct btrfs_path *path;
3597 	struct tree_block *block;
3598 	struct btrfs_root *root;
3599 	struct btrfs_file_extent_item *fi;
3600 	struct rb_node *rb_node;
3601 	struct btrfs_key key;
3602 	u64 ref_root;
3603 	u64 ref_objectid;
3604 	u64 ref_offset;
3605 	u32 ref_count;
3606 	u32 nritems;
3607 	int err = 0;
3608 	int added = 0;
3609 	int counted;
3610 	int ret;
3611 
3612 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3613 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3614 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3615 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3616 
3617 	/*
3618 	 * This is an extent belonging to the free space cache, lets just delete
3619 	 * it and redo the search.
3620 	 */
3621 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3622 		ret = delete_block_group_cache(fs_info, rc->block_group,
3623 					       NULL, ref_objectid);
3624 		if (ret != -ENOENT)
3625 			return ret;
3626 		ret = 0;
3627 	}
3628 
3629 	path = btrfs_alloc_path();
3630 	if (!path)
3631 		return -ENOMEM;
3632 	path->reada = READA_FORWARD;
3633 
3634 	root = read_fs_root(fs_info, ref_root);
3635 	if (IS_ERR(root)) {
3636 		err = PTR_ERR(root);
3637 		goto out;
3638 	}
3639 
3640 	key.objectid = ref_objectid;
3641 	key.type = BTRFS_EXTENT_DATA_KEY;
3642 	if (ref_offset > ((u64)-1 << 32))
3643 		key.offset = 0;
3644 	else
3645 		key.offset = ref_offset;
3646 
3647 	path->search_commit_root = 1;
3648 	path->skip_locking = 1;
3649 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3650 	if (ret < 0) {
3651 		err = ret;
3652 		goto out;
3653 	}
3654 
3655 	leaf = path->nodes[0];
3656 	nritems = btrfs_header_nritems(leaf);
3657 	/*
3658 	 * the references in tree blocks that use full backrefs
3659 	 * are not counted in
3660 	 */
3661 	if (block_use_full_backref(rc, leaf))
3662 		counted = 0;
3663 	else
3664 		counted = 1;
3665 	rb_node = tree_search(blocks, leaf->start);
3666 	if (rb_node) {
3667 		if (counted)
3668 			added = 1;
3669 		else
3670 			path->slots[0] = nritems;
3671 	}
3672 
3673 	while (ref_count > 0) {
3674 		while (path->slots[0] >= nritems) {
3675 			ret = btrfs_next_leaf(root, path);
3676 			if (ret < 0) {
3677 				err = ret;
3678 				goto out;
3679 			}
3680 			if (WARN_ON(ret > 0))
3681 				goto out;
3682 
3683 			leaf = path->nodes[0];
3684 			nritems = btrfs_header_nritems(leaf);
3685 			added = 0;
3686 
3687 			if (block_use_full_backref(rc, leaf))
3688 				counted = 0;
3689 			else
3690 				counted = 1;
3691 			rb_node = tree_search(blocks, leaf->start);
3692 			if (rb_node) {
3693 				if (counted)
3694 					added = 1;
3695 				else
3696 					path->slots[0] = nritems;
3697 			}
3698 		}
3699 
3700 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3701 		if (WARN_ON(key.objectid != ref_objectid ||
3702 		    key.type != BTRFS_EXTENT_DATA_KEY))
3703 			break;
3704 
3705 		fi = btrfs_item_ptr(leaf, path->slots[0],
3706 				    struct btrfs_file_extent_item);
3707 
3708 		if (btrfs_file_extent_type(leaf, fi) ==
3709 		    BTRFS_FILE_EXTENT_INLINE)
3710 			goto next;
3711 
3712 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3713 		    extent_key->objectid)
3714 			goto next;
3715 
3716 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3717 		if (key.offset != ref_offset)
3718 			goto next;
3719 
3720 		if (counted)
3721 			ref_count--;
3722 		if (added)
3723 			goto next;
3724 
3725 		if (!tree_block_processed(leaf->start, rc)) {
3726 			block = kmalloc(sizeof(*block), GFP_NOFS);
3727 			if (!block) {
3728 				err = -ENOMEM;
3729 				break;
3730 			}
3731 			block->bytenr = leaf->start;
3732 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3733 			block->level = 0;
3734 			block->key_ready = 1;
3735 			rb_node = tree_insert(blocks, block->bytenr,
3736 					      &block->rb_node);
3737 			if (rb_node)
3738 				backref_tree_panic(rb_node, -EEXIST,
3739 						   block->bytenr);
3740 		}
3741 		if (counted)
3742 			added = 1;
3743 		else
3744 			path->slots[0] = nritems;
3745 next:
3746 		path->slots[0]++;
3747 
3748 	}
3749 out:
3750 	btrfs_free_path(path);
3751 	return err;
3752 }
3753 
3754 /*
3755  * helper to find all tree blocks that reference a given data extent
3756  */
3757 static noinline_for_stack
3758 int add_data_references(struct reloc_control *rc,
3759 			struct btrfs_key *extent_key,
3760 			struct btrfs_path *path,
3761 			struct rb_root *blocks)
3762 {
3763 	struct btrfs_key key;
3764 	struct extent_buffer *eb;
3765 	struct btrfs_extent_data_ref *dref;
3766 	struct btrfs_extent_inline_ref *iref;
3767 	unsigned long ptr;
3768 	unsigned long end;
3769 	u32 blocksize = rc->extent_root->fs_info->nodesize;
3770 	int ret = 0;
3771 	int err = 0;
3772 
3773 	eb = path->nodes[0];
3774 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3775 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3776 	ptr += sizeof(struct btrfs_extent_item);
3777 
3778 	while (ptr < end) {
3779 		iref = (struct btrfs_extent_inline_ref *)ptr;
3780 		key.type = btrfs_get_extent_inline_ref_type(eb, iref,
3781 							BTRFS_REF_TYPE_DATA);
3782 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3783 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3784 			ret = __add_tree_block(rc, key.offset, blocksize,
3785 					       blocks);
3786 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3787 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3788 			ret = find_data_references(rc, extent_key,
3789 						   eb, dref, blocks);
3790 		} else {
3791 			ret = -EUCLEAN;
3792 			btrfs_err(rc->extent_root->fs_info,
3793 		     "extent %llu slot %d has an invalid inline ref type",
3794 			     eb->start, path->slots[0]);
3795 		}
3796 		if (ret) {
3797 			err = ret;
3798 			goto out;
3799 		}
3800 		ptr += btrfs_extent_inline_ref_size(key.type);
3801 	}
3802 	WARN_ON(ptr > end);
3803 
3804 	while (1) {
3805 		cond_resched();
3806 		eb = path->nodes[0];
3807 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3808 			ret = btrfs_next_leaf(rc->extent_root, path);
3809 			if (ret < 0) {
3810 				err = ret;
3811 				break;
3812 			}
3813 			if (ret > 0)
3814 				break;
3815 			eb = path->nodes[0];
3816 		}
3817 
3818 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3819 		if (key.objectid != extent_key->objectid)
3820 			break;
3821 
3822 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3823 			ret = __add_tree_block(rc, key.offset, blocksize,
3824 					       blocks);
3825 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3826 			dref = btrfs_item_ptr(eb, path->slots[0],
3827 					      struct btrfs_extent_data_ref);
3828 			ret = find_data_references(rc, extent_key,
3829 						   eb, dref, blocks);
3830 		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
3831 			btrfs_print_v0_err(eb->fs_info);
3832 			btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3833 			ret = -EINVAL;
3834 		} else {
3835 			ret = 0;
3836 		}
3837 		if (ret) {
3838 			err = ret;
3839 			break;
3840 		}
3841 		path->slots[0]++;
3842 	}
3843 out:
3844 	btrfs_release_path(path);
3845 	if (err)
3846 		free_block_list(blocks);
3847 	return err;
3848 }
3849 
3850 /*
3851  * helper to find next unprocessed extent
3852  */
3853 static noinline_for_stack
3854 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3855 		     struct btrfs_key *extent_key)
3856 {
3857 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3858 	struct btrfs_key key;
3859 	struct extent_buffer *leaf;
3860 	u64 start, end, last;
3861 	int ret;
3862 
3863 	last = rc->block_group->start + rc->block_group->length;
3864 	while (1) {
3865 		cond_resched();
3866 		if (rc->search_start >= last) {
3867 			ret = 1;
3868 			break;
3869 		}
3870 
3871 		key.objectid = rc->search_start;
3872 		key.type = BTRFS_EXTENT_ITEM_KEY;
3873 		key.offset = 0;
3874 
3875 		path->search_commit_root = 1;
3876 		path->skip_locking = 1;
3877 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3878 					0, 0);
3879 		if (ret < 0)
3880 			break;
3881 next:
3882 		leaf = path->nodes[0];
3883 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3884 			ret = btrfs_next_leaf(rc->extent_root, path);
3885 			if (ret != 0)
3886 				break;
3887 			leaf = path->nodes[0];
3888 		}
3889 
3890 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3891 		if (key.objectid >= last) {
3892 			ret = 1;
3893 			break;
3894 		}
3895 
3896 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3897 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3898 			path->slots[0]++;
3899 			goto next;
3900 		}
3901 
3902 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3903 		    key.objectid + key.offset <= rc->search_start) {
3904 			path->slots[0]++;
3905 			goto next;
3906 		}
3907 
3908 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3909 		    key.objectid + fs_info->nodesize <=
3910 		    rc->search_start) {
3911 			path->slots[0]++;
3912 			goto next;
3913 		}
3914 
3915 		ret = find_first_extent_bit(&rc->processed_blocks,
3916 					    key.objectid, &start, &end,
3917 					    EXTENT_DIRTY, NULL);
3918 
3919 		if (ret == 0 && start <= key.objectid) {
3920 			btrfs_release_path(path);
3921 			rc->search_start = end + 1;
3922 		} else {
3923 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3924 				rc->search_start = key.objectid + key.offset;
3925 			else
3926 				rc->search_start = key.objectid +
3927 					fs_info->nodesize;
3928 			memcpy(extent_key, &key, sizeof(key));
3929 			return 0;
3930 		}
3931 	}
3932 	btrfs_release_path(path);
3933 	return ret;
3934 }
3935 
3936 static void set_reloc_control(struct reloc_control *rc)
3937 {
3938 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3939 
3940 	mutex_lock(&fs_info->reloc_mutex);
3941 	fs_info->reloc_ctl = rc;
3942 	mutex_unlock(&fs_info->reloc_mutex);
3943 }
3944 
3945 static void unset_reloc_control(struct reloc_control *rc)
3946 {
3947 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3948 
3949 	mutex_lock(&fs_info->reloc_mutex);
3950 	fs_info->reloc_ctl = NULL;
3951 	mutex_unlock(&fs_info->reloc_mutex);
3952 }
3953 
3954 static int check_extent_flags(u64 flags)
3955 {
3956 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3957 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3958 		return 1;
3959 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3960 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3961 		return 1;
3962 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3963 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3964 		return 1;
3965 	return 0;
3966 }
3967 
3968 static noinline_for_stack
3969 int prepare_to_relocate(struct reloc_control *rc)
3970 {
3971 	struct btrfs_trans_handle *trans;
3972 	int ret;
3973 
3974 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3975 					      BTRFS_BLOCK_RSV_TEMP);
3976 	if (!rc->block_rsv)
3977 		return -ENOMEM;
3978 
3979 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3980 	rc->search_start = rc->block_group->start;
3981 	rc->extents_found = 0;
3982 	rc->nodes_relocated = 0;
3983 	rc->merging_rsv_size = 0;
3984 	rc->reserved_bytes = 0;
3985 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3986 			      RELOCATION_RESERVED_NODES;
3987 	ret = btrfs_block_rsv_refill(rc->extent_root,
3988 				     rc->block_rsv, rc->block_rsv->size,
3989 				     BTRFS_RESERVE_FLUSH_ALL);
3990 	if (ret)
3991 		return ret;
3992 
3993 	rc->create_reloc_tree = 1;
3994 	set_reloc_control(rc);
3995 
3996 	trans = btrfs_join_transaction(rc->extent_root);
3997 	if (IS_ERR(trans)) {
3998 		unset_reloc_control(rc);
3999 		/*
4000 		 * extent tree is not a ref_cow tree and has no reloc_root to
4001 		 * cleanup.  And callers are responsible to free the above
4002 		 * block rsv.
4003 		 */
4004 		return PTR_ERR(trans);
4005 	}
4006 	btrfs_commit_transaction(trans);
4007 	return 0;
4008 }
4009 
4010 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4011 {
4012 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
4013 	struct rb_root blocks = RB_ROOT;
4014 	struct btrfs_key key;
4015 	struct btrfs_trans_handle *trans = NULL;
4016 	struct btrfs_path *path;
4017 	struct btrfs_extent_item *ei;
4018 	u64 flags;
4019 	u32 item_size;
4020 	int ret;
4021 	int err = 0;
4022 	int progress = 0;
4023 
4024 	path = btrfs_alloc_path();
4025 	if (!path)
4026 		return -ENOMEM;
4027 	path->reada = READA_FORWARD;
4028 
4029 	ret = prepare_to_relocate(rc);
4030 	if (ret) {
4031 		err = ret;
4032 		goto out_free;
4033 	}
4034 
4035 	while (1) {
4036 		rc->reserved_bytes = 0;
4037 		ret = btrfs_block_rsv_refill(rc->extent_root,
4038 					rc->block_rsv, rc->block_rsv->size,
4039 					BTRFS_RESERVE_FLUSH_ALL);
4040 		if (ret) {
4041 			err = ret;
4042 			break;
4043 		}
4044 		progress++;
4045 		trans = btrfs_start_transaction(rc->extent_root, 0);
4046 		if (IS_ERR(trans)) {
4047 			err = PTR_ERR(trans);
4048 			trans = NULL;
4049 			break;
4050 		}
4051 restart:
4052 		if (update_backref_cache(trans, &rc->backref_cache)) {
4053 			btrfs_end_transaction(trans);
4054 			trans = NULL;
4055 			continue;
4056 		}
4057 
4058 		ret = find_next_extent(rc, path, &key);
4059 		if (ret < 0)
4060 			err = ret;
4061 		if (ret != 0)
4062 			break;
4063 
4064 		rc->extents_found++;
4065 
4066 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4067 				    struct btrfs_extent_item);
4068 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4069 		if (item_size >= sizeof(*ei)) {
4070 			flags = btrfs_extent_flags(path->nodes[0], ei);
4071 			ret = check_extent_flags(flags);
4072 			BUG_ON(ret);
4073 		} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
4074 			err = -EINVAL;
4075 			btrfs_print_v0_err(trans->fs_info);
4076 			btrfs_abort_transaction(trans, err);
4077 			break;
4078 		} else {
4079 			BUG();
4080 		}
4081 
4082 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4083 			ret = add_tree_block(rc, &key, path, &blocks);
4084 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4085 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4086 			ret = add_data_references(rc, &key, path, &blocks);
4087 		} else {
4088 			btrfs_release_path(path);
4089 			ret = 0;
4090 		}
4091 		if (ret < 0) {
4092 			err = ret;
4093 			break;
4094 		}
4095 
4096 		if (!RB_EMPTY_ROOT(&blocks)) {
4097 			ret = relocate_tree_blocks(trans, rc, &blocks);
4098 			if (ret < 0) {
4099 				/*
4100 				 * if we fail to relocate tree blocks, force to update
4101 				 * backref cache when committing transaction.
4102 				 */
4103 				rc->backref_cache.last_trans = trans->transid - 1;
4104 
4105 				if (ret != -EAGAIN) {
4106 					err = ret;
4107 					break;
4108 				}
4109 				rc->extents_found--;
4110 				rc->search_start = key.objectid;
4111 			}
4112 		}
4113 
4114 		btrfs_end_transaction_throttle(trans);
4115 		btrfs_btree_balance_dirty(fs_info);
4116 		trans = NULL;
4117 
4118 		if (rc->stage == MOVE_DATA_EXTENTS &&
4119 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4120 			rc->found_file_extent = 1;
4121 			ret = relocate_data_extent(rc->data_inode,
4122 						   &key, &rc->cluster);
4123 			if (ret < 0) {
4124 				err = ret;
4125 				break;
4126 			}
4127 		}
4128 	}
4129 	if (trans && progress && err == -ENOSPC) {
4130 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
4131 		if (ret == 1) {
4132 			err = 0;
4133 			progress = 0;
4134 			goto restart;
4135 		}
4136 	}
4137 
4138 	btrfs_release_path(path);
4139 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4140 
4141 	if (trans) {
4142 		btrfs_end_transaction_throttle(trans);
4143 		btrfs_btree_balance_dirty(fs_info);
4144 	}
4145 
4146 	if (!err) {
4147 		ret = relocate_file_extent_cluster(rc->data_inode,
4148 						   &rc->cluster);
4149 		if (ret < 0)
4150 			err = ret;
4151 	}
4152 
4153 	rc->create_reloc_tree = 0;
4154 	set_reloc_control(rc);
4155 
4156 	backref_cache_cleanup(&rc->backref_cache);
4157 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4158 
4159 	err = prepare_to_merge(rc, err);
4160 
4161 	merge_reloc_roots(rc);
4162 
4163 	rc->merge_reloc_tree = 0;
4164 	unset_reloc_control(rc);
4165 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4166 
4167 	/* get rid of pinned extents */
4168 	trans = btrfs_join_transaction(rc->extent_root);
4169 	if (IS_ERR(trans)) {
4170 		err = PTR_ERR(trans);
4171 		goto out_free;
4172 	}
4173 	btrfs_commit_transaction(trans);
4174 	ret = clean_dirty_subvols(rc);
4175 	if (ret < 0 && !err)
4176 		err = ret;
4177 out_free:
4178 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
4179 	btrfs_free_path(path);
4180 	return err;
4181 }
4182 
4183 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4184 				 struct btrfs_root *root, u64 objectid)
4185 {
4186 	struct btrfs_path *path;
4187 	struct btrfs_inode_item *item;
4188 	struct extent_buffer *leaf;
4189 	int ret;
4190 
4191 	path = btrfs_alloc_path();
4192 	if (!path)
4193 		return -ENOMEM;
4194 
4195 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4196 	if (ret)
4197 		goto out;
4198 
4199 	leaf = path->nodes[0];
4200 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4201 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4202 	btrfs_set_inode_generation(leaf, item, 1);
4203 	btrfs_set_inode_size(leaf, item, 0);
4204 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4205 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4206 					  BTRFS_INODE_PREALLOC);
4207 	btrfs_mark_buffer_dirty(leaf);
4208 out:
4209 	btrfs_free_path(path);
4210 	return ret;
4211 }
4212 
4213 /*
4214  * helper to create inode for data relocation.
4215  * the inode is in data relocation tree and its link count is 0
4216  */
4217 static noinline_for_stack
4218 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4219 				 struct btrfs_block_group *group)
4220 {
4221 	struct inode *inode = NULL;
4222 	struct btrfs_trans_handle *trans;
4223 	struct btrfs_root *root;
4224 	struct btrfs_key key;
4225 	u64 objectid;
4226 	int err = 0;
4227 
4228 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4229 	if (IS_ERR(root))
4230 		return ERR_CAST(root);
4231 
4232 	trans = btrfs_start_transaction(root, 6);
4233 	if (IS_ERR(trans))
4234 		return ERR_CAST(trans);
4235 
4236 	err = btrfs_find_free_objectid(root, &objectid);
4237 	if (err)
4238 		goto out;
4239 
4240 	err = __insert_orphan_inode(trans, root, objectid);
4241 	BUG_ON(err);
4242 
4243 	key.objectid = objectid;
4244 	key.type = BTRFS_INODE_ITEM_KEY;
4245 	key.offset = 0;
4246 	inode = btrfs_iget(fs_info->sb, &key, root);
4247 	BUG_ON(IS_ERR(inode));
4248 	BTRFS_I(inode)->index_cnt = group->start;
4249 
4250 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4251 out:
4252 	btrfs_end_transaction(trans);
4253 	btrfs_btree_balance_dirty(fs_info);
4254 	if (err) {
4255 		if (inode)
4256 			iput(inode);
4257 		inode = ERR_PTR(err);
4258 	}
4259 	return inode;
4260 }
4261 
4262 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4263 {
4264 	struct reloc_control *rc;
4265 
4266 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4267 	if (!rc)
4268 		return NULL;
4269 
4270 	INIT_LIST_HEAD(&rc->reloc_roots);
4271 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4272 	backref_cache_init(&rc->backref_cache);
4273 	mapping_tree_init(&rc->reloc_root_tree);
4274 	extent_io_tree_init(fs_info, &rc->processed_blocks,
4275 			    IO_TREE_RELOC_BLOCKS, NULL);
4276 	return rc;
4277 }
4278 
4279 /*
4280  * Print the block group being relocated
4281  */
4282 static void describe_relocation(struct btrfs_fs_info *fs_info,
4283 				struct btrfs_block_group *block_group)
4284 {
4285 	char buf[128] = {'\0'};
4286 
4287 	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4288 
4289 	btrfs_info(fs_info,
4290 		   "relocating block group %llu flags %s",
4291 		   block_group->start, buf);
4292 }
4293 
4294 /*
4295  * function to relocate all extents in a block group.
4296  */
4297 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4298 {
4299 	struct btrfs_block_group *bg;
4300 	struct btrfs_root *extent_root = fs_info->extent_root;
4301 	struct reloc_control *rc;
4302 	struct inode *inode;
4303 	struct btrfs_path *path;
4304 	int ret;
4305 	int rw = 0;
4306 	int err = 0;
4307 
4308 	bg = btrfs_lookup_block_group(fs_info, group_start);
4309 	if (!bg)
4310 		return -ENOENT;
4311 
4312 	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4313 		btrfs_put_block_group(bg);
4314 		return -ETXTBSY;
4315 	}
4316 
4317 	rc = alloc_reloc_control(fs_info);
4318 	if (!rc) {
4319 		btrfs_put_block_group(bg);
4320 		return -ENOMEM;
4321 	}
4322 
4323 	rc->extent_root = extent_root;
4324 	rc->block_group = bg;
4325 
4326 	ret = btrfs_inc_block_group_ro(rc->block_group, true);
4327 	if (ret) {
4328 		err = ret;
4329 		goto out;
4330 	}
4331 	rw = 1;
4332 
4333 	path = btrfs_alloc_path();
4334 	if (!path) {
4335 		err = -ENOMEM;
4336 		goto out;
4337 	}
4338 
4339 	inode = lookup_free_space_inode(rc->block_group, path);
4340 	btrfs_free_path(path);
4341 
4342 	if (!IS_ERR(inode))
4343 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4344 	else
4345 		ret = PTR_ERR(inode);
4346 
4347 	if (ret && ret != -ENOENT) {
4348 		err = ret;
4349 		goto out;
4350 	}
4351 
4352 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4353 	if (IS_ERR(rc->data_inode)) {
4354 		err = PTR_ERR(rc->data_inode);
4355 		rc->data_inode = NULL;
4356 		goto out;
4357 	}
4358 
4359 	describe_relocation(fs_info, rc->block_group);
4360 
4361 	btrfs_wait_block_group_reservations(rc->block_group);
4362 	btrfs_wait_nocow_writers(rc->block_group);
4363 	btrfs_wait_ordered_roots(fs_info, U64_MAX,
4364 				 rc->block_group->start,
4365 				 rc->block_group->length);
4366 
4367 	while (1) {
4368 		mutex_lock(&fs_info->cleaner_mutex);
4369 		ret = relocate_block_group(rc);
4370 		mutex_unlock(&fs_info->cleaner_mutex);
4371 		if (ret < 0)
4372 			err = ret;
4373 
4374 		/*
4375 		 * We may have gotten ENOSPC after we already dirtied some
4376 		 * extents.  If writeout happens while we're relocating a
4377 		 * different block group we could end up hitting the
4378 		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4379 		 * btrfs_reloc_cow_block.  Make sure we write everything out
4380 		 * properly so we don't trip over this problem, and then break
4381 		 * out of the loop if we hit an error.
4382 		 */
4383 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4384 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4385 						       (u64)-1);
4386 			if (ret)
4387 				err = ret;
4388 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4389 						 0, -1);
4390 			rc->stage = UPDATE_DATA_PTRS;
4391 		}
4392 
4393 		if (err < 0)
4394 			goto out;
4395 
4396 		if (rc->extents_found == 0)
4397 			break;
4398 
4399 		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4400 
4401 	}
4402 
4403 	WARN_ON(rc->block_group->pinned > 0);
4404 	WARN_ON(rc->block_group->reserved > 0);
4405 	WARN_ON(rc->block_group->used > 0);
4406 out:
4407 	if (err && rw)
4408 		btrfs_dec_block_group_ro(rc->block_group);
4409 	iput(rc->data_inode);
4410 	btrfs_put_block_group(rc->block_group);
4411 	kfree(rc);
4412 	return err;
4413 }
4414 
4415 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4416 {
4417 	struct btrfs_fs_info *fs_info = root->fs_info;
4418 	struct btrfs_trans_handle *trans;
4419 	int ret, err;
4420 
4421 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4422 	if (IS_ERR(trans))
4423 		return PTR_ERR(trans);
4424 
4425 	memset(&root->root_item.drop_progress, 0,
4426 		sizeof(root->root_item.drop_progress));
4427 	root->root_item.drop_level = 0;
4428 	btrfs_set_root_refs(&root->root_item, 0);
4429 	ret = btrfs_update_root(trans, fs_info->tree_root,
4430 				&root->root_key, &root->root_item);
4431 
4432 	err = btrfs_end_transaction(trans);
4433 	if (err)
4434 		return err;
4435 	return ret;
4436 }
4437 
4438 /*
4439  * recover relocation interrupted by system crash.
4440  *
4441  * this function resumes merging reloc trees with corresponding fs trees.
4442  * this is important for keeping the sharing of tree blocks
4443  */
4444 int btrfs_recover_relocation(struct btrfs_root *root)
4445 {
4446 	struct btrfs_fs_info *fs_info = root->fs_info;
4447 	LIST_HEAD(reloc_roots);
4448 	struct btrfs_key key;
4449 	struct btrfs_root *fs_root;
4450 	struct btrfs_root *reloc_root;
4451 	struct btrfs_path *path;
4452 	struct extent_buffer *leaf;
4453 	struct reloc_control *rc = NULL;
4454 	struct btrfs_trans_handle *trans;
4455 	int ret;
4456 	int err = 0;
4457 
4458 	path = btrfs_alloc_path();
4459 	if (!path)
4460 		return -ENOMEM;
4461 	path->reada = READA_BACK;
4462 
4463 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4464 	key.type = BTRFS_ROOT_ITEM_KEY;
4465 	key.offset = (u64)-1;
4466 
4467 	while (1) {
4468 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4469 					path, 0, 0);
4470 		if (ret < 0) {
4471 			err = ret;
4472 			goto out;
4473 		}
4474 		if (ret > 0) {
4475 			if (path->slots[0] == 0)
4476 				break;
4477 			path->slots[0]--;
4478 		}
4479 		leaf = path->nodes[0];
4480 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4481 		btrfs_release_path(path);
4482 
4483 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4484 		    key.type != BTRFS_ROOT_ITEM_KEY)
4485 			break;
4486 
4487 		reloc_root = btrfs_read_fs_root(root, &key);
4488 		if (IS_ERR(reloc_root)) {
4489 			err = PTR_ERR(reloc_root);
4490 			goto out;
4491 		}
4492 
4493 		list_add(&reloc_root->root_list, &reloc_roots);
4494 
4495 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4496 			fs_root = read_fs_root(fs_info,
4497 					       reloc_root->root_key.offset);
4498 			if (IS_ERR(fs_root)) {
4499 				ret = PTR_ERR(fs_root);
4500 				if (ret != -ENOENT) {
4501 					err = ret;
4502 					goto out;
4503 				}
4504 				ret = mark_garbage_root(reloc_root);
4505 				if (ret < 0) {
4506 					err = ret;
4507 					goto out;
4508 				}
4509 			}
4510 		}
4511 
4512 		if (key.offset == 0)
4513 			break;
4514 
4515 		key.offset--;
4516 	}
4517 	btrfs_release_path(path);
4518 
4519 	if (list_empty(&reloc_roots))
4520 		goto out;
4521 
4522 	rc = alloc_reloc_control(fs_info);
4523 	if (!rc) {
4524 		err = -ENOMEM;
4525 		goto out;
4526 	}
4527 
4528 	rc->extent_root = fs_info->extent_root;
4529 
4530 	set_reloc_control(rc);
4531 
4532 	trans = btrfs_join_transaction(rc->extent_root);
4533 	if (IS_ERR(trans)) {
4534 		unset_reloc_control(rc);
4535 		err = PTR_ERR(trans);
4536 		goto out_free;
4537 	}
4538 
4539 	rc->merge_reloc_tree = 1;
4540 
4541 	while (!list_empty(&reloc_roots)) {
4542 		reloc_root = list_entry(reloc_roots.next,
4543 					struct btrfs_root, root_list);
4544 		list_del(&reloc_root->root_list);
4545 
4546 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4547 			list_add_tail(&reloc_root->root_list,
4548 				      &rc->reloc_roots);
4549 			continue;
4550 		}
4551 
4552 		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4553 		if (IS_ERR(fs_root)) {
4554 			err = PTR_ERR(fs_root);
4555 			goto out_free;
4556 		}
4557 
4558 		err = __add_reloc_root(reloc_root);
4559 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4560 		fs_root->reloc_root = reloc_root;
4561 	}
4562 
4563 	err = btrfs_commit_transaction(trans);
4564 	if (err)
4565 		goto out_free;
4566 
4567 	merge_reloc_roots(rc);
4568 
4569 	unset_reloc_control(rc);
4570 
4571 	trans = btrfs_join_transaction(rc->extent_root);
4572 	if (IS_ERR(trans)) {
4573 		err = PTR_ERR(trans);
4574 		goto out_free;
4575 	}
4576 	err = btrfs_commit_transaction(trans);
4577 
4578 	ret = clean_dirty_subvols(rc);
4579 	if (ret < 0 && !err)
4580 		err = ret;
4581 out_free:
4582 	kfree(rc);
4583 out:
4584 	if (!list_empty(&reloc_roots))
4585 		free_reloc_roots(&reloc_roots);
4586 
4587 	btrfs_free_path(path);
4588 
4589 	if (err == 0) {
4590 		/* cleanup orphan inode in data relocation tree */
4591 		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4592 		if (IS_ERR(fs_root))
4593 			err = PTR_ERR(fs_root);
4594 		else
4595 			err = btrfs_orphan_cleanup(fs_root);
4596 	}
4597 	return err;
4598 }
4599 
4600 /*
4601  * helper to add ordered checksum for data relocation.
4602  *
4603  * cloning checksum properly handles the nodatasum extents.
4604  * it also saves CPU time to re-calculate the checksum.
4605  */
4606 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4607 {
4608 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4609 	struct btrfs_ordered_sum *sums;
4610 	struct btrfs_ordered_extent *ordered;
4611 	int ret;
4612 	u64 disk_bytenr;
4613 	u64 new_bytenr;
4614 	LIST_HEAD(list);
4615 
4616 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4617 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4618 
4619 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4620 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4621 				       disk_bytenr + len - 1, &list, 0);
4622 	if (ret)
4623 		goto out;
4624 
4625 	while (!list_empty(&list)) {
4626 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4627 		list_del_init(&sums->list);
4628 
4629 		/*
4630 		 * We need to offset the new_bytenr based on where the csum is.
4631 		 * We need to do this because we will read in entire prealloc
4632 		 * extents but we may have written to say the middle of the
4633 		 * prealloc extent, so we need to make sure the csum goes with
4634 		 * the right disk offset.
4635 		 *
4636 		 * We can do this because the data reloc inode refers strictly
4637 		 * to the on disk bytes, so we don't have to worry about
4638 		 * disk_len vs real len like with real inodes since it's all
4639 		 * disk length.
4640 		 */
4641 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4642 		sums->bytenr = new_bytenr;
4643 
4644 		btrfs_add_ordered_sum(ordered, sums);
4645 	}
4646 out:
4647 	btrfs_put_ordered_extent(ordered);
4648 	return ret;
4649 }
4650 
4651 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4652 			  struct btrfs_root *root, struct extent_buffer *buf,
4653 			  struct extent_buffer *cow)
4654 {
4655 	struct btrfs_fs_info *fs_info = root->fs_info;
4656 	struct reloc_control *rc;
4657 	struct backref_node *node;
4658 	int first_cow = 0;
4659 	int level;
4660 	int ret = 0;
4661 
4662 	rc = fs_info->reloc_ctl;
4663 	if (!rc)
4664 		return 0;
4665 
4666 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4667 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4668 
4669 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4670 		if (buf == root->node)
4671 			__update_reloc_root(root, cow->start);
4672 	}
4673 
4674 	level = btrfs_header_level(buf);
4675 	if (btrfs_header_generation(buf) <=
4676 	    btrfs_root_last_snapshot(&root->root_item))
4677 		first_cow = 1;
4678 
4679 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4680 	    rc->create_reloc_tree) {
4681 		WARN_ON(!first_cow && level == 0);
4682 
4683 		node = rc->backref_cache.path[level];
4684 		BUG_ON(node->bytenr != buf->start &&
4685 		       node->new_bytenr != buf->start);
4686 
4687 		drop_node_buffer(node);
4688 		atomic_inc(&cow->refs);
4689 		node->eb = cow;
4690 		node->new_bytenr = cow->start;
4691 
4692 		if (!node->pending) {
4693 			list_move_tail(&node->list,
4694 				       &rc->backref_cache.pending[level]);
4695 			node->pending = 1;
4696 		}
4697 
4698 		if (first_cow)
4699 			__mark_block_processed(rc, node);
4700 
4701 		if (first_cow && level > 0)
4702 			rc->nodes_relocated += buf->len;
4703 	}
4704 
4705 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4706 		ret = replace_file_extents(trans, rc, root, cow);
4707 	return ret;
4708 }
4709 
4710 /*
4711  * called before creating snapshot. it calculates metadata reservation
4712  * required for relocating tree blocks in the snapshot
4713  */
4714 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4715 			      u64 *bytes_to_reserve)
4716 {
4717 	struct btrfs_root *root = pending->root;
4718 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4719 
4720 	if (!root->reloc_root || !rc)
4721 		return;
4722 
4723 	if (!rc->merge_reloc_tree)
4724 		return;
4725 
4726 	root = root->reloc_root;
4727 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4728 	/*
4729 	 * relocation is in the stage of merging trees. the space
4730 	 * used by merging a reloc tree is twice the size of
4731 	 * relocated tree nodes in the worst case. half for cowing
4732 	 * the reloc tree, half for cowing the fs tree. the space
4733 	 * used by cowing the reloc tree will be freed after the
4734 	 * tree is dropped. if we create snapshot, cowing the fs
4735 	 * tree may use more space than it frees. so we need
4736 	 * reserve extra space.
4737 	 */
4738 	*bytes_to_reserve += rc->nodes_relocated;
4739 }
4740 
4741 /*
4742  * called after snapshot is created. migrate block reservation
4743  * and create reloc root for the newly created snapshot
4744  */
4745 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4746 			       struct btrfs_pending_snapshot *pending)
4747 {
4748 	struct btrfs_root *root = pending->root;
4749 	struct btrfs_root *reloc_root;
4750 	struct btrfs_root *new_root;
4751 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4752 	int ret;
4753 
4754 	if (!root->reloc_root || !rc)
4755 		return 0;
4756 
4757 	rc = root->fs_info->reloc_ctl;
4758 	rc->merging_rsv_size += rc->nodes_relocated;
4759 
4760 	if (rc->merge_reloc_tree) {
4761 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4762 					      rc->block_rsv,
4763 					      rc->nodes_relocated, true);
4764 		if (ret)
4765 			return ret;
4766 	}
4767 
4768 	new_root = pending->snap;
4769 	reloc_root = create_reloc_root(trans, root->reloc_root,
4770 				       new_root->root_key.objectid);
4771 	if (IS_ERR(reloc_root))
4772 		return PTR_ERR(reloc_root);
4773 
4774 	ret = __add_reloc_root(reloc_root);
4775 	BUG_ON(ret < 0);
4776 	new_root->reloc_root = reloc_root;
4777 
4778 	if (rc->create_reloc_tree)
4779 		ret = clone_backref_node(trans, rc, root, reloc_root);
4780 	return ret;
4781 }
4782