xref: /openbmc/linux/fs/btrfs/relocation.c (revision b1818dab9bda1da8f3ea5a13230b5d91ae964f00)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include <linux/error-injection.h>
13 #include "ctree.h"
14 #include "disk-io.h"
15 #include "transaction.h"
16 #include "volumes.h"
17 #include "locking.h"
18 #include "btrfs_inode.h"
19 #include "async-thread.h"
20 #include "free-space-cache.h"
21 #include "inode-map.h"
22 #include "qgroup.h"
23 #include "print-tree.h"
24 #include "delalloc-space.h"
25 #include "block-group.h"
26 #include "backref.h"
27 #include "misc.h"
28 
29 /*
30  * Relocation overview
31  *
32  * [What does relocation do]
33  *
34  * The objective of relocation is to relocate all extents of the target block
35  * group to other block groups.
36  * This is used by resize (shrink only), profile conversion, space
37  * compaction, and the balance routine to spread chunks over devices.
38  *
39  * 		Before		|		After
40  * ------------------------------------------------------------------
41  *  BG A: 10 data extents	| BG A: deleted
42  *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
43  *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
44  *
45  * [How does relocation work]
46  *
47  * 1.   Mark the target block group read-only
48  *      New extents won't be allocated from the target block group.
49  *
50  * 2.1  Record each extent in the target block group
51  *      To build a proper map of extents to be relocated.
52  *
53  * 2.2  Build data reloc tree and reloc trees
54  *      Data reloc tree will contain an inode, recording all newly relocated
55  *      data extents.
56  *      There will be only one data reloc tree for one data block group.
57  *
58  *      Reloc tree will be a special snapshot of its source tree, containing
59  *      relocated tree blocks.
60  *      Each tree referring to a tree block in the target block group will get
61  *      its reloc tree built.
62  *
63  * 2.3  Swap source tree with its corresponding reloc tree
64  *      Each involved tree only refers to new extents after swap.
65  *
66  * 3.   Cleanup reloc trees and data reloc tree.
67  *      As old extents in the target block group are still referenced by reloc
68  *      trees, we need to clean them up before really freeing the target block
69  *      group.
70  *
71  * The main complexity is in steps 2.2 and 2.3.
72  *
73  * The entry point of relocation is relocate_block_group() function.
74  */
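
/*
 * As a rough illustration of the three steps above, here is a toy
 * user-space model (none of these types or helpers exist in btrfs;
 * real relocation works on extents and reloc trees, not counters):
 *
 *	struct toy_bg { int ro; int nr_extents; };
 *
 *	static void toy_relocate(struct toy_bg *target,
 *				 struct toy_bg *others, int nr_others)
 *	{
 *		int i = 0;
 *
 *		target->ro = 1;				// step 1
 *		while (target->nr_extents) {		// step 2
 *			others[i++ % nr_others].nr_extents++;
 *			target->nr_extents--;
 *		}
 *		// step 3: nothing references the target any more,
 *		// so the block group itself can be deleted
 *	}
 */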
75 
76 #define RELOCATION_RESERVED_NODES	256
77 /*
78  * map the address of a tree root to the corresponding tree
79  */
80 struct mapping_node {
81 	struct {
82 		struct rb_node rb_node;
83 		u64 bytenr;
84 	}; /* Use rb_simple_node for search/insert */
85 	void *data;
86 };
87 
88 struct mapping_tree {
89 	struct rb_root rb_root;
90 	spinlock_t lock;
91 };
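
/*
 * The tree is keyed by bytenr through the rb_simple_*() helpers from
 * misc.h: insertion returns NULL on success and the already-present
 * rb_node on a key collision, which callers treat as a fatal cache
 * inconsistency.  A rough user-space equivalent of that contract over
 * a flat array (toy types, illustration only):
 *
 *	struct toy_map { unsigned long long key[16]; void *data[16]; int nr; };
 *
 *	static void *toy_map_insert(struct toy_map *m,
 *				    unsigned long long key, void *data)
 *	{
 *		int i;
 *
 *		for (i = 0; i < m->nr; i++)
 *			if (m->key[i] == key)
 *				return m->data[i];	// collision (-EEXIST)
 *		m->key[m->nr] = key;
 *		m->data[m->nr++] = data;
 *		return NULL;				// inserted
 *	}
 */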
92 
93 /*
94  * represents a tree block to process
95  */
96 struct tree_block {
97 	struct {
98 		struct rb_node rb_node;
99 		u64 bytenr;
100 	}; /* Use rb_simple_node for search/insert */
101 	struct btrfs_key key;
102 	unsigned int level:8;
103 	unsigned int key_ready:1;
104 };
105 
106 #define MAX_EXTENTS 128
107 
108 struct file_extent_cluster {
109 	u64 start;
110 	u64 end;
111 	u64 boundary[MAX_EXTENTS];
112 	unsigned int nr;
113 };
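
/*
 * A cluster accumulates adjacent data extents so they can be read and
 * written back in one pass.  A hedged sketch of the accumulation rule
 * (toy_flush() is hypothetical; the real flushing is done by the data
 * relocation code):
 *
 *	static void toy_add_extent(struct file_extent_cluster *cluster,
 *				   u64 start, u64 end)
 *	{
 *		if (cluster->nr == MAX_EXTENTS ||
 *		    (cluster->nr && cluster->end + 1 != start)) {
 *			toy_flush(cluster);	// relocate what we have
 *			cluster->nr = 0;
 *		}
 *		if (!cluster->nr)
 *			cluster->start = start;
 *		cluster->boundary[cluster->nr++] = start;
 *		cluster->end = end;
 *	}
 */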
114 
115 struct reloc_control {
116 	/* block group to relocate */
117 	struct btrfs_block_group *block_group;
118 	/* extent tree */
119 	struct btrfs_root *extent_root;
120 	/* inode for moving data */
121 	struct inode *data_inode;
122 
123 	struct btrfs_block_rsv *block_rsv;
124 
125 	struct btrfs_backref_cache backref_cache;
126 
127 	struct file_extent_cluster cluster;
128 	/* tree blocks have been processed */
129 	struct extent_io_tree processed_blocks;
130 	/* map start of tree root to corresponding reloc tree */
131 	struct mapping_tree reloc_root_tree;
132 	/* list of reloc trees */
133 	struct list_head reloc_roots;
134 	/* list of subvolume trees that get relocated */
135 	struct list_head dirty_subvol_roots;
136 	/* size of metadata reservation for merging reloc trees */
137 	u64 merging_rsv_size;
138 	/* size of relocated tree nodes */
139 	u64 nodes_relocated;
140 	/* reserved size for block group relocation */
141 	u64 reserved_bytes;
142 
143 	u64 search_start;
144 	u64 extents_found;
145 
146 	unsigned int stage:8;
147 	unsigned int create_reloc_tree:1;
148 	unsigned int merge_reloc_tree:1;
149 	unsigned int found_file_extent:1;
150 };
151 
152 /* stages of data relocation */
153 #define MOVE_DATA_EXTENTS	0
154 #define UPDATE_DATA_PTRS	1
155 
156 static void remove_backref_node(struct btrfs_backref_cache *cache,
157 				struct btrfs_backref_node *node);
158 
159 static void mark_block_processed(struct reloc_control *rc,
160 				 struct btrfs_backref_node *node)
161 {
162 	u32 blocksize;
163 
164 	if (node->level == 0 ||
165 	    in_range(node->bytenr, rc->block_group->start,
166 		     rc->block_group->length)) {
167 		blocksize = rc->extent_root->fs_info->nodesize;
168 		set_extent_bits(&rc->processed_blocks, node->bytenr,
169 				node->bytenr + blocksize - 1, EXTENT_DIRTY);
170 	}
171 	node->processed = 1;
172 }
173 
174 
175 static void mapping_tree_init(struct mapping_tree *tree)
176 {
177 	tree->rb_root = RB_ROOT;
178 	spin_lock_init(&tree->lock);
179 }
180 
181 static void backref_cache_cleanup(struct btrfs_backref_cache *cache)
182 {
183 	struct btrfs_backref_node *node;
184 	int i;
185 
186 	while (!list_empty(&cache->detached)) {
187 		node = list_entry(cache->detached.next,
188 				  struct btrfs_backref_node, list);
189 		remove_backref_node(cache, node);
190 	}
191 
192 	while (!list_empty(&cache->leaves)) {
193 		node = list_entry(cache->leaves.next,
194 				  struct btrfs_backref_node, lower);
195 		remove_backref_node(cache, node);
196 	}
197 
198 	cache->last_trans = 0;
199 
200 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
201 		ASSERT(list_empty(&cache->pending[i]));
202 	ASSERT(list_empty(&cache->pending_edge));
203 	ASSERT(list_empty(&cache->useless_node));
204 	ASSERT(list_empty(&cache->changed));
205 	ASSERT(list_empty(&cache->detached));
206 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
207 	ASSERT(!cache->nr_nodes);
208 	ASSERT(!cache->nr_edges);
209 }
210 
211 static void free_backref_node(struct btrfs_backref_cache *cache,
212 			      struct btrfs_backref_node *node)
213 {
214 	if (node) {
215 		cache->nr_nodes--;
216 		btrfs_put_root(node->root);
217 		kfree(node);
218 	}
219 }
220 
221 static struct btrfs_backref_edge *alloc_backref_edge(
222 		struct btrfs_backref_cache *cache)
223 {
224 	struct btrfs_backref_edge *edge;
225 
226 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
227 	if (edge)
228 		cache->nr_edges++;
229 	return edge;
230 }
231 
232 #define		LINK_LOWER	(1 << 0)
233 #define		LINK_UPPER	(1 << 1)
234 static void link_backref_edge(struct btrfs_backref_edge *edge,
235 			      struct btrfs_backref_node *lower,
236 			      struct btrfs_backref_node *upper,
237 			      int link_which)
238 {
239 	ASSERT(upper && lower && upper->level == lower->level + 1);
240 	edge->node[LOWER] = lower;
241 	edge->node[UPPER] = upper;
242 	if (link_which & LINK_LOWER)
243 		list_add_tail(&edge->list[LOWER], &lower->upper);
244 	if (link_which & LINK_UPPER)
245 		list_add_tail(&edge->list[UPPER], &upper->lower);
246 }
247 
248 static void free_backref_edge(struct btrfs_backref_cache *cache,
249 			      struct btrfs_backref_edge *edge)
250 {
251 	if (edge) {
252 		cache->nr_edges--;
253 		kfree(edge);
254 	}
255 }
256 
257 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
258 {
259 
260 	struct btrfs_fs_info *fs_info = NULL;
261 	struct btrfs_backref_node *bnode = rb_entry(rb_node,
262 			struct btrfs_backref_node, rb_node);
263 	if (bnode->root)
264 		fs_info = bnode->root->fs_info;
265 	btrfs_panic(fs_info, errno,
266 		    "Inconsistency in backref cache found at offset %llu",
267 		    bytenr);
268 }
269 
270 /*
271  * walk up backref nodes until we reach the node that represents the tree root
272  */
273 static struct btrfs_backref_node *walk_up_backref(
274 		struct btrfs_backref_node *node,
275 		struct btrfs_backref_edge *edges[], int *index)
276 {
277 	struct btrfs_backref_edge *edge;
278 	int idx = *index;
279 
280 	while (!list_empty(&node->upper)) {
281 		edge = list_entry(node->upper.next,
282 				  struct btrfs_backref_edge, list[LOWER]);
283 		edges[idx++] = edge;
284 		node = edge->node[UPPER];
285 	}
286 	BUG_ON(node->detached);
287 	*index = idx;
288 	return node;
289 }
290 
291 /*
292  * walk down backref nodes to find start of next reference path
293  */
294 static struct btrfs_backref_node *walk_down_backref(
295 		struct btrfs_backref_edge *edges[], int *index)
296 {
297 	struct btrfs_backref_edge *edge;
298 	struct btrfs_backref_node *lower;
299 	int idx = *index;
300 
301 	while (idx > 0) {
302 		edge = edges[idx - 1];
303 		lower = edge->node[LOWER];
304 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
305 			idx--;
306 			continue;
307 		}
308 		edge = list_entry(edge->list[LOWER].next,
309 				  struct btrfs_backref_edge, list[LOWER]);
310 		edges[idx - 1] = edge;
311 		*index = idx;
312 		return edge->node[UPPER];
313 	}
314 	*index = 0;
315 	return NULL;
316 }
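
/*
 * Together these two helpers enumerate every path from a node up to
 * the tree roots that reference it, with edges[] acting as an explicit
 * DFS stack.  A typical driver loop looks roughly like this
 * (process_root() is hypothetical):
 *
 *	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	struct btrfs_backref_node *next;
 *	int index = 0;
 *
 *	next = walk_up_backref(node, edges, &index);
 *	while (next) {
 *		process_root(next->root);
 *		next = walk_down_backref(edges, &index);
 *		if (next)
 *			next = walk_up_backref(next, edges, &index);
 *	}
 */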
317 
318 static void unlock_node_buffer(struct btrfs_backref_node *node)
319 {
320 	if (node->locked) {
321 		btrfs_tree_unlock(node->eb);
322 		node->locked = 0;
323 	}
324 }
325 
326 static void drop_node_buffer(struct btrfs_backref_node *node)
327 {
328 	if (node->eb) {
329 		unlock_node_buffer(node);
330 		free_extent_buffer(node->eb);
331 		node->eb = NULL;
332 	}
333 }
334 
335 static void drop_backref_node(struct btrfs_backref_cache *tree,
336 			      struct btrfs_backref_node *node)
337 {
338 	BUG_ON(!list_empty(&node->upper));
339 
340 	drop_node_buffer(node);
341 	list_del(&node->list);
342 	list_del(&node->lower);
343 	if (!RB_EMPTY_NODE(&node->rb_node))
344 		rb_erase(&node->rb_node, &tree->rb_root);
345 	free_backref_node(tree, node);
346 }
347 
348 /*
349  * remove a backref node from the backref cache
350  */
351 static void remove_backref_node(struct btrfs_backref_cache *cache,
352 				struct btrfs_backref_node *node)
353 {
354 	struct btrfs_backref_node *upper;
355 	struct btrfs_backref_edge *edge;
356 
357 	if (!node)
358 		return;
359 
360 	BUG_ON(!node->lowest && !node->detached);
361 	while (!list_empty(&node->upper)) {
362 		edge = list_entry(node->upper.next, struct btrfs_backref_edge,
363 				  list[LOWER]);
364 		upper = edge->node[UPPER];
365 		list_del(&edge->list[LOWER]);
366 		list_del(&edge->list[UPPER]);
367 		free_backref_edge(cache, edge);
368 
369 		if (RB_EMPTY_NODE(&upper->rb_node)) {
370 			BUG_ON(!list_empty(&node->upper));
371 			drop_backref_node(cache, node);
372 			node = upper;
373 			node->lowest = 1;
374 			continue;
375 		}
376 		/*
377 		 * add the node to the leaf node list if no other
378 		 * child block is cached.
379 		 */
380 		if (list_empty(&upper->lower)) {
381 			list_add_tail(&upper->lower, &cache->leaves);
382 			upper->lowest = 1;
383 		}
384 	}
385 
386 	drop_backref_node(cache, node);
387 }
388 
389 static void update_backref_node(struct btrfs_backref_cache *cache,
390 				struct btrfs_backref_node *node, u64 bytenr)
391 {
392 	struct rb_node *rb_node;
393 	rb_erase(&node->rb_node, &cache->rb_root);
394 	node->bytenr = bytenr;
395 	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
396 	if (rb_node)
397 		backref_tree_panic(rb_node, -EEXIST, bytenr);
398 }
399 
400 /*
401  * update backref cache after a transaction commit
402  */
403 static int update_backref_cache(struct btrfs_trans_handle *trans,
404 				struct btrfs_backref_cache *cache)
405 {
406 	struct btrfs_backref_node *node;
407 	int level = 0;
408 
409 	if (cache->last_trans == 0) {
410 		cache->last_trans = trans->transid;
411 		return 0;
412 	}
413 
414 	if (cache->last_trans == trans->transid)
415 		return 0;
416 
417 	/*
418 	 * Detached nodes are used to avoid unnecessary backref
419 	 * lookups.  A transaction commit changes the extent tree,
420 	 * so the detached nodes are no longer useful.
421 	 */
422 	while (!list_empty(&cache->detached)) {
423 		node = list_entry(cache->detached.next,
424 				  struct btrfs_backref_node, list);
425 		remove_backref_node(cache, node);
426 	}
427 
428 	while (!list_empty(&cache->changed)) {
429 		node = list_entry(cache->changed.next,
430 				  struct btrfs_backref_node, list);
431 		list_del_init(&node->list);
432 		BUG_ON(node->pending);
433 		update_backref_node(cache, node, node->new_bytenr);
434 	}
435 
436 	/*
437 	 * Some nodes can be left in the pending list if there were
438 	 * errors while processing the pending nodes.
439 	 */
440 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
441 		list_for_each_entry(node, &cache->pending[level], list) {
442 			BUG_ON(!node->pending);
443 			if (node->bytenr == node->new_bytenr)
444 				continue;
445 			update_backref_node(cache, node, node->new_bytenr);
446 		}
447 	}
448 
449 	cache->last_trans = 0;
450 	return 1;
451 }
452 
453 static bool reloc_root_is_dead(struct btrfs_root *root)
454 {
455 	/*
456 	 * Pairs with the set_bit/clear_bit in clean_dirty_subvols and
457 	 * btrfs_update_reloc_root.  We need to see the updated bit before
458 	 * trying to access reloc_root.
459 	 */
460 	smp_rmb();
461 	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
462 		return true;
463 	return false;
464 }
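
/*
 * The ordering requirement, restated with C11 atomics as a hedged
 * illustration (seq_cst is stronger than the kernel barriers actually
 * used here, but shows the intent; tear_down() is hypothetical):
 *
 *	// writer, btrfs_update_reloc_root():
 *	atomic_store(&dead, 1);		// set_bit() + smp_wmb()
 *	tear_down(reloc_root);
 *
 *	// reader, reloc_root_is_dead():
 *	if (atomic_load(&dead))		// smp_rmb() + test_bit()
 *		return true;		// never dereference reloc_root
 */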
465 
466 /*
467  * Check if this subvolume tree has a valid reloc tree.
468  *
469  * A reloc tree after swap is considered dead, thus not considered valid.
470  * This is enough for most callers, as they don't distinguish a dead reloc
471  * root from no reloc root.  But should_ignore_root() below is a special case.
472  */
473 static bool have_reloc_root(struct btrfs_root *root)
474 {
475 	if (reloc_root_is_dead(root))
476 		return false;
477 	if (!root->reloc_root)
478 		return false;
479 	return true;
480 }
481 
482 static int should_ignore_root(struct btrfs_root *root)
483 {
484 	struct btrfs_root *reloc_root;
485 
486 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
487 		return 0;
488 
489 	/* This root has been merged with its reloc tree, we can ignore it */
490 	if (reloc_root_is_dead(root))
491 		return 1;
492 
493 	reloc_root = root->reloc_root;
494 	if (!reloc_root)
495 		return 0;
496 
497 	if (btrfs_header_generation(reloc_root->commit_root) ==
498 	    root->fs_info->running_transaction->transid)
499 		return 0;
500 	/*
501 	 * If there is a reloc tree and it was created in a previous
502 	 * transaction, backref lookup can find the reloc tree, so
503 	 * the backref node for the fs tree root is useless for
504 	 * relocation.
505 	 */
506 	return 1;
507 }
508 /*
509  * find reloc tree by address of tree root
510  */
511 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
512 {
513 	struct reloc_control *rc = fs_info->reloc_ctl;
514 	struct rb_node *rb_node;
515 	struct mapping_node *node;
516 	struct btrfs_root *root = NULL;
517 
518 	ASSERT(rc);
519 	spin_lock(&rc->reloc_root_tree.lock);
520 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
521 	if (rb_node) {
522 		node = rb_entry(rb_node, struct mapping_node, rb_node);
523 		root = (struct btrfs_root *)node->data;
524 	}
525 	spin_unlock(&rc->reloc_root_tree.lock);
526 	return btrfs_grab_root(root);
527 }
528 
529 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
530 					u64 root_objectid)
531 {
532 	struct btrfs_key key;
533 
534 	key.objectid = root_objectid;
535 	key.type = BTRFS_ROOT_ITEM_KEY;
536 	key.offset = (u64)-1;
537 
538 	return btrfs_get_fs_root(fs_info, &key, false);
539 }
540 
541 /*
542  * Handle direct tree backref
543  *
544  * Direct tree backref means the backref item shows its parent bytenr
545  * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
546  *
547  * @ref_key:	The converted backref key.
548  *		For keyed backref, it's the item key.
549  *		For inlined backref, objectid is the bytenr,
550  *		type is btrfs_inline_ref_type, offset is
551  *		btrfs_inline_ref_offset.
552  */
553 static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
554 				      struct btrfs_key *ref_key,
555 				      struct btrfs_backref_node *cur)
556 {
557 	struct btrfs_backref_edge *edge;
558 	struct btrfs_backref_node *upper;
559 	struct rb_node *rb_node;
560 
561 	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
562 
563 	/* Only reloc root uses backref pointing to itself */
564 	if (ref_key->objectid == ref_key->offset) {
565 		struct btrfs_root *root;
566 
567 		cur->is_reloc_root = 1;
568 		/* Only reloc backref cache cares about a specific root */
569 		if (cache->is_reloc) {
570 			root = find_reloc_root(cache->fs_info, cur->bytenr);
571 			if (WARN_ON(!root))
572 				return -ENOENT;
573 			cur->root = root;
574 		} else {
575 			/*
576 			 * For a generic-purpose backref cache, the reloc
577 			 * root node is useless.
578 			 */
579 			list_add(&cur->list, &cache->useless_node);
580 		}
581 		return 0;
582 	}
583 
584 	edge = alloc_backref_edge(cache);
585 	if (!edge)
586 		return -ENOMEM;
587 
588 	rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
589 	if (!rb_node) {
590 		/* Parent node not yet cached */
591 		upper = btrfs_backref_alloc_node(cache, ref_key->offset,
592 					   cur->level + 1);
593 		if (!upper) {
594 			free_backref_edge(cache, edge);
595 			return -ENOMEM;
596 		}
597 
598 		/*
599 		 * Backrefs for the upper level block aren't cached, add the
600 		 * block to the pending list
601 		 */
602 		list_add_tail(&edge->list[UPPER], &cache->pending_edge);
603 	} else {
604 		/* Parent node already cached */
605 		upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
606 		ASSERT(upper->checked);
607 		INIT_LIST_HEAD(&edge->list[UPPER]);
608 	}
609 	link_backref_edge(edge, cur, upper, LINK_LOWER);
610 	return 0;
611 }
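
/*
 * The ref_key convention above with invented numbers: for a tree block
 * at bytenr 0x1000 whose parent node sits at bytenr 0x9000, the
 * converted key is
 *
 *	key.objectid = 0x1000;	// the child block
 *	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 *	key.offset = 0x9000;	// the parent block
 *
 * The reloc root special case is then simply objectid == offset: the
 * root node's backref points back at itself.
 */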
612 
613 /*
614  * Handle indirect tree backref
615  *
616  * Indirect tree backref means we only know which tree the node belongs to.
617  * We still need to do a tree search to find out the parents. This is for
618  * TREE_BLOCK_REF backref (keyed or inlined).
619  *
620  * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
621  * @tree_key:	The first key of this tree block.
622  * @path:	A clean (released) path, to avoid allocating a path every time
623  *		the function gets called.
624  */
625 static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
626 					struct btrfs_path *path,
627 					struct btrfs_key *ref_key,
628 					struct btrfs_key *tree_key,
629 					struct btrfs_backref_node *cur)
630 {
631 	struct btrfs_fs_info *fs_info = cache->fs_info;
632 	struct btrfs_backref_node *upper;
633 	struct btrfs_backref_node *lower;
634 	struct btrfs_backref_edge *edge;
635 	struct extent_buffer *eb;
636 	struct btrfs_root *root;
637 	struct rb_node *rb_node;
638 	int level;
639 	bool need_check = true;
640 	int ret;
641 
642 	root = read_fs_root(fs_info, ref_key->offset);
643 	if (IS_ERR(root))
644 		return PTR_ERR(root);
645 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
646 		cur->cowonly = 1;
647 
648 	if (btrfs_root_level(&root->root_item) == cur->level) {
649 		/* Tree root */
650 		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
651 		if (should_ignore_root(root)) {
652 			btrfs_put_root(root);
653 			list_add(&cur->list, &cache->useless_node);
654 		} else {
655 			cur->root = root;
656 		}
657 		return 0;
658 	}
659 
660 	level = cur->level + 1;
661 
662 	/* Search the tree to find parent blocks referring to the block */
663 	path->search_commit_root = 1;
664 	path->skip_locking = 1;
665 	path->lowest_level = level;
666 	ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
667 	path->lowest_level = 0;
668 	if (ret < 0) {
669 		btrfs_put_root(root);
670 		return ret;
671 	}
672 	if (ret > 0 && path->slots[level] > 0)
673 		path->slots[level]--;
674 
675 	eb = path->nodes[level];
676 	if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
677 		btrfs_err(fs_info,
678 "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
679 			  cur->bytenr, level - 1, root->root_key.objectid,
680 			  tree_key->objectid, tree_key->type, tree_key->offset);
681 		btrfs_put_root(root);
682 		ret = -ENOENT;
683 		goto out;
684 	}
685 	lower = cur;
686 
687 	/* Add all nodes and edges in the path */
688 	for (; level < BTRFS_MAX_LEVEL; level++) {
689 		if (!path->nodes[level]) {
690 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
691 			       lower->bytenr);
692 			if (should_ignore_root(root)) {
693 				btrfs_put_root(root);
694 				list_add(&lower->list, &cache->useless_node);
695 			} else {
696 				lower->root = root;
697 			}
698 			break;
699 		}
700 
701 		edge = alloc_backref_edge(cache);
702 		if (!edge) {
703 			btrfs_put_root(root);
704 			ret = -ENOMEM;
705 			goto out;
706 		}
707 
708 		eb = path->nodes[level];
709 		rb_node = rb_simple_search(&cache->rb_root, eb->start);
710 		if (!rb_node) {
711 			upper = btrfs_backref_alloc_node(cache, eb->start,
712 							 lower->level + 1);
713 			if (!upper) {
714 				btrfs_put_root(root);
715 				free_backref_edge(cache, edge);
716 				ret = -ENOMEM;
717 				goto out;
718 			}
719 			upper->owner = btrfs_header_owner(eb);
720 			if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
721 				upper->cowonly = 1;
722 
723 			/*
724 			 * If we know the block isn't shared we can avoid
725 			 * checking its backrefs.
726 			 */
727 			if (btrfs_block_can_be_shared(root, eb))
728 				upper->checked = 0;
729 			else
730 				upper->checked = 1;
731 
732 			/*
733 			 * Add the block to pending list if we need to check its
734 			 * backrefs, we only do this once while walking up a
735 			 * tree as we will catch anything else later on.
736 			 */
737 			if (!upper->checked && need_check) {
738 				need_check = false;
739 				list_add_tail(&edge->list[UPPER],
740 					      &cache->pending_edge);
741 			} else {
742 				if (upper->checked)
743 					need_check = true;
744 				INIT_LIST_HEAD(&edge->list[UPPER]);
745 			}
746 		} else {
747 			upper = rb_entry(rb_node, struct btrfs_backref_node,
748 					 rb_node);
749 			ASSERT(upper->checked);
750 			INIT_LIST_HEAD(&edge->list[UPPER]);
751 			if (!upper->owner)
752 				upper->owner = btrfs_header_owner(eb);
753 		}
754 		link_backref_edge(edge, lower, upper, LINK_LOWER);
755 
756 		if (rb_node) {
757 			btrfs_put_root(root);
758 			break;
759 		}
760 		lower = upper;
761 		upper = NULL;
762 	}
763 out:
764 	btrfs_release_path(path);
765 	return ret;
766 }
767 
768 static int handle_one_tree_block(struct btrfs_backref_cache *cache,
769 				 struct btrfs_path *path,
770 				 struct btrfs_backref_iter *iter,
771 				 struct btrfs_key *node_key,
772 				 struct btrfs_backref_node *cur)
773 {
774 	struct btrfs_fs_info *fs_info = cache->fs_info;
775 	struct btrfs_backref_edge *edge;
776 	struct btrfs_backref_node *exist;
777 	int ret;
778 
779 	ret = btrfs_backref_iter_start(iter, cur->bytenr);
780 	if (ret < 0)
781 		return ret;
782 	/*
783 	 * We skip the first btrfs_tree_block_info, as we don't use the key
784 	 * stored in it, but fetch it from the tree block
785 	 */
786 	if (btrfs_backref_has_tree_block_info(iter)) {
787 		ret = btrfs_backref_iter_next(iter);
788 		if (ret < 0)
789 			goto out;
790 		/* No extra backref? This means the tree block is corrupted */
791 		if (ret > 0) {
792 			ret = -EUCLEAN;
793 			goto out;
794 		}
795 	}
796 	WARN_ON(cur->checked);
797 	if (!list_empty(&cur->upper)) {
798 		/*
799 		 * The backref was added previously when processing a
800 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
801 		 */
802 		ASSERT(list_is_singular(&cur->upper));
803 		edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
804 				  list[LOWER]);
805 		ASSERT(list_empty(&edge->list[UPPER]));
806 		exist = edge->node[UPPER];
807 		/*
808 		 * add the upper level block to the pending list if we need
809 		 * to check its backrefs
810 		 */
811 		if (!exist->checked)
812 			list_add_tail(&edge->list[UPPER], &cache->pending_edge);
813 	} else {
814 		exist = NULL;
815 	}
816 
817 	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
818 		struct extent_buffer *eb;
819 		struct btrfs_key key;
820 		int type;
821 
822 		cond_resched();
823 		eb = btrfs_backref_get_eb(iter);
824 
825 		key.objectid = iter->bytenr;
826 		if (btrfs_backref_iter_is_inline_ref(iter)) {
827 			struct btrfs_extent_inline_ref *iref;
828 
829 			/* update key for inline back ref */
830 			iref = (struct btrfs_extent_inline_ref *)
831 				((unsigned long)iter->cur_ptr);
832 			type = btrfs_get_extent_inline_ref_type(eb, iref,
833 							BTRFS_REF_TYPE_BLOCK);
834 			if (type == BTRFS_REF_TYPE_INVALID) {
835 				ret = -EUCLEAN;
836 				goto out;
837 			}
838 			key.type = type;
839 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
840 		} else {
841 			key.type = iter->cur_key.type;
842 			key.offset = iter->cur_key.offset;
843 		}
844 
845 		/*
846 		 * Parent node found and matches current inline ref, no need to
847 		 * rebuild this node for this inline ref.
848 		 */
849 		if (exist &&
850 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
851 		      exist->owner == key.offset) ||
852 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
853 		      exist->bytenr == key.offset))) {
854 			exist = NULL;
855 			continue;
856 		}
857 
858 		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
859 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
860 			ret = handle_direct_tree_backref(cache, &key, cur);
861 			if (ret < 0)
862 				goto out;
863 			continue;
864 		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
865 			ret = -EINVAL;
866 			btrfs_print_v0_err(fs_info);
867 			btrfs_handle_fs_error(fs_info, ret, NULL);
868 			goto out;
869 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
870 			continue;
871 		}
872 
873 		/*
874 		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
875 		 * means the root objectid. We need to search the tree to get
876 		 * its parent bytenr.
877 		 */
878 		ret = handle_indirect_tree_backref(cache, path, &key, node_key,
879 						   cur);
880 		if (ret < 0)
881 			goto out;
882 	}
883 	ret = 0;
884 	cur->checked = 1;
885 	WARN_ON(exist);
886 out:
887 	btrfs_backref_iter_release(iter);
888 	return ret;
889 }
890 
891 /*
892  * In handle_one_tree_block(), we have only linked the lower node to the edge,
893  * but the upper node hasn't been linked to the edge.
894  * This means we can only iterate through btrfs_backref_node::upper to reach
895  * parent edges, but not through btrfs_backref_node::lower to reach children
896  * edges.
897  *
898  * This function will finish linking btrfs_backref_node::lower to the related
899  * edges, so that the backref cache can be iterated bi-directionally.
900  *
901  * Also, this will add the nodes to backref cache for the next run.
902  */
903 static int finish_upper_links(struct btrfs_backref_cache *cache,
904 			      struct btrfs_backref_node *start)
905 {
906 	struct list_head *useless_node = &cache->useless_node;
907 	struct btrfs_backref_edge *edge;
908 	struct rb_node *rb_node;
909 	LIST_HEAD(pending_edge);
910 
911 	ASSERT(start->checked);
912 
913 	/* Insert this node to cache if it's not COW-only */
914 	if (!start->cowonly) {
915 		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
916 					   &start->rb_node);
917 		if (rb_node)
918 			backref_tree_panic(rb_node, -EEXIST, start->bytenr);
919 		list_add_tail(&start->lower, &cache->leaves);
920 	}
921 
922 	/*
923 	 * Use breadth first search to iterate all related edges.
924 	 *
925 	 * The starting points are all the edges of this node
926 	 */
927 	list_for_each_entry(edge, &start->upper, list[LOWER])
928 		list_add_tail(&edge->list[UPPER], &pending_edge);
929 
930 	while (!list_empty(&pending_edge)) {
931 		struct btrfs_backref_node *upper;
932 		struct btrfs_backref_node *lower;
933 		struct rb_node *rb_node;
934 
935 		edge = list_first_entry(&pending_edge,
936 				struct btrfs_backref_edge, list[UPPER]);
937 		list_del_init(&edge->list[UPPER]);
938 		upper = edge->node[UPPER];
939 		lower = edge->node[LOWER];
940 
941 		/* Parent is detached, no need to keep any edges */
942 		if (upper->detached) {
943 			list_del(&edge->list[LOWER]);
944 			free_backref_edge(cache, edge);
945 
946 			/* Lower node is orphan, queue for cleanup */
947 			if (list_empty(&lower->upper))
948 				list_add(&lower->list, useless_node);
949 			continue;
950 		}
951 
952 		/*
953 		 * All new nodes added in current build_backref_tree() haven't
954 		 * been linked to the cache rb tree.
955 		 * So if we have upper->rb_node populated, this means a cache
956 		 * hit. We only need to link the edge, as @upper and all its
957 		 * parent have already been linked.
958 		 */
959 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
960 			if (upper->lowest) {
961 				list_del_init(&upper->lower);
962 				upper->lowest = 0;
963 			}
964 
965 			list_add_tail(&edge->list[UPPER], &upper->lower);
966 			continue;
967 		}
968 
969 		/* Sanity check, we shouldn't have any unchecked nodes */
970 		if (!upper->checked) {
971 			ASSERT(0);
972 			return -EUCLEAN;
973 		}
974 
975 		/* Sanity check, COW-only node has non-COW-only parent */
976 		if (start->cowonly != upper->cowonly) {
977 			ASSERT(0);
978 			return -EUCLEAN;
979 		}
980 
981 		/* Only cache non-COW-only (subvolume trees) tree blocks */
982 		if (!upper->cowonly) {
983 			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
984 						   &upper->rb_node);
985 			if (rb_node) {
986 				backref_tree_panic(rb_node, -EEXIST,
987 						   upper->bytenr);
988 				return -EUCLEAN;
989 			}
990 		}
991 
992 		list_add_tail(&edge->list[UPPER], &upper->lower);
993 
994 		/*
995 		 * Also queue all the parent edges of this uncached node to
996 		 * finish the upper linkage
997 		 */
998 		list_for_each_entry(edge, &upper->upper, list[LOWER])
999 			list_add_tail(&edge->list[UPPER], &pending_edge);
1000 	}
1001 	return 0;
1002 }
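
/*
 * In list terms, each edge arrives here with list[LOWER] already on
 * lower->upper, while list[UPPER] is either on the pending list or an
 * empty head.  Settling one edge is essentially:
 *
 *	list_del_init(&edge->list[UPPER]);		// off pending_edge
 *	list_add_tail(&edge->list[UPPER], &upper->lower);
 *
 * after which the edge can be traversed in both directions.
 */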
1003 
1004 /*
1005  * For useless nodes, do two major clean ups:
1006  *
1007  * - Cleanup the children edges and nodes
1008  *   If child node is also orphan (no parent) during cleanup, then the child
1009  *   node will also be cleaned up.
1010  *
1011  * - Freeing up leaves (level 0), keeping higher-level nodes detached
1012  *   For non-leaf nodes, the node is still cached as "detached"
1013  *
1014  * Return false if @node is not in the @useless_nodes list.
1015  * Return true if @node is in the @useless_nodes list.
1016  */
1017 static bool handle_useless_nodes(struct reloc_control *rc,
1018 				 struct btrfs_backref_node *node)
1019 {
1020 	struct btrfs_backref_cache *cache = &rc->backref_cache;
1021 	struct list_head *useless_node = &cache->useless_node;
1022 	bool ret = false;
1023 
1024 	while (!list_empty(useless_node)) {
1025 		struct btrfs_backref_node *cur;
1026 
1027 		cur = list_first_entry(useless_node, struct btrfs_backref_node,
1028 				 list);
1029 		list_del_init(&cur->list);
1030 
1031 		/* Only tree root nodes can be added to @useless_nodes */
1032 		ASSERT(list_empty(&cur->upper));
1033 
1034 		if (cur == node)
1035 			ret = true;
1036 
1037 		/* The node is the lowest node */
1038 		if (cur->lowest) {
1039 			list_del_init(&cur->lower);
1040 			cur->lowest = 0;
1041 		}
1042 
1043 		/* Cleanup the lower edges */
1044 		while (!list_empty(&cur->lower)) {
1045 			struct btrfs_backref_edge *edge;
1046 			struct btrfs_backref_node *lower;
1047 
1048 			edge = list_entry(cur->lower.next,
1049 					struct btrfs_backref_edge, list[UPPER]);
1050 			list_del(&edge->list[UPPER]);
1051 			list_del(&edge->list[LOWER]);
1052 			lower = edge->node[LOWER];
1053 			free_backref_edge(cache, edge);
1054 
1055 			/* Child node is also orphan, queue for cleanup */
1056 			if (list_empty(&lower->upper))
1057 				list_add(&lower->list, useless_node);
1058 		}
1059 		/* Mark this block processed for relocation */
1060 		mark_block_processed(rc, cur);
1061 
1062 		/*
1063 		 * Backref nodes for tree leaves are deleted from the cache.
1064 		 * Backref nodes for upper level tree blocks are left in the
1065 		 * cache to avoid unnecessary backref lookup.
1066 		 */
1067 		if (cur->level > 0) {
1068 			list_add(&cur->list, &cache->detached);
1069 			cur->detached = 1;
1070 		} else {
1071 			rb_erase(&cur->rb_node, &cache->rb_root);
1072 			free_backref_node(cache, cur);
1073 		}
1074 	}
1075 	return ret;
1076 }
1077 
1078 /*
1079  * Build backref tree for a given tree block. The root of the backref tree
1080  * corresponds to the tree block, and the leaves of the backref tree
1081  * correspond to the roots of the b-trees that reference the tree block.
1082  *
1083  * The basic idea of this function is to check backrefs of a given block to
1084  * find upper level blocks that reference the block, and then check backrefs
1085  * of these upper level blocks recursively. The recursion stops when the tree
1086  * root is reached or backrefs for the block are cached.
1087  *
1088  * NOTE: if we find that backrefs for a block are cached, we know backrefs for
1089  * all upper level blocks that directly/indirectly reference the block are also
1090  * cached.
1091  */
1092 static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
1093 			struct reloc_control *rc, struct btrfs_key *node_key,
1094 			int level, u64 bytenr)
1095 {
1096 	struct btrfs_backref_iter *iter;
1097 	struct btrfs_backref_cache *cache = &rc->backref_cache;
1098 	/* For searching parent of TREE_BLOCK_REF */
1099 	struct btrfs_path *path;
1100 	struct btrfs_backref_node *cur;
1101 	struct btrfs_backref_node *upper;
1102 	struct btrfs_backref_node *lower;
1103 	struct btrfs_backref_node *node = NULL;
1104 	struct btrfs_backref_edge *edge;
1105 	int ret;
1106 	int err = 0;
1107 
1108 	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
1109 	if (!iter)
1110 		return ERR_PTR(-ENOMEM);
1111 	path = btrfs_alloc_path();
1112 	if (!path) {
1113 		err = -ENOMEM;
1114 		goto out;
1115 	}
1116 
1117 	node = btrfs_backref_alloc_node(cache, bytenr, level);
1118 	if (!node) {
1119 		err = -ENOMEM;
1120 		goto out;
1121 	}
1122 
1123 	node->lowest = 1;
1124 	cur = node;
1125 
1126 	/* Breadth-first search to build backref cache */
1127 	do {
1128 		ret = handle_one_tree_block(cache, path, iter, node_key, cur);
1129 		if (ret < 0) {
1130 			err = ret;
1131 			goto out;
1132 		}
1133 		edge = list_first_entry_or_null(&cache->pending_edge,
1134 				struct btrfs_backref_edge, list[UPPER]);
1135 		/*
1136 		 * The pending list isn't empty, take the first block to
1137 		 * process
1138 		 */
1139 		if (edge) {
1140 			list_del_init(&edge->list[UPPER]);
1141 			cur = edge->node[UPPER];
1142 		}
1143 	} while (edge);
1144 
1145 	/* Finish the upper linkage of newly added edges/nodes */
1146 	ret = finish_upper_links(cache, node);
1147 	if (ret < 0) {
1148 		err = ret;
1149 		goto out;
1150 	}
1151 
1152 	if (handle_useless_nodes(rc, node))
1153 		node = NULL;
1154 out:
1155 	btrfs_backref_iter_free(iter);
1156 	btrfs_free_path(path);
1157 	if (err) {
1158 		while (!list_empty(&cache->useless_node)) {
1159 			lower = list_first_entry(&cache->useless_node,
1160 					   struct btrfs_backref_node, list);
1161 			list_del_init(&lower->list);
1162 		}
1163 		while (!list_empty(&cache->pending_edge)) {
1164 			edge = list_first_entry(&cache->pending_edge,
1165 					struct btrfs_backref_edge, list[UPPER]);
1166 			list_del(&edge->list[UPPER]);
1167 			list_del(&edge->list[LOWER]);
1168 			lower = edge->node[LOWER];
1169 			upper = edge->node[UPPER];
1170 			free_backref_edge(cache, edge);
1171 
1172 			/*
1173 			 * Lower is no longer linked to any upper backref nodes
1174 			 * and isn't in the cache, so we can free it ourselves.
1175 			 */
1176 			if (list_empty(&lower->upper) &&
1177 			    RB_EMPTY_NODE(&lower->rb_node))
1178 				list_add(&lower->list, &cache->useless_node);
1179 
1180 			if (!RB_EMPTY_NODE(&upper->rb_node))
1181 				continue;
1182 
1183 			/* Add this guy's upper edges to the list to process */
1184 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1185 				list_add_tail(&edge->list[UPPER],
1186 					      &cache->pending_edge);
1187 			if (list_empty(&upper->upper))
1188 				list_add(&upper->list, &cache->useless_node);
1189 		}
1190 
1191 		while (!list_empty(&cache->useless_node)) {
1192 			lower = list_first_entry(&cache->useless_node,
1193 					   struct btrfs_backref_node, list);
1194 			list_del_init(&lower->list);
1195 			if (lower == node)
1196 				node = NULL;
1197 			free_backref_node(cache, lower);
1198 		}
1199 
1200 		remove_backref_node(cache, node);
1201 		ASSERT(list_empty(&cache->useless_node) &&
1202 		       list_empty(&cache->pending_edge));
1203 		return ERR_PTR(err);
1204 	}
1205 	ASSERT(!node || !node->detached);
1206 	ASSERT(list_empty(&cache->useless_node) &&
1207 	       list_empty(&cache->pending_edge));
1208 	return node;
1209 }
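
/*
 * Shape of the search above, reduced to a toy breadth-first walk over
 * "which blocks reference me" edges (toy types, illustration only; the
 * real code also builds edges and consults the rb-tree cache):
 *
 *	struct toy_node {
 *		int cached, checked, nr_parents;
 *		struct toy_node *parent[4];
 *	};
 *
 *	static void toy_build(struct toy_node *start)
 *	{
 *		struct toy_node *queue[64];
 *		int head = 0, tail = 0, i;
 *
 *		queue[tail++] = start;
 *		while (head < tail) {
 *			struct toy_node *cur = queue[head++];
 *
 *			if (cur->cached)	// everything above is
 *				continue;	// already known, stop
 *			for (i = 0; i < cur->nr_parents; i++)
 *				queue[tail++] = cur->parent[i];
 *			cur->checked = 1;
 *		}
 *	}
 */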
1210 
1211 /*
1212  * helper to add a backref node for the newly created snapshot.
1213  * the backref node is created by cloning the backref node that
1214  * corresponds to the root of the source tree
1215  */
1216 static int clone_backref_node(struct btrfs_trans_handle *trans,
1217 			      struct reloc_control *rc,
1218 			      struct btrfs_root *src,
1219 			      struct btrfs_root *dest)
1220 {
1221 	struct btrfs_root *reloc_root = src->reloc_root;
1222 	struct btrfs_backref_cache *cache = &rc->backref_cache;
1223 	struct btrfs_backref_node *node = NULL;
1224 	struct btrfs_backref_node *new_node;
1225 	struct btrfs_backref_edge *edge;
1226 	struct btrfs_backref_edge *new_edge;
1227 	struct rb_node *rb_node;
1228 
1229 	if (cache->last_trans > 0)
1230 		update_backref_cache(trans, cache);
1231 
1232 	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
1233 	if (rb_node) {
1234 		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
1235 		if (node->detached)
1236 			node = NULL;
1237 		else
1238 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1239 	}
1240 
1241 	if (!node) {
1242 		rb_node = rb_simple_search(&cache->rb_root,
1243 					   reloc_root->commit_root->start);
1244 		if (rb_node) {
1245 			node = rb_entry(rb_node, struct btrfs_backref_node,
1246 					rb_node);
1247 			BUG_ON(node->detached);
1248 		}
1249 	}
1250 
1251 	if (!node)
1252 		return 0;
1253 
1254 	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
1255 					    node->level);
1256 	if (!new_node)
1257 		return -ENOMEM;
1258 
1259 	new_node->lowest = node->lowest;
1260 	new_node->checked = 1;
1261 	new_node->root = btrfs_grab_root(dest);
1262 	ASSERT(new_node->root);
1263 
1264 	if (!node->lowest) {
1265 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1266 			new_edge = alloc_backref_edge(cache);
1267 			if (!new_edge)
1268 				goto fail;
1269 
1270 			link_backref_edge(new_edge, edge->node[LOWER], new_node,
1271 					  LINK_UPPER);
1272 		}
1273 	} else {
1274 		list_add_tail(&new_node->lower, &cache->leaves);
1275 	}
1276 
1277 	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
1278 				   &new_node->rb_node);
1279 	if (rb_node)
1280 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1281 
1282 	if (!new_node->lowest) {
1283 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1284 			list_add_tail(&new_edge->list[LOWER],
1285 				      &new_edge->node[LOWER]->upper);
1286 		}
1287 	}
1288 	return 0;
1289 fail:
1290 	while (!list_empty(&new_node->lower)) {
1291 		new_edge = list_entry(new_node->lower.next,
1292 				      struct btrfs_backref_edge, list[UPPER]);
1293 		list_del(&new_edge->list[UPPER]);
1294 		free_backref_edge(cache, new_edge);
1295 	}
1296 	free_backref_node(cache, new_node);
1297 	return -ENOMEM;
1298 }
1299 
1300 /*
1301  * helper to add 'address of tree root -> reloc tree' mapping
1302  */
1303 static int __must_check __add_reloc_root(struct btrfs_root *root)
1304 {
1305 	struct btrfs_fs_info *fs_info = root->fs_info;
1306 	struct rb_node *rb_node;
1307 	struct mapping_node *node;
1308 	struct reloc_control *rc = fs_info->reloc_ctl;
1309 
1310 	node = kmalloc(sizeof(*node), GFP_NOFS);
1311 	if (!node)
1312 		return -ENOMEM;
1313 
1314 	node->bytenr = root->commit_root->start;
1315 	node->data = root;
1316 
1317 	spin_lock(&rc->reloc_root_tree.lock);
1318 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
1319 				   node->bytenr, &node->rb_node);
1320 	spin_unlock(&rc->reloc_root_tree.lock);
1321 	if (rb_node) {
1322 		btrfs_panic(fs_info, -EEXIST,
1323 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1324 			    node->bytenr);
1325 	}
1326 
1327 	list_add_tail(&root->root_list, &rc->reloc_roots);
1328 	return 0;
1329 }
1330 
1331 /*
1332  * helper to delete the 'address of tree root -> reloc tree'
1333  * mapping
1334  */
1335 static void __del_reloc_root(struct btrfs_root *root)
1336 {
1337 	struct btrfs_fs_info *fs_info = root->fs_info;
1338 	struct rb_node *rb_node;
1339 	struct mapping_node *node = NULL;
1340 	struct reloc_control *rc = fs_info->reloc_ctl;
1341 	bool put_ref = false;
1342 
1343 	if (rc && root->node) {
1344 		spin_lock(&rc->reloc_root_tree.lock);
1345 		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
1346 					   root->commit_root->start);
1347 		if (rb_node) {
1348 			node = rb_entry(rb_node, struct mapping_node, rb_node);
1349 			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1350 			RB_CLEAR_NODE(&node->rb_node);
1351 		}
1352 		spin_unlock(&rc->reloc_root_tree.lock);
1353 		if (!node)
1354 			return;
1355 		BUG_ON((struct btrfs_root *)node->data != root);
1356 	}
1357 
1358 	/*
1359 	 * We only put the reloc root here if it's on the list.  There's a lot
1360 	 * of places where the pattern is to splice the rc->reloc_roots, process
1361 	 * the reloc roots, and then add the reloc root back onto
1362 	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
1363 	 * list we don't want the reference being dropped, because the guy
1364 	 * messing with the list is in charge of the reference.
1365 	 */
1366 	spin_lock(&fs_info->trans_lock);
1367 	if (!list_empty(&root->root_list)) {
1368 		put_ref = true;
1369 		list_del_init(&root->root_list);
1370 	}
1371 	spin_unlock(&fs_info->trans_lock);
1372 	if (put_ref)
1373 		btrfs_put_root(root);
1374 	kfree(node);
1375 }
1376 
1377 /*
1378  * helper to update the 'address of tree root -> reloc tree'
1379  * mapping
1380  */
1381 static int __update_reloc_root(struct btrfs_root *root)
1382 {
1383 	struct btrfs_fs_info *fs_info = root->fs_info;
1384 	struct rb_node *rb_node;
1385 	struct mapping_node *node = NULL;
1386 	struct reloc_control *rc = fs_info->reloc_ctl;
1387 
1388 	spin_lock(&rc->reloc_root_tree.lock);
1389 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
1390 				   root->commit_root->start);
1391 	if (rb_node) {
1392 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1393 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1394 	}
1395 	spin_unlock(&rc->reloc_root_tree.lock);
1396 
1397 	if (!node)
1398 		return 0;
1399 	BUG_ON((struct btrfs_root *)node->data != root);
1400 
1401 	spin_lock(&rc->reloc_root_tree.lock);
1402 	node->bytenr = root->node->start;
1403 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
1404 				   node->bytenr, &node->rb_node);
1405 	spin_unlock(&rc->reloc_root_tree.lock);
1406 	if (rb_node)
1407 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1408 	return 0;
1409 }
1410 
1411 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1412 					struct btrfs_root *root, u64 objectid)
1413 {
1414 	struct btrfs_fs_info *fs_info = root->fs_info;
1415 	struct btrfs_root *reloc_root;
1416 	struct extent_buffer *eb;
1417 	struct btrfs_root_item *root_item;
1418 	struct btrfs_key root_key;
1419 	int ret;
1420 
1421 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1422 	BUG_ON(!root_item);
1423 
1424 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1425 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1426 	root_key.offset = objectid;
1427 
1428 	if (root->root_key.objectid == objectid) {
1429 		u64 commit_root_gen;
1430 
1431 		/* called by btrfs_init_reloc_root */
1432 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1433 				      BTRFS_TREE_RELOC_OBJECTID);
1434 		BUG_ON(ret);
1435 		/*
1436 		 * Set the last_snapshot field to the generation of the commit
1437 		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1438 		 * correctly (returns true) when the relocation root is created
1439 		 * either inside the critical section of a transaction commit
1440 		 * (through transaction.c:qgroup_account_snapshot()) and when
1441 		 * it's created before the transaction commit is started.
1442 		 */
1443 		commit_root_gen = btrfs_header_generation(root->commit_root);
1444 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1445 	} else {
1446 		/*
1447 		 * called by btrfs_reloc_post_snapshot_hook.
1448 		 * the source tree is a reloc tree; all tree blocks
1449 		 * modified after it was created have the RELOC flag
1450 		 * set in their headers, so it's OK to not update
1451 		 * 'last_snapshot'.
1452 		 */
1453 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1454 				      BTRFS_TREE_RELOC_OBJECTID);
1455 		BUG_ON(ret);
1456 	}
1457 
1458 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1459 	btrfs_set_root_bytenr(root_item, eb->start);
1460 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1461 	btrfs_set_root_generation(root_item, trans->transid);
1462 
1463 	if (root->root_key.objectid == objectid) {
1464 		btrfs_set_root_refs(root_item, 0);
1465 		memset(&root_item->drop_progress, 0,
1466 		       sizeof(struct btrfs_disk_key));
1467 		root_item->drop_level = 0;
1468 	}
1469 
1470 	btrfs_tree_unlock(eb);
1471 	free_extent_buffer(eb);
1472 
1473 	ret = btrfs_insert_root(trans, fs_info->tree_root,
1474 				&root_key, root_item);
1475 	BUG_ON(ret);
1476 	kfree(root_item);
1477 
1478 	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
1479 	BUG_ON(IS_ERR(reloc_root));
1480 	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
1481 	reloc_root->last_trans = trans->transid;
1482 	return reloc_root;
1483 }
1484 
1485 /*
1486  * create a reloc tree for a given fs tree. the reloc tree is just a
1487  * snapshot of the fs tree with a special root objectid.
1488  *
1489  * The reloc_root comes out of here with two references, one for
1490  * root->reloc_root, and another for being on the rc->reloc_roots list.
1491  */
1492 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1493 			  struct btrfs_root *root)
1494 {
1495 	struct btrfs_fs_info *fs_info = root->fs_info;
1496 	struct btrfs_root *reloc_root;
1497 	struct reloc_control *rc = fs_info->reloc_ctl;
1498 	struct btrfs_block_rsv *rsv;
1499 	int clear_rsv = 0;
1500 	int ret;
1501 
1502 	if (!rc)
1503 		return 0;
1504 
1505 	/*
1506 	 * The subvolume has a reloc tree but the swap has finished, no need to
1507 	 * create/update the dead reloc tree
1508 	 */
1509 	if (reloc_root_is_dead(root))
1510 		return 0;
1511 
1512 	/*
1513 	 * This is subtle but important.  We do not do
1514 	 * record_root_in_transaction for reloc roots, instead we record their
1515 	 * corresponding fs root, and then here we update the last trans for the
1516 	 * reloc root.  This means that we have to do this for the entire life
1517 	 * of the reloc root, regardless of which stage of the relocation we are
1518 	 * in.
1519 	 */
1520 	if (root->reloc_root) {
1521 		reloc_root = root->reloc_root;
1522 		reloc_root->last_trans = trans->transid;
1523 		return 0;
1524 	}
1525 
1526 	/*
1527 	 * We are merging reloc roots, we do not need new reloc trees.  Also
1528 	 * reloc trees never need their own reloc tree.
1529 	 */
1530 	if (!rc->create_reloc_tree ||
1531 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1532 		return 0;
1533 
1534 	if (!trans->reloc_reserved) {
1535 		rsv = trans->block_rsv;
1536 		trans->block_rsv = rc->block_rsv;
1537 		clear_rsv = 1;
1538 	}
1539 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1540 	if (clear_rsv)
1541 		trans->block_rsv = rsv;
1542 
1543 	ret = __add_reloc_root(reloc_root);
1544 	BUG_ON(ret < 0);
1545 	root->reloc_root = btrfs_grab_root(reloc_root);
1546 	return 0;
1547 }
1548 
1549 /*
1550  * update root item of reloc tree
1551  */
1552 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1553 			    struct btrfs_root *root)
1554 {
1555 	struct btrfs_fs_info *fs_info = root->fs_info;
1556 	struct btrfs_root *reloc_root;
1557 	struct btrfs_root_item *root_item;
1558 	int ret;
1559 
1560 	if (!have_reloc_root(root))
1561 		goto out;
1562 
1563 	reloc_root = root->reloc_root;
1564 	root_item = &reloc_root->root_item;
1565 
1566 	/*
1567 	 * We are probably ok here, but __del_reloc_root() will drop its ref of
1568 	 * the root.  We have the ref for root->reloc_root, but just in case
1569 	 * hold it while we update the reloc root.
1570 	 */
1571 	btrfs_grab_root(reloc_root);
1572 
1573 	/* root->reloc_root will stay until current relocation finished */
1574 	if (fs_info->reloc_ctl->merge_reloc_tree &&
1575 	    btrfs_root_refs(root_item) == 0) {
1576 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1577 		/*
1578 		 * Mark the tree as dead before we change reloc_root so
1579 		 * have_reloc_root will not touch it from now on.
1580 		 */
1581 		smp_wmb();
1582 		__del_reloc_root(reloc_root);
1583 	}
1584 
1585 	if (reloc_root->commit_root != reloc_root->node) {
1586 		__update_reloc_root(reloc_root);
1587 		btrfs_set_root_node(root_item, reloc_root->node);
1588 		free_extent_buffer(reloc_root->commit_root);
1589 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1590 	}
1591 
1592 	ret = btrfs_update_root(trans, fs_info->tree_root,
1593 				&reloc_root->root_key, root_item);
1594 	BUG_ON(ret);
1595 	btrfs_put_root(reloc_root);
1596 out:
1597 	return 0;
1598 }
1599 
1600 /*
1601  * helper to find first cached inode with inode number >= objectid
1602  * in a subvolume
1603  */
1604 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1605 {
1606 	struct rb_node *node;
1607 	struct rb_node *prev;
1608 	struct btrfs_inode *entry;
1609 	struct inode *inode;
1610 
1611 	spin_lock(&root->inode_lock);
1612 again:
1613 	node = root->inode_tree.rb_node;
1614 	prev = NULL;
1615 	while (node) {
1616 		prev = node;
1617 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1618 
1619 		if (objectid < btrfs_ino(entry))
1620 			node = node->rb_left;
1621 		else if (objectid > btrfs_ino(entry))
1622 			node = node->rb_right;
1623 		else
1624 			break;
1625 	}
1626 	if (!node) {
1627 		while (prev) {
1628 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1629 			if (objectid <= btrfs_ino(entry)) {
1630 				node = prev;
1631 				break;
1632 			}
1633 			prev = rb_next(prev);
1634 		}
1635 	}
1636 	while (node) {
1637 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1638 		inode = igrab(&entry->vfs_inode);
1639 		if (inode) {
1640 			spin_unlock(&root->inode_lock);
1641 			return inode;
1642 		}
1643 
1644 		objectid = btrfs_ino(entry) + 1;
1645 		if (cond_resched_lock(&root->inode_lock))
1646 			goto again;
1647 
1648 		node = rb_next(node);
1649 	}
1650 	spin_unlock(&root->inode_lock);
1651 	return NULL;
1652 }
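
/*
 * The rb-tree walk above is a lower-bound search ("first inode with
 * ino >= objectid") plus an igrab() retry.  The same bound over a
 * sorted array, for reference (toy helper, illustration only):
 *
 *	static int toy_lower_bound(const unsigned long long *ino, int nr,
 *				   unsigned long long objectid)
 *	{
 *		int i;
 *
 *		for (i = 0; i < nr; i++)
 *			if (ino[i] >= objectid)
 *				return i;	// first inode >= objectid
 *		return -1;			// no such inode
 *	}
 */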
1653 
1654 /*
1655  * get new location of data
1656  */
1657 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1658 			    u64 bytenr, u64 num_bytes)
1659 {
1660 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1661 	struct btrfs_path *path;
1662 	struct btrfs_file_extent_item *fi;
1663 	struct extent_buffer *leaf;
1664 	int ret;
1665 
1666 	path = btrfs_alloc_path();
1667 	if (!path)
1668 		return -ENOMEM;
1669 
1670 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1671 	ret = btrfs_lookup_file_extent(NULL, root, path,
1672 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1673 	if (ret < 0)
1674 		goto out;
1675 	if (ret > 0) {
1676 		ret = -ENOENT;
1677 		goto out;
1678 	}
1679 
1680 	leaf = path->nodes[0];
1681 	fi = btrfs_item_ptr(leaf, path->slots[0],
1682 			    struct btrfs_file_extent_item);
1683 
1684 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1685 	       btrfs_file_extent_compression(leaf, fi) ||
1686 	       btrfs_file_extent_encryption(leaf, fi) ||
1687 	       btrfs_file_extent_other_encoding(leaf, fi));
1688 
1689 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1690 		ret = -EINVAL;
1691 		goto out;
1692 	}
1693 
1694 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1695 	ret = 0;
1696 out:
1697 	btrfs_free_path(path);
1698 	return ret;
1699 }
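
/*
 * Worked example with invented numbers: the data reloc inode's
 * index_cnt is set to the start of the block group being relocated,
 * say 0x40000000.  An old extent at bytenr 0x40002000 therefore maps
 * to file offset
 *
 *	0x40002000 - 0x40000000 = 0x2000
 *
 * in the reloc inode, and the file extent item found at that offset
 * supplies the relocated disk bytenr returned in *new_bytenr.
 */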
1700 
1701 /*
1702  * update file extent items in the tree leaf to point to
1703  * the new locations.
1704  */
1705 static noinline_for_stack
1706 int replace_file_extents(struct btrfs_trans_handle *trans,
1707 			 struct reloc_control *rc,
1708 			 struct btrfs_root *root,
1709 			 struct extent_buffer *leaf)
1710 {
1711 	struct btrfs_fs_info *fs_info = root->fs_info;
1712 	struct btrfs_key key;
1713 	struct btrfs_file_extent_item *fi;
1714 	struct inode *inode = NULL;
1715 	u64 parent;
1716 	u64 bytenr;
1717 	u64 new_bytenr = 0;
1718 	u64 num_bytes;
1719 	u64 end;
1720 	u32 nritems;
1721 	u32 i;
1722 	int ret = 0;
1723 	int first = 1;
1724 	int dirty = 0;
1725 
1726 	if (rc->stage != UPDATE_DATA_PTRS)
1727 		return 0;
1728 
1729 	/* reloc trees always use full backref */
1730 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1731 		parent = leaf->start;
1732 	else
1733 		parent = 0;
1734 
1735 	nritems = btrfs_header_nritems(leaf);
1736 	for (i = 0; i < nritems; i++) {
1737 		struct btrfs_ref ref = { 0 };
1738 
1739 		cond_resched();
1740 		btrfs_item_key_to_cpu(leaf, &key, i);
1741 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1742 			continue;
1743 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1744 		if (btrfs_file_extent_type(leaf, fi) ==
1745 		    BTRFS_FILE_EXTENT_INLINE)
1746 			continue;
1747 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1748 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1749 		if (bytenr == 0)
1750 			continue;
1751 		if (!in_range(bytenr, rc->block_group->start,
1752 			      rc->block_group->length))
1753 			continue;
1754 
1755 		/*
1756 		 * if we are modifying a block in the fs tree, wait for readpage
1757 		 * to complete and drop the extent cache
1758 		 */
1759 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1760 			if (first) {
1761 				inode = find_next_inode(root, key.objectid);
1762 				first = 0;
1763 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1764 				btrfs_add_delayed_iput(inode);
1765 				inode = find_next_inode(root, key.objectid);
1766 			}
1767 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1768 				end = key.offset +
1769 				      btrfs_file_extent_num_bytes(leaf, fi);
1770 				WARN_ON(!IS_ALIGNED(key.offset,
1771 						    fs_info->sectorsize));
1772 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1773 				end--;
1774 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1775 						      key.offset, end);
1776 				if (!ret)
1777 					continue;
1778 
1779 				btrfs_drop_extent_cache(BTRFS_I(inode),
1780 						key.offset,	end, 1);
1781 				unlock_extent(&BTRFS_I(inode)->io_tree,
1782 					      key.offset, end);
1783 			}
1784 		}
1785 
1786 		ret = get_new_location(rc->data_inode, &new_bytenr,
1787 				       bytenr, num_bytes);
1788 		if (ret) {
1789 			/*
1790 			 * Don't have to abort since we've not changed anything
1791 			 * in the file extent yet.
1792 			 */
1793 			break;
1794 		}
1795 
1796 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1797 		dirty = 1;
1798 
1799 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1800 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1801 				       num_bytes, parent);
1802 		ref.real_root = root->root_key.objectid;
1803 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1804 				    key.objectid, key.offset);
1805 		ret = btrfs_inc_extent_ref(trans, &ref);
1806 		if (ret) {
1807 			btrfs_abort_transaction(trans, ret);
1808 			break;
1809 		}
1810 
1811 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1812 				       num_bytes, parent);
1813 		ref.real_root = root->root_key.objectid;
1814 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1815 				    key.objectid, key.offset);
1816 		ret = btrfs_free_extent(trans, &ref);
1817 		if (ret) {
1818 			btrfs_abort_transaction(trans, ret);
1819 			break;
1820 		}
1821 	}
1822 	if (dirty)
1823 		btrfs_mark_buffer_dirty(leaf);
1824 	if (inode)
1825 		btrfs_add_delayed_iput(inode);
1826 	return ret;
1827 }
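
/*
 * Every pointer rewritten above is mirrored in the extent tree as a
 * pair of delayed refs: one added for the new location and one dropped
 * for the old, both described with the same owner/offset.  A minimal
 * sketch of the pairing (owner, ino and file_offset stand for the
 * values taken from the leaf):
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
 *			       num_bytes, parent);
 *	btrfs_init_data_ref(&ref, owner, ino, file_offset);
 *	btrfs_inc_extent_ref(trans, &ref);
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 *			       num_bytes, parent);
 *	btrfs_init_data_ref(&ref, owner, ino, file_offset);
 *	btrfs_free_extent(trans, &ref);
 */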
1828 
1829 static noinline_for_stack
1830 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1831 		     struct btrfs_path *path, int level)
1832 {
1833 	struct btrfs_disk_key key1;
1834 	struct btrfs_disk_key key2;
1835 	btrfs_node_key(eb, &key1, slot);
1836 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1837 	return memcmp(&key1, &key2, sizeof(key1));
1838 }
1839 
1840 /*
1841  * try to replace tree blocks in fs tree with the new blocks
1842  * in reloc tree. tree blocks that haven't been modified since
1843  * the reloc tree was created can be replaced.
1844  *
1845  * if a block was replaced, level of the block + 1 is returned.
1846  * if no block got replaced, 0 is returned. if there are other
1847  * errors, a negative error number is returned.
1848  */
1849 static noinline_for_stack
1850 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1851 		 struct btrfs_root *dest, struct btrfs_root *src,
1852 		 struct btrfs_path *path, struct btrfs_key *next_key,
1853 		 int lowest_level, int max_level)
1854 {
1855 	struct btrfs_fs_info *fs_info = dest->fs_info;
1856 	struct extent_buffer *eb;
1857 	struct extent_buffer *parent;
1858 	struct btrfs_ref ref = { 0 };
1859 	struct btrfs_key key;
1860 	u64 old_bytenr;
1861 	u64 new_bytenr;
1862 	u64 old_ptr_gen;
1863 	u64 new_ptr_gen;
1864 	u64 last_snapshot;
1865 	u32 blocksize;
1866 	int cow = 0;
1867 	int level;
1868 	int ret;
1869 	int slot;
1870 
1871 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1872 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1873 
1874 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1875 again:
1876 	slot = path->slots[lowest_level];
1877 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1878 
1879 	eb = btrfs_lock_root_node(dest);
1880 	btrfs_set_lock_blocking_write(eb);
1881 	level = btrfs_header_level(eb);
1882 
1883 	if (level < lowest_level) {
1884 		btrfs_tree_unlock(eb);
1885 		free_extent_buffer(eb);
1886 		return 0;
1887 	}
1888 
1889 	if (cow) {
1890 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1891 		BUG_ON(ret);
1892 	}
1893 	btrfs_set_lock_blocking_write(eb);
1894 
1895 	if (next_key) {
1896 		next_key->objectid = (u64)-1;
1897 		next_key->type = (u8)-1;
1898 		next_key->offset = (u64)-1;
1899 	}
1900 
1901 	parent = eb;
1902 	while (1) {
1903 		struct btrfs_key first_key;
1904 
1905 		level = btrfs_header_level(parent);
1906 		BUG_ON(level < lowest_level);
1907 
1908 		ret = btrfs_bin_search(parent, &key, level, &slot);
1909 		if (ret < 0)
1910 			break;
1911 		if (ret && slot > 0)
1912 			slot--;
1913 
1914 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1915 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1916 
1917 		old_bytenr = btrfs_node_blockptr(parent, slot);
1918 		blocksize = fs_info->nodesize;
1919 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1920 		btrfs_node_key_to_cpu(parent, &first_key, slot);
1921 
1922 		if (level <= max_level) {
1923 			eb = path->nodes[level];
1924 			new_bytenr = btrfs_node_blockptr(eb,
1925 							path->slots[level]);
1926 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1927 							path->slots[level]);
1928 		} else {
1929 			new_bytenr = 0;
1930 			new_ptr_gen = 0;
1931 		}
1932 
1933 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1934 			ret = level;
1935 			break;
1936 		}
1937 
1938 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1939 		    memcmp_node_keys(parent, slot, path, level)) {
1940 			if (level <= lowest_level) {
1941 				ret = 0;
1942 				break;
1943 			}
1944 
1945 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1946 					     level - 1, &first_key);
1947 			if (IS_ERR(eb)) {
1948 				ret = PTR_ERR(eb);
1949 				break;
1950 			} else if (!extent_buffer_uptodate(eb)) {
1951 				ret = -EIO;
1952 				free_extent_buffer(eb);
1953 				break;
1954 			}
1955 			btrfs_tree_lock(eb);
1956 			if (cow) {
1957 				ret = btrfs_cow_block(trans, dest, eb, parent,
1958 						      slot, &eb);
1959 				BUG_ON(ret);
1960 			}
1961 			btrfs_set_lock_blocking_write(eb);
1962 
1963 			btrfs_tree_unlock(parent);
1964 			free_extent_buffer(parent);
1965 
1966 			parent = eb;
1967 			continue;
1968 		}
1969 
1970 		if (!cow) {
1971 			btrfs_tree_unlock(parent);
1972 			free_extent_buffer(parent);
1973 			cow = 1;
1974 			goto again;
1975 		}
1976 
1977 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1978 				      path->slots[level]);
1979 		btrfs_release_path(path);
1980 
1981 		path->lowest_level = level;
1982 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1983 		path->lowest_level = 0;
1984 		BUG_ON(ret);
1985 
1986 		/*
1987 		 * Inform qgroup to trace both subtrees.
1988 		 *
1989 		 * We must trace both trees.
1990 		 * 1) Tree reloc subtree
1991 		 *    If not traced, we will leak data numbers
1992 		 * 2) Fs subtree
1993 		 *    If not traced, we will double count old data
1994 		 *
1995 		 * We don't scan the subtree right now, but only record
1996 		 * the swapped tree blocks.
1997 		 * The real subtree rescan is delayed until we have new
1998 		 * CoW on the subtree root node before transaction commit.
1999 		 */
2000 		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
2001 				rc->block_group, parent, slot,
2002 				path->nodes[level], path->slots[level],
2003 				last_snapshot);
2004 		if (ret < 0)
2005 			break;
2006 		/*
2007 		 * swap blocks in fs tree and reloc tree.
2008 		 */
2009 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
2010 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
2011 		btrfs_mark_buffer_dirty(parent);
2012 
2013 		btrfs_set_node_blockptr(path->nodes[level],
2014 					path->slots[level], old_bytenr);
2015 		btrfs_set_node_ptr_generation(path->nodes[level],
2016 					      path->slots[level], old_ptr_gen);
2017 		btrfs_mark_buffer_dirty(path->nodes[level]);
2018 
2019 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
2020 				       blocksize, path->nodes[level]->start);
2021 		ref.skip_qgroup = true;
2022 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2023 		ret = btrfs_inc_extent_ref(trans, &ref);
2024 		BUG_ON(ret);
2025 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
2026 				       blocksize, 0);
2027 		ref.skip_qgroup = true;
2028 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2029 		ret = btrfs_inc_extent_ref(trans, &ref);
2030 		BUG_ON(ret);
2031 
2032 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
2033 				       blocksize, path->nodes[level]->start);
2034 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2035 		ref.skip_qgroup = true;
2036 		ret = btrfs_free_extent(trans, &ref);
2037 		BUG_ON(ret);
2038 
2039 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
2040 				       blocksize, 0);
2041 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2042 		ref.skip_qgroup = true;
2043 		ret = btrfs_free_extent(trans, &ref);
2044 		BUG_ON(ret);
2045 
2046 		btrfs_unlock_up_safe(path, 0);
2047 
2048 		ret = level;
2049 		break;
2050 	}
2051 	btrfs_tree_unlock(parent);
2052 	free_extent_buffer(parent);
2053 	return ret;
2054 }
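
/*
 * A minimal sketch of how the replace_path() return convention is
 * consumed (this mirrors the call site in merge_reloc_root() below):
 *
 *	ret = replace_path(trans, rc, root, reloc_root, path,
 *			   &next_key, level, max_level);
 *	if (ret < 0)
 *		stop, error;
 *	else if (ret == 0)
 *		nothing was replaced, keep walking;
 *	else
 *		blocks at level ret - 1 were swapped, resume the
 *		walk from path->nodes[ret];
 */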
2055 
2056 /*
2057  * helper to find next relocated block in reloc tree
2058  */
2059 static noinline_for_stack
2060 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2061 		       int *level)
2062 {
2063 	struct extent_buffer *eb;
2064 	int i;
2065 	u64 last_snapshot;
2066 	u32 nritems;
2067 
2068 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2069 
2070 	for (i = 0; i < *level; i++) {
2071 		free_extent_buffer(path->nodes[i]);
2072 		path->nodes[i] = NULL;
2073 	}
2074 
2075 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
2076 		eb = path->nodes[i];
2077 		nritems = btrfs_header_nritems(eb);
2078 		while (path->slots[i] + 1 < nritems) {
2079 			path->slots[i]++;
2080 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2081 			    last_snapshot)
2082 				continue;
2083 
2084 			*level = i;
2085 			return 0;
2086 		}
2087 		free_extent_buffer(path->nodes[i]);
2088 		path->nodes[i] = NULL;
2089 	}
2090 	return 1;
2091 }
2092 
2093 /*
2094  * walk down reloc tree to find relocated block of lowest level
2095  */
2096 static noinline_for_stack
2097 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2098 			 int *level)
2099 {
2100 	struct btrfs_fs_info *fs_info = root->fs_info;
2101 	struct extent_buffer *eb = NULL;
2102 	int i;
2103 	u64 bytenr;
2104 	u64 ptr_gen = 0;
2105 	u64 last_snapshot;
2106 	u32 nritems;
2107 
2108 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2109 
2110 	for (i = *level; i > 0; i--) {
2111 		struct btrfs_key first_key;
2112 
2113 		eb = path->nodes[i];
2114 		nritems = btrfs_header_nritems(eb);
2115 		while (path->slots[i] < nritems) {
2116 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2117 			if (ptr_gen > last_snapshot)
2118 				break;
2119 			path->slots[i]++;
2120 		}
2121 		if (path->slots[i] >= nritems) {
2122 			if (i == *level)
2123 				break;
2124 			*level = i + 1;
2125 			return 0;
2126 		}
2127 		if (i == 1) {
2128 			*level = i;
2129 			return 0;
2130 		}
2131 
2132 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2133 		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2134 		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2135 				     &first_key);
2136 		if (IS_ERR(eb)) {
2137 			return PTR_ERR(eb);
2138 		} else if (!extent_buffer_uptodate(eb)) {
2139 			free_extent_buffer(eb);
2140 			return -EIO;
2141 		}
2142 		BUG_ON(btrfs_header_level(eb) != i - 1);
2143 		path->nodes[i - 1] = eb;
2144 		path->slots[i - 1] = 0;
2145 	}
2146 	return 1;
2147 }
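
/*
 * Both walkers above rely on the same filter: a child pointer can only
 * lead to relocated blocks if its generation is newer than the source
 * tree's last snapshot, because the reloc tree was created at that
 * snapshot point.  Sketch of the test for an (eb, slot) pair taken
 * from the path:
 *
 *	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
 *	if (btrfs_node_ptr_generation(eb, slot) <= last_snapshot)
 *		skip this slot, nothing below it was relocated;
 */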
2148 
2149 /*
2150  * invalidate extent cache for file extents whose key is in the
2151  * range of [min_key, max_key)
2152  */
2153 static int invalidate_extent_cache(struct btrfs_root *root,
2154 				   struct btrfs_key *min_key,
2155 				   struct btrfs_key *max_key)
2156 {
2157 	struct btrfs_fs_info *fs_info = root->fs_info;
2158 	struct inode *inode = NULL;
2159 	u64 objectid;
2160 	u64 start, end;
2161 	u64 ino;
2162 
2163 	objectid = min_key->objectid;
2164 	while (1) {
2165 		cond_resched();
2166 		iput(inode);
2167 
2168 		if (objectid > max_key->objectid)
2169 			break;
2170 
2171 		inode = find_next_inode(root, objectid);
2172 		if (!inode)
2173 			break;
2174 		ino = btrfs_ino(BTRFS_I(inode));
2175 
2176 		if (ino > max_key->objectid) {
2177 			iput(inode);
2178 			break;
2179 		}
2180 
2181 		objectid = ino + 1;
2182 		if (!S_ISREG(inode->i_mode))
2183 			continue;
2184 
2185 		if (unlikely(min_key->objectid == ino)) {
2186 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2187 				continue;
2188 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2189 				start = 0;
2190 			else {
2191 				start = min_key->offset;
2192 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2193 			}
2194 		} else {
2195 			start = 0;
2196 		}
2197 
2198 		if (unlikely(max_key->objectid == ino)) {
2199 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2200 				continue;
2201 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2202 				end = (u64)-1;
2203 			} else {
2204 				if (max_key->offset == 0)
2205 					continue;
2206 				end = max_key->offset;
2207 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2208 				end--;
2209 			}
2210 		} else {
2211 			end = (u64)-1;
2212 		}
2213 
2214 		/* lock_extent() waits for readpage to complete */
2215 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2216 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2217 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2218 	}
2219 	return 0;
2220 }
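
/*
 * Worked example of the clamping above, assuming a 4K sectorsize: with
 * min_key = (258 EXTENT_DATA 8192) and max_key = (260 EXTENT_DATA
 * 16384), inode 258 is invalidated in [8192, (u64)-1], inode 259 in
 * [0, (u64)-1] and inode 260 in [0, 16383].
 */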
2221 
2222 static int find_next_key(struct btrfs_path *path, int level,
2223 			 struct btrfs_key *key)
2225 {
2226 	while (level < BTRFS_MAX_LEVEL) {
2227 		if (!path->nodes[level])
2228 			break;
2229 		if (path->slots[level] + 1 <
2230 		    btrfs_header_nritems(path->nodes[level])) {
2231 			btrfs_node_key_to_cpu(path->nodes[level], key,
2232 					      path->slots[level] + 1);
2233 			return 0;
2234 		}
2235 		level++;
2236 	}
2237 	return 1;
2238 }
2239 
2240 /*
2241  * Insert current subvolume into reloc_control::dirty_subvol_roots
2242  */
2243 static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2244 				struct reloc_control *rc,
2245 				struct btrfs_root *root)
2246 {
2247 	struct btrfs_root *reloc_root = root->reloc_root;
2248 	struct btrfs_root_item *reloc_root_item;
2249 
2250 	/* @root must be a subvolume tree root with a valid reloc tree */
2251 	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2252 	ASSERT(reloc_root);
2253 
2254 	reloc_root_item = &reloc_root->root_item;
2255 	memset(&reloc_root_item->drop_progress, 0,
2256 		sizeof(reloc_root_item->drop_progress));
2257 	reloc_root_item->drop_level = 0;
2258 	btrfs_set_root_refs(reloc_root_item, 0);
2259 	btrfs_update_reloc_root(trans, root);
2260 
2261 	if (list_empty(&root->reloc_dirty_list)) {
2262 		btrfs_grab_root(root);
2263 		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2264 	}
2265 }
2266 
2267 static int clean_dirty_subvols(struct reloc_control *rc)
2268 {
2269 	struct btrfs_root *root;
2270 	struct btrfs_root *next;
2271 	int ret = 0;
2272 	int ret2;
2273 
2274 	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2275 				 reloc_dirty_list) {
2276 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2277 			/* Merged subvolume, cleanup its reloc root */
2278 			struct btrfs_root *reloc_root = root->reloc_root;
2279 
2280 			list_del_init(&root->reloc_dirty_list);
2281 			root->reloc_root = NULL;
2282 			/*
2283 			 * Need barrier to ensure clear_bit() only happens after
2284 			 * root->reloc_root = NULL. Pairs with have_reloc_root.
2285 			 */
2286 			smp_wmb();
2287 			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
2288 			if (reloc_root) {
2289 				/*
2290 				 * btrfs_drop_snapshot drops the ref we hold
2291 				 * on ->reloc_root.  If it fails, however, we
2292 				 * must drop the ref ourselves.
2293 				 */
2294 				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
2295 				if (ret2 < 0) {
2296 					btrfs_put_root(reloc_root);
2297 					if (!ret)
2298 						ret = ret2;
2299 				}
2300 			}
2301 			btrfs_put_root(root);
2302 		} else {
2303 			/* Orphan reloc tree, just clean it up */
2304 			ret2 = btrfs_drop_snapshot(root, 0, 1);
2305 			if (ret2 < 0) {
2306 				btrfs_put_root(root);
2307 				if (!ret)
2308 					ret = ret2;
2309 			}
2310 		}
2311 	}
2312 	return ret;
2313 }
2314 
2315 /*
2316  * merge the relocated tree blocks in reloc tree with corresponding
2317  * fs tree.
2318  */
2319 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2320 					       struct btrfs_root *root)
2321 {
2322 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2323 	struct btrfs_key key;
2324 	struct btrfs_key next_key;
2325 	struct btrfs_trans_handle *trans = NULL;
2326 	struct btrfs_root *reloc_root;
2327 	struct btrfs_root_item *root_item;
2328 	struct btrfs_path *path;
2329 	struct extent_buffer *leaf;
2330 	int level;
2331 	int max_level;
2332 	int replaced = 0;
2333 	int ret;
2334 	int err = 0;
2335 	u32 min_reserved;
2336 
2337 	path = btrfs_alloc_path();
2338 	if (!path)
2339 		return -ENOMEM;
2340 	path->reada = READA_FORWARD;
2341 
2342 	reloc_root = root->reloc_root;
2343 	root_item = &reloc_root->root_item;
2344 
2345 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2346 		level = btrfs_root_level(root_item);
2347 		atomic_inc(&reloc_root->node->refs);
2348 		path->nodes[level] = reloc_root->node;
2349 		path->slots[level] = 0;
2350 	} else {
2351 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2352 
2353 		level = root_item->drop_level;
2354 		BUG_ON(level == 0);
2355 		path->lowest_level = level;
2356 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2357 		path->lowest_level = 0;
2358 		if (ret < 0) {
2359 			btrfs_free_path(path);
2360 			return ret;
2361 		}
2362 
2363 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2364 				      path->slots[level]);
2365 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2366 
2367 		btrfs_unlock_up_safe(path, 0);
2368 	}
2369 
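	/*
	 * Worst case we must CoW one full-height path in both the fs
	 * tree and the reloc tree per iteration; with the common 16K
	 * nodesize that is 16384 * (8 - 1) * 2 = 229376 bytes.
	 */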
2370 	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2371 	memset(&next_key, 0, sizeof(next_key));
2372 
2373 	while (1) {
2374 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2375 					     BTRFS_RESERVE_FLUSH_ALL);
2376 		if (ret) {
2377 			err = ret;
2378 			goto out;
2379 		}
2380 		trans = btrfs_start_transaction(root, 0);
2381 		if (IS_ERR(trans)) {
2382 			err = PTR_ERR(trans);
2383 			trans = NULL;
2384 			goto out;
2385 		}
2386 
2387 		/*
2388 		 * At this point we no longer have a reloc_control, so we can't
2389 		 * depend on btrfs_init_reloc_root to update our last_trans.
2390 		 *
2391 		 * But that's ok, we started the trans handle on our
2392 		 * corresponding fs_root, which means it's been added to the
2393 		 * dirty list.  At commit time we'll still call
2394 		 * btrfs_update_reloc_root() and update our root item
2395 		 * appropriately.
2396 		 */
2397 		reloc_root->last_trans = trans->transid;
2398 		trans->block_rsv = rc->block_rsv;
2399 
2400 		replaced = 0;
2401 		max_level = level;
2402 
2403 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2404 		if (ret < 0) {
2405 			err = ret;
2406 			goto out;
2407 		}
2408 		if (ret > 0)
2409 			break;
2410 
2411 		if (!find_next_key(path, level, &key) &&
2412 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2413 			ret = 0;
2414 		} else {
2415 			ret = replace_path(trans, rc, root, reloc_root, path,
2416 					   &next_key, level, max_level);
2417 		}
2418 		if (ret < 0) {
2419 			err = ret;
2420 			goto out;
2421 		}
2422 
2423 		if (ret > 0) {
2424 			level = ret;
2425 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2426 					      path->slots[level]);
2427 			replaced = 1;
2428 		}
2429 
2430 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2431 		if (ret > 0)
2432 			break;
2433 
2434 		BUG_ON(level == 0);
2435 		/*
2436 		 * save the merging progress in the drop_progress.
2437 		 * this is OK since root refs == 1 in this case.
2438 		 */
2439 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2440 			       path->slots[level]);
2441 		root_item->drop_level = level;
2442 
2443 		btrfs_end_transaction_throttle(trans);
2444 		trans = NULL;
2445 
2446 		btrfs_btree_balance_dirty(fs_info);
2447 
2448 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2449 			invalidate_extent_cache(root, &key, &next_key);
2450 	}
2451 
2452 	/*
2453 	 * handle the case where only one block in the fs tree needs to
2454 	 * be relocated and that block is the tree root.
2455 	 */
2456 	leaf = btrfs_lock_root_node(root);
2457 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2458 	btrfs_tree_unlock(leaf);
2459 	free_extent_buffer(leaf);
2460 	if (ret < 0)
2461 		err = ret;
2462 out:
2463 	btrfs_free_path(path);
2464 
2465 	if (err == 0)
2466 		insert_dirty_subvol(trans, rc, root);
2467 
2468 	if (trans)
2469 		btrfs_end_transaction_throttle(trans);
2470 
2471 	btrfs_btree_balance_dirty(fs_info);
2472 
2473 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2474 		invalidate_extent_cache(root, &key, &next_key);
2475 
2476 	return err;
2477 }
2478 
2479 static noinline_for_stack
2480 int prepare_to_merge(struct reloc_control *rc, int err)
2481 {
2482 	struct btrfs_root *root = rc->extent_root;
2483 	struct btrfs_fs_info *fs_info = root->fs_info;
2484 	struct btrfs_root *reloc_root;
2485 	struct btrfs_trans_handle *trans;
2486 	LIST_HEAD(reloc_roots);
2487 	u64 num_bytes = 0;
2488 	int ret;
2489 
2490 	mutex_lock(&fs_info->reloc_mutex);
2491 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2492 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2493 	mutex_unlock(&fs_info->reloc_mutex);
2494 
2495 again:
2496 	if (!err) {
2497 		num_bytes = rc->merging_rsv_size;
2498 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2499 					  BTRFS_RESERVE_FLUSH_ALL);
2500 		if (ret)
2501 			err = ret;
2502 	}
2503 
2504 	trans = btrfs_join_transaction(rc->extent_root);
2505 	if (IS_ERR(trans)) {
2506 		if (!err)
2507 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2508 						num_bytes, NULL);
2509 		return PTR_ERR(trans);
2510 	}
2511 
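	/*
	 * rc->merging_rsv_size may have grown while we were blocked in
	 * btrfs_block_rsv_add() above (e.g. a snapshot was created,
	 * adding a new reloc root); if so, release what we reserved and
	 * retry with the larger size.
	 */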
2512 	if (!err) {
2513 		if (num_bytes != rc->merging_rsv_size) {
2514 			btrfs_end_transaction(trans);
2515 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2516 						num_bytes, NULL);
2517 			goto again;
2518 		}
2519 	}
2520 
2521 	rc->merge_reloc_tree = 1;
2522 
2523 	while (!list_empty(&rc->reloc_roots)) {
2524 		reloc_root = list_entry(rc->reloc_roots.next,
2525 					struct btrfs_root, root_list);
2526 		list_del_init(&reloc_root->root_list);
2527 
2528 		root = read_fs_root(fs_info, reloc_root->root_key.offset);
2529 		BUG_ON(IS_ERR(root));
2530 		BUG_ON(root->reloc_root != reloc_root);
2531 
2532 		/*
2533 		 * set reference count to 1, so btrfs_recover_relocation
2534 		 * knows it should resume merging
2535 		 */
2536 		if (!err)
2537 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2538 		btrfs_update_reloc_root(trans, root);
2539 
2540 		list_add(&reloc_root->root_list, &reloc_roots);
2541 		btrfs_put_root(root);
2542 	}
2543 
2544 	list_splice(&reloc_roots, &rc->reloc_roots);
2545 
2546 	if (!err)
2547 		btrfs_commit_transaction(trans);
2548 	else
2549 		btrfs_end_transaction(trans);
2550 	return err;
2551 }
2552 
2553 static noinline_for_stack
2554 void free_reloc_roots(struct list_head *list)
2555 {
2556 	struct btrfs_root *reloc_root;
2557 
2558 	while (!list_empty(list)) {
2559 		reloc_root = list_entry(list->next, struct btrfs_root,
2560 					root_list);
2561 		__del_reloc_root(reloc_root);
2562 	}
2563 }
2564 
2565 static noinline_for_stack
2566 void merge_reloc_roots(struct reloc_control *rc)
2567 {
2568 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2569 	struct btrfs_root *root;
2570 	struct btrfs_root *reloc_root;
2571 	LIST_HEAD(reloc_roots);
2572 	int found = 0;
2573 	int ret = 0;
2574 again:
2575 	root = rc->extent_root;
2576 
2577 	/*
2578 	 * this serializes us with btrfs_record_root_in_transaction;
2579 	 * we have to make sure nobody is in the middle of
2580 	 * adding their roots to the list while we are
2581 	 * doing this splice.
2582 	 */
2583 	mutex_lock(&fs_info->reloc_mutex);
2584 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2585 	mutex_unlock(&fs_info->reloc_mutex);
2586 
2587 	while (!list_empty(&reloc_roots)) {
2588 		found = 1;
2589 		reloc_root = list_entry(reloc_roots.next,
2590 					struct btrfs_root, root_list);
2591 
2592 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2593 			root = read_fs_root(fs_info,
2594 					    reloc_root->root_key.offset);
2595 			BUG_ON(IS_ERR(root));
2596 			BUG_ON(root->reloc_root != reloc_root);
2597 
2598 			ret = merge_reloc_root(rc, root);
2599 			btrfs_put_root(root);
2600 			if (ret) {
2601 				if (list_empty(&reloc_root->root_list))
2602 					list_add_tail(&reloc_root->root_list,
2603 						      &reloc_roots);
2604 				goto out;
2605 			}
2606 		} else {
2607 			list_del_init(&reloc_root->root_list);
2608 			/* Don't forget to queue this reloc root for cleanup */
2609 			list_add_tail(&reloc_root->reloc_dirty_list,
2610 				      &rc->dirty_subvol_roots);
2611 		}
2612 	}
2613 
2614 	if (found) {
2615 		found = 0;
2616 		goto again;
2617 	}
2618 out:
2619 	if (ret) {
2620 		btrfs_handle_fs_error(fs_info, ret, NULL);
2621 		if (!list_empty(&reloc_roots))
2622 			free_reloc_roots(&reloc_roots);
2623 
2624 		/* new reloc root may be added */
2625 		mutex_lock(&fs_info->reloc_mutex);
2626 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2627 		mutex_unlock(&fs_info->reloc_mutex);
2628 		if (!list_empty(&reloc_roots))
2629 			free_reloc_roots(&reloc_roots);
2630 	}
2631 
2632 	/*
2633 	 * We used to have
2634 	 *
2635 	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2636 	 *
2637 	 * here, but it's wrong.  If we fail to start the transaction in
2638 	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2639 	 * have actually been removed from the reloc_root_tree rb tree.  This is
2640 	 * fine because we're bailing here, and we hold a reference on the root
2641 	 * for the list that holds it, so these roots will be cleaned up when we
2642 	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
2643 	 * will be cleaned up on unmount.
2644 	 *
2645 	 * The remaining nodes will be cleaned up by free_reloc_control.
2646 	 */
2647 }
2648 
2649 static void free_block_list(struct rb_root *blocks)
2650 {
2651 	struct tree_block *block;
2652 	struct rb_node *rb_node;
2653 	while ((rb_node = rb_first(blocks))) {
2654 		block = rb_entry(rb_node, struct tree_block, rb_node);
2655 		rb_erase(rb_node, blocks);
2656 		kfree(block);
2657 	}
2658 }
2659 
2660 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2661 				      struct btrfs_root *reloc_root)
2662 {
2663 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2664 	struct btrfs_root *root;
2665 	int ret;
2666 
2667 	if (reloc_root->last_trans == trans->transid)
2668 		return 0;
2669 
2670 	root = read_fs_root(fs_info, reloc_root->root_key.offset);
2671 	BUG_ON(IS_ERR(root));
2672 	BUG_ON(root->reloc_root != reloc_root);
2673 	ret = btrfs_record_root_in_trans(trans, root);
2674 	btrfs_put_root(root);
2675 
2676 	return ret;
2677 }
2678 
2679 static noinline_for_stack
2680 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2681 				     struct reloc_control *rc,
2682 				     struct btrfs_backref_node *node,
2683 				     struct btrfs_backref_edge *edges[])
2684 {
2685 	struct btrfs_backref_node *next;
2686 	struct btrfs_root *root;
2687 	int index = 0;
2688 
2689 	next = node;
2690 	while (1) {
2691 		cond_resched();
2692 		next = walk_up_backref(next, edges, &index);
2693 		root = next->root;
2694 		BUG_ON(!root);
2695 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2696 
2697 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2698 			record_reloc_root_in_trans(trans, root);
2699 			break;
2700 		}
2701 
2702 		btrfs_record_root_in_trans(trans, root);
2703 		root = root->reloc_root;
2704 
2705 		if (next->new_bytenr != root->node->start) {
2706 			BUG_ON(next->new_bytenr);
2707 			BUG_ON(!list_empty(&next->list));
2708 			next->new_bytenr = root->node->start;
2709 			btrfs_put_root(next->root);
2710 			next->root = btrfs_grab_root(root);
2711 			ASSERT(next->root);
2712 			list_add_tail(&next->list,
2713 				      &rc->backref_cache.changed);
2714 			mark_block_processed(rc, next);
2715 			break;
2716 		}
2717 
2718 		WARN_ON(1);
2719 		root = NULL;
2720 		next = walk_down_backref(edges, &index);
2721 		if (!next || next->level <= node->level)
2722 			break;
2723 	}
2724 	if (!root)
2725 		return NULL;
2726 
2727 	next = node;
2728 	/* setup backref node path for btrfs_reloc_cow_block */
2729 	while (1) {
2730 		rc->backref_cache.path[next->level] = next;
2731 		if (--index < 0)
2732 			break;
2733 		next = edges[index]->node[UPPER];
2734 	}
2735 	return root;
2736 }
2737 
2738 /*
2739  * select a tree root for relocation. return NULL if the block
2740  * is reference counted. we should use do_relocation() in this
2741  * case. return a tree root pointer if the block isn't reference
2742  * counted. return -ENOENT if the block is the root of a reloc tree.
2743  */
2744 static noinline_for_stack
2745 struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2746 {
2747 	struct btrfs_backref_node *next;
2748 	struct btrfs_root *root;
2749 	struct btrfs_root *fs_root = NULL;
2750 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2751 	int index = 0;
2752 
2753 	next = node;
2754 	while (1) {
2755 		cond_resched();
2756 		next = walk_up_backref(next, edges, &index);
2757 		root = next->root;
2758 		BUG_ON(!root);
2759 
2760 		/* no other choice for a non-reference-counted tree */
2761 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2762 			return root;
2763 
2764 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2765 			fs_root = root;
2766 
2767 		if (next != node)
2768 			return NULL;
2769 
2770 		next = walk_down_backref(edges, &index);
2771 		if (!next || next->level <= node->level)
2772 			break;
2773 	}
2774 
2775 	if (!fs_root)
2776 		return ERR_PTR(-ENOENT);
2777 	return fs_root;
2778 }
2779 
2780 static noinline_for_stack
2781 u64 calcu_metadata_size(struct reloc_control *rc,
2782 			struct btrfs_backref_node *node, int reserve)
2783 {
2784 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2785 	struct btrfs_backref_node *next = node;
2786 	struct btrfs_backref_edge *edge;
2787 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2788 	u64 num_bytes = 0;
2789 	int index = 0;
2790 
2791 	BUG_ON(reserve && node->processed);
2792 
2793 	while (next) {
2794 		cond_resched();
2795 		while (1) {
2796 			if (next->processed && (reserve || next != node))
2797 				break;
2798 
2799 			num_bytes += fs_info->nodesize;
2800 
2801 			if (list_empty(&next->upper))
2802 				break;
2803 
2804 			edge = list_entry(next->upper.next,
2805 					struct btrfs_backref_edge, list[LOWER]);
2806 			edges[index++] = edge;
2807 			next = edge->node[UPPER];
2808 		}
2809 		next = walk_down_backref(edges, &index);
2810 	}
2811 	return num_bytes;
2812 }
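
/*
 * Example: a level-0 node whose only backref chain has three not yet
 * processed parents makes the walk above count four blocks, so with a
 * 16K nodesize calcu_metadata_size(rc, node, 1) returns 4 * 16384 =
 * 65536 bytes (reserve_metadata_space() below doubles this again).
 */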
2813 
2814 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2815 				  struct reloc_control *rc,
2816 				  struct btrfs_backref_node *node)
2817 {
2818 	struct btrfs_root *root = rc->extent_root;
2819 	struct btrfs_fs_info *fs_info = root->fs_info;
2820 	u64 num_bytes;
2821 	int ret;
2822 	u64 tmp;
2823 
2824 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2825 
2826 	trans->block_rsv = rc->block_rsv;
2827 	rc->reserved_bytes += num_bytes;
2828 
2829 	/*
2830 	 * We are under a transaction here so we can only do limited flushing.
2831 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2832 	 * transaction and try to refill when we can flush all the things.
2833 	 */
2834 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2835 				BTRFS_RESERVE_FLUSH_LIMIT);
2836 	if (ret) {
2837 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2838 		while (tmp <= rc->reserved_bytes)
2839 			tmp <<= 1;
2840 		/*
2841 		 * only one thread can access block_rsv at this point,
2842 		 * so we don't need to hold a lock to protect block_rsv.
2843 		 * we expand the reservation size here to allow enough
2844 		 * space for relocation, and we will return earlier in
2845 		 * the enospc case.
2846 		 */
2847 		rc->block_rsv->size = tmp + fs_info->nodesize *
2848 				      RELOCATION_RESERVED_NODES;
2849 		return -EAGAIN;
2850 	}
2851 
2852 	return 0;
2853 }
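
/*
 * Example of the -EAGAIN expansion above: the base step is
 * nodesize * RELOCATION_RESERVED_NODES, i.e. 16384 * 256 = 4 MiB with
 * a 16K nodesize.  If rc->reserved_bytes has grown to 6 MiB, tmp
 * doubles once to 8 MiB and block_rsv->size is raised to
 * 8 MiB + 4 MiB = 12 MiB before the caller drops the transaction and
 * retries.
 */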
2854 
2855 /*
2856  * relocate a tree block, and then update pointers in upper level
2857  * blocks that reference the block to point to the new location.
2858  *
2859  * if called by link_to_upper, the block has already been relocated.
2860  * in that case this function just updates pointers.
2861  */
2862 static int do_relocation(struct btrfs_trans_handle *trans,
2863 			 struct reloc_control *rc,
2864 			 struct btrfs_backref_node *node,
2865 			 struct btrfs_key *key,
2866 			 struct btrfs_path *path, int lowest)
2867 {
2868 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2869 	struct btrfs_backref_node *upper;
2870 	struct btrfs_backref_edge *edge;
2871 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2872 	struct btrfs_root *root;
2873 	struct extent_buffer *eb;
2874 	u32 blocksize;
2875 	u64 bytenr;
2876 	u64 generation;
2877 	int slot;
2878 	int ret;
2879 	int err = 0;
2880 
2881 	BUG_ON(lowest && node->eb);
2882 
2883 	path->lowest_level = node->level + 1;
2884 	rc->backref_cache.path[node->level] = node;
2885 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2886 		struct btrfs_key first_key;
2887 		struct btrfs_ref ref = { 0 };
2888 
2889 		cond_resched();
2890 
2891 		upper = edge->node[UPPER];
2892 		root = select_reloc_root(trans, rc, upper, edges);
2893 		BUG_ON(!root);
2894 
2895 		if (upper->eb && !upper->locked) {
2896 			if (!lowest) {
2897 				ret = btrfs_bin_search(upper->eb, key,
2898 						       upper->level, &slot);
2899 				if (ret < 0) {
2900 					err = ret;
2901 					goto next;
2902 				}
2903 				BUG_ON(ret);
2904 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2905 				if (node->eb->start == bytenr)
2906 					goto next;
2907 			}
2908 			drop_node_buffer(upper);
2909 		}
2910 
2911 		if (!upper->eb) {
2912 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2913 			if (ret) {
2914 				if (ret < 0)
2915 					err = ret;
2916 				else
2917 					err = -ENOENT;
2918 
2919 				btrfs_release_path(path);
2920 				break;
2921 			}
2922 
2923 			if (!upper->eb) {
2924 				upper->eb = path->nodes[upper->level];
2925 				path->nodes[upper->level] = NULL;
2926 			} else {
2927 				BUG_ON(upper->eb != path->nodes[upper->level]);
2928 			}
2929 
2930 			upper->locked = 1;
2931 			path->locks[upper->level] = 0;
2932 
2933 			slot = path->slots[upper->level];
2934 			btrfs_release_path(path);
2935 		} else {
2936 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2937 					       &slot);
2938 			if (ret < 0) {
2939 				err = ret;
2940 				goto next;
2941 			}
2942 			BUG_ON(ret);
2943 		}
2944 
2945 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2946 		if (lowest) {
2947 			if (bytenr != node->bytenr) {
2948 				btrfs_err(root->fs_info,
2949 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2950 					  bytenr, node->bytenr, slot,
2951 					  upper->eb->start);
2952 				err = -EIO;
2953 				goto next;
2954 			}
2955 		} else {
2956 			if (node->eb->start == bytenr)
2957 				goto next;
2958 		}
2959 
2960 		blocksize = root->fs_info->nodesize;
2961 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2962 		btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2963 		eb = read_tree_block(fs_info, bytenr, generation,
2964 				     upper->level - 1, &first_key);
2965 		if (IS_ERR(eb)) {
2966 			err = PTR_ERR(eb);
2967 			goto next;
2968 		} else if (!extent_buffer_uptodate(eb)) {
2969 			free_extent_buffer(eb);
2970 			err = -EIO;
2971 			goto next;
2972 		}
2973 		btrfs_tree_lock(eb);
2974 		btrfs_set_lock_blocking_write(eb);
2975 
2976 		if (!node->eb) {
2977 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2978 					      slot, &eb);
2979 			btrfs_tree_unlock(eb);
2980 			free_extent_buffer(eb);
2981 			if (ret < 0) {
2982 				err = ret;
2983 				goto next;
2984 			}
2985 			BUG_ON(node->eb != eb);
2986 		} else {
2987 			btrfs_set_node_blockptr(upper->eb, slot,
2988 						node->eb->start);
2989 			btrfs_set_node_ptr_generation(upper->eb, slot,
2990 						      trans->transid);
2991 			btrfs_mark_buffer_dirty(upper->eb);
2992 
2993 			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2994 					       node->eb->start, blocksize,
2995 					       upper->eb->start);
2996 			ref.real_root = root->root_key.objectid;
2997 			btrfs_init_tree_ref(&ref, node->level,
2998 					    btrfs_header_owner(upper->eb));
2999 			ret = btrfs_inc_extent_ref(trans, &ref);
3000 			BUG_ON(ret);
3001 
3002 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
3003 			BUG_ON(ret);
3004 		}
3005 next:
3006 		if (!upper->pending)
3007 			drop_node_buffer(upper);
3008 		else
3009 			unlock_node_buffer(upper);
3010 		if (err)
3011 			break;
3012 	}
3013 
3014 	if (!err && node->pending) {
3015 		drop_node_buffer(node);
3016 		list_move_tail(&node->list, &rc->backref_cache.changed);
3017 		node->pending = 0;
3018 	}
3019 
3020 	path->lowest_level = 0;
3021 	BUG_ON(err == -ENOSPC);
3022 	return err;
3023 }
3024 
3025 static int link_to_upper(struct btrfs_trans_handle *trans,
3026 			 struct reloc_control *rc,
3027 			 struct btrfs_backref_node *node,
3028 			 struct btrfs_path *path)
3029 {
3030 	struct btrfs_key key;
3031 
3032 	btrfs_node_key_to_cpu(node->eb, &key, 0);
3033 	return do_relocation(trans, rc, node, &key, path, 0);
3034 }
3035 
3036 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
3037 				struct reloc_control *rc,
3038 				struct btrfs_path *path, int err)
3039 {
3040 	LIST_HEAD(list);
3041 	struct btrfs_backref_cache *cache = &rc->backref_cache;
3042 	struct btrfs_backref_node *node;
3043 	int level;
3044 	int ret;
3045 
3046 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3047 		while (!list_empty(&cache->pending[level])) {
3048 			node = list_entry(cache->pending[level].next,
3049 					  struct btrfs_backref_node, list);
3050 			list_move_tail(&node->list, &list);
3051 			BUG_ON(!node->pending);
3052 
3053 			if (!err) {
3054 				ret = link_to_upper(trans, rc, node, path);
3055 				if (ret < 0)
3056 					err = ret;
3057 			}
3058 		}
3059 		list_splice_init(&list, &cache->pending[level]);
3060 	}
3061 	return err;
3062 }
3063 
3064 /*
3065  * mark a block and all blocks that directly/indirectly reference
3066  * the block as processed.
3067  */
3068 static void update_processed_blocks(struct reloc_control *rc,
3069 				    struct btrfs_backref_node *node)
3070 {
3071 	struct btrfs_backref_node *next = node;
3072 	struct btrfs_backref_edge *edge;
3073 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3074 	int index = 0;
3075 
3076 	while (next) {
3077 		cond_resched();
3078 		while (1) {
3079 			if (next->processed)
3080 				break;
3081 
3082 			mark_block_processed(rc, next);
3083 
3084 			if (list_empty(&next->upper))
3085 				break;
3086 
3087 			edge = list_entry(next->upper.next,
3088 					struct btrfs_backref_edge, list[LOWER]);
3089 			edges[index++] = edge;
3090 			next = edge->node[UPPER];
3091 		}
3092 		next = walk_down_backref(edges, &index);
3093 	}
3094 }
3095 
3096 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
3097 {
3098 	u32 blocksize = rc->extent_root->fs_info->nodesize;
3099 
3100 	if (test_range_bit(&rc->processed_blocks, bytenr,
3101 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
3102 		return 1;
3103 	return 0;
3104 }
3105 
3106 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
3107 			      struct tree_block *block)
3108 {
3109 	struct extent_buffer *eb;
3110 
3111 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
3112 			     block->level, NULL);
3113 	if (IS_ERR(eb)) {
3114 		return PTR_ERR(eb);
3115 	} else if (!extent_buffer_uptodate(eb)) {
3116 		free_extent_buffer(eb);
3117 		return -EIO;
3118 	}
3119 	if (block->level == 0)
3120 		btrfs_item_key_to_cpu(eb, &block->key, 0);
3121 	else
3122 		btrfs_node_key_to_cpu(eb, &block->key, 0);
3123 	free_extent_buffer(eb);
3124 	block->key_ready = 1;
3125 	return 0;
3126 }
3127 
3128 /*
3129  * helper function to relocate a tree block
3130  */
3131 static int relocate_tree_block(struct btrfs_trans_handle *trans,
3132 				struct reloc_control *rc,
3133 				struct btrfs_backref_node *node,
3134 				struct btrfs_key *key,
3135 				struct btrfs_path *path)
3136 {
3137 	struct btrfs_root *root;
3138 	int ret = 0;
3139 
3140 	if (!node)
3141 		return 0;
3142 
3143 	/*
3144 	 * If we fail here we want to drop our backref_node because we are going
3145 	 * to start over and regenerate the tree for it.
3146 	 */
3147 	ret = reserve_metadata_space(trans, rc, node);
3148 	if (ret)
3149 		goto out;
3150 
3151 	BUG_ON(node->processed);
3152 	root = select_one_root(node);
3153 	if (root == ERR_PTR(-ENOENT)) {
3154 		update_processed_blocks(rc, node);
3155 		goto out;
3156 	}
3157 
3158 	if (root) {
3159 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3160 			BUG_ON(node->new_bytenr);
3161 			BUG_ON(!list_empty(&node->list));
3162 			btrfs_record_root_in_trans(trans, root);
3163 			root = root->reloc_root;
3164 			node->new_bytenr = root->node->start;
3165 			btrfs_put_root(node->root);
3166 			node->root = btrfs_grab_root(root);
3167 			ASSERT(node->root);
3168 			list_add_tail(&node->list, &rc->backref_cache.changed);
3169 		} else {
3170 			path->lowest_level = node->level;
3171 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3172 			btrfs_release_path(path);
3173 			if (ret > 0)
3174 				ret = 0;
3175 		}
3176 		if (!ret)
3177 			update_processed_blocks(rc, node);
3178 	} else {
3179 		ret = do_relocation(trans, rc, node, key, path, 1);
3180 	}
3181 out:
3182 	if (ret || node->level == 0 || node->cowonly)
3183 		remove_backref_node(&rc->backref_cache, node);
3184 	return ret;
3185 }
3186 
3187 /*
3188  * relocate a list of blocks
3189  */
3190 static noinline_for_stack
3191 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3192 			 struct reloc_control *rc, struct rb_root *blocks)
3193 {
3194 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3195 	struct btrfs_backref_node *node;
3196 	struct btrfs_path *path;
3197 	struct tree_block *block;
3198 	struct tree_block *next;
3199 	int ret;
3200 	int err = 0;
3201 
3202 	path = btrfs_alloc_path();
3203 	if (!path) {
3204 		err = -ENOMEM;
3205 		goto out_free_blocks;
3206 	}
3207 
3208 	/* Kick in readahead for tree blocks with missing keys */
3209 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3210 		if (!block->key_ready)
3211 			readahead_tree_block(fs_info, block->bytenr);
3212 	}
3213 
3214 	/* Get first keys */
3215 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3216 		if (!block->key_ready) {
3217 			err = get_tree_block_key(fs_info, block);
3218 			if (err)
3219 				goto out_free_path;
3220 		}
3221 	}
3222 
3223 	/* Do tree relocation */
3224 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3225 		node = build_backref_tree(rc, &block->key,
3226 					  block->level, block->bytenr);
3227 		if (IS_ERR(node)) {
3228 			err = PTR_ERR(node);
3229 			goto out;
3230 		}
3231 
3232 		ret = relocate_tree_block(trans, rc, node, &block->key,
3233 					  path);
3234 		if (ret < 0) {
3235 			err = ret;
3236 			break;
3237 		}
3238 	}
3239 out:
3240 	err = finish_pending_nodes(trans, rc, path, err);
3241 
3242 out_free_path:
3243 	btrfs_free_path(path);
3244 out_free_blocks:
3245 	free_block_list(blocks);
3246 	return err;
3247 }
3248 
3249 static noinline_for_stack
3250 int prealloc_file_extent_cluster(struct inode *inode,
3251 				 struct file_extent_cluster *cluster)
3252 {
3253 	u64 alloc_hint = 0;
3254 	u64 start;
3255 	u64 end;
3256 	u64 offset = BTRFS_I(inode)->index_cnt;
3257 	u64 num_bytes;
3258 	int nr = 0;
3259 	int ret = 0;
3260 	u64 prealloc_start = cluster->start - offset;
3261 	u64 prealloc_end = cluster->end - offset;
3262 	u64 cur_offset;
3263 	struct extent_changeset *data_reserved = NULL;
3264 
3265 	BUG_ON(cluster->start != cluster->boundary[0]);
3266 	inode_lock(inode);
3267 
3268 	ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3269 					  prealloc_end + 1 - prealloc_start);
3270 	if (ret)
3271 		goto out;
3272 
3273 	cur_offset = prealloc_start;
3274 	while (nr < cluster->nr) {
3275 		start = cluster->boundary[nr] - offset;
3276 		if (nr + 1 < cluster->nr)
3277 			end = cluster->boundary[nr + 1] - 1 - offset;
3278 		else
3279 			end = cluster->end - offset;
3280 
3281 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3282 		num_bytes = end + 1 - start;
3283 		if (cur_offset < start)
3284 			btrfs_free_reserved_data_space(inode, data_reserved,
3285 					cur_offset, start - cur_offset);
3286 		ret = btrfs_prealloc_file_range(inode, 0, start,
3287 						num_bytes, num_bytes,
3288 						end + 1, &alloc_hint);
3289 		cur_offset = end + 1;
3290 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3291 		if (ret)
3292 			break;
3293 		nr++;
3294 	}
3295 	if (cur_offset < prealloc_end)
3296 		btrfs_free_reserved_data_space(inode, data_reserved,
3297 				cur_offset, prealloc_end + 1 - cur_offset);
3298 out:
3299 	inode_unlock(inode);
3300 	extent_changeset_free(data_reserved);
3301 	return ret;
3302 }
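
/*
 * Worked example of the offset arithmetic above: a cluster with
 * start = 1M, end = 1M + 256K - 1, boundary[] = { 1M, 1M + 128K } and
 * index_cnt (offset) = 1M preallocates two ranges in the data reloc
 * inode, [0, 128K) and [128K, 256K), i.e. the cluster shifted down to
 * file position 0.
 */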
3303 
3304 static noinline_for_stack
3305 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3306 			 u64 block_start)
3307 {
3308 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3309 	struct extent_map *em;
3310 	int ret = 0;
3311 
3312 	em = alloc_extent_map();
3313 	if (!em)
3314 		return -ENOMEM;
3315 
3316 	em->start = start;
3317 	em->len = end + 1 - start;
3318 	em->block_len = em->len;
3319 	em->block_start = block_start;
3320 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3321 
3322 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3323 	while (1) {
3324 		write_lock(&em_tree->lock);
3325 		ret = add_extent_mapping(em_tree, em, 0);
3326 		write_unlock(&em_tree->lock);
3327 		if (ret != -EEXIST) {
3328 			free_extent_map(em);
3329 			break;
3330 		}
3331 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3332 	}
3333 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3334 	return ret;
3335 }
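
/*
 * add_extent_mapping() returns -EEXIST whenever any cached mapping
 * overlaps [start, end], so the loop above keeps dropping the stale
 * cache for that range until the new mapping, pinned so it survives
 * until the relocated data is written out, goes in.
 */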
3336 
3337 /*
3338  * Allow error injection to test balance cancellation
3339  */
3340 int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
3341 {
3342 	return atomic_read(&fs_info->balance_cancel_req);
3343 }
3344 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
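
/*
 * With CONFIG_FUNCTION_ERROR_INJECTION, the annotation above lets test
 * harnesses force a non-zero return from this function (for instance
 * via the fail_function debugfs interface or bpf_override_return()) to
 * exercise the -ECANCELED unwinding paths below.
 */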
3345 
3346 static int relocate_file_extent_cluster(struct inode *inode,
3347 					struct file_extent_cluster *cluster)
3348 {
3349 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3350 	u64 page_start;
3351 	u64 page_end;
3352 	u64 offset = BTRFS_I(inode)->index_cnt;
3353 	unsigned long index;
3354 	unsigned long last_index;
3355 	struct page *page;
3356 	struct file_ra_state *ra;
3357 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3358 	int nr = 0;
3359 	int ret = 0;
3360 
3361 	if (!cluster->nr)
3362 		return 0;
3363 
3364 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3365 	if (!ra)
3366 		return -ENOMEM;
3367 
3368 	ret = prealloc_file_extent_cluster(inode, cluster);
3369 	if (ret)
3370 		goto out;
3371 
3372 	file_ra_state_init(ra, inode->i_mapping);
3373 
3374 	ret = setup_extent_mapping(inode, cluster->start - offset,
3375 				   cluster->end - offset, cluster->start);
3376 	if (ret)
3377 		goto out;
3378 
3379 	index = (cluster->start - offset) >> PAGE_SHIFT;
3380 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3381 	while (index <= last_index) {
3382 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3383 				PAGE_SIZE);
3384 		if (ret)
3385 			goto out;
3386 
3387 		page = find_lock_page(inode->i_mapping, index);
3388 		if (!page) {
3389 			page_cache_sync_readahead(inode->i_mapping,
3390 						  ra, NULL, index,
3391 						  last_index + 1 - index);
3392 			page = find_or_create_page(inode->i_mapping, index,
3393 						   mask);
3394 			if (!page) {
3395 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3396 							PAGE_SIZE, true);
3397 				btrfs_delalloc_release_extents(BTRFS_I(inode),
3398 							PAGE_SIZE);
3399 				ret = -ENOMEM;
3400 				goto out;
3401 			}
3402 		}
3403 
3404 		if (PageReadahead(page)) {
3405 			page_cache_async_readahead(inode->i_mapping,
3406 						   ra, NULL, page, index,
3407 						   last_index + 1 - index);
3408 		}
3409 
3410 		if (!PageUptodate(page)) {
3411 			btrfs_readpage(NULL, page);
3412 			lock_page(page);
3413 			if (!PageUptodate(page)) {
3414 				unlock_page(page);
3415 				put_page(page);
3416 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3417 							PAGE_SIZE, true);
3418 				btrfs_delalloc_release_extents(BTRFS_I(inode),
3419 							       PAGE_SIZE);
3420 				ret = -EIO;
3421 				goto out;
3422 			}
3423 		}
3424 
3425 		page_start = page_offset(page);
3426 		page_end = page_start + PAGE_SIZE - 1;
3427 
3428 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3429 
3430 		set_page_extent_mapped(page);
3431 
3432 		if (nr < cluster->nr &&
3433 		    page_start + offset == cluster->boundary[nr]) {
3434 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3435 					page_start, page_end,
3436 					EXTENT_BOUNDARY);
3437 			nr++;
3438 		}
3439 
3440 		ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
3441 						NULL);
3442 		if (ret) {
3443 			unlock_page(page);
3444 			put_page(page);
3445 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
3446 							 PAGE_SIZE, true);
3447 			btrfs_delalloc_release_extents(BTRFS_I(inode),
3448 						       PAGE_SIZE);
3449 
3450 			clear_extent_bits(&BTRFS_I(inode)->io_tree,
3451 					  page_start, page_end,
3452 					  EXTENT_LOCKED | EXTENT_BOUNDARY);
3453 			goto out;
3455 		}
3456 		set_page_dirty(page);
3457 
3458 		unlock_extent(&BTRFS_I(inode)->io_tree,
3459 			      page_start, page_end);
3460 		unlock_page(page);
3461 		put_page(page);
3462 
3463 		index++;
3464 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
3465 		balance_dirty_pages_ratelimited(inode->i_mapping);
3466 		btrfs_throttle(fs_info);
3467 		if (btrfs_should_cancel_balance(fs_info)) {
3468 			ret = -ECANCELED;
3469 			goto out;
3470 		}
3471 	}
3472 	WARN_ON(nr != cluster->nr);
3473 out:
3474 	kfree(ra);
3475 	return ret;
3476 }
3477 
3478 static noinline_for_stack
3479 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3480 			 struct file_extent_cluster *cluster)
3481 {
3482 	int ret;
3483 
3484 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3485 		ret = relocate_file_extent_cluster(inode, cluster);
3486 		if (ret)
3487 			return ret;
3488 		cluster->nr = 0;
3489 	}
3490 
3491 	if (!cluster->nr)
3492 		cluster->start = extent_key->objectid;
3493 	else
3494 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3495 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3496 	cluster->boundary[cluster->nr] = extent_key->objectid;
3497 	cluster->nr++;
3498 
3499 	if (cluster->nr >= MAX_EXTENTS) {
3500 		ret = relocate_file_extent_cluster(inode, cluster);
3501 		if (ret)
3502 			return ret;
3503 		cluster->nr = 0;
3504 	}
3505 	return 0;
3506 }
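
/*
 * Example of the batching above: physically contiguous data extents
 * keep extending one cluster, since each new extent starts exactly at
 * cluster->end + 1.  The cluster is flushed through
 * relocate_file_extent_cluster() either when a gap shows up or when
 * boundary[] fills at MAX_EXTENTS (128) entries.
 */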
3507 
3508 /*
3509  * helper to add a tree block to the list.
3510  * the major work is getting the generation and level of the block
3511  */
3512 static int add_tree_block(struct reloc_control *rc,
3513 			  struct btrfs_key *extent_key,
3514 			  struct btrfs_path *path,
3515 			  struct rb_root *blocks)
3516 {
3517 	struct extent_buffer *eb;
3518 	struct btrfs_extent_item *ei;
3519 	struct btrfs_tree_block_info *bi;
3520 	struct tree_block *block;
3521 	struct rb_node *rb_node;
3522 	u32 item_size;
3523 	int level = -1;
3524 	u64 generation;
3525 
3526 	eb = path->nodes[0];
3527 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3528 
3529 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3530 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3531 		ei = btrfs_item_ptr(eb, path->slots[0],
3532 				struct btrfs_extent_item);
3533 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3534 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3535 			level = btrfs_tree_block_level(eb, bi);
3536 		} else {
3537 			level = (int)extent_key->offset;
3538 		}
3539 		generation = btrfs_extent_generation(eb, ei);
3540 	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3541 		btrfs_print_v0_err(eb->fs_info);
3542 		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3543 		return -EINVAL;
3544 	} else {
3545 		BUG();
3546 	}
3547 
3548 	btrfs_release_path(path);
3549 
3550 	BUG_ON(level == -1);
3551 
3552 	block = kmalloc(sizeof(*block), GFP_NOFS);
3553 	if (!block)
3554 		return -ENOMEM;
3555 
3556 	block->bytenr = extent_key->objectid;
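	/*
	 * block->key is scratch space until get_tree_block_key() reads
	 * the real first key: objectid temporarily holds the blocksize
	 * and offset holds the generation.
	 */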
3557 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3558 	block->key.offset = generation;
3559 	block->level = level;
3560 	block->key_ready = 0;
3561 
3562 	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
3563 	if (rb_node)
3564 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3565 
3566 	return 0;
3567 }
3568 
3569 /*
3570  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3571  */
3572 static int __add_tree_block(struct reloc_control *rc,
3573 			    u64 bytenr, u32 blocksize,
3574 			    struct rb_root *blocks)
3575 {
3576 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3577 	struct btrfs_path *path;
3578 	struct btrfs_key key;
3579 	int ret;
3580 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3581 
3582 	if (tree_block_processed(bytenr, rc))
3583 		return 0;
3584 
3585 	if (rb_simple_search(blocks, bytenr))
3586 		return 0;
3587 
3588 	path = btrfs_alloc_path();
3589 	if (!path)
3590 		return -ENOMEM;
3591 again:
3592 	key.objectid = bytenr;
3593 	if (skinny) {
3594 		key.type = BTRFS_METADATA_ITEM_KEY;
3595 		key.offset = (u64)-1;
3596 	} else {
3597 		key.type = BTRFS_EXTENT_ITEM_KEY;
3598 		key.offset = blocksize;
3599 	}
3600 
3601 	path->search_commit_root = 1;
3602 	path->skip_locking = 1;
3603 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3604 	if (ret < 0)
3605 		goto out;
3606 
3607 	if (ret > 0 && skinny) {
3608 		if (path->slots[0]) {
3609 			path->slots[0]--;
3610 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3611 					      path->slots[0]);
3612 			if (key.objectid == bytenr &&
3613 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3614 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3615 			      key.offset == blocksize)))
3616 				ret = 0;
3617 		}
3618 
3619 		if (ret) {
3620 			skinny = false;
3621 			btrfs_release_path(path);
3622 			goto again;
3623 		}
3624 	}
3625 	if (ret) {
3626 		ASSERT(ret == 1);
3627 		btrfs_print_leaf(path->nodes[0]);
3628 		btrfs_err(fs_info,
3629 	     "tree block extent item (%llu) is not found in extent tree",
3630 		     bytenr);
3631 		WARN_ON(1);
3632 		ret = -EINVAL;
3633 		goto out;
3634 	}
3635 
3636 	ret = add_tree_block(rc, &key, path, blocks);
3637 out:
3638 	btrfs_free_path(path);
3639 	return ret;
3640 }
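
/*
 * The lookup above tries the skinny (bytenr, BTRFS_METADATA_ITEM_KEY,
 * level) form first and, on a miss, falls back to the pre-skinny
 * (bytenr, BTRFS_EXTENT_ITEM_KEY, blocksize) form, so extent items
 * created before SKINNY_METADATA was enabled are still found.
 */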
3641 
3642 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3643 				    struct btrfs_block_group *block_group,
3644 				    struct inode *inode,
3645 				    u64 ino)
3646 {
3647 	struct btrfs_key key;
3648 	struct btrfs_root *root = fs_info->tree_root;
3649 	struct btrfs_trans_handle *trans;
3650 	int ret = 0;
3651 
3652 	if (inode)
3653 		goto truncate;
3654 
3655 	key.objectid = ino;
3656 	key.type = BTRFS_INODE_ITEM_KEY;
3657 	key.offset = 0;
3658 
3659 	inode = btrfs_iget(fs_info->sb, &key, root);
3660 	if (IS_ERR(inode))
3661 		return -ENOENT;
3662 
3663 truncate:
3664 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3665 						 &fs_info->global_block_rsv);
3666 	if (ret)
3667 		goto out;
3668 
3669 	trans = btrfs_join_transaction(root);
3670 	if (IS_ERR(trans)) {
3671 		ret = PTR_ERR(trans);
3672 		goto out;
3673 	}
3674 
3675 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3676 
3677 	btrfs_end_transaction(trans);
3678 	btrfs_btree_balance_dirty(fs_info);
3679 out:
3680 	iput(inode);
3681 	return ret;
3682 }
3683 
3684 /*
3685  * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3686  * cache inode, so its data extents cannot block data relocation.
3687  */
3688 static int delete_v1_space_cache(struct extent_buffer *leaf,
3689 				 struct btrfs_block_group *block_group,
3690 				 u64 data_bytenr)
3691 {
3692 	u64 space_cache_ino;
3693 	struct btrfs_file_extent_item *ei;
3694 	struct btrfs_key key;
3695 	bool found = false;
3696 	int i;
3697 	int ret;
3698 
3699 	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3700 		return 0;
3701 
3702 	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3703 		btrfs_item_key_to_cpu(leaf, &key, i);
3704 		if (key.type != BTRFS_EXTENT_DATA_KEY)
3705 			continue;
3706 		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3707 		if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
3708 		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3709 			found = true;
3710 			space_cache_ino = key.objectid;
3711 			break;
3712 		}
3713 	}
3714 	if (!found)
3715 		return -ENOENT;
3716 	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3717 					space_cache_ino);
3718 	return ret;
3719 }
3720 
3721 /*
3722  * helper to find all tree blocks that reference a given data extent
3723  */
3724 static noinline_for_stack
3725 int add_data_references(struct reloc_control *rc,
3726 			struct btrfs_key *extent_key,
3727 			struct btrfs_path *path,
3728 			struct rb_root *blocks)
3729 {
3730 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3731 	struct ulist *leaves = NULL;
3732 	struct ulist_iterator leaf_uiter;
3733 	struct ulist_node *ref_node = NULL;
3734 	const u32 blocksize = fs_info->nodesize;
3735 	int ret = 0;
3736 
3737 	btrfs_release_path(path);
3738 	ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3739 				   0, &leaves, NULL, true);
3740 	if (ret < 0)
3741 		return ret;
3742 
3743 	ULIST_ITER_INIT(&leaf_uiter);
3744 	while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3745 		struct extent_buffer *eb;
3746 
3747 		eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
3748 		if (IS_ERR(eb)) {
3749 			ret = PTR_ERR(eb);
3750 			break;
3751 		}
3752 		ret = delete_v1_space_cache(eb, rc->block_group,
3753 					    extent_key->objectid);
3754 		free_extent_buffer(eb);
3755 		if (ret < 0)
3756 			break;
3757 		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3758 		if (ret < 0)
3759 			break;
3760 	}
3761 	if (ret < 0)
3762 		free_block_list(blocks);
3763 	ulist_free(leaves);
3764 	return ret;
3765 }
3766 
3767 /*
3768  * helper to find next unprocessed extent
3769  */
3770 static noinline_for_stack
3771 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3772 		     struct btrfs_key *extent_key)
3773 {
3774 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3775 	struct btrfs_key key;
3776 	struct extent_buffer *leaf;
3777 	u64 start, end, last;
3778 	int ret;
3779 
3780 	last = rc->block_group->start + rc->block_group->length;
3781 	while (1) {
3782 		cond_resched();
3783 		if (rc->search_start >= last) {
3784 			ret = 1;
3785 			break;
3786 		}
3787 
3788 		key.objectid = rc->search_start;
3789 		key.type = BTRFS_EXTENT_ITEM_KEY;
3790 		key.offset = 0;
3791 
3792 		path->search_commit_root = 1;
3793 		path->skip_locking = 1;
3794 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3795 					0, 0);
3796 		if (ret < 0)
3797 			break;
3798 next:
3799 		leaf = path->nodes[0];
3800 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3801 			ret = btrfs_next_leaf(rc->extent_root, path);
3802 			if (ret != 0)
3803 				break;
3804 			leaf = path->nodes[0];
3805 		}
3806 
3807 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3808 		if (key.objectid >= last) {
3809 			ret = 1;
3810 			break;
3811 		}
3812 
3813 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3814 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3815 			path->slots[0]++;
3816 			goto next;
3817 		}
3818 
3819 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3820 		    key.objectid + key.offset <= rc->search_start) {
3821 			path->slots[0]++;
3822 			goto next;
3823 		}
3824 
3825 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3826 		    key.objectid + fs_info->nodesize <=
3827 		    rc->search_start) {
3828 			path->slots[0]++;
3829 			goto next;
3830 		}
3831 
3832 		ret = find_first_extent_bit(&rc->processed_blocks,
3833 					    key.objectid, &start, &end,
3834 					    EXTENT_DIRTY, NULL);
3835 
3836 		if (ret == 0 && start <= key.objectid) {
3837 			btrfs_release_path(path);
3838 			rc->search_start = end + 1;
3839 		} else {
3840 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3841 				rc->search_start = key.objectid + key.offset;
3842 			else
3843 				rc->search_start = key.objectid +
3844 					fs_info->nodesize;
3845 			memcpy(extent_key, &key, sizeof(key));
3846 			return 0;
3847 		}
3848 	}
3849 	btrfs_release_path(path);
3850 	return ret;
3851 }
3852 
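/*
 * Publish the active reloc control in fs_info.  reloc_mutex
 * serializes the update against readers such as the transaction
 * commit path; unset_reloc_control() below is the counterpart.
 */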
3853 static void set_reloc_control(struct reloc_control *rc)
3854 {
3855 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3856 
3857 	mutex_lock(&fs_info->reloc_mutex);
3858 	fs_info->reloc_ctl = rc;
3859 	mutex_unlock(&fs_info->reloc_mutex);
3860 }
3861 
3862 static void unset_reloc_control(struct reloc_control *rc)
3863 {
3864 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3865 
3866 	mutex_lock(&fs_info->reloc_mutex);
3867 	fs_info->reloc_ctl = NULL;
3868 	mutex_unlock(&fs_info->reloc_mutex);
3869 }
3870 
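/*
 * Sanity check extent item flags: an extent must be either data or a
 * tree block (not both, not neither), and a data extent must never
 * carry FULL_BACKREF.  Returns 1 on an invalid combination.
 */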
3871 static int check_extent_flags(u64 flags)
3872 {
3873 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3874 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3875 		return 1;
3876 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3877 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3878 		return 1;
3879 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3880 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3881 		return 1;
3882 	return 0;
3883 }
3884 
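/*
 * Set up the state for one relocation pass: allocate and fill the
 * block reserve (enough for RELOCATION_RESERVED_NODES tree blocks),
 * reset the counters, publish the reloc control, and commit a
 * transaction so relocation starts against a fresh transaction with
 * the reloc control visible.
 */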
3885 static noinline_for_stack
3886 int prepare_to_relocate(struct reloc_control *rc)
3887 {
3888 	struct btrfs_trans_handle *trans;
3889 	int ret;
3890 
3891 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3892 					      BTRFS_BLOCK_RSV_TEMP);
3893 	if (!rc->block_rsv)
3894 		return -ENOMEM;
3895 
3896 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3897 	rc->search_start = rc->block_group->start;
3898 	rc->extents_found = 0;
3899 	rc->nodes_relocated = 0;
3900 	rc->merging_rsv_size = 0;
3901 	rc->reserved_bytes = 0;
3902 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3903 			      RELOCATION_RESERVED_NODES;
3904 	ret = btrfs_block_rsv_refill(rc->extent_root,
3905 				     rc->block_rsv, rc->block_rsv->size,
3906 				     BTRFS_RESERVE_FLUSH_ALL);
3907 	if (ret)
3908 		return ret;
3909 
3910 	rc->create_reloc_tree = 1;
3911 	set_reloc_control(rc);
3912 
3913 	trans = btrfs_join_transaction(rc->extent_root);
3914 	if (IS_ERR(trans)) {
3915 		unset_reloc_control(rc);
3916 		/*
3917 		 * The extent tree is not a ref_cow tree, so it has no
3918 		 * reloc_root to clean up.  Callers are responsible for
3919 		 * freeing the block rsv allocated above.
3920 		 */
3921 		return PTR_ERR(trans);
3922 	}
3923 	btrfs_commit_transaction(trans);
3924 	return 0;
3925 }
3926 
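/*
 * The core relocation loop.  Walk every extent left in the target
 * block group: tree blocks are queued and moved via
 * relocate_tree_blocks(); in the MOVE_DATA_EXTENTS stage data
 * extents are clustered and copied via relocate_data_extent(), and
 * in the UPDATE_DATA_PTRS stage the tree blocks referencing them are
 * collected via add_data_references().  Afterwards the reloc trees
 * are merged back into their fs trees and cleaned up.
 */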
3927 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3928 {
3929 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3930 	struct rb_root blocks = RB_ROOT;
3931 	struct btrfs_key key;
3932 	struct btrfs_trans_handle *trans = NULL;
3933 	struct btrfs_path *path;
3934 	struct btrfs_extent_item *ei;
3935 	u64 flags;
3936 	u32 item_size;
3937 	int ret;
3938 	int err = 0;
3939 	int progress = 0;
3940 
3941 	path = btrfs_alloc_path();
3942 	if (!path)
3943 		return -ENOMEM;
3944 	path->reada = READA_FORWARD;
3945 
3946 	ret = prepare_to_relocate(rc);
3947 	if (ret) {
3948 		err = ret;
3949 		goto out_free;
3950 	}
3951 
3952 	while (1) {
3953 		rc->reserved_bytes = 0;
3954 		ret = btrfs_block_rsv_refill(rc->extent_root,
3955 					rc->block_rsv, rc->block_rsv->size,
3956 					BTRFS_RESERVE_FLUSH_ALL);
3957 		if (ret) {
3958 			err = ret;
3959 			break;
3960 		}
3961 		progress++;
3962 		trans = btrfs_start_transaction(rc->extent_root, 0);
3963 		if (IS_ERR(trans)) {
3964 			err = PTR_ERR(trans);
3965 			trans = NULL;
3966 			break;
3967 		}
3968 restart:
3969 		if (update_backref_cache(trans, &rc->backref_cache)) {
3970 			btrfs_end_transaction(trans);
3971 			trans = NULL;
3972 			continue;
3973 		}
3974 
3975 		ret = find_next_extent(rc, path, &key);
3976 		if (ret < 0)
3977 			err = ret;
3978 		if (ret != 0)
3979 			break;
3980 
3981 		rc->extents_found++;
3982 
3983 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3984 				    struct btrfs_extent_item);
3985 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3986 		if (item_size >= sizeof(*ei)) {
3987 			flags = btrfs_extent_flags(path->nodes[0], ei);
3988 			ret = check_extent_flags(flags);
3989 			BUG_ON(ret);
3990 		} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3991 			err = -EINVAL;
3992 			btrfs_print_v0_err(trans->fs_info);
3993 			btrfs_abort_transaction(trans, err);
3994 			break;
3995 		} else {
3996 			BUG();
3997 		}
3998 
3999 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4000 			ret = add_tree_block(rc, &key, path, &blocks);
4001 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4002 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4003 			ret = add_data_references(rc, &key, path, &blocks);
4004 		} else {
4005 			btrfs_release_path(path);
4006 			ret = 0;
4007 		}
4008 		if (ret < 0) {
4009 			err = ret;
4010 			break;
4011 		}
4012 
4013 		if (!RB_EMPTY_ROOT(&blocks)) {
4014 			ret = relocate_tree_blocks(trans, rc, &blocks);
4015 			if (ret < 0) {
4016 				if (ret != -EAGAIN) {
4017 					err = ret;
4018 					break;
4019 				}
4020 				rc->extents_found--;
4021 				rc->search_start = key.objectid;
4022 			}
4023 		}
4024 
4025 		btrfs_end_transaction_throttle(trans);
4026 		btrfs_btree_balance_dirty(fs_info);
4027 		trans = NULL;
4028 
4029 		if (rc->stage == MOVE_DATA_EXTENTS &&
4030 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4031 			rc->found_file_extent = 1;
4032 			ret = relocate_data_extent(rc->data_inode,
4033 						   &key, &rc->cluster);
4034 			if (ret < 0) {
4035 				err = ret;
4036 				break;
4037 			}
4038 		}
4039 		if (btrfs_should_cancel_balance(fs_info)) {
4040 			err = -ECANCELED;
4041 			break;
4042 		}
4043 	}
4044 	if (trans && progress && err == -ENOSPC) {
4045 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
4046 		if (ret == 1) {
4047 			err = 0;
4048 			progress = 0;
4049 			goto restart;
4050 		}
4051 	}
4052 
4053 	btrfs_release_path(path);
4054 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4055 
4056 	if (trans) {
4057 		btrfs_end_transaction_throttle(trans);
4058 		btrfs_btree_balance_dirty(fs_info);
4059 	}
4060 
4061 	if (!err) {
4062 		ret = relocate_file_extent_cluster(rc->data_inode,
4063 						   &rc->cluster);
4064 		if (ret < 0)
4065 			err = ret;
4066 	}
4067 
4068 	rc->create_reloc_tree = 0;
4069 	set_reloc_control(rc);
4070 
4071 	backref_cache_cleanup(&rc->backref_cache);
4072 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
4073 
4074 	/*
4075 	 * Even when the relocation is cancelled, we should still go
4076 	 * through prepare_to_merge() and merge_reloc_roots().
4077 	 *
4078 	 * On error (including a cancelled balance), prepare_to_merge()
4079 	 * marks all reloc trees orphan, then queues them for cleanup in
4080 	 * merge_reloc_roots().
4081 	 */
4082 	err = prepare_to_merge(rc, err);
4083 
4084 	merge_reloc_roots(rc);
4085 
4086 	rc->merge_reloc_tree = 0;
4087 	unset_reloc_control(rc);
4088 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
4089 
4090 	/* get rid of pinned extents */
4091 	trans = btrfs_join_transaction(rc->extent_root);
4092 	if (IS_ERR(trans)) {
4093 		err = PTR_ERR(trans);
4094 		goto out_free;
4095 	}
4096 	btrfs_commit_transaction(trans);
4097 out_free:
4098 	ret = clean_dirty_subvols(rc);
4099 	if (ret < 0 && !err)
4100 		err = ret;
4101 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
4102 	btrfs_free_path(path);
4103 	return err;
4104 }
4105 
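/*
 * Insert a bare inode item for the data relocation inode: a zero
 * size regular file (mode 0600) flagged NOCOMPRESS | PREALLOC, as
 * relocated data must keep its exact on-disk byte layout.
 */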
4106 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4107 				 struct btrfs_root *root, u64 objectid)
4108 {
4109 	struct btrfs_path *path;
4110 	struct btrfs_inode_item *item;
4111 	struct extent_buffer *leaf;
4112 	int ret;
4113 
4114 	path = btrfs_alloc_path();
4115 	if (!path)
4116 		return -ENOMEM;
4117 
4118 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4119 	if (ret)
4120 		goto out;
4121 
4122 	leaf = path->nodes[0];
4123 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4124 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4125 	btrfs_set_inode_generation(leaf, item, 1);
4126 	btrfs_set_inode_size(leaf, item, 0);
4127 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4128 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4129 					  BTRFS_INODE_PREALLOC);
4130 	btrfs_mark_buffer_dirty(leaf);
4131 out:
4132 	btrfs_free_path(path);
4133 	return ret;
4134 }
4135 
4136 /*
4137  * helper to create the inode used for data relocation.
4138  * the inode lives in the data relocation tree and its link count is 0
4139  */
4140 static noinline_for_stack
4141 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4142 				 struct btrfs_block_group *group)
4143 {
4144 	struct inode *inode = NULL;
4145 	struct btrfs_trans_handle *trans;
4146 	struct btrfs_root *root;
4147 	struct btrfs_key key;
4148 	u64 objectid;
4149 	int err = 0;
4150 
4151 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4152 	if (IS_ERR(root))
4153 		return ERR_CAST(root);
4154 
4155 	trans = btrfs_start_transaction(root, 6);
4156 	if (IS_ERR(trans)) {
4157 		btrfs_put_root(root);
4158 		return ERR_CAST(trans);
4159 	}
4160 
4161 	err = btrfs_find_free_objectid(root, &objectid);
4162 	if (err)
4163 		goto out;
4164 
4165 	err = __insert_orphan_inode(trans, root, objectid);
4166 	BUG_ON(err);
4167 
4168 	key.objectid = objectid;
4169 	key.type = BTRFS_INODE_ITEM_KEY;
4170 	key.offset = 0;
4171 	inode = btrfs_iget(fs_info->sb, &key, root);
4172 	BUG_ON(IS_ERR(inode));
4173 	BTRFS_I(inode)->index_cnt = group->start;
4174 
4175 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4176 out:
4177 	btrfs_put_root(root);
4178 	btrfs_end_transaction(trans);
4179 	btrfs_btree_balance_dirty(fs_info);
4180 	if (err) {
4181 		if (inode)
4182 			iput(inode);
4183 		inode = ERR_PTR(err);
4184 	}
4185 	return inode;
4186 }
4187 
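/*
 * Allocate a reloc_control and initialize its embedded structures:
 * the root lists, the backref cache, the reloc root mapping tree and
 * the processed-blocks extent io tree.  Returns NULL on allocation
 * failure.
 */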
4188 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4189 {
4190 	struct reloc_control *rc;
4191 
4192 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4193 	if (!rc)
4194 		return NULL;
4195 
4196 	INIT_LIST_HEAD(&rc->reloc_roots);
4197 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4198 	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
4199 	mapping_tree_init(&rc->reloc_root_tree);
4200 	extent_io_tree_init(fs_info, &rc->processed_blocks,
4201 			    IO_TREE_RELOC_BLOCKS, NULL);
4202 	return rc;
4203 }
4204 
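/*
 * Counterpart of alloc_reloc_control(): drop any remaining reloc
 * roots and free every node left in the reloc root mapping tree.
 */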
4205 static void free_reloc_control(struct reloc_control *rc)
4206 {
4207 	struct mapping_node *node, *tmp;
4208 
4209 	free_reloc_roots(&rc->reloc_roots);
4210 	rbtree_postorder_for_each_entry_safe(node, tmp,
4211 			&rc->reloc_root_tree.rb_root, rb_node)
4212 		kfree(node);
4213 
4214 	kfree(rc);
4215 }
4216 
4217 /*
4218  * Print the block group being relocated
4219  */
4220 static void describe_relocation(struct btrfs_fs_info *fs_info,
4221 				struct btrfs_block_group *block_group)
4222 {
4223 	char buf[128] = {'\0'};
4224 
4225 	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4226 
4227 	btrfs_info(fs_info,
4228 		   "relocating block group %llu flags %s",
4229 		   block_group->start, buf);
4230 }
4231 
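/* Human-readable name of a relocation stage, for log messages. */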
4232 static const char *stage_to_string(int stage)
4233 {
4234 	if (stage == MOVE_DATA_EXTENTS)
4235 		return "move data extents";
4236 	if (stage == UPDATE_DATA_PTRS)
4237 		return "update data pointers";
4238 	return "unknown";
4239 }
4240 
4241 /*
4242  * Relocate all extents in the block group starting at @group_start.
4243  */
4244 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4245 {
4246 	struct btrfs_block_group *bg;
4247 	struct btrfs_root *extent_root = fs_info->extent_root;
4248 	struct reloc_control *rc;
4249 	struct inode *inode;
4250 	struct btrfs_path *path;
4251 	int ret;
4252 	int rw = 0;
4253 	int err = 0;
4254 
4255 	bg = btrfs_lookup_block_group(fs_info, group_start);
4256 	if (!bg)
4257 		return -ENOENT;
4258 
4259 	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4260 		btrfs_put_block_group(bg);
4261 		return -ETXTBSY;
4262 	}
4263 
4264 	rc = alloc_reloc_control(fs_info);
4265 	if (!rc) {
4266 		btrfs_put_block_group(bg);
4267 		return -ENOMEM;
4268 	}
4269 
4270 	rc->extent_root = extent_root;
4271 	rc->block_group = bg;
4272 
4273 	ret = btrfs_inc_block_group_ro(rc->block_group, true);
4274 	if (ret) {
4275 		err = ret;
4276 		goto out;
4277 	}
4278 	rw = 1;
4279 
4280 	path = btrfs_alloc_path();
4281 	if (!path) {
4282 		err = -ENOMEM;
4283 		goto out;
4284 	}
4285 
4286 	inode = lookup_free_space_inode(rc->block_group, path);
4287 	btrfs_free_path(path);
4288 
4289 	if (!IS_ERR(inode))
4290 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4291 	else
4292 		ret = PTR_ERR(inode);
4293 
4294 	if (ret && ret != -ENOENT) {
4295 		err = ret;
4296 		goto out;
4297 	}
4298 
4299 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4300 	if (IS_ERR(rc->data_inode)) {
4301 		err = PTR_ERR(rc->data_inode);
4302 		rc->data_inode = NULL;
4303 		goto out;
4304 	}
4305 
4306 	describe_relocation(fs_info, rc->block_group);
4307 
4308 	btrfs_wait_block_group_reservations(rc->block_group);
4309 	btrfs_wait_nocow_writers(rc->block_group);
4310 	btrfs_wait_ordered_roots(fs_info, U64_MAX,
4311 				 rc->block_group->start,
4312 				 rc->block_group->length);
4313 
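	/*
	 * Relocation runs in stages: the first pass moves the data
	 * extents, a second pass updates the metadata pointing at
	 * them.  Loop until a pass finds no extents left.
	 */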
4314 	while (1) {
4315 		int finishes_stage;
4316 
4317 		mutex_lock(&fs_info->cleaner_mutex);
4318 		ret = relocate_block_group(rc);
4319 		mutex_unlock(&fs_info->cleaner_mutex);
4320 		if (ret < 0)
4321 			err = ret;
4322 
4323 		finishes_stage = rc->stage;
4324 		/*
4325 		 * We may have gotten ENOSPC after we already dirtied some
4326 		 * extents.  If writeout happens while we're relocating a
4327 		 * different block group we could end up hitting the
4328 		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4329 		 * btrfs_reloc_cow_block.  Make sure we write everything out
4330 		 * properly so we don't trip over this problem, and then break
4331 		 * out of the loop if we hit an error.
4332 		 */
4333 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4334 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4335 						       (u64)-1);
4336 			if (ret)
4337 				err = ret;
4338 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4339 						 0, -1);
4340 			rc->stage = UPDATE_DATA_PTRS;
4341 		}
4342 
4343 		if (err < 0)
4344 			goto out;
4345 
4346 		if (rc->extents_found == 0)
4347 			break;
4348 
4349 		btrfs_info(fs_info, "found %llu extents, stage: %s",
4350 			   rc->extents_found, stage_to_string(finishes_stage));
4351 	}
4352 
4353 	WARN_ON(rc->block_group->pinned > 0);
4354 	WARN_ON(rc->block_group->reserved > 0);
4355 	WARN_ON(rc->block_group->used > 0);
4356 out:
4357 	if (err && rw)
4358 		btrfs_dec_block_group_ro(rc->block_group);
4359 	iput(rc->data_inode);
4360 	btrfs_put_block_group(rc->block_group);
4361 	free_reloc_control(rc);
4362 	return err;
4363 }
4364 
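/*
 * The fs root a reloc root belonged to no longer exists, so the
 * reloc root cannot be merged.  Clear its drop progress and set its
 * refs to 0 so it is treated as garbage and deleted later.
 */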
4365 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4366 {
4367 	struct btrfs_fs_info *fs_info = root->fs_info;
4368 	struct btrfs_trans_handle *trans;
4369 	int ret, err;
4370 
4371 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4372 	if (IS_ERR(trans))
4373 		return PTR_ERR(trans);
4374 
4375 	memset(&root->root_item.drop_progress, 0,
4376 		sizeof(root->root_item.drop_progress));
4377 	root->root_item.drop_level = 0;
4378 	btrfs_set_root_refs(&root->root_item, 0);
4379 	ret = btrfs_update_root(trans, fs_info->tree_root,
4380 				&root->root_key, &root->root_item);
4381 
4382 	err = btrfs_end_transaction(trans);
4383 	if (err)
4384 		return err;
4385 	return ret;
4386 }
4387 
4388 /*
4389  * recover relocation interrupted by a system crash.
4390  *
4391  * this function resumes merging reloc trees with the corresponding fs trees.
4392  * this is important for preserving the sharing of tree blocks
4393  */
4394 int btrfs_recover_relocation(struct btrfs_root *root)
4395 {
4396 	struct btrfs_fs_info *fs_info = root->fs_info;
4397 	LIST_HEAD(reloc_roots);
4398 	struct btrfs_key key;
4399 	struct btrfs_root *fs_root;
4400 	struct btrfs_root *reloc_root;
4401 	struct btrfs_path *path;
4402 	struct extent_buffer *leaf;
4403 	struct reloc_control *rc = NULL;
4404 	struct btrfs_trans_handle *trans;
4405 	int ret;
4406 	int err = 0;
4407 
4408 	path = btrfs_alloc_path();
4409 	if (!path)
4410 		return -ENOMEM;
4411 	path->reada = READA_BACK;
4412 
4413 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4414 	key.type = BTRFS_ROOT_ITEM_KEY;
4415 	key.offset = (u64)-1;
4416 
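	/*
	 * Reloc root items live in the root tree keyed as
	 * (BTRFS_TREE_RELOC_OBJECTID, ROOT_ITEM, fs root objectid),
	 * so walk them backwards from the highest possible offset.
	 */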
4417 	while (1) {
4418 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4419 					path, 0, 0);
4420 		if (ret < 0) {
4421 			err = ret;
4422 			goto out;
4423 		}
4424 		if (ret > 0) {
4425 			if (path->slots[0] == 0)
4426 				break;
4427 			path->slots[0]--;
4428 		}
4429 		leaf = path->nodes[0];
4430 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4431 		btrfs_release_path(path);
4432 
4433 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4434 		    key.type != BTRFS_ROOT_ITEM_KEY)
4435 			break;
4436 
4437 		reloc_root = btrfs_read_tree_root(root, &key);
4438 		if (IS_ERR(reloc_root)) {
4439 			err = PTR_ERR(reloc_root);
4440 			goto out;
4441 		}
4442 
4443 		set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
4444 		list_add(&reloc_root->root_list, &reloc_roots);
4445 
4446 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4447 			fs_root = read_fs_root(fs_info,
4448 					       reloc_root->root_key.offset);
4449 			if (IS_ERR(fs_root)) {
4450 				ret = PTR_ERR(fs_root);
4451 				if (ret != -ENOENT) {
4452 					err = ret;
4453 					goto out;
4454 				}
4455 				ret = mark_garbage_root(reloc_root);
4456 				if (ret < 0) {
4457 					err = ret;
4458 					goto out;
4459 				}
4460 			} else {
4461 				btrfs_put_root(fs_root);
4462 			}
4463 		}
4464 
4465 		if (key.offset == 0)
4466 			break;
4467 
4468 		key.offset--;
4469 	}
4470 	btrfs_release_path(path);
4471 
4472 	if (list_empty(&reloc_roots))
4473 		goto out;
4474 
4475 	rc = alloc_reloc_control(fs_info);
4476 	if (!rc) {
4477 		err = -ENOMEM;
4478 		goto out;
4479 	}
4480 
4481 	rc->extent_root = fs_info->extent_root;
4482 
4483 	set_reloc_control(rc);
4484 
4485 	trans = btrfs_join_transaction(rc->extent_root);
4486 	if (IS_ERR(trans)) {
4487 		err = PTR_ERR(trans);
4488 		goto out_unset;
4489 	}
4490 
4491 	rc->merge_reloc_tree = 1;
4492 
4493 	while (!list_empty(&reloc_roots)) {
4494 		reloc_root = list_entry(reloc_roots.next,
4495 					struct btrfs_root, root_list);
4496 		list_del(&reloc_root->root_list);
4497 
4498 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4499 			list_add_tail(&reloc_root->root_list,
4500 				      &rc->reloc_roots);
4501 			continue;
4502 		}
4503 
4504 		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4505 		if (IS_ERR(fs_root)) {
4506 			err = PTR_ERR(fs_root);
4507 			list_add_tail(&reloc_root->root_list, &reloc_roots);
4508 			btrfs_end_transaction(trans);
4509 			goto out_unset;
4510 		}
4511 
4512 		err = __add_reloc_root(reloc_root);
4513 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4514 		fs_root->reloc_root = btrfs_grab_root(reloc_root);
4515 		btrfs_put_root(fs_root);
4516 	}
4517 
4518 	err = btrfs_commit_transaction(trans);
4519 	if (err)
4520 		goto out_unset;
4521 
4522 	merge_reloc_roots(rc);
4523 
4524 	unset_reloc_control(rc);
4525 
4526 	trans = btrfs_join_transaction(rc->extent_root);
4527 	if (IS_ERR(trans)) {
4528 		err = PTR_ERR(trans);
4529 		goto out_clean;
4530 	}
4531 	err = btrfs_commit_transaction(trans);
4532 out_clean:
4533 	ret = clean_dirty_subvols(rc);
4534 	if (ret < 0 && !err)
4535 		err = ret;
4536 out_unset:
4537 	unset_reloc_control(rc);
4538 	free_reloc_control(rc);
4539 out:
4540 	if (!list_empty(&reloc_roots))
4541 		free_reloc_roots(&reloc_roots);
4542 
4543 	btrfs_free_path(path);
4544 
4545 	if (err == 0) {
4546 		/* cleanup orphan inode in data relocation tree */
4547 		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4548 		if (IS_ERR(fs_root)) {
4549 			err = PTR_ERR(fs_root);
4550 		} else {
4551 			err = btrfs_orphan_cleanup(fs_root);
4552 			btrfs_put_root(fs_root);
4553 		}
4554 	}
4555 	return err;
4556 }
4557 
4558 /*
4559  * helper to add ordered checksums for data relocation.
4560  *
4561  * cloning the checksums properly handles nodatasum extents.
4562  * it also saves the CPU time it would take to recalculate them.
4563  */
4564 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4565 {
4566 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4567 	struct btrfs_ordered_sum *sums;
4568 	struct btrfs_ordered_extent *ordered;
4569 	int ret;
4570 	u64 disk_bytenr;
4571 	u64 new_bytenr;
4572 	LIST_HEAD(list);
4573 
4574 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4575 	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
4576 
4577 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4578 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4579 				       disk_bytenr + len - 1, &list, 0);
4580 	if (ret)
4581 		goto out;
4582 
4583 	while (!list_empty(&list)) {
4584 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4585 		list_del_init(&sums->list);
4586 
4587 		/*
4588 		 * We need to offset the new_bytenr based on where the csum is.
4589 		 * We do this because we read in entire prealloc extents,
4590 		 * but we may have written to, say, the middle of one, so
4591 		 * we must make sure each csum goes with the right disk
4592 		 * offset.
4593 		 *
4594 		 * We can do this because the data reloc inode refers strictly
4595 		 * to the on disk bytes, so we don't have to worry about
4596 		 * disk_len vs real len like with real inodes since it's all
4597 		 * disk length.
4598 		 */
4599 		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
4600 		sums->bytenr = new_bytenr;
4601 
4602 		btrfs_add_ordered_sum(ordered, sums);
4603 	}
4604 out:
4605 	btrfs_put_ordered_extent(ordered);
4606 	return ret;
4607 }
4608 
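/*
 * Hook called from the COW path while relocation is active.  For a
 * block of a reloc tree, repoint the cached backref node at the new
 * copy and queue it as pending; on the first COW of a leaf in the
 * UPDATE_DATA_PTRS stage, rewrite the file extent disk bytenrs in
 * the new leaf.
 */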
4609 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4610 			  struct btrfs_root *root, struct extent_buffer *buf,
4611 			  struct extent_buffer *cow)
4612 {
4613 	struct btrfs_fs_info *fs_info = root->fs_info;
4614 	struct reloc_control *rc;
4615 	struct btrfs_backref_node *node;
4616 	int first_cow = 0;
4617 	int level;
4618 	int ret = 0;
4619 
4620 	rc = fs_info->reloc_ctl;
4621 	if (!rc)
4622 		return 0;
4623 
4624 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4625 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4626 
4627 	level = btrfs_header_level(buf);
4628 	if (btrfs_header_generation(buf) <=
4629 	    btrfs_root_last_snapshot(&root->root_item))
4630 		first_cow = 1;
4631 
4632 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4633 	    rc->create_reloc_tree) {
4634 		WARN_ON(!first_cow && level == 0);
4635 
4636 		node = rc->backref_cache.path[level];
4637 		BUG_ON(node->bytenr != buf->start &&
4638 		       node->new_bytenr != buf->start);
4639 
4640 		drop_node_buffer(node);
4641 		atomic_inc(&cow->refs);
4642 		node->eb = cow;
4643 		node->new_bytenr = cow->start;
4644 
4645 		if (!node->pending) {
4646 			list_move_tail(&node->list,
4647 				       &rc->backref_cache.pending[level]);
4648 			node->pending = 1;
4649 		}
4650 
4651 		if (first_cow)
4652 			mark_block_processed(rc, node);
4653 
4654 		if (first_cow && level > 0)
4655 			rc->nodes_relocated += buf->len;
4656 	}
4657 
4658 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4659 		ret = replace_file_extents(trans, rc, root, cow);
4660 	return ret;
4661 }
4662 
4663 /*
4664  * called before creating a snapshot. it calculates the metadata reservation
4665  * required for relocating the tree blocks in the snapshot
4666  */
4667 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4668 			      u64 *bytes_to_reserve)
4669 {
4670 	struct btrfs_root *root = pending->root;
4671 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4672 
4673 	if (!rc || !have_reloc_root(root))
4674 		return;
4675 
4676 	if (!rc->merge_reloc_tree)
4677 		return;
4678 
4679 	root = root->reloc_root;
4680 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4681 	/*
4682 	 * relocation is in the stage of merging trees. the space
4683 	 * used by merging a reloc tree is twice the size of the
4684 	 * relocated tree nodes in the worst case: half for cowing
4685 	 * the reloc tree, half for cowing the fs tree. the space
4686 	 * used by cowing the reloc tree will be freed after the
4687 	 * tree is dropped. if we create a snapshot, cowing the fs
4688 	 * tree may use more space than it frees, so we need to
4689 	 * reserve extra space.
4690 	 */
4691 	*bytes_to_reserve += rc->nodes_relocated;
4692 }
4693 
4694 /*
4695  * called after a snapshot is created. migrate the block reservation
4696  * and create a reloc root for the newly created snapshot
4697  *
4698  * This is similar to btrfs_init_reloc_root(): we come out of here with two
4699  * references held on the reloc_root, one for root->reloc_root and one for
4700  * rc->reloc_roots.
4701  */
4702 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4703 			       struct btrfs_pending_snapshot *pending)
4704 {
4705 	struct btrfs_root *root = pending->root;
4706 	struct btrfs_root *reloc_root;
4707 	struct btrfs_root *new_root;
4708 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4709 	int ret;
4710 
4711 	if (!rc || !have_reloc_root(root))
4712 		return 0;
4713 
4714 	rc = root->fs_info->reloc_ctl;
4715 	rc->merging_rsv_size += rc->nodes_relocated;
4716 
4717 	if (rc->merge_reloc_tree) {
4718 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4719 					      rc->block_rsv,
4720 					      rc->nodes_relocated, true);
4721 		if (ret)
4722 			return ret;
4723 	}
4724 
4725 	new_root = pending->snap;
4726 	reloc_root = create_reloc_root(trans, root->reloc_root,
4727 				       new_root->root_key.objectid);
4728 	if (IS_ERR(reloc_root))
4729 		return PTR_ERR(reloc_root);
4730 
4731 	ret = __add_reloc_root(reloc_root);
4732 	BUG_ON(ret < 0);
4733 	new_root->reloc_root = btrfs_grab_root(reloc_root);
4734 
4735 	if (rc->create_reloc_tree)
4736 		ret = clone_backref_node(trans, rc, root, reloc_root);
4737 	return ret;
4738 }
4739