xref: /openbmc/linux/fs/btrfs/relocation.c (revision aac5987a)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34 #include "qgroup.h"
35 
36 /*
37  * backref_node, mapping_node and tree_block start with this
38  */
39 struct tree_entry {
40 	struct rb_node rb_node;
41 	u64 bytenr;
42 };
43 
44 /*
45  * present a tree block in the backref cache
46  */
47 struct backref_node {
48 	struct rb_node rb_node;
49 	u64 bytenr;
50 
51 	u64 new_bytenr;
52 	/* objectid of tree block owner, can be not uptodate */
53 	u64 owner;
54 	/* link to pending, changed or detached list */
55 	struct list_head list;
56 	/* list of upper level blocks reference this block */
57 	struct list_head upper;
58 	/* list of child blocks in the cache */
59 	struct list_head lower;
60 	/* NULL if this node is not tree root */
61 	struct btrfs_root *root;
62 	/* extent buffer got by COW the block */
63 	struct extent_buffer *eb;
64 	/* level of tree block */
65 	unsigned int level:8;
66 	/* is the block in non-reference counted tree */
67 	unsigned int cowonly:1;
68 	/* 1 if no child node in the cache */
69 	unsigned int lowest:1;
70 	/* is the extent buffer locked */
71 	unsigned int locked:1;
72 	/* has the block been processed */
73 	unsigned int processed:1;
74 	/* have backrefs of this block been checked */
75 	unsigned int checked:1;
76 	/*
77 	 * 1 if corresponding block has been cowed but some upper
78 	 * level block pointers may not point to the new location
79 	 */
80 	unsigned int pending:1;
81 	/*
82 	 * 1 if the backref node isn't connected to any other
83 	 * backref node.
84 	 */
85 	unsigned int detached:1;
86 };
87 
88 /*
89  * present a block pointer in the backref cache
90  */
91 struct backref_edge {
92 	struct list_head list[2];
93 	struct backref_node *node[2];
94 };
95 
96 #define LOWER	0
97 #define UPPER	1
98 #define RELOCATION_RESERVED_NODES	256
99 
100 struct backref_cache {
101 	/* red black tree of all backref nodes in the cache */
102 	struct rb_root rb_root;
103 	/* for passing backref nodes to btrfs_reloc_cow_block */
104 	struct backref_node *path[BTRFS_MAX_LEVEL];
105 	/*
106 	 * list of blocks that have been cowed but some block
107 	 * pointers in upper level blocks may not reflect the
108 	 * new location
109 	 */
110 	struct list_head pending[BTRFS_MAX_LEVEL];
111 	/* list of backref nodes with no child node */
112 	struct list_head leaves;
113 	/* list of blocks that have been cowed in current transaction */
114 	struct list_head changed;
115 	/* list of detached backref node. */
116 	struct list_head detached;
117 
118 	u64 last_trans;
119 
120 	int nr_nodes;
121 	int nr_edges;
122 };
123 
124 /*
125  * map address of tree root to tree
126  */
127 struct mapping_node {
128 	struct rb_node rb_node;
129 	u64 bytenr;
130 	void *data;
131 };
132 
133 struct mapping_tree {
134 	struct rb_root rb_root;
135 	spinlock_t lock;
136 };
137 
138 /*
139  * present a tree block to process
140  */
141 struct tree_block {
142 	struct rb_node rb_node;
143 	u64 bytenr;
144 	struct btrfs_key key;
145 	unsigned int level:8;
146 	unsigned int key_ready:1;
147 };
148 
149 #define MAX_EXTENTS 128
150 
151 struct file_extent_cluster {
152 	u64 start;
153 	u64 end;
154 	u64 boundary[MAX_EXTENTS];
155 	unsigned int nr;
156 };
157 
158 struct reloc_control {
159 	/* block group to relocate */
160 	struct btrfs_block_group_cache *block_group;
161 	/* extent tree */
162 	struct btrfs_root *extent_root;
163 	/* inode for moving data */
164 	struct inode *data_inode;
165 
166 	struct btrfs_block_rsv *block_rsv;
167 
168 	struct backref_cache backref_cache;
169 
170 	struct file_extent_cluster cluster;
171 	/* tree blocks have been processed */
172 	struct extent_io_tree processed_blocks;
173 	/* map start of tree root to corresponding reloc tree */
174 	struct mapping_tree reloc_root_tree;
175 	/* list of reloc trees */
176 	struct list_head reloc_roots;
177 	/* size of metadata reservation for merging reloc trees */
178 	u64 merging_rsv_size;
179 	/* size of relocated tree nodes */
180 	u64 nodes_relocated;
181 	/* reserved size for block group relocation*/
182 	u64 reserved_bytes;
183 
184 	u64 search_start;
185 	u64 extents_found;
186 
187 	unsigned int stage:8;
188 	unsigned int create_reloc_tree:1;
189 	unsigned int merge_reloc_tree:1;
190 	unsigned int found_file_extent:1;
191 };
192 
193 /* stages of data relocation */
194 #define MOVE_DATA_EXTENTS	0
195 #define UPDATE_DATA_PTRS	1
196 
197 static void remove_backref_node(struct backref_cache *cache,
198 				struct backref_node *node);
199 static void __mark_block_processed(struct reloc_control *rc,
200 				   struct backref_node *node);
201 
202 static void mapping_tree_init(struct mapping_tree *tree)
203 {
204 	tree->rb_root = RB_ROOT;
205 	spin_lock_init(&tree->lock);
206 }
207 
208 static void backref_cache_init(struct backref_cache *cache)
209 {
210 	int i;
211 	cache->rb_root = RB_ROOT;
212 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
213 		INIT_LIST_HEAD(&cache->pending[i]);
214 	INIT_LIST_HEAD(&cache->changed);
215 	INIT_LIST_HEAD(&cache->detached);
216 	INIT_LIST_HEAD(&cache->leaves);
217 }
218 
219 static void backref_cache_cleanup(struct backref_cache *cache)
220 {
221 	struct backref_node *node;
222 	int i;
223 
224 	while (!list_empty(&cache->detached)) {
225 		node = list_entry(cache->detached.next,
226 				  struct backref_node, list);
227 		remove_backref_node(cache, node);
228 	}
229 
230 	while (!list_empty(&cache->leaves)) {
231 		node = list_entry(cache->leaves.next,
232 				  struct backref_node, lower);
233 		remove_backref_node(cache, node);
234 	}
235 
236 	cache->last_trans = 0;
237 
238 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
239 		ASSERT(list_empty(&cache->pending[i]));
240 	ASSERT(list_empty(&cache->changed));
241 	ASSERT(list_empty(&cache->detached));
242 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
243 	ASSERT(!cache->nr_nodes);
244 	ASSERT(!cache->nr_edges);
245 }
246 
247 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
248 {
249 	struct backref_node *node;
250 
251 	node = kzalloc(sizeof(*node), GFP_NOFS);
252 	if (node) {
253 		INIT_LIST_HEAD(&node->list);
254 		INIT_LIST_HEAD(&node->upper);
255 		INIT_LIST_HEAD(&node->lower);
256 		RB_CLEAR_NODE(&node->rb_node);
257 		cache->nr_nodes++;
258 	}
259 	return node;
260 }
261 
262 static void free_backref_node(struct backref_cache *cache,
263 			      struct backref_node *node)
264 {
265 	if (node) {
266 		cache->nr_nodes--;
267 		kfree(node);
268 	}
269 }
270 
271 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
272 {
273 	struct backref_edge *edge;
274 
275 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
276 	if (edge)
277 		cache->nr_edges++;
278 	return edge;
279 }
280 
281 static void free_backref_edge(struct backref_cache *cache,
282 			      struct backref_edge *edge)
283 {
284 	if (edge) {
285 		cache->nr_edges--;
286 		kfree(edge);
287 	}
288 }
289 
290 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
291 				   struct rb_node *node)
292 {
293 	struct rb_node **p = &root->rb_node;
294 	struct rb_node *parent = NULL;
295 	struct tree_entry *entry;
296 
297 	while (*p) {
298 		parent = *p;
299 		entry = rb_entry(parent, struct tree_entry, rb_node);
300 
301 		if (bytenr < entry->bytenr)
302 			p = &(*p)->rb_left;
303 		else if (bytenr > entry->bytenr)
304 			p = &(*p)->rb_right;
305 		else
306 			return parent;
307 	}
308 
309 	rb_link_node(node, parent, p);
310 	rb_insert_color(node, root);
311 	return NULL;
312 }
313 
314 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
315 {
316 	struct rb_node *n = root->rb_node;
317 	struct tree_entry *entry;
318 
319 	while (n) {
320 		entry = rb_entry(n, struct tree_entry, rb_node);
321 
322 		if (bytenr < entry->bytenr)
323 			n = n->rb_left;
324 		else if (bytenr > entry->bytenr)
325 			n = n->rb_right;
326 		else
327 			return n;
328 	}
329 	return NULL;
330 }
331 
332 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
333 {
334 
335 	struct btrfs_fs_info *fs_info = NULL;
336 	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
337 					      rb_node);
338 	if (bnode->root)
339 		fs_info = bnode->root->fs_info;
340 	btrfs_panic(fs_info, errno,
341 		    "Inconsistency in backref cache found at offset %llu",
342 		    bytenr);
343 }
344 
345 /*
346  * walk up backref nodes until reach node presents tree root
347  */
348 static struct backref_node *walk_up_backref(struct backref_node *node,
349 					    struct backref_edge *edges[],
350 					    int *index)
351 {
352 	struct backref_edge *edge;
353 	int idx = *index;
354 
355 	while (!list_empty(&node->upper)) {
356 		edge = list_entry(node->upper.next,
357 				  struct backref_edge, list[LOWER]);
358 		edges[idx++] = edge;
359 		node = edge->node[UPPER];
360 	}
361 	BUG_ON(node->detached);
362 	*index = idx;
363 	return node;
364 }
365 
366 /*
367  * walk down backref nodes to find start of next reference path
368  */
369 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
370 					      int *index)
371 {
372 	struct backref_edge *edge;
373 	struct backref_node *lower;
374 	int idx = *index;
375 
376 	while (idx > 0) {
377 		edge = edges[idx - 1];
378 		lower = edge->node[LOWER];
379 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
380 			idx--;
381 			continue;
382 		}
383 		edge = list_entry(edge->list[LOWER].next,
384 				  struct backref_edge, list[LOWER]);
385 		edges[idx - 1] = edge;
386 		*index = idx;
387 		return edge->node[UPPER];
388 	}
389 	*index = 0;
390 	return NULL;
391 }
392 
393 static void unlock_node_buffer(struct backref_node *node)
394 {
395 	if (node->locked) {
396 		btrfs_tree_unlock(node->eb);
397 		node->locked = 0;
398 	}
399 }
400 
401 static void drop_node_buffer(struct backref_node *node)
402 {
403 	if (node->eb) {
404 		unlock_node_buffer(node);
405 		free_extent_buffer(node->eb);
406 		node->eb = NULL;
407 	}
408 }
409 
410 static void drop_backref_node(struct backref_cache *tree,
411 			      struct backref_node *node)
412 {
413 	BUG_ON(!list_empty(&node->upper));
414 
415 	drop_node_buffer(node);
416 	list_del(&node->list);
417 	list_del(&node->lower);
418 	if (!RB_EMPTY_NODE(&node->rb_node))
419 		rb_erase(&node->rb_node, &tree->rb_root);
420 	free_backref_node(tree, node);
421 }
422 
423 /*
424  * remove a backref node from the backref cache
425  */
426 static void remove_backref_node(struct backref_cache *cache,
427 				struct backref_node *node)
428 {
429 	struct backref_node *upper;
430 	struct backref_edge *edge;
431 
432 	if (!node)
433 		return;
434 
435 	BUG_ON(!node->lowest && !node->detached);
436 	while (!list_empty(&node->upper)) {
437 		edge = list_entry(node->upper.next, struct backref_edge,
438 				  list[LOWER]);
439 		upper = edge->node[UPPER];
440 		list_del(&edge->list[LOWER]);
441 		list_del(&edge->list[UPPER]);
442 		free_backref_edge(cache, edge);
443 
444 		if (RB_EMPTY_NODE(&upper->rb_node)) {
445 			BUG_ON(!list_empty(&node->upper));
446 			drop_backref_node(cache, node);
447 			node = upper;
448 			node->lowest = 1;
449 			continue;
450 		}
451 		/*
452 		 * add the node to leaf node list if no other
453 		 * child block cached.
454 		 */
455 		if (list_empty(&upper->lower)) {
456 			list_add_tail(&upper->lower, &cache->leaves);
457 			upper->lowest = 1;
458 		}
459 	}
460 
461 	drop_backref_node(cache, node);
462 }
463 
464 static void update_backref_node(struct backref_cache *cache,
465 				struct backref_node *node, u64 bytenr)
466 {
467 	struct rb_node *rb_node;
468 	rb_erase(&node->rb_node, &cache->rb_root);
469 	node->bytenr = bytenr;
470 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
471 	if (rb_node)
472 		backref_tree_panic(rb_node, -EEXIST, bytenr);
473 }
474 
475 /*
476  * update backref cache after a transaction commit
477  */
478 static int update_backref_cache(struct btrfs_trans_handle *trans,
479 				struct backref_cache *cache)
480 {
481 	struct backref_node *node;
482 	int level = 0;
483 
484 	if (cache->last_trans == 0) {
485 		cache->last_trans = trans->transid;
486 		return 0;
487 	}
488 
489 	if (cache->last_trans == trans->transid)
490 		return 0;
491 
492 	/*
493 	 * detached nodes are used to avoid unnecessary backref
494 	 * lookup. transaction commit changes the extent tree.
495 	 * so the detached nodes are no longer useful.
496 	 */
497 	while (!list_empty(&cache->detached)) {
498 		node = list_entry(cache->detached.next,
499 				  struct backref_node, list);
500 		remove_backref_node(cache, node);
501 	}
502 
503 	while (!list_empty(&cache->changed)) {
504 		node = list_entry(cache->changed.next,
505 				  struct backref_node, list);
506 		list_del_init(&node->list);
507 		BUG_ON(node->pending);
508 		update_backref_node(cache, node, node->new_bytenr);
509 	}
510 
511 	/*
512 	 * some nodes can be left in the pending list if there were
513 	 * errors during processing the pending nodes.
514 	 */
515 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
516 		list_for_each_entry(node, &cache->pending[level], list) {
517 			BUG_ON(!node->pending);
518 			if (node->bytenr == node->new_bytenr)
519 				continue;
520 			update_backref_node(cache, node, node->new_bytenr);
521 		}
522 	}
523 
524 	cache->last_trans = 0;
525 	return 1;
526 }
527 
528 
529 static int should_ignore_root(struct btrfs_root *root)
530 {
531 	struct btrfs_root *reloc_root;
532 
533 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
534 		return 0;
535 
536 	reloc_root = root->reloc_root;
537 	if (!reloc_root)
538 		return 0;
539 
540 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
541 	    root->fs_info->running_transaction->transid - 1)
542 		return 0;
543 	/*
544 	 * if there is reloc tree and it was created in previous
545 	 * transaction backref lookup can find the reloc tree,
546 	 * so backref node for the fs tree root is useless for
547 	 * relocation.
548 	 */
549 	return 1;
550 }
551 /*
552  * find reloc tree by address of tree root
553  */
554 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
555 					  u64 bytenr)
556 {
557 	struct rb_node *rb_node;
558 	struct mapping_node *node;
559 	struct btrfs_root *root = NULL;
560 
561 	spin_lock(&rc->reloc_root_tree.lock);
562 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
563 	if (rb_node) {
564 		node = rb_entry(rb_node, struct mapping_node, rb_node);
565 		root = (struct btrfs_root *)node->data;
566 	}
567 	spin_unlock(&rc->reloc_root_tree.lock);
568 	return root;
569 }
570 
571 static int is_cowonly_root(u64 root_objectid)
572 {
573 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
574 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
575 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
576 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
577 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
578 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
579 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
580 	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
581 	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
582 		return 1;
583 	return 0;
584 }
585 
586 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
587 					u64 root_objectid)
588 {
589 	struct btrfs_key key;
590 
591 	key.objectid = root_objectid;
592 	key.type = BTRFS_ROOT_ITEM_KEY;
593 	if (is_cowonly_root(root_objectid))
594 		key.offset = 0;
595 	else
596 		key.offset = (u64)-1;
597 
598 	return btrfs_get_fs_root(fs_info, &key, false);
599 }
600 
601 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
602 static noinline_for_stack
603 struct btrfs_root *find_tree_root(struct reloc_control *rc,
604 				  struct extent_buffer *leaf,
605 				  struct btrfs_extent_ref_v0 *ref0)
606 {
607 	struct btrfs_root *root;
608 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
609 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
610 
611 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
612 
613 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
614 	BUG_ON(IS_ERR(root));
615 
616 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
617 	    generation != btrfs_root_generation(&root->root_item))
618 		return NULL;
619 
620 	return root;
621 }
622 #endif
623 
624 static noinline_for_stack
625 int find_inline_backref(struct extent_buffer *leaf, int slot,
626 			unsigned long *ptr, unsigned long *end)
627 {
628 	struct btrfs_key key;
629 	struct btrfs_extent_item *ei;
630 	struct btrfs_tree_block_info *bi;
631 	u32 item_size;
632 
633 	btrfs_item_key_to_cpu(leaf, &key, slot);
634 
635 	item_size = btrfs_item_size_nr(leaf, slot);
636 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
637 	if (item_size < sizeof(*ei)) {
638 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
639 		return 1;
640 	}
641 #endif
642 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
643 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
644 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
645 
646 	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
647 	    item_size <= sizeof(*ei) + sizeof(*bi)) {
648 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
649 		return 1;
650 	}
651 	if (key.type == BTRFS_METADATA_ITEM_KEY &&
652 	    item_size <= sizeof(*ei)) {
653 		WARN_ON(item_size < sizeof(*ei));
654 		return 1;
655 	}
656 
657 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
658 		bi = (struct btrfs_tree_block_info *)(ei + 1);
659 		*ptr = (unsigned long)(bi + 1);
660 	} else {
661 		*ptr = (unsigned long)(ei + 1);
662 	}
663 	*end = (unsigned long)ei + item_size;
664 	return 0;
665 }
666 
667 /*
668  * build backref tree for a given tree block. root of the backref tree
669  * corresponds the tree block, leaves of the backref tree correspond
670  * roots of b-trees that reference the tree block.
671  *
672  * the basic idea of this function is check backrefs of a given block
673  * to find upper level blocks that reference the block, and then check
674  * backrefs of these upper level blocks recursively. the recursion stop
675  * when tree root is reached or backrefs for the block is cached.
676  *
677  * NOTE: if we find backrefs for a block are cached, we know backrefs
678  * for all upper level blocks that directly/indirectly reference the
679  * block are also cached.
680  */
681 static noinline_for_stack
682 struct backref_node *build_backref_tree(struct reloc_control *rc,
683 					struct btrfs_key *node_key,
684 					int level, u64 bytenr)
685 {
686 	struct backref_cache *cache = &rc->backref_cache;
687 	struct btrfs_path *path1;
688 	struct btrfs_path *path2;
689 	struct extent_buffer *eb;
690 	struct btrfs_root *root;
691 	struct backref_node *cur;
692 	struct backref_node *upper;
693 	struct backref_node *lower;
694 	struct backref_node *node = NULL;
695 	struct backref_node *exist = NULL;
696 	struct backref_edge *edge;
697 	struct rb_node *rb_node;
698 	struct btrfs_key key;
699 	unsigned long end;
700 	unsigned long ptr;
701 	LIST_HEAD(list);
702 	LIST_HEAD(useless);
703 	int cowonly;
704 	int ret;
705 	int err = 0;
706 	bool need_check = true;
707 
708 	path1 = btrfs_alloc_path();
709 	path2 = btrfs_alloc_path();
710 	if (!path1 || !path2) {
711 		err = -ENOMEM;
712 		goto out;
713 	}
714 	path1->reada = READA_FORWARD;
715 	path2->reada = READA_FORWARD;
716 
717 	node = alloc_backref_node(cache);
718 	if (!node) {
719 		err = -ENOMEM;
720 		goto out;
721 	}
722 
723 	node->bytenr = bytenr;
724 	node->level = level;
725 	node->lowest = 1;
726 	cur = node;
727 again:
728 	end = 0;
729 	ptr = 0;
730 	key.objectid = cur->bytenr;
731 	key.type = BTRFS_METADATA_ITEM_KEY;
732 	key.offset = (u64)-1;
733 
734 	path1->search_commit_root = 1;
735 	path1->skip_locking = 1;
736 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
737 				0, 0);
738 	if (ret < 0) {
739 		err = ret;
740 		goto out;
741 	}
742 	ASSERT(ret);
743 	ASSERT(path1->slots[0]);
744 
745 	path1->slots[0]--;
746 
747 	WARN_ON(cur->checked);
748 	if (!list_empty(&cur->upper)) {
749 		/*
750 		 * the backref was added previously when processing
751 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
752 		 */
753 		ASSERT(list_is_singular(&cur->upper));
754 		edge = list_entry(cur->upper.next, struct backref_edge,
755 				  list[LOWER]);
756 		ASSERT(list_empty(&edge->list[UPPER]));
757 		exist = edge->node[UPPER];
758 		/*
759 		 * add the upper level block to pending list if we need
760 		 * check its backrefs
761 		 */
762 		if (!exist->checked)
763 			list_add_tail(&edge->list[UPPER], &list);
764 	} else {
765 		exist = NULL;
766 	}
767 
768 	while (1) {
769 		cond_resched();
770 		eb = path1->nodes[0];
771 
772 		if (ptr >= end) {
773 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
774 				ret = btrfs_next_leaf(rc->extent_root, path1);
775 				if (ret < 0) {
776 					err = ret;
777 					goto out;
778 				}
779 				if (ret > 0)
780 					break;
781 				eb = path1->nodes[0];
782 			}
783 
784 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
785 			if (key.objectid != cur->bytenr) {
786 				WARN_ON(exist);
787 				break;
788 			}
789 
790 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
791 			    key.type == BTRFS_METADATA_ITEM_KEY) {
792 				ret = find_inline_backref(eb, path1->slots[0],
793 							  &ptr, &end);
794 				if (ret)
795 					goto next;
796 			}
797 		}
798 
799 		if (ptr < end) {
800 			/* update key for inline back ref */
801 			struct btrfs_extent_inline_ref *iref;
802 			iref = (struct btrfs_extent_inline_ref *)ptr;
803 			key.type = btrfs_extent_inline_ref_type(eb, iref);
804 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
805 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
806 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
807 		}
808 
809 		if (exist &&
810 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
811 		      exist->owner == key.offset) ||
812 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
813 		      exist->bytenr == key.offset))) {
814 			exist = NULL;
815 			goto next;
816 		}
817 
818 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
819 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
820 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
821 			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
822 				struct btrfs_extent_ref_v0 *ref0;
823 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
824 						struct btrfs_extent_ref_v0);
825 				if (key.objectid == key.offset) {
826 					root = find_tree_root(rc, eb, ref0);
827 					if (root && !should_ignore_root(root))
828 						cur->root = root;
829 					else
830 						list_add(&cur->list, &useless);
831 					break;
832 				}
833 				if (is_cowonly_root(btrfs_ref_root_v0(eb,
834 								      ref0)))
835 					cur->cowonly = 1;
836 			}
837 #else
838 		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
839 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
840 #endif
841 			if (key.objectid == key.offset) {
842 				/*
843 				 * only root blocks of reloc trees use
844 				 * backref of this type.
845 				 */
846 				root = find_reloc_root(rc, cur->bytenr);
847 				ASSERT(root);
848 				cur->root = root;
849 				break;
850 			}
851 
852 			edge = alloc_backref_edge(cache);
853 			if (!edge) {
854 				err = -ENOMEM;
855 				goto out;
856 			}
857 			rb_node = tree_search(&cache->rb_root, key.offset);
858 			if (!rb_node) {
859 				upper = alloc_backref_node(cache);
860 				if (!upper) {
861 					free_backref_edge(cache, edge);
862 					err = -ENOMEM;
863 					goto out;
864 				}
865 				upper->bytenr = key.offset;
866 				upper->level = cur->level + 1;
867 				/*
868 				 *  backrefs for the upper level block isn't
869 				 *  cached, add the block to pending list
870 				 */
871 				list_add_tail(&edge->list[UPPER], &list);
872 			} else {
873 				upper = rb_entry(rb_node, struct backref_node,
874 						 rb_node);
875 				ASSERT(upper->checked);
876 				INIT_LIST_HEAD(&edge->list[UPPER]);
877 			}
878 			list_add_tail(&edge->list[LOWER], &cur->upper);
879 			edge->node[LOWER] = cur;
880 			edge->node[UPPER] = upper;
881 
882 			goto next;
883 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
884 			goto next;
885 		}
886 
887 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
888 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
889 		if (IS_ERR(root)) {
890 			err = PTR_ERR(root);
891 			goto out;
892 		}
893 
894 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
895 			cur->cowonly = 1;
896 
897 		if (btrfs_root_level(&root->root_item) == cur->level) {
898 			/* tree root */
899 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
900 			       cur->bytenr);
901 			if (should_ignore_root(root))
902 				list_add(&cur->list, &useless);
903 			else
904 				cur->root = root;
905 			break;
906 		}
907 
908 		level = cur->level + 1;
909 
910 		/*
911 		 * searching the tree to find upper level blocks
912 		 * reference the block.
913 		 */
914 		path2->search_commit_root = 1;
915 		path2->skip_locking = 1;
916 		path2->lowest_level = level;
917 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
918 		path2->lowest_level = 0;
919 		if (ret < 0) {
920 			err = ret;
921 			goto out;
922 		}
923 		if (ret > 0 && path2->slots[level] > 0)
924 			path2->slots[level]--;
925 
926 		eb = path2->nodes[level];
927 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
928 		    cur->bytenr) {
929 			btrfs_err(root->fs_info,
930 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
931 				  cur->bytenr, level - 1, root->objectid,
932 				  node_key->objectid, node_key->type,
933 				  node_key->offset);
934 			err = -ENOENT;
935 			goto out;
936 		}
937 		lower = cur;
938 		need_check = true;
939 		for (; level < BTRFS_MAX_LEVEL; level++) {
940 			if (!path2->nodes[level]) {
941 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
942 				       lower->bytenr);
943 				if (should_ignore_root(root))
944 					list_add(&lower->list, &useless);
945 				else
946 					lower->root = root;
947 				break;
948 			}
949 
950 			edge = alloc_backref_edge(cache);
951 			if (!edge) {
952 				err = -ENOMEM;
953 				goto out;
954 			}
955 
956 			eb = path2->nodes[level];
957 			rb_node = tree_search(&cache->rb_root, eb->start);
958 			if (!rb_node) {
959 				upper = alloc_backref_node(cache);
960 				if (!upper) {
961 					free_backref_edge(cache, edge);
962 					err = -ENOMEM;
963 					goto out;
964 				}
965 				upper->bytenr = eb->start;
966 				upper->owner = btrfs_header_owner(eb);
967 				upper->level = lower->level + 1;
968 				if (!test_bit(BTRFS_ROOT_REF_COWS,
969 					      &root->state))
970 					upper->cowonly = 1;
971 
972 				/*
973 				 * if we know the block isn't shared
974 				 * we can void checking its backrefs.
975 				 */
976 				if (btrfs_block_can_be_shared(root, eb))
977 					upper->checked = 0;
978 				else
979 					upper->checked = 1;
980 
981 				/*
982 				 * add the block to pending list if we
983 				 * need check its backrefs, we only do this once
984 				 * while walking up a tree as we will catch
985 				 * anything else later on.
986 				 */
987 				if (!upper->checked && need_check) {
988 					need_check = false;
989 					list_add_tail(&edge->list[UPPER],
990 						      &list);
991 				} else {
992 					if (upper->checked)
993 						need_check = true;
994 					INIT_LIST_HEAD(&edge->list[UPPER]);
995 				}
996 			} else {
997 				upper = rb_entry(rb_node, struct backref_node,
998 						 rb_node);
999 				ASSERT(upper->checked);
1000 				INIT_LIST_HEAD(&edge->list[UPPER]);
1001 				if (!upper->owner)
1002 					upper->owner = btrfs_header_owner(eb);
1003 			}
1004 			list_add_tail(&edge->list[LOWER], &lower->upper);
1005 			edge->node[LOWER] = lower;
1006 			edge->node[UPPER] = upper;
1007 
1008 			if (rb_node)
1009 				break;
1010 			lower = upper;
1011 			upper = NULL;
1012 		}
1013 		btrfs_release_path(path2);
1014 next:
1015 		if (ptr < end) {
1016 			ptr += btrfs_extent_inline_ref_size(key.type);
1017 			if (ptr >= end) {
1018 				WARN_ON(ptr > end);
1019 				ptr = 0;
1020 				end = 0;
1021 			}
1022 		}
1023 		if (ptr >= end)
1024 			path1->slots[0]++;
1025 	}
1026 	btrfs_release_path(path1);
1027 
1028 	cur->checked = 1;
1029 	WARN_ON(exist);
1030 
1031 	/* the pending list isn't empty, take the first block to process */
1032 	if (!list_empty(&list)) {
1033 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034 		list_del_init(&edge->list[UPPER]);
1035 		cur = edge->node[UPPER];
1036 		goto again;
1037 	}
1038 
1039 	/*
1040 	 * everything goes well, connect backref nodes and insert backref nodes
1041 	 * into the cache.
1042 	 */
1043 	ASSERT(node->checked);
1044 	cowonly = node->cowonly;
1045 	if (!cowonly) {
1046 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1047 				      &node->rb_node);
1048 		if (rb_node)
1049 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1050 		list_add_tail(&node->lower, &cache->leaves);
1051 	}
1052 
1053 	list_for_each_entry(edge, &node->upper, list[LOWER])
1054 		list_add_tail(&edge->list[UPPER], &list);
1055 
1056 	while (!list_empty(&list)) {
1057 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1058 		list_del_init(&edge->list[UPPER]);
1059 		upper = edge->node[UPPER];
1060 		if (upper->detached) {
1061 			list_del(&edge->list[LOWER]);
1062 			lower = edge->node[LOWER];
1063 			free_backref_edge(cache, edge);
1064 			if (list_empty(&lower->upper))
1065 				list_add(&lower->list, &useless);
1066 			continue;
1067 		}
1068 
1069 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1070 			if (upper->lowest) {
1071 				list_del_init(&upper->lower);
1072 				upper->lowest = 0;
1073 			}
1074 
1075 			list_add_tail(&edge->list[UPPER], &upper->lower);
1076 			continue;
1077 		}
1078 
1079 		if (!upper->checked) {
1080 			/*
1081 			 * Still want to blow up for developers since this is a
1082 			 * logic bug.
1083 			 */
1084 			ASSERT(0);
1085 			err = -EINVAL;
1086 			goto out;
1087 		}
1088 		if (cowonly != upper->cowonly) {
1089 			ASSERT(0);
1090 			err = -EINVAL;
1091 			goto out;
1092 		}
1093 
1094 		if (!cowonly) {
1095 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1096 					      &upper->rb_node);
1097 			if (rb_node)
1098 				backref_tree_panic(rb_node, -EEXIST,
1099 						   upper->bytenr);
1100 		}
1101 
1102 		list_add_tail(&edge->list[UPPER], &upper->lower);
1103 
1104 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1105 			list_add_tail(&edge->list[UPPER], &list);
1106 	}
1107 	/*
1108 	 * process useless backref nodes. backref nodes for tree leaves
1109 	 * are deleted from the cache. backref nodes for upper level
1110 	 * tree blocks are left in the cache to avoid unnecessary backref
1111 	 * lookup.
1112 	 */
1113 	while (!list_empty(&useless)) {
1114 		upper = list_entry(useless.next, struct backref_node, list);
1115 		list_del_init(&upper->list);
1116 		ASSERT(list_empty(&upper->upper));
1117 		if (upper == node)
1118 			node = NULL;
1119 		if (upper->lowest) {
1120 			list_del_init(&upper->lower);
1121 			upper->lowest = 0;
1122 		}
1123 		while (!list_empty(&upper->lower)) {
1124 			edge = list_entry(upper->lower.next,
1125 					  struct backref_edge, list[UPPER]);
1126 			list_del(&edge->list[UPPER]);
1127 			list_del(&edge->list[LOWER]);
1128 			lower = edge->node[LOWER];
1129 			free_backref_edge(cache, edge);
1130 
1131 			if (list_empty(&lower->upper))
1132 				list_add(&lower->list, &useless);
1133 		}
1134 		__mark_block_processed(rc, upper);
1135 		if (upper->level > 0) {
1136 			list_add(&upper->list, &cache->detached);
1137 			upper->detached = 1;
1138 		} else {
1139 			rb_erase(&upper->rb_node, &cache->rb_root);
1140 			free_backref_node(cache, upper);
1141 		}
1142 	}
1143 out:
1144 	btrfs_free_path(path1);
1145 	btrfs_free_path(path2);
1146 	if (err) {
1147 		while (!list_empty(&useless)) {
1148 			lower = list_entry(useless.next,
1149 					   struct backref_node, list);
1150 			list_del_init(&lower->list);
1151 		}
1152 		while (!list_empty(&list)) {
1153 			edge = list_first_entry(&list, struct backref_edge,
1154 						list[UPPER]);
1155 			list_del(&edge->list[UPPER]);
1156 			list_del(&edge->list[LOWER]);
1157 			lower = edge->node[LOWER];
1158 			upper = edge->node[UPPER];
1159 			free_backref_edge(cache, edge);
1160 
1161 			/*
1162 			 * Lower is no longer linked to any upper backref nodes
1163 			 * and isn't in the cache, we can free it ourselves.
1164 			 */
1165 			if (list_empty(&lower->upper) &&
1166 			    RB_EMPTY_NODE(&lower->rb_node))
1167 				list_add(&lower->list, &useless);
1168 
1169 			if (!RB_EMPTY_NODE(&upper->rb_node))
1170 				continue;
1171 
1172 			/* Add this guy's upper edges to the list to process */
1173 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1174 				list_add_tail(&edge->list[UPPER], &list);
1175 			if (list_empty(&upper->upper))
1176 				list_add(&upper->list, &useless);
1177 		}
1178 
1179 		while (!list_empty(&useless)) {
1180 			lower = list_entry(useless.next,
1181 					   struct backref_node, list);
1182 			list_del_init(&lower->list);
1183 			if (lower == node)
1184 				node = NULL;
1185 			free_backref_node(cache, lower);
1186 		}
1187 
1188 		free_backref_node(cache, node);
1189 		return ERR_PTR(err);
1190 	}
1191 	ASSERT(!node || !node->detached);
1192 	return node;
1193 }
1194 
1195 /*
1196  * helper to add backref node for the newly created snapshot.
1197  * the backref node is created by cloning backref node that
1198  * corresponds to root of source tree
1199  */
1200 static int clone_backref_node(struct btrfs_trans_handle *trans,
1201 			      struct reloc_control *rc,
1202 			      struct btrfs_root *src,
1203 			      struct btrfs_root *dest)
1204 {
1205 	struct btrfs_root *reloc_root = src->reloc_root;
1206 	struct backref_cache *cache = &rc->backref_cache;
1207 	struct backref_node *node = NULL;
1208 	struct backref_node *new_node;
1209 	struct backref_edge *edge;
1210 	struct backref_edge *new_edge;
1211 	struct rb_node *rb_node;
1212 
1213 	if (cache->last_trans > 0)
1214 		update_backref_cache(trans, cache);
1215 
1216 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1217 	if (rb_node) {
1218 		node = rb_entry(rb_node, struct backref_node, rb_node);
1219 		if (node->detached)
1220 			node = NULL;
1221 		else
1222 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1223 	}
1224 
1225 	if (!node) {
1226 		rb_node = tree_search(&cache->rb_root,
1227 				      reloc_root->commit_root->start);
1228 		if (rb_node) {
1229 			node = rb_entry(rb_node, struct backref_node,
1230 					rb_node);
1231 			BUG_ON(node->detached);
1232 		}
1233 	}
1234 
1235 	if (!node)
1236 		return 0;
1237 
1238 	new_node = alloc_backref_node(cache);
1239 	if (!new_node)
1240 		return -ENOMEM;
1241 
1242 	new_node->bytenr = dest->node->start;
1243 	new_node->level = node->level;
1244 	new_node->lowest = node->lowest;
1245 	new_node->checked = 1;
1246 	new_node->root = dest;
1247 
1248 	if (!node->lowest) {
1249 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1250 			new_edge = alloc_backref_edge(cache);
1251 			if (!new_edge)
1252 				goto fail;
1253 
1254 			new_edge->node[UPPER] = new_node;
1255 			new_edge->node[LOWER] = edge->node[LOWER];
1256 			list_add_tail(&new_edge->list[UPPER],
1257 				      &new_node->lower);
1258 		}
1259 	} else {
1260 		list_add_tail(&new_node->lower, &cache->leaves);
1261 	}
1262 
1263 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1264 			      &new_node->rb_node);
1265 	if (rb_node)
1266 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1267 
1268 	if (!new_node->lowest) {
1269 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1270 			list_add_tail(&new_edge->list[LOWER],
1271 				      &new_edge->node[LOWER]->upper);
1272 		}
1273 	}
1274 	return 0;
1275 fail:
1276 	while (!list_empty(&new_node->lower)) {
1277 		new_edge = list_entry(new_node->lower.next,
1278 				      struct backref_edge, list[UPPER]);
1279 		list_del(&new_edge->list[UPPER]);
1280 		free_backref_edge(cache, new_edge);
1281 	}
1282 	free_backref_node(cache, new_node);
1283 	return -ENOMEM;
1284 }
1285 
1286 /*
1287  * helper to add 'address of tree root -> reloc tree' mapping
1288  */
1289 static int __must_check __add_reloc_root(struct btrfs_root *root)
1290 {
1291 	struct btrfs_fs_info *fs_info = root->fs_info;
1292 	struct rb_node *rb_node;
1293 	struct mapping_node *node;
1294 	struct reloc_control *rc = fs_info->reloc_ctl;
1295 
1296 	node = kmalloc(sizeof(*node), GFP_NOFS);
1297 	if (!node)
1298 		return -ENOMEM;
1299 
1300 	node->bytenr = root->node->start;
1301 	node->data = root;
1302 
1303 	spin_lock(&rc->reloc_root_tree.lock);
1304 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1305 			      node->bytenr, &node->rb_node);
1306 	spin_unlock(&rc->reloc_root_tree.lock);
1307 	if (rb_node) {
1308 		btrfs_panic(fs_info, -EEXIST,
1309 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1310 			    node->bytenr);
1311 		kfree(node);
1312 		return -EEXIST;
1313 	}
1314 
1315 	list_add_tail(&root->root_list, &rc->reloc_roots);
1316 	return 0;
1317 }
1318 
1319 /*
1320  * helper to delete the 'address of tree root -> reloc tree'
1321  * mapping
1322  */
1323 static void __del_reloc_root(struct btrfs_root *root)
1324 {
1325 	struct btrfs_fs_info *fs_info = root->fs_info;
1326 	struct rb_node *rb_node;
1327 	struct mapping_node *node = NULL;
1328 	struct reloc_control *rc = fs_info->reloc_ctl;
1329 
1330 	spin_lock(&rc->reloc_root_tree.lock);
1331 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1332 			      root->node->start);
1333 	if (rb_node) {
1334 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1335 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1336 	}
1337 	spin_unlock(&rc->reloc_root_tree.lock);
1338 
1339 	if (!node)
1340 		return;
1341 	BUG_ON((struct btrfs_root *)node->data != root);
1342 
1343 	spin_lock(&fs_info->trans_lock);
1344 	list_del_init(&root->root_list);
1345 	spin_unlock(&fs_info->trans_lock);
1346 	kfree(node);
1347 }
1348 
1349 /*
1350  * helper to update the 'address of tree root -> reloc tree'
1351  * mapping
1352  */
1353 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1354 {
1355 	struct btrfs_fs_info *fs_info = root->fs_info;
1356 	struct rb_node *rb_node;
1357 	struct mapping_node *node = NULL;
1358 	struct reloc_control *rc = fs_info->reloc_ctl;
1359 
1360 	spin_lock(&rc->reloc_root_tree.lock);
1361 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1362 			      root->node->start);
1363 	if (rb_node) {
1364 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1365 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1366 	}
1367 	spin_unlock(&rc->reloc_root_tree.lock);
1368 
1369 	if (!node)
1370 		return 0;
1371 	BUG_ON((struct btrfs_root *)node->data != root);
1372 
1373 	spin_lock(&rc->reloc_root_tree.lock);
1374 	node->bytenr = new_bytenr;
1375 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1376 			      node->bytenr, &node->rb_node);
1377 	spin_unlock(&rc->reloc_root_tree.lock);
1378 	if (rb_node)
1379 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1380 	return 0;
1381 }
1382 
1383 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1384 					struct btrfs_root *root, u64 objectid)
1385 {
1386 	struct btrfs_fs_info *fs_info = root->fs_info;
1387 	struct btrfs_root *reloc_root;
1388 	struct extent_buffer *eb;
1389 	struct btrfs_root_item *root_item;
1390 	struct btrfs_key root_key;
1391 	int ret;
1392 
1393 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1394 	BUG_ON(!root_item);
1395 
1396 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1397 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1398 	root_key.offset = objectid;
1399 
1400 	if (root->root_key.objectid == objectid) {
1401 		u64 commit_root_gen;
1402 
1403 		/* called by btrfs_init_reloc_root */
1404 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1405 				      BTRFS_TREE_RELOC_OBJECTID);
1406 		BUG_ON(ret);
1407 		/*
1408 		 * Set the last_snapshot field to the generation of the commit
1409 		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1410 		 * correctly (returns true) when the relocation root is created
1411 		 * either inside the critical section of a transaction commit
1412 		 * (through transaction.c:qgroup_account_snapshot()) and when
1413 		 * it's created before the transaction commit is started.
1414 		 */
1415 		commit_root_gen = btrfs_header_generation(root->commit_root);
1416 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1417 	} else {
1418 		/*
1419 		 * called by btrfs_reloc_post_snapshot_hook.
1420 		 * the source tree is a reloc tree, all tree blocks
1421 		 * modified after it was created have RELOC flag
1422 		 * set in their headers. so it's OK to not update
1423 		 * the 'last_snapshot'.
1424 		 */
1425 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1426 				      BTRFS_TREE_RELOC_OBJECTID);
1427 		BUG_ON(ret);
1428 	}
1429 
1430 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1431 	btrfs_set_root_bytenr(root_item, eb->start);
1432 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1433 	btrfs_set_root_generation(root_item, trans->transid);
1434 
1435 	if (root->root_key.objectid == objectid) {
1436 		btrfs_set_root_refs(root_item, 0);
1437 		memset(&root_item->drop_progress, 0,
1438 		       sizeof(struct btrfs_disk_key));
1439 		root_item->drop_level = 0;
1440 	}
1441 
1442 	btrfs_tree_unlock(eb);
1443 	free_extent_buffer(eb);
1444 
1445 	ret = btrfs_insert_root(trans, fs_info->tree_root,
1446 				&root_key, root_item);
1447 	BUG_ON(ret);
1448 	kfree(root_item);
1449 
1450 	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1451 	BUG_ON(IS_ERR(reloc_root));
1452 	reloc_root->last_trans = trans->transid;
1453 	return reloc_root;
1454 }
1455 
1456 /*
1457  * create reloc tree for a given fs tree. reloc tree is just a
1458  * snapshot of the fs tree with special root objectid.
1459  */
1460 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1461 			  struct btrfs_root *root)
1462 {
1463 	struct btrfs_fs_info *fs_info = root->fs_info;
1464 	struct btrfs_root *reloc_root;
1465 	struct reloc_control *rc = fs_info->reloc_ctl;
1466 	struct btrfs_block_rsv *rsv;
1467 	int clear_rsv = 0;
1468 	int ret;
1469 
1470 	if (root->reloc_root) {
1471 		reloc_root = root->reloc_root;
1472 		reloc_root->last_trans = trans->transid;
1473 		return 0;
1474 	}
1475 
1476 	if (!rc || !rc->create_reloc_tree ||
1477 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1478 		return 0;
1479 
1480 	if (!trans->reloc_reserved) {
1481 		rsv = trans->block_rsv;
1482 		trans->block_rsv = rc->block_rsv;
1483 		clear_rsv = 1;
1484 	}
1485 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1486 	if (clear_rsv)
1487 		trans->block_rsv = rsv;
1488 
1489 	ret = __add_reloc_root(reloc_root);
1490 	BUG_ON(ret < 0);
1491 	root->reloc_root = reloc_root;
1492 	return 0;
1493 }
1494 
1495 /*
1496  * update root item of reloc tree
1497  */
1498 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1499 			    struct btrfs_root *root)
1500 {
1501 	struct btrfs_fs_info *fs_info = root->fs_info;
1502 	struct btrfs_root *reloc_root;
1503 	struct btrfs_root_item *root_item;
1504 	int ret;
1505 
1506 	if (!root->reloc_root)
1507 		goto out;
1508 
1509 	reloc_root = root->reloc_root;
1510 	root_item = &reloc_root->root_item;
1511 
1512 	if (fs_info->reloc_ctl->merge_reloc_tree &&
1513 	    btrfs_root_refs(root_item) == 0) {
1514 		root->reloc_root = NULL;
1515 		__del_reloc_root(reloc_root);
1516 	}
1517 
1518 	if (reloc_root->commit_root != reloc_root->node) {
1519 		btrfs_set_root_node(root_item, reloc_root->node);
1520 		free_extent_buffer(reloc_root->commit_root);
1521 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1522 	}
1523 
1524 	ret = btrfs_update_root(trans, fs_info->tree_root,
1525 				&reloc_root->root_key, root_item);
1526 	BUG_ON(ret);
1527 
1528 out:
1529 	return 0;
1530 }
1531 
1532 /*
1533  * helper to find first cached inode with inode number >= objectid
1534  * in a subvolume
1535  */
1536 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1537 {
1538 	struct rb_node *node;
1539 	struct rb_node *prev;
1540 	struct btrfs_inode *entry;
1541 	struct inode *inode;
1542 
1543 	spin_lock(&root->inode_lock);
1544 again:
1545 	node = root->inode_tree.rb_node;
1546 	prev = NULL;
1547 	while (node) {
1548 		prev = node;
1549 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1550 
1551 		if (objectid < btrfs_ino(entry))
1552 			node = node->rb_left;
1553 		else if (objectid > btrfs_ino(entry))
1554 			node = node->rb_right;
1555 		else
1556 			break;
1557 	}
1558 	if (!node) {
1559 		while (prev) {
1560 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1561 			if (objectid <= btrfs_ino(entry)) {
1562 				node = prev;
1563 				break;
1564 			}
1565 			prev = rb_next(prev);
1566 		}
1567 	}
1568 	while (node) {
1569 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1570 		inode = igrab(&entry->vfs_inode);
1571 		if (inode) {
1572 			spin_unlock(&root->inode_lock);
1573 			return inode;
1574 		}
1575 
1576 		objectid = btrfs_ino(entry) + 1;
1577 		if (cond_resched_lock(&root->inode_lock))
1578 			goto again;
1579 
1580 		node = rb_next(node);
1581 	}
1582 	spin_unlock(&root->inode_lock);
1583 	return NULL;
1584 }
1585 
1586 static int in_block_group(u64 bytenr,
1587 			  struct btrfs_block_group_cache *block_group)
1588 {
1589 	if (bytenr >= block_group->key.objectid &&
1590 	    bytenr < block_group->key.objectid + block_group->key.offset)
1591 		return 1;
1592 	return 0;
1593 }
1594 
1595 /*
1596  * get new location of data
1597  */
1598 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1599 			    u64 bytenr, u64 num_bytes)
1600 {
1601 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1602 	struct btrfs_path *path;
1603 	struct btrfs_file_extent_item *fi;
1604 	struct extent_buffer *leaf;
1605 	int ret;
1606 
1607 	path = btrfs_alloc_path();
1608 	if (!path)
1609 		return -ENOMEM;
1610 
1611 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1612 	ret = btrfs_lookup_file_extent(NULL, root, path,
1613 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1614 	if (ret < 0)
1615 		goto out;
1616 	if (ret > 0) {
1617 		ret = -ENOENT;
1618 		goto out;
1619 	}
1620 
1621 	leaf = path->nodes[0];
1622 	fi = btrfs_item_ptr(leaf, path->slots[0],
1623 			    struct btrfs_file_extent_item);
1624 
1625 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1626 	       btrfs_file_extent_compression(leaf, fi) ||
1627 	       btrfs_file_extent_encryption(leaf, fi) ||
1628 	       btrfs_file_extent_other_encoding(leaf, fi));
1629 
1630 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1631 		ret = -EINVAL;
1632 		goto out;
1633 	}
1634 
1635 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1636 	ret = 0;
1637 out:
1638 	btrfs_free_path(path);
1639 	return ret;
1640 }
1641 
1642 /*
1643  * update file extent items in the tree leaf to point to
1644  * the new locations.
1645  */
1646 static noinline_for_stack
1647 int replace_file_extents(struct btrfs_trans_handle *trans,
1648 			 struct reloc_control *rc,
1649 			 struct btrfs_root *root,
1650 			 struct extent_buffer *leaf)
1651 {
1652 	struct btrfs_fs_info *fs_info = root->fs_info;
1653 	struct btrfs_key key;
1654 	struct btrfs_file_extent_item *fi;
1655 	struct inode *inode = NULL;
1656 	u64 parent;
1657 	u64 bytenr;
1658 	u64 new_bytenr = 0;
1659 	u64 num_bytes;
1660 	u64 end;
1661 	u32 nritems;
1662 	u32 i;
1663 	int ret = 0;
1664 	int first = 1;
1665 	int dirty = 0;
1666 
1667 	if (rc->stage != UPDATE_DATA_PTRS)
1668 		return 0;
1669 
1670 	/* reloc trees always use full backref */
1671 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1672 		parent = leaf->start;
1673 	else
1674 		parent = 0;
1675 
1676 	nritems = btrfs_header_nritems(leaf);
1677 	for (i = 0; i < nritems; i++) {
1678 		cond_resched();
1679 		btrfs_item_key_to_cpu(leaf, &key, i);
1680 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1681 			continue;
1682 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1683 		if (btrfs_file_extent_type(leaf, fi) ==
1684 		    BTRFS_FILE_EXTENT_INLINE)
1685 			continue;
1686 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1687 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1688 		if (bytenr == 0)
1689 			continue;
1690 		if (!in_block_group(bytenr, rc->block_group))
1691 			continue;
1692 
1693 		/*
1694 		 * if we are modifying block in fs tree, wait for readpage
1695 		 * to complete and drop the extent cache
1696 		 */
1697 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1698 			if (first) {
1699 				inode = find_next_inode(root, key.objectid);
1700 				first = 0;
1701 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1702 				btrfs_add_delayed_iput(inode);
1703 				inode = find_next_inode(root, key.objectid);
1704 			}
1705 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1706 				end = key.offset +
1707 				      btrfs_file_extent_num_bytes(leaf, fi);
1708 				WARN_ON(!IS_ALIGNED(key.offset,
1709 						    fs_info->sectorsize));
1710 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1711 				end--;
1712 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1713 						      key.offset, end);
1714 				if (!ret)
1715 					continue;
1716 
1717 				btrfs_drop_extent_cache(BTRFS_I(inode),
1718 						key.offset,	end, 1);
1719 				unlock_extent(&BTRFS_I(inode)->io_tree,
1720 					      key.offset, end);
1721 			}
1722 		}
1723 
1724 		ret = get_new_location(rc->data_inode, &new_bytenr,
1725 				       bytenr, num_bytes);
1726 		if (ret) {
1727 			/*
1728 			 * Don't have to abort since we've not changed anything
1729 			 * in the file extent yet.
1730 			 */
1731 			break;
1732 		}
1733 
1734 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1735 		dirty = 1;
1736 
1737 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1738 		ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1739 					   num_bytes, parent,
1740 					   btrfs_header_owner(leaf),
1741 					   key.objectid, key.offset);
1742 		if (ret) {
1743 			btrfs_abort_transaction(trans, ret);
1744 			break;
1745 		}
1746 
1747 		ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
1748 					parent, btrfs_header_owner(leaf),
1749 					key.objectid, key.offset);
1750 		if (ret) {
1751 			btrfs_abort_transaction(trans, ret);
1752 			break;
1753 		}
1754 	}
1755 	if (dirty)
1756 		btrfs_mark_buffer_dirty(leaf);
1757 	if (inode)
1758 		btrfs_add_delayed_iput(inode);
1759 	return ret;
1760 }
1761 
1762 static noinline_for_stack
1763 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1764 		     struct btrfs_path *path, int level)
1765 {
1766 	struct btrfs_disk_key key1;
1767 	struct btrfs_disk_key key2;
1768 	btrfs_node_key(eb, &key1, slot);
1769 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1770 	return memcmp(&key1, &key2, sizeof(key1));
1771 }
1772 
1773 /*
1774  * try to replace tree blocks in fs tree with the new blocks
1775  * in reloc tree. tree blocks haven't been modified since the
1776  * reloc tree was create can be replaced.
1777  *
1778  * if a block was replaced, level of the block + 1 is returned.
1779  * if no block got replaced, 0 is returned. if there are other
1780  * errors, a negative error number is returned.
1781  */
1782 static noinline_for_stack
1783 int replace_path(struct btrfs_trans_handle *trans,
1784 		 struct btrfs_root *dest, struct btrfs_root *src,
1785 		 struct btrfs_path *path, struct btrfs_key *next_key,
1786 		 int lowest_level, int max_level)
1787 {
1788 	struct btrfs_fs_info *fs_info = dest->fs_info;
1789 	struct extent_buffer *eb;
1790 	struct extent_buffer *parent;
1791 	struct btrfs_key key;
1792 	u64 old_bytenr;
1793 	u64 new_bytenr;
1794 	u64 old_ptr_gen;
1795 	u64 new_ptr_gen;
1796 	u64 last_snapshot;
1797 	u32 blocksize;
1798 	int cow = 0;
1799 	int level;
1800 	int ret;
1801 	int slot;
1802 
1803 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1804 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1805 
1806 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1807 again:
1808 	slot = path->slots[lowest_level];
1809 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1810 
1811 	eb = btrfs_lock_root_node(dest);
1812 	btrfs_set_lock_blocking(eb);
1813 	level = btrfs_header_level(eb);
1814 
1815 	if (level < lowest_level) {
1816 		btrfs_tree_unlock(eb);
1817 		free_extent_buffer(eb);
1818 		return 0;
1819 	}
1820 
1821 	if (cow) {
1822 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1823 		BUG_ON(ret);
1824 	}
1825 	btrfs_set_lock_blocking(eb);
1826 
1827 	if (next_key) {
1828 		next_key->objectid = (u64)-1;
1829 		next_key->type = (u8)-1;
1830 		next_key->offset = (u64)-1;
1831 	}
1832 
1833 	parent = eb;
1834 	while (1) {
1835 		level = btrfs_header_level(parent);
1836 		BUG_ON(level < lowest_level);
1837 
1838 		ret = btrfs_bin_search(parent, &key, level, &slot);
1839 		if (ret && slot > 0)
1840 			slot--;
1841 
1842 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1843 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1844 
1845 		old_bytenr = btrfs_node_blockptr(parent, slot);
1846 		blocksize = fs_info->nodesize;
1847 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1848 
1849 		if (level <= max_level) {
1850 			eb = path->nodes[level];
1851 			new_bytenr = btrfs_node_blockptr(eb,
1852 							path->slots[level]);
1853 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1854 							path->slots[level]);
1855 		} else {
1856 			new_bytenr = 0;
1857 			new_ptr_gen = 0;
1858 		}
1859 
1860 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1861 			ret = level;
1862 			break;
1863 		}
1864 
1865 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1866 		    memcmp_node_keys(parent, slot, path, level)) {
1867 			if (level <= lowest_level) {
1868 				ret = 0;
1869 				break;
1870 			}
1871 
1872 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
1873 			if (IS_ERR(eb)) {
1874 				ret = PTR_ERR(eb);
1875 				break;
1876 			} else if (!extent_buffer_uptodate(eb)) {
1877 				ret = -EIO;
1878 				free_extent_buffer(eb);
1879 				break;
1880 			}
1881 			btrfs_tree_lock(eb);
1882 			if (cow) {
1883 				ret = btrfs_cow_block(trans, dest, eb, parent,
1884 						      slot, &eb);
1885 				BUG_ON(ret);
1886 			}
1887 			btrfs_set_lock_blocking(eb);
1888 
1889 			btrfs_tree_unlock(parent);
1890 			free_extent_buffer(parent);
1891 
1892 			parent = eb;
1893 			continue;
1894 		}
1895 
1896 		if (!cow) {
1897 			btrfs_tree_unlock(parent);
1898 			free_extent_buffer(parent);
1899 			cow = 1;
1900 			goto again;
1901 		}
1902 
1903 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1904 				      path->slots[level]);
1905 		btrfs_release_path(path);
1906 
1907 		path->lowest_level = level;
1908 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1909 		path->lowest_level = 0;
1910 		BUG_ON(ret);
1911 
1912 		/*
1913 		 * Info qgroup to trace both subtrees.
1914 		 *
1915 		 * We must trace both trees.
1916 		 * 1) Tree reloc subtree
1917 		 *    If not traced, we will leak data numbers
1918 		 * 2) Fs subtree
1919 		 *    If not traced, we will double count old data
1920 		 *    and tree block numbers, if current trans doesn't free
1921 		 *    data reloc tree inode.
1922 		 */
1923 		ret = btrfs_qgroup_trace_subtree(trans, src, parent,
1924 				btrfs_header_generation(parent),
1925 				btrfs_header_level(parent));
1926 		if (ret < 0)
1927 			break;
1928 		ret = btrfs_qgroup_trace_subtree(trans, dest,
1929 				path->nodes[level],
1930 				btrfs_header_generation(path->nodes[level]),
1931 				btrfs_header_level(path->nodes[level]));
1932 		if (ret < 0)
1933 			break;
1934 
1935 		/*
1936 		 * swap blocks in fs tree and reloc tree.
1937 		 */
1938 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1939 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1940 		btrfs_mark_buffer_dirty(parent);
1941 
1942 		btrfs_set_node_blockptr(path->nodes[level],
1943 					path->slots[level], old_bytenr);
1944 		btrfs_set_node_ptr_generation(path->nodes[level],
1945 					      path->slots[level], old_ptr_gen);
1946 		btrfs_mark_buffer_dirty(path->nodes[level]);
1947 
1948 		ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr,
1949 					blocksize, path->nodes[level]->start,
1950 					src->root_key.objectid, level - 1, 0);
1951 		BUG_ON(ret);
1952 		ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1953 					blocksize, 0, dest->root_key.objectid,
1954 					level - 1, 0);
1955 		BUG_ON(ret);
1956 
1957 		ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize,
1958 					path->nodes[level]->start,
1959 					src->root_key.objectid, level - 1, 0);
1960 		BUG_ON(ret);
1961 
1962 		ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize,
1963 					0, dest->root_key.objectid, level - 1,
1964 					0);
1965 		BUG_ON(ret);
1966 
1967 		btrfs_unlock_up_safe(path, 0);
1968 
1969 		ret = level;
1970 		break;
1971 	}
1972 	btrfs_tree_unlock(parent);
1973 	free_extent_buffer(parent);
1974 	return ret;
1975 }
1976 
1977 /*
1978  * helper to find next relocated block in reloc tree
1979  */
1980 static noinline_for_stack
1981 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1982 		       int *level)
1983 {
1984 	struct extent_buffer *eb;
1985 	int i;
1986 	u64 last_snapshot;
1987 	u32 nritems;
1988 
1989 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1990 
1991 	for (i = 0; i < *level; i++) {
1992 		free_extent_buffer(path->nodes[i]);
1993 		path->nodes[i] = NULL;
1994 	}
1995 
1996 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1997 		eb = path->nodes[i];
1998 		nritems = btrfs_header_nritems(eb);
1999 		while (path->slots[i] + 1 < nritems) {
2000 			path->slots[i]++;
2001 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2002 			    last_snapshot)
2003 				continue;
2004 
2005 			*level = i;
2006 			return 0;
2007 		}
2008 		free_extent_buffer(path->nodes[i]);
2009 		path->nodes[i] = NULL;
2010 	}
2011 	return 1;
2012 }
2013 
2014 /*
2015  * walk down reloc tree to find relocated block of lowest level
2016  */
2017 static noinline_for_stack
2018 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2019 			 int *level)
2020 {
2021 	struct btrfs_fs_info *fs_info = root->fs_info;
2022 	struct extent_buffer *eb = NULL;
2023 	int i;
2024 	u64 bytenr;
2025 	u64 ptr_gen = 0;
2026 	u64 last_snapshot;
2027 	u32 nritems;
2028 
2029 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2030 
2031 	for (i = *level; i > 0; i--) {
2032 		eb = path->nodes[i];
2033 		nritems = btrfs_header_nritems(eb);
2034 		while (path->slots[i] < nritems) {
2035 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2036 			if (ptr_gen > last_snapshot)
2037 				break;
2038 			path->slots[i]++;
2039 		}
2040 		if (path->slots[i] >= nritems) {
2041 			if (i == *level)
2042 				break;
2043 			*level = i + 1;
2044 			return 0;
2045 		}
2046 		if (i == 1) {
2047 			*level = i;
2048 			return 0;
2049 		}
2050 
2051 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2052 		eb = read_tree_block(fs_info, bytenr, ptr_gen);
2053 		if (IS_ERR(eb)) {
2054 			return PTR_ERR(eb);
2055 		} else if (!extent_buffer_uptodate(eb)) {
2056 			free_extent_buffer(eb);
2057 			return -EIO;
2058 		}
2059 		BUG_ON(btrfs_header_level(eb) != i - 1);
2060 		path->nodes[i - 1] = eb;
2061 		path->slots[i - 1] = 0;
2062 	}
2063 	return 1;
2064 }
2065 
2066 /*
2067  * invalidate extent cache for file extents whose key in range of
2068  * [min_key, max_key)
2069  */
2070 static int invalidate_extent_cache(struct btrfs_root *root,
2071 				   struct btrfs_key *min_key,
2072 				   struct btrfs_key *max_key)
2073 {
2074 	struct btrfs_fs_info *fs_info = root->fs_info;
2075 	struct inode *inode = NULL;
2076 	u64 objectid;
2077 	u64 start, end;
2078 	u64 ino;
2079 
2080 	objectid = min_key->objectid;
2081 	while (1) {
2082 		cond_resched();
2083 		iput(inode);
2084 
2085 		if (objectid > max_key->objectid)
2086 			break;
2087 
2088 		inode = find_next_inode(root, objectid);
2089 		if (!inode)
2090 			break;
2091 		ino = btrfs_ino(BTRFS_I(inode));
2092 
2093 		if (ino > max_key->objectid) {
2094 			iput(inode);
2095 			break;
2096 		}
2097 
2098 		objectid = ino + 1;
2099 		if (!S_ISREG(inode->i_mode))
2100 			continue;
2101 
2102 		if (unlikely(min_key->objectid == ino)) {
2103 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2104 				continue;
2105 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2106 				start = 0;
2107 			else {
2108 				start = min_key->offset;
2109 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2110 			}
2111 		} else {
2112 			start = 0;
2113 		}
2114 
2115 		if (unlikely(max_key->objectid == ino)) {
2116 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2117 				continue;
2118 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2119 				end = (u64)-1;
2120 			} else {
2121 				if (max_key->offset == 0)
2122 					continue;
2123 				end = max_key->offset;
2124 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2125 				end--;
2126 			}
2127 		} else {
2128 			end = (u64)-1;
2129 		}
2130 
2131 		/* the lock_extent waits for readpage to complete */
2132 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2133 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2134 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2135 	}
2136 	return 0;
2137 }
2138 
2139 static int find_next_key(struct btrfs_path *path, int level,
2140 			 struct btrfs_key *key)
2141 
2142 {
2143 	while (level < BTRFS_MAX_LEVEL) {
2144 		if (!path->nodes[level])
2145 			break;
2146 		if (path->slots[level] + 1 <
2147 		    btrfs_header_nritems(path->nodes[level])) {
2148 			btrfs_node_key_to_cpu(path->nodes[level], key,
2149 					      path->slots[level] + 1);
2150 			return 0;
2151 		}
2152 		level++;
2153 	}
2154 	return 1;
2155 }
2156 
2157 /*
2158  * merge the relocated tree blocks in reloc tree with corresponding
2159  * fs tree.
2160  */
2161 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2162 					       struct btrfs_root *root)
2163 {
2164 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2165 	LIST_HEAD(inode_list);
2166 	struct btrfs_key key;
2167 	struct btrfs_key next_key;
2168 	struct btrfs_trans_handle *trans = NULL;
2169 	struct btrfs_root *reloc_root;
2170 	struct btrfs_root_item *root_item;
2171 	struct btrfs_path *path;
2172 	struct extent_buffer *leaf;
2173 	int level;
2174 	int max_level;
2175 	int replaced = 0;
2176 	int ret;
2177 	int err = 0;
2178 	u32 min_reserved;
2179 
2180 	path = btrfs_alloc_path();
2181 	if (!path)
2182 		return -ENOMEM;
2183 	path->reada = READA_FORWARD;
2184 
2185 	reloc_root = root->reloc_root;
2186 	root_item = &reloc_root->root_item;
2187 
2188 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2189 		level = btrfs_root_level(root_item);
2190 		extent_buffer_get(reloc_root->node);
2191 		path->nodes[level] = reloc_root->node;
2192 		path->slots[level] = 0;
2193 	} else {
2194 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2195 
2196 		level = root_item->drop_level;
2197 		BUG_ON(level == 0);
2198 		path->lowest_level = level;
2199 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2200 		path->lowest_level = 0;
2201 		if (ret < 0) {
2202 			btrfs_free_path(path);
2203 			return ret;
2204 		}
2205 
2206 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2207 				      path->slots[level]);
2208 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2209 
2210 		btrfs_unlock_up_safe(path, 0);
2211 	}
2212 
2213 	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2214 	memset(&next_key, 0, sizeof(next_key));
2215 
2216 	while (1) {
2217 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2218 					     BTRFS_RESERVE_FLUSH_ALL);
2219 		if (ret) {
2220 			err = ret;
2221 			goto out;
2222 		}
2223 		trans = btrfs_start_transaction(root, 0);
2224 		if (IS_ERR(trans)) {
2225 			err = PTR_ERR(trans);
2226 			trans = NULL;
2227 			goto out;
2228 		}
2229 		trans->block_rsv = rc->block_rsv;
2230 
2231 		replaced = 0;
2232 		max_level = level;
2233 
2234 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2235 		if (ret < 0) {
2236 			err = ret;
2237 			goto out;
2238 		}
2239 		if (ret > 0)
2240 			break;
2241 
2242 		if (!find_next_key(path, level, &key) &&
2243 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2244 			ret = 0;
2245 		} else {
2246 			ret = replace_path(trans, root, reloc_root, path,
2247 					   &next_key, level, max_level);
2248 		}
2249 		if (ret < 0) {
2250 			err = ret;
2251 			goto out;
2252 		}
2253 
2254 		if (ret > 0) {
2255 			level = ret;
2256 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2257 					      path->slots[level]);
2258 			replaced = 1;
2259 		}
2260 
2261 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2262 		if (ret > 0)
2263 			break;
2264 
2265 		BUG_ON(level == 0);
2266 		/*
2267 		 * save the merging progress in the drop_progress.
2268 		 * this is OK since root refs == 1 in this case.
2269 		 */
2270 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2271 			       path->slots[level]);
2272 		root_item->drop_level = level;
2273 
2274 		btrfs_end_transaction_throttle(trans);
2275 		trans = NULL;
2276 
2277 		btrfs_btree_balance_dirty(fs_info);
2278 
2279 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2280 			invalidate_extent_cache(root, &key, &next_key);
2281 	}
2282 
2283 	/*
2284 	 * handle the case only one block in the fs tree need to be
2285 	 * relocated and the block is tree root.
2286 	 */
2287 	leaf = btrfs_lock_root_node(root);
2288 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2289 	btrfs_tree_unlock(leaf);
2290 	free_extent_buffer(leaf);
2291 	if (ret < 0)
2292 		err = ret;
2293 out:
2294 	btrfs_free_path(path);
2295 
2296 	if (err == 0) {
2297 		memset(&root_item->drop_progress, 0,
2298 		       sizeof(root_item->drop_progress));
2299 		root_item->drop_level = 0;
2300 		btrfs_set_root_refs(root_item, 0);
2301 		btrfs_update_reloc_root(trans, root);
2302 	}
2303 
2304 	if (trans)
2305 		btrfs_end_transaction_throttle(trans);
2306 
2307 	btrfs_btree_balance_dirty(fs_info);
2308 
2309 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2310 		invalidate_extent_cache(root, &key, &next_key);
2311 
2312 	return err;
2313 }
2314 
2315 static noinline_for_stack
2316 int prepare_to_merge(struct reloc_control *rc, int err)
2317 {
2318 	struct btrfs_root *root = rc->extent_root;
2319 	struct btrfs_fs_info *fs_info = root->fs_info;
2320 	struct btrfs_root *reloc_root;
2321 	struct btrfs_trans_handle *trans;
2322 	LIST_HEAD(reloc_roots);
2323 	u64 num_bytes = 0;
2324 	int ret;
2325 
2326 	mutex_lock(&fs_info->reloc_mutex);
2327 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2328 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2329 	mutex_unlock(&fs_info->reloc_mutex);
2330 
2331 again:
2332 	if (!err) {
2333 		num_bytes = rc->merging_rsv_size;
2334 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2335 					  BTRFS_RESERVE_FLUSH_ALL);
2336 		if (ret)
2337 			err = ret;
2338 	}
2339 
2340 	trans = btrfs_join_transaction(rc->extent_root);
2341 	if (IS_ERR(trans)) {
2342 		if (!err)
2343 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2344 						num_bytes);
2345 		return PTR_ERR(trans);
2346 	}
2347 
2348 	if (!err) {
2349 		if (num_bytes != rc->merging_rsv_size) {
2350 			btrfs_end_transaction(trans);
2351 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2352 						num_bytes);
2353 			goto again;
2354 		}
2355 	}
2356 
2357 	rc->merge_reloc_tree = 1;
2358 
2359 	while (!list_empty(&rc->reloc_roots)) {
2360 		reloc_root = list_entry(rc->reloc_roots.next,
2361 					struct btrfs_root, root_list);
2362 		list_del_init(&reloc_root->root_list);
2363 
2364 		root = read_fs_root(fs_info, reloc_root->root_key.offset);
2365 		BUG_ON(IS_ERR(root));
2366 		BUG_ON(root->reloc_root != reloc_root);
2367 
2368 		/*
2369 		 * set reference count to 1, so btrfs_recover_relocation
2370 		 * knows it should resumes merging
2371 		 */
2372 		if (!err)
2373 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2374 		btrfs_update_reloc_root(trans, root);
2375 
2376 		list_add(&reloc_root->root_list, &reloc_roots);
2377 	}
2378 
2379 	list_splice(&reloc_roots, &rc->reloc_roots);
2380 
2381 	if (!err)
2382 		btrfs_commit_transaction(trans);
2383 	else
2384 		btrfs_end_transaction(trans);
2385 	return err;
2386 }
2387 
2388 static noinline_for_stack
2389 void free_reloc_roots(struct list_head *list)
2390 {
2391 	struct btrfs_root *reloc_root;
2392 
2393 	while (!list_empty(list)) {
2394 		reloc_root = list_entry(list->next, struct btrfs_root,
2395 					root_list);
2396 		free_extent_buffer(reloc_root->node);
2397 		free_extent_buffer(reloc_root->commit_root);
2398 		reloc_root->node = NULL;
2399 		reloc_root->commit_root = NULL;
2400 		__del_reloc_root(reloc_root);
2401 	}
2402 }
2403 
2404 static noinline_for_stack
2405 void merge_reloc_roots(struct reloc_control *rc)
2406 {
2407 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2408 	struct btrfs_root *root;
2409 	struct btrfs_root *reloc_root;
2410 	LIST_HEAD(reloc_roots);
2411 	int found = 0;
2412 	int ret = 0;
2413 again:
2414 	root = rc->extent_root;
2415 
2416 	/*
2417 	 * this serializes us with btrfs_record_root_in_transaction,
2418 	 * we have to make sure nobody is in the middle of
2419 	 * adding their roots to the list while we are
2420 	 * doing this splice
2421 	 */
2422 	mutex_lock(&fs_info->reloc_mutex);
2423 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2424 	mutex_unlock(&fs_info->reloc_mutex);
2425 
2426 	while (!list_empty(&reloc_roots)) {
2427 		found = 1;
2428 		reloc_root = list_entry(reloc_roots.next,
2429 					struct btrfs_root, root_list);
2430 
2431 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2432 			root = read_fs_root(fs_info,
2433 					    reloc_root->root_key.offset);
2434 			BUG_ON(IS_ERR(root));
2435 			BUG_ON(root->reloc_root != reloc_root);
2436 
2437 			ret = merge_reloc_root(rc, root);
2438 			if (ret) {
2439 				if (list_empty(&reloc_root->root_list))
2440 					list_add_tail(&reloc_root->root_list,
2441 						      &reloc_roots);
2442 				goto out;
2443 			}
2444 		} else {
2445 			list_del_init(&reloc_root->root_list);
2446 		}
2447 
2448 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2449 		if (ret < 0) {
2450 			if (list_empty(&reloc_root->root_list))
2451 				list_add_tail(&reloc_root->root_list,
2452 					      &reloc_roots);
2453 			goto out;
2454 		}
2455 	}
2456 
2457 	if (found) {
2458 		found = 0;
2459 		goto again;
2460 	}
2461 out:
2462 	if (ret) {
2463 		btrfs_handle_fs_error(fs_info, ret, NULL);
2464 		if (!list_empty(&reloc_roots))
2465 			free_reloc_roots(&reloc_roots);
2466 
2467 		/* new reloc root may be added */
2468 		mutex_lock(&fs_info->reloc_mutex);
2469 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2470 		mutex_unlock(&fs_info->reloc_mutex);
2471 		if (!list_empty(&reloc_roots))
2472 			free_reloc_roots(&reloc_roots);
2473 	}
2474 
2475 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2476 }
2477 
2478 static void free_block_list(struct rb_root *blocks)
2479 {
2480 	struct tree_block *block;
2481 	struct rb_node *rb_node;
2482 	while ((rb_node = rb_first(blocks))) {
2483 		block = rb_entry(rb_node, struct tree_block, rb_node);
2484 		rb_erase(rb_node, blocks);
2485 		kfree(block);
2486 	}
2487 }
2488 
2489 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2490 				      struct btrfs_root *reloc_root)
2491 {
2492 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2493 	struct btrfs_root *root;
2494 
2495 	if (reloc_root->last_trans == trans->transid)
2496 		return 0;
2497 
2498 	root = read_fs_root(fs_info, reloc_root->root_key.offset);
2499 	BUG_ON(IS_ERR(root));
2500 	BUG_ON(root->reloc_root != reloc_root);
2501 
2502 	return btrfs_record_root_in_trans(trans, root);
2503 }
2504 
2505 static noinline_for_stack
2506 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2507 				     struct reloc_control *rc,
2508 				     struct backref_node *node,
2509 				     struct backref_edge *edges[])
2510 {
2511 	struct backref_node *next;
2512 	struct btrfs_root *root;
2513 	int index = 0;
2514 
2515 	next = node;
2516 	while (1) {
2517 		cond_resched();
2518 		next = walk_up_backref(next, edges, &index);
2519 		root = next->root;
2520 		BUG_ON(!root);
2521 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2522 
2523 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2524 			record_reloc_root_in_trans(trans, root);
2525 			break;
2526 		}
2527 
2528 		btrfs_record_root_in_trans(trans, root);
2529 		root = root->reloc_root;
2530 
2531 		if (next->new_bytenr != root->node->start) {
2532 			BUG_ON(next->new_bytenr);
2533 			BUG_ON(!list_empty(&next->list));
2534 			next->new_bytenr = root->node->start;
2535 			next->root = root;
2536 			list_add_tail(&next->list,
2537 				      &rc->backref_cache.changed);
2538 			__mark_block_processed(rc, next);
2539 			break;
2540 		}
2541 
2542 		WARN_ON(1);
2543 		root = NULL;
2544 		next = walk_down_backref(edges, &index);
2545 		if (!next || next->level <= node->level)
2546 			break;
2547 	}
2548 	if (!root)
2549 		return NULL;
2550 
2551 	next = node;
2552 	/* setup backref node path for btrfs_reloc_cow_block */
2553 	while (1) {
2554 		rc->backref_cache.path[next->level] = next;
2555 		if (--index < 0)
2556 			break;
2557 		next = edges[index]->node[UPPER];
2558 	}
2559 	return root;
2560 }
2561 
2562 /*
2563  * select a tree root for relocation. return NULL if the block
2564  * is reference counted. we should use do_relocation() in this
2565  * case. return a tree root pointer if the block isn't reference
2566  * counted. return -ENOENT if the block is root of reloc tree.
2567  */
2568 static noinline_for_stack
2569 struct btrfs_root *select_one_root(struct backref_node *node)
2570 {
2571 	struct backref_node *next;
2572 	struct btrfs_root *root;
2573 	struct btrfs_root *fs_root = NULL;
2574 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2575 	int index = 0;
2576 
2577 	next = node;
2578 	while (1) {
2579 		cond_resched();
2580 		next = walk_up_backref(next, edges, &index);
2581 		root = next->root;
2582 		BUG_ON(!root);
2583 
2584 		/* no other choice for non-references counted tree */
2585 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2586 			return root;
2587 
2588 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2589 			fs_root = root;
2590 
2591 		if (next != node)
2592 			return NULL;
2593 
2594 		next = walk_down_backref(edges, &index);
2595 		if (!next || next->level <= node->level)
2596 			break;
2597 	}
2598 
2599 	if (!fs_root)
2600 		return ERR_PTR(-ENOENT);
2601 	return fs_root;
2602 }
2603 
2604 static noinline_for_stack
2605 u64 calcu_metadata_size(struct reloc_control *rc,
2606 			struct backref_node *node, int reserve)
2607 {
2608 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2609 	struct backref_node *next = node;
2610 	struct backref_edge *edge;
2611 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2612 	u64 num_bytes = 0;
2613 	int index = 0;
2614 
2615 	BUG_ON(reserve && node->processed);
2616 
2617 	while (next) {
2618 		cond_resched();
2619 		while (1) {
2620 			if (next->processed && (reserve || next != node))
2621 				break;
2622 
2623 			num_bytes += fs_info->nodesize;
2624 
2625 			if (list_empty(&next->upper))
2626 				break;
2627 
2628 			edge = list_entry(next->upper.next,
2629 					  struct backref_edge, list[LOWER]);
2630 			edges[index++] = edge;
2631 			next = edge->node[UPPER];
2632 		}
2633 		next = walk_down_backref(edges, &index);
2634 	}
2635 	return num_bytes;
2636 }
2637 
2638 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2639 				  struct reloc_control *rc,
2640 				  struct backref_node *node)
2641 {
2642 	struct btrfs_root *root = rc->extent_root;
2643 	struct btrfs_fs_info *fs_info = root->fs_info;
2644 	u64 num_bytes;
2645 	int ret;
2646 	u64 tmp;
2647 
2648 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2649 
2650 	trans->block_rsv = rc->block_rsv;
2651 	rc->reserved_bytes += num_bytes;
2652 
2653 	/*
2654 	 * We are under a transaction here so we can only do limited flushing.
2655 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2656 	 * transaction and try to refill when we can flush all the things.
2657 	 */
2658 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2659 				BTRFS_RESERVE_FLUSH_LIMIT);
2660 	if (ret) {
2661 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2662 		while (tmp <= rc->reserved_bytes)
2663 			tmp <<= 1;
2664 		/*
2665 		 * only one thread can access block_rsv at this point,
2666 		 * so we don't need hold lock to protect block_rsv.
2667 		 * we expand more reservation size here to allow enough
2668 		 * space for relocation and we will return eailer in
2669 		 * enospc case.
2670 		 */
2671 		rc->block_rsv->size = tmp + fs_info->nodesize *
2672 				      RELOCATION_RESERVED_NODES;
2673 		return -EAGAIN;
2674 	}
2675 
2676 	return 0;
2677 }
2678 
2679 /*
2680  * relocate a block tree, and then update pointers in upper level
2681  * blocks that reference the block to point to the new location.
2682  *
2683  * if called by link_to_upper, the block has already been relocated.
2684  * in that case this function just updates pointers.
2685  */
2686 static int do_relocation(struct btrfs_trans_handle *trans,
2687 			 struct reloc_control *rc,
2688 			 struct backref_node *node,
2689 			 struct btrfs_key *key,
2690 			 struct btrfs_path *path, int lowest)
2691 {
2692 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2693 	struct backref_node *upper;
2694 	struct backref_edge *edge;
2695 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2696 	struct btrfs_root *root;
2697 	struct extent_buffer *eb;
2698 	u32 blocksize;
2699 	u64 bytenr;
2700 	u64 generation;
2701 	int slot;
2702 	int ret;
2703 	int err = 0;
2704 
2705 	BUG_ON(lowest && node->eb);
2706 
2707 	path->lowest_level = node->level + 1;
2708 	rc->backref_cache.path[node->level] = node;
2709 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2710 		cond_resched();
2711 
2712 		upper = edge->node[UPPER];
2713 		root = select_reloc_root(trans, rc, upper, edges);
2714 		BUG_ON(!root);
2715 
2716 		if (upper->eb && !upper->locked) {
2717 			if (!lowest) {
2718 				ret = btrfs_bin_search(upper->eb, key,
2719 						       upper->level, &slot);
2720 				BUG_ON(ret);
2721 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2722 				if (node->eb->start == bytenr)
2723 					goto next;
2724 			}
2725 			drop_node_buffer(upper);
2726 		}
2727 
2728 		if (!upper->eb) {
2729 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 			if (ret) {
2731 				if (ret < 0)
2732 					err = ret;
2733 				else
2734 					err = -ENOENT;
2735 
2736 				btrfs_release_path(path);
2737 				break;
2738 			}
2739 
2740 			if (!upper->eb) {
2741 				upper->eb = path->nodes[upper->level];
2742 				path->nodes[upper->level] = NULL;
2743 			} else {
2744 				BUG_ON(upper->eb != path->nodes[upper->level]);
2745 			}
2746 
2747 			upper->locked = 1;
2748 			path->locks[upper->level] = 0;
2749 
2750 			slot = path->slots[upper->level];
2751 			btrfs_release_path(path);
2752 		} else {
2753 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2754 					       &slot);
2755 			BUG_ON(ret);
2756 		}
2757 
2758 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2759 		if (lowest) {
2760 			if (bytenr != node->bytenr) {
2761 				btrfs_err(root->fs_info,
2762 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2763 					  bytenr, node->bytenr, slot,
2764 					  upper->eb->start);
2765 				err = -EIO;
2766 				goto next;
2767 			}
2768 		} else {
2769 			if (node->eb->start == bytenr)
2770 				goto next;
2771 		}
2772 
2773 		blocksize = root->fs_info->nodesize;
2774 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2775 		eb = read_tree_block(fs_info, bytenr, generation);
2776 		if (IS_ERR(eb)) {
2777 			err = PTR_ERR(eb);
2778 			goto next;
2779 		} else if (!extent_buffer_uptodate(eb)) {
2780 			free_extent_buffer(eb);
2781 			err = -EIO;
2782 			goto next;
2783 		}
2784 		btrfs_tree_lock(eb);
2785 		btrfs_set_lock_blocking(eb);
2786 
2787 		if (!node->eb) {
2788 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2789 					      slot, &eb);
2790 			btrfs_tree_unlock(eb);
2791 			free_extent_buffer(eb);
2792 			if (ret < 0) {
2793 				err = ret;
2794 				goto next;
2795 			}
2796 			BUG_ON(node->eb != eb);
2797 		} else {
2798 			btrfs_set_node_blockptr(upper->eb, slot,
2799 						node->eb->start);
2800 			btrfs_set_node_ptr_generation(upper->eb, slot,
2801 						      trans->transid);
2802 			btrfs_mark_buffer_dirty(upper->eb);
2803 
2804 			ret = btrfs_inc_extent_ref(trans, root->fs_info,
2805 						node->eb->start, blocksize,
2806 						upper->eb->start,
2807 						btrfs_header_owner(upper->eb),
2808 						node->level, 0);
2809 			BUG_ON(ret);
2810 
2811 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2812 			BUG_ON(ret);
2813 		}
2814 next:
2815 		if (!upper->pending)
2816 			drop_node_buffer(upper);
2817 		else
2818 			unlock_node_buffer(upper);
2819 		if (err)
2820 			break;
2821 	}
2822 
2823 	if (!err && node->pending) {
2824 		drop_node_buffer(node);
2825 		list_move_tail(&node->list, &rc->backref_cache.changed);
2826 		node->pending = 0;
2827 	}
2828 
2829 	path->lowest_level = 0;
2830 	BUG_ON(err == -ENOSPC);
2831 	return err;
2832 }
2833 
2834 static int link_to_upper(struct btrfs_trans_handle *trans,
2835 			 struct reloc_control *rc,
2836 			 struct backref_node *node,
2837 			 struct btrfs_path *path)
2838 {
2839 	struct btrfs_key key;
2840 
2841 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2842 	return do_relocation(trans, rc, node, &key, path, 0);
2843 }
2844 
2845 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2846 				struct reloc_control *rc,
2847 				struct btrfs_path *path, int err)
2848 {
2849 	LIST_HEAD(list);
2850 	struct backref_cache *cache = &rc->backref_cache;
2851 	struct backref_node *node;
2852 	int level;
2853 	int ret;
2854 
2855 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2856 		while (!list_empty(&cache->pending[level])) {
2857 			node = list_entry(cache->pending[level].next,
2858 					  struct backref_node, list);
2859 			list_move_tail(&node->list, &list);
2860 			BUG_ON(!node->pending);
2861 
2862 			if (!err) {
2863 				ret = link_to_upper(trans, rc, node, path);
2864 				if (ret < 0)
2865 					err = ret;
2866 			}
2867 		}
2868 		list_splice_init(&list, &cache->pending[level]);
2869 	}
2870 	return err;
2871 }
2872 
2873 static void mark_block_processed(struct reloc_control *rc,
2874 				 u64 bytenr, u32 blocksize)
2875 {
2876 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2877 			EXTENT_DIRTY);
2878 }
2879 
2880 static void __mark_block_processed(struct reloc_control *rc,
2881 				   struct backref_node *node)
2882 {
2883 	u32 blocksize;
2884 	if (node->level == 0 ||
2885 	    in_block_group(node->bytenr, rc->block_group)) {
2886 		blocksize = rc->extent_root->fs_info->nodesize;
2887 		mark_block_processed(rc, node->bytenr, blocksize);
2888 	}
2889 	node->processed = 1;
2890 }
2891 
2892 /*
2893  * mark a block and all blocks directly/indirectly reference the block
2894  * as processed.
2895  */
2896 static void update_processed_blocks(struct reloc_control *rc,
2897 				    struct backref_node *node)
2898 {
2899 	struct backref_node *next = node;
2900 	struct backref_edge *edge;
2901 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2902 	int index = 0;
2903 
2904 	while (next) {
2905 		cond_resched();
2906 		while (1) {
2907 			if (next->processed)
2908 				break;
2909 
2910 			__mark_block_processed(rc, next);
2911 
2912 			if (list_empty(&next->upper))
2913 				break;
2914 
2915 			edge = list_entry(next->upper.next,
2916 					  struct backref_edge, list[LOWER]);
2917 			edges[index++] = edge;
2918 			next = edge->node[UPPER];
2919 		}
2920 		next = walk_down_backref(edges, &index);
2921 	}
2922 }
2923 
2924 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2925 {
2926 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2927 
2928 	if (test_range_bit(&rc->processed_blocks, bytenr,
2929 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2930 		return 1;
2931 	return 0;
2932 }
2933 
2934 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2935 			      struct tree_block *block)
2936 {
2937 	struct extent_buffer *eb;
2938 
2939 	BUG_ON(block->key_ready);
2940 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
2941 	if (IS_ERR(eb)) {
2942 		return PTR_ERR(eb);
2943 	} else if (!extent_buffer_uptodate(eb)) {
2944 		free_extent_buffer(eb);
2945 		return -EIO;
2946 	}
2947 	WARN_ON(btrfs_header_level(eb) != block->level);
2948 	if (block->level == 0)
2949 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2950 	else
2951 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2952 	free_extent_buffer(eb);
2953 	block->key_ready = 1;
2954 	return 0;
2955 }
2956 
2957 /*
2958  * helper function to relocate a tree block
2959  */
2960 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2961 				struct reloc_control *rc,
2962 				struct backref_node *node,
2963 				struct btrfs_key *key,
2964 				struct btrfs_path *path)
2965 {
2966 	struct btrfs_root *root;
2967 	int ret = 0;
2968 
2969 	if (!node)
2970 		return 0;
2971 
2972 	BUG_ON(node->processed);
2973 	root = select_one_root(node);
2974 	if (root == ERR_PTR(-ENOENT)) {
2975 		update_processed_blocks(rc, node);
2976 		goto out;
2977 	}
2978 
2979 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2980 		ret = reserve_metadata_space(trans, rc, node);
2981 		if (ret)
2982 			goto out;
2983 	}
2984 
2985 	if (root) {
2986 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2987 			BUG_ON(node->new_bytenr);
2988 			BUG_ON(!list_empty(&node->list));
2989 			btrfs_record_root_in_trans(trans, root);
2990 			root = root->reloc_root;
2991 			node->new_bytenr = root->node->start;
2992 			node->root = root;
2993 			list_add_tail(&node->list, &rc->backref_cache.changed);
2994 		} else {
2995 			path->lowest_level = node->level;
2996 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2997 			btrfs_release_path(path);
2998 			if (ret > 0)
2999 				ret = 0;
3000 		}
3001 		if (!ret)
3002 			update_processed_blocks(rc, node);
3003 	} else {
3004 		ret = do_relocation(trans, rc, node, key, path, 1);
3005 	}
3006 out:
3007 	if (ret || node->level == 0 || node->cowonly)
3008 		remove_backref_node(&rc->backref_cache, node);
3009 	return ret;
3010 }
3011 
3012 /*
3013  * relocate a list of blocks
3014  */
3015 static noinline_for_stack
3016 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3017 			 struct reloc_control *rc, struct rb_root *blocks)
3018 {
3019 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3020 	struct backref_node *node;
3021 	struct btrfs_path *path;
3022 	struct tree_block *block;
3023 	struct rb_node *rb_node;
3024 	int ret;
3025 	int err = 0;
3026 
3027 	path = btrfs_alloc_path();
3028 	if (!path) {
3029 		err = -ENOMEM;
3030 		goto out_free_blocks;
3031 	}
3032 
3033 	rb_node = rb_first(blocks);
3034 	while (rb_node) {
3035 		block = rb_entry(rb_node, struct tree_block, rb_node);
3036 		if (!block->key_ready)
3037 			readahead_tree_block(fs_info, block->bytenr);
3038 		rb_node = rb_next(rb_node);
3039 	}
3040 
3041 	rb_node = rb_first(blocks);
3042 	while (rb_node) {
3043 		block = rb_entry(rb_node, struct tree_block, rb_node);
3044 		if (!block->key_ready) {
3045 			err = get_tree_block_key(fs_info, block);
3046 			if (err)
3047 				goto out_free_path;
3048 		}
3049 		rb_node = rb_next(rb_node);
3050 	}
3051 
3052 	rb_node = rb_first(blocks);
3053 	while (rb_node) {
3054 		block = rb_entry(rb_node, struct tree_block, rb_node);
3055 
3056 		node = build_backref_tree(rc, &block->key,
3057 					  block->level, block->bytenr);
3058 		if (IS_ERR(node)) {
3059 			err = PTR_ERR(node);
3060 			goto out;
3061 		}
3062 
3063 		ret = relocate_tree_block(trans, rc, node, &block->key,
3064 					  path);
3065 		if (ret < 0) {
3066 			if (ret != -EAGAIN || rb_node == rb_first(blocks))
3067 				err = ret;
3068 			goto out;
3069 		}
3070 		rb_node = rb_next(rb_node);
3071 	}
3072 out:
3073 	err = finish_pending_nodes(trans, rc, path, err);
3074 
3075 out_free_path:
3076 	btrfs_free_path(path);
3077 out_free_blocks:
3078 	free_block_list(blocks);
3079 	return err;
3080 }
3081 
3082 static noinline_for_stack
3083 int prealloc_file_extent_cluster(struct inode *inode,
3084 				 struct file_extent_cluster *cluster)
3085 {
3086 	u64 alloc_hint = 0;
3087 	u64 start;
3088 	u64 end;
3089 	u64 offset = BTRFS_I(inode)->index_cnt;
3090 	u64 num_bytes;
3091 	int nr = 0;
3092 	int ret = 0;
3093 	u64 prealloc_start = cluster->start - offset;
3094 	u64 prealloc_end = cluster->end - offset;
3095 	u64 cur_offset;
3096 
3097 	BUG_ON(cluster->start != cluster->boundary[0]);
3098 	inode_lock(inode);
3099 
3100 	ret = btrfs_check_data_free_space(inode, prealloc_start,
3101 					  prealloc_end + 1 - prealloc_start);
3102 	if (ret)
3103 		goto out;
3104 
3105 	cur_offset = prealloc_start;
3106 	while (nr < cluster->nr) {
3107 		start = cluster->boundary[nr] - offset;
3108 		if (nr + 1 < cluster->nr)
3109 			end = cluster->boundary[nr + 1] - 1 - offset;
3110 		else
3111 			end = cluster->end - offset;
3112 
3113 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3114 		num_bytes = end + 1 - start;
3115 		if (cur_offset < start)
3116 			btrfs_free_reserved_data_space(inode, cur_offset,
3117 					start - cur_offset);
3118 		ret = btrfs_prealloc_file_range(inode, 0, start,
3119 						num_bytes, num_bytes,
3120 						end + 1, &alloc_hint);
3121 		cur_offset = end + 1;
3122 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3123 		if (ret)
3124 			break;
3125 		nr++;
3126 	}
3127 	if (cur_offset < prealloc_end)
3128 		btrfs_free_reserved_data_space(inode, cur_offset,
3129 				       prealloc_end + 1 - cur_offset);
3130 out:
3131 	inode_unlock(inode);
3132 	return ret;
3133 }
3134 
3135 static noinline_for_stack
3136 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3137 			 u64 block_start)
3138 {
3139 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3140 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3141 	struct extent_map *em;
3142 	int ret = 0;
3143 
3144 	em = alloc_extent_map();
3145 	if (!em)
3146 		return -ENOMEM;
3147 
3148 	em->start = start;
3149 	em->len = end + 1 - start;
3150 	em->block_len = em->len;
3151 	em->block_start = block_start;
3152 	em->bdev = fs_info->fs_devices->latest_bdev;
3153 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3154 
3155 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3156 	while (1) {
3157 		write_lock(&em_tree->lock);
3158 		ret = add_extent_mapping(em_tree, em, 0);
3159 		write_unlock(&em_tree->lock);
3160 		if (ret != -EEXIST) {
3161 			free_extent_map(em);
3162 			break;
3163 		}
3164 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3165 	}
3166 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3167 	return ret;
3168 }
3169 
3170 static int relocate_file_extent_cluster(struct inode *inode,
3171 					struct file_extent_cluster *cluster)
3172 {
3173 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3174 	u64 page_start;
3175 	u64 page_end;
3176 	u64 offset = BTRFS_I(inode)->index_cnt;
3177 	unsigned long index;
3178 	unsigned long last_index;
3179 	struct page *page;
3180 	struct file_ra_state *ra;
3181 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3182 	int nr = 0;
3183 	int ret = 0;
3184 
3185 	if (!cluster->nr)
3186 		return 0;
3187 
3188 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3189 	if (!ra)
3190 		return -ENOMEM;
3191 
3192 	ret = prealloc_file_extent_cluster(inode, cluster);
3193 	if (ret)
3194 		goto out;
3195 
3196 	file_ra_state_init(ra, inode->i_mapping);
3197 
3198 	ret = setup_extent_mapping(inode, cluster->start - offset,
3199 				   cluster->end - offset, cluster->start);
3200 	if (ret)
3201 		goto out;
3202 
3203 	index = (cluster->start - offset) >> PAGE_SHIFT;
3204 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3205 	while (index <= last_index) {
3206 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3207 				PAGE_SIZE);
3208 		if (ret)
3209 			goto out;
3210 
3211 		page = find_lock_page(inode->i_mapping, index);
3212 		if (!page) {
3213 			page_cache_sync_readahead(inode->i_mapping,
3214 						  ra, NULL, index,
3215 						  last_index + 1 - index);
3216 			page = find_or_create_page(inode->i_mapping, index,
3217 						   mask);
3218 			if (!page) {
3219 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3220 							PAGE_SIZE);
3221 				ret = -ENOMEM;
3222 				goto out;
3223 			}
3224 		}
3225 
3226 		if (PageReadahead(page)) {
3227 			page_cache_async_readahead(inode->i_mapping,
3228 						   ra, NULL, page, index,
3229 						   last_index + 1 - index);
3230 		}
3231 
3232 		if (!PageUptodate(page)) {
3233 			btrfs_readpage(NULL, page);
3234 			lock_page(page);
3235 			if (!PageUptodate(page)) {
3236 				unlock_page(page);
3237 				put_page(page);
3238 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3239 							PAGE_SIZE);
3240 				ret = -EIO;
3241 				goto out;
3242 			}
3243 		}
3244 
3245 		page_start = page_offset(page);
3246 		page_end = page_start + PAGE_SIZE - 1;
3247 
3248 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3249 
3250 		set_page_extent_mapped(page);
3251 
3252 		if (nr < cluster->nr &&
3253 		    page_start + offset == cluster->boundary[nr]) {
3254 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3255 					page_start, page_end,
3256 					EXTENT_BOUNDARY);
3257 			nr++;
3258 		}
3259 
3260 		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
3261 		set_page_dirty(page);
3262 
3263 		unlock_extent(&BTRFS_I(inode)->io_tree,
3264 			      page_start, page_end);
3265 		unlock_page(page);
3266 		put_page(page);
3267 
3268 		index++;
3269 		balance_dirty_pages_ratelimited(inode->i_mapping);
3270 		btrfs_throttle(fs_info);
3271 	}
3272 	WARN_ON(nr != cluster->nr);
3273 out:
3274 	kfree(ra);
3275 	return ret;
3276 }
3277 
3278 static noinline_for_stack
3279 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3280 			 struct file_extent_cluster *cluster)
3281 {
3282 	int ret;
3283 
3284 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3285 		ret = relocate_file_extent_cluster(inode, cluster);
3286 		if (ret)
3287 			return ret;
3288 		cluster->nr = 0;
3289 	}
3290 
3291 	if (!cluster->nr)
3292 		cluster->start = extent_key->objectid;
3293 	else
3294 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3295 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3296 	cluster->boundary[cluster->nr] = extent_key->objectid;
3297 	cluster->nr++;
3298 
3299 	if (cluster->nr >= MAX_EXTENTS) {
3300 		ret = relocate_file_extent_cluster(inode, cluster);
3301 		if (ret)
3302 			return ret;
3303 		cluster->nr = 0;
3304 	}
3305 	return 0;
3306 }
3307 
3308 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3309 static int get_ref_objectid_v0(struct reloc_control *rc,
3310 			       struct btrfs_path *path,
3311 			       struct btrfs_key *extent_key,
3312 			       u64 *ref_objectid, int *path_change)
3313 {
3314 	struct btrfs_key key;
3315 	struct extent_buffer *leaf;
3316 	struct btrfs_extent_ref_v0 *ref0;
3317 	int ret;
3318 	int slot;
3319 
3320 	leaf = path->nodes[0];
3321 	slot = path->slots[0];
3322 	while (1) {
3323 		if (slot >= btrfs_header_nritems(leaf)) {
3324 			ret = btrfs_next_leaf(rc->extent_root, path);
3325 			if (ret < 0)
3326 				return ret;
3327 			BUG_ON(ret > 0);
3328 			leaf = path->nodes[0];
3329 			slot = path->slots[0];
3330 			if (path_change)
3331 				*path_change = 1;
3332 		}
3333 		btrfs_item_key_to_cpu(leaf, &key, slot);
3334 		if (key.objectid != extent_key->objectid)
3335 			return -ENOENT;
3336 
3337 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3338 			slot++;
3339 			continue;
3340 		}
3341 		ref0 = btrfs_item_ptr(leaf, slot,
3342 				struct btrfs_extent_ref_v0);
3343 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3344 		break;
3345 	}
3346 	return 0;
3347 }
3348 #endif
3349 
3350 /*
3351  * helper to add a tree block to the list.
3352  * the major work is getting the generation and level of the block
3353  */
3354 static int add_tree_block(struct reloc_control *rc,
3355 			  struct btrfs_key *extent_key,
3356 			  struct btrfs_path *path,
3357 			  struct rb_root *blocks)
3358 {
3359 	struct extent_buffer *eb;
3360 	struct btrfs_extent_item *ei;
3361 	struct btrfs_tree_block_info *bi;
3362 	struct tree_block *block;
3363 	struct rb_node *rb_node;
3364 	u32 item_size;
3365 	int level = -1;
3366 	u64 generation;
3367 
3368 	eb =  path->nodes[0];
3369 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3370 
3371 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3372 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3373 		ei = btrfs_item_ptr(eb, path->slots[0],
3374 				struct btrfs_extent_item);
3375 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3376 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3377 			level = btrfs_tree_block_level(eb, bi);
3378 		} else {
3379 			level = (int)extent_key->offset;
3380 		}
3381 		generation = btrfs_extent_generation(eb, ei);
3382 	} else {
3383 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3384 		u64 ref_owner;
3385 		int ret;
3386 
3387 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3388 		ret = get_ref_objectid_v0(rc, path, extent_key,
3389 					  &ref_owner, NULL);
3390 		if (ret < 0)
3391 			return ret;
3392 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3393 		level = (int)ref_owner;
3394 		/* FIXME: get real generation */
3395 		generation = 0;
3396 #else
3397 		BUG();
3398 #endif
3399 	}
3400 
3401 	btrfs_release_path(path);
3402 
3403 	BUG_ON(level == -1);
3404 
3405 	block = kmalloc(sizeof(*block), GFP_NOFS);
3406 	if (!block)
3407 		return -ENOMEM;
3408 
3409 	block->bytenr = extent_key->objectid;
3410 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3411 	block->key.offset = generation;
3412 	block->level = level;
3413 	block->key_ready = 0;
3414 
3415 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3416 	if (rb_node)
3417 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3418 
3419 	return 0;
3420 }
3421 
3422 /*
3423  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3424  */
3425 static int __add_tree_block(struct reloc_control *rc,
3426 			    u64 bytenr, u32 blocksize,
3427 			    struct rb_root *blocks)
3428 {
3429 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3430 	struct btrfs_path *path;
3431 	struct btrfs_key key;
3432 	int ret;
3433 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3434 
3435 	if (tree_block_processed(bytenr, rc))
3436 		return 0;
3437 
3438 	if (tree_search(blocks, bytenr))
3439 		return 0;
3440 
3441 	path = btrfs_alloc_path();
3442 	if (!path)
3443 		return -ENOMEM;
3444 again:
3445 	key.objectid = bytenr;
3446 	if (skinny) {
3447 		key.type = BTRFS_METADATA_ITEM_KEY;
3448 		key.offset = (u64)-1;
3449 	} else {
3450 		key.type = BTRFS_EXTENT_ITEM_KEY;
3451 		key.offset = blocksize;
3452 	}
3453 
3454 	path->search_commit_root = 1;
3455 	path->skip_locking = 1;
3456 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3457 	if (ret < 0)
3458 		goto out;
3459 
3460 	if (ret > 0 && skinny) {
3461 		if (path->slots[0]) {
3462 			path->slots[0]--;
3463 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3464 					      path->slots[0]);
3465 			if (key.objectid == bytenr &&
3466 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3467 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3468 			      key.offset == blocksize)))
3469 				ret = 0;
3470 		}
3471 
3472 		if (ret) {
3473 			skinny = false;
3474 			btrfs_release_path(path);
3475 			goto again;
3476 		}
3477 	}
3478 	BUG_ON(ret);
3479 
3480 	ret = add_tree_block(rc, &key, path, blocks);
3481 out:
3482 	btrfs_free_path(path);
3483 	return ret;
3484 }
3485 
3486 /*
3487  * helper to check if the block use full backrefs for pointers in it
3488  */
3489 static int block_use_full_backref(struct reloc_control *rc,
3490 				  struct extent_buffer *eb)
3491 {
3492 	u64 flags;
3493 	int ret;
3494 
3495 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3496 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3497 		return 1;
3498 
3499 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3500 				       eb->start, btrfs_header_level(eb), 1,
3501 				       NULL, &flags);
3502 	BUG_ON(ret);
3503 
3504 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3505 		ret = 1;
3506 	else
3507 		ret = 0;
3508 	return ret;
3509 }
3510 
3511 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3512 				    struct btrfs_block_group_cache *block_group,
3513 				    struct inode *inode,
3514 				    u64 ino)
3515 {
3516 	struct btrfs_key key;
3517 	struct btrfs_root *root = fs_info->tree_root;
3518 	struct btrfs_trans_handle *trans;
3519 	int ret = 0;
3520 
3521 	if (inode)
3522 		goto truncate;
3523 
3524 	key.objectid = ino;
3525 	key.type = BTRFS_INODE_ITEM_KEY;
3526 	key.offset = 0;
3527 
3528 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3529 	if (IS_ERR(inode) || is_bad_inode(inode)) {
3530 		if (!IS_ERR(inode))
3531 			iput(inode);
3532 		return -ENOENT;
3533 	}
3534 
3535 truncate:
3536 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3537 						 &fs_info->global_block_rsv);
3538 	if (ret)
3539 		goto out;
3540 
3541 	trans = btrfs_join_transaction(root);
3542 	if (IS_ERR(trans)) {
3543 		ret = PTR_ERR(trans);
3544 		goto out;
3545 	}
3546 
3547 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3548 
3549 	btrfs_end_transaction(trans);
3550 	btrfs_btree_balance_dirty(fs_info);
3551 out:
3552 	iput(inode);
3553 	return ret;
3554 }
3555 
3556 /*
3557  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3558  * this function scans fs tree to find blocks reference the data extent
3559  */
3560 static int find_data_references(struct reloc_control *rc,
3561 				struct btrfs_key *extent_key,
3562 				struct extent_buffer *leaf,
3563 				struct btrfs_extent_data_ref *ref,
3564 				struct rb_root *blocks)
3565 {
3566 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3567 	struct btrfs_path *path;
3568 	struct tree_block *block;
3569 	struct btrfs_root *root;
3570 	struct btrfs_file_extent_item *fi;
3571 	struct rb_node *rb_node;
3572 	struct btrfs_key key;
3573 	u64 ref_root;
3574 	u64 ref_objectid;
3575 	u64 ref_offset;
3576 	u32 ref_count;
3577 	u32 nritems;
3578 	int err = 0;
3579 	int added = 0;
3580 	int counted;
3581 	int ret;
3582 
3583 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3584 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3585 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3586 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3587 
3588 	/*
3589 	 * This is an extent belonging to the free space cache, lets just delete
3590 	 * it and redo the search.
3591 	 */
3592 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3593 		ret = delete_block_group_cache(fs_info, rc->block_group,
3594 					       NULL, ref_objectid);
3595 		if (ret != -ENOENT)
3596 			return ret;
3597 		ret = 0;
3598 	}
3599 
3600 	path = btrfs_alloc_path();
3601 	if (!path)
3602 		return -ENOMEM;
3603 	path->reada = READA_FORWARD;
3604 
3605 	root = read_fs_root(fs_info, ref_root);
3606 	if (IS_ERR(root)) {
3607 		err = PTR_ERR(root);
3608 		goto out;
3609 	}
3610 
3611 	key.objectid = ref_objectid;
3612 	key.type = BTRFS_EXTENT_DATA_KEY;
3613 	if (ref_offset > ((u64)-1 << 32))
3614 		key.offset = 0;
3615 	else
3616 		key.offset = ref_offset;
3617 
3618 	path->search_commit_root = 1;
3619 	path->skip_locking = 1;
3620 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3621 	if (ret < 0) {
3622 		err = ret;
3623 		goto out;
3624 	}
3625 
3626 	leaf = path->nodes[0];
3627 	nritems = btrfs_header_nritems(leaf);
3628 	/*
3629 	 * the references in tree blocks that use full backrefs
3630 	 * are not counted in
3631 	 */
3632 	if (block_use_full_backref(rc, leaf))
3633 		counted = 0;
3634 	else
3635 		counted = 1;
3636 	rb_node = tree_search(blocks, leaf->start);
3637 	if (rb_node) {
3638 		if (counted)
3639 			added = 1;
3640 		else
3641 			path->slots[0] = nritems;
3642 	}
3643 
3644 	while (ref_count > 0) {
3645 		while (path->slots[0] >= nritems) {
3646 			ret = btrfs_next_leaf(root, path);
3647 			if (ret < 0) {
3648 				err = ret;
3649 				goto out;
3650 			}
3651 			if (WARN_ON(ret > 0))
3652 				goto out;
3653 
3654 			leaf = path->nodes[0];
3655 			nritems = btrfs_header_nritems(leaf);
3656 			added = 0;
3657 
3658 			if (block_use_full_backref(rc, leaf))
3659 				counted = 0;
3660 			else
3661 				counted = 1;
3662 			rb_node = tree_search(blocks, leaf->start);
3663 			if (rb_node) {
3664 				if (counted)
3665 					added = 1;
3666 				else
3667 					path->slots[0] = nritems;
3668 			}
3669 		}
3670 
3671 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3672 		if (WARN_ON(key.objectid != ref_objectid ||
3673 		    key.type != BTRFS_EXTENT_DATA_KEY))
3674 			break;
3675 
3676 		fi = btrfs_item_ptr(leaf, path->slots[0],
3677 				    struct btrfs_file_extent_item);
3678 
3679 		if (btrfs_file_extent_type(leaf, fi) ==
3680 		    BTRFS_FILE_EXTENT_INLINE)
3681 			goto next;
3682 
3683 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3684 		    extent_key->objectid)
3685 			goto next;
3686 
3687 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3688 		if (key.offset != ref_offset)
3689 			goto next;
3690 
3691 		if (counted)
3692 			ref_count--;
3693 		if (added)
3694 			goto next;
3695 
3696 		if (!tree_block_processed(leaf->start, rc)) {
3697 			block = kmalloc(sizeof(*block), GFP_NOFS);
3698 			if (!block) {
3699 				err = -ENOMEM;
3700 				break;
3701 			}
3702 			block->bytenr = leaf->start;
3703 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3704 			block->level = 0;
3705 			block->key_ready = 1;
3706 			rb_node = tree_insert(blocks, block->bytenr,
3707 					      &block->rb_node);
3708 			if (rb_node)
3709 				backref_tree_panic(rb_node, -EEXIST,
3710 						   block->bytenr);
3711 		}
3712 		if (counted)
3713 			added = 1;
3714 		else
3715 			path->slots[0] = nritems;
3716 next:
3717 		path->slots[0]++;
3718 
3719 	}
3720 out:
3721 	btrfs_free_path(path);
3722 	return err;
3723 }
3724 
3725 /*
3726  * helper to find all tree blocks that reference a given data extent
3727  */
3728 static noinline_for_stack
3729 int add_data_references(struct reloc_control *rc,
3730 			struct btrfs_key *extent_key,
3731 			struct btrfs_path *path,
3732 			struct rb_root *blocks)
3733 {
3734 	struct btrfs_key key;
3735 	struct extent_buffer *eb;
3736 	struct btrfs_extent_data_ref *dref;
3737 	struct btrfs_extent_inline_ref *iref;
3738 	unsigned long ptr;
3739 	unsigned long end;
3740 	u32 blocksize = rc->extent_root->fs_info->nodesize;
3741 	int ret = 0;
3742 	int err = 0;
3743 
3744 	eb = path->nodes[0];
3745 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3746 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3747 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3748 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3749 		ptr = end;
3750 	else
3751 #endif
3752 		ptr += sizeof(struct btrfs_extent_item);
3753 
3754 	while (ptr < end) {
3755 		iref = (struct btrfs_extent_inline_ref *)ptr;
3756 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3757 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3758 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3759 			ret = __add_tree_block(rc, key.offset, blocksize,
3760 					       blocks);
3761 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3762 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3763 			ret = find_data_references(rc, extent_key,
3764 						   eb, dref, blocks);
3765 		} else {
3766 			BUG();
3767 		}
3768 		if (ret) {
3769 			err = ret;
3770 			goto out;
3771 		}
3772 		ptr += btrfs_extent_inline_ref_size(key.type);
3773 	}
3774 	WARN_ON(ptr > end);
3775 
3776 	while (1) {
3777 		cond_resched();
3778 		eb = path->nodes[0];
3779 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3780 			ret = btrfs_next_leaf(rc->extent_root, path);
3781 			if (ret < 0) {
3782 				err = ret;
3783 				break;
3784 			}
3785 			if (ret > 0)
3786 				break;
3787 			eb = path->nodes[0];
3788 		}
3789 
3790 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3791 		if (key.objectid != extent_key->objectid)
3792 			break;
3793 
3794 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3795 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3796 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3797 #else
3798 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3799 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3800 #endif
3801 			ret = __add_tree_block(rc, key.offset, blocksize,
3802 					       blocks);
3803 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3804 			dref = btrfs_item_ptr(eb, path->slots[0],
3805 					      struct btrfs_extent_data_ref);
3806 			ret = find_data_references(rc, extent_key,
3807 						   eb, dref, blocks);
3808 		} else {
3809 			ret = 0;
3810 		}
3811 		if (ret) {
3812 			err = ret;
3813 			break;
3814 		}
3815 		path->slots[0]++;
3816 	}
3817 out:
3818 	btrfs_release_path(path);
3819 	if (err)
3820 		free_block_list(blocks);
3821 	return err;
3822 }
3823 
3824 /*
3825  * helper to find next unprocessed extent
3826  */
3827 static noinline_for_stack
3828 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3829 		     struct btrfs_key *extent_key)
3830 {
3831 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3832 	struct btrfs_key key;
3833 	struct extent_buffer *leaf;
3834 	u64 start, end, last;
3835 	int ret;
3836 
3837 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3838 	while (1) {
3839 		cond_resched();
3840 		if (rc->search_start >= last) {
3841 			ret = 1;
3842 			break;
3843 		}
3844 
3845 		key.objectid = rc->search_start;
3846 		key.type = BTRFS_EXTENT_ITEM_KEY;
3847 		key.offset = 0;
3848 
3849 		path->search_commit_root = 1;
3850 		path->skip_locking = 1;
3851 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3852 					0, 0);
3853 		if (ret < 0)
3854 			break;
3855 next:
3856 		leaf = path->nodes[0];
3857 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3858 			ret = btrfs_next_leaf(rc->extent_root, path);
3859 			if (ret != 0)
3860 				break;
3861 			leaf = path->nodes[0];
3862 		}
3863 
3864 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3865 		if (key.objectid >= last) {
3866 			ret = 1;
3867 			break;
3868 		}
3869 
3870 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3871 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3872 			path->slots[0]++;
3873 			goto next;
3874 		}
3875 
3876 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3877 		    key.objectid + key.offset <= rc->search_start) {
3878 			path->slots[0]++;
3879 			goto next;
3880 		}
3881 
3882 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3883 		    key.objectid + fs_info->nodesize <=
3884 		    rc->search_start) {
3885 			path->slots[0]++;
3886 			goto next;
3887 		}
3888 
3889 		ret = find_first_extent_bit(&rc->processed_blocks,
3890 					    key.objectid, &start, &end,
3891 					    EXTENT_DIRTY, NULL);
3892 
3893 		if (ret == 0 && start <= key.objectid) {
3894 			btrfs_release_path(path);
3895 			rc->search_start = end + 1;
3896 		} else {
3897 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3898 				rc->search_start = key.objectid + key.offset;
3899 			else
3900 				rc->search_start = key.objectid +
3901 					fs_info->nodesize;
3902 			memcpy(extent_key, &key, sizeof(key));
3903 			return 0;
3904 		}
3905 	}
3906 	btrfs_release_path(path);
3907 	return ret;
3908 }
3909 
3910 static void set_reloc_control(struct reloc_control *rc)
3911 {
3912 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3913 
3914 	mutex_lock(&fs_info->reloc_mutex);
3915 	fs_info->reloc_ctl = rc;
3916 	mutex_unlock(&fs_info->reloc_mutex);
3917 }
3918 
3919 static void unset_reloc_control(struct reloc_control *rc)
3920 {
3921 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3922 
3923 	mutex_lock(&fs_info->reloc_mutex);
3924 	fs_info->reloc_ctl = NULL;
3925 	mutex_unlock(&fs_info->reloc_mutex);
3926 }
3927 
3928 static int check_extent_flags(u64 flags)
3929 {
3930 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3931 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3932 		return 1;
3933 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3934 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3935 		return 1;
3936 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3937 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3938 		return 1;
3939 	return 0;
3940 }
3941 
3942 static noinline_for_stack
3943 int prepare_to_relocate(struct reloc_control *rc)
3944 {
3945 	struct btrfs_trans_handle *trans;
3946 	int ret;
3947 
3948 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3949 					      BTRFS_BLOCK_RSV_TEMP);
3950 	if (!rc->block_rsv)
3951 		return -ENOMEM;
3952 
3953 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3954 	rc->search_start = rc->block_group->key.objectid;
3955 	rc->extents_found = 0;
3956 	rc->nodes_relocated = 0;
3957 	rc->merging_rsv_size = 0;
3958 	rc->reserved_bytes = 0;
3959 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3960 			      RELOCATION_RESERVED_NODES;
3961 	ret = btrfs_block_rsv_refill(rc->extent_root,
3962 				     rc->block_rsv, rc->block_rsv->size,
3963 				     BTRFS_RESERVE_FLUSH_ALL);
3964 	if (ret)
3965 		return ret;
3966 
3967 	rc->create_reloc_tree = 1;
3968 	set_reloc_control(rc);
3969 
3970 	trans = btrfs_join_transaction(rc->extent_root);
3971 	if (IS_ERR(trans)) {
3972 		unset_reloc_control(rc);
3973 		/*
3974 		 * extent tree is not a ref_cow tree and has no reloc_root to
3975 		 * cleanup.  And callers are responsible to free the above
3976 		 * block rsv.
3977 		 */
3978 		return PTR_ERR(trans);
3979 	}
3980 	btrfs_commit_transaction(trans);
3981 	return 0;
3982 }
3983 
3984 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3985 {
3986 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3987 	struct rb_root blocks = RB_ROOT;
3988 	struct btrfs_key key;
3989 	struct btrfs_trans_handle *trans = NULL;
3990 	struct btrfs_path *path;
3991 	struct btrfs_extent_item *ei;
3992 	u64 flags;
3993 	u32 item_size;
3994 	int ret;
3995 	int err = 0;
3996 	int progress = 0;
3997 
3998 	path = btrfs_alloc_path();
3999 	if (!path)
4000 		return -ENOMEM;
4001 	path->reada = READA_FORWARD;
4002 
4003 	ret = prepare_to_relocate(rc);
4004 	if (ret) {
4005 		err = ret;
4006 		goto out_free;
4007 	}
4008 
4009 	while (1) {
4010 		rc->reserved_bytes = 0;
4011 		ret = btrfs_block_rsv_refill(rc->extent_root,
4012 					rc->block_rsv, rc->block_rsv->size,
4013 					BTRFS_RESERVE_FLUSH_ALL);
4014 		if (ret) {
4015 			err = ret;
4016 			break;
4017 		}
4018 		progress++;
4019 		trans = btrfs_start_transaction(rc->extent_root, 0);
4020 		if (IS_ERR(trans)) {
4021 			err = PTR_ERR(trans);
4022 			trans = NULL;
4023 			break;
4024 		}
4025 restart:
4026 		if (update_backref_cache(trans, &rc->backref_cache)) {
4027 			btrfs_end_transaction(trans);
4028 			continue;
4029 		}
4030 
4031 		ret = find_next_extent(rc, path, &key);
4032 		if (ret < 0)
4033 			err = ret;
4034 		if (ret != 0)
4035 			break;
4036 
4037 		rc->extents_found++;
4038 
4039 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4040 				    struct btrfs_extent_item);
4041 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4042 		if (item_size >= sizeof(*ei)) {
4043 			flags = btrfs_extent_flags(path->nodes[0], ei);
4044 			ret = check_extent_flags(flags);
4045 			BUG_ON(ret);
4046 
4047 		} else {
4048 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4049 			u64 ref_owner;
4050 			int path_change = 0;
4051 
4052 			BUG_ON(item_size !=
4053 			       sizeof(struct btrfs_extent_item_v0));
4054 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
4055 						  &path_change);
4056 			if (ret < 0) {
4057 				err = ret;
4058 				break;
4059 			}
4060 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
4061 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
4062 			else
4063 				flags = BTRFS_EXTENT_FLAG_DATA;
4064 
4065 			if (path_change) {
4066 				btrfs_release_path(path);
4067 
4068 				path->search_commit_root = 1;
4069 				path->skip_locking = 1;
4070 				ret = btrfs_search_slot(NULL, rc->extent_root,
4071 							&key, path, 0, 0);
4072 				if (ret < 0) {
4073 					err = ret;
4074 					break;
4075 				}
4076 				BUG_ON(ret > 0);
4077 			}
4078 #else
4079 			BUG();
4080 #endif
4081 		}
4082 
4083 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4084 			ret = add_tree_block(rc, &key, path, &blocks);
4085 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4086 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4087 			ret = add_data_references(rc, &key, path, &blocks);
4088 		} else {
4089 			btrfs_release_path(path);
4090 			ret = 0;
4091 		}
4092 		if (ret < 0) {
4093 			err = ret;
4094 			break;
4095 		}
4096 
4097 		if (!RB_EMPTY_ROOT(&blocks)) {
4098 			ret = relocate_tree_blocks(trans, rc, &blocks);
4099 			if (ret < 0) {
4100 				/*
4101 				 * if we fail to relocate tree blocks, force to update
4102 				 * backref cache when committing transaction.
4103 				 */
4104 				rc->backref_cache.last_trans = trans->transid - 1;
4105 
4106 				if (ret != -EAGAIN) {
4107 					err = ret;
4108 					break;
4109 				}
4110 				rc->extents_found--;
4111 				rc->search_start = key.objectid;
4112 			}
4113 		}
4114 
4115 		btrfs_end_transaction_throttle(trans);
4116 		btrfs_btree_balance_dirty(fs_info);
4117 		trans = NULL;
4118 
4119 		if (rc->stage == MOVE_DATA_EXTENTS &&
4120 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4121 			rc->found_file_extent = 1;
4122 			ret = relocate_data_extent(rc->data_inode,
4123 						   &key, &rc->cluster);
4124 			if (ret < 0) {
4125 				err = ret;
4126 				break;
4127 			}
4128 		}
4129 	}
4130 	if (trans && progress && err == -ENOSPC) {
4131 		ret = btrfs_force_chunk_alloc(trans, fs_info,
4132 					      rc->block_group->flags);
4133 		if (ret == 1) {
4134 			err = 0;
4135 			progress = 0;
4136 			goto restart;
4137 		}
4138 	}
4139 
4140 	btrfs_release_path(path);
4141 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4142 
4143 	if (trans) {
4144 		btrfs_end_transaction_throttle(trans);
4145 		btrfs_btree_balance_dirty(fs_info);
4146 	}
4147 
4148 	if (!err) {
4149 		ret = relocate_file_extent_cluster(rc->data_inode,
4150 						   &rc->cluster);
4151 		if (ret < 0)
4152 			err = ret;
4153 	}
4154 
4155 	rc->create_reloc_tree = 0;
4156 	set_reloc_control(rc);
4157 
4158 	backref_cache_cleanup(&rc->backref_cache);
4159 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4160 
4161 	err = prepare_to_merge(rc, err);
4162 
4163 	merge_reloc_roots(rc);
4164 
4165 	rc->merge_reloc_tree = 0;
4166 	unset_reloc_control(rc);
4167 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4168 
4169 	/* get rid of pinned extents */
4170 	trans = btrfs_join_transaction(rc->extent_root);
4171 	if (IS_ERR(trans)) {
4172 		err = PTR_ERR(trans);
4173 		goto out_free;
4174 	}
4175 	btrfs_commit_transaction(trans);
4176 out_free:
4177 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
4178 	btrfs_free_path(path);
4179 	return err;
4180 }
4181 
4182 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4183 				 struct btrfs_root *root, u64 objectid)
4184 {
4185 	struct btrfs_path *path;
4186 	struct btrfs_inode_item *item;
4187 	struct extent_buffer *leaf;
4188 	int ret;
4189 
4190 	path = btrfs_alloc_path();
4191 	if (!path)
4192 		return -ENOMEM;
4193 
4194 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4195 	if (ret)
4196 		goto out;
4197 
4198 	leaf = path->nodes[0];
4199 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4200 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4201 	btrfs_set_inode_generation(leaf, item, 1);
4202 	btrfs_set_inode_size(leaf, item, 0);
4203 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4204 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4205 					  BTRFS_INODE_PREALLOC);
4206 	btrfs_mark_buffer_dirty(leaf);
4207 out:
4208 	btrfs_free_path(path);
4209 	return ret;
4210 }
4211 
4212 /*
4213  * helper to create inode for data relocation.
4214  * the inode is in data relocation tree and its link count is 0
4215  */
4216 static noinline_for_stack
4217 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4218 				 struct btrfs_block_group_cache *group)
4219 {
4220 	struct inode *inode = NULL;
4221 	struct btrfs_trans_handle *trans;
4222 	struct btrfs_root *root;
4223 	struct btrfs_key key;
4224 	u64 objectid;
4225 	int err = 0;
4226 
4227 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4228 	if (IS_ERR(root))
4229 		return ERR_CAST(root);
4230 
4231 	trans = btrfs_start_transaction(root, 6);
4232 	if (IS_ERR(trans))
4233 		return ERR_CAST(trans);
4234 
4235 	err = btrfs_find_free_objectid(root, &objectid);
4236 	if (err)
4237 		goto out;
4238 
4239 	err = __insert_orphan_inode(trans, root, objectid);
4240 	BUG_ON(err);
4241 
4242 	key.objectid = objectid;
4243 	key.type = BTRFS_INODE_ITEM_KEY;
4244 	key.offset = 0;
4245 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4246 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4247 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4248 
4249 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4250 out:
4251 	btrfs_end_transaction(trans);
4252 	btrfs_btree_balance_dirty(fs_info);
4253 	if (err) {
4254 		if (inode)
4255 			iput(inode);
4256 		inode = ERR_PTR(err);
4257 	}
4258 	return inode;
4259 }
4260 
4261 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4262 {
4263 	struct reloc_control *rc;
4264 
4265 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4266 	if (!rc)
4267 		return NULL;
4268 
4269 	INIT_LIST_HEAD(&rc->reloc_roots);
4270 	backref_cache_init(&rc->backref_cache);
4271 	mapping_tree_init(&rc->reloc_root_tree);
4272 	extent_io_tree_init(&rc->processed_blocks,
4273 			    fs_info->btree_inode->i_mapping);
4274 	return rc;
4275 }
4276 
4277 /*
4278  * Print the block group being relocated
4279  */
4280 static void describe_relocation(struct btrfs_fs_info *fs_info,
4281 				struct btrfs_block_group_cache *block_group)
4282 {
4283 	char buf[128];		/* prefixed by a '|' that'll be dropped */
4284 	u64 flags = block_group->flags;
4285 
4286 	/* Shouldn't happen */
4287 	if (!flags) {
4288 		strcpy(buf, "|NONE");
4289 	} else {
4290 		char *bp = buf;
4291 
4292 #define DESCRIBE_FLAG(f, d) \
4293 		if (flags & BTRFS_BLOCK_GROUP_##f) { \
4294 			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
4295 			flags &= ~BTRFS_BLOCK_GROUP_##f; \
4296 		}
4297 		DESCRIBE_FLAG(DATA,     "data");
4298 		DESCRIBE_FLAG(SYSTEM,   "system");
4299 		DESCRIBE_FLAG(METADATA, "metadata");
4300 		DESCRIBE_FLAG(RAID0,    "raid0");
4301 		DESCRIBE_FLAG(RAID1,    "raid1");
4302 		DESCRIBE_FLAG(DUP,      "dup");
4303 		DESCRIBE_FLAG(RAID10,   "raid10");
4304 		DESCRIBE_FLAG(RAID5,    "raid5");
4305 		DESCRIBE_FLAG(RAID6,    "raid6");
4306 		if (flags)
4307 			snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags);
4308 #undef DESCRIBE_FLAG
4309 	}
4310 
4311 	btrfs_info(fs_info,
4312 		   "relocating block group %llu flags %s",
4313 		   block_group->key.objectid, buf + 1);
4314 }
4315 
4316 /*
4317  * function to relocate all extents in a block group.
4318  */
4319 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4320 {
4321 	struct btrfs_root *extent_root = fs_info->extent_root;
4322 	struct reloc_control *rc;
4323 	struct inode *inode;
4324 	struct btrfs_path *path;
4325 	int ret;
4326 	int rw = 0;
4327 	int err = 0;
4328 
4329 	rc = alloc_reloc_control(fs_info);
4330 	if (!rc)
4331 		return -ENOMEM;
4332 
4333 	rc->extent_root = extent_root;
4334 
4335 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4336 	BUG_ON(!rc->block_group);
4337 
4338 	ret = btrfs_inc_block_group_ro(fs_info, rc->block_group);
4339 	if (ret) {
4340 		err = ret;
4341 		goto out;
4342 	}
4343 	rw = 1;
4344 
4345 	path = btrfs_alloc_path();
4346 	if (!path) {
4347 		err = -ENOMEM;
4348 		goto out;
4349 	}
4350 
4351 	inode = lookup_free_space_inode(fs_info, rc->block_group, path);
4352 	btrfs_free_path(path);
4353 
4354 	if (!IS_ERR(inode))
4355 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4356 	else
4357 		ret = PTR_ERR(inode);
4358 
4359 	if (ret && ret != -ENOENT) {
4360 		err = ret;
4361 		goto out;
4362 	}
4363 
4364 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4365 	if (IS_ERR(rc->data_inode)) {
4366 		err = PTR_ERR(rc->data_inode);
4367 		rc->data_inode = NULL;
4368 		goto out;
4369 	}
4370 
4371 	describe_relocation(fs_info, rc->block_group);
4372 
4373 	btrfs_wait_block_group_reservations(rc->block_group);
4374 	btrfs_wait_nocow_writers(rc->block_group);
4375 	btrfs_wait_ordered_roots(fs_info, -1,
4376 				 rc->block_group->key.objectid,
4377 				 rc->block_group->key.offset);
4378 
4379 	while (1) {
4380 		mutex_lock(&fs_info->cleaner_mutex);
4381 		ret = relocate_block_group(rc);
4382 		mutex_unlock(&fs_info->cleaner_mutex);
4383 		if (ret < 0) {
4384 			err = ret;
4385 			goto out;
4386 		}
4387 
4388 		if (rc->extents_found == 0)
4389 			break;
4390 
4391 		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4392 
4393 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4394 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4395 						       (u64)-1);
4396 			if (ret) {
4397 				err = ret;
4398 				goto out;
4399 			}
4400 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4401 						 0, -1);
4402 			rc->stage = UPDATE_DATA_PTRS;
4403 		}
4404 	}
4405 
4406 	WARN_ON(rc->block_group->pinned > 0);
4407 	WARN_ON(rc->block_group->reserved > 0);
4408 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4409 out:
4410 	if (err && rw)
4411 		btrfs_dec_block_group_ro(rc->block_group);
4412 	iput(rc->data_inode);
4413 	btrfs_put_block_group(rc->block_group);
4414 	kfree(rc);
4415 	return err;
4416 }
4417 
4418 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4419 {
4420 	struct btrfs_fs_info *fs_info = root->fs_info;
4421 	struct btrfs_trans_handle *trans;
4422 	int ret, err;
4423 
4424 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4425 	if (IS_ERR(trans))
4426 		return PTR_ERR(trans);
4427 
4428 	memset(&root->root_item.drop_progress, 0,
4429 		sizeof(root->root_item.drop_progress));
4430 	root->root_item.drop_level = 0;
4431 	btrfs_set_root_refs(&root->root_item, 0);
4432 	ret = btrfs_update_root(trans, fs_info->tree_root,
4433 				&root->root_key, &root->root_item);
4434 
4435 	err = btrfs_end_transaction(trans);
4436 	if (err)
4437 		return err;
4438 	return ret;
4439 }
4440 
4441 /*
4442  * recover relocation interrupted by system crash.
4443  *
4444  * this function resumes merging reloc trees with corresponding fs trees.
4445  * this is important for keeping the sharing of tree blocks
4446  */
4447 int btrfs_recover_relocation(struct btrfs_root *root)
4448 {
4449 	struct btrfs_fs_info *fs_info = root->fs_info;
4450 	LIST_HEAD(reloc_roots);
4451 	struct btrfs_key key;
4452 	struct btrfs_root *fs_root;
4453 	struct btrfs_root *reloc_root;
4454 	struct btrfs_path *path;
4455 	struct extent_buffer *leaf;
4456 	struct reloc_control *rc = NULL;
4457 	struct btrfs_trans_handle *trans;
4458 	int ret;
4459 	int err = 0;
4460 
4461 	path = btrfs_alloc_path();
4462 	if (!path)
4463 		return -ENOMEM;
4464 	path->reada = READA_BACK;
4465 
4466 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4467 	key.type = BTRFS_ROOT_ITEM_KEY;
4468 	key.offset = (u64)-1;
4469 
4470 	while (1) {
4471 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4472 					path, 0, 0);
4473 		if (ret < 0) {
4474 			err = ret;
4475 			goto out;
4476 		}
4477 		if (ret > 0) {
4478 			if (path->slots[0] == 0)
4479 				break;
4480 			path->slots[0]--;
4481 		}
4482 		leaf = path->nodes[0];
4483 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4484 		btrfs_release_path(path);
4485 
4486 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4487 		    key.type != BTRFS_ROOT_ITEM_KEY)
4488 			break;
4489 
4490 		reloc_root = btrfs_read_fs_root(root, &key);
4491 		if (IS_ERR(reloc_root)) {
4492 			err = PTR_ERR(reloc_root);
4493 			goto out;
4494 		}
4495 
4496 		list_add(&reloc_root->root_list, &reloc_roots);
4497 
4498 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4499 			fs_root = read_fs_root(fs_info,
4500 					       reloc_root->root_key.offset);
4501 			if (IS_ERR(fs_root)) {
4502 				ret = PTR_ERR(fs_root);
4503 				if (ret != -ENOENT) {
4504 					err = ret;
4505 					goto out;
4506 				}
4507 				ret = mark_garbage_root(reloc_root);
4508 				if (ret < 0) {
4509 					err = ret;
4510 					goto out;
4511 				}
4512 			}
4513 		}
4514 
4515 		if (key.offset == 0)
4516 			break;
4517 
4518 		key.offset--;
4519 	}
4520 	btrfs_release_path(path);
4521 
4522 	if (list_empty(&reloc_roots))
4523 		goto out;
4524 
4525 	rc = alloc_reloc_control(fs_info);
4526 	if (!rc) {
4527 		err = -ENOMEM;
4528 		goto out;
4529 	}
4530 
4531 	rc->extent_root = fs_info->extent_root;
4532 
4533 	set_reloc_control(rc);
4534 
4535 	trans = btrfs_join_transaction(rc->extent_root);
4536 	if (IS_ERR(trans)) {
4537 		unset_reloc_control(rc);
4538 		err = PTR_ERR(trans);
4539 		goto out_free;
4540 	}
4541 
4542 	rc->merge_reloc_tree = 1;
4543 
4544 	while (!list_empty(&reloc_roots)) {
4545 		reloc_root = list_entry(reloc_roots.next,
4546 					struct btrfs_root, root_list);
4547 		list_del(&reloc_root->root_list);
4548 
4549 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4550 			list_add_tail(&reloc_root->root_list,
4551 				      &rc->reloc_roots);
4552 			continue;
4553 		}
4554 
4555 		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4556 		if (IS_ERR(fs_root)) {
4557 			err = PTR_ERR(fs_root);
4558 			goto out_free;
4559 		}
4560 
4561 		err = __add_reloc_root(reloc_root);
4562 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4563 		fs_root->reloc_root = reloc_root;
4564 	}
4565 
4566 	err = btrfs_commit_transaction(trans);
4567 	if (err)
4568 		goto out_free;
4569 
4570 	merge_reloc_roots(rc);
4571 
4572 	unset_reloc_control(rc);
4573 
4574 	trans = btrfs_join_transaction(rc->extent_root);
4575 	if (IS_ERR(trans)) {
4576 		err = PTR_ERR(trans);
4577 		goto out_free;
4578 	}
4579 	err = btrfs_commit_transaction(trans);
4580 out_free:
4581 	kfree(rc);
4582 out:
4583 	if (!list_empty(&reloc_roots))
4584 		free_reloc_roots(&reloc_roots);
4585 
4586 	btrfs_free_path(path);
4587 
4588 	if (err == 0) {
4589 		/* cleanup orphan inode in data relocation tree */
4590 		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4591 		if (IS_ERR(fs_root))
4592 			err = PTR_ERR(fs_root);
4593 		else
4594 			err = btrfs_orphan_cleanup(fs_root);
4595 	}
4596 	return err;
4597 }
4598 
4599 /*
4600  * helper to add ordered checksum for data relocation.
4601  *
4602  * cloning checksum properly handles the nodatasum extents.
4603  * it also saves CPU time to re-calculate the checksum.
4604  */
4605 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4606 {
4607 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4608 	struct btrfs_ordered_sum *sums;
4609 	struct btrfs_ordered_extent *ordered;
4610 	int ret;
4611 	u64 disk_bytenr;
4612 	u64 new_bytenr;
4613 	LIST_HEAD(list);
4614 
4615 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4616 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4617 
4618 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4619 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4620 				       disk_bytenr + len - 1, &list, 0);
4621 	if (ret)
4622 		goto out;
4623 
4624 	while (!list_empty(&list)) {
4625 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4626 		list_del_init(&sums->list);
4627 
4628 		/*
4629 		 * We need to offset the new_bytenr based on where the csum is.
4630 		 * We need to do this because we will read in entire prealloc
4631 		 * extents but we may have written to say the middle of the
4632 		 * prealloc extent, so we need to make sure the csum goes with
4633 		 * the right disk offset.
4634 		 *
4635 		 * We can do this because the data reloc inode refers strictly
4636 		 * to the on disk bytes, so we don't have to worry about
4637 		 * disk_len vs real len like with real inodes since it's all
4638 		 * disk length.
4639 		 */
4640 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4641 		sums->bytenr = new_bytenr;
4642 
4643 		btrfs_add_ordered_sum(inode, ordered, sums);
4644 	}
4645 out:
4646 	btrfs_put_ordered_extent(ordered);
4647 	return ret;
4648 }
4649 
4650 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4651 			  struct btrfs_root *root, struct extent_buffer *buf,
4652 			  struct extent_buffer *cow)
4653 {
4654 	struct btrfs_fs_info *fs_info = root->fs_info;
4655 	struct reloc_control *rc;
4656 	struct backref_node *node;
4657 	int first_cow = 0;
4658 	int level;
4659 	int ret = 0;
4660 
4661 	rc = fs_info->reloc_ctl;
4662 	if (!rc)
4663 		return 0;
4664 
4665 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4666 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4667 
4668 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4669 		if (buf == root->node)
4670 			__update_reloc_root(root, cow->start);
4671 	}
4672 
4673 	level = btrfs_header_level(buf);
4674 	if (btrfs_header_generation(buf) <=
4675 	    btrfs_root_last_snapshot(&root->root_item))
4676 		first_cow = 1;
4677 
4678 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4679 	    rc->create_reloc_tree) {
4680 		WARN_ON(!first_cow && level == 0);
4681 
4682 		node = rc->backref_cache.path[level];
4683 		BUG_ON(node->bytenr != buf->start &&
4684 		       node->new_bytenr != buf->start);
4685 
4686 		drop_node_buffer(node);
4687 		extent_buffer_get(cow);
4688 		node->eb = cow;
4689 		node->new_bytenr = cow->start;
4690 
4691 		if (!node->pending) {
4692 			list_move_tail(&node->list,
4693 				       &rc->backref_cache.pending[level]);
4694 			node->pending = 1;
4695 		}
4696 
4697 		if (first_cow)
4698 			__mark_block_processed(rc, node);
4699 
4700 		if (first_cow && level > 0)
4701 			rc->nodes_relocated += buf->len;
4702 	}
4703 
4704 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4705 		ret = replace_file_extents(trans, rc, root, cow);
4706 	return ret;
4707 }
4708 
4709 /*
4710  * called before creating snapshot. it calculates metadata reservation
4711  * required for relocating tree blocks in the snapshot
4712  */
4713 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4714 			      u64 *bytes_to_reserve)
4715 {
4716 	struct btrfs_root *root;
4717 	struct reloc_control *rc;
4718 
4719 	root = pending->root;
4720 	if (!root->reloc_root)
4721 		return;
4722 
4723 	rc = root->fs_info->reloc_ctl;
4724 	if (!rc->merge_reloc_tree)
4725 		return;
4726 
4727 	root = root->reloc_root;
4728 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4729 	/*
4730 	 * relocation is in the stage of merging trees. the space
4731 	 * used by merging a reloc tree is twice the size of
4732 	 * relocated tree nodes in the worst case. half for cowing
4733 	 * the reloc tree, half for cowing the fs tree. the space
4734 	 * used by cowing the reloc tree will be freed after the
4735 	 * tree is dropped. if we create snapshot, cowing the fs
4736 	 * tree may use more space than it frees. so we need
4737 	 * reserve extra space.
4738 	 */
4739 	*bytes_to_reserve += rc->nodes_relocated;
4740 }
4741 
4742 /*
4743  * called after snapshot is created. migrate block reservation
4744  * and create reloc root for the newly created snapshot
4745  */
4746 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4747 			       struct btrfs_pending_snapshot *pending)
4748 {
4749 	struct btrfs_root *root = pending->root;
4750 	struct btrfs_root *reloc_root;
4751 	struct btrfs_root *new_root;
4752 	struct reloc_control *rc;
4753 	int ret;
4754 
4755 	if (!root->reloc_root)
4756 		return 0;
4757 
4758 	rc = root->fs_info->reloc_ctl;
4759 	rc->merging_rsv_size += rc->nodes_relocated;
4760 
4761 	if (rc->merge_reloc_tree) {
4762 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4763 					      rc->block_rsv,
4764 					      rc->nodes_relocated, 1);
4765 		if (ret)
4766 			return ret;
4767 	}
4768 
4769 	new_root = pending->snap;
4770 	reloc_root = create_reloc_root(trans, root->reloc_root,
4771 				       new_root->root_key.objectid);
4772 	if (IS_ERR(reloc_root))
4773 		return PTR_ERR(reloc_root);
4774 
4775 	ret = __add_reloc_root(reloc_root);
4776 	BUG_ON(ret < 0);
4777 	new_root->reloc_root = reloc_root;
4778 
4779 	if (rc->create_reloc_tree)
4780 		ret = clone_backref_node(trans, rc, root, reloc_root);
4781 	return ret;
4782 }
4783