xref: /openbmc/linux/fs/btrfs/relocation.c (revision c51d39010a1bccc9c1294e2d7c00005aefeb2b5c)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34 #include "qgroup.h"
35 
36 /*
37  * backref_node, mapping_node and tree_block start with this
38  */
39 struct tree_entry {
40 	struct rb_node rb_node;
41 	u64 bytenr;
42 };
43 
44 /*
45  * present a tree block in the backref cache
46  */
47 struct backref_node {
48 	struct rb_node rb_node;
49 	u64 bytenr;
50 
51 	u64 new_bytenr;
52 	/* objectid of tree block owner, can be not uptodate */
53 	u64 owner;
54 	/* link to pending, changed or detached list */
55 	struct list_head list;
56 	/* list of upper level blocks reference this block */
57 	struct list_head upper;
58 	/* list of child blocks in the cache */
59 	struct list_head lower;
60 	/* NULL if this node is not tree root */
61 	struct btrfs_root *root;
62 	/* extent buffer got by COW the block */
63 	struct extent_buffer *eb;
64 	/* level of tree block */
65 	unsigned int level:8;
66 	/* is the block in non-reference counted tree */
67 	unsigned int cowonly:1;
68 	/* 1 if no child node in the cache */
69 	unsigned int lowest:1;
70 	/* is the extent buffer locked */
71 	unsigned int locked:1;
72 	/* has the block been processed */
73 	unsigned int processed:1;
74 	/* have backrefs of this block been checked */
75 	unsigned int checked:1;
76 	/*
77 	 * 1 if corresponding block has been cowed but some upper
78 	 * level block pointers may not point to the new location
79 	 */
80 	unsigned int pending:1;
81 	/*
82 	 * 1 if the backref node isn't connected to any other
83 	 * backref node.
84 	 */
85 	unsigned int detached:1;
86 };
87 
88 /*
89  * present a block pointer in the backref cache
90  */
91 struct backref_edge {
92 	struct list_head list[2];
93 	struct backref_node *node[2];
94 };
95 
96 #define LOWER	0
97 #define UPPER	1
98 #define RELOCATION_RESERVED_NODES	256
99 
100 struct backref_cache {
101 	/* red black tree of all backref nodes in the cache */
102 	struct rb_root rb_root;
103 	/* for passing backref nodes to btrfs_reloc_cow_block */
104 	struct backref_node *path[BTRFS_MAX_LEVEL];
105 	/*
106 	 * list of blocks that have been cowed but some block
107 	 * pointers in upper level blocks may not reflect the
108 	 * new location
109 	 */
110 	struct list_head pending[BTRFS_MAX_LEVEL];
111 	/* list of backref nodes with no child node */
112 	struct list_head leaves;
113 	/* list of blocks that have been cowed in current transaction */
114 	struct list_head changed;
115 	/* list of detached backref node. */
116 	struct list_head detached;
117 
118 	u64 last_trans;
119 
120 	int nr_nodes;
121 	int nr_edges;
122 };
123 
124 /*
125  * map address of tree root to tree
126  */
127 struct mapping_node {
128 	struct rb_node rb_node;
129 	u64 bytenr;
130 	void *data;
131 };
132 
133 struct mapping_tree {
134 	struct rb_root rb_root;
135 	spinlock_t lock;
136 };
137 
138 /*
139  * present a tree block to process
140  */
141 struct tree_block {
142 	struct rb_node rb_node;
143 	u64 bytenr;
144 	struct btrfs_key key;
145 	unsigned int level:8;
146 	unsigned int key_ready:1;
147 };
148 
149 #define MAX_EXTENTS 128
150 
151 struct file_extent_cluster {
152 	u64 start;
153 	u64 end;
154 	u64 boundary[MAX_EXTENTS];
155 	unsigned int nr;
156 };
157 
158 struct reloc_control {
159 	/* block group to relocate */
160 	struct btrfs_block_group_cache *block_group;
161 	/* extent tree */
162 	struct btrfs_root *extent_root;
163 	/* inode for moving data */
164 	struct inode *data_inode;
165 
166 	struct btrfs_block_rsv *block_rsv;
167 
168 	struct backref_cache backref_cache;
169 
170 	struct file_extent_cluster cluster;
171 	/* tree blocks have been processed */
172 	struct extent_io_tree processed_blocks;
173 	/* map start of tree root to corresponding reloc tree */
174 	struct mapping_tree reloc_root_tree;
175 	/* list of reloc trees */
176 	struct list_head reloc_roots;
177 	/* size of metadata reservation for merging reloc trees */
178 	u64 merging_rsv_size;
179 	/* size of relocated tree nodes */
180 	u64 nodes_relocated;
181 	/* reserved size for block group relocation*/
182 	u64 reserved_bytes;
183 
184 	u64 search_start;
185 	u64 extents_found;
186 
187 	unsigned int stage:8;
188 	unsigned int create_reloc_tree:1;
189 	unsigned int merge_reloc_tree:1;
190 	unsigned int found_file_extent:1;
191 };
192 
193 /* stages of data relocation */
194 #define MOVE_DATA_EXTENTS	0
195 #define UPDATE_DATA_PTRS	1
196 
197 static void remove_backref_node(struct backref_cache *cache,
198 				struct backref_node *node);
199 static void __mark_block_processed(struct reloc_control *rc,
200 				   struct backref_node *node);
201 
202 static void mapping_tree_init(struct mapping_tree *tree)
203 {
204 	tree->rb_root = RB_ROOT;
205 	spin_lock_init(&tree->lock);
206 }
207 
208 static void backref_cache_init(struct backref_cache *cache)
209 {
210 	int i;
211 	cache->rb_root = RB_ROOT;
212 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
213 		INIT_LIST_HEAD(&cache->pending[i]);
214 	INIT_LIST_HEAD(&cache->changed);
215 	INIT_LIST_HEAD(&cache->detached);
216 	INIT_LIST_HEAD(&cache->leaves);
217 }
218 
219 static void backref_cache_cleanup(struct backref_cache *cache)
220 {
221 	struct backref_node *node;
222 	int i;
223 
224 	while (!list_empty(&cache->detached)) {
225 		node = list_entry(cache->detached.next,
226 				  struct backref_node, list);
227 		remove_backref_node(cache, node);
228 	}
229 
230 	while (!list_empty(&cache->leaves)) {
231 		node = list_entry(cache->leaves.next,
232 				  struct backref_node, lower);
233 		remove_backref_node(cache, node);
234 	}
235 
236 	cache->last_trans = 0;
237 
238 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
239 		ASSERT(list_empty(&cache->pending[i]));
240 	ASSERT(list_empty(&cache->changed));
241 	ASSERT(list_empty(&cache->detached));
242 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
243 	ASSERT(!cache->nr_nodes);
244 	ASSERT(!cache->nr_edges);
245 }
246 
247 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
248 {
249 	struct backref_node *node;
250 
251 	node = kzalloc(sizeof(*node), GFP_NOFS);
252 	if (node) {
253 		INIT_LIST_HEAD(&node->list);
254 		INIT_LIST_HEAD(&node->upper);
255 		INIT_LIST_HEAD(&node->lower);
256 		RB_CLEAR_NODE(&node->rb_node);
257 		cache->nr_nodes++;
258 	}
259 	return node;
260 }
261 
262 static void free_backref_node(struct backref_cache *cache,
263 			      struct backref_node *node)
264 {
265 	if (node) {
266 		cache->nr_nodes--;
267 		kfree(node);
268 	}
269 }
270 
271 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
272 {
273 	struct backref_edge *edge;
274 
275 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
276 	if (edge)
277 		cache->nr_edges++;
278 	return edge;
279 }
280 
281 static void free_backref_edge(struct backref_cache *cache,
282 			      struct backref_edge *edge)
283 {
284 	if (edge) {
285 		cache->nr_edges--;
286 		kfree(edge);
287 	}
288 }
289 
290 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
291 				   struct rb_node *node)
292 {
293 	struct rb_node **p = &root->rb_node;
294 	struct rb_node *parent = NULL;
295 	struct tree_entry *entry;
296 
297 	while (*p) {
298 		parent = *p;
299 		entry = rb_entry(parent, struct tree_entry, rb_node);
300 
301 		if (bytenr < entry->bytenr)
302 			p = &(*p)->rb_left;
303 		else if (bytenr > entry->bytenr)
304 			p = &(*p)->rb_right;
305 		else
306 			return parent;
307 	}
308 
309 	rb_link_node(node, parent, p);
310 	rb_insert_color(node, root);
311 	return NULL;
312 }
313 
314 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
315 {
316 	struct rb_node *n = root->rb_node;
317 	struct tree_entry *entry;
318 
319 	while (n) {
320 		entry = rb_entry(n, struct tree_entry, rb_node);
321 
322 		if (bytenr < entry->bytenr)
323 			n = n->rb_left;
324 		else if (bytenr > entry->bytenr)
325 			n = n->rb_right;
326 		else
327 			return n;
328 	}
329 	return NULL;
330 }
331 
332 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
333 {
334 
335 	struct btrfs_fs_info *fs_info = NULL;
336 	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
337 					      rb_node);
338 	if (bnode->root)
339 		fs_info = bnode->root->fs_info;
340 	btrfs_panic(fs_info, errno,
341 		    "Inconsistency in backref cache found at offset %llu",
342 		    bytenr);
343 }
344 
345 /*
346  * walk up backref nodes until reach node presents tree root
347  */
348 static struct backref_node *walk_up_backref(struct backref_node *node,
349 					    struct backref_edge *edges[],
350 					    int *index)
351 {
352 	struct backref_edge *edge;
353 	int idx = *index;
354 
355 	while (!list_empty(&node->upper)) {
356 		edge = list_entry(node->upper.next,
357 				  struct backref_edge, list[LOWER]);
358 		edges[idx++] = edge;
359 		node = edge->node[UPPER];
360 	}
361 	BUG_ON(node->detached);
362 	*index = idx;
363 	return node;
364 }
365 
366 /*
367  * walk down backref nodes to find start of next reference path
368  */
369 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
370 					      int *index)
371 {
372 	struct backref_edge *edge;
373 	struct backref_node *lower;
374 	int idx = *index;
375 
376 	while (idx > 0) {
377 		edge = edges[idx - 1];
378 		lower = edge->node[LOWER];
379 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
380 			idx--;
381 			continue;
382 		}
383 		edge = list_entry(edge->list[LOWER].next,
384 				  struct backref_edge, list[LOWER]);
385 		edges[idx - 1] = edge;
386 		*index = idx;
387 		return edge->node[UPPER];
388 	}
389 	*index = 0;
390 	return NULL;
391 }
392 
393 static void unlock_node_buffer(struct backref_node *node)
394 {
395 	if (node->locked) {
396 		btrfs_tree_unlock(node->eb);
397 		node->locked = 0;
398 	}
399 }
400 
401 static void drop_node_buffer(struct backref_node *node)
402 {
403 	if (node->eb) {
404 		unlock_node_buffer(node);
405 		free_extent_buffer(node->eb);
406 		node->eb = NULL;
407 	}
408 }
409 
410 static void drop_backref_node(struct backref_cache *tree,
411 			      struct backref_node *node)
412 {
413 	BUG_ON(!list_empty(&node->upper));
414 
415 	drop_node_buffer(node);
416 	list_del(&node->list);
417 	list_del(&node->lower);
418 	if (!RB_EMPTY_NODE(&node->rb_node))
419 		rb_erase(&node->rb_node, &tree->rb_root);
420 	free_backref_node(tree, node);
421 }
422 
423 /*
424  * remove a backref node from the backref cache
425  */
426 static void remove_backref_node(struct backref_cache *cache,
427 				struct backref_node *node)
428 {
429 	struct backref_node *upper;
430 	struct backref_edge *edge;
431 
432 	if (!node)
433 		return;
434 
435 	BUG_ON(!node->lowest && !node->detached);
436 	while (!list_empty(&node->upper)) {
437 		edge = list_entry(node->upper.next, struct backref_edge,
438 				  list[LOWER]);
439 		upper = edge->node[UPPER];
440 		list_del(&edge->list[LOWER]);
441 		list_del(&edge->list[UPPER]);
442 		free_backref_edge(cache, edge);
443 
444 		if (RB_EMPTY_NODE(&upper->rb_node)) {
445 			BUG_ON(!list_empty(&node->upper));
446 			drop_backref_node(cache, node);
447 			node = upper;
448 			node->lowest = 1;
449 			continue;
450 		}
451 		/*
452 		 * add the node to leaf node list if no other
453 		 * child block cached.
454 		 */
455 		if (list_empty(&upper->lower)) {
456 			list_add_tail(&upper->lower, &cache->leaves);
457 			upper->lowest = 1;
458 		}
459 	}
460 
461 	drop_backref_node(cache, node);
462 }
463 
464 static void update_backref_node(struct backref_cache *cache,
465 				struct backref_node *node, u64 bytenr)
466 {
467 	struct rb_node *rb_node;
468 	rb_erase(&node->rb_node, &cache->rb_root);
469 	node->bytenr = bytenr;
470 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
471 	if (rb_node)
472 		backref_tree_panic(rb_node, -EEXIST, bytenr);
473 }
474 
475 /*
476  * update backref cache after a transaction commit
477  */
478 static int update_backref_cache(struct btrfs_trans_handle *trans,
479 				struct backref_cache *cache)
480 {
481 	struct backref_node *node;
482 	int level = 0;
483 
484 	if (cache->last_trans == 0) {
485 		cache->last_trans = trans->transid;
486 		return 0;
487 	}
488 
489 	if (cache->last_trans == trans->transid)
490 		return 0;
491 
492 	/*
493 	 * detached nodes are used to avoid unnecessary backref
494 	 * lookup. transaction commit changes the extent tree.
495 	 * so the detached nodes are no longer useful.
496 	 */
497 	while (!list_empty(&cache->detached)) {
498 		node = list_entry(cache->detached.next,
499 				  struct backref_node, list);
500 		remove_backref_node(cache, node);
501 	}
502 
503 	while (!list_empty(&cache->changed)) {
504 		node = list_entry(cache->changed.next,
505 				  struct backref_node, list);
506 		list_del_init(&node->list);
507 		BUG_ON(node->pending);
508 		update_backref_node(cache, node, node->new_bytenr);
509 	}
510 
511 	/*
512 	 * some nodes can be left in the pending list if there were
513 	 * errors during processing the pending nodes.
514 	 */
515 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
516 		list_for_each_entry(node, &cache->pending[level], list) {
517 			BUG_ON(!node->pending);
518 			if (node->bytenr == node->new_bytenr)
519 				continue;
520 			update_backref_node(cache, node, node->new_bytenr);
521 		}
522 	}
523 
524 	cache->last_trans = 0;
525 	return 1;
526 }
527 
528 
529 static int should_ignore_root(struct btrfs_root *root)
530 {
531 	struct btrfs_root *reloc_root;
532 
533 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
534 		return 0;
535 
536 	reloc_root = root->reloc_root;
537 	if (!reloc_root)
538 		return 0;
539 
540 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
541 	    root->fs_info->running_transaction->transid - 1)
542 		return 0;
543 	/*
544 	 * if there is reloc tree and it was created in previous
545 	 * transaction backref lookup can find the reloc tree,
546 	 * so backref node for the fs tree root is useless for
547 	 * relocation.
548 	 */
549 	return 1;
550 }
551 /*
552  * find reloc tree by address of tree root
553  */
554 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
555 					  u64 bytenr)
556 {
557 	struct rb_node *rb_node;
558 	struct mapping_node *node;
559 	struct btrfs_root *root = NULL;
560 
561 	spin_lock(&rc->reloc_root_tree.lock);
562 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
563 	if (rb_node) {
564 		node = rb_entry(rb_node, struct mapping_node, rb_node);
565 		root = (struct btrfs_root *)node->data;
566 	}
567 	spin_unlock(&rc->reloc_root_tree.lock);
568 	return root;
569 }
570 
571 static int is_cowonly_root(u64 root_objectid)
572 {
573 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
574 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
575 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
576 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
577 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
578 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
579 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
580 	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
581 	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
582 		return 1;
583 	return 0;
584 }
585 
586 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
587 					u64 root_objectid)
588 {
589 	struct btrfs_key key;
590 
591 	key.objectid = root_objectid;
592 	key.type = BTRFS_ROOT_ITEM_KEY;
593 	if (is_cowonly_root(root_objectid))
594 		key.offset = 0;
595 	else
596 		key.offset = (u64)-1;
597 
598 	return btrfs_get_fs_root(fs_info, &key, false);
599 }
600 
601 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
602 static noinline_for_stack
603 struct btrfs_root *find_tree_root(struct reloc_control *rc,
604 				  struct extent_buffer *leaf,
605 				  struct btrfs_extent_ref_v0 *ref0)
606 {
607 	struct btrfs_root *root;
608 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
609 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
610 
611 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
612 
613 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
614 	BUG_ON(IS_ERR(root));
615 
616 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
617 	    generation != btrfs_root_generation(&root->root_item))
618 		return NULL;
619 
620 	return root;
621 }
622 #endif
623 
624 static noinline_for_stack
625 int find_inline_backref(struct extent_buffer *leaf, int slot,
626 			unsigned long *ptr, unsigned long *end)
627 {
628 	struct btrfs_key key;
629 	struct btrfs_extent_item *ei;
630 	struct btrfs_tree_block_info *bi;
631 	u32 item_size;
632 
633 	btrfs_item_key_to_cpu(leaf, &key, slot);
634 
635 	item_size = btrfs_item_size_nr(leaf, slot);
636 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
637 	if (item_size < sizeof(*ei)) {
638 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
639 		return 1;
640 	}
641 #endif
642 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
643 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
644 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
645 
646 	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
647 	    item_size <= sizeof(*ei) + sizeof(*bi)) {
648 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
649 		return 1;
650 	}
651 	if (key.type == BTRFS_METADATA_ITEM_KEY &&
652 	    item_size <= sizeof(*ei)) {
653 		WARN_ON(item_size < sizeof(*ei));
654 		return 1;
655 	}
656 
657 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
658 		bi = (struct btrfs_tree_block_info *)(ei + 1);
659 		*ptr = (unsigned long)(bi + 1);
660 	} else {
661 		*ptr = (unsigned long)(ei + 1);
662 	}
663 	*end = (unsigned long)ei + item_size;
664 	return 0;
665 }
666 
667 /*
668  * build backref tree for a given tree block. root of the backref tree
669  * corresponds the tree block, leaves of the backref tree correspond
670  * roots of b-trees that reference the tree block.
671  *
672  * the basic idea of this function is check backrefs of a given block
673  * to find upper level blocks that reference the block, and then check
674  * backrefs of these upper level blocks recursively. the recursion stop
675  * when tree root is reached or backrefs for the block is cached.
676  *
677  * NOTE: if we find backrefs for a block are cached, we know backrefs
678  * for all upper level blocks that directly/indirectly reference the
679  * block are also cached.
680  */
681 static noinline_for_stack
682 struct backref_node *build_backref_tree(struct reloc_control *rc,
683 					struct btrfs_key *node_key,
684 					int level, u64 bytenr)
685 {
686 	struct backref_cache *cache = &rc->backref_cache;
687 	struct btrfs_path *path1;
688 	struct btrfs_path *path2;
689 	struct extent_buffer *eb;
690 	struct btrfs_root *root;
691 	struct backref_node *cur;
692 	struct backref_node *upper;
693 	struct backref_node *lower;
694 	struct backref_node *node = NULL;
695 	struct backref_node *exist = NULL;
696 	struct backref_edge *edge;
697 	struct rb_node *rb_node;
698 	struct btrfs_key key;
699 	unsigned long end;
700 	unsigned long ptr;
701 	LIST_HEAD(list);
702 	LIST_HEAD(useless);
703 	int cowonly;
704 	int ret;
705 	int err = 0;
706 	bool need_check = true;
707 
708 	path1 = btrfs_alloc_path();
709 	path2 = btrfs_alloc_path();
710 	if (!path1 || !path2) {
711 		err = -ENOMEM;
712 		goto out;
713 	}
714 	path1->reada = READA_FORWARD;
715 	path2->reada = READA_FORWARD;
716 
717 	node = alloc_backref_node(cache);
718 	if (!node) {
719 		err = -ENOMEM;
720 		goto out;
721 	}
722 
723 	node->bytenr = bytenr;
724 	node->level = level;
725 	node->lowest = 1;
726 	cur = node;
727 again:
728 	end = 0;
729 	ptr = 0;
730 	key.objectid = cur->bytenr;
731 	key.type = BTRFS_METADATA_ITEM_KEY;
732 	key.offset = (u64)-1;
733 
734 	path1->search_commit_root = 1;
735 	path1->skip_locking = 1;
736 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
737 				0, 0);
738 	if (ret < 0) {
739 		err = ret;
740 		goto out;
741 	}
742 	ASSERT(ret);
743 	ASSERT(path1->slots[0]);
744 
745 	path1->slots[0]--;
746 
747 	WARN_ON(cur->checked);
748 	if (!list_empty(&cur->upper)) {
749 		/*
750 		 * the backref was added previously when processing
751 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
752 		 */
753 		ASSERT(list_is_singular(&cur->upper));
754 		edge = list_entry(cur->upper.next, struct backref_edge,
755 				  list[LOWER]);
756 		ASSERT(list_empty(&edge->list[UPPER]));
757 		exist = edge->node[UPPER];
758 		/*
759 		 * add the upper level block to pending list if we need
760 		 * check its backrefs
761 		 */
762 		if (!exist->checked)
763 			list_add_tail(&edge->list[UPPER], &list);
764 	} else {
765 		exist = NULL;
766 	}
767 
768 	while (1) {
769 		cond_resched();
770 		eb = path1->nodes[0];
771 
772 		if (ptr >= end) {
773 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
774 				ret = btrfs_next_leaf(rc->extent_root, path1);
775 				if (ret < 0) {
776 					err = ret;
777 					goto out;
778 				}
779 				if (ret > 0)
780 					break;
781 				eb = path1->nodes[0];
782 			}
783 
784 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
785 			if (key.objectid != cur->bytenr) {
786 				WARN_ON(exist);
787 				break;
788 			}
789 
790 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
791 			    key.type == BTRFS_METADATA_ITEM_KEY) {
792 				ret = find_inline_backref(eb, path1->slots[0],
793 							  &ptr, &end);
794 				if (ret)
795 					goto next;
796 			}
797 		}
798 
799 		if (ptr < end) {
800 			/* update key for inline back ref */
801 			struct btrfs_extent_inline_ref *iref;
802 			iref = (struct btrfs_extent_inline_ref *)ptr;
803 			key.type = btrfs_extent_inline_ref_type(eb, iref);
804 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
805 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
806 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
807 		}
808 
809 		if (exist &&
810 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
811 		      exist->owner == key.offset) ||
812 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
813 		      exist->bytenr == key.offset))) {
814 			exist = NULL;
815 			goto next;
816 		}
817 
818 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
819 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
820 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
821 			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
822 				struct btrfs_extent_ref_v0 *ref0;
823 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
824 						struct btrfs_extent_ref_v0);
825 				if (key.objectid == key.offset) {
826 					root = find_tree_root(rc, eb, ref0);
827 					if (root && !should_ignore_root(root))
828 						cur->root = root;
829 					else
830 						list_add(&cur->list, &useless);
831 					break;
832 				}
833 				if (is_cowonly_root(btrfs_ref_root_v0(eb,
834 								      ref0)))
835 					cur->cowonly = 1;
836 			}
837 #else
838 		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
839 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
840 #endif
841 			if (key.objectid == key.offset) {
842 				/*
843 				 * only root blocks of reloc trees use
844 				 * backref of this type.
845 				 */
846 				root = find_reloc_root(rc, cur->bytenr);
847 				ASSERT(root);
848 				cur->root = root;
849 				break;
850 			}
851 
852 			edge = alloc_backref_edge(cache);
853 			if (!edge) {
854 				err = -ENOMEM;
855 				goto out;
856 			}
857 			rb_node = tree_search(&cache->rb_root, key.offset);
858 			if (!rb_node) {
859 				upper = alloc_backref_node(cache);
860 				if (!upper) {
861 					free_backref_edge(cache, edge);
862 					err = -ENOMEM;
863 					goto out;
864 				}
865 				upper->bytenr = key.offset;
866 				upper->level = cur->level + 1;
867 				/*
868 				 *  backrefs for the upper level block isn't
869 				 *  cached, add the block to pending list
870 				 */
871 				list_add_tail(&edge->list[UPPER], &list);
872 			} else {
873 				upper = rb_entry(rb_node, struct backref_node,
874 						 rb_node);
875 				ASSERT(upper->checked);
876 				INIT_LIST_HEAD(&edge->list[UPPER]);
877 			}
878 			list_add_tail(&edge->list[LOWER], &cur->upper);
879 			edge->node[LOWER] = cur;
880 			edge->node[UPPER] = upper;
881 
882 			goto next;
883 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
884 			goto next;
885 		}
886 
887 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
888 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
889 		if (IS_ERR(root)) {
890 			err = PTR_ERR(root);
891 			goto out;
892 		}
893 
894 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
895 			cur->cowonly = 1;
896 
897 		if (btrfs_root_level(&root->root_item) == cur->level) {
898 			/* tree root */
899 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
900 			       cur->bytenr);
901 			if (should_ignore_root(root))
902 				list_add(&cur->list, &useless);
903 			else
904 				cur->root = root;
905 			break;
906 		}
907 
908 		level = cur->level + 1;
909 
910 		/*
911 		 * searching the tree to find upper level blocks
912 		 * reference the block.
913 		 */
914 		path2->search_commit_root = 1;
915 		path2->skip_locking = 1;
916 		path2->lowest_level = level;
917 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
918 		path2->lowest_level = 0;
919 		if (ret < 0) {
920 			err = ret;
921 			goto out;
922 		}
923 		if (ret > 0 && path2->slots[level] > 0)
924 			path2->slots[level]--;
925 
926 		eb = path2->nodes[level];
927 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
928 		    cur->bytenr) {
929 			btrfs_err(root->fs_info,
930 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
931 				  cur->bytenr, level - 1, root->objectid,
932 				  node_key->objectid, node_key->type,
933 				  node_key->offset);
934 			err = -ENOENT;
935 			goto out;
936 		}
937 		lower = cur;
938 		need_check = true;
939 		for (; level < BTRFS_MAX_LEVEL; level++) {
940 			if (!path2->nodes[level]) {
941 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
942 				       lower->bytenr);
943 				if (should_ignore_root(root))
944 					list_add(&lower->list, &useless);
945 				else
946 					lower->root = root;
947 				break;
948 			}
949 
950 			edge = alloc_backref_edge(cache);
951 			if (!edge) {
952 				err = -ENOMEM;
953 				goto out;
954 			}
955 
956 			eb = path2->nodes[level];
957 			rb_node = tree_search(&cache->rb_root, eb->start);
958 			if (!rb_node) {
959 				upper = alloc_backref_node(cache);
960 				if (!upper) {
961 					free_backref_edge(cache, edge);
962 					err = -ENOMEM;
963 					goto out;
964 				}
965 				upper->bytenr = eb->start;
966 				upper->owner = btrfs_header_owner(eb);
967 				upper->level = lower->level + 1;
968 				if (!test_bit(BTRFS_ROOT_REF_COWS,
969 					      &root->state))
970 					upper->cowonly = 1;
971 
972 				/*
973 				 * if we know the block isn't shared
974 				 * we can void checking its backrefs.
975 				 */
976 				if (btrfs_block_can_be_shared(root, eb))
977 					upper->checked = 0;
978 				else
979 					upper->checked = 1;
980 
981 				/*
982 				 * add the block to pending list if we
983 				 * need check its backrefs, we only do this once
984 				 * while walking up a tree as we will catch
985 				 * anything else later on.
986 				 */
987 				if (!upper->checked && need_check) {
988 					need_check = false;
989 					list_add_tail(&edge->list[UPPER],
990 						      &list);
991 				} else {
992 					if (upper->checked)
993 						need_check = true;
994 					INIT_LIST_HEAD(&edge->list[UPPER]);
995 				}
996 			} else {
997 				upper = rb_entry(rb_node, struct backref_node,
998 						 rb_node);
999 				ASSERT(upper->checked);
1000 				INIT_LIST_HEAD(&edge->list[UPPER]);
1001 				if (!upper->owner)
1002 					upper->owner = btrfs_header_owner(eb);
1003 			}
1004 			list_add_tail(&edge->list[LOWER], &lower->upper);
1005 			edge->node[LOWER] = lower;
1006 			edge->node[UPPER] = upper;
1007 
1008 			if (rb_node)
1009 				break;
1010 			lower = upper;
1011 			upper = NULL;
1012 		}
1013 		btrfs_release_path(path2);
1014 next:
1015 		if (ptr < end) {
1016 			ptr += btrfs_extent_inline_ref_size(key.type);
1017 			if (ptr >= end) {
1018 				WARN_ON(ptr > end);
1019 				ptr = 0;
1020 				end = 0;
1021 			}
1022 		}
1023 		if (ptr >= end)
1024 			path1->slots[0]++;
1025 	}
1026 	btrfs_release_path(path1);
1027 
1028 	cur->checked = 1;
1029 	WARN_ON(exist);
1030 
1031 	/* the pending list isn't empty, take the first block to process */
1032 	if (!list_empty(&list)) {
1033 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034 		list_del_init(&edge->list[UPPER]);
1035 		cur = edge->node[UPPER];
1036 		goto again;
1037 	}
1038 
1039 	/*
1040 	 * everything goes well, connect backref nodes and insert backref nodes
1041 	 * into the cache.
1042 	 */
1043 	ASSERT(node->checked);
1044 	cowonly = node->cowonly;
1045 	if (!cowonly) {
1046 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1047 				      &node->rb_node);
1048 		if (rb_node)
1049 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1050 		list_add_tail(&node->lower, &cache->leaves);
1051 	}
1052 
1053 	list_for_each_entry(edge, &node->upper, list[LOWER])
1054 		list_add_tail(&edge->list[UPPER], &list);
1055 
1056 	while (!list_empty(&list)) {
1057 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1058 		list_del_init(&edge->list[UPPER]);
1059 		upper = edge->node[UPPER];
1060 		if (upper->detached) {
1061 			list_del(&edge->list[LOWER]);
1062 			lower = edge->node[LOWER];
1063 			free_backref_edge(cache, edge);
1064 			if (list_empty(&lower->upper))
1065 				list_add(&lower->list, &useless);
1066 			continue;
1067 		}
1068 
1069 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1070 			if (upper->lowest) {
1071 				list_del_init(&upper->lower);
1072 				upper->lowest = 0;
1073 			}
1074 
1075 			list_add_tail(&edge->list[UPPER], &upper->lower);
1076 			continue;
1077 		}
1078 
1079 		if (!upper->checked) {
1080 			/*
1081 			 * Still want to blow up for developers since this is a
1082 			 * logic bug.
1083 			 */
1084 			ASSERT(0);
1085 			err = -EINVAL;
1086 			goto out;
1087 		}
1088 		if (cowonly != upper->cowonly) {
1089 			ASSERT(0);
1090 			err = -EINVAL;
1091 			goto out;
1092 		}
1093 
1094 		if (!cowonly) {
1095 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1096 					      &upper->rb_node);
1097 			if (rb_node)
1098 				backref_tree_panic(rb_node, -EEXIST,
1099 						   upper->bytenr);
1100 		}
1101 
1102 		list_add_tail(&edge->list[UPPER], &upper->lower);
1103 
1104 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1105 			list_add_tail(&edge->list[UPPER], &list);
1106 	}
1107 	/*
1108 	 * process useless backref nodes. backref nodes for tree leaves
1109 	 * are deleted from the cache. backref nodes for upper level
1110 	 * tree blocks are left in the cache to avoid unnecessary backref
1111 	 * lookup.
1112 	 */
1113 	while (!list_empty(&useless)) {
1114 		upper = list_entry(useless.next, struct backref_node, list);
1115 		list_del_init(&upper->list);
1116 		ASSERT(list_empty(&upper->upper));
1117 		if (upper == node)
1118 			node = NULL;
1119 		if (upper->lowest) {
1120 			list_del_init(&upper->lower);
1121 			upper->lowest = 0;
1122 		}
1123 		while (!list_empty(&upper->lower)) {
1124 			edge = list_entry(upper->lower.next,
1125 					  struct backref_edge, list[UPPER]);
1126 			list_del(&edge->list[UPPER]);
1127 			list_del(&edge->list[LOWER]);
1128 			lower = edge->node[LOWER];
1129 			free_backref_edge(cache, edge);
1130 
1131 			if (list_empty(&lower->upper))
1132 				list_add(&lower->list, &useless);
1133 		}
1134 		__mark_block_processed(rc, upper);
1135 		if (upper->level > 0) {
1136 			list_add(&upper->list, &cache->detached);
1137 			upper->detached = 1;
1138 		} else {
1139 			rb_erase(&upper->rb_node, &cache->rb_root);
1140 			free_backref_node(cache, upper);
1141 		}
1142 	}
1143 out:
1144 	btrfs_free_path(path1);
1145 	btrfs_free_path(path2);
1146 	if (err) {
1147 		while (!list_empty(&useless)) {
1148 			lower = list_entry(useless.next,
1149 					   struct backref_node, list);
1150 			list_del_init(&lower->list);
1151 		}
1152 		while (!list_empty(&list)) {
1153 			edge = list_first_entry(&list, struct backref_edge,
1154 						list[UPPER]);
1155 			list_del(&edge->list[UPPER]);
1156 			list_del(&edge->list[LOWER]);
1157 			lower = edge->node[LOWER];
1158 			upper = edge->node[UPPER];
1159 			free_backref_edge(cache, edge);
1160 
1161 			/*
1162 			 * Lower is no longer linked to any upper backref nodes
1163 			 * and isn't in the cache, we can free it ourselves.
1164 			 */
1165 			if (list_empty(&lower->upper) &&
1166 			    RB_EMPTY_NODE(&lower->rb_node))
1167 				list_add(&lower->list, &useless);
1168 
1169 			if (!RB_EMPTY_NODE(&upper->rb_node))
1170 				continue;
1171 
1172 			/* Add this guy's upper edges to the list to process */
1173 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1174 				list_add_tail(&edge->list[UPPER], &list);
1175 			if (list_empty(&upper->upper))
1176 				list_add(&upper->list, &useless);
1177 		}
1178 
1179 		while (!list_empty(&useless)) {
1180 			lower = list_entry(useless.next,
1181 					   struct backref_node, list);
1182 			list_del_init(&lower->list);
1183 			if (lower == node)
1184 				node = NULL;
1185 			free_backref_node(cache, lower);
1186 		}
1187 
1188 		free_backref_node(cache, node);
1189 		return ERR_PTR(err);
1190 	}
1191 	ASSERT(!node || !node->detached);
1192 	return node;
1193 }
1194 
1195 /*
1196  * helper to add backref node for the newly created snapshot.
1197  * the backref node is created by cloning backref node that
1198  * corresponds to root of source tree
1199  */
1200 static int clone_backref_node(struct btrfs_trans_handle *trans,
1201 			      struct reloc_control *rc,
1202 			      struct btrfs_root *src,
1203 			      struct btrfs_root *dest)
1204 {
1205 	struct btrfs_root *reloc_root = src->reloc_root;
1206 	struct backref_cache *cache = &rc->backref_cache;
1207 	struct backref_node *node = NULL;
1208 	struct backref_node *new_node;
1209 	struct backref_edge *edge;
1210 	struct backref_edge *new_edge;
1211 	struct rb_node *rb_node;
1212 
1213 	if (cache->last_trans > 0)
1214 		update_backref_cache(trans, cache);
1215 
1216 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1217 	if (rb_node) {
1218 		node = rb_entry(rb_node, struct backref_node, rb_node);
1219 		if (node->detached)
1220 			node = NULL;
1221 		else
1222 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1223 	}
1224 
1225 	if (!node) {
1226 		rb_node = tree_search(&cache->rb_root,
1227 				      reloc_root->commit_root->start);
1228 		if (rb_node) {
1229 			node = rb_entry(rb_node, struct backref_node,
1230 					rb_node);
1231 			BUG_ON(node->detached);
1232 		}
1233 	}
1234 
1235 	if (!node)
1236 		return 0;
1237 
1238 	new_node = alloc_backref_node(cache);
1239 	if (!new_node)
1240 		return -ENOMEM;
1241 
1242 	new_node->bytenr = dest->node->start;
1243 	new_node->level = node->level;
1244 	new_node->lowest = node->lowest;
1245 	new_node->checked = 1;
1246 	new_node->root = dest;
1247 
1248 	if (!node->lowest) {
1249 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1250 			new_edge = alloc_backref_edge(cache);
1251 			if (!new_edge)
1252 				goto fail;
1253 
1254 			new_edge->node[UPPER] = new_node;
1255 			new_edge->node[LOWER] = edge->node[LOWER];
1256 			list_add_tail(&new_edge->list[UPPER],
1257 				      &new_node->lower);
1258 		}
1259 	} else {
1260 		list_add_tail(&new_node->lower, &cache->leaves);
1261 	}
1262 
1263 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1264 			      &new_node->rb_node);
1265 	if (rb_node)
1266 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1267 
1268 	if (!new_node->lowest) {
1269 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1270 			list_add_tail(&new_edge->list[LOWER],
1271 				      &new_edge->node[LOWER]->upper);
1272 		}
1273 	}
1274 	return 0;
1275 fail:
1276 	while (!list_empty(&new_node->lower)) {
1277 		new_edge = list_entry(new_node->lower.next,
1278 				      struct backref_edge, list[UPPER]);
1279 		list_del(&new_edge->list[UPPER]);
1280 		free_backref_edge(cache, new_edge);
1281 	}
1282 	free_backref_node(cache, new_node);
1283 	return -ENOMEM;
1284 }
1285 
1286 /*
1287  * helper to add 'address of tree root -> reloc tree' mapping
1288  */
1289 static int __must_check __add_reloc_root(struct btrfs_root *root)
1290 {
1291 	struct rb_node *rb_node;
1292 	struct mapping_node *node;
1293 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1294 
1295 	node = kmalloc(sizeof(*node), GFP_NOFS);
1296 	if (!node)
1297 		return -ENOMEM;
1298 
1299 	node->bytenr = root->node->start;
1300 	node->data = root;
1301 
1302 	spin_lock(&rc->reloc_root_tree.lock);
1303 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1304 			      node->bytenr, &node->rb_node);
1305 	spin_unlock(&rc->reloc_root_tree.lock);
1306 	if (rb_node) {
1307 		btrfs_panic(root->fs_info, -EEXIST,
1308 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1309 			    node->bytenr);
1310 		kfree(node);
1311 		return -EEXIST;
1312 	}
1313 
1314 	list_add_tail(&root->root_list, &rc->reloc_roots);
1315 	return 0;
1316 }
1317 
1318 /*
1319  * helper to delete the 'address of tree root -> reloc tree'
1320  * mapping
1321  */
1322 static void __del_reloc_root(struct btrfs_root *root)
1323 {
1324 	struct rb_node *rb_node;
1325 	struct mapping_node *node = NULL;
1326 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1327 
1328 	spin_lock(&rc->reloc_root_tree.lock);
1329 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1330 			      root->node->start);
1331 	if (rb_node) {
1332 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1333 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1334 	}
1335 	spin_unlock(&rc->reloc_root_tree.lock);
1336 
1337 	if (!node)
1338 		return;
1339 	BUG_ON((struct btrfs_root *)node->data != root);
1340 
1341 	spin_lock(&root->fs_info->trans_lock);
1342 	list_del_init(&root->root_list);
1343 	spin_unlock(&root->fs_info->trans_lock);
1344 	kfree(node);
1345 }
1346 
1347 /*
1348  * helper to update the 'address of tree root -> reloc tree'
1349  * mapping
1350  */
1351 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1352 {
1353 	struct rb_node *rb_node;
1354 	struct mapping_node *node = NULL;
1355 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1356 
1357 	spin_lock(&rc->reloc_root_tree.lock);
1358 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1359 			      root->node->start);
1360 	if (rb_node) {
1361 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1362 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1363 	}
1364 	spin_unlock(&rc->reloc_root_tree.lock);
1365 
1366 	if (!node)
1367 		return 0;
1368 	BUG_ON((struct btrfs_root *)node->data != root);
1369 
1370 	spin_lock(&rc->reloc_root_tree.lock);
1371 	node->bytenr = new_bytenr;
1372 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1373 			      node->bytenr, &node->rb_node);
1374 	spin_unlock(&rc->reloc_root_tree.lock);
1375 	if (rb_node)
1376 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1377 	return 0;
1378 }
1379 
1380 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1381 					struct btrfs_root *root, u64 objectid)
1382 {
1383 	struct btrfs_root *reloc_root;
1384 	struct extent_buffer *eb;
1385 	struct btrfs_root_item *root_item;
1386 	struct btrfs_key root_key;
1387 	u64 last_snap = 0;
1388 	int ret;
1389 
1390 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1391 	BUG_ON(!root_item);
1392 
1393 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1394 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1395 	root_key.offset = objectid;
1396 
1397 	if (root->root_key.objectid == objectid) {
1398 		/* called by btrfs_init_reloc_root */
1399 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1400 				      BTRFS_TREE_RELOC_OBJECTID);
1401 		BUG_ON(ret);
1402 
1403 		last_snap = btrfs_root_last_snapshot(&root->root_item);
1404 		btrfs_set_root_last_snapshot(&root->root_item,
1405 					     trans->transid - 1);
1406 	} else {
1407 		/*
1408 		 * called by btrfs_reloc_post_snapshot_hook.
1409 		 * the source tree is a reloc tree, all tree blocks
1410 		 * modified after it was created have RELOC flag
1411 		 * set in their headers. so it's OK to not update
1412 		 * the 'last_snapshot'.
1413 		 */
1414 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1415 				      BTRFS_TREE_RELOC_OBJECTID);
1416 		BUG_ON(ret);
1417 	}
1418 
1419 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1420 	btrfs_set_root_bytenr(root_item, eb->start);
1421 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1422 	btrfs_set_root_generation(root_item, trans->transid);
1423 
1424 	if (root->root_key.objectid == objectid) {
1425 		btrfs_set_root_refs(root_item, 0);
1426 		memset(&root_item->drop_progress, 0,
1427 		       sizeof(struct btrfs_disk_key));
1428 		root_item->drop_level = 0;
1429 		/*
1430 		 * abuse rtransid, it is safe because it is impossible to
1431 		 * receive data into a relocation tree.
1432 		 */
1433 		btrfs_set_root_rtransid(root_item, last_snap);
1434 		btrfs_set_root_otransid(root_item, trans->transid);
1435 	}
1436 
1437 	btrfs_tree_unlock(eb);
1438 	free_extent_buffer(eb);
1439 
1440 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1441 				&root_key, root_item);
1442 	BUG_ON(ret);
1443 	kfree(root_item);
1444 
1445 	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1446 	BUG_ON(IS_ERR(reloc_root));
1447 	reloc_root->last_trans = trans->transid;
1448 	return reloc_root;
1449 }
1450 
1451 /*
1452  * create reloc tree for a given fs tree. reloc tree is just a
1453  * snapshot of the fs tree with special root objectid.
1454  */
1455 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1456 			  struct btrfs_root *root)
1457 {
1458 	struct btrfs_root *reloc_root;
1459 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1460 	struct btrfs_block_rsv *rsv;
1461 	int clear_rsv = 0;
1462 	int ret;
1463 
1464 	if (root->reloc_root) {
1465 		reloc_root = root->reloc_root;
1466 		reloc_root->last_trans = trans->transid;
1467 		return 0;
1468 	}
1469 
1470 	if (!rc || !rc->create_reloc_tree ||
1471 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1472 		return 0;
1473 
1474 	if (!trans->reloc_reserved) {
1475 		rsv = trans->block_rsv;
1476 		trans->block_rsv = rc->block_rsv;
1477 		clear_rsv = 1;
1478 	}
1479 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1480 	if (clear_rsv)
1481 		trans->block_rsv = rsv;
1482 
1483 	ret = __add_reloc_root(reloc_root);
1484 	BUG_ON(ret < 0);
1485 	root->reloc_root = reloc_root;
1486 	return 0;
1487 }
1488 
1489 /*
1490  * update root item of reloc tree
1491  */
1492 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1493 			    struct btrfs_root *root)
1494 {
1495 	struct btrfs_root *reloc_root;
1496 	struct btrfs_root_item *root_item;
1497 	int ret;
1498 
1499 	if (!root->reloc_root)
1500 		goto out;
1501 
1502 	reloc_root = root->reloc_root;
1503 	root_item = &reloc_root->root_item;
1504 
1505 	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1506 	    btrfs_root_refs(root_item) == 0) {
1507 		root->reloc_root = NULL;
1508 		__del_reloc_root(reloc_root);
1509 	}
1510 
1511 	if (reloc_root->commit_root != reloc_root->node) {
1512 		btrfs_set_root_node(root_item, reloc_root->node);
1513 		free_extent_buffer(reloc_root->commit_root);
1514 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1515 	}
1516 
1517 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
1518 				&reloc_root->root_key, root_item);
1519 	BUG_ON(ret);
1520 
1521 out:
1522 	return 0;
1523 }
1524 
1525 /*
1526  * helper to find first cached inode with inode number >= objectid
1527  * in a subvolume
1528  */
1529 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1530 {
1531 	struct rb_node *node;
1532 	struct rb_node *prev;
1533 	struct btrfs_inode *entry;
1534 	struct inode *inode;
1535 
1536 	spin_lock(&root->inode_lock);
1537 again:
1538 	node = root->inode_tree.rb_node;
1539 	prev = NULL;
1540 	while (node) {
1541 		prev = node;
1542 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1543 
1544 		if (objectid < btrfs_ino(&entry->vfs_inode))
1545 			node = node->rb_left;
1546 		else if (objectid > btrfs_ino(&entry->vfs_inode))
1547 			node = node->rb_right;
1548 		else
1549 			break;
1550 	}
1551 	if (!node) {
1552 		while (prev) {
1553 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1554 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1555 				node = prev;
1556 				break;
1557 			}
1558 			prev = rb_next(prev);
1559 		}
1560 	}
1561 	while (node) {
1562 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1563 		inode = igrab(&entry->vfs_inode);
1564 		if (inode) {
1565 			spin_unlock(&root->inode_lock);
1566 			return inode;
1567 		}
1568 
1569 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
1570 		if (cond_resched_lock(&root->inode_lock))
1571 			goto again;
1572 
1573 		node = rb_next(node);
1574 	}
1575 	spin_unlock(&root->inode_lock);
1576 	return NULL;
1577 }
1578 
1579 static int in_block_group(u64 bytenr,
1580 			  struct btrfs_block_group_cache *block_group)
1581 {
1582 	if (bytenr >= block_group->key.objectid &&
1583 	    bytenr < block_group->key.objectid + block_group->key.offset)
1584 		return 1;
1585 	return 0;
1586 }
1587 
1588 /*
1589  * get new location of data
1590  */
1591 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1592 			    u64 bytenr, u64 num_bytes)
1593 {
1594 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1595 	struct btrfs_path *path;
1596 	struct btrfs_file_extent_item *fi;
1597 	struct extent_buffer *leaf;
1598 	int ret;
1599 
1600 	path = btrfs_alloc_path();
1601 	if (!path)
1602 		return -ENOMEM;
1603 
1604 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1605 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1606 				       bytenr, 0);
1607 	if (ret < 0)
1608 		goto out;
1609 	if (ret > 0) {
1610 		ret = -ENOENT;
1611 		goto out;
1612 	}
1613 
1614 	leaf = path->nodes[0];
1615 	fi = btrfs_item_ptr(leaf, path->slots[0],
1616 			    struct btrfs_file_extent_item);
1617 
1618 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1619 	       btrfs_file_extent_compression(leaf, fi) ||
1620 	       btrfs_file_extent_encryption(leaf, fi) ||
1621 	       btrfs_file_extent_other_encoding(leaf, fi));
1622 
1623 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1624 		ret = -EINVAL;
1625 		goto out;
1626 	}
1627 
1628 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1629 	ret = 0;
1630 out:
1631 	btrfs_free_path(path);
1632 	return ret;
1633 }
1634 
1635 /*
1636  * update file extent items in the tree leaf to point to
1637  * the new locations.
1638  */
1639 static noinline_for_stack
1640 int replace_file_extents(struct btrfs_trans_handle *trans,
1641 			 struct reloc_control *rc,
1642 			 struct btrfs_root *root,
1643 			 struct extent_buffer *leaf)
1644 {
1645 	struct btrfs_key key;
1646 	struct btrfs_file_extent_item *fi;
1647 	struct inode *inode = NULL;
1648 	u64 parent;
1649 	u64 bytenr;
1650 	u64 new_bytenr = 0;
1651 	u64 num_bytes;
1652 	u64 end;
1653 	u32 nritems;
1654 	u32 i;
1655 	int ret = 0;
1656 	int first = 1;
1657 	int dirty = 0;
1658 
1659 	if (rc->stage != UPDATE_DATA_PTRS)
1660 		return 0;
1661 
1662 	/* reloc trees always use full backref */
1663 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1664 		parent = leaf->start;
1665 	else
1666 		parent = 0;
1667 
1668 	nritems = btrfs_header_nritems(leaf);
1669 	for (i = 0; i < nritems; i++) {
1670 		cond_resched();
1671 		btrfs_item_key_to_cpu(leaf, &key, i);
1672 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1673 			continue;
1674 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1675 		if (btrfs_file_extent_type(leaf, fi) ==
1676 		    BTRFS_FILE_EXTENT_INLINE)
1677 			continue;
1678 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1679 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1680 		if (bytenr == 0)
1681 			continue;
1682 		if (!in_block_group(bytenr, rc->block_group))
1683 			continue;
1684 
1685 		/*
1686 		 * if we are modifying block in fs tree, wait for readpage
1687 		 * to complete and drop the extent cache
1688 		 */
1689 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1690 			if (first) {
1691 				inode = find_next_inode(root, key.objectid);
1692 				first = 0;
1693 			} else if (inode && btrfs_ino(inode) < key.objectid) {
1694 				btrfs_add_delayed_iput(inode);
1695 				inode = find_next_inode(root, key.objectid);
1696 			}
1697 			if (inode && btrfs_ino(inode) == key.objectid) {
1698 				end = key.offset +
1699 				      btrfs_file_extent_num_bytes(leaf, fi);
1700 				WARN_ON(!IS_ALIGNED(key.offset,
1701 						    root->sectorsize));
1702 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1703 				end--;
1704 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1705 						      key.offset, end);
1706 				if (!ret)
1707 					continue;
1708 
1709 				btrfs_drop_extent_cache(inode, key.offset, end,
1710 							1);
1711 				unlock_extent(&BTRFS_I(inode)->io_tree,
1712 					      key.offset, end);
1713 			}
1714 		}
1715 
1716 		ret = get_new_location(rc->data_inode, &new_bytenr,
1717 				       bytenr, num_bytes);
1718 		if (ret) {
1719 			/*
1720 			 * Don't have to abort since we've not changed anything
1721 			 * in the file extent yet.
1722 			 */
1723 			break;
1724 		}
1725 
1726 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1727 		dirty = 1;
1728 
1729 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1730 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1731 					   num_bytes, parent,
1732 					   btrfs_header_owner(leaf),
1733 					   key.objectid, key.offset);
1734 		if (ret) {
1735 			btrfs_abort_transaction(trans, ret);
1736 			break;
1737 		}
1738 
1739 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1740 					parent, btrfs_header_owner(leaf),
1741 					key.objectid, key.offset);
1742 		if (ret) {
1743 			btrfs_abort_transaction(trans, ret);
1744 			break;
1745 		}
1746 	}
1747 	if (dirty)
1748 		btrfs_mark_buffer_dirty(leaf);
1749 	if (inode)
1750 		btrfs_add_delayed_iput(inode);
1751 	return ret;
1752 }
1753 
1754 static noinline_for_stack
1755 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1756 		     struct btrfs_path *path, int level)
1757 {
1758 	struct btrfs_disk_key key1;
1759 	struct btrfs_disk_key key2;
1760 	btrfs_node_key(eb, &key1, slot);
1761 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1762 	return memcmp(&key1, &key2, sizeof(key1));
1763 }
1764 
1765 /*
1766  * try to replace tree blocks in fs tree with the new blocks
1767  * in reloc tree. tree blocks haven't been modified since the
1768  * reloc tree was create can be replaced.
1769  *
1770  * if a block was replaced, level of the block + 1 is returned.
1771  * if no block got replaced, 0 is returned. if there are other
1772  * errors, a negative error number is returned.
1773  */
1774 static noinline_for_stack
1775 int replace_path(struct btrfs_trans_handle *trans,
1776 		 struct btrfs_root *dest, struct btrfs_root *src,
1777 		 struct btrfs_path *path, struct btrfs_key *next_key,
1778 		 int lowest_level, int max_level)
1779 {
1780 	struct extent_buffer *eb;
1781 	struct extent_buffer *parent;
1782 	struct btrfs_key key;
1783 	u64 old_bytenr;
1784 	u64 new_bytenr;
1785 	u64 old_ptr_gen;
1786 	u64 new_ptr_gen;
1787 	u64 last_snapshot;
1788 	u32 blocksize;
1789 	int cow = 0;
1790 	int level;
1791 	int ret;
1792 	int slot;
1793 
1794 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1795 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1796 
1797 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1798 again:
1799 	slot = path->slots[lowest_level];
1800 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1801 
1802 	eb = btrfs_lock_root_node(dest);
1803 	btrfs_set_lock_blocking(eb);
1804 	level = btrfs_header_level(eb);
1805 
1806 	if (level < lowest_level) {
1807 		btrfs_tree_unlock(eb);
1808 		free_extent_buffer(eb);
1809 		return 0;
1810 	}
1811 
1812 	if (cow) {
1813 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1814 		BUG_ON(ret);
1815 	}
1816 	btrfs_set_lock_blocking(eb);
1817 
1818 	if (next_key) {
1819 		next_key->objectid = (u64)-1;
1820 		next_key->type = (u8)-1;
1821 		next_key->offset = (u64)-1;
1822 	}
1823 
1824 	parent = eb;
1825 	while (1) {
1826 		level = btrfs_header_level(parent);
1827 		BUG_ON(level < lowest_level);
1828 
1829 		ret = btrfs_bin_search(parent, &key, level, &slot);
1830 		if (ret && slot > 0)
1831 			slot--;
1832 
1833 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1834 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1835 
1836 		old_bytenr = btrfs_node_blockptr(parent, slot);
1837 		blocksize = dest->nodesize;
1838 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1839 
1840 		if (level <= max_level) {
1841 			eb = path->nodes[level];
1842 			new_bytenr = btrfs_node_blockptr(eb,
1843 							path->slots[level]);
1844 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1845 							path->slots[level]);
1846 		} else {
1847 			new_bytenr = 0;
1848 			new_ptr_gen = 0;
1849 		}
1850 
1851 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1852 			ret = level;
1853 			break;
1854 		}
1855 
1856 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1857 		    memcmp_node_keys(parent, slot, path, level)) {
1858 			if (level <= lowest_level) {
1859 				ret = 0;
1860 				break;
1861 			}
1862 
1863 			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1864 			if (IS_ERR(eb)) {
1865 				ret = PTR_ERR(eb);
1866 				break;
1867 			} else if (!extent_buffer_uptodate(eb)) {
1868 				ret = -EIO;
1869 				free_extent_buffer(eb);
1870 				break;
1871 			}
1872 			btrfs_tree_lock(eb);
1873 			if (cow) {
1874 				ret = btrfs_cow_block(trans, dest, eb, parent,
1875 						      slot, &eb);
1876 				BUG_ON(ret);
1877 			}
1878 			btrfs_set_lock_blocking(eb);
1879 
1880 			btrfs_tree_unlock(parent);
1881 			free_extent_buffer(parent);
1882 
1883 			parent = eb;
1884 			continue;
1885 		}
1886 
1887 		if (!cow) {
1888 			btrfs_tree_unlock(parent);
1889 			free_extent_buffer(parent);
1890 			cow = 1;
1891 			goto again;
1892 		}
1893 
1894 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1895 				      path->slots[level]);
1896 		btrfs_release_path(path);
1897 
1898 		path->lowest_level = level;
1899 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1900 		path->lowest_level = 0;
1901 		BUG_ON(ret);
1902 
1903 		/*
1904 		 * swap blocks in fs tree and reloc tree.
1905 		 */
1906 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1907 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1908 		btrfs_mark_buffer_dirty(parent);
1909 
1910 		btrfs_set_node_blockptr(path->nodes[level],
1911 					path->slots[level], old_bytenr);
1912 		btrfs_set_node_ptr_generation(path->nodes[level],
1913 					      path->slots[level], old_ptr_gen);
1914 		btrfs_mark_buffer_dirty(path->nodes[level]);
1915 
1916 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1917 					path->nodes[level]->start,
1918 					src->root_key.objectid, level - 1, 0);
1919 		BUG_ON(ret);
1920 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1921 					0, dest->root_key.objectid, level - 1,
1922 					0);
1923 		BUG_ON(ret);
1924 
1925 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1926 					path->nodes[level]->start,
1927 					src->root_key.objectid, level - 1, 0);
1928 		BUG_ON(ret);
1929 
1930 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1931 					0, dest->root_key.objectid, level - 1,
1932 					0);
1933 		BUG_ON(ret);
1934 
1935 		btrfs_unlock_up_safe(path, 0);
1936 
1937 		ret = level;
1938 		break;
1939 	}
1940 	btrfs_tree_unlock(parent);
1941 	free_extent_buffer(parent);
1942 	return ret;
1943 }
1944 
1945 /*
1946  * helper to find next relocated block in reloc tree
1947  */
1948 static noinline_for_stack
1949 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1950 		       int *level)
1951 {
1952 	struct extent_buffer *eb;
1953 	int i;
1954 	u64 last_snapshot;
1955 	u32 nritems;
1956 
1957 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1958 
1959 	for (i = 0; i < *level; i++) {
1960 		free_extent_buffer(path->nodes[i]);
1961 		path->nodes[i] = NULL;
1962 	}
1963 
1964 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1965 		eb = path->nodes[i];
1966 		nritems = btrfs_header_nritems(eb);
1967 		while (path->slots[i] + 1 < nritems) {
1968 			path->slots[i]++;
1969 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1970 			    last_snapshot)
1971 				continue;
1972 
1973 			*level = i;
1974 			return 0;
1975 		}
1976 		free_extent_buffer(path->nodes[i]);
1977 		path->nodes[i] = NULL;
1978 	}
1979 	return 1;
1980 }
1981 
1982 /*
1983  * walk down reloc tree to find relocated block of lowest level
1984  */
1985 static noinline_for_stack
1986 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1987 			 int *level)
1988 {
1989 	struct extent_buffer *eb = NULL;
1990 	int i;
1991 	u64 bytenr;
1992 	u64 ptr_gen = 0;
1993 	u64 last_snapshot;
1994 	u32 nritems;
1995 
1996 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1997 
1998 	for (i = *level; i > 0; i--) {
1999 		eb = path->nodes[i];
2000 		nritems = btrfs_header_nritems(eb);
2001 		while (path->slots[i] < nritems) {
2002 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2003 			if (ptr_gen > last_snapshot)
2004 				break;
2005 			path->slots[i]++;
2006 		}
2007 		if (path->slots[i] >= nritems) {
2008 			if (i == *level)
2009 				break;
2010 			*level = i + 1;
2011 			return 0;
2012 		}
2013 		if (i == 1) {
2014 			*level = i;
2015 			return 0;
2016 		}
2017 
2018 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2019 		eb = read_tree_block(root, bytenr, ptr_gen);
2020 		if (IS_ERR(eb)) {
2021 			return PTR_ERR(eb);
2022 		} else if (!extent_buffer_uptodate(eb)) {
2023 			free_extent_buffer(eb);
2024 			return -EIO;
2025 		}
2026 		BUG_ON(btrfs_header_level(eb) != i - 1);
2027 		path->nodes[i - 1] = eb;
2028 		path->slots[i - 1] = 0;
2029 	}
2030 	return 1;
2031 }
2032 
2033 /*
2034  * invalidate extent cache for file extents whose key in range of
2035  * [min_key, max_key)
2036  */
2037 static int invalidate_extent_cache(struct btrfs_root *root,
2038 				   struct btrfs_key *min_key,
2039 				   struct btrfs_key *max_key)
2040 {
2041 	struct inode *inode = NULL;
2042 	u64 objectid;
2043 	u64 start, end;
2044 	u64 ino;
2045 
2046 	objectid = min_key->objectid;
2047 	while (1) {
2048 		cond_resched();
2049 		iput(inode);
2050 
2051 		if (objectid > max_key->objectid)
2052 			break;
2053 
2054 		inode = find_next_inode(root, objectid);
2055 		if (!inode)
2056 			break;
2057 		ino = btrfs_ino(inode);
2058 
2059 		if (ino > max_key->objectid) {
2060 			iput(inode);
2061 			break;
2062 		}
2063 
2064 		objectid = ino + 1;
2065 		if (!S_ISREG(inode->i_mode))
2066 			continue;
2067 
2068 		if (unlikely(min_key->objectid == ino)) {
2069 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2070 				continue;
2071 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2072 				start = 0;
2073 			else {
2074 				start = min_key->offset;
2075 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
2076 			}
2077 		} else {
2078 			start = 0;
2079 		}
2080 
2081 		if (unlikely(max_key->objectid == ino)) {
2082 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2083 				continue;
2084 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2085 				end = (u64)-1;
2086 			} else {
2087 				if (max_key->offset == 0)
2088 					continue;
2089 				end = max_key->offset;
2090 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2091 				end--;
2092 			}
2093 		} else {
2094 			end = (u64)-1;
2095 		}
2096 
2097 		/* the lock_extent waits for readpage to complete */
2098 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2099 		btrfs_drop_extent_cache(inode, start, end, 1);
2100 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2101 	}
2102 	return 0;
2103 }
2104 
2105 static int find_next_key(struct btrfs_path *path, int level,
2106 			 struct btrfs_key *key)
2107 
2108 {
2109 	while (level < BTRFS_MAX_LEVEL) {
2110 		if (!path->nodes[level])
2111 			break;
2112 		if (path->slots[level] + 1 <
2113 		    btrfs_header_nritems(path->nodes[level])) {
2114 			btrfs_node_key_to_cpu(path->nodes[level], key,
2115 					      path->slots[level] + 1);
2116 			return 0;
2117 		}
2118 		level++;
2119 	}
2120 	return 1;
2121 }
2122 
2123 /*
2124  * merge the relocated tree blocks in reloc tree with corresponding
2125  * fs tree.
2126  */
2127 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2128 					       struct btrfs_root *root)
2129 {
2130 	LIST_HEAD(inode_list);
2131 	struct btrfs_key key;
2132 	struct btrfs_key next_key;
2133 	struct btrfs_trans_handle *trans = NULL;
2134 	struct btrfs_root *reloc_root;
2135 	struct btrfs_root_item *root_item;
2136 	struct btrfs_path *path;
2137 	struct extent_buffer *leaf;
2138 	int level;
2139 	int max_level;
2140 	int replaced = 0;
2141 	int ret;
2142 	int err = 0;
2143 	u32 min_reserved;
2144 
2145 	path = btrfs_alloc_path();
2146 	if (!path)
2147 		return -ENOMEM;
2148 	path->reada = READA_FORWARD;
2149 
2150 	reloc_root = root->reloc_root;
2151 	root_item = &reloc_root->root_item;
2152 
2153 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2154 		level = btrfs_root_level(root_item);
2155 		extent_buffer_get(reloc_root->node);
2156 		path->nodes[level] = reloc_root->node;
2157 		path->slots[level] = 0;
2158 	} else {
2159 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2160 
2161 		level = root_item->drop_level;
2162 		BUG_ON(level == 0);
2163 		path->lowest_level = level;
2164 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2165 		path->lowest_level = 0;
2166 		if (ret < 0) {
2167 			btrfs_free_path(path);
2168 			return ret;
2169 		}
2170 
2171 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2172 				      path->slots[level]);
2173 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2174 
2175 		btrfs_unlock_up_safe(path, 0);
2176 	}
2177 
2178 	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2179 	memset(&next_key, 0, sizeof(next_key));
2180 
2181 	while (1) {
2182 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2183 					     BTRFS_RESERVE_FLUSH_ALL);
2184 		if (ret) {
2185 			err = ret;
2186 			goto out;
2187 		}
2188 		trans = btrfs_start_transaction(root, 0);
2189 		if (IS_ERR(trans)) {
2190 			err = PTR_ERR(trans);
2191 			trans = NULL;
2192 			goto out;
2193 		}
2194 		trans->block_rsv = rc->block_rsv;
2195 
2196 		replaced = 0;
2197 		max_level = level;
2198 
2199 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2200 		if (ret < 0) {
2201 			err = ret;
2202 			goto out;
2203 		}
2204 		if (ret > 0)
2205 			break;
2206 
2207 		if (!find_next_key(path, level, &key) &&
2208 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2209 			ret = 0;
2210 		} else {
2211 			ret = replace_path(trans, root, reloc_root, path,
2212 					   &next_key, level, max_level);
2213 		}
2214 		if (ret < 0) {
2215 			err = ret;
2216 			goto out;
2217 		}
2218 
2219 		if (ret > 0) {
2220 			level = ret;
2221 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2222 					      path->slots[level]);
2223 			replaced = 1;
2224 		}
2225 
2226 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2227 		if (ret > 0)
2228 			break;
2229 
2230 		BUG_ON(level == 0);
2231 		/*
2232 		 * save the merging progress in the drop_progress.
2233 		 * this is OK since root refs == 1 in this case.
2234 		 */
2235 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2236 			       path->slots[level]);
2237 		root_item->drop_level = level;
2238 
2239 		btrfs_end_transaction_throttle(trans, root);
2240 		trans = NULL;
2241 
2242 		btrfs_btree_balance_dirty(root);
2243 
2244 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2245 			invalidate_extent_cache(root, &key, &next_key);
2246 	}
2247 
2248 	/*
2249 	 * handle the case only one block in the fs tree need to be
2250 	 * relocated and the block is tree root.
2251 	 */
2252 	leaf = btrfs_lock_root_node(root);
2253 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2254 	btrfs_tree_unlock(leaf);
2255 	free_extent_buffer(leaf);
2256 	if (ret < 0)
2257 		err = ret;
2258 out:
2259 	btrfs_free_path(path);
2260 
2261 	if (err == 0) {
2262 		memset(&root_item->drop_progress, 0,
2263 		       sizeof(root_item->drop_progress));
2264 		root_item->drop_level = 0;
2265 		btrfs_set_root_refs(root_item, 0);
2266 		btrfs_update_reloc_root(trans, root);
2267 	}
2268 
2269 	if (trans)
2270 		btrfs_end_transaction_throttle(trans, root);
2271 
2272 	btrfs_btree_balance_dirty(root);
2273 
2274 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2275 		invalidate_extent_cache(root, &key, &next_key);
2276 
2277 	return err;
2278 }
2279 
2280 static noinline_for_stack
2281 int prepare_to_merge(struct reloc_control *rc, int err)
2282 {
2283 	struct btrfs_root *root = rc->extent_root;
2284 	struct btrfs_root *reloc_root;
2285 	struct btrfs_trans_handle *trans;
2286 	LIST_HEAD(reloc_roots);
2287 	u64 num_bytes = 0;
2288 	int ret;
2289 
2290 	mutex_lock(&root->fs_info->reloc_mutex);
2291 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2292 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2293 	mutex_unlock(&root->fs_info->reloc_mutex);
2294 
2295 again:
2296 	if (!err) {
2297 		num_bytes = rc->merging_rsv_size;
2298 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2299 					  BTRFS_RESERVE_FLUSH_ALL);
2300 		if (ret)
2301 			err = ret;
2302 	}
2303 
2304 	trans = btrfs_join_transaction(rc->extent_root);
2305 	if (IS_ERR(trans)) {
2306 		if (!err)
2307 			btrfs_block_rsv_release(rc->extent_root,
2308 						rc->block_rsv, num_bytes);
2309 		return PTR_ERR(trans);
2310 	}
2311 
2312 	if (!err) {
2313 		if (num_bytes != rc->merging_rsv_size) {
2314 			btrfs_end_transaction(trans, rc->extent_root);
2315 			btrfs_block_rsv_release(rc->extent_root,
2316 						rc->block_rsv, num_bytes);
2317 			goto again;
2318 		}
2319 	}
2320 
2321 	rc->merge_reloc_tree = 1;
2322 
2323 	while (!list_empty(&rc->reloc_roots)) {
2324 		reloc_root = list_entry(rc->reloc_roots.next,
2325 					struct btrfs_root, root_list);
2326 		list_del_init(&reloc_root->root_list);
2327 
2328 		root = read_fs_root(reloc_root->fs_info,
2329 				    reloc_root->root_key.offset);
2330 		BUG_ON(IS_ERR(root));
2331 		BUG_ON(root->reloc_root != reloc_root);
2332 
2333 		/*
2334 		 * set reference count to 1, so btrfs_recover_relocation
2335 		 * knows it should resumes merging
2336 		 */
2337 		if (!err)
2338 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2339 		btrfs_update_reloc_root(trans, root);
2340 
2341 		list_add(&reloc_root->root_list, &reloc_roots);
2342 	}
2343 
2344 	list_splice(&reloc_roots, &rc->reloc_roots);
2345 
2346 	if (!err)
2347 		btrfs_commit_transaction(trans, rc->extent_root);
2348 	else
2349 		btrfs_end_transaction(trans, rc->extent_root);
2350 	return err;
2351 }
2352 
2353 static noinline_for_stack
2354 void free_reloc_roots(struct list_head *list)
2355 {
2356 	struct btrfs_root *reloc_root;
2357 
2358 	while (!list_empty(list)) {
2359 		reloc_root = list_entry(list->next, struct btrfs_root,
2360 					root_list);
2361 		free_extent_buffer(reloc_root->node);
2362 		free_extent_buffer(reloc_root->commit_root);
2363 		reloc_root->node = NULL;
2364 		reloc_root->commit_root = NULL;
2365 		__del_reloc_root(reloc_root);
2366 	}
2367 }
2368 
2369 static noinline_for_stack
2370 void merge_reloc_roots(struct reloc_control *rc)
2371 {
2372 	struct btrfs_root *root;
2373 	struct btrfs_root *reloc_root;
2374 	u64 last_snap;
2375 	u64 otransid;
2376 	u64 objectid;
2377 	LIST_HEAD(reloc_roots);
2378 	int found = 0;
2379 	int ret = 0;
2380 again:
2381 	root = rc->extent_root;
2382 
2383 	/*
2384 	 * this serializes us with btrfs_record_root_in_transaction,
2385 	 * we have to make sure nobody is in the middle of
2386 	 * adding their roots to the list while we are
2387 	 * doing this splice
2388 	 */
2389 	mutex_lock(&root->fs_info->reloc_mutex);
2390 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2391 	mutex_unlock(&root->fs_info->reloc_mutex);
2392 
2393 	while (!list_empty(&reloc_roots)) {
2394 		found = 1;
2395 		reloc_root = list_entry(reloc_roots.next,
2396 					struct btrfs_root, root_list);
2397 
2398 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2399 			root = read_fs_root(reloc_root->fs_info,
2400 					    reloc_root->root_key.offset);
2401 			BUG_ON(IS_ERR(root));
2402 			BUG_ON(root->reloc_root != reloc_root);
2403 
2404 			ret = merge_reloc_root(rc, root);
2405 			if (ret) {
2406 				if (list_empty(&reloc_root->root_list))
2407 					list_add_tail(&reloc_root->root_list,
2408 						      &reloc_roots);
2409 				goto out;
2410 			}
2411 		} else {
2412 			list_del_init(&reloc_root->root_list);
2413 		}
2414 
2415 		/*
2416 		 * we keep the old last snapshot transid in rtranid when we
2417 		 * created the relocation tree.
2418 		 */
2419 		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2420 		otransid = btrfs_root_otransid(&reloc_root->root_item);
2421 		objectid = reloc_root->root_key.offset;
2422 
2423 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2424 		if (ret < 0) {
2425 			if (list_empty(&reloc_root->root_list))
2426 				list_add_tail(&reloc_root->root_list,
2427 					      &reloc_roots);
2428 			goto out;
2429 		}
2430 	}
2431 
2432 	if (found) {
2433 		found = 0;
2434 		goto again;
2435 	}
2436 out:
2437 	if (ret) {
2438 		btrfs_handle_fs_error(root->fs_info, ret, NULL);
2439 		if (!list_empty(&reloc_roots))
2440 			free_reloc_roots(&reloc_roots);
2441 
2442 		/* new reloc root may be added */
2443 		mutex_lock(&root->fs_info->reloc_mutex);
2444 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2445 		mutex_unlock(&root->fs_info->reloc_mutex);
2446 		if (!list_empty(&reloc_roots))
2447 			free_reloc_roots(&reloc_roots);
2448 	}
2449 
2450 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2451 }
2452 
2453 static void free_block_list(struct rb_root *blocks)
2454 {
2455 	struct tree_block *block;
2456 	struct rb_node *rb_node;
2457 	while ((rb_node = rb_first(blocks))) {
2458 		block = rb_entry(rb_node, struct tree_block, rb_node);
2459 		rb_erase(rb_node, blocks);
2460 		kfree(block);
2461 	}
2462 }
2463 
2464 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2465 				      struct btrfs_root *reloc_root)
2466 {
2467 	struct btrfs_root *root;
2468 
2469 	if (reloc_root->last_trans == trans->transid)
2470 		return 0;
2471 
2472 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2473 	BUG_ON(IS_ERR(root));
2474 	BUG_ON(root->reloc_root != reloc_root);
2475 
2476 	return btrfs_record_root_in_trans(trans, root);
2477 }
2478 
2479 static noinline_for_stack
2480 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2481 				     struct reloc_control *rc,
2482 				     struct backref_node *node,
2483 				     struct backref_edge *edges[])
2484 {
2485 	struct backref_node *next;
2486 	struct btrfs_root *root;
2487 	int index = 0;
2488 
2489 	next = node;
2490 	while (1) {
2491 		cond_resched();
2492 		next = walk_up_backref(next, edges, &index);
2493 		root = next->root;
2494 		BUG_ON(!root);
2495 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2496 
2497 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2498 			record_reloc_root_in_trans(trans, root);
2499 			break;
2500 		}
2501 
2502 		btrfs_record_root_in_trans(trans, root);
2503 		root = root->reloc_root;
2504 
2505 		if (next->new_bytenr != root->node->start) {
2506 			BUG_ON(next->new_bytenr);
2507 			BUG_ON(!list_empty(&next->list));
2508 			next->new_bytenr = root->node->start;
2509 			next->root = root;
2510 			list_add_tail(&next->list,
2511 				      &rc->backref_cache.changed);
2512 			__mark_block_processed(rc, next);
2513 			break;
2514 		}
2515 
2516 		WARN_ON(1);
2517 		root = NULL;
2518 		next = walk_down_backref(edges, &index);
2519 		if (!next || next->level <= node->level)
2520 			break;
2521 	}
2522 	if (!root)
2523 		return NULL;
2524 
2525 	next = node;
2526 	/* setup backref node path for btrfs_reloc_cow_block */
2527 	while (1) {
2528 		rc->backref_cache.path[next->level] = next;
2529 		if (--index < 0)
2530 			break;
2531 		next = edges[index]->node[UPPER];
2532 	}
2533 	return root;
2534 }
2535 
2536 /*
2537  * select a tree root for relocation. return NULL if the block
2538  * is reference counted. we should use do_relocation() in this
2539  * case. return a tree root pointer if the block isn't reference
2540  * counted. return -ENOENT if the block is root of reloc tree.
2541  */
2542 static noinline_for_stack
2543 struct btrfs_root *select_one_root(struct backref_node *node)
2544 {
2545 	struct backref_node *next;
2546 	struct btrfs_root *root;
2547 	struct btrfs_root *fs_root = NULL;
2548 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2549 	int index = 0;
2550 
2551 	next = node;
2552 	while (1) {
2553 		cond_resched();
2554 		next = walk_up_backref(next, edges, &index);
2555 		root = next->root;
2556 		BUG_ON(!root);
2557 
2558 		/* no other choice for non-references counted tree */
2559 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2560 			return root;
2561 
2562 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2563 			fs_root = root;
2564 
2565 		if (next != node)
2566 			return NULL;
2567 
2568 		next = walk_down_backref(edges, &index);
2569 		if (!next || next->level <= node->level)
2570 			break;
2571 	}
2572 
2573 	if (!fs_root)
2574 		return ERR_PTR(-ENOENT);
2575 	return fs_root;
2576 }
2577 
2578 static noinline_for_stack
2579 u64 calcu_metadata_size(struct reloc_control *rc,
2580 			struct backref_node *node, int reserve)
2581 {
2582 	struct backref_node *next = node;
2583 	struct backref_edge *edge;
2584 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2585 	u64 num_bytes = 0;
2586 	int index = 0;
2587 
2588 	BUG_ON(reserve && node->processed);
2589 
2590 	while (next) {
2591 		cond_resched();
2592 		while (1) {
2593 			if (next->processed && (reserve || next != node))
2594 				break;
2595 
2596 			num_bytes += rc->extent_root->nodesize;
2597 
2598 			if (list_empty(&next->upper))
2599 				break;
2600 
2601 			edge = list_entry(next->upper.next,
2602 					  struct backref_edge, list[LOWER]);
2603 			edges[index++] = edge;
2604 			next = edge->node[UPPER];
2605 		}
2606 		next = walk_down_backref(edges, &index);
2607 	}
2608 	return num_bytes;
2609 }
2610 
2611 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2612 				  struct reloc_control *rc,
2613 				  struct backref_node *node)
2614 {
2615 	struct btrfs_root *root = rc->extent_root;
2616 	u64 num_bytes;
2617 	int ret;
2618 	u64 tmp;
2619 
2620 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2621 
2622 	trans->block_rsv = rc->block_rsv;
2623 	rc->reserved_bytes += num_bytes;
2624 
2625 	/*
2626 	 * We are under a transaction here so we can only do limited flushing.
2627 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2628 	 * transaction and try to refill when we can flush all the things.
2629 	 */
2630 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2631 				BTRFS_RESERVE_FLUSH_LIMIT);
2632 	if (ret) {
2633 		tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
2634 		while (tmp <= rc->reserved_bytes)
2635 			tmp <<= 1;
2636 		/*
2637 		 * only one thread can access block_rsv at this point,
2638 		 * so we don't need hold lock to protect block_rsv.
2639 		 * we expand more reservation size here to allow enough
2640 		 * space for relocation and we will return eailer in
2641 		 * enospc case.
2642 		 */
2643 		rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2644 			RELOCATION_RESERVED_NODES;
2645 		return -EAGAIN;
2646 	}
2647 
2648 	return 0;
2649 }
2650 
2651 /*
2652  * relocate a block tree, and then update pointers in upper level
2653  * blocks that reference the block to point to the new location.
2654  *
2655  * if called by link_to_upper, the block has already been relocated.
2656  * in that case this function just updates pointers.
2657  */
2658 static int do_relocation(struct btrfs_trans_handle *trans,
2659 			 struct reloc_control *rc,
2660 			 struct backref_node *node,
2661 			 struct btrfs_key *key,
2662 			 struct btrfs_path *path, int lowest)
2663 {
2664 	struct backref_node *upper;
2665 	struct backref_edge *edge;
2666 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2667 	struct btrfs_root *root;
2668 	struct extent_buffer *eb;
2669 	u32 blocksize;
2670 	u64 bytenr;
2671 	u64 generation;
2672 	int slot;
2673 	int ret;
2674 	int err = 0;
2675 
2676 	BUG_ON(lowest && node->eb);
2677 
2678 	path->lowest_level = node->level + 1;
2679 	rc->backref_cache.path[node->level] = node;
2680 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2681 		cond_resched();
2682 
2683 		upper = edge->node[UPPER];
2684 		root = select_reloc_root(trans, rc, upper, edges);
2685 		BUG_ON(!root);
2686 
2687 		if (upper->eb && !upper->locked) {
2688 			if (!lowest) {
2689 				ret = btrfs_bin_search(upper->eb, key,
2690 						       upper->level, &slot);
2691 				BUG_ON(ret);
2692 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2693 				if (node->eb->start == bytenr)
2694 					goto next;
2695 			}
2696 			drop_node_buffer(upper);
2697 		}
2698 
2699 		if (!upper->eb) {
2700 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2701 			if (ret) {
2702 				if (ret < 0)
2703 					err = ret;
2704 				else
2705 					err = -ENOENT;
2706 
2707 				btrfs_release_path(path);
2708 				break;
2709 			}
2710 
2711 			if (!upper->eb) {
2712 				upper->eb = path->nodes[upper->level];
2713 				path->nodes[upper->level] = NULL;
2714 			} else {
2715 				BUG_ON(upper->eb != path->nodes[upper->level]);
2716 			}
2717 
2718 			upper->locked = 1;
2719 			path->locks[upper->level] = 0;
2720 
2721 			slot = path->slots[upper->level];
2722 			btrfs_release_path(path);
2723 		} else {
2724 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2725 					       &slot);
2726 			BUG_ON(ret);
2727 		}
2728 
2729 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2730 		if (lowest) {
2731 			if (bytenr != node->bytenr) {
2732 				btrfs_err(root->fs_info,
2733 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2734 					  bytenr, node->bytenr, slot,
2735 					  upper->eb->start);
2736 				err = -EIO;
2737 				goto next;
2738 			}
2739 		} else {
2740 			if (node->eb->start == bytenr)
2741 				goto next;
2742 		}
2743 
2744 		blocksize = root->nodesize;
2745 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2746 		eb = read_tree_block(root, bytenr, generation);
2747 		if (IS_ERR(eb)) {
2748 			err = PTR_ERR(eb);
2749 			goto next;
2750 		} else if (!extent_buffer_uptodate(eb)) {
2751 			free_extent_buffer(eb);
2752 			err = -EIO;
2753 			goto next;
2754 		}
2755 		btrfs_tree_lock(eb);
2756 		btrfs_set_lock_blocking(eb);
2757 
2758 		if (!node->eb) {
2759 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2760 					      slot, &eb);
2761 			btrfs_tree_unlock(eb);
2762 			free_extent_buffer(eb);
2763 			if (ret < 0) {
2764 				err = ret;
2765 				goto next;
2766 			}
2767 			BUG_ON(node->eb != eb);
2768 		} else {
2769 			btrfs_set_node_blockptr(upper->eb, slot,
2770 						node->eb->start);
2771 			btrfs_set_node_ptr_generation(upper->eb, slot,
2772 						      trans->transid);
2773 			btrfs_mark_buffer_dirty(upper->eb);
2774 
2775 			ret = btrfs_inc_extent_ref(trans, root,
2776 						node->eb->start, blocksize,
2777 						upper->eb->start,
2778 						btrfs_header_owner(upper->eb),
2779 						node->level, 0);
2780 			BUG_ON(ret);
2781 
2782 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2783 			BUG_ON(ret);
2784 		}
2785 next:
2786 		if (!upper->pending)
2787 			drop_node_buffer(upper);
2788 		else
2789 			unlock_node_buffer(upper);
2790 		if (err)
2791 			break;
2792 	}
2793 
2794 	if (!err && node->pending) {
2795 		drop_node_buffer(node);
2796 		list_move_tail(&node->list, &rc->backref_cache.changed);
2797 		node->pending = 0;
2798 	}
2799 
2800 	path->lowest_level = 0;
2801 	BUG_ON(err == -ENOSPC);
2802 	return err;
2803 }
2804 
2805 static int link_to_upper(struct btrfs_trans_handle *trans,
2806 			 struct reloc_control *rc,
2807 			 struct backref_node *node,
2808 			 struct btrfs_path *path)
2809 {
2810 	struct btrfs_key key;
2811 
2812 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2813 	return do_relocation(trans, rc, node, &key, path, 0);
2814 }
2815 
2816 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2817 				struct reloc_control *rc,
2818 				struct btrfs_path *path, int err)
2819 {
2820 	LIST_HEAD(list);
2821 	struct backref_cache *cache = &rc->backref_cache;
2822 	struct backref_node *node;
2823 	int level;
2824 	int ret;
2825 
2826 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2827 		while (!list_empty(&cache->pending[level])) {
2828 			node = list_entry(cache->pending[level].next,
2829 					  struct backref_node, list);
2830 			list_move_tail(&node->list, &list);
2831 			BUG_ON(!node->pending);
2832 
2833 			if (!err) {
2834 				ret = link_to_upper(trans, rc, node, path);
2835 				if (ret < 0)
2836 					err = ret;
2837 			}
2838 		}
2839 		list_splice_init(&list, &cache->pending[level]);
2840 	}
2841 	return err;
2842 }
2843 
2844 static void mark_block_processed(struct reloc_control *rc,
2845 				 u64 bytenr, u32 blocksize)
2846 {
2847 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2848 			EXTENT_DIRTY);
2849 }
2850 
2851 static void __mark_block_processed(struct reloc_control *rc,
2852 				   struct backref_node *node)
2853 {
2854 	u32 blocksize;
2855 	if (node->level == 0 ||
2856 	    in_block_group(node->bytenr, rc->block_group)) {
2857 		blocksize = rc->extent_root->nodesize;
2858 		mark_block_processed(rc, node->bytenr, blocksize);
2859 	}
2860 	node->processed = 1;
2861 }
2862 
2863 /*
2864  * mark a block and all blocks directly/indirectly reference the block
2865  * as processed.
2866  */
2867 static void update_processed_blocks(struct reloc_control *rc,
2868 				    struct backref_node *node)
2869 {
2870 	struct backref_node *next = node;
2871 	struct backref_edge *edge;
2872 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2873 	int index = 0;
2874 
2875 	while (next) {
2876 		cond_resched();
2877 		while (1) {
2878 			if (next->processed)
2879 				break;
2880 
2881 			__mark_block_processed(rc, next);
2882 
2883 			if (list_empty(&next->upper))
2884 				break;
2885 
2886 			edge = list_entry(next->upper.next,
2887 					  struct backref_edge, list[LOWER]);
2888 			edges[index++] = edge;
2889 			next = edge->node[UPPER];
2890 		}
2891 		next = walk_down_backref(edges, &index);
2892 	}
2893 }
2894 
2895 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2896 {
2897 	u32 blocksize = rc->extent_root->nodesize;
2898 
2899 	if (test_range_bit(&rc->processed_blocks, bytenr,
2900 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2901 		return 1;
2902 	return 0;
2903 }
2904 
2905 static int get_tree_block_key(struct reloc_control *rc,
2906 			      struct tree_block *block)
2907 {
2908 	struct extent_buffer *eb;
2909 
2910 	BUG_ON(block->key_ready);
2911 	eb = read_tree_block(rc->extent_root, block->bytenr,
2912 			     block->key.offset);
2913 	if (IS_ERR(eb)) {
2914 		return PTR_ERR(eb);
2915 	} else if (!extent_buffer_uptodate(eb)) {
2916 		free_extent_buffer(eb);
2917 		return -EIO;
2918 	}
2919 	WARN_ON(btrfs_header_level(eb) != block->level);
2920 	if (block->level == 0)
2921 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2922 	else
2923 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2924 	free_extent_buffer(eb);
2925 	block->key_ready = 1;
2926 	return 0;
2927 }
2928 
2929 /*
2930  * helper function to relocate a tree block
2931  */
2932 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2933 				struct reloc_control *rc,
2934 				struct backref_node *node,
2935 				struct btrfs_key *key,
2936 				struct btrfs_path *path)
2937 {
2938 	struct btrfs_root *root;
2939 	int ret = 0;
2940 
2941 	if (!node)
2942 		return 0;
2943 
2944 	BUG_ON(node->processed);
2945 	root = select_one_root(node);
2946 	if (root == ERR_PTR(-ENOENT)) {
2947 		update_processed_blocks(rc, node);
2948 		goto out;
2949 	}
2950 
2951 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2952 		ret = reserve_metadata_space(trans, rc, node);
2953 		if (ret)
2954 			goto out;
2955 	}
2956 
2957 	if (root) {
2958 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2959 			BUG_ON(node->new_bytenr);
2960 			BUG_ON(!list_empty(&node->list));
2961 			btrfs_record_root_in_trans(trans, root);
2962 			root = root->reloc_root;
2963 			node->new_bytenr = root->node->start;
2964 			node->root = root;
2965 			list_add_tail(&node->list, &rc->backref_cache.changed);
2966 		} else {
2967 			path->lowest_level = node->level;
2968 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2969 			btrfs_release_path(path);
2970 			if (ret > 0)
2971 				ret = 0;
2972 		}
2973 		if (!ret)
2974 			update_processed_blocks(rc, node);
2975 	} else {
2976 		ret = do_relocation(trans, rc, node, key, path, 1);
2977 	}
2978 out:
2979 	if (ret || node->level == 0 || node->cowonly)
2980 		remove_backref_node(&rc->backref_cache, node);
2981 	return ret;
2982 }
2983 
2984 /*
2985  * relocate a list of blocks
2986  */
2987 static noinline_for_stack
2988 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2989 			 struct reloc_control *rc, struct rb_root *blocks)
2990 {
2991 	struct backref_node *node;
2992 	struct btrfs_path *path;
2993 	struct tree_block *block;
2994 	struct rb_node *rb_node;
2995 	int ret;
2996 	int err = 0;
2997 
2998 	path = btrfs_alloc_path();
2999 	if (!path) {
3000 		err = -ENOMEM;
3001 		goto out_free_blocks;
3002 	}
3003 
3004 	rb_node = rb_first(blocks);
3005 	while (rb_node) {
3006 		block = rb_entry(rb_node, struct tree_block, rb_node);
3007 		if (!block->key_ready)
3008 			readahead_tree_block(rc->extent_root, block->bytenr);
3009 		rb_node = rb_next(rb_node);
3010 	}
3011 
3012 	rb_node = rb_first(blocks);
3013 	while (rb_node) {
3014 		block = rb_entry(rb_node, struct tree_block, rb_node);
3015 		if (!block->key_ready) {
3016 			err = get_tree_block_key(rc, block);
3017 			if (err)
3018 				goto out_free_path;
3019 		}
3020 		rb_node = rb_next(rb_node);
3021 	}
3022 
3023 	rb_node = rb_first(blocks);
3024 	while (rb_node) {
3025 		block = rb_entry(rb_node, struct tree_block, rb_node);
3026 
3027 		node = build_backref_tree(rc, &block->key,
3028 					  block->level, block->bytenr);
3029 		if (IS_ERR(node)) {
3030 			err = PTR_ERR(node);
3031 			goto out;
3032 		}
3033 
3034 		ret = relocate_tree_block(trans, rc, node, &block->key,
3035 					  path);
3036 		if (ret < 0) {
3037 			if (ret != -EAGAIN || rb_node == rb_first(blocks))
3038 				err = ret;
3039 			goto out;
3040 		}
3041 		rb_node = rb_next(rb_node);
3042 	}
3043 out:
3044 	err = finish_pending_nodes(trans, rc, path, err);
3045 
3046 out_free_path:
3047 	btrfs_free_path(path);
3048 out_free_blocks:
3049 	free_block_list(blocks);
3050 	return err;
3051 }
3052 
3053 static noinline_for_stack
3054 int prealloc_file_extent_cluster(struct inode *inode,
3055 				 struct file_extent_cluster *cluster)
3056 {
3057 	u64 alloc_hint = 0;
3058 	u64 start;
3059 	u64 end;
3060 	u64 offset = BTRFS_I(inode)->index_cnt;
3061 	u64 num_bytes;
3062 	int nr = 0;
3063 	int ret = 0;
3064 	u64 prealloc_start = cluster->start - offset;
3065 	u64 prealloc_end = cluster->end - offset;
3066 	u64 cur_offset;
3067 
3068 	BUG_ON(cluster->start != cluster->boundary[0]);
3069 	inode_lock(inode);
3070 
3071 	ret = btrfs_check_data_free_space(inode, prealloc_start,
3072 					  prealloc_end + 1 - prealloc_start);
3073 	if (ret)
3074 		goto out;
3075 
3076 	cur_offset = prealloc_start;
3077 	while (nr < cluster->nr) {
3078 		start = cluster->boundary[nr] - offset;
3079 		if (nr + 1 < cluster->nr)
3080 			end = cluster->boundary[nr + 1] - 1 - offset;
3081 		else
3082 			end = cluster->end - offset;
3083 
3084 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3085 		num_bytes = end + 1 - start;
3086 		if (cur_offset < start)
3087 			btrfs_free_reserved_data_space(inode, cur_offset,
3088 					start - cur_offset);
3089 		ret = btrfs_prealloc_file_range(inode, 0, start,
3090 						num_bytes, num_bytes,
3091 						end + 1, &alloc_hint);
3092 		cur_offset = end + 1;
3093 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3094 		if (ret)
3095 			break;
3096 		nr++;
3097 	}
3098 	if (cur_offset < prealloc_end)
3099 		btrfs_free_reserved_data_space(inode, cur_offset,
3100 				       prealloc_end + 1 - cur_offset);
3101 out:
3102 	inode_unlock(inode);
3103 	return ret;
3104 }
3105 
3106 static noinline_for_stack
3107 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3108 			 u64 block_start)
3109 {
3110 	struct btrfs_root *root = BTRFS_I(inode)->root;
3111 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3112 	struct extent_map *em;
3113 	int ret = 0;
3114 
3115 	em = alloc_extent_map();
3116 	if (!em)
3117 		return -ENOMEM;
3118 
3119 	em->start = start;
3120 	em->len = end + 1 - start;
3121 	em->block_len = em->len;
3122 	em->block_start = block_start;
3123 	em->bdev = root->fs_info->fs_devices->latest_bdev;
3124 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3125 
3126 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3127 	while (1) {
3128 		write_lock(&em_tree->lock);
3129 		ret = add_extent_mapping(em_tree, em, 0);
3130 		write_unlock(&em_tree->lock);
3131 		if (ret != -EEXIST) {
3132 			free_extent_map(em);
3133 			break;
3134 		}
3135 		btrfs_drop_extent_cache(inode, start, end, 0);
3136 	}
3137 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3138 	return ret;
3139 }
3140 
3141 static int relocate_file_extent_cluster(struct inode *inode,
3142 					struct file_extent_cluster *cluster)
3143 {
3144 	u64 page_start;
3145 	u64 page_end;
3146 	u64 offset = BTRFS_I(inode)->index_cnt;
3147 	unsigned long index;
3148 	unsigned long last_index;
3149 	struct page *page;
3150 	struct file_ra_state *ra;
3151 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3152 	int nr = 0;
3153 	int ret = 0;
3154 
3155 	if (!cluster->nr)
3156 		return 0;
3157 
3158 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3159 	if (!ra)
3160 		return -ENOMEM;
3161 
3162 	ret = prealloc_file_extent_cluster(inode, cluster);
3163 	if (ret)
3164 		goto out;
3165 
3166 	file_ra_state_init(ra, inode->i_mapping);
3167 
3168 	ret = setup_extent_mapping(inode, cluster->start - offset,
3169 				   cluster->end - offset, cluster->start);
3170 	if (ret)
3171 		goto out;
3172 
3173 	index = (cluster->start - offset) >> PAGE_SHIFT;
3174 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3175 	while (index <= last_index) {
3176 		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
3177 		if (ret)
3178 			goto out;
3179 
3180 		page = find_lock_page(inode->i_mapping, index);
3181 		if (!page) {
3182 			page_cache_sync_readahead(inode->i_mapping,
3183 						  ra, NULL, index,
3184 						  last_index + 1 - index);
3185 			page = find_or_create_page(inode->i_mapping, index,
3186 						   mask);
3187 			if (!page) {
3188 				btrfs_delalloc_release_metadata(inode,
3189 							PAGE_SIZE);
3190 				ret = -ENOMEM;
3191 				goto out;
3192 			}
3193 		}
3194 
3195 		if (PageReadahead(page)) {
3196 			page_cache_async_readahead(inode->i_mapping,
3197 						   ra, NULL, page, index,
3198 						   last_index + 1 - index);
3199 		}
3200 
3201 		if (!PageUptodate(page)) {
3202 			btrfs_readpage(NULL, page);
3203 			lock_page(page);
3204 			if (!PageUptodate(page)) {
3205 				unlock_page(page);
3206 				put_page(page);
3207 				btrfs_delalloc_release_metadata(inode,
3208 							PAGE_SIZE);
3209 				ret = -EIO;
3210 				goto out;
3211 			}
3212 		}
3213 
3214 		page_start = page_offset(page);
3215 		page_end = page_start + PAGE_SIZE - 1;
3216 
3217 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3218 
3219 		set_page_extent_mapped(page);
3220 
3221 		if (nr < cluster->nr &&
3222 		    page_start + offset == cluster->boundary[nr]) {
3223 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3224 					page_start, page_end,
3225 					EXTENT_BOUNDARY);
3226 			nr++;
3227 		}
3228 
3229 		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
3230 		set_page_dirty(page);
3231 
3232 		unlock_extent(&BTRFS_I(inode)->io_tree,
3233 			      page_start, page_end);
3234 		unlock_page(page);
3235 		put_page(page);
3236 
3237 		index++;
3238 		balance_dirty_pages_ratelimited(inode->i_mapping);
3239 		btrfs_throttle(BTRFS_I(inode)->root);
3240 	}
3241 	WARN_ON(nr != cluster->nr);
3242 out:
3243 	kfree(ra);
3244 	return ret;
3245 }
3246 
3247 static noinline_for_stack
3248 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3249 			 struct file_extent_cluster *cluster)
3250 {
3251 	int ret;
3252 
3253 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3254 		ret = relocate_file_extent_cluster(inode, cluster);
3255 		if (ret)
3256 			return ret;
3257 		cluster->nr = 0;
3258 	}
3259 
3260 	if (!cluster->nr)
3261 		cluster->start = extent_key->objectid;
3262 	else
3263 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3264 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3265 	cluster->boundary[cluster->nr] = extent_key->objectid;
3266 	cluster->nr++;
3267 
3268 	if (cluster->nr >= MAX_EXTENTS) {
3269 		ret = relocate_file_extent_cluster(inode, cluster);
3270 		if (ret)
3271 			return ret;
3272 		cluster->nr = 0;
3273 	}
3274 	return 0;
3275 }
3276 
3277 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3278 static int get_ref_objectid_v0(struct reloc_control *rc,
3279 			       struct btrfs_path *path,
3280 			       struct btrfs_key *extent_key,
3281 			       u64 *ref_objectid, int *path_change)
3282 {
3283 	struct btrfs_key key;
3284 	struct extent_buffer *leaf;
3285 	struct btrfs_extent_ref_v0 *ref0;
3286 	int ret;
3287 	int slot;
3288 
3289 	leaf = path->nodes[0];
3290 	slot = path->slots[0];
3291 	while (1) {
3292 		if (slot >= btrfs_header_nritems(leaf)) {
3293 			ret = btrfs_next_leaf(rc->extent_root, path);
3294 			if (ret < 0)
3295 				return ret;
3296 			BUG_ON(ret > 0);
3297 			leaf = path->nodes[0];
3298 			slot = path->slots[0];
3299 			if (path_change)
3300 				*path_change = 1;
3301 		}
3302 		btrfs_item_key_to_cpu(leaf, &key, slot);
3303 		if (key.objectid != extent_key->objectid)
3304 			return -ENOENT;
3305 
3306 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3307 			slot++;
3308 			continue;
3309 		}
3310 		ref0 = btrfs_item_ptr(leaf, slot,
3311 				struct btrfs_extent_ref_v0);
3312 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3313 		break;
3314 	}
3315 	return 0;
3316 }
3317 #endif
3318 
3319 /*
3320  * helper to add a tree block to the list.
3321  * the major work is getting the generation and level of the block
3322  */
3323 static int add_tree_block(struct reloc_control *rc,
3324 			  struct btrfs_key *extent_key,
3325 			  struct btrfs_path *path,
3326 			  struct rb_root *blocks)
3327 {
3328 	struct extent_buffer *eb;
3329 	struct btrfs_extent_item *ei;
3330 	struct btrfs_tree_block_info *bi;
3331 	struct tree_block *block;
3332 	struct rb_node *rb_node;
3333 	u32 item_size;
3334 	int level = -1;
3335 	u64 generation;
3336 
3337 	eb =  path->nodes[0];
3338 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3339 
3340 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3341 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3342 		ei = btrfs_item_ptr(eb, path->slots[0],
3343 				struct btrfs_extent_item);
3344 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3345 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3346 			level = btrfs_tree_block_level(eb, bi);
3347 		} else {
3348 			level = (int)extent_key->offset;
3349 		}
3350 		generation = btrfs_extent_generation(eb, ei);
3351 	} else {
3352 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3353 		u64 ref_owner;
3354 		int ret;
3355 
3356 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3357 		ret = get_ref_objectid_v0(rc, path, extent_key,
3358 					  &ref_owner, NULL);
3359 		if (ret < 0)
3360 			return ret;
3361 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3362 		level = (int)ref_owner;
3363 		/* FIXME: get real generation */
3364 		generation = 0;
3365 #else
3366 		BUG();
3367 #endif
3368 	}
3369 
3370 	btrfs_release_path(path);
3371 
3372 	BUG_ON(level == -1);
3373 
3374 	block = kmalloc(sizeof(*block), GFP_NOFS);
3375 	if (!block)
3376 		return -ENOMEM;
3377 
3378 	block->bytenr = extent_key->objectid;
3379 	block->key.objectid = rc->extent_root->nodesize;
3380 	block->key.offset = generation;
3381 	block->level = level;
3382 	block->key_ready = 0;
3383 
3384 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3385 	if (rb_node)
3386 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3387 
3388 	return 0;
3389 }
3390 
3391 /*
3392  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3393  */
3394 static int __add_tree_block(struct reloc_control *rc,
3395 			    u64 bytenr, u32 blocksize,
3396 			    struct rb_root *blocks)
3397 {
3398 	struct btrfs_path *path;
3399 	struct btrfs_key key;
3400 	int ret;
3401 	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3402 					SKINNY_METADATA);
3403 
3404 	if (tree_block_processed(bytenr, rc))
3405 		return 0;
3406 
3407 	if (tree_search(blocks, bytenr))
3408 		return 0;
3409 
3410 	path = btrfs_alloc_path();
3411 	if (!path)
3412 		return -ENOMEM;
3413 again:
3414 	key.objectid = bytenr;
3415 	if (skinny) {
3416 		key.type = BTRFS_METADATA_ITEM_KEY;
3417 		key.offset = (u64)-1;
3418 	} else {
3419 		key.type = BTRFS_EXTENT_ITEM_KEY;
3420 		key.offset = blocksize;
3421 	}
3422 
3423 	path->search_commit_root = 1;
3424 	path->skip_locking = 1;
3425 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3426 	if (ret < 0)
3427 		goto out;
3428 
3429 	if (ret > 0 && skinny) {
3430 		if (path->slots[0]) {
3431 			path->slots[0]--;
3432 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3433 					      path->slots[0]);
3434 			if (key.objectid == bytenr &&
3435 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3436 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3437 			      key.offset == blocksize)))
3438 				ret = 0;
3439 		}
3440 
3441 		if (ret) {
3442 			skinny = false;
3443 			btrfs_release_path(path);
3444 			goto again;
3445 		}
3446 	}
3447 	BUG_ON(ret);
3448 
3449 	ret = add_tree_block(rc, &key, path, blocks);
3450 out:
3451 	btrfs_free_path(path);
3452 	return ret;
3453 }
3454 
3455 /*
3456  * helper to check if the block use full backrefs for pointers in it
3457  */
3458 static int block_use_full_backref(struct reloc_control *rc,
3459 				  struct extent_buffer *eb)
3460 {
3461 	u64 flags;
3462 	int ret;
3463 
3464 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3465 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3466 		return 1;
3467 
3468 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3469 				       eb->start, btrfs_header_level(eb), 1,
3470 				       NULL, &flags);
3471 	BUG_ON(ret);
3472 
3473 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3474 		ret = 1;
3475 	else
3476 		ret = 0;
3477 	return ret;
3478 }
3479 
3480 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3481 				    struct btrfs_block_group_cache *block_group,
3482 				    struct inode *inode,
3483 				    u64 ino)
3484 {
3485 	struct btrfs_key key;
3486 	struct btrfs_root *root = fs_info->tree_root;
3487 	struct btrfs_trans_handle *trans;
3488 	int ret = 0;
3489 
3490 	if (inode)
3491 		goto truncate;
3492 
3493 	key.objectid = ino;
3494 	key.type = BTRFS_INODE_ITEM_KEY;
3495 	key.offset = 0;
3496 
3497 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3498 	if (IS_ERR(inode) || is_bad_inode(inode)) {
3499 		if (!IS_ERR(inode))
3500 			iput(inode);
3501 		return -ENOENT;
3502 	}
3503 
3504 truncate:
3505 	ret = btrfs_check_trunc_cache_free_space(root,
3506 						 &fs_info->global_block_rsv);
3507 	if (ret)
3508 		goto out;
3509 
3510 	trans = btrfs_join_transaction(root);
3511 	if (IS_ERR(trans)) {
3512 		ret = PTR_ERR(trans);
3513 		goto out;
3514 	}
3515 
3516 	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3517 
3518 	btrfs_end_transaction(trans, root);
3519 	btrfs_btree_balance_dirty(root);
3520 out:
3521 	iput(inode);
3522 	return ret;
3523 }
3524 
3525 /*
3526  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3527  * this function scans fs tree to find blocks reference the data extent
3528  */
3529 static int find_data_references(struct reloc_control *rc,
3530 				struct btrfs_key *extent_key,
3531 				struct extent_buffer *leaf,
3532 				struct btrfs_extent_data_ref *ref,
3533 				struct rb_root *blocks)
3534 {
3535 	struct btrfs_path *path;
3536 	struct tree_block *block;
3537 	struct btrfs_root *root;
3538 	struct btrfs_file_extent_item *fi;
3539 	struct rb_node *rb_node;
3540 	struct btrfs_key key;
3541 	u64 ref_root;
3542 	u64 ref_objectid;
3543 	u64 ref_offset;
3544 	u32 ref_count;
3545 	u32 nritems;
3546 	int err = 0;
3547 	int added = 0;
3548 	int counted;
3549 	int ret;
3550 
3551 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3552 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3553 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3554 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3555 
3556 	/*
3557 	 * This is an extent belonging to the free space cache, lets just delete
3558 	 * it and redo the search.
3559 	 */
3560 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3561 		ret = delete_block_group_cache(rc->extent_root->fs_info,
3562 					       rc->block_group,
3563 					       NULL, ref_objectid);
3564 		if (ret != -ENOENT)
3565 			return ret;
3566 		ret = 0;
3567 	}
3568 
3569 	path = btrfs_alloc_path();
3570 	if (!path)
3571 		return -ENOMEM;
3572 	path->reada = READA_FORWARD;
3573 
3574 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
3575 	if (IS_ERR(root)) {
3576 		err = PTR_ERR(root);
3577 		goto out;
3578 	}
3579 
3580 	key.objectid = ref_objectid;
3581 	key.type = BTRFS_EXTENT_DATA_KEY;
3582 	if (ref_offset > ((u64)-1 << 32))
3583 		key.offset = 0;
3584 	else
3585 		key.offset = ref_offset;
3586 
3587 	path->search_commit_root = 1;
3588 	path->skip_locking = 1;
3589 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3590 	if (ret < 0) {
3591 		err = ret;
3592 		goto out;
3593 	}
3594 
3595 	leaf = path->nodes[0];
3596 	nritems = btrfs_header_nritems(leaf);
3597 	/*
3598 	 * the references in tree blocks that use full backrefs
3599 	 * are not counted in
3600 	 */
3601 	if (block_use_full_backref(rc, leaf))
3602 		counted = 0;
3603 	else
3604 		counted = 1;
3605 	rb_node = tree_search(blocks, leaf->start);
3606 	if (rb_node) {
3607 		if (counted)
3608 			added = 1;
3609 		else
3610 			path->slots[0] = nritems;
3611 	}
3612 
3613 	while (ref_count > 0) {
3614 		while (path->slots[0] >= nritems) {
3615 			ret = btrfs_next_leaf(root, path);
3616 			if (ret < 0) {
3617 				err = ret;
3618 				goto out;
3619 			}
3620 			if (WARN_ON(ret > 0))
3621 				goto out;
3622 
3623 			leaf = path->nodes[0];
3624 			nritems = btrfs_header_nritems(leaf);
3625 			added = 0;
3626 
3627 			if (block_use_full_backref(rc, leaf))
3628 				counted = 0;
3629 			else
3630 				counted = 1;
3631 			rb_node = tree_search(blocks, leaf->start);
3632 			if (rb_node) {
3633 				if (counted)
3634 					added = 1;
3635 				else
3636 					path->slots[0] = nritems;
3637 			}
3638 		}
3639 
3640 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3641 		if (WARN_ON(key.objectid != ref_objectid ||
3642 		    key.type != BTRFS_EXTENT_DATA_KEY))
3643 			break;
3644 
3645 		fi = btrfs_item_ptr(leaf, path->slots[0],
3646 				    struct btrfs_file_extent_item);
3647 
3648 		if (btrfs_file_extent_type(leaf, fi) ==
3649 		    BTRFS_FILE_EXTENT_INLINE)
3650 			goto next;
3651 
3652 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3653 		    extent_key->objectid)
3654 			goto next;
3655 
3656 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3657 		if (key.offset != ref_offset)
3658 			goto next;
3659 
3660 		if (counted)
3661 			ref_count--;
3662 		if (added)
3663 			goto next;
3664 
3665 		if (!tree_block_processed(leaf->start, rc)) {
3666 			block = kmalloc(sizeof(*block), GFP_NOFS);
3667 			if (!block) {
3668 				err = -ENOMEM;
3669 				break;
3670 			}
3671 			block->bytenr = leaf->start;
3672 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3673 			block->level = 0;
3674 			block->key_ready = 1;
3675 			rb_node = tree_insert(blocks, block->bytenr,
3676 					      &block->rb_node);
3677 			if (rb_node)
3678 				backref_tree_panic(rb_node, -EEXIST,
3679 						   block->bytenr);
3680 		}
3681 		if (counted)
3682 			added = 1;
3683 		else
3684 			path->slots[0] = nritems;
3685 next:
3686 		path->slots[0]++;
3687 
3688 	}
3689 out:
3690 	btrfs_free_path(path);
3691 	return err;
3692 }
3693 
3694 /*
3695  * helper to find all tree blocks that reference a given data extent
3696  */
3697 static noinline_for_stack
3698 int add_data_references(struct reloc_control *rc,
3699 			struct btrfs_key *extent_key,
3700 			struct btrfs_path *path,
3701 			struct rb_root *blocks)
3702 {
3703 	struct btrfs_key key;
3704 	struct extent_buffer *eb;
3705 	struct btrfs_extent_data_ref *dref;
3706 	struct btrfs_extent_inline_ref *iref;
3707 	unsigned long ptr;
3708 	unsigned long end;
3709 	u32 blocksize = rc->extent_root->nodesize;
3710 	int ret = 0;
3711 	int err = 0;
3712 
3713 	eb = path->nodes[0];
3714 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3715 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3716 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3717 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3718 		ptr = end;
3719 	else
3720 #endif
3721 		ptr += sizeof(struct btrfs_extent_item);
3722 
3723 	while (ptr < end) {
3724 		iref = (struct btrfs_extent_inline_ref *)ptr;
3725 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3726 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3727 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3728 			ret = __add_tree_block(rc, key.offset, blocksize,
3729 					       blocks);
3730 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3731 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3732 			ret = find_data_references(rc, extent_key,
3733 						   eb, dref, blocks);
3734 		} else {
3735 			BUG();
3736 		}
3737 		if (ret) {
3738 			err = ret;
3739 			goto out;
3740 		}
3741 		ptr += btrfs_extent_inline_ref_size(key.type);
3742 	}
3743 	WARN_ON(ptr > end);
3744 
3745 	while (1) {
3746 		cond_resched();
3747 		eb = path->nodes[0];
3748 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3749 			ret = btrfs_next_leaf(rc->extent_root, path);
3750 			if (ret < 0) {
3751 				err = ret;
3752 				break;
3753 			}
3754 			if (ret > 0)
3755 				break;
3756 			eb = path->nodes[0];
3757 		}
3758 
3759 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3760 		if (key.objectid != extent_key->objectid)
3761 			break;
3762 
3763 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3764 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3765 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3766 #else
3767 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3768 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3769 #endif
3770 			ret = __add_tree_block(rc, key.offset, blocksize,
3771 					       blocks);
3772 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3773 			dref = btrfs_item_ptr(eb, path->slots[0],
3774 					      struct btrfs_extent_data_ref);
3775 			ret = find_data_references(rc, extent_key,
3776 						   eb, dref, blocks);
3777 		} else {
3778 			ret = 0;
3779 		}
3780 		if (ret) {
3781 			err = ret;
3782 			break;
3783 		}
3784 		path->slots[0]++;
3785 	}
3786 out:
3787 	btrfs_release_path(path);
3788 	if (err)
3789 		free_block_list(blocks);
3790 	return err;
3791 }
3792 
3793 /*
3794  * helper to find next unprocessed extent
3795  */
3796 static noinline_for_stack
3797 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3798 		     struct btrfs_key *extent_key)
3799 {
3800 	struct btrfs_key key;
3801 	struct extent_buffer *leaf;
3802 	u64 start, end, last;
3803 	int ret;
3804 
3805 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3806 	while (1) {
3807 		cond_resched();
3808 		if (rc->search_start >= last) {
3809 			ret = 1;
3810 			break;
3811 		}
3812 
3813 		key.objectid = rc->search_start;
3814 		key.type = BTRFS_EXTENT_ITEM_KEY;
3815 		key.offset = 0;
3816 
3817 		path->search_commit_root = 1;
3818 		path->skip_locking = 1;
3819 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3820 					0, 0);
3821 		if (ret < 0)
3822 			break;
3823 next:
3824 		leaf = path->nodes[0];
3825 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3826 			ret = btrfs_next_leaf(rc->extent_root, path);
3827 			if (ret != 0)
3828 				break;
3829 			leaf = path->nodes[0];
3830 		}
3831 
3832 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3833 		if (key.objectid >= last) {
3834 			ret = 1;
3835 			break;
3836 		}
3837 
3838 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3839 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3840 			path->slots[0]++;
3841 			goto next;
3842 		}
3843 
3844 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3845 		    key.objectid + key.offset <= rc->search_start) {
3846 			path->slots[0]++;
3847 			goto next;
3848 		}
3849 
3850 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3851 		    key.objectid + rc->extent_root->nodesize <=
3852 		    rc->search_start) {
3853 			path->slots[0]++;
3854 			goto next;
3855 		}
3856 
3857 		ret = find_first_extent_bit(&rc->processed_blocks,
3858 					    key.objectid, &start, &end,
3859 					    EXTENT_DIRTY, NULL);
3860 
3861 		if (ret == 0 && start <= key.objectid) {
3862 			btrfs_release_path(path);
3863 			rc->search_start = end + 1;
3864 		} else {
3865 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3866 				rc->search_start = key.objectid + key.offset;
3867 			else
3868 				rc->search_start = key.objectid +
3869 					rc->extent_root->nodesize;
3870 			memcpy(extent_key, &key, sizeof(key));
3871 			return 0;
3872 		}
3873 	}
3874 	btrfs_release_path(path);
3875 	return ret;
3876 }
3877 
3878 static void set_reloc_control(struct reloc_control *rc)
3879 {
3880 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3881 
3882 	mutex_lock(&fs_info->reloc_mutex);
3883 	fs_info->reloc_ctl = rc;
3884 	mutex_unlock(&fs_info->reloc_mutex);
3885 }
3886 
3887 static void unset_reloc_control(struct reloc_control *rc)
3888 {
3889 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3890 
3891 	mutex_lock(&fs_info->reloc_mutex);
3892 	fs_info->reloc_ctl = NULL;
3893 	mutex_unlock(&fs_info->reloc_mutex);
3894 }
3895 
3896 static int check_extent_flags(u64 flags)
3897 {
3898 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3899 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3900 		return 1;
3901 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3902 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3903 		return 1;
3904 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3905 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3906 		return 1;
3907 	return 0;
3908 }
3909 
3910 static noinline_for_stack
3911 int prepare_to_relocate(struct reloc_control *rc)
3912 {
3913 	struct btrfs_trans_handle *trans;
3914 	int ret;
3915 
3916 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3917 					      BTRFS_BLOCK_RSV_TEMP);
3918 	if (!rc->block_rsv)
3919 		return -ENOMEM;
3920 
3921 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3922 	rc->search_start = rc->block_group->key.objectid;
3923 	rc->extents_found = 0;
3924 	rc->nodes_relocated = 0;
3925 	rc->merging_rsv_size = 0;
3926 	rc->reserved_bytes = 0;
3927 	rc->block_rsv->size = rc->extent_root->nodesize *
3928 			      RELOCATION_RESERVED_NODES;
3929 	ret = btrfs_block_rsv_refill(rc->extent_root,
3930 				     rc->block_rsv, rc->block_rsv->size,
3931 				     BTRFS_RESERVE_FLUSH_ALL);
3932 	if (ret)
3933 		return ret;
3934 
3935 	rc->create_reloc_tree = 1;
3936 	set_reloc_control(rc);
3937 
3938 	trans = btrfs_join_transaction(rc->extent_root);
3939 	if (IS_ERR(trans)) {
3940 		unset_reloc_control(rc);
3941 		/*
3942 		 * extent tree is not a ref_cow tree and has no reloc_root to
3943 		 * cleanup.  And callers are responsible to free the above
3944 		 * block rsv.
3945 		 */
3946 		return PTR_ERR(trans);
3947 	}
3948 	btrfs_commit_transaction(trans, rc->extent_root);
3949 	return 0;
3950 }
3951 
3952 /*
3953  * Qgroup fixer for data chunk relocation.
3954  * The data relocation is done in the following steps
3955  * 1) Copy data extents into data reloc tree
3956  * 2) Create tree reloc tree(special snapshot) for related subvolumes
3957  * 3) Modify file extents in tree reloc tree
3958  * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
3959  *
3960  * The problem is, data and tree reloc tree are not accounted to qgroup,
3961  * and 4) will only info qgroup to track tree blocks change, not file extents
3962  * in the tree blocks.
3963  *
3964  * The good news is, related data extents are all in data reloc tree, so we
3965  * only need to info qgroup to track all file extents in data reloc tree
3966  * before commit trans.
3967  */
3968 static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
3969 					     struct reloc_control *rc)
3970 {
3971 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3972 	struct inode *inode = rc->data_inode;
3973 	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
3974 	struct btrfs_path *path;
3975 	struct btrfs_key key;
3976 	int ret = 0;
3977 
3978 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3979 		return 0;
3980 
3981 	/*
3982 	 * Only for stage where we update data pointers the qgroup fix is
3983 	 * valid.
3984 	 * For MOVING_DATA stage, we will miss the timing of swapping tree
3985 	 * blocks, and won't fix it.
3986 	 */
3987 	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
3988 		return 0;
3989 
3990 	path = btrfs_alloc_path();
3991 	if (!path)
3992 		return -ENOMEM;
3993 	key.objectid = btrfs_ino(inode);
3994 	key.type = BTRFS_EXTENT_DATA_KEY;
3995 	key.offset = 0;
3996 
3997 	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
3998 	if (ret < 0)
3999 		goto out;
4000 
4001 	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
4002 	while (1) {
4003 		struct btrfs_file_extent_item *fi;
4004 
4005 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4006 		if (key.objectid > btrfs_ino(inode))
4007 			break;
4008 		if (key.type != BTRFS_EXTENT_DATA_KEY)
4009 			goto next;
4010 		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4011 				    struct btrfs_file_extent_item);
4012 		if (btrfs_file_extent_type(path->nodes[0], fi) !=
4013 				BTRFS_FILE_EXTENT_REG)
4014 			goto next;
4015 		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
4016 			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
4017 			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
4018 			GFP_NOFS);
4019 		if (ret < 0)
4020 			break;
4021 next:
4022 		ret = btrfs_next_item(data_reloc_root, path);
4023 		if (ret < 0)
4024 			break;
4025 		if (ret > 0) {
4026 			ret = 0;
4027 			break;
4028 		}
4029 	}
4030 	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
4031 out:
4032 	btrfs_free_path(path);
4033 	return ret;
4034 }
4035 
4036 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4037 {
4038 	struct rb_root blocks = RB_ROOT;
4039 	struct btrfs_key key;
4040 	struct btrfs_trans_handle *trans = NULL;
4041 	struct btrfs_path *path;
4042 	struct btrfs_extent_item *ei;
4043 	u64 flags;
4044 	u32 item_size;
4045 	int ret;
4046 	int err = 0;
4047 	int progress = 0;
4048 
4049 	path = btrfs_alloc_path();
4050 	if (!path)
4051 		return -ENOMEM;
4052 	path->reada = READA_FORWARD;
4053 
4054 	ret = prepare_to_relocate(rc);
4055 	if (ret) {
4056 		err = ret;
4057 		goto out_free;
4058 	}
4059 
4060 	while (1) {
4061 		rc->reserved_bytes = 0;
4062 		ret = btrfs_block_rsv_refill(rc->extent_root,
4063 					rc->block_rsv, rc->block_rsv->size,
4064 					BTRFS_RESERVE_FLUSH_ALL);
4065 		if (ret) {
4066 			err = ret;
4067 			break;
4068 		}
4069 		progress++;
4070 		trans = btrfs_start_transaction(rc->extent_root, 0);
4071 		if (IS_ERR(trans)) {
4072 			err = PTR_ERR(trans);
4073 			trans = NULL;
4074 			break;
4075 		}
4076 restart:
4077 		if (update_backref_cache(trans, &rc->backref_cache)) {
4078 			btrfs_end_transaction(trans, rc->extent_root);
4079 			continue;
4080 		}
4081 
4082 		ret = find_next_extent(rc, path, &key);
4083 		if (ret < 0)
4084 			err = ret;
4085 		if (ret != 0)
4086 			break;
4087 
4088 		rc->extents_found++;
4089 
4090 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4091 				    struct btrfs_extent_item);
4092 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4093 		if (item_size >= sizeof(*ei)) {
4094 			flags = btrfs_extent_flags(path->nodes[0], ei);
4095 			ret = check_extent_flags(flags);
4096 			BUG_ON(ret);
4097 
4098 		} else {
4099 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4100 			u64 ref_owner;
4101 			int path_change = 0;
4102 
4103 			BUG_ON(item_size !=
4104 			       sizeof(struct btrfs_extent_item_v0));
4105 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
4106 						  &path_change);
4107 			if (ret < 0) {
4108 				err = ret;
4109 				break;
4110 			}
4111 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
4112 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
4113 			else
4114 				flags = BTRFS_EXTENT_FLAG_DATA;
4115 
4116 			if (path_change) {
4117 				btrfs_release_path(path);
4118 
4119 				path->search_commit_root = 1;
4120 				path->skip_locking = 1;
4121 				ret = btrfs_search_slot(NULL, rc->extent_root,
4122 							&key, path, 0, 0);
4123 				if (ret < 0) {
4124 					err = ret;
4125 					break;
4126 				}
4127 				BUG_ON(ret > 0);
4128 			}
4129 #else
4130 			BUG();
4131 #endif
4132 		}
4133 
4134 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4135 			ret = add_tree_block(rc, &key, path, &blocks);
4136 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4137 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4138 			ret = add_data_references(rc, &key, path, &blocks);
4139 		} else {
4140 			btrfs_release_path(path);
4141 			ret = 0;
4142 		}
4143 		if (ret < 0) {
4144 			err = ret;
4145 			break;
4146 		}
4147 
4148 		if (!RB_EMPTY_ROOT(&blocks)) {
4149 			ret = relocate_tree_blocks(trans, rc, &blocks);
4150 			if (ret < 0) {
4151 				/*
4152 				 * if we fail to relocate tree blocks, force to update
4153 				 * backref cache when committing transaction.
4154 				 */
4155 				rc->backref_cache.last_trans = trans->transid - 1;
4156 
4157 				if (ret != -EAGAIN) {
4158 					err = ret;
4159 					break;
4160 				}
4161 				rc->extents_found--;
4162 				rc->search_start = key.objectid;
4163 			}
4164 		}
4165 
4166 		btrfs_end_transaction_throttle(trans, rc->extent_root);
4167 		btrfs_btree_balance_dirty(rc->extent_root);
4168 		trans = NULL;
4169 
4170 		if (rc->stage == MOVE_DATA_EXTENTS &&
4171 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4172 			rc->found_file_extent = 1;
4173 			ret = relocate_data_extent(rc->data_inode,
4174 						   &key, &rc->cluster);
4175 			if (ret < 0) {
4176 				err = ret;
4177 				break;
4178 			}
4179 		}
4180 	}
4181 	if (trans && progress && err == -ENOSPC) {
4182 		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
4183 					      rc->block_group->flags);
4184 		if (ret == 1) {
4185 			err = 0;
4186 			progress = 0;
4187 			goto restart;
4188 		}
4189 	}
4190 
4191 	btrfs_release_path(path);
4192 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4193 
4194 	if (trans) {
4195 		btrfs_end_transaction_throttle(trans, rc->extent_root);
4196 		btrfs_btree_balance_dirty(rc->extent_root);
4197 	}
4198 
4199 	if (!err) {
4200 		ret = relocate_file_extent_cluster(rc->data_inode,
4201 						   &rc->cluster);
4202 		if (ret < 0)
4203 			err = ret;
4204 	}
4205 
4206 	rc->create_reloc_tree = 0;
4207 	set_reloc_control(rc);
4208 
4209 	backref_cache_cleanup(&rc->backref_cache);
4210 	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4211 
4212 	err = prepare_to_merge(rc, err);
4213 
4214 	merge_reloc_roots(rc);
4215 
4216 	rc->merge_reloc_tree = 0;
4217 	unset_reloc_control(rc);
4218 	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4219 
4220 	/* get rid of pinned extents */
4221 	trans = btrfs_join_transaction(rc->extent_root);
4222 	if (IS_ERR(trans)) {
4223 		err = PTR_ERR(trans);
4224 		goto out_free;
4225 	}
4226 	ret = qgroup_fix_relocated_data_extents(trans, rc);
4227 	if (ret < 0) {
4228 		btrfs_abort_transaction(trans, ret);
4229 		if (!err)
4230 			err = ret;
4231 		goto out_free;
4232 	}
4233 	btrfs_commit_transaction(trans, rc->extent_root);
4234 out_free:
4235 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4236 	btrfs_free_path(path);
4237 	return err;
4238 }
4239 
4240 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4241 				 struct btrfs_root *root, u64 objectid)
4242 {
4243 	struct btrfs_path *path;
4244 	struct btrfs_inode_item *item;
4245 	struct extent_buffer *leaf;
4246 	int ret;
4247 
4248 	path = btrfs_alloc_path();
4249 	if (!path)
4250 		return -ENOMEM;
4251 
4252 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4253 	if (ret)
4254 		goto out;
4255 
4256 	leaf = path->nodes[0];
4257 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4258 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4259 	btrfs_set_inode_generation(leaf, item, 1);
4260 	btrfs_set_inode_size(leaf, item, 0);
4261 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4262 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4263 					  BTRFS_INODE_PREALLOC);
4264 	btrfs_mark_buffer_dirty(leaf);
4265 out:
4266 	btrfs_free_path(path);
4267 	return ret;
4268 }
4269 
4270 /*
4271  * helper to create inode for data relocation.
4272  * the inode is in data relocation tree and its link count is 0
4273  */
4274 static noinline_for_stack
4275 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4276 				 struct btrfs_block_group_cache *group)
4277 {
4278 	struct inode *inode = NULL;
4279 	struct btrfs_trans_handle *trans;
4280 	struct btrfs_root *root;
4281 	struct btrfs_key key;
4282 	u64 objectid;
4283 	int err = 0;
4284 
4285 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4286 	if (IS_ERR(root))
4287 		return ERR_CAST(root);
4288 
4289 	trans = btrfs_start_transaction(root, 6);
4290 	if (IS_ERR(trans))
4291 		return ERR_CAST(trans);
4292 
4293 	err = btrfs_find_free_objectid(root, &objectid);
4294 	if (err)
4295 		goto out;
4296 
4297 	err = __insert_orphan_inode(trans, root, objectid);
4298 	BUG_ON(err);
4299 
4300 	key.objectid = objectid;
4301 	key.type = BTRFS_INODE_ITEM_KEY;
4302 	key.offset = 0;
4303 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4304 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4305 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4306 
4307 	err = btrfs_orphan_add(trans, inode);
4308 out:
4309 	btrfs_end_transaction(trans, root);
4310 	btrfs_btree_balance_dirty(root);
4311 	if (err) {
4312 		if (inode)
4313 			iput(inode);
4314 		inode = ERR_PTR(err);
4315 	}
4316 	return inode;
4317 }
4318 
4319 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4320 {
4321 	struct reloc_control *rc;
4322 
4323 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4324 	if (!rc)
4325 		return NULL;
4326 
4327 	INIT_LIST_HEAD(&rc->reloc_roots);
4328 	backref_cache_init(&rc->backref_cache);
4329 	mapping_tree_init(&rc->reloc_root_tree);
4330 	extent_io_tree_init(&rc->processed_blocks,
4331 			    fs_info->btree_inode->i_mapping);
4332 	return rc;
4333 }
4334 
4335 /*
4336  * function to relocate all extents in a block group.
4337  */
4338 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4339 {
4340 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
4341 	struct reloc_control *rc;
4342 	struct inode *inode;
4343 	struct btrfs_path *path;
4344 	int ret;
4345 	int rw = 0;
4346 	int err = 0;
4347 
4348 	rc = alloc_reloc_control(fs_info);
4349 	if (!rc)
4350 		return -ENOMEM;
4351 
4352 	rc->extent_root = extent_root;
4353 
4354 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4355 	BUG_ON(!rc->block_group);
4356 
4357 	ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
4358 	if (ret) {
4359 		err = ret;
4360 		goto out;
4361 	}
4362 	rw = 1;
4363 
4364 	path = btrfs_alloc_path();
4365 	if (!path) {
4366 		err = -ENOMEM;
4367 		goto out;
4368 	}
4369 
4370 	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4371 					path);
4372 	btrfs_free_path(path);
4373 
4374 	if (!IS_ERR(inode))
4375 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4376 	else
4377 		ret = PTR_ERR(inode);
4378 
4379 	if (ret && ret != -ENOENT) {
4380 		err = ret;
4381 		goto out;
4382 	}
4383 
4384 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4385 	if (IS_ERR(rc->data_inode)) {
4386 		err = PTR_ERR(rc->data_inode);
4387 		rc->data_inode = NULL;
4388 		goto out;
4389 	}
4390 
4391 	btrfs_info(extent_root->fs_info,
4392 		   "relocating block group %llu flags %llu",
4393 		   rc->block_group->key.objectid, rc->block_group->flags);
4394 
4395 	btrfs_wait_block_group_reservations(rc->block_group);
4396 	btrfs_wait_nocow_writers(rc->block_group);
4397 	btrfs_wait_ordered_roots(fs_info, -1,
4398 				 rc->block_group->key.objectid,
4399 				 rc->block_group->key.offset);
4400 
4401 	while (1) {
4402 		mutex_lock(&fs_info->cleaner_mutex);
4403 		ret = relocate_block_group(rc);
4404 		mutex_unlock(&fs_info->cleaner_mutex);
4405 		if (ret < 0) {
4406 			err = ret;
4407 			goto out;
4408 		}
4409 
4410 		if (rc->extents_found == 0)
4411 			break;
4412 
4413 		btrfs_info(extent_root->fs_info, "found %llu extents",
4414 			rc->extents_found);
4415 
4416 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4417 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4418 						       (u64)-1);
4419 			if (ret) {
4420 				err = ret;
4421 				goto out;
4422 			}
4423 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4424 						 0, -1);
4425 			rc->stage = UPDATE_DATA_PTRS;
4426 		}
4427 	}
4428 
4429 	WARN_ON(rc->block_group->pinned > 0);
4430 	WARN_ON(rc->block_group->reserved > 0);
4431 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4432 out:
4433 	if (err && rw)
4434 		btrfs_dec_block_group_ro(extent_root, rc->block_group);
4435 	iput(rc->data_inode);
4436 	btrfs_put_block_group(rc->block_group);
4437 	kfree(rc);
4438 	return err;
4439 }
4440 
4441 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4442 {
4443 	struct btrfs_trans_handle *trans;
4444 	int ret, err;
4445 
4446 	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4447 	if (IS_ERR(trans))
4448 		return PTR_ERR(trans);
4449 
4450 	memset(&root->root_item.drop_progress, 0,
4451 		sizeof(root->root_item.drop_progress));
4452 	root->root_item.drop_level = 0;
4453 	btrfs_set_root_refs(&root->root_item, 0);
4454 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
4455 				&root->root_key, &root->root_item);
4456 
4457 	err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4458 	if (err)
4459 		return err;
4460 	return ret;
4461 }
4462 
4463 /*
4464  * recover relocation interrupted by system crash.
4465  *
4466  * this function resumes merging reloc trees with corresponding fs trees.
4467  * this is important for keeping the sharing of tree blocks
4468  */
4469 int btrfs_recover_relocation(struct btrfs_root *root)
4470 {
4471 	LIST_HEAD(reloc_roots);
4472 	struct btrfs_key key;
4473 	struct btrfs_root *fs_root;
4474 	struct btrfs_root *reloc_root;
4475 	struct btrfs_path *path;
4476 	struct extent_buffer *leaf;
4477 	struct reloc_control *rc = NULL;
4478 	struct btrfs_trans_handle *trans;
4479 	int ret;
4480 	int err = 0;
4481 
4482 	path = btrfs_alloc_path();
4483 	if (!path)
4484 		return -ENOMEM;
4485 	path->reada = READA_BACK;
4486 
4487 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4488 	key.type = BTRFS_ROOT_ITEM_KEY;
4489 	key.offset = (u64)-1;
4490 
4491 	while (1) {
4492 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4493 					path, 0, 0);
4494 		if (ret < 0) {
4495 			err = ret;
4496 			goto out;
4497 		}
4498 		if (ret > 0) {
4499 			if (path->slots[0] == 0)
4500 				break;
4501 			path->slots[0]--;
4502 		}
4503 		leaf = path->nodes[0];
4504 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4505 		btrfs_release_path(path);
4506 
4507 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4508 		    key.type != BTRFS_ROOT_ITEM_KEY)
4509 			break;
4510 
4511 		reloc_root = btrfs_read_fs_root(root, &key);
4512 		if (IS_ERR(reloc_root)) {
4513 			err = PTR_ERR(reloc_root);
4514 			goto out;
4515 		}
4516 
4517 		list_add(&reloc_root->root_list, &reloc_roots);
4518 
4519 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4520 			fs_root = read_fs_root(root->fs_info,
4521 					       reloc_root->root_key.offset);
4522 			if (IS_ERR(fs_root)) {
4523 				ret = PTR_ERR(fs_root);
4524 				if (ret != -ENOENT) {
4525 					err = ret;
4526 					goto out;
4527 				}
4528 				ret = mark_garbage_root(reloc_root);
4529 				if (ret < 0) {
4530 					err = ret;
4531 					goto out;
4532 				}
4533 			}
4534 		}
4535 
4536 		if (key.offset == 0)
4537 			break;
4538 
4539 		key.offset--;
4540 	}
4541 	btrfs_release_path(path);
4542 
4543 	if (list_empty(&reloc_roots))
4544 		goto out;
4545 
4546 	rc = alloc_reloc_control(root->fs_info);
4547 	if (!rc) {
4548 		err = -ENOMEM;
4549 		goto out;
4550 	}
4551 
4552 	rc->extent_root = root->fs_info->extent_root;
4553 
4554 	set_reloc_control(rc);
4555 
4556 	trans = btrfs_join_transaction(rc->extent_root);
4557 	if (IS_ERR(trans)) {
4558 		unset_reloc_control(rc);
4559 		err = PTR_ERR(trans);
4560 		goto out_free;
4561 	}
4562 
4563 	rc->merge_reloc_tree = 1;
4564 
4565 	while (!list_empty(&reloc_roots)) {
4566 		reloc_root = list_entry(reloc_roots.next,
4567 					struct btrfs_root, root_list);
4568 		list_del(&reloc_root->root_list);
4569 
4570 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4571 			list_add_tail(&reloc_root->root_list,
4572 				      &rc->reloc_roots);
4573 			continue;
4574 		}
4575 
4576 		fs_root = read_fs_root(root->fs_info,
4577 				       reloc_root->root_key.offset);
4578 		if (IS_ERR(fs_root)) {
4579 			err = PTR_ERR(fs_root);
4580 			goto out_free;
4581 		}
4582 
4583 		err = __add_reloc_root(reloc_root);
4584 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4585 		fs_root->reloc_root = reloc_root;
4586 	}
4587 
4588 	err = btrfs_commit_transaction(trans, rc->extent_root);
4589 	if (err)
4590 		goto out_free;
4591 
4592 	merge_reloc_roots(rc);
4593 
4594 	unset_reloc_control(rc);
4595 
4596 	trans = btrfs_join_transaction(rc->extent_root);
4597 	if (IS_ERR(trans)) {
4598 		err = PTR_ERR(trans);
4599 		goto out_free;
4600 	}
4601 	err = qgroup_fix_relocated_data_extents(trans, rc);
4602 	if (err < 0) {
4603 		btrfs_abort_transaction(trans, err);
4604 		goto out_free;
4605 	}
4606 	err = btrfs_commit_transaction(trans, rc->extent_root);
4607 out_free:
4608 	kfree(rc);
4609 out:
4610 	if (!list_empty(&reloc_roots))
4611 		free_reloc_roots(&reloc_roots);
4612 
4613 	btrfs_free_path(path);
4614 
4615 	if (err == 0) {
4616 		/* cleanup orphan inode in data relocation tree */
4617 		fs_root = read_fs_root(root->fs_info,
4618 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
4619 		if (IS_ERR(fs_root))
4620 			err = PTR_ERR(fs_root);
4621 		else
4622 			err = btrfs_orphan_cleanup(fs_root);
4623 	}
4624 	return err;
4625 }
4626 
4627 /*
4628  * helper to add ordered checksum for data relocation.
4629  *
4630  * cloning checksum properly handles the nodatasum extents.
4631  * it also saves CPU time to re-calculate the checksum.
4632  */
4633 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4634 {
4635 	struct btrfs_ordered_sum *sums;
4636 	struct btrfs_ordered_extent *ordered;
4637 	struct btrfs_root *root = BTRFS_I(inode)->root;
4638 	int ret;
4639 	u64 disk_bytenr;
4640 	u64 new_bytenr;
4641 	LIST_HEAD(list);
4642 
4643 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4644 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4645 
4646 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4647 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4648 				       disk_bytenr + len - 1, &list, 0);
4649 	if (ret)
4650 		goto out;
4651 
4652 	while (!list_empty(&list)) {
4653 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4654 		list_del_init(&sums->list);
4655 
4656 		/*
4657 		 * We need to offset the new_bytenr based on where the csum is.
4658 		 * We need to do this because we will read in entire prealloc
4659 		 * extents but we may have written to say the middle of the
4660 		 * prealloc extent, so we need to make sure the csum goes with
4661 		 * the right disk offset.
4662 		 *
4663 		 * We can do this because the data reloc inode refers strictly
4664 		 * to the on disk bytes, so we don't have to worry about
4665 		 * disk_len vs real len like with real inodes since it's all
4666 		 * disk length.
4667 		 */
4668 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4669 		sums->bytenr = new_bytenr;
4670 
4671 		btrfs_add_ordered_sum(inode, ordered, sums);
4672 	}
4673 out:
4674 	btrfs_put_ordered_extent(ordered);
4675 	return ret;
4676 }
4677 
4678 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4679 			  struct btrfs_root *root, struct extent_buffer *buf,
4680 			  struct extent_buffer *cow)
4681 {
4682 	struct reloc_control *rc;
4683 	struct backref_node *node;
4684 	int first_cow = 0;
4685 	int level;
4686 	int ret = 0;
4687 
4688 	rc = root->fs_info->reloc_ctl;
4689 	if (!rc)
4690 		return 0;
4691 
4692 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4693 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4694 
4695 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4696 		if (buf == root->node)
4697 			__update_reloc_root(root, cow->start);
4698 	}
4699 
4700 	level = btrfs_header_level(buf);
4701 	if (btrfs_header_generation(buf) <=
4702 	    btrfs_root_last_snapshot(&root->root_item))
4703 		first_cow = 1;
4704 
4705 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4706 	    rc->create_reloc_tree) {
4707 		WARN_ON(!first_cow && level == 0);
4708 
4709 		node = rc->backref_cache.path[level];
4710 		BUG_ON(node->bytenr != buf->start &&
4711 		       node->new_bytenr != buf->start);
4712 
4713 		drop_node_buffer(node);
4714 		extent_buffer_get(cow);
4715 		node->eb = cow;
4716 		node->new_bytenr = cow->start;
4717 
4718 		if (!node->pending) {
4719 			list_move_tail(&node->list,
4720 				       &rc->backref_cache.pending[level]);
4721 			node->pending = 1;
4722 		}
4723 
4724 		if (first_cow)
4725 			__mark_block_processed(rc, node);
4726 
4727 		if (first_cow && level > 0)
4728 			rc->nodes_relocated += buf->len;
4729 	}
4730 
4731 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4732 		ret = replace_file_extents(trans, rc, root, cow);
4733 	return ret;
4734 }
4735 
4736 /*
4737  * called before creating snapshot. it calculates metadata reservation
4738  * required for relocating tree blocks in the snapshot
4739  */
4740 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4741 			      u64 *bytes_to_reserve)
4742 {
4743 	struct btrfs_root *root;
4744 	struct reloc_control *rc;
4745 
4746 	root = pending->root;
4747 	if (!root->reloc_root)
4748 		return;
4749 
4750 	rc = root->fs_info->reloc_ctl;
4751 	if (!rc->merge_reloc_tree)
4752 		return;
4753 
4754 	root = root->reloc_root;
4755 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4756 	/*
4757 	 * relocation is in the stage of merging trees. the space
4758 	 * used by merging a reloc tree is twice the size of
4759 	 * relocated tree nodes in the worst case. half for cowing
4760 	 * the reloc tree, half for cowing the fs tree. the space
4761 	 * used by cowing the reloc tree will be freed after the
4762 	 * tree is dropped. if we create snapshot, cowing the fs
4763 	 * tree may use more space than it frees. so we need
4764 	 * reserve extra space.
4765 	 */
4766 	*bytes_to_reserve += rc->nodes_relocated;
4767 }
4768 
4769 /*
4770  * called after snapshot is created. migrate block reservation
4771  * and create reloc root for the newly created snapshot
4772  */
4773 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4774 			       struct btrfs_pending_snapshot *pending)
4775 {
4776 	struct btrfs_root *root = pending->root;
4777 	struct btrfs_root *reloc_root;
4778 	struct btrfs_root *new_root;
4779 	struct reloc_control *rc;
4780 	int ret;
4781 
4782 	if (!root->reloc_root)
4783 		return 0;
4784 
4785 	rc = root->fs_info->reloc_ctl;
4786 	rc->merging_rsv_size += rc->nodes_relocated;
4787 
4788 	if (rc->merge_reloc_tree) {
4789 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4790 					      rc->block_rsv,
4791 					      rc->nodes_relocated, 1);
4792 		if (ret)
4793 			return ret;
4794 	}
4795 
4796 	new_root = pending->snap;
4797 	reloc_root = create_reloc_root(trans, root->reloc_root,
4798 				       new_root->root_key.objectid);
4799 	if (IS_ERR(reloc_root))
4800 		return PTR_ERR(reloc_root);
4801 
4802 	ret = __add_reloc_root(reloc_root);
4803 	BUG_ON(ret < 0);
4804 	new_root->reloc_root = reloc_root;
4805 
4806 	if (rc->create_reloc_tree)
4807 		ret = clone_backref_node(trans, rc, root, reloc_root);
4808 	return ret;
4809 }
4810