1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "transaction.h"
27 #include "volumes.h"
28 #include "locking.h"
29 #include "btrfs_inode.h"
30 #include "async-thread.h"
31 
32 /*
33  * backref_node, mapping_node and tree_block start with this
34  */
35 struct tree_entry {
36 	struct rb_node rb_node;
37 	u64 bytenr;
38 };
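
/*
 * Editor's illustration (not in the original file): because
 * backref_node, mapping_node and tree_block all begin with this
 * rb_node + bytenr pair, the generic tree_insert()/tree_search()
 * helpers below can index any of them.  A hypothetical lookup:
 *
 *	struct rb_node *rb = tree_search(&blocks, bytenr);
 *	struct tree_block *block;
 *
 *	if (rb)
 *		block = rb_entry(rb, struct tree_block, rb_node);
 */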
39 
40 /*
41  * represents a tree block in the backref cache
42  */
43 struct backref_node {
44 	struct rb_node rb_node;
45 	u64 bytenr;
46 	/* objectid tree block owner */
47 	u64 owner;
48 	/* list of upper level blocks that reference this block */
49 	struct list_head upper;
50 	/* list of child blocks in the cache */
51 	struct list_head lower;
52 	/* NULL if this node is not tree root */
53 	struct btrfs_root *root;
54 	/* extent buffer obtained by COWing the block */
55 	struct extent_buffer *eb;
56 	/* level of tree block */
57 	unsigned int level:8;
58 	/* 1 if the block is root of old snapshot */
59 	unsigned int old_root:1;
60 	/* 1 if no child blocks in the cache */
61 	unsigned int lowest:1;
62 	/* is the extent buffer locked */
63 	unsigned int locked:1;
64 	/* has the block been processed */
65 	unsigned int processed:1;
66 	/* have backrefs of this block been checked */
67 	unsigned int checked:1;
68 };
69 
70 /*
71  * represents a block pointer in the backref cache
72  */
73 struct backref_edge {
74 	struct list_head list[2];
75 	struct backref_node *node[2];
76 	u64 blockptr;
77 };
78 
79 #define LOWER	0
80 #define UPPER	1
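
/*
 * Editor's illustration: each edge connects a lower block to one
 * upper block that references it.  list[LOWER] is threaded onto the
 * lower node's 'upper' list and list[UPPER] onto the upper node's
 * 'lower' list, conceptually as build_backref_tree() does:
 *
 *	edge->node[LOWER] = lower;
 *	edge->node[UPPER] = upper;
 *	list_add_tail(&edge->list[LOWER], &lower->upper);
 *	list_add_tail(&edge->list[UPPER], &upper->lower);
 */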
81 
82 struct backref_cache {
83 	/* red black tree of all backref nodes in the cache */
84 	struct rb_root rb_root;
85 	/* list of backref nodes with no child block in the cache */
86 	struct list_head pending[BTRFS_MAX_LEVEL];
87 	spinlock_t lock;
88 };
89 
90 /*
91  * maps the address of a tree root to the corresponding tree
92  */
93 struct mapping_node {
94 	struct rb_node rb_node;
95 	u64 bytenr;
96 	void *data;
97 };
98 
99 struct mapping_tree {
100 	struct rb_root rb_root;
101 	spinlock_t lock;
102 };
103 
104 /*
105  * represents a tree block to process
106  */
107 struct tree_block {
108 	struct rb_node rb_node;
109 	u64 bytenr;
110 	struct btrfs_key key;
111 	unsigned int level:8;
112 	unsigned int key_ready:1;
113 };
114 
115 /* inode vector */
116 #define INODEVEC_SIZE 16
117 
118 struct inodevec {
119 	struct list_head list;
120 	struct inode *inode[INODEVEC_SIZE];
121 	int nr;
122 };
123 
124 #define MAX_EXTENTS 128
125 
126 struct file_extent_cluster {
127 	u64 start;
128 	u64 end;
129 	u64 boundary[MAX_EXTENTS];
130 	unsigned int nr;
131 };
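
/*
 * Editor's sketch of the intended use (inferred; the code that fills
 * the cluster lies outside this excerpt): adjacent data extents are
 * merged into one cluster, with boundary[] remembering where each
 * extent starts so writeback can be split on extent boundaries:
 *
 *	if (cluster->nr == 0)
 *		cluster->start = extent_start;
 *	cluster->boundary[cluster->nr++] = extent_start;
 *	cluster->end = extent_end;
 */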
132 
133 struct reloc_control {
134 	/* block group to relocate */
135 	struct btrfs_block_group_cache *block_group;
136 	/* extent tree */
137 	struct btrfs_root *extent_root;
138 	/* inode for moving data */
139 	struct inode *data_inode;
140 	struct btrfs_workers workers;
141 	/* tree blocks that have been processed */
142 	struct extent_io_tree processed_blocks;
143 	/* map start of tree root to corresponding reloc tree */
144 	struct mapping_tree reloc_root_tree;
145 	/* list of reloc trees */
146 	struct list_head reloc_roots;
147 	u64 search_start;
148 	u64 extents_found;
149 	u64 extents_skipped;
150 	int stage;
151 	int create_reloc_root;
152 	unsigned int found_file_extent:1;
153 	unsigned int found_old_snapshot:1;
154 };
155 
156 /* stages of data relocation */
157 #define MOVE_DATA_EXTENTS	0
158 #define UPDATE_DATA_PTRS	1
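
/*
 * Editor's note: data relocation is two-pass.  MOVE_DATA_EXTENTS
 * copies the data into rc->data_inode; UPDATE_DATA_PTRS rewrites the
 * file extent items to point at the new locations.  This is why
 * replace_file_extents() below bails out early in the first pass:
 *
 *	if (rc->stage != UPDATE_DATA_PTRS)
 *		return 0;
 */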
159 
160 /*
161  * merge reloc tree to corresponding fs tree in worker threads
162  */
163 struct async_merge {
164 	struct btrfs_work work;
165 	struct reloc_control *rc;
166 	struct btrfs_root *root;
167 	struct completion *done;
168 	atomic_t *num_pending;
169 };
170 
171 static void mapping_tree_init(struct mapping_tree *tree)
172 {
173 	tree->rb_root.rb_node = NULL;
174 	spin_lock_init(&tree->lock);
175 }
176 
177 static void backref_cache_init(struct backref_cache *cache)
178 {
179 	int i;
180 	cache->rb_root.rb_node = NULL;
181 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 		INIT_LIST_HEAD(&cache->pending[i]);
183 	spin_lock_init(&cache->lock);
184 }
185 
186 static void backref_node_init(struct backref_node *node)
187 {
188 	memset(node, 0, sizeof(*node));
189 	INIT_LIST_HEAD(&node->upper);
190 	INIT_LIST_HEAD(&node->lower);
191 	RB_CLEAR_NODE(&node->rb_node);
192 }
193 
194 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
195 				   struct rb_node *node)
196 {
197 	struct rb_node **p = &root->rb_node;
198 	struct rb_node *parent = NULL;
199 	struct tree_entry *entry;
200 
201 	while (*p) {
202 		parent = *p;
203 		entry = rb_entry(parent, struct tree_entry, rb_node);
204 
205 		if (bytenr < entry->bytenr)
206 			p = &(*p)->rb_left;
207 		else if (bytenr > entry->bytenr)
208 			p = &(*p)->rb_right;
209 		else
210 			return parent;
211 	}
212 
213 	rb_link_node(node, parent, p);
214 	rb_insert_color(node, root);
215 	return NULL;
216 }
217 
218 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
219 {
220 	struct rb_node *n = root->rb_node;
221 	struct tree_entry *entry;
222 
223 	while (n) {
224 		entry = rb_entry(n, struct tree_entry, rb_node);
225 
226 		if (bytenr < entry->bytenr)
227 			n = n->rb_left;
228 		else if (bytenr > entry->bytenr)
229 			n = n->rb_right;
230 		else
231 			return n;
232 	}
233 	return NULL;
234 }
235 
236 /*
237  * walk up backref nodes until we reach the node that represents the tree root
238  */
239 static struct backref_node *walk_up_backref(struct backref_node *node,
240 					    struct backref_edge *edges[],
241 					    int *index)
242 {
243 	struct backref_edge *edge;
244 	int idx = *index;
245 
246 	while (!list_empty(&node->upper)) {
247 		edge = list_entry(node->upper.next,
248 				  struct backref_edge, list[LOWER]);
249 		edges[idx++] = edge;
250 		node = edge->node[UPPER];
251 	}
252 	*index = idx;
253 	return node;
254 }
255 
256 /*
257  * walk down backref nodes to find start of next reference path
258  */
259 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
260 					      int *index)
261 {
262 	struct backref_edge *edge;
263 	struct backref_node *lower;
264 	int idx = *index;
265 
266 	while (idx > 0) {
267 		edge = edges[idx - 1];
268 		lower = edge->node[LOWER];
269 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
270 			idx--;
271 			continue;
272 		}
273 		edge = list_entry(edge->list[LOWER].next,
274 				  struct backref_edge, list[LOWER]);
275 		edges[idx - 1] = edge;
276 		*index = idx;
277 		return edge->node[UPPER];
278 	}
279 	*index = 0;
280 	return NULL;
281 }
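
/*
 * Editor's illustration: the two walkers above enumerate every
 * reference path from a block up to the tree roots, with edges[]
 * acting as an explicit DFS stack.  __select_one_root() below uses
 * essentially this loop:
 *
 *	int index = 0;
 *	struct backref_node *next = node;
 *
 *	while (1) {
 *		next = walk_up_backref(next, edges, &index);
 *		... use next->root if set ...
 *		next = walk_down_backref(edges, &index);
 *		if (!next)
 *			break;
 *	}
 */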
282 
283 static void drop_node_buffer(struct backref_node *node)
284 {
285 	if (node->eb) {
286 		if (node->locked) {
287 			btrfs_tree_unlock(node->eb);
288 			node->locked = 0;
289 		}
290 		free_extent_buffer(node->eb);
291 		node->eb = NULL;
292 	}
293 }
294 
295 static void drop_backref_node(struct backref_cache *tree,
296 			      struct backref_node *node)
297 {
298 	BUG_ON(!node->lowest);
299 	BUG_ON(!list_empty(&node->upper));
300 
301 	drop_node_buffer(node);
302 	list_del(&node->lower);
303 
304 	rb_erase(&node->rb_node, &tree->rb_root);
305 	kfree(node);
306 }
307 
308 /*
309  * remove a backref node from the backref cache
310  */
311 static void remove_backref_node(struct backref_cache *cache,
312 				struct backref_node *node)
313 {
314 	struct backref_node *upper;
315 	struct backref_edge *edge;
316 
317 	if (!node)
318 		return;
319 
320 	BUG_ON(!node->lowest);
321 	while (!list_empty(&node->upper)) {
322 		edge = list_entry(node->upper.next, struct backref_edge,
323 				  list[LOWER]);
324 		upper = edge->node[UPPER];
325 		list_del(&edge->list[LOWER]);
326 		list_del(&edge->list[UPPER]);
327 		kfree(edge);
328 		/*
329 		 * add the node to the pending list if no other
330 		 * child block is cached.
331 		 */
332 		if (list_empty(&upper->lower)) {
333 			list_add_tail(&upper->lower,
334 				      &cache->pending[upper->level]);
335 			upper->lowest = 1;
336 		}
337 	}
338 	drop_backref_node(cache, node);
339 }
340 
341 /*
342  * find reloc tree by address of tree root
343  */
344 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
345 					  u64 bytenr)
346 {
347 	struct rb_node *rb_node;
348 	struct mapping_node *node;
349 	struct btrfs_root *root = NULL;
350 
351 	spin_lock(&rc->reloc_root_tree.lock);
352 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
353 	if (rb_node) {
354 		node = rb_entry(rb_node, struct mapping_node, rb_node);
355 		root = (struct btrfs_root *)node->data;
356 	}
357 	spin_unlock(&rc->reloc_root_tree.lock);
358 	return root;
359 }
360 
361 static int is_cowonly_root(u64 root_objectid)
362 {
363 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
364 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
365 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
366 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
367 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
368 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
369 		return 1;
370 	return 0;
371 }
372 
373 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
374 					u64 root_objectid)
375 {
376 	struct btrfs_key key;
377 
378 	key.objectid = root_objectid;
379 	key.type = BTRFS_ROOT_ITEM_KEY;
380 	if (is_cowonly_root(root_objectid))
381 		key.offset = 0;
382 	else
383 		key.offset = (u64)-1;
384 
385 	return btrfs_read_fs_root_no_name(fs_info, &key);
386 }
387 
388 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
389 static noinline_for_stack
390 struct btrfs_root *find_tree_root(struct reloc_control *rc,
391 				  struct extent_buffer *leaf,
392 				  struct btrfs_extent_ref_v0 *ref0)
393 {
394 	struct btrfs_root *root;
395 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
396 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
397 
398 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
399 
400 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
401 	BUG_ON(IS_ERR(root));
402 
403 	if (root->ref_cows &&
404 	    generation != btrfs_root_generation(&root->root_item))
405 		return NULL;
406 
407 	return root;
408 }
409 #endif
410 
411 static noinline_for_stack
412 int find_inline_backref(struct extent_buffer *leaf, int slot,
413 			unsigned long *ptr, unsigned long *end)
414 {
415 	struct btrfs_extent_item *ei;
416 	struct btrfs_tree_block_info *bi;
417 	u32 item_size;
418 
419 	item_size = btrfs_item_size_nr(leaf, slot);
420 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
421 	if (item_size < sizeof(*ei)) {
422 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
423 		return 1;
424 	}
425 #endif
426 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
427 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
428 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
429 
430 	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
431 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
432 		return 1;
433 	}
434 
435 	bi = (struct btrfs_tree_block_info *)(ei + 1);
436 	*ptr = (unsigned long)(bi + 1);
437 	*end = (unsigned long)ei + item_size;
438 	return 0;
439 }
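
/*
 * Editor's sketch of the item layout walked above (current format,
 * tree block extent items):
 *
 *	[ btrfs_extent_item | btrfs_tree_block_info | inline refs ... ]
 *	  ^ei                 ^bi                     ^*ptr        ^*end
 *
 * The caller then iterates the inline refs the way
 * build_backref_tree() does:
 *
 *	iref = (struct btrfs_extent_inline_ref *)ptr;
 *	key.type = btrfs_extent_inline_ref_type(eb, iref);
 *	ptr += btrfs_extent_inline_ref_size(key.type);
 */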
440 
441 /*
442  * build backref tree for a given tree block. root of the backref tree
443  * corresponds to the tree block, leaves of the backref tree correspond to
444  * roots of b-trees that reference the tree block.
445  *
446  * the basic idea of this function is to check backrefs of a given block
447  * to find upper level blocks that reference the block, and then check
448  * backrefs of these upper level blocks recursively. the recursion stops
449  * when the tree root is reached or backrefs for the block are cached.
450  *
451  * NOTE: if we find backrefs for a block are cached, we know backrefs
452  * for all upper level blocks that directly/indirectly reference the
453  * block are also cached.
454  */
455 static struct backref_node *build_backref_tree(struct reloc_control *rc,
456 					       struct backref_cache *cache,
457 					       struct btrfs_key *node_key,
458 					       int level, u64 bytenr)
459 {
460 	struct btrfs_path *path1;
461 	struct btrfs_path *path2;
462 	struct extent_buffer *eb;
463 	struct btrfs_root *root;
464 	struct backref_node *cur;
465 	struct backref_node *upper;
466 	struct backref_node *lower;
467 	struct backref_node *node = NULL;
468 	struct backref_node *exist = NULL;
469 	struct backref_edge *edge;
470 	struct rb_node *rb_node;
471 	struct btrfs_key key;
472 	unsigned long end;
473 	unsigned long ptr;
474 	LIST_HEAD(list);
475 	int ret;
476 	int err = 0;
477 
478 	path1 = btrfs_alloc_path();
479 	path2 = btrfs_alloc_path();
480 	if (!path1 || !path2) {
481 		err = -ENOMEM;
482 		goto out;
483 	}
484 
485 	node = kmalloc(sizeof(*node), GFP_NOFS);
486 	if (!node) {
487 		err = -ENOMEM;
488 		goto out;
489 	}
490 
491 	backref_node_init(node);
492 	node->bytenr = bytenr;
493 	node->owner = 0;
494 	node->level = level;
495 	node->lowest = 1;
496 	cur = node;
497 again:
498 	end = 0;
499 	ptr = 0;
500 	key.objectid = cur->bytenr;
501 	key.type = BTRFS_EXTENT_ITEM_KEY;
502 	key.offset = (u64)-1;
503 
504 	path1->search_commit_root = 1;
505 	path1->skip_locking = 1;
506 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
507 				0, 0);
508 	if (ret < 0) {
509 		err = ret;
510 		goto out;
511 	}
512 	BUG_ON(!ret || !path1->slots[0]);
513 
514 	path1->slots[0]--;
515 
516 	WARN_ON(cur->checked);
517 	if (!list_empty(&cur->upper)) {
518 		/*
519 		 * the backref was added previously when processing
520 		 * a backref of type BTRFS_TREE_BLOCK_REF_KEY
521 		 */
522 		BUG_ON(!list_is_singular(&cur->upper));
523 		edge = list_entry(cur->upper.next, struct backref_edge,
524 				  list[LOWER]);
525 		BUG_ON(!list_empty(&edge->list[UPPER]));
526 		exist = edge->node[UPPER];
527 		/*
528 		 * add the upper level block to the pending list if we
529 		 * need to check its backrefs
530 		 */
531 		if (!exist->checked)
532 			list_add_tail(&edge->list[UPPER], &list);
533 	} else {
534 		exist = NULL;
535 	}
536 
537 	while (1) {
538 		cond_resched();
539 		eb = path1->nodes[0];
540 
541 		if (ptr >= end) {
542 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
543 				ret = btrfs_next_leaf(rc->extent_root, path1);
544 				if (ret < 0) {
545 					err = ret;
546 					goto out;
547 				}
548 				if (ret > 0)
549 					break;
550 				eb = path1->nodes[0];
551 			}
552 
553 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
554 			if (key.objectid != cur->bytenr) {
555 				WARN_ON(exist);
556 				break;
557 			}
558 
559 			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
560 				ret = find_inline_backref(eb, path1->slots[0],
561 							  &ptr, &end);
562 				if (ret)
563 					goto next;
564 			}
565 		}
566 
567 		if (ptr < end) {
568 			/* update key for inline back ref */
569 			struct btrfs_extent_inline_ref *iref;
570 			iref = (struct btrfs_extent_inline_ref *)ptr;
571 			key.type = btrfs_extent_inline_ref_type(eb, iref);
572 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
573 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
574 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
575 		}
576 
577 		if (exist &&
578 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
579 		      exist->owner == key.offset) ||
580 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
581 		      exist->bytenr == key.offset))) {
582 			exist = NULL;
583 			goto next;
584 		}
585 
586 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 			if (key.objectid == key.offset &&
590 			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 				struct btrfs_extent_ref_v0 *ref0;
592 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 						struct btrfs_extent_ref_v0);
594 				root = find_tree_root(rc, eb, ref0);
595 				if (root)
596 					cur->root = root;
597 				else
598 					cur->old_root = 1;
599 				break;
600 			}
601 #else
602 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
603 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
604 #endif
605 			if (key.objectid == key.offset) {
606 				/*
607 				 * only root blocks of reloc trees use
608 				 * backref of this type.
609 				 */
610 				root = find_reloc_root(rc, cur->bytenr);
611 				BUG_ON(!root);
612 				cur->root = root;
613 				break;
614 			}
615 
616 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
617 			if (!edge) {
618 				err = -ENOMEM;
619 				goto out;
620 			}
621 			rb_node = tree_search(&cache->rb_root, key.offset);
622 			if (!rb_node) {
623 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
624 				if (!upper) {
625 					kfree(edge);
626 					err = -ENOMEM;
627 					goto out;
628 				}
629 				backref_node_init(upper);
630 				upper->bytenr = key.offset;
631 				upper->owner = 0;
632 				upper->level = cur->level + 1;
633 				/*
634 				 * backrefs for the upper level block aren't
635 				 * cached, add the block to the pending list
636 				 */
637 				list_add_tail(&edge->list[UPPER], &list);
638 			} else {
639 				upper = rb_entry(rb_node, struct backref_node,
640 						 rb_node);
641 				INIT_LIST_HEAD(&edge->list[UPPER]);
642 			}
643 			list_add(&edge->list[LOWER], &cur->upper);
644 			edge->node[UPPER] = upper;
645 			edge->node[LOWER] = cur;
646 
647 			goto next;
648 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
649 			goto next;
650 		}
651 
652 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
653 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
654 		if (IS_ERR(root)) {
655 			err = PTR_ERR(root);
656 			goto out;
657 		}
658 
659 		if (btrfs_root_level(&root->root_item) == cur->level) {
660 			/* tree root */
661 			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 			       cur->bytenr);
663 			cur->root = root;
664 			break;
665 		}
666 
667 		level = cur->level + 1;
668 
669 		/*
670 		 * search the tree to find upper level blocks
671 		 * that reference the block.
672 		 */
673 		path2->search_commit_root = 1;
674 		path2->skip_locking = 1;
675 		path2->lowest_level = level;
676 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
677 		path2->lowest_level = 0;
678 		if (ret < 0) {
679 			err = ret;
680 			goto out;
681 		}
682 		if (ret > 0 && path2->slots[level] > 0)
683 			path2->slots[level]--;
684 
685 		eb = path2->nodes[level];
686 		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
687 			cur->bytenr);
688 
689 		lower = cur;
690 		for (; level < BTRFS_MAX_LEVEL; level++) {
691 			if (!path2->nodes[level]) {
692 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 				       lower->bytenr);
694 				lower->root = root;
695 				break;
696 			}
697 
698 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
699 			if (!edge) {
700 				err = -ENOMEM;
701 				goto out;
702 			}
703 
704 			eb = path2->nodes[level];
705 			rb_node = tree_search(&cache->rb_root, eb->start);
706 			if (!rb_node) {
707 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
708 				if (!upper) {
709 					kfree(edge);
710 					err = -ENOMEM;
711 					goto out;
712 				}
713 				backref_node_init(upper);
714 				upper->bytenr = eb->start;
715 				upper->owner = btrfs_header_owner(eb);
716 				upper->level = lower->level + 1;
717 
718 				/*
719 				 * if we know the block isn't shared
720 				 * we can avoid checking its backrefs.
721 				 */
722 				if (btrfs_block_can_be_shared(root, eb))
723 					upper->checked = 0;
724 				else
725 					upper->checked = 1;
726 
727 				/*
728 				 * add the block to the pending list if we
729 				 * need to check its backrefs. only a block
730 				 * at 'cur->level + 1' is added to the
731 				 * tail of the pending list. this guarantees
732 				 * we check backrefs from lower level
733 				 * blocks to upper level blocks.
734 				 */
735 				if (!upper->checked &&
736 				    level == cur->level + 1) {
737 					list_add_tail(&edge->list[UPPER],
738 						      &list);
739 				} else
740 					INIT_LIST_HEAD(&edge->list[UPPER]);
741 			} else {
742 				upper = rb_entry(rb_node, struct backref_node,
743 						 rb_node);
744 				BUG_ON(!upper->checked);
745 				INIT_LIST_HEAD(&edge->list[UPPER]);
746 			}
747 			list_add_tail(&edge->list[LOWER], &lower->upper);
748 			edge->node[UPPER] = upper;
749 			edge->node[LOWER] = lower;
750 
751 			if (rb_node)
752 				break;
753 			lower = upper;
754 			upper = NULL;
755 		}
756 		btrfs_release_path(root, path2);
757 next:
758 		if (ptr < end) {
759 			ptr += btrfs_extent_inline_ref_size(key.type);
760 			if (ptr >= end) {
761 				WARN_ON(ptr > end);
762 				ptr = 0;
763 				end = 0;
764 			}
765 		}
766 		if (ptr >= end)
767 			path1->slots[0]++;
768 	}
769 	btrfs_release_path(rc->extent_root, path1);
770 
771 	cur->checked = 1;
772 	WARN_ON(exist);
773 
774 	/* the pending list isn't empty, take the first block to process */
775 	if (!list_empty(&list)) {
776 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
777 		list_del_init(&edge->list[UPPER]);
778 		cur = edge->node[UPPER];
779 		goto again;
780 	}
781 
782 	/*
783 	 * everything went well; connect the backref nodes and insert
784 	 * them into the cache.
785 	 */
786 	BUG_ON(!node->checked);
787 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
788 	BUG_ON(rb_node);
789 
790 	list_for_each_entry(edge, &node->upper, list[LOWER])
791 		list_add_tail(&edge->list[UPPER], &list);
792 
793 	while (!list_empty(&list)) {
794 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 		list_del_init(&edge->list[UPPER]);
796 		upper = edge->node[UPPER];
797 
798 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 			if (upper->lowest) {
800 				list_del_init(&upper->lower);
801 				upper->lowest = 0;
802 			}
803 
804 			list_add_tail(&edge->list[UPPER], &upper->lower);
805 			continue;
806 		}
807 
808 		BUG_ON(!upper->checked);
809 		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
810 				      &upper->rb_node);
811 		BUG_ON(rb_node);
812 
813 		list_add_tail(&edge->list[UPPER], &upper->lower);
814 
815 		list_for_each_entry(edge, &upper->upper, list[LOWER])
816 			list_add_tail(&edge->list[UPPER], &list);
817 	}
818 out:
819 	btrfs_free_path(path1);
820 	btrfs_free_path(path2);
821 	if (err) {
822 		INIT_LIST_HEAD(&list);
823 		upper = node;
824 		while (upper) {
825 			if (RB_EMPTY_NODE(&upper->rb_node)) {
826 				list_splice_tail(&upper->upper, &list);
827 				kfree(upper);
828 			}
829 
830 			if (list_empty(&list))
831 				break;
832 
833 			edge = list_entry(list.next, struct backref_edge,
834 					  list[LOWER]);
835 			upper = edge->node[UPPER];
836 			kfree(edge);
837 		}
838 		return ERR_PTR(err);
839 	}
840 	return node;
841 }
842 
843 /*
844  * helper to add 'address of tree root -> reloc tree' mapping
845  */
846 static int __add_reloc_root(struct btrfs_root *root)
847 {
848 	struct rb_node *rb_node;
849 	struct mapping_node *node;
850 	struct reloc_control *rc = root->fs_info->reloc_ctl;
851 
852 	node = kmalloc(sizeof(*node), GFP_NOFS);
853 	BUG_ON(!node);
854 
855 	node->bytenr = root->node->start;
856 	node->data = root;
857 
858 	spin_lock(&rc->reloc_root_tree.lock);
859 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
860 			      node->bytenr, &node->rb_node);
861 	spin_unlock(&rc->reloc_root_tree.lock);
862 	BUG_ON(rb_node);
863 
864 	list_add_tail(&root->root_list, &rc->reloc_roots);
865 	return 0;
866 }
867 
868 /*
869  * helper to update/delete the 'address of tree root -> reloc tree'
870  * mapping
871  */
872 static int __update_reloc_root(struct btrfs_root *root, int del)
873 {
874 	struct rb_node *rb_node;
875 	struct mapping_node *node = NULL;
876 	struct reloc_control *rc = root->fs_info->reloc_ctl;
877 
878 	spin_lock(&rc->reloc_root_tree.lock);
879 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
880 			      root->commit_root->start);
881 	if (rb_node) {
882 		node = rb_entry(rb_node, struct mapping_node, rb_node);
883 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
884 	}
885 	spin_unlock(&rc->reloc_root_tree.lock);
886 
887 	BUG_ON((struct btrfs_root *)node->data != root);
888 
889 	if (!del) {
890 		spin_lock(&rc->reloc_root_tree.lock);
891 		node->bytenr = root->node->start;
892 		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
893 				      node->bytenr, &node->rb_node);
894 		spin_unlock(&rc->reloc_root_tree.lock);
895 		BUG_ON(rb_node);
896 	} else {
897 		list_del_init(&root->root_list);
898 		kfree(node);
899 	}
900 	return 0;
901 }
902 
903 /*
904  * create reloc tree for a given fs tree. reloc tree is just a
905  * snapshot of the fs tree with special root objectid.
906  */
907 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 			  struct btrfs_root *root)
909 {
910 	struct btrfs_root *reloc_root;
911 	struct extent_buffer *eb;
912 	struct btrfs_root_item *root_item;
913 	struct btrfs_key root_key;
914 	int ret;
915 
916 	if (root->reloc_root) {
917 		reloc_root = root->reloc_root;
918 		reloc_root->last_trans = trans->transid;
919 		return 0;
920 	}
921 
922 	if (!root->fs_info->reloc_ctl ||
923 	    !root->fs_info->reloc_ctl->create_reloc_root ||
924 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 		return 0;
926 
927 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 	BUG_ON(!root_item);
929 
930 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 	root_key.type = BTRFS_ROOT_ITEM_KEY;
932 	root_key.offset = root->root_key.objectid;
933 
934 	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
935 			      BTRFS_TREE_RELOC_OBJECTID);
936 	BUG_ON(ret);
937 
938 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 	memcpy(root_item, &root->root_item, sizeof(*root_item));
940 	btrfs_set_root_refs(root_item, 1);
941 	btrfs_set_root_bytenr(root_item, eb->start);
942 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 	btrfs_set_root_generation(root_item, trans->transid);
944 	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
945 	root_item->drop_level = 0;
946 
947 	btrfs_tree_unlock(eb);
948 	free_extent_buffer(eb);
949 
950 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
951 				&root_key, root_item);
952 	BUG_ON(ret);
953 	kfree(root_item);
954 
955 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
956 						 &root_key);
957 	BUG_ON(IS_ERR(reloc_root));
958 	reloc_root->last_trans = trans->transid;
959 
960 	__add_reloc_root(reloc_root);
961 	root->reloc_root = reloc_root;
962 	return 0;
963 }
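
/*
 * Editor's note, with the resulting key for reference: a reloc root
 * is keyed as
 *
 *	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
 *	root_key.offset   = objectid of the fs root it shadows;
 *
 * which is why merge_func() and record_reloc_root_in_trans() can
 * recover the fs root with
 * read_fs_root(fs_info, reloc_root->root_key.offset).
 */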
964 
965 /*
966  * update root item of reloc tree
967  */
968 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
969 			    struct btrfs_root *root)
970 {
971 	struct btrfs_root *reloc_root;
972 	struct btrfs_root_item *root_item;
973 	int del = 0;
974 	int ret;
975 
976 	if (!root->reloc_root)
977 		return 0;
978 
979 	reloc_root = root->reloc_root;
980 	root_item = &reloc_root->root_item;
981 
982 	if (btrfs_root_refs(root_item) == 0) {
983 		root->reloc_root = NULL;
984 		del = 1;
985 	}
986 
987 	__update_reloc_root(reloc_root, del);
988 
989 	if (reloc_root->commit_root != reloc_root->node) {
990 		btrfs_set_root_node(root_item, reloc_root->node);
991 		free_extent_buffer(reloc_root->commit_root);
992 		reloc_root->commit_root = btrfs_root_node(reloc_root);
993 	}
994 
995 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
996 				&reloc_root->root_key, root_item);
997 	BUG_ON(ret);
998 	return 0;
999 }
1000 
1001 /*
1002  * helper to find first cached inode with inode number >= objectid
1003  * in a subvolume
1004  */
1005 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1006 {
1007 	struct rb_node *node;
1008 	struct rb_node *prev;
1009 	struct btrfs_inode *entry;
1010 	struct inode *inode;
1011 
1012 	spin_lock(&root->inode_lock);
1013 again:
1014 	node = root->inode_tree.rb_node;
1015 	prev = NULL;
1016 	while (node) {
1017 		prev = node;
1018 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1019 
1020 		if (objectid < entry->vfs_inode.i_ino)
1021 			node = node->rb_left;
1022 		else if (objectid > entry->vfs_inode.i_ino)
1023 			node = node->rb_right;
1024 		else
1025 			break;
1026 	}
1027 	if (!node) {
1028 		while (prev) {
1029 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1030 			if (objectid <= entry->vfs_inode.i_ino) {
1031 				node = prev;
1032 				break;
1033 			}
1034 			prev = rb_next(prev);
1035 		}
1036 	}
1037 	while (node) {
1038 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1039 		inode = igrab(&entry->vfs_inode);
1040 		if (inode) {
1041 			spin_unlock(&root->inode_lock);
1042 			return inode;
1043 		}
1044 
1045 		objectid = entry->vfs_inode.i_ino + 1;
1046 		if (cond_resched_lock(&root->inode_lock))
1047 			goto again;
1048 
1049 		node = rb_next(node);
1050 	}
1051 	spin_unlock(&root->inode_lock);
1052 	return NULL;
1053 }
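
/*
 * Editor's illustration (hypothetical caller, mirroring
 * invalidate_extent_cache() below): walk all cached inodes of a
 * subvolume in inode-number order:
 *
 *	u64 objectid = 0;
 *	struct inode *inode;
 *
 *	while ((inode = find_next_inode(root, objectid))) {
 *		objectid = inode->i_ino + 1;
 *		... use inode ...
 *		iput(inode);
 *	}
 */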
1054 
1055 static int in_block_group(u64 bytenr,
1056 			  struct btrfs_block_group_cache *block_group)
1057 {
1058 	if (bytenr >= block_group->key.objectid &&
1059 	    bytenr < block_group->key.objectid + block_group->key.offset)
1060 		return 1;
1061 	return 0;
1062 }
1063 
1064 /*
1065  * get new location of data
1066  */
1067 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1068 			    u64 bytenr, u64 num_bytes)
1069 {
1070 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1071 	struct btrfs_path *path;
1072 	struct btrfs_file_extent_item *fi;
1073 	struct extent_buffer *leaf;
1074 	int ret;
1075 
1076 	path = btrfs_alloc_path();
1077 	if (!path)
1078 		return -ENOMEM;
1079 
1080 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1081 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1082 				       bytenr, 0);
1083 	if (ret < 0)
1084 		goto out;
1085 	if (ret > 0) {
1086 		ret = -ENOENT;
1087 		goto out;
1088 	}
1089 
1090 	leaf = path->nodes[0];
1091 	fi = btrfs_item_ptr(leaf, path->slots[0],
1092 			    struct btrfs_file_extent_item);
1093 
1094 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1095 	       btrfs_file_extent_compression(leaf, fi) ||
1096 	       btrfs_file_extent_encryption(leaf, fi) ||
1097 	       btrfs_file_extent_other_encoding(leaf, fi));
1098 
1099 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1100 		ret = 1;
1101 		goto out;
1102 	}
1103 
1104 	if (new_bytenr)
1105 		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 	ret = 0;
1107 out:
1108 	btrfs_free_path(path);
1109 	return ret;
1110 }
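
/*
 * Editor's note (inferred from the subtraction above): the reloc data
 * inode's file offsets track the block group, with index_cnt holding
 * the block group's start bytenr.  An extent that lived at disk
 * bytenr X was copied to file offset X - index_cnt, and the file
 * extent item found there carries the new disk bytenr:
 *
 *	u64 new_bytenr;
 *	ret = get_new_location(rc->data_inode, &new_bytenr,
 *			       bytenr, num_bytes);
 *	if (ret == 0)
 *		... rewrite bytenr to new_bytenr ...
 */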
1111 
1112 /*
1113  * update file extent items in the tree leaf to point to
1114  * the new locations.
1115  */
1116 static int replace_file_extents(struct btrfs_trans_handle *trans,
1117 				struct reloc_control *rc,
1118 				struct btrfs_root *root,
1119 				struct extent_buffer *leaf,
1120 				struct list_head *inode_list)
1121 {
1122 	struct btrfs_key key;
1123 	struct btrfs_file_extent_item *fi;
1124 	struct inode *inode = NULL;
1125 	struct inodevec *ivec = NULL;
1126 	u64 parent;
1127 	u64 bytenr;
1128 	u64 new_bytenr;
1129 	u64 num_bytes;
1130 	u64 end;
1131 	u32 nritems;
1132 	u32 i;
1133 	int ret;
1134 	int first = 1;
1135 	int dirty = 0;
1136 
1137 	if (rc->stage != UPDATE_DATA_PTRS)
1138 		return 0;
1139 
1140 	/* reloc trees always use full backref */
1141 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1142 		parent = leaf->start;
1143 	else
1144 		parent = 0;
1145 
1146 	nritems = btrfs_header_nritems(leaf);
1147 	for (i = 0; i < nritems; i++) {
1148 		cond_resched();
1149 		btrfs_item_key_to_cpu(leaf, &key, i);
1150 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1151 			continue;
1152 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1153 		if (btrfs_file_extent_type(leaf, fi) ==
1154 		    BTRFS_FILE_EXTENT_INLINE)
1155 			continue;
1156 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1157 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1158 		if (bytenr == 0)
1159 			continue;
1160 		if (!in_block_group(bytenr, rc->block_group))
1161 			continue;
1162 
1163 		/*
1164 		 * if we are modifying block in fs tree, wait for readpage
1165 		 * to complete and drop the extent cache
1166 		 */
1167 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 			if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 				BUG_ON(!ivec);
1171 				ivec->nr = 0;
1172 				list_add_tail(&ivec->list, inode_list);
1173 			}
1174 			if (first) {
1175 				inode = find_next_inode(root, key.objectid);
1176 				if (inode)
1177 					ivec->inode[ivec->nr++] = inode;
1178 				first = 0;
1179 			} else if (inode && inode->i_ino < key.objectid) {
1180 				inode = find_next_inode(root, key.objectid);
1181 				if (inode)
1182 					ivec->inode[ivec->nr++] = inode;
1183 			}
1184 			if (inode && inode->i_ino == key.objectid) {
1185 				end = key.offset +
1186 				      btrfs_file_extent_num_bytes(leaf, fi);
1187 				WARN_ON(!IS_ALIGNED(key.offset,
1188 						    root->sectorsize));
1189 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1190 				end--;
1191 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1192 						      key.offset, end,
1193 						      GFP_NOFS);
1194 				if (!ret)
1195 					continue;
1196 
1197 				btrfs_drop_extent_cache(inode, key.offset, end,
1198 							1);
1199 				unlock_extent(&BTRFS_I(inode)->io_tree,
1200 					      key.offset, end, GFP_NOFS);
1201 			}
1202 		}
1203 
1204 		ret = get_new_location(rc->data_inode, &new_bytenr,
1205 				       bytenr, num_bytes);
1206 		if (ret > 0)
1207 			continue;
1208 		BUG_ON(ret < 0);
1209 
1210 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1211 		dirty = 1;
1212 
1213 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1214 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1215 					   num_bytes, parent,
1216 					   btrfs_header_owner(leaf),
1217 					   key.objectid, key.offset);
1218 		BUG_ON(ret);
1219 
1220 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1221 					parent, btrfs_header_owner(leaf),
1222 					key.objectid, key.offset);
1223 		BUG_ON(ret);
1224 	}
1225 	if (dirty)
1226 		btrfs_mark_buffer_dirty(leaf);
1227 	return 0;
1228 }
1229 
1230 static noinline_for_stack
1231 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1232 		     struct btrfs_path *path, int level)
1233 {
1234 	struct btrfs_disk_key key1;
1235 	struct btrfs_disk_key key2;
1236 	btrfs_node_key(eb, &key1, slot);
1237 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1238 	return memcmp(&key1, &key2, sizeof(key1));
1239 }
1240 
1241 /*
1242  * try to replace tree blocks in fs tree with the new blocks
1243  * in reloc tree. tree blocks that haven't been modified since the
1244  * reloc tree was created can be replaced.
1245  *
1246  * if a block was replaced, level of the block + 1 is returned.
1247  * if no block got replaced, 0 is returned. if there are other
1248  * errors, a negative error number is returned.
1249  */
1250 static int replace_path(struct btrfs_trans_handle *trans,
1251 			struct btrfs_root *dest, struct btrfs_root *src,
1252 			struct btrfs_path *path, struct btrfs_key *next_key,
1253 			struct extent_buffer **leaf,
1254 			int lowest_level, int max_level)
1255 {
1256 	struct extent_buffer *eb;
1257 	struct extent_buffer *parent;
1258 	struct btrfs_key key;
1259 	u64 old_bytenr;
1260 	u64 new_bytenr;
1261 	u64 old_ptr_gen;
1262 	u64 new_ptr_gen;
1263 	u64 last_snapshot;
1264 	u32 blocksize;
1265 	int level;
1266 	int ret;
1267 	int slot;
1268 
1269 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 	BUG_ON(lowest_level > 1 && leaf);
1272 
1273 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 
1275 	slot = path->slots[lowest_level];
1276 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 
1278 	eb = btrfs_lock_root_node(dest);
1279 	btrfs_set_lock_blocking(eb);
1280 	level = btrfs_header_level(eb);
1281 
1282 	if (level < lowest_level) {
1283 		btrfs_tree_unlock(eb);
1284 		free_extent_buffer(eb);
1285 		return 0;
1286 	}
1287 
1288 	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1289 	BUG_ON(ret);
1290 	btrfs_set_lock_blocking(eb);
1291 
1292 	if (next_key) {
1293 		next_key->objectid = (u64)-1;
1294 		next_key->type = (u8)-1;
1295 		next_key->offset = (u64)-1;
1296 	}
1297 
1298 	parent = eb;
1299 	while (1) {
1300 		level = btrfs_header_level(parent);
1301 		BUG_ON(level < lowest_level);
1302 
1303 		ret = btrfs_bin_search(parent, &key, level, &slot);
1304 		if (ret && slot > 0)
1305 			slot--;
1306 
1307 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1308 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1309 
1310 		old_bytenr = btrfs_node_blockptr(parent, slot);
1311 		blocksize = btrfs_level_size(dest, level - 1);
1312 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1313 
1314 		if (level <= max_level) {
1315 			eb = path->nodes[level];
1316 			new_bytenr = btrfs_node_blockptr(eb,
1317 							path->slots[level]);
1318 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1319 							path->slots[level]);
1320 		} else {
1321 			new_bytenr = 0;
1322 			new_ptr_gen = 0;
1323 		}
1324 
1325 		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1326 			WARN_ON(1);
1327 			ret = level;
1328 			break;
1329 		}
1330 
1331 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 		    memcmp_node_keys(parent, slot, path, level)) {
1333 			if (level <= lowest_level && !leaf) {
1334 				ret = 0;
1335 				break;
1336 			}
1337 
1338 			eb = read_tree_block(dest, old_bytenr, blocksize,
1339 					     old_ptr_gen);
1340 			btrfs_tree_lock(eb);
1341 			ret = btrfs_cow_block(trans, dest, eb, parent,
1342 					      slot, &eb);
1343 			BUG_ON(ret);
1344 			btrfs_set_lock_blocking(eb);
1345 
1346 			if (level <= lowest_level) {
1347 				*leaf = eb;
1348 				ret = 0;
1349 				break;
1350 			}
1351 
1352 			btrfs_tree_unlock(parent);
1353 			free_extent_buffer(parent);
1354 
1355 			parent = eb;
1356 			continue;
1357 		}
1358 
1359 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 				      path->slots[level]);
1361 		btrfs_release_path(src, path);
1362 
1363 		path->lowest_level = level;
1364 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1365 		path->lowest_level = 0;
1366 		BUG_ON(ret);
1367 
1368 		/*
1369 		 * swap blocks in fs tree and reloc tree.
1370 		 */
1371 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1372 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1373 		btrfs_mark_buffer_dirty(parent);
1374 
1375 		btrfs_set_node_blockptr(path->nodes[level],
1376 					path->slots[level], old_bytenr);
1377 		btrfs_set_node_ptr_generation(path->nodes[level],
1378 					      path->slots[level], old_ptr_gen);
1379 		btrfs_mark_buffer_dirty(path->nodes[level]);
1380 
1381 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1382 					path->nodes[level]->start,
1383 					src->root_key.objectid, level - 1, 0);
1384 		BUG_ON(ret);
1385 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1386 					0, dest->root_key.objectid, level - 1,
1387 					0);
1388 		BUG_ON(ret);
1389 
1390 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1391 					path->nodes[level]->start,
1392 					src->root_key.objectid, level - 1, 0);
1393 		BUG_ON(ret);
1394 
1395 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1396 					0, dest->root_key.objectid, level - 1,
1397 					0);
1398 		BUG_ON(ret);
1399 
1400 		btrfs_unlock_up_safe(path, 0);
1401 
1402 		ret = level;
1403 		break;
1404 	}
1405 	btrfs_tree_unlock(parent);
1406 	free_extent_buffer(parent);
1407 	return ret;
1408 }
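
/*
 * Editor's sketch of a successful pass above: the two trees swap
 * block pointers at the chosen slots,
 *
 *	fs tree parent slot:    old_bytenr -> new_bytenr
 *	reloc tree node slot:   new_bytenr -> old_bytenr
 *
 * followed by two btrfs_inc_extent_ref() and two btrfs_free_extent()
 * calls so each block's backrefs follow it into its new tree.
 */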
1409 
1410 /*
1411  * helper to find next relocated block in reloc tree
1412  */
1413 static noinline_for_stack
1414 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1415 		       int *level)
1416 {
1417 	struct extent_buffer *eb;
1418 	int i;
1419 	u64 last_snapshot;
1420 	u32 nritems;
1421 
1422 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423 
1424 	for (i = 0; i < *level; i++) {
1425 		free_extent_buffer(path->nodes[i]);
1426 		path->nodes[i] = NULL;
1427 	}
1428 
1429 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1430 		eb = path->nodes[i];
1431 		nritems = btrfs_header_nritems(eb);
1432 		while (path->slots[i] + 1 < nritems) {
1433 			path->slots[i]++;
1434 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1435 			    last_snapshot)
1436 				continue;
1437 
1438 			*level = i;
1439 			return 0;
1440 		}
1441 		free_extent_buffer(path->nodes[i]);
1442 		path->nodes[i] = NULL;
1443 	}
1444 	return 1;
1445 }
1446 
1447 /*
1448  * walk down reloc tree to find relocated block of lowest level
1449  */
1450 static noinline_for_stack
1451 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1452 			 int *level)
1453 {
1454 	struct extent_buffer *eb = NULL;
1455 	int i;
1456 	u64 bytenr;
1457 	u64 ptr_gen = 0;
1458 	u64 last_snapshot;
1459 	u32 blocksize;
1460 	u32 nritems;
1461 
1462 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1463 
1464 	for (i = *level; i > 0; i--) {
1465 		eb = path->nodes[i];
1466 		nritems = btrfs_header_nritems(eb);
1467 		while (path->slots[i] < nritems) {
1468 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1469 			if (ptr_gen > last_snapshot)
1470 				break;
1471 			path->slots[i]++;
1472 		}
1473 		if (path->slots[i] >= nritems) {
1474 			if (i == *level)
1475 				break;
1476 			*level = i + 1;
1477 			return 0;
1478 		}
1479 		if (i == 1) {
1480 			*level = i;
1481 			return 0;
1482 		}
1483 
1484 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1485 		blocksize = btrfs_level_size(root, i - 1);
1486 		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1487 		BUG_ON(btrfs_header_level(eb) != i - 1);
1488 		path->nodes[i - 1] = eb;
1489 		path->slots[i - 1] = 0;
1490 	}
1491 	return 1;
1492 }
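
/*
 * Editor's illustration: merge_reloc_root() below drives the two
 * walkers roughly like this, visiting only subtrees whose pointer
 * generation is newer than last_snapshot (i.e. relocated blocks):
 *
 *	while (walk_down_reloc_tree(reloc_root, path, &level) == 0) {
 *		... replace blocks at 'level' in the fs tree ...
 *		if (walk_up_reloc_tree(reloc_root, path, &level))
 *			break;
 *	}
 */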
1493 
1494 /*
1495  * invalidate extent cache for file extents whose keys are in the
1496  * range [min_key, max_key)
1497  */
1498 static int invalidate_extent_cache(struct btrfs_root *root,
1499 				   struct btrfs_key *min_key,
1500 				   struct btrfs_key *max_key)
1501 {
1502 	struct inode *inode = NULL;
1503 	u64 objectid;
1504 	u64 start, end;
1505 
1506 	objectid = min_key->objectid;
1507 	while (1) {
1508 		cond_resched();
1509 		iput(inode);
1510 
1511 		if (objectid > max_key->objectid)
1512 			break;
1513 
1514 		inode = find_next_inode(root, objectid);
1515 		if (!inode)
1516 			break;
1517 
1518 		if (inode->i_ino > max_key->objectid) {
1519 			iput(inode);
1520 			break;
1521 		}
1522 
1523 		objectid = inode->i_ino + 1;
1524 		if (!S_ISREG(inode->i_mode))
1525 			continue;
1526 
1527 		if (unlikely(min_key->objectid == inode->i_ino)) {
1528 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1529 				continue;
1530 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1531 				start = 0;
1532 			else {
1533 				start = min_key->offset;
1534 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1535 			}
1536 		} else {
1537 			start = 0;
1538 		}
1539 
1540 		if (unlikely(max_key->objectid == inode->i_ino)) {
1541 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1542 				continue;
1543 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1544 				end = (u64)-1;
1545 			} else {
1546 				if (max_key->offset == 0)
1547 					continue;
1548 				end = max_key->offset;
1549 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1550 				end--;
1551 			}
1552 		} else {
1553 			end = (u64)-1;
1554 		}
1555 
1556 		/* the lock_extent waits for readpage to complete */
1557 		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1558 		btrfs_drop_extent_cache(inode, start, end, 1);
1559 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1560 	}
1561 	return 0;
1562 }
1563 
1564 static void put_inodes(struct list_head *list)
1565 {
1566 	struct inodevec *ivec;
1567 	while (!list_empty(list)) {
1568 		ivec = list_entry(list->next, struct inodevec, list);
1569 		list_del(&ivec->list);
1570 		while (ivec->nr > 0) {
1571 			ivec->nr--;
1572 			iput(ivec->inode[ivec->nr]);
1573 		}
1574 		kfree(ivec);
1575 	}
1576 }
1577 
1578 static int find_next_key(struct btrfs_path *path, int level,
1579 			 struct btrfs_key *key)
1581 {
1582 	while (level < BTRFS_MAX_LEVEL) {
1583 		if (!path->nodes[level])
1584 			break;
1585 		if (path->slots[level] + 1 <
1586 		    btrfs_header_nritems(path->nodes[level])) {
1587 			btrfs_node_key_to_cpu(path->nodes[level], key,
1588 					      path->slots[level] + 1);
1589 			return 0;
1590 		}
1591 		level++;
1592 	}
1593 	return 1;
1594 }
1595 
1596 /*
1597  * merge the relocated tree blocks in reloc tree with corresponding
1598  * fs tree.
1599  */
1600 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1601 					       struct btrfs_root *root)
1602 {
1603 	LIST_HEAD(inode_list);
1604 	struct btrfs_key key;
1605 	struct btrfs_key next_key;
1606 	struct btrfs_trans_handle *trans;
1607 	struct btrfs_root *reloc_root;
1608 	struct btrfs_root_item *root_item;
1609 	struct btrfs_path *path;
1610 	struct extent_buffer *leaf = NULL;
1611 	unsigned long nr;
1612 	int level;
1613 	int max_level;
1614 	int replaced = 0;
1615 	int ret;
1616 	int err = 0;
1617 
1618 	path = btrfs_alloc_path();
1619 	if (!path)
1620 		return -ENOMEM;
1621 
1622 	reloc_root = root->reloc_root;
1623 	root_item = &reloc_root->root_item;
1624 
1625 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1626 		level = btrfs_root_level(root_item);
1627 		extent_buffer_get(reloc_root->node);
1628 		path->nodes[level] = reloc_root->node;
1629 		path->slots[level] = 0;
1630 	} else {
1631 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1632 
1633 		level = root_item->drop_level;
1634 		BUG_ON(level == 0);
1635 		path->lowest_level = level;
1636 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1637 		path->lowest_level = 0;
1638 		if (ret < 0) {
1639 			btrfs_free_path(path);
1640 			return ret;
1641 		}
1642 
1643 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1644 				      path->slots[level]);
1645 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1646 
1647 		btrfs_unlock_up_safe(path, 0);
1648 	}
1649 
1650 	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1651 		trans = btrfs_start_transaction(root, 1);
1652 
1653 		leaf = path->nodes[0];
1654 		btrfs_item_key_to_cpu(leaf, &key, 0);
1655 		btrfs_release_path(reloc_root, path);
1656 
1657 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1658 		if (ret < 0) {
1659 			err = ret;
1660 			goto out;
1661 		}
1662 
1663 		leaf = path->nodes[0];
1664 		btrfs_unlock_up_safe(path, 1);
1665 		ret = replace_file_extents(trans, rc, root, leaf,
1666 					   &inode_list);
1667 		if (ret < 0)
1668 			err = ret;
1669 		goto out;
1670 	}
1671 
1672 	memset(&next_key, 0, sizeof(next_key));
1673 
1674 	while (1) {
1675 		leaf = NULL;
1676 		replaced = 0;
1677 		trans = btrfs_start_transaction(root, 1);
1678 		max_level = level;
1679 
1680 		ret = walk_down_reloc_tree(reloc_root, path, &level);
1681 		if (ret < 0) {
1682 			err = ret;
1683 			goto out;
1684 		}
1685 		if (ret > 0)
1686 			break;
1687 
1688 		if (!find_next_key(path, level, &key) &&
1689 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 			ret = 0;
1691 		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 			ret = replace_path(trans, root, reloc_root,
1693 					   path, &next_key, &leaf,
1694 					   level, max_level);
1695 		} else {
1696 			ret = replace_path(trans, root, reloc_root,
1697 					   path, &next_key, NULL,
1698 					   level, max_level);
1699 		}
1700 		if (ret < 0) {
1701 			err = ret;
1702 			goto out;
1703 		}
1704 
1705 		if (ret > 0) {
1706 			level = ret;
1707 			btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 					      path->slots[level]);
1709 			replaced = 1;
1710 		} else if (leaf) {
1711 			/*
1712 			 * no block got replaced, try replacing file extents
1713 			 */
1714 			btrfs_item_key_to_cpu(leaf, &key, 0);
1715 			ret = replace_file_extents(trans, rc, root, leaf,
1716 						   &inode_list);
1717 			btrfs_tree_unlock(leaf);
1718 			free_extent_buffer(leaf);
1719 			BUG_ON(ret < 0);
1720 		}
1721 
1722 		ret = walk_up_reloc_tree(reloc_root, path, &level);
1723 		if (ret > 0)
1724 			break;
1725 
1726 		BUG_ON(level == 0);
1727 		/*
1728 		 * save the merging progress in the drop_progress.
1729 		 * this is OK since root refs == 1 in this case.
1730 		 */
1731 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1732 			       path->slots[level]);
1733 		root_item->drop_level = level;
1734 
1735 		nr = trans->blocks_used;
1736 		btrfs_end_transaction(trans, root);
1737 
1738 		btrfs_btree_balance_dirty(root, nr);
1739 
1740 		/*
1741 		 * put inodes outside transaction, otherwise we may deadlock.
1742 		 */
1743 		put_inodes(&inode_list);
1744 
1745 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 			invalidate_extent_cache(root, &key, &next_key);
1747 	}
1748 
1749 	/*
1750 	 * handle the case where only one block in the fs tree needs to
1751 	 * be relocated and that block is the tree root.
1752 	 */
1753 	leaf = btrfs_lock_root_node(root);
1754 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1755 	btrfs_tree_unlock(leaf);
1756 	free_extent_buffer(leaf);
1757 	if (ret < 0)
1758 		err = ret;
1759 out:
1760 	btrfs_free_path(path);
1761 
1762 	if (err == 0) {
1763 		memset(&root_item->drop_progress, 0,
1764 		       sizeof(root_item->drop_progress));
1765 		root_item->drop_level = 0;
1766 		btrfs_set_root_refs(root_item, 0);
1767 	}
1768 
1769 	nr = trans->blocks_used;
1770 	btrfs_end_transaction(trans, root);
1771 
1772 	btrfs_btree_balance_dirty(root, nr);
1773 
1774 	put_inodes(&inode_list);
1775 
1776 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 		invalidate_extent_cache(root, &key, &next_key);
1778 
1779 	return err;
1780 }
1781 
1782 /*
1783  * callback for the work threads.
1784  * this function merges reloc tree with corresponding fs tree,
1785  * and then drops the reloc tree.
1786  */
1787 static void merge_func(struct btrfs_work *work)
1788 {
1789 	struct btrfs_trans_handle *trans;
1790 	struct btrfs_root *root;
1791 	struct btrfs_root *reloc_root;
1792 	struct async_merge *async;
1793 
1794 	async = container_of(work, struct async_merge, work);
1795 	reloc_root = async->root;
1796 
1797 	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 		root = read_fs_root(reloc_root->fs_info,
1799 				    reloc_root->root_key.offset);
1800 		BUG_ON(IS_ERR(root));
1801 		BUG_ON(root->reloc_root != reloc_root);
1802 
1803 		merge_reloc_root(async->rc, root);
1804 
1805 		trans = btrfs_start_transaction(root, 1);
1806 		btrfs_update_reloc_root(trans, root);
1807 		btrfs_end_transaction(trans, root);
1808 	}
1809 
1810 	btrfs_drop_snapshot(reloc_root, 0);
1811 
1812 	if (atomic_dec_and_test(async->num_pending))
1813 		complete(async->done);
1814 
1815 	kfree(async);
1816 }
1817 
1818 static int merge_reloc_roots(struct reloc_control *rc)
1819 {
1820 	struct async_merge *async;
1821 	struct btrfs_root *root;
1822 	struct completion done;
1823 	atomic_t num_pending;
1824 
1825 	init_completion(&done);
1826 	atomic_set(&num_pending, 1);
1827 
1828 	while (!list_empty(&rc->reloc_roots)) {
1829 		root = list_entry(rc->reloc_roots.next,
1830 				  struct btrfs_root, root_list);
1831 		list_del_init(&root->root_list);
1832 
1833 		async = kmalloc(sizeof(*async), GFP_NOFS);
1834 		BUG_ON(!async);
1835 		async->work.func = merge_func;
1836 		async->work.flags = 0;
1837 		async->rc = rc;
1838 		async->root = root;
1839 		async->done = &done;
1840 		async->num_pending = &num_pending;
1841 		atomic_inc(&num_pending);
1842 		btrfs_queue_worker(&rc->workers, &async->work);
1843 	}
1844 
1845 	if (!atomic_dec_and_test(&num_pending))
1846 		wait_for_completion(&done);
1847 
1848 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 	return 0;
1850 }
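
/*
 * Editor's note on the counting scheme above (a common kernel
 * pattern): num_pending starts at 1 so the completion cannot fire
 * while items are still being queued; every queued work takes a
 * reference that merge_func() drops.  The final atomic_dec_and_test()
 * drops the initial reference, and wait_for_completion() only sleeps
 * if a worker is still running.
 */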
1851 
1852 static void free_block_list(struct rb_root *blocks)
1853 {
1854 	struct tree_block *block;
1855 	struct rb_node *rb_node;
1856 	while ((rb_node = rb_first(blocks))) {
1857 		block = rb_entry(rb_node, struct tree_block, rb_node);
1858 		rb_erase(rb_node, blocks);
1859 		kfree(block);
1860 	}
1861 }
1862 
1863 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1864 				      struct btrfs_root *reloc_root)
1865 {
1866 	struct btrfs_root *root;
1867 
1868 	if (reloc_root->last_trans == trans->transid)
1869 		return 0;
1870 
1871 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1872 	BUG_ON(IS_ERR(root));
1873 	BUG_ON(root->reloc_root != reloc_root);
1874 
1875 	return btrfs_record_root_in_trans(trans, root);
1876 }
1877 
1878 /*
1879  * select one tree from the trees that reference the block.
1880  * for blocks in reference counted trees, we prefer the reloc tree.
1881  * if no reloc tree is found and reloc_only is true, NULL is returned.
1882  */
1883 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 					    struct backref_node *node,
1885 					    struct backref_edge *edges[],
1886 					    int *nr, int reloc_only)
1887 {
1888 	struct backref_node *next;
1889 	struct btrfs_root *root;
1890 	int index;
1891 	int loop = 0;
1892 again:
1893 	index = 0;
1894 	next = node;
1895 	while (1) {
1896 		cond_resched();
1897 		next = walk_up_backref(next, edges, &index);
1898 		root = next->root;
1899 		if (!root) {
1900 			BUG_ON(!node->old_root);
1901 			goto skip;
1902 		}
1903 
1904 		/* no other choice for a non-reference counted tree */
1905 		if (!root->ref_cows) {
1906 			BUG_ON(reloc_only);
1907 			break;
1908 		}
1909 
1910 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 			record_reloc_root_in_trans(trans, root);
1912 			break;
1913 		}
1914 
1915 		if (loop) {
1916 			btrfs_record_root_in_trans(trans, root);
1917 			break;
1918 		}
1919 
1920 		if (reloc_only || next != node) {
1921 			if (!root->reloc_root)
1922 				btrfs_record_root_in_trans(trans, root);
1923 			root = root->reloc_root;
1924 			/*
1925 			 * if the reloc tree was created in the current
1926 			 * transaction, there is no node in the backref
1927 			 * tree that corresponds to the root of the reloc tree.
1928 			 */
1929 			if (btrfs_root_last_snapshot(&root->root_item) ==
1930 			    trans->transid - 1)
1931 				break;
1932 		}
1933 skip:
1934 		root = NULL;
1935 		next = walk_down_backref(edges, &index);
1936 		if (!next || next->level <= node->level)
1937 			break;
1938 	}
1939 
1940 	if (!root && !loop && !reloc_only) {
1941 		loop = 1;
1942 		goto again;
1943 	}
1944 
1945 	if (root)
1946 		*nr = index;
1947 	else
1948 		*nr = 0;
1949 
1950 	return root;
1951 }
1952 
1953 static noinline_for_stack
1954 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 				   struct backref_node *node)
1956 {
1957 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 	int nr;
1959 	return __select_one_root(trans, node, edges, &nr, 0);
1960 }
1961 
1962 static noinline_for_stack
1963 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1964 				     struct backref_node *node,
1965 				     struct backref_edge *edges[], int *nr)
1966 {
1967 	return __select_one_root(trans, node, edges, nr, 1);
1968 }
1969 
1970 static void grab_path_buffers(struct btrfs_path *path,
1971 			      struct backref_node *node,
1972 			      struct backref_edge *edges[], int nr)
1973 {
1974 	int i = 0;
1975 	while (1) {
1976 		drop_node_buffer(node);
1977 		node->eb = path->nodes[node->level];
1978 		BUG_ON(!node->eb);
1979 		if (path->locks[node->level])
1980 			node->locked = 1;
1981 		path->nodes[node->level] = NULL;
1982 		path->locks[node->level] = 0;
1983 
1984 		if (i >= nr)
1985 			break;
1986 
1987 		edges[i]->blockptr = node->eb->start;
1988 		node = edges[i]->node[UPPER];
1989 		i++;
1990 	}
1991 }
1992 
1993 /*
1994  * relocate a block tree, and then update pointers in upper level
1995  * blocks that reference the block to point to the new location.
1996  *
1997  * if called by link_to_upper, the block has already been relocated.
1998  * in that case this function just updates pointers.
1999  */
2000 static int do_relocation(struct btrfs_trans_handle *trans,
2001 			 struct backref_node *node,
2002 			 struct btrfs_key *key,
2003 			 struct btrfs_path *path, int lowest)
2004 {
2005 	struct backref_node *upper;
2006 	struct backref_edge *edge;
2007 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2008 	struct btrfs_root *root;
2009 	struct extent_buffer *eb;
2010 	u32 blocksize;
2011 	u64 bytenr;
2012 	u64 generation;
2013 	int nr;
2014 	int slot;
2015 	int ret;
2016 	int err = 0;
2017 
2018 	BUG_ON(lowest && node->eb);
2019 
2020 	path->lowest_level = node->level + 1;
2021 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 		cond_resched();
2023 		if (node->eb && node->eb->start == edge->blockptr)
2024 			continue;
2025 
2026 		upper = edge->node[UPPER];
2027 		root = select_reloc_root(trans, upper, edges, &nr);
2028 		if (!root)
2029 			continue;
2030 
2031 		if (upper->eb && !upper->locked)
2032 			drop_node_buffer(upper);
2033 
2034 		if (!upper->eb) {
2035 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2036 			if (ret < 0) {
2037 				err = ret;
2038 				break;
2039 			}
2040 			BUG_ON(ret > 0);
2041 
2042 			slot = path->slots[upper->level];
2043 
2044 			btrfs_unlock_up_safe(path, upper->level + 1);
2045 			grab_path_buffers(path, upper, edges, nr);
2046 
2047 			btrfs_release_path(NULL, path);
2048 		} else {
2049 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2050 					       &slot);
2051 			BUG_ON(ret);
2052 		}
2053 
2054 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 		if (!lowest) {
2056 			if (node->eb->start == bytenr) {
2057 				btrfs_tree_unlock(upper->eb);
2058 				upper->locked = 0;
2059 				continue;
2060 			}
2061 		} else {
2062 			BUG_ON(node->bytenr != bytenr);
2063 		}
2064 
2065 		blocksize = btrfs_level_size(root, node->level);
2066 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2067 		eb = read_tree_block(root, bytenr, blocksize, generation);
2068 		btrfs_tree_lock(eb);
2069 		btrfs_set_lock_blocking(eb);
2070 
2071 		if (!node->eb) {
2072 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 					      slot, &eb);
2074 			if (ret < 0) {
2075 				err = ret;
2076 				break;
2077 			}
2078 			btrfs_set_lock_blocking(eb);
2079 			node->eb = eb;
2080 			node->locked = 1;
2081 		} else {
2082 			btrfs_set_node_blockptr(upper->eb, slot,
2083 						node->eb->start);
2084 			btrfs_set_node_ptr_generation(upper->eb, slot,
2085 						      trans->transid);
2086 			btrfs_mark_buffer_dirty(upper->eb);
2087 
2088 			ret = btrfs_inc_extent_ref(trans, root,
2089 						node->eb->start, blocksize,
2090 						upper->eb->start,
2091 						btrfs_header_owner(upper->eb),
2092 						node->level, 0);
2093 			BUG_ON(ret);
2094 
2095 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 			BUG_ON(ret);
2097 		}
2098 		if (!lowest) {
2099 			btrfs_tree_unlock(upper->eb);
2100 			upper->locked = 0;
2101 		}
2102 	}
2103 	path->lowest_level = 0;
2104 	return err;
2105 }
2106 
2107 static int link_to_upper(struct btrfs_trans_handle *trans,
2108 			 struct backref_node *node,
2109 			 struct btrfs_path *path)
2110 {
2111 	struct btrfs_key key;
2112 	if (!node->eb || list_empty(&node->upper))
2113 		return 0;
2114 
2115 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 	return do_relocation(trans, node, &key, path, 0);
2117 }
2118 
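/*
 * link all cached blocks that still reference upper level blocks,
 * draining the pending lists from the lowest level up.  removing a
 * node may move its parents onto the level + 1 pending list, which
 * is why each list is emptied before moving on.
 */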
2119 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 				struct backref_cache *cache,
2121 				struct btrfs_path *path)
2122 {
2123 	struct backref_node *node;
2124 	int level;
2125 	int ret;
2126 	int err = 0;
2127 
2128 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 		while (!list_empty(&cache->pending[level])) {
2130 			node = list_entry(cache->pending[level].next,
2131 					  struct backref_node, lower);
2132 			BUG_ON(node->level != level);
2133 
2134 			ret = link_to_upper(trans, node, path);
2135 			if (ret < 0)
2136 				err = ret;
2137 			/*
2138 			 * this removes the node from the pending list and
2139 			 * may add some other nodes to the level + 1
2140 			 * pending list
2141 			 */
2142 			remove_backref_node(cache, node);
2143 		}
2144 	}
2145 	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 	return err;
2147 }
2148 
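/*
 * record a block as processed.  only leaves and blocks inside the
 * block group being relocated are tracked in the processed_blocks
 * extent io tree.
 */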
2149 static void mark_block_processed(struct reloc_control *rc,
2150 				 struct backref_node *node)
2151 {
2152 	u32 blocksize;
2153 	if (node->level == 0 ||
2154 	    in_block_group(node->bytenr, rc->block_group)) {
2155 		blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 		set_extent_bits(&rc->processed_blocks, node->bytenr,
2157 				node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 				GFP_NOFS);
2159 	}
2160 	node->processed = 1;
2161 }
2162 
2163 /*
2164  * mark a block and all blocks that directly/indirectly reference
2165  * the block as processed.
2166  */
2167 static void update_processed_blocks(struct reloc_control *rc,
2168 				    struct backref_node *node)
2169 {
2170 	struct backref_node *next = node;
2171 	struct backref_edge *edge;
2172 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2173 	int index = 0;
2174 
2175 	while (next) {
2176 		cond_resched();
2177 		while (1) {
2178 			if (next->processed)
2179 				break;
2180 
2181 			mark_block_processed(rc, next);
2182 
2183 			if (list_empty(&next->upper))
2184 				break;
2185 
2186 			edge = list_entry(next->upper.next,
2187 					  struct backref_edge, list[LOWER]);
2188 			edges[index++] = edge;
2189 			next = edge->node[UPPER];
2190 		}
2191 		next = walk_down_backref(edges, &index);
2192 	}
2193 }
2194 
2195 static int tree_block_processed(u64 bytenr, u32 blocksize,
2196 				struct reloc_control *rc)
2197 {
2198 	if (test_range_bit(&rc->processed_blocks, bytenr,
2199 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2200 		return 1;
2201 	return 0;
2202 }
2203 
2204 /*
2205  * check if any file extent pointers in the leaf point to
2206  * data that requires processing
2207  */
2208 static int check_file_extents(struct reloc_control *rc,
2209 			      u64 bytenr, u32 blocksize, u64 ptr_gen)
2210 {
2211 	struct btrfs_key found_key;
2212 	struct btrfs_file_extent_item *fi;
2213 	struct extent_buffer *leaf;
2214 	u32 nritems;
2215 	int i;
2216 	int ret = 0;
2217 
2218 	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219 
2220 	nritems = btrfs_header_nritems(leaf);
2221 	for (i = 0; i < nritems; i++) {
2222 		cond_resched();
2223 		btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 			continue;
2226 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 		if (btrfs_file_extent_type(leaf, fi) ==
2228 		    BTRFS_FILE_EXTENT_INLINE)
2229 			continue;
2230 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 		if (bytenr == 0)
2232 			continue;
2233 		if (in_block_group(bytenr, rc->block_group)) {
2234 			ret = 1;
2235 			break;
2236 		}
2237 	}
2238 	free_extent_buffer(leaf);
2239 	return ret;
2240 }
2241 
2242 /*
2243  * scan child blocks of a given block to find blocks that require processing
2244  */
2245 static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 			    struct reloc_control *rc,
2247 			    struct backref_node *node,
2248 			    struct rb_root *blocks)
2249 {
2250 	struct tree_block *block;
2251 	struct rb_node *rb_node;
2252 	u64 bytenr;
2253 	u64 ptr_gen;
2254 	u32 blocksize;
2255 	u32 nritems;
2256 	int i;
2257 	int err = 0;
2258 
2259 	nritems = btrfs_header_nritems(node->eb);
2260 	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 	for (i = 0; i < nritems; i++) {
2262 		cond_resched();
2263 		bytenr = btrfs_node_blockptr(node->eb, i);
2264 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 		if (ptr_gen == trans->transid)
2266 			continue;
2267 		if (!in_block_group(bytenr, rc->block_group) &&
2268 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 			continue;
2270 		if (tree_block_processed(bytenr, blocksize, rc))
2271 			continue;
2272 
2273 		readahead_tree_block(rc->extent_root,
2274 				     bytenr, blocksize, ptr_gen);
2275 	}
2276 
2277 	for (i = 0; i < nritems; i++) {
2278 		cond_resched();
2279 		bytenr = btrfs_node_blockptr(node->eb, i);
2280 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 		if (ptr_gen == trans->transid)
2282 			continue;
2283 		if (!in_block_group(bytenr, rc->block_group) &&
2284 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 			continue;
2286 		if (tree_block_processed(bytenr, blocksize, rc))
2287 			continue;
2288 		if (!in_block_group(bytenr, rc->block_group) &&
2289 		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 			continue;
2291 
2292 		block = kmalloc(sizeof(*block), GFP_NOFS);
2293 		if (!block) {
2294 			err = -ENOMEM;
2295 			break;
2296 		}
2297 		block->bytenr = bytenr;
2298 		btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 		block->level = node->level - 1;
2300 		block->key_ready = 1;
2301 		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 		BUG_ON(rb_node);
2303 	}
2304 	if (err)
2305 		free_block_list(blocks);
2306 	return err;
2307 }
2308 
2309 /*
2310  * find adjacent blocks that require processing
2311  */
2312 static noinline_for_stack
2313 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 			struct reloc_control *rc,
2315 			struct backref_cache *cache,
2316 			struct rb_root *blocks, int level,
2317 			struct backref_node **upper)
2318 {
2319 	struct backref_node *node;
2320 	int ret = 0;
2321 
2322 	WARN_ON(!list_empty(&cache->pending[level]));
2323 
2324 	if (list_empty(&cache->pending[level + 1]))
2325 		return 1;
2326 
2327 	node = list_entry(cache->pending[level + 1].next,
2328 			  struct backref_node, lower);
2329 	if (node->eb)
2330 		ret = add_child_blocks(trans, rc, node, blocks);
2331 
2332 	*upper = node;
2333 	return ret;
2334 }
2335 
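/*
 * until the real key has been read, block->key temporarily holds
 * the blocksize in 'objectid' and the generation in 'offset' (see
 * add_tree_block).  read the block and replace them with its first
 * key.
 */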
2336 static int get_tree_block_key(struct reloc_control *rc,
2337 			      struct tree_block *block)
2338 {
2339 	struct extent_buffer *eb;
2340 
2341 	BUG_ON(block->key_ready);
2342 	eb = read_tree_block(rc->extent_root, block->bytenr,
2343 			     block->key.objectid, block->key.offset);
2344 	WARN_ON(btrfs_header_level(eb) != block->level);
2345 	if (block->level == 0)
2346 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2347 	else
2348 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2349 	free_extent_buffer(eb);
2350 	block->key_ready = 1;
2351 	return 0;
2352 }
2353 
2354 static int reada_tree_block(struct reloc_control *rc,
2355 			    struct tree_block *block)
2356 {
2357 	BUG_ON(block->key_ready);
2358 	readahead_tree_block(rc->extent_root, block->bytenr,
2359 			     block->key.objectid, block->key.offset);
2360 	return 0;
2361 }
2362 
2363 /*
2364  * helper function to relocate a tree block
2365  */
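/*
 * three cases are handled here: blocks owned by a reloc tree are
 * COWed in place through do_relocation, blocks in non-reference
 * counted trees are COWed by simply searching down to them, and
 * blocks shared by other reference counted trees only need to be
 * marked processed.
 */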
2366 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2367 				struct reloc_control *rc,
2368 				struct backref_node *node,
2369 				struct btrfs_key *key,
2370 				struct btrfs_path *path)
2371 {
2372 	struct btrfs_root *root;
2373 	int ret;
2374 
2375 	root = select_one_root(trans, node);
2376 	if (unlikely(!root)) {
2377 		rc->found_old_snapshot = 1;
2378 		update_processed_blocks(rc, node);
2379 		return 0;
2380 	}
2381 
2382 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2383 		ret = do_relocation(trans, node, key, path, 1);
2384 		if (ret < 0)
2385 			goto out;
2386 		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 			ret = replace_file_extents(trans, rc, root,
2388 						   node->eb, NULL);
2389 			if (ret < 0)
2390 				goto out;
2391 		}
2392 		drop_node_buffer(node);
2393 	} else if (!root->ref_cows) {
2394 		path->lowest_level = node->level;
2395 		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 		btrfs_release_path(root, path);
2397 		if (ret < 0)
2398 			goto out;
2399 	} else if (root != node->root) {
2400 		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 	}
2402 
2403 	update_processed_blocks(rc, node);
2404 	ret = 0;
2405 out:
2406 	drop_node_buffer(node);
2407 	return ret;
2408 }
2409 
2410 /*
2411  * relocate a list of blocks
2412  */
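/*
 * all blocks on the list are of the same level.  they are read
 * ahead first, then their keys are fetched, and a backref tree is
 * built for each block before it is relocated.  once the level is
 * done, child blocks referenced by the cached upper level blocks
 * are collected and relocated in turn.
 */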
2413 static noinline_for_stack
2414 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 			 struct reloc_control *rc, struct rb_root *blocks)
2416 {
2417 	struct backref_cache *cache;
2418 	struct backref_node *node;
2419 	struct btrfs_path *path;
2420 	struct tree_block *block;
2421 	struct rb_node *rb_node;
2422 	int level = -1;
2423 	int ret;
2424 	int err = 0;
2425 
2426 	path = btrfs_alloc_path();
2427 	if (!path)
2428 		return -ENOMEM;
2429 
2430 	cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 	if (!cache) {
2432 		btrfs_free_path(path);
2433 		return -ENOMEM;
2434 	}
2435 
2436 	backref_cache_init(cache);
2437 
2438 	rb_node = rb_first(blocks);
2439 	while (rb_node) {
2440 		block = rb_entry(rb_node, struct tree_block, rb_node);
2441 		if (level == -1)
2442 			level = block->level;
2443 		else
2444 			BUG_ON(level != block->level);
2445 		if (!block->key_ready)
2446 			reada_tree_block(rc, block);
2447 		rb_node = rb_next(rb_node);
2448 	}
2449 
2450 	rb_node = rb_first(blocks);
2451 	while (rb_node) {
2452 		block = rb_entry(rb_node, struct tree_block, rb_node);
2453 		if (!block->key_ready)
2454 			get_tree_block_key(rc, block);
2455 		rb_node = rb_next(rb_node);
2456 	}
2457 
2458 	rb_node = rb_first(blocks);
2459 	while (rb_node) {
2460 		block = rb_entry(rb_node, struct tree_block, rb_node);
2461 
2462 		node = build_backref_tree(rc, cache, &block->key,
2463 					  block->level, block->bytenr);
2464 		if (IS_ERR(node)) {
2465 			err = PTR_ERR(node);
2466 			goto out;
2467 		}
2468 
2469 		ret = relocate_tree_block(trans, rc, node, &block->key,
2470 					  path);
2471 		if (ret < 0) {
2472 			err = ret;
2473 			goto out;
2474 		}
2475 		remove_backref_node(cache, node);
2476 		rb_node = rb_next(rb_node);
2477 	}
2478 
2479 	if (level > 0)
2480 		goto out;
2481 
2482 	free_block_list(blocks);
2483 
2484 	/*
2485 	 * now backrefs of some upper level tree blocks have been cached,
2486 	 * try relocating blocks referenced by these upper level blocks.
2487 	 */
2488 	while (1) {
2489 		struct backref_node *upper = NULL;
2490 		if (trans->transaction->in_commit ||
2491 		    trans->transaction->delayed_refs.flushing)
2492 			break;
2493 
2494 		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2495 					  &upper);
2496 		if (ret < 0)
2497 			err = ret;
2498 		if (ret != 0)
2499 			break;
2500 
2501 		rb_node = rb_first(blocks);
2502 		while (rb_node) {
2503 			block = rb_entry(rb_node, struct tree_block, rb_node);
2504 			if (trans->transaction->in_commit ||
2505 			    trans->transaction->delayed_refs.flushing)
2506 				goto out;
2507 			BUG_ON(!block->key_ready);
2508 			node = build_backref_tree(rc, cache, &block->key,
2509 						  level, block->bytenr);
2510 			if (IS_ERR(node)) {
2511 				err = PTR_ERR(node);
2512 				goto out;
2513 			}
2514 
2515 			ret = relocate_tree_block(trans, rc, node,
2516 						  &block->key, path);
2517 			if (ret < 0) {
2518 				err = ret;
2519 				goto out;
2520 			}
2521 			remove_backref_node(cache, node);
2522 			rb_node = rb_next(rb_node);
2523 		}
2524 		free_block_list(blocks);
2525 
2526 		if (upper) {
2527 			ret = link_to_upper(trans, upper, path);
2528 			if (ret < 0) {
2529 				err = ret;
2530 				break;
2531 			}
2532 			remove_backref_node(cache, upper);
2533 		}
2534 	}
2535 out:
2536 	free_block_list(blocks);
2537 
2538 	ret = finish_pending_nodes(trans, cache, path);
2539 	if (ret < 0)
2540 		err = ret;
2541 
2542 	kfree(cache);
2543 	btrfs_free_path(path);
2544 	return err;
2545 }
2546 
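/*
 * insert a pinned extent map covering the file range [start, end]
 * of the relocation inode and pointing at block_start.  any cached
 * mappings overlapping the range are dropped first.
 */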
2547 static noinline_for_stack
2548 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2549 			 u64 block_start)
2550 {
2551 	struct btrfs_root *root = BTRFS_I(inode)->root;
2552 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2553 	struct extent_map *em;
2554 	int ret = 0;
2555 
2556 	em = alloc_extent_map(GFP_NOFS);
2557 	if (!em)
2558 		return -ENOMEM;
2559 
2560 	em->start = start;
2561 	em->len = end + 1 - start;
2562 	em->block_len = em->len;
2563 	em->block_start = block_start;
2564 	em->bdev = root->fs_info->fs_devices->latest_bdev;
2565 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
2566 
2567 	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2568 	while (1) {
2569 		write_lock(&em_tree->lock);
2570 		ret = add_extent_mapping(em_tree, em);
2571 		write_unlock(&em_tree->lock);
2572 		if (ret != -EEXIST) {
2573 			free_extent_map(em);
2574 			break;
2575 		}
2576 		btrfs_drop_extent_cache(inode, start, end, 0);
2577 	}
2578 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2579 	return ret;
2580 }
2581 
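/*
 * pull every page of the cluster into the relocation inode's page
 * cache and mark it dirty for delalloc.  the first page of each
 * extent in the cluster is tagged with EXTENT_BOUNDARY so that the
 * original extent boundaries are preserved when the data is written
 * out to its new location.
 */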
2582 static int relocate_file_extent_cluster(struct inode *inode,
2583 					struct file_extent_cluster *cluster)
2584 {
2585 	u64 page_start;
2586 	u64 page_end;
2587 	u64 offset = BTRFS_I(inode)->index_cnt;
2588 	unsigned long index;
2589 	unsigned long last_index;
2590 	unsigned int dirty_page = 0;
2591 	struct page *page;
2592 	struct file_ra_state *ra;
2593 	int nr = 0;
2594 	int ret = 0;
2595 
2596 	if (!cluster->nr)
2597 		return 0;
2598 
2599 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
2600 	if (!ra)
2601 		return -ENOMEM;
2602 
2603 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2604 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2605 
2606 	mutex_lock(&inode->i_mutex);
2607 
2608 	i_size_write(inode, cluster->end + 1 - offset);
2609 	ret = setup_extent_mapping(inode, cluster->start - offset,
2610 				   cluster->end - offset, cluster->start);
2611 	if (ret)
2612 		goto out_unlock;
2613 
2614 	file_ra_state_init(ra, inode->i_mapping);
2615 
2616 	WARN_ON(cluster->start != cluster->boundary[0]);
2617 	while (index <= last_index) {
2618 		page = find_lock_page(inode->i_mapping, index);
2619 		if (!page) {
2620 			page_cache_sync_readahead(inode->i_mapping,
2621 						  ra, NULL, index,
2622 						  last_index + 1 - index);
2623 			page = grab_cache_page(inode->i_mapping, index);
2624 			if (!page) {
2625 				ret = -ENOMEM;
2626 				goto out_unlock;
2627 			}
2628 		}
2629 
2630 		if (PageReadahead(page)) {
2631 			page_cache_async_readahead(inode->i_mapping,
2632 						   ra, NULL, page, index,
2633 						   last_index + 1 - index);
2634 		}
2635 
2636 		if (!PageUptodate(page)) {
2637 			btrfs_readpage(NULL, page);
2638 			lock_page(page);
2639 			if (!PageUptodate(page)) {
2640 				unlock_page(page);
2641 				page_cache_release(page);
2642 				ret = -EIO;
2643 				goto out_unlock;
2644 			}
2645 		}
2646 
2647 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2648 		page_end = page_start + PAGE_CACHE_SIZE - 1;
2649 
2650 		lock_extent(&BTRFS_I(inode)->io_tree,
2651 			    page_start, page_end, GFP_NOFS);
2652 
2653 		set_page_extent_mapped(page);
2654 
2655 		if (nr < cluster->nr &&
2656 		    page_start + offset == cluster->boundary[nr]) {
2657 			set_extent_bits(&BTRFS_I(inode)->io_tree,
2658 					page_start, page_end,
2659 					EXTENT_BOUNDARY, GFP_NOFS);
2660 			nr++;
2661 		}
2662 		btrfs_set_extent_delalloc(inode, page_start, page_end);
2663 
2664 		set_page_dirty(page);
2665 		dirty_page++;
2666 
2667 		unlock_extent(&BTRFS_I(inode)->io_tree,
2668 			      page_start, page_end, GFP_NOFS);
2669 		unlock_page(page);
2670 		page_cache_release(page);
2671 
2672 		index++;
2673 		if (nr < cluster->nr &&
2674 		    page_end + 1 + offset == cluster->boundary[nr]) {
2675 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 							   dirty_page);
2677 			dirty_page = 0;
2678 		}
2679 	}
2680 	if (dirty_page) {
2681 		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 						   dirty_page);
2683 	}
2684 	WARN_ON(nr != cluster->nr);
2685 out_unlock:
2686 	mutex_unlock(&inode->i_mutex);
2687 	kfree(ra);
2688 	return ret;
2689 }
2690 
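/*
 * gather contiguous data extents into a cluster.  the cluster is
 * flushed through relocate_file_extent_cluster whenever the next
 * extent is not adjacent to it, or when it already holds
 * MAX_EXTENTS extents.
 */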
2691 static noinline_for_stack
2692 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2693 			 struct file_extent_cluster *cluster)
2694 {
2695 	int ret;
2696 
2697 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2698 		ret = relocate_file_extent_cluster(inode, cluster);
2699 		if (ret)
2700 			return ret;
2701 		cluster->nr = 0;
2702 	}
2703 
2704 	if (!cluster->nr)
2705 		cluster->start = extent_key->objectid;
2706 	else
2707 		BUG_ON(cluster->nr >= MAX_EXTENTS);
2708 	cluster->end = extent_key->objectid + extent_key->offset - 1;
2709 	cluster->boundary[cluster->nr] = extent_key->objectid;
2710 	cluster->nr++;
2711 
2712 	if (cluster->nr >= MAX_EXTENTS) {
2713 		ret = relocate_file_extent_cluster(inode, cluster);
2714 		if (ret)
2715 			return ret;
2716 		cluster->nr = 0;
2717 	}
2718 	return 0;
2719 }
2720 
2721 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2722 static int get_ref_objectid_v0(struct reloc_control *rc,
2723 			       struct btrfs_path *path,
2724 			       struct btrfs_key *extent_key,
2725 			       u64 *ref_objectid, int *path_change)
2726 {
2727 	struct btrfs_key key;
2728 	struct extent_buffer *leaf;
2729 	struct btrfs_extent_ref_v0 *ref0;
2730 	int ret;
2731 	int slot;
2732 
2733 	leaf = path->nodes[0];
2734 	slot = path->slots[0];
2735 	while (1) {
2736 		if (slot >= btrfs_header_nritems(leaf)) {
2737 			ret = btrfs_next_leaf(rc->extent_root, path);
2738 			if (ret < 0)
2739 				return ret;
2740 			BUG_ON(ret > 0);
2741 			leaf = path->nodes[0];
2742 			slot = path->slots[0];
2743 			if (path_change)
2744 				*path_change = 1;
2745 		}
2746 		btrfs_item_key_to_cpu(leaf, &key, slot);
2747 		if (key.objectid != extent_key->objectid)
2748 			return -ENOENT;
2749 
2750 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2751 			slot++;
2752 			continue;
2753 		}
2754 		ref0 = btrfs_item_ptr(leaf, slot,
2755 				struct btrfs_extent_ref_v0);
2756 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2757 		break;
2758 	}
2759 	return 0;
2760 }
2761 #endif
2762 
2763 /*
2764  * helper to add a tree block to the list.
2765  * the major work is getting the generation and level of the block
2766  */
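/*
 * the level and generation normally come from the tree block info
 * following the extent item.  for v0 extent items the level is
 * inferred from the ref objectid and the real generation is not
 * available.
 */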
2767 static int add_tree_block(struct reloc_control *rc,
2768 			  struct btrfs_key *extent_key,
2769 			  struct btrfs_path *path,
2770 			  struct rb_root *blocks)
2771 {
2772 	struct extent_buffer *eb;
2773 	struct btrfs_extent_item *ei;
2774 	struct btrfs_tree_block_info *bi;
2775 	struct tree_block *block;
2776 	struct rb_node *rb_node;
2777 	u32 item_size;
2778 	int level = -1;
2779 	int generation;
2780 
2781 	eb = path->nodes[0];
2782 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
2783 
2784 	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2785 		ei = btrfs_item_ptr(eb, path->slots[0],
2786 				struct btrfs_extent_item);
2787 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2788 		generation = btrfs_extent_generation(eb, ei);
2789 		level = btrfs_tree_block_level(eb, bi);
2790 	} else {
2791 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2792 		u64 ref_owner;
2793 		int ret;
2794 
2795 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2796 		ret = get_ref_objectid_v0(rc, path, extent_key,
2797 					  &ref_owner, NULL);
2798 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2799 		level = (int)ref_owner;
2800 		/* FIXME: get real generation */
2801 		generation = 0;
2802 #else
2803 		BUG();
2804 #endif
2805 	}
2806 
2807 	btrfs_release_path(rc->extent_root, path);
2808 
2809 	BUG_ON(level == -1);
2810 
2811 	block = kmalloc(sizeof(*block), GFP_NOFS);
2812 	if (!block)
2813 		return -ENOMEM;
2814 
2815 	block->bytenr = extent_key->objectid;
2816 	block->key.objectid = extent_key->offset;
2817 	block->key.offset = generation;
2818 	block->level = level;
2819 	block->key_ready = 0;
2820 
2821 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2822 	BUG_ON(rb_node);
2823 
2824 	return 0;
2825 }
2826 
2827 /*
2828  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2829  */
2830 static int __add_tree_block(struct reloc_control *rc,
2831 			    u64 bytenr, u32 blocksize,
2832 			    struct rb_root *blocks)
2833 {
2834 	struct btrfs_path *path;
2835 	struct btrfs_key key;
2836 	int ret;
2837 
2838 	if (tree_block_processed(bytenr, blocksize, rc))
2839 		return 0;
2840 
2841 	if (tree_search(blocks, bytenr))
2842 		return 0;
2843 
2844 	path = btrfs_alloc_path();
2845 	if (!path)
2846 		return -ENOMEM;
2847 
2848 	key.objectid = bytenr;
2849 	key.type = BTRFS_EXTENT_ITEM_KEY;
2850 	key.offset = blocksize;
2851 
2852 	path->search_commit_root = 1;
2853 	path->skip_locking = 1;
2854 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2855 	if (ret < 0)
2856 		goto out;
2857 	BUG_ON(ret);
2858 
2859 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2860 	ret = add_tree_block(rc, &key, path, blocks);
2861 out:
2862 	btrfs_free_path(path);
2863 	return ret;
2864 }
2865 
2866 /*
2867  * helper to check if the block use full backrefs for pointers in it
2868  */
2869 static int block_use_full_backref(struct reloc_control *rc,
2870 				  struct extent_buffer *eb)
2871 {
2872 	struct btrfs_path *path;
2873 	struct btrfs_extent_item *ei;
2874 	struct btrfs_key key;
2875 	u64 flags;
2876 	int ret;
2877 
2878 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2879 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 		return 1;
2881 
2882 	path = btrfs_alloc_path();
2883 	BUG_ON(!path);
2884 
2885 	key.objectid = eb->start;
2886 	key.type = BTRFS_EXTENT_ITEM_KEY;
2887 	key.offset = eb->len;
2888 
2889 	path->search_commit_root = 1;
2890 	path->skip_locking = 1;
2891 	ret = btrfs_search_slot(NULL, rc->extent_root,
2892 				&key, path, 0, 0);
2893 	BUG_ON(ret);
2894 
2895 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 			    struct btrfs_extent_item);
2897 	flags = btrfs_extent_flags(path->nodes[0], ei);
2898 	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 		ret = 1;
2901 	else
2902 		ret = 0;
2903 	btrfs_free_path(path);
2904 	return ret;
2905 }
2906 
2907 /*
2908  * helper to add tree blocks for backrefs of type BTRFS_EXTENT_DATA_REF_KEY
2909  * this function scans the fs tree to find blocks that reference the data extent
2910  */
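/*
 * every leaf of the referencing fs tree that contains file extent
 * items pointing at the data extent is added to 'blocks'.  ref_count
 * is the number of matching items to find; items in leaves that use
 * full backrefs are accounted elsewhere and do not count against it.
 */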
2911 static int find_data_references(struct reloc_control *rc,
2912 				struct btrfs_key *extent_key,
2913 				struct extent_buffer *leaf,
2914 				struct btrfs_extent_data_ref *ref,
2915 				struct rb_root *blocks)
2916 {
2917 	struct btrfs_path *path;
2918 	struct tree_block *block;
2919 	struct btrfs_root *root;
2920 	struct btrfs_file_extent_item *fi;
2921 	struct rb_node *rb_node;
2922 	struct btrfs_key key;
2923 	u64 ref_root;
2924 	u64 ref_objectid;
2925 	u64 ref_offset;
2926 	u32 ref_count;
2927 	u32 nritems;
2928 	int err = 0;
2929 	int added = 0;
2930 	int counted;
2931 	int ret;
2932 
2933 	path = btrfs_alloc_path();
2934 	if (!path)
2935 		return -ENOMEM;
2936 
2937 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
2938 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2939 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2940 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
2941 
2942 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
2943 	if (IS_ERR(root)) {
2944 		err = PTR_ERR(root);
2945 		goto out;
2946 	}
2947 
2948 	key.objectid = ref_objectid;
2949 	key.offset = ref_offset;
2950 	key.type = BTRFS_EXTENT_DATA_KEY;
2951 
2952 	path->search_commit_root = 1;
2953 	path->skip_locking = 1;
2954 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2955 	if (ret < 0) {
2956 		err = ret;
2957 		goto out;
2958 	}
2959 
2960 	leaf = path->nodes[0];
2961 	nritems = btrfs_header_nritems(leaf);
2962 	/*
2963 	 * references in tree blocks that use full backrefs
2964 	 * are not counted against ref_count
2965 	 */
2966 	if (block_use_full_backref(rc, leaf))
2967 		counted = 0;
2968 	else
2969 		counted = 1;
2970 	rb_node = tree_search(blocks, leaf->start);
2971 	if (rb_node) {
2972 		if (counted)
2973 			added = 1;
2974 		else
2975 			path->slots[0] = nritems;
2976 	}
2977 
2978 	while (ref_count > 0) {
2979 		while (path->slots[0] >= nritems) {
2980 			ret = btrfs_next_leaf(root, path);
2981 			if (ret < 0) {
2982 				err = ret;
2983 				goto out;
2984 			}
2985 			if (ret > 0) {
2986 				WARN_ON(1);
2987 				goto out;
2988 			}
2989 
2990 			leaf = path->nodes[0];
2991 			nritems = btrfs_header_nritems(leaf);
2992 			added = 0;
2993 
2994 			if (block_use_full_backref(rc, leaf))
2995 				counted = 0;
2996 			else
2997 				counted = 1;
2998 			rb_node = tree_search(blocks, leaf->start);
2999 			if (rb_node) {
3000 				if (counted)
3001 					added = 1;
3002 				else
3003 					path->slots[0] = nritems;
3004 			}
3005 		}
3006 
3007 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3008 		if (key.objectid != ref_objectid ||
3009 		    key.type != BTRFS_EXTENT_DATA_KEY) {
3010 			WARN_ON(1);
3011 			break;
3012 		}
3013 
3014 		fi = btrfs_item_ptr(leaf, path->slots[0],
3015 				    struct btrfs_file_extent_item);
3016 
3017 		if (btrfs_file_extent_type(leaf, fi) ==
3018 		    BTRFS_FILE_EXTENT_INLINE)
3019 			goto next;
3020 
3021 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3022 		    extent_key->objectid)
3023 			goto next;
3024 
3025 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3026 		if (key.offset != ref_offset)
3027 			goto next;
3028 
3029 		if (counted)
3030 			ref_count--;
3031 		if (added)
3032 			goto next;
3033 
3034 		if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3035 			block = kmalloc(sizeof(*block), GFP_NOFS);
3036 			if (!block) {
3037 				err = -ENOMEM;
3038 				break;
3039 			}
3040 			block->bytenr = leaf->start;
3041 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3042 			block->level = 0;
3043 			block->key_ready = 1;
3044 			rb_node = tree_insert(blocks, block->bytenr,
3045 					      &block->rb_node);
3046 			BUG_ON(rb_node);
3047 		}
3048 		if (counted)
3049 			added = 1;
3050 		else
3051 			path->slots[0] = nritems;
3052 next:
3053 		path->slots[0]++;
3054 
3055 	}
3056 out:
3057 	btrfs_free_path(path);
3058 	return err;
3059 }
3060 
3061 /*
3062  * helper to find all tree blocks that reference a given data extent
3063  */
3064 static noinline_for_stack
3065 int add_data_references(struct reloc_control *rc,
3066 			struct btrfs_key *extent_key,
3067 			struct btrfs_path *path,
3068 			struct rb_root *blocks)
3069 {
3070 	struct btrfs_key key;
3071 	struct extent_buffer *eb;
3072 	struct btrfs_extent_data_ref *dref;
3073 	struct btrfs_extent_inline_ref *iref;
3074 	unsigned long ptr;
3075 	unsigned long end;
3076 	u32 blocksize;
3077 	int ret;
3078 	int err = 0;
3079 
3080 	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 			       extent_key->offset);
3082 	BUG_ON(ret < 0);
3083 	if (ret > 0) {
3084 		/* the relocated data is fragmented */
3085 		rc->extents_skipped++;
3086 		btrfs_release_path(rc->extent_root, path);
3087 		return 0;
3088 	}
3089 
3090 	blocksize = btrfs_level_size(rc->extent_root, 0);
3091 
3092 	eb = path->nodes[0];
3093 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3095 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3096 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3097 		ptr = end;
3098 	else
3099 #endif
3100 		ptr += sizeof(struct btrfs_extent_item);
3101 
3102 	while (ptr < end) {
3103 		iref = (struct btrfs_extent_inline_ref *)ptr;
3104 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3105 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3106 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3107 			ret = __add_tree_block(rc, key.offset, blocksize,
3108 					       blocks);
3109 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3110 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3111 			ret = find_data_references(rc, extent_key,
3112 						   eb, dref, blocks);
3113 		} else {
3114 			BUG();
3115 		}
3116 		ptr += btrfs_extent_inline_ref_size(key.type);
3117 	}
3118 	WARN_ON(ptr > end);
3119 
3120 	while (1) {
3121 		cond_resched();
3122 		eb = path->nodes[0];
3123 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3124 			ret = btrfs_next_leaf(rc->extent_root, path);
3125 			if (ret < 0) {
3126 				err = ret;
3127 				break;
3128 			}
3129 			if (ret > 0)
3130 				break;
3131 			eb = path->nodes[0];
3132 		}
3133 
3134 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3135 		if (key.objectid != extent_key->objectid)
3136 			break;
3137 
3138 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3139 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3140 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3141 #else
3142 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3143 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3144 #endif
3145 			ret = __add_tree_block(rc, key.offset, blocksize,
3146 					       blocks);
3147 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3148 			dref = btrfs_item_ptr(eb, path->slots[0],
3149 					      struct btrfs_extent_data_ref);
3150 			ret = find_data_references(rc, extent_key,
3151 						   eb, dref, blocks);
3152 		} else {
3153 			ret = 0;
3154 		}
3155 		if (ret) {
3156 			err = ret;
3157 			break;
3158 		}
3159 		path->slots[0]++;
3160 	}
3161 	btrfs_release_path(rc->extent_root, path);
3162 	if (err)
3163 		free_block_list(blocks);
3164 	return err;
3165 }
3166 
3167 /*
3168  * helper to find the next unprocessed extent
3169  */
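/*
 * scan the extent tree from rc->search_start for the next extent
 * item inside the block group that has not yet been marked in
 * rc->processed_blocks.
 */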
3170 static noinline_for_stack
3171 int find_next_extent(struct btrfs_trans_handle *trans,
3172 		     struct reloc_control *rc, struct btrfs_path *path)
3173 {
3174 	struct btrfs_key key;
3175 	struct extent_buffer *leaf;
3176 	u64 start, end, last;
3177 	int ret;
3178 
3179 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3180 	while (1) {
3181 		cond_resched();
3182 		if (rc->search_start >= last) {
3183 			ret = 1;
3184 			break;
3185 		}
3186 
3187 		key.objectid = rc->search_start;
3188 		key.type = BTRFS_EXTENT_ITEM_KEY;
3189 		key.offset = 0;
3190 
3191 		path->search_commit_root = 1;
3192 		path->skip_locking = 1;
3193 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3194 					0, 0);
3195 		if (ret < 0)
3196 			break;
3197 next:
3198 		leaf = path->nodes[0];
3199 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3200 			ret = btrfs_next_leaf(rc->extent_root, path);
3201 			if (ret != 0)
3202 				break;
3203 			leaf = path->nodes[0];
3204 		}
3205 
3206 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3207 		if (key.objectid >= last) {
3208 			ret = 1;
3209 			break;
3210 		}
3211 
3212 		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3213 		    key.objectid + key.offset <= rc->search_start) {
3214 			path->slots[0]++;
3215 			goto next;
3216 		}
3217 
3218 		ret = find_first_extent_bit(&rc->processed_blocks,
3219 					    key.objectid, &start, &end,
3220 					    EXTENT_DIRTY);
3221 
3222 		if (ret == 0 && start <= key.objectid) {
3223 			btrfs_release_path(rc->extent_root, path);
3224 			rc->search_start = end + 1;
3225 		} else {
3226 			rc->search_start = key.objectid + key.offset;
3227 			return 0;
3228 		}
3229 	}
3230 	btrfs_release_path(rc->extent_root, path);
3231 	return ret;
3232 }
3233 
3234 static void set_reloc_control(struct reloc_control *rc)
3235 {
3236 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3237 	mutex_lock(&fs_info->trans_mutex);
3238 	fs_info->reloc_ctl = rc;
3239 	mutex_unlock(&fs_info->trans_mutex);
3240 }
3241 
3242 static void unset_reloc_control(struct reloc_control *rc)
3243 {
3244 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3245 	mutex_lock(&fs_info->trans_mutex);
3246 	fs_info->reloc_ctl = NULL;
3247 	mutex_unlock(&fs_info->trans_mutex);
3248 }
3249 
3250 static int check_extent_flags(u64 flags)
3251 {
3252 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3253 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3254 		return 1;
3255 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3256 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3257 		return 1;
3258 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3259 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3260 		return 1;
3261 	return 0;
3262 }
3263 
3264 
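/*
 * main loop of relocation.  extent items are found one at a time in
 * the extent tree: tree blocks are collected into 'blocks' and
 * relocated in batches, while data extents are gathered into file
 * extent clusters and copied through the relocation inode.
 */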
3265 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266 {
3267 	struct rb_root blocks = RB_ROOT;
3268 	struct btrfs_key key;
3269 	struct file_extent_cluster *cluster;
3270 	struct btrfs_trans_handle *trans = NULL;
3271 	struct btrfs_path *path;
3272 	struct btrfs_extent_item *ei;
3273 	unsigned long nr;
3274 	u64 flags;
3275 	u32 item_size;
3276 	int ret;
3277 	int err = 0;
3278 
3279 	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 	if (!cluster)
3281 		return -ENOMEM;
3282 
3283 	path = btrfs_alloc_path();
3284 	if (!path) {
3285 		kfree(cluster);
		return -ENOMEM;
	}
3286 
3287 	rc->extents_found = 0;
3288 	rc->extents_skipped = 0;
3289 
3290 	rc->search_start = rc->block_group->key.objectid;
3291 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3292 			  GFP_NOFS);
3293 
3294 	rc->create_reloc_root = 1;
3295 	set_reloc_control(rc);
3296 
3297 	trans = btrfs_start_transaction(rc->extent_root, 1);
3298 	btrfs_commit_transaction(trans, rc->extent_root);
3299 
3300 	while (1) {
3301 		trans = btrfs_start_transaction(rc->extent_root, 1);
3302 
3303 		ret = find_next_extent(trans, rc, path);
3304 		if (ret < 0)
3305 			err = ret;
3306 		if (ret != 0)
3307 			break;
3308 
3309 		rc->extents_found++;
3310 
3311 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3312 				    struct btrfs_extent_item);
3313 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3314 		item_size = btrfs_item_size_nr(path->nodes[0],
3315 					       path->slots[0]);
3316 		if (item_size >= sizeof(*ei)) {
3317 			flags = btrfs_extent_flags(path->nodes[0], ei);
3318 			ret = check_extent_flags(flags);
3319 			BUG_ON(ret);
3320 
3321 		} else {
3322 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3323 			u64 ref_owner;
3324 			int path_change = 0;
3325 
3326 			BUG_ON(item_size !=
3327 			       sizeof(struct btrfs_extent_item_v0));
3328 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3329 						  &path_change);
3330 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3331 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3332 			else
3333 				flags = BTRFS_EXTENT_FLAG_DATA;
3334 
3335 			if (path_change) {
3336 				btrfs_release_path(rc->extent_root, path);
3337 
3338 				path->search_commit_root = 1;
3339 				path->skip_locking = 1;
3340 				ret = btrfs_search_slot(NULL, rc->extent_root,
3341 							&key, path, 0, 0);
3342 				if (ret < 0) {
3343 					err = ret;
3344 					break;
3345 				}
3346 				BUG_ON(ret > 0);
3347 			}
3348 #else
3349 			BUG();
3350 #endif
3351 		}
3352 
3353 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3354 			ret = add_tree_block(rc, &key, path, &blocks);
3355 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3356 			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3357 			ret = add_data_references(rc, &key, path, &blocks);
3358 		} else {
3359 			btrfs_release_path(rc->extent_root, path);
3360 			ret = 0;
3361 		}
3362 		if (ret < 0) {
3363 			err = ret;
3364 			break;
3365 		}
3366 
3367 		if (!RB_EMPTY_ROOT(&blocks)) {
3368 			ret = relocate_tree_blocks(trans, rc, &blocks);
3369 			if (ret < 0) {
3370 				err = ret;
3371 				break;
3372 			}
3373 		}
3374 
3375 		nr = trans->blocks_used;
3376 		btrfs_end_transaction(trans, rc->extent_root);
3377 		trans = NULL;
3378 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3379 
3380 		if (rc->stage == MOVE_DATA_EXTENTS &&
3381 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3382 			rc->found_file_extent = 1;
3383 			ret = relocate_data_extent(rc->data_inode,
3384 						   &key, cluster);
3385 			if (ret < 0) {
3386 				err = ret;
3387 				break;
3388 			}
3389 		}
3390 	}
3391 	btrfs_free_path(path);
3392 
3393 	if (trans) {
3394 		nr = trans->blocks_used;
3395 		btrfs_end_transaction(trans, rc->extent_root);
3396 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3397 	}
3398 
3399 	if (!err) {
3400 		ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3401 		if (ret < 0)
3402 			err = ret;
3403 	}
3404 
3405 	kfree(cluster);
3406 
3407 	rc->create_reloc_root = 0;
3408 	smp_mb();
3409 
3410 	if (rc->extents_found > 0) {
3411 		trans = btrfs_start_transaction(rc->extent_root, 1);
3412 		btrfs_commit_transaction(trans, rc->extent_root);
3413 	}
3414 
3415 	merge_reloc_roots(rc);
3416 
3417 	unset_reloc_control(rc);
3418 
3419 	/* get rid of pinned extents */
3420 	trans = btrfs_start_transaction(rc->extent_root, 1);
3421 	btrfs_commit_transaction(trans, rc->extent_root);
3422 
3423 	return err;
3424 }
3425 
3426 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3427 				 struct btrfs_root *root, u64 objectid)
3428 {
3429 	struct btrfs_path *path;
3430 	struct btrfs_inode_item *item;
3431 	struct extent_buffer *leaf;
3432 	int ret;
3433 
3434 	path = btrfs_alloc_path();
3435 	if (!path)
3436 		return -ENOMEM;
3437 
3438 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3439 	if (ret)
3440 		goto out;
3441 
3442 	leaf = path->nodes[0];
3443 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3444 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3445 	btrfs_set_inode_generation(leaf, item, 1);
3446 	btrfs_set_inode_size(leaf, item, 0);
3447 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3448 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3449 	btrfs_mark_buffer_dirty(leaf);
3450 	btrfs_release_path(root, path);
3451 out:
3452 	btrfs_free_path(path);
3453 	return ret;
3454 }
3455 
3456 /*
3457  * helper to create an inode for data relocation.
3458  * the inode lives in the data relocation tree and its link count is 0
3459  */
3460 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3461 					struct btrfs_block_group_cache *group)
3462 {
3463 	struct inode *inode = NULL;
3464 	struct btrfs_trans_handle *trans;
3465 	struct btrfs_root *root;
3466 	struct btrfs_key key;
3467 	unsigned long nr;
3468 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3469 	int err = 0;
3470 
3471 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3472 	if (IS_ERR(root))
3473 		return ERR_CAST(root);
3474 
3475 	trans = btrfs_start_transaction(root, 1);
3476 	BUG_ON(!trans);
3477 
3478 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3479 	if (err)
3480 		goto out;
3481 
3482 	err = __insert_orphan_inode(trans, root, objectid);
3483 	BUG_ON(err);
3484 
3485 	key.objectid = objectid;
3486 	key.type = BTRFS_INODE_ITEM_KEY;
3487 	key.offset = 0;
3488 	inode = btrfs_iget(root->fs_info->sb, &key, root);
3489 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3490 	BTRFS_I(inode)->index_cnt = group->key.objectid;
3491 
3492 	err = btrfs_orphan_add(trans, inode);
3493 out:
3494 	nr = trans->blocks_used;
3495 	btrfs_end_transaction(trans, root);
3496 
3497 	btrfs_btree_balance_dirty(root, nr);
3498 	if (err) {
3499 		if (inode)
3500 			iput(inode);
3501 		inode = ERR_PTR(err);
3502 	}
3503 	return inode;
3504 }
3505 
3506 /*
3507  * function to relocate all extents in a block group.
3508  */
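/*
 * relocation works in two stages: MOVE_DATA_EXTENTS copies the data
 * and relocates tree blocks, then UPDATE_DATA_PTRS rewrites the file
 * extent pointers.  the stages alternate until no unprocessed extent
 * remains in the block group.
 */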
3509 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3510 {
3511 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3512 	struct reloc_control *rc;
3513 	int ret;
3514 	int err = 0;
3515 
3516 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3517 	if (!rc)
3518 		return -ENOMEM;
3519 
3520 	mapping_tree_init(&rc->reloc_root_tree);
3521 	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3522 	INIT_LIST_HEAD(&rc->reloc_roots);
3523 
3524 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3525 	BUG_ON(!rc->block_group);
3526 
3527 	btrfs_init_workers(&rc->workers, "relocate",
3528 			   fs_info->thread_pool_size, NULL);
3529 
3530 	rc->extent_root = extent_root;
3531 	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3532 
3533 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3534 	if (IS_ERR(rc->data_inode)) {
3535 		err = PTR_ERR(rc->data_inode);
3536 		rc->data_inode = NULL;
3537 		goto out;
3538 	}
3539 
3540 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3541 	       (unsigned long long)rc->block_group->key.objectid,
3542 	       (unsigned long long)rc->block_group->flags);
3543 
3544 	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
3545 	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3546 
3547 	while (1) {
3548 		rc->extents_found = 0;
3549 		rc->extents_skipped = 0;
3550 
3551 		mutex_lock(&fs_info->cleaner_mutex);
3552 
3553 		btrfs_clean_old_snapshots(fs_info->tree_root);
3554 		ret = relocate_block_group(rc);
3555 
3556 		mutex_unlock(&fs_info->cleaner_mutex);
3557 		if (ret < 0) {
3558 			err = ret;
3559 			break;
3560 		}
3561 
3562 		if (rc->extents_found == 0)
3563 			break;
3564 
3565 		printk(KERN_INFO "btrfs: found %llu extents\n",
3566 			(unsigned long long)rc->extents_found);
3567 
3568 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3569 			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3570 			invalidate_mapping_pages(rc->data_inode->i_mapping,
3571 						 0, -1);
3572 			rc->stage = UPDATE_DATA_PTRS;
3573 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3574 			   rc->extents_skipped >= rc->extents_found) {
3575 			iput(rc->data_inode);
3576 			rc->data_inode = create_reloc_inode(fs_info,
3577 							    rc->block_group);
3578 			if (IS_ERR(rc->data_inode)) {
3579 				err = PTR_ERR(rc->data_inode);
3580 				rc->data_inode = NULL;
3581 				break;
3582 			}
3583 			rc->stage = MOVE_DATA_EXTENTS;
3584 			rc->found_file_extent = 0;
3585 		}
3586 	}
3587 
3588 	filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3589 				     rc->block_group->key.objectid,
3590 				     rc->block_group->key.objectid +
3591 				     rc->block_group->key.offset - 1);
3592 
3593 	WARN_ON(rc->block_group->pinned > 0);
3594 	WARN_ON(rc->block_group->reserved > 0);
3595 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3596 out:
3597 	iput(rc->data_inode);
3598 	btrfs_stop_workers(&rc->workers);
3599 	btrfs_put_block_group(rc->block_group);
3600 	kfree(rc);
3601 	return err;
3602 }
3603 
3604 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3605 {
3606 	struct btrfs_trans_handle *trans;
3607 	int ret;
3608 
3609 	trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3610 
3611 	memset(&root->root_item.drop_progress, 0,
3612 		sizeof(root->root_item.drop_progress));
3613 	root->root_item.drop_level = 0;
3614 	btrfs_set_root_refs(&root->root_item, 0);
3615 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
3616 				&root->root_key, &root->root_item);
3617 	BUG_ON(ret);
3618 
3619 	ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3620 	BUG_ON(ret);
3621 	return 0;
3622 }
3623 
3624 /*
3625  * recover relocation interrupted by a system crash.
3626  *
3627  * this function resumes merging reloc trees with their corresponding
3628  * fs trees. this is important for preserving the sharing of tree blocks
3629  */
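/*
 * reloc root items survive in the root tree across a crash.  they
 * are read back here, re-attached to their fs roots (or marked as
 * garbage when the fs root no longer exists) and then merged as in
 * the normal relocation path.
 */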
3630 int btrfs_recover_relocation(struct btrfs_root *root)
3631 {
3632 	LIST_HEAD(reloc_roots);
3633 	struct btrfs_key key;
3634 	struct btrfs_root *fs_root;
3635 	struct btrfs_root *reloc_root;
3636 	struct btrfs_path *path;
3637 	struct extent_buffer *leaf;
3638 	struct reloc_control *rc = NULL;
3639 	struct btrfs_trans_handle *trans;
3640 	int ret;
3641 	int err = 0;
3642 
3643 	path = btrfs_alloc_path();
3644 	if (!path)
3645 		return -ENOMEM;
3646 
3647 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3648 	key.type = BTRFS_ROOT_ITEM_KEY;
3649 	key.offset = (u64)-1;
3650 
3651 	while (1) {
3652 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3653 					path, 0, 0);
3654 		if (ret < 0) {
3655 			err = ret;
3656 			goto out;
3657 		}
3658 		if (ret > 0) {
3659 			if (path->slots[0] == 0)
3660 				break;
3661 			path->slots[0]--;
3662 		}
3663 		leaf = path->nodes[0];
3664 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3665 		btrfs_release_path(root->fs_info->tree_root, path);
3666 
3667 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3668 		    key.type != BTRFS_ROOT_ITEM_KEY)
3669 			break;
3670 
3671 		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3672 		if (IS_ERR(reloc_root)) {
3673 			err = PTR_ERR(reloc_root);
3674 			goto out;
3675 		}
3676 
3677 		list_add(&reloc_root->root_list, &reloc_roots);
3678 
3679 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3680 			fs_root = read_fs_root(root->fs_info,
3681 					       reloc_root->root_key.offset);
3682 			if (IS_ERR(fs_root)) {
3683 				ret = PTR_ERR(fs_root);
3684 				if (ret != -ENOENT) {
3685 					err = ret;
3686 					goto out;
3687 				}
3688 				mark_garbage_root(reloc_root);
3689 			}
3690 		}
3691 
3692 		if (key.offset == 0)
3693 			break;
3694 
3695 		key.offset--;
3696 	}
3697 	btrfs_release_path(root->fs_info->tree_root, path);
3698 
3699 	if (list_empty(&reloc_roots))
3700 		goto out;
3701 
3702 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3703 	if (!rc) {
3704 		err = -ENOMEM;
3705 		goto out;
3706 	}
3707 
3708 	mapping_tree_init(&rc->reloc_root_tree);
3709 	INIT_LIST_HEAD(&rc->reloc_roots);
3710 	btrfs_init_workers(&rc->workers, "relocate",
3711 			   root->fs_info->thread_pool_size, NULL);
3712 	rc->extent_root = root->fs_info->extent_root;
3713 
3714 	set_reloc_control(rc);
3715 
3716 	while (!list_empty(&reloc_roots)) {
3717 		reloc_root = list_entry(reloc_roots.next,
3718 					struct btrfs_root, root_list);
3719 		list_del(&reloc_root->root_list);
3720 
3721 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3722 			list_add_tail(&reloc_root->root_list,
3723 				      &rc->reloc_roots);
3724 			continue;
3725 		}
3726 
3727 		fs_root = read_fs_root(root->fs_info,
3728 				       reloc_root->root_key.offset);
3729 		BUG_ON(IS_ERR(fs_root));
3730 
3731 		__add_reloc_root(reloc_root);
3732 		fs_root->reloc_root = reloc_root;
3733 	}
3734 
3735 	trans = btrfs_start_transaction(rc->extent_root, 1);
3736 	btrfs_commit_transaction(trans, rc->extent_root);
3737 
3738 	merge_reloc_roots(rc);
3739 
3740 	unset_reloc_control(rc);
3741 
3742 	trans = btrfs_start_transaction(rc->extent_root, 1);
3743 	btrfs_commit_transaction(trans, rc->extent_root);
3744 out:
3745 	if (rc) {
3746 		btrfs_stop_workers(&rc->workers);
3747 		kfree(rc);
3748 	}
3749 	while (!list_empty(&reloc_roots)) {
3750 		reloc_root = list_entry(reloc_roots.next,
3751 					struct btrfs_root, root_list);
3752 		list_del(&reloc_root->root_list);
3753 		free_extent_buffer(reloc_root->node);
3754 		free_extent_buffer(reloc_root->commit_root);
3755 		kfree(reloc_root);
3756 	}
3757 	btrfs_free_path(path);
3758 
3759 	if (err == 0) {
3760 		/* cleanup orphan inode in data relocation tree */
3761 		fs_root = read_fs_root(root->fs_info,
3762 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
3763 		if (IS_ERR(fs_root))
3764 			err = PTR_ERR(fs_root);
		else
3765 			btrfs_orphan_cleanup(fs_root);
3766 	}
3767 	return err;
3768 }
3769 
3770 /*
3771  * helper to add ordered checksums for data relocation.
3772  *
3773  * cloning the checksums properly handles nodatasum extents.
3774  * it also saves the CPU time of re-calculating the checksums.
3775  */
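/*
 * the relocation inode maps file offsets one to one to the old disk
 * bytenrs (offset by index_cnt), so the existing checksums can be
 * looked up in the csum tree and rebased onto the newly allocated
 * ordered extent.
 */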
3776 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3777 {
3778 	struct btrfs_ordered_sum *sums;
3779 	struct btrfs_sector_sum *sector_sum;
3780 	struct btrfs_ordered_extent *ordered;
3781 	struct btrfs_root *root = BTRFS_I(inode)->root;
3782 	size_t offset;
3783 	int ret;
3784 	u64 disk_bytenr;
3785 	LIST_HEAD(list);
3786 
3787 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3788 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3789 
3790 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3791 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3792 				       disk_bytenr + len - 1, &list);
3793 
3794 	while (!list_empty(&list)) {
3795 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3796 		list_del_init(&sums->list);
3797 
3798 		sector_sum = sums->sums;
3799 		sums->bytenr = ordered->start;
3800 
3801 		offset = 0;
3802 		while (offset < sums->len) {
3803 			sector_sum->bytenr += ordered->start - disk_bytenr;
3804 			sector_sum++;
3805 			offset += root->sectorsize;
3806 		}
3807 
3808 		btrfs_add_ordered_sum(inode, ordered, sums);
3809 	}
3810 	btrfs_put_ordered_extent(ordered);
3811 	return 0;
3812 }
3813