xref: /openbmc/linux/fs/btrfs/relocation.c (revision 4dc7ccf7)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 
33 /*
34  * backref_node, mapping_node and tree_block start with this
35  */
36 struct tree_entry {
37 	struct rb_node rb_node;
38 	u64 bytenr;
39 };
40 
41 /*
42  * present a tree block in the backref cache
43  */
44 struct backref_node {
45 	struct rb_node rb_node;
46 	u64 bytenr;
47 	/* objectid tree block owner */
48 	u64 owner;
49 	/* list of upper level blocks reference this block */
50 	struct list_head upper;
51 	/* list of child blocks in the cache */
52 	struct list_head lower;
53 	/* NULL if this node is not tree root */
54 	struct btrfs_root *root;
55 	/* extent buffer got by COW the block */
56 	struct extent_buffer *eb;
57 	/* level of tree block */
58 	unsigned int level:8;
59 	/* 1 if the block is root of old snapshot */
60 	unsigned int old_root:1;
61 	/* 1 if no child blocks in the cache */
62 	unsigned int lowest:1;
63 	/* is the extent buffer locked */
64 	unsigned int locked:1;
65 	/* has the block been processed */
66 	unsigned int processed:1;
67 	/* have backrefs of this block been checked */
68 	unsigned int checked:1;
69 };
70 
71 /*
72  * present a block pointer in the backref cache
73  */
74 struct backref_edge {
75 	struct list_head list[2];
76 	struct backref_node *node[2];
77 	u64 blockptr;
78 };
79 
80 #define LOWER	0
81 #define UPPER	1
82 
83 struct backref_cache {
84 	/* red black tree of all backref nodes in the cache */
85 	struct rb_root rb_root;
86 	/* list of backref nodes with no child block in the cache */
87 	struct list_head pending[BTRFS_MAX_LEVEL];
88 	spinlock_t lock;
89 };
90 
91 /*
92  * map address of tree root to tree
93  */
94 struct mapping_node {
95 	struct rb_node rb_node;
96 	u64 bytenr;
97 	void *data;
98 };
99 
100 struct mapping_tree {
101 	struct rb_root rb_root;
102 	spinlock_t lock;
103 };
104 
105 /*
106  * present a tree block to process
107  */
108 struct tree_block {
109 	struct rb_node rb_node;
110 	u64 bytenr;
111 	struct btrfs_key key;
112 	unsigned int level:8;
113 	unsigned int key_ready:1;
114 };
115 
116 /* inode vector */
117 #define INODEVEC_SIZE 16
118 
119 struct inodevec {
120 	struct list_head list;
121 	struct inode *inode[INODEVEC_SIZE];
122 	int nr;
123 };
124 
125 #define MAX_EXTENTS 128
126 
127 struct file_extent_cluster {
128 	u64 start;
129 	u64 end;
130 	u64 boundary[MAX_EXTENTS];
131 	unsigned int nr;
132 };
133 
134 struct reloc_control {
135 	/* block group to relocate */
136 	struct btrfs_block_group_cache *block_group;
137 	/* extent tree */
138 	struct btrfs_root *extent_root;
139 	/* inode for moving data */
140 	struct inode *data_inode;
141 	struct btrfs_workers workers;
142 	/* tree blocks have been processed */
143 	struct extent_io_tree processed_blocks;
144 	/* map start of tree root to corresponding reloc tree */
145 	struct mapping_tree reloc_root_tree;
146 	/* list of reloc trees */
147 	struct list_head reloc_roots;
148 	u64 search_start;
149 	u64 extents_found;
150 	u64 extents_skipped;
151 	int stage;
152 	int create_reloc_root;
153 	unsigned int found_file_extent:1;
154 	unsigned int found_old_snapshot:1;
155 };
156 
157 /* stages of data relocation */
158 #define MOVE_DATA_EXTENTS	0
159 #define UPDATE_DATA_PTRS	1
160 
161 /*
162  * merge reloc tree to corresponding fs tree in worker threads
163  */
164 struct async_merge {
165 	struct btrfs_work work;
166 	struct reloc_control *rc;
167 	struct btrfs_root *root;
168 	struct completion *done;
169 	atomic_t *num_pending;
170 };
171 
172 static void mapping_tree_init(struct mapping_tree *tree)
173 {
174 	tree->rb_root = RB_ROOT;
175 	spin_lock_init(&tree->lock);
176 }
177 
178 static void backref_cache_init(struct backref_cache *cache)
179 {
180 	int i;
181 	cache->rb_root = RB_ROOT;
182 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 		INIT_LIST_HEAD(&cache->pending[i]);
184 	spin_lock_init(&cache->lock);
185 }
186 
187 static void backref_node_init(struct backref_node *node)
188 {
189 	memset(node, 0, sizeof(*node));
190 	INIT_LIST_HEAD(&node->upper);
191 	INIT_LIST_HEAD(&node->lower);
192 	RB_CLEAR_NODE(&node->rb_node);
193 }
194 
195 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
196 				   struct rb_node *node)
197 {
198 	struct rb_node **p = &root->rb_node;
199 	struct rb_node *parent = NULL;
200 	struct tree_entry *entry;
201 
202 	while (*p) {
203 		parent = *p;
204 		entry = rb_entry(parent, struct tree_entry, rb_node);
205 
206 		if (bytenr < entry->bytenr)
207 			p = &(*p)->rb_left;
208 		else if (bytenr > entry->bytenr)
209 			p = &(*p)->rb_right;
210 		else
211 			return parent;
212 	}
213 
214 	rb_link_node(node, parent, p);
215 	rb_insert_color(node, root);
216 	return NULL;
217 }
218 
219 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
220 {
221 	struct rb_node *n = root->rb_node;
222 	struct tree_entry *entry;
223 
224 	while (n) {
225 		entry = rb_entry(n, struct tree_entry, rb_node);
226 
227 		if (bytenr < entry->bytenr)
228 			n = n->rb_left;
229 		else if (bytenr > entry->bytenr)
230 			n = n->rb_right;
231 		else
232 			return n;
233 	}
234 	return NULL;
235 }
236 
237 /*
238  * walk up backref nodes until reach node presents tree root
239  */
240 static struct backref_node *walk_up_backref(struct backref_node *node,
241 					    struct backref_edge *edges[],
242 					    int *index)
243 {
244 	struct backref_edge *edge;
245 	int idx = *index;
246 
247 	while (!list_empty(&node->upper)) {
248 		edge = list_entry(node->upper.next,
249 				  struct backref_edge, list[LOWER]);
250 		edges[idx++] = edge;
251 		node = edge->node[UPPER];
252 	}
253 	*index = idx;
254 	return node;
255 }
256 
257 /*
258  * walk down backref nodes to find start of next reference path
259  */
260 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
261 					      int *index)
262 {
263 	struct backref_edge *edge;
264 	struct backref_node *lower;
265 	int idx = *index;
266 
267 	while (idx > 0) {
268 		edge = edges[idx - 1];
269 		lower = edge->node[LOWER];
270 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
271 			idx--;
272 			continue;
273 		}
274 		edge = list_entry(edge->list[LOWER].next,
275 				  struct backref_edge, list[LOWER]);
276 		edges[idx - 1] = edge;
277 		*index = idx;
278 		return edge->node[UPPER];
279 	}
280 	*index = 0;
281 	return NULL;
282 }
283 
284 static void drop_node_buffer(struct backref_node *node)
285 {
286 	if (node->eb) {
287 		if (node->locked) {
288 			btrfs_tree_unlock(node->eb);
289 			node->locked = 0;
290 		}
291 		free_extent_buffer(node->eb);
292 		node->eb = NULL;
293 	}
294 }
295 
296 static void drop_backref_node(struct backref_cache *tree,
297 			      struct backref_node *node)
298 {
299 	BUG_ON(!node->lowest);
300 	BUG_ON(!list_empty(&node->upper));
301 
302 	drop_node_buffer(node);
303 	list_del(&node->lower);
304 
305 	rb_erase(&node->rb_node, &tree->rb_root);
306 	kfree(node);
307 }
308 
309 /*
310  * remove a backref node from the backref cache
311  */
312 static void remove_backref_node(struct backref_cache *cache,
313 				struct backref_node *node)
314 {
315 	struct backref_node *upper;
316 	struct backref_edge *edge;
317 
318 	if (!node)
319 		return;
320 
321 	BUG_ON(!node->lowest);
322 	while (!list_empty(&node->upper)) {
323 		edge = list_entry(node->upper.next, struct backref_edge,
324 				  list[LOWER]);
325 		upper = edge->node[UPPER];
326 		list_del(&edge->list[LOWER]);
327 		list_del(&edge->list[UPPER]);
328 		kfree(edge);
329 		/*
330 		 * add the node to pending list if no other
331 		 * child block cached.
332 		 */
333 		if (list_empty(&upper->lower)) {
334 			list_add_tail(&upper->lower,
335 				      &cache->pending[upper->level]);
336 			upper->lowest = 1;
337 		}
338 	}
339 	drop_backref_node(cache, node);
340 }
341 
342 /*
343  * find reloc tree by address of tree root
344  */
345 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
346 					  u64 bytenr)
347 {
348 	struct rb_node *rb_node;
349 	struct mapping_node *node;
350 	struct btrfs_root *root = NULL;
351 
352 	spin_lock(&rc->reloc_root_tree.lock);
353 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
354 	if (rb_node) {
355 		node = rb_entry(rb_node, struct mapping_node, rb_node);
356 		root = (struct btrfs_root *)node->data;
357 	}
358 	spin_unlock(&rc->reloc_root_tree.lock);
359 	return root;
360 }
361 
362 static int is_cowonly_root(u64 root_objectid)
363 {
364 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
365 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
366 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
367 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
368 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
369 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
370 		return 1;
371 	return 0;
372 }
373 
374 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
375 					u64 root_objectid)
376 {
377 	struct btrfs_key key;
378 
379 	key.objectid = root_objectid;
380 	key.type = BTRFS_ROOT_ITEM_KEY;
381 	if (is_cowonly_root(root_objectid))
382 		key.offset = 0;
383 	else
384 		key.offset = (u64)-1;
385 
386 	return btrfs_read_fs_root_no_name(fs_info, &key);
387 }
388 
389 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
390 static noinline_for_stack
391 struct btrfs_root *find_tree_root(struct reloc_control *rc,
392 				  struct extent_buffer *leaf,
393 				  struct btrfs_extent_ref_v0 *ref0)
394 {
395 	struct btrfs_root *root;
396 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
397 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
398 
399 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
400 
401 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
402 	BUG_ON(IS_ERR(root));
403 
404 	if (root->ref_cows &&
405 	    generation != btrfs_root_generation(&root->root_item))
406 		return NULL;
407 
408 	return root;
409 }
410 #endif
411 
412 static noinline_for_stack
413 int find_inline_backref(struct extent_buffer *leaf, int slot,
414 			unsigned long *ptr, unsigned long *end)
415 {
416 	struct btrfs_extent_item *ei;
417 	struct btrfs_tree_block_info *bi;
418 	u32 item_size;
419 
420 	item_size = btrfs_item_size_nr(leaf, slot);
421 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
422 	if (item_size < sizeof(*ei)) {
423 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
424 		return 1;
425 	}
426 #endif
427 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
428 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
429 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
430 
431 	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
432 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
433 		return 1;
434 	}
435 
436 	bi = (struct btrfs_tree_block_info *)(ei + 1);
437 	*ptr = (unsigned long)(bi + 1);
438 	*end = (unsigned long)ei + item_size;
439 	return 0;
440 }
441 
442 /*
443  * build backref tree for a given tree block. root of the backref tree
444  * corresponds the tree block, leaves of the backref tree correspond
445  * roots of b-trees that reference the tree block.
446  *
447  * the basic idea of this function is check backrefs of a given block
448  * to find upper level blocks that refernece the block, and then check
449  * bakcrefs of these upper level blocks recursively. the recursion stop
450  * when tree root is reached or backrefs for the block is cached.
451  *
452  * NOTE: if we find backrefs for a block are cached, we know backrefs
453  * for all upper level blocks that directly/indirectly reference the
454  * block are also cached.
455  */
456 static struct backref_node *build_backref_tree(struct reloc_control *rc,
457 					       struct backref_cache *cache,
458 					       struct btrfs_key *node_key,
459 					       int level, u64 bytenr)
460 {
461 	struct btrfs_path *path1;
462 	struct btrfs_path *path2;
463 	struct extent_buffer *eb;
464 	struct btrfs_root *root;
465 	struct backref_node *cur;
466 	struct backref_node *upper;
467 	struct backref_node *lower;
468 	struct backref_node *node = NULL;
469 	struct backref_node *exist = NULL;
470 	struct backref_edge *edge;
471 	struct rb_node *rb_node;
472 	struct btrfs_key key;
473 	unsigned long end;
474 	unsigned long ptr;
475 	LIST_HEAD(list);
476 	int ret;
477 	int err = 0;
478 
479 	path1 = btrfs_alloc_path();
480 	path2 = btrfs_alloc_path();
481 	if (!path1 || !path2) {
482 		err = -ENOMEM;
483 		goto out;
484 	}
485 
486 	node = kmalloc(sizeof(*node), GFP_NOFS);
487 	if (!node) {
488 		err = -ENOMEM;
489 		goto out;
490 	}
491 
492 	backref_node_init(node);
493 	node->bytenr = bytenr;
494 	node->owner = 0;
495 	node->level = level;
496 	node->lowest = 1;
497 	cur = node;
498 again:
499 	end = 0;
500 	ptr = 0;
501 	key.objectid = cur->bytenr;
502 	key.type = BTRFS_EXTENT_ITEM_KEY;
503 	key.offset = (u64)-1;
504 
505 	path1->search_commit_root = 1;
506 	path1->skip_locking = 1;
507 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
508 				0, 0);
509 	if (ret < 0) {
510 		err = ret;
511 		goto out;
512 	}
513 	BUG_ON(!ret || !path1->slots[0]);
514 
515 	path1->slots[0]--;
516 
517 	WARN_ON(cur->checked);
518 	if (!list_empty(&cur->upper)) {
519 		/*
520 		 * the backref was added previously when processsing
521 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
522 		 */
523 		BUG_ON(!list_is_singular(&cur->upper));
524 		edge = list_entry(cur->upper.next, struct backref_edge,
525 				  list[LOWER]);
526 		BUG_ON(!list_empty(&edge->list[UPPER]));
527 		exist = edge->node[UPPER];
528 		/*
529 		 * add the upper level block to pending list if we need
530 		 * check its backrefs
531 		 */
532 		if (!exist->checked)
533 			list_add_tail(&edge->list[UPPER], &list);
534 	} else {
535 		exist = NULL;
536 	}
537 
538 	while (1) {
539 		cond_resched();
540 		eb = path1->nodes[0];
541 
542 		if (ptr >= end) {
543 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
544 				ret = btrfs_next_leaf(rc->extent_root, path1);
545 				if (ret < 0) {
546 					err = ret;
547 					goto out;
548 				}
549 				if (ret > 0)
550 					break;
551 				eb = path1->nodes[0];
552 			}
553 
554 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
555 			if (key.objectid != cur->bytenr) {
556 				WARN_ON(exist);
557 				break;
558 			}
559 
560 			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
561 				ret = find_inline_backref(eb, path1->slots[0],
562 							  &ptr, &end);
563 				if (ret)
564 					goto next;
565 			}
566 		}
567 
568 		if (ptr < end) {
569 			/* update key for inline back ref */
570 			struct btrfs_extent_inline_ref *iref;
571 			iref = (struct btrfs_extent_inline_ref *)ptr;
572 			key.type = btrfs_extent_inline_ref_type(eb, iref);
573 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
574 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
575 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
576 		}
577 
578 		if (exist &&
579 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
580 		      exist->owner == key.offset) ||
581 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
582 		      exist->bytenr == key.offset))) {
583 			exist = NULL;
584 			goto next;
585 		}
586 
587 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 			if (key.objectid == key.offset &&
591 			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 				struct btrfs_extent_ref_v0 *ref0;
593 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 						struct btrfs_extent_ref_v0);
595 				root = find_tree_root(rc, eb, ref0);
596 				if (root)
597 					cur->root = root;
598 				else
599 					cur->old_root = 1;
600 				break;
601 			}
602 #else
603 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
604 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
605 #endif
606 			if (key.objectid == key.offset) {
607 				/*
608 				 * only root blocks of reloc trees use
609 				 * backref of this type.
610 				 */
611 				root = find_reloc_root(rc, cur->bytenr);
612 				BUG_ON(!root);
613 				cur->root = root;
614 				break;
615 			}
616 
617 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
618 			if (!edge) {
619 				err = -ENOMEM;
620 				goto out;
621 			}
622 			rb_node = tree_search(&cache->rb_root, key.offset);
623 			if (!rb_node) {
624 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
625 				if (!upper) {
626 					kfree(edge);
627 					err = -ENOMEM;
628 					goto out;
629 				}
630 				backref_node_init(upper);
631 				upper->bytenr = key.offset;
632 				upper->owner = 0;
633 				upper->level = cur->level + 1;
634 				/*
635 				 *  backrefs for the upper level block isn't
636 				 *  cached, add the block to pending list
637 				 */
638 				list_add_tail(&edge->list[UPPER], &list);
639 			} else {
640 				upper = rb_entry(rb_node, struct backref_node,
641 						 rb_node);
642 				INIT_LIST_HEAD(&edge->list[UPPER]);
643 			}
644 			list_add(&edge->list[LOWER], &cur->upper);
645 			edge->node[UPPER] = upper;
646 			edge->node[LOWER] = cur;
647 
648 			goto next;
649 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
650 			goto next;
651 		}
652 
653 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
654 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
655 		if (IS_ERR(root)) {
656 			err = PTR_ERR(root);
657 			goto out;
658 		}
659 
660 		if (btrfs_root_level(&root->root_item) == cur->level) {
661 			/* tree root */
662 			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 			       cur->bytenr);
664 			cur->root = root;
665 			break;
666 		}
667 
668 		level = cur->level + 1;
669 
670 		/*
671 		 * searching the tree to find upper level blocks
672 		 * reference the block.
673 		 */
674 		path2->search_commit_root = 1;
675 		path2->skip_locking = 1;
676 		path2->lowest_level = level;
677 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
678 		path2->lowest_level = 0;
679 		if (ret < 0) {
680 			err = ret;
681 			goto out;
682 		}
683 		if (ret > 0 && path2->slots[level] > 0)
684 			path2->slots[level]--;
685 
686 		eb = path2->nodes[level];
687 		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
688 			cur->bytenr);
689 
690 		lower = cur;
691 		for (; level < BTRFS_MAX_LEVEL; level++) {
692 			if (!path2->nodes[level]) {
693 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 				       lower->bytenr);
695 				lower->root = root;
696 				break;
697 			}
698 
699 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
700 			if (!edge) {
701 				err = -ENOMEM;
702 				goto out;
703 			}
704 
705 			eb = path2->nodes[level];
706 			rb_node = tree_search(&cache->rb_root, eb->start);
707 			if (!rb_node) {
708 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
709 				if (!upper) {
710 					kfree(edge);
711 					err = -ENOMEM;
712 					goto out;
713 				}
714 				backref_node_init(upper);
715 				upper->bytenr = eb->start;
716 				upper->owner = btrfs_header_owner(eb);
717 				upper->level = lower->level + 1;
718 
719 				/*
720 				 * if we know the block isn't shared
721 				 * we can void checking its backrefs.
722 				 */
723 				if (btrfs_block_can_be_shared(root, eb))
724 					upper->checked = 0;
725 				else
726 					upper->checked = 1;
727 
728 				/*
729 				 * add the block to pending list if we
730 				 * need check its backrefs. only block
731 				 * at 'cur->level + 1' is added to the
732 				 * tail of pending list. this guarantees
733 				 * we check backrefs from lower level
734 				 * blocks to upper level blocks.
735 				 */
736 				if (!upper->checked &&
737 				    level == cur->level + 1) {
738 					list_add_tail(&edge->list[UPPER],
739 						      &list);
740 				} else
741 					INIT_LIST_HEAD(&edge->list[UPPER]);
742 			} else {
743 				upper = rb_entry(rb_node, struct backref_node,
744 						 rb_node);
745 				BUG_ON(!upper->checked);
746 				INIT_LIST_HEAD(&edge->list[UPPER]);
747 			}
748 			list_add_tail(&edge->list[LOWER], &lower->upper);
749 			edge->node[UPPER] = upper;
750 			edge->node[LOWER] = lower;
751 
752 			if (rb_node)
753 				break;
754 			lower = upper;
755 			upper = NULL;
756 		}
757 		btrfs_release_path(root, path2);
758 next:
759 		if (ptr < end) {
760 			ptr += btrfs_extent_inline_ref_size(key.type);
761 			if (ptr >= end) {
762 				WARN_ON(ptr > end);
763 				ptr = 0;
764 				end = 0;
765 			}
766 		}
767 		if (ptr >= end)
768 			path1->slots[0]++;
769 	}
770 	btrfs_release_path(rc->extent_root, path1);
771 
772 	cur->checked = 1;
773 	WARN_ON(exist);
774 
775 	/* the pending list isn't empty, take the first block to process */
776 	if (!list_empty(&list)) {
777 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
778 		list_del_init(&edge->list[UPPER]);
779 		cur = edge->node[UPPER];
780 		goto again;
781 	}
782 
783 	/*
784 	 * everything goes well, connect backref nodes and insert backref nodes
785 	 * into the cache.
786 	 */
787 	BUG_ON(!node->checked);
788 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
789 	BUG_ON(rb_node);
790 
791 	list_for_each_entry(edge, &node->upper, list[LOWER])
792 		list_add_tail(&edge->list[UPPER], &list);
793 
794 	while (!list_empty(&list)) {
795 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 		list_del_init(&edge->list[UPPER]);
797 		upper = edge->node[UPPER];
798 
799 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 			if (upper->lowest) {
801 				list_del_init(&upper->lower);
802 				upper->lowest = 0;
803 			}
804 
805 			list_add_tail(&edge->list[UPPER], &upper->lower);
806 			continue;
807 		}
808 
809 		BUG_ON(!upper->checked);
810 		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
811 				      &upper->rb_node);
812 		BUG_ON(rb_node);
813 
814 		list_add_tail(&edge->list[UPPER], &upper->lower);
815 
816 		list_for_each_entry(edge, &upper->upper, list[LOWER])
817 			list_add_tail(&edge->list[UPPER], &list);
818 	}
819 out:
820 	btrfs_free_path(path1);
821 	btrfs_free_path(path2);
822 	if (err) {
823 		INIT_LIST_HEAD(&list);
824 		upper = node;
825 		while (upper) {
826 			if (RB_EMPTY_NODE(&upper->rb_node)) {
827 				list_splice_tail(&upper->upper, &list);
828 				kfree(upper);
829 			}
830 
831 			if (list_empty(&list))
832 				break;
833 
834 			edge = list_entry(list.next, struct backref_edge,
835 					  list[LOWER]);
836 			upper = edge->node[UPPER];
837 			kfree(edge);
838 		}
839 		return ERR_PTR(err);
840 	}
841 	return node;
842 }
843 
844 /*
845  * helper to add 'address of tree root -> reloc tree' mapping
846  */
847 static int __add_reloc_root(struct btrfs_root *root)
848 {
849 	struct rb_node *rb_node;
850 	struct mapping_node *node;
851 	struct reloc_control *rc = root->fs_info->reloc_ctl;
852 
853 	node = kmalloc(sizeof(*node), GFP_NOFS);
854 	BUG_ON(!node);
855 
856 	node->bytenr = root->node->start;
857 	node->data = root;
858 
859 	spin_lock(&rc->reloc_root_tree.lock);
860 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
861 			      node->bytenr, &node->rb_node);
862 	spin_unlock(&rc->reloc_root_tree.lock);
863 	BUG_ON(rb_node);
864 
865 	list_add_tail(&root->root_list, &rc->reloc_roots);
866 	return 0;
867 }
868 
869 /*
870  * helper to update/delete the 'address of tree root -> reloc tree'
871  * mapping
872  */
873 static int __update_reloc_root(struct btrfs_root *root, int del)
874 {
875 	struct rb_node *rb_node;
876 	struct mapping_node *node = NULL;
877 	struct reloc_control *rc = root->fs_info->reloc_ctl;
878 
879 	spin_lock(&rc->reloc_root_tree.lock);
880 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
881 			      root->commit_root->start);
882 	if (rb_node) {
883 		node = rb_entry(rb_node, struct mapping_node, rb_node);
884 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
885 	}
886 	spin_unlock(&rc->reloc_root_tree.lock);
887 
888 	BUG_ON((struct btrfs_root *)node->data != root);
889 
890 	if (!del) {
891 		spin_lock(&rc->reloc_root_tree.lock);
892 		node->bytenr = root->node->start;
893 		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
894 				      node->bytenr, &node->rb_node);
895 		spin_unlock(&rc->reloc_root_tree.lock);
896 		BUG_ON(rb_node);
897 	} else {
898 		list_del_init(&root->root_list);
899 		kfree(node);
900 	}
901 	return 0;
902 }
903 
904 /*
905  * create reloc tree for a given fs tree. reloc tree is just a
906  * snapshot of the fs tree with special root objectid.
907  */
908 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 			  struct btrfs_root *root)
910 {
911 	struct btrfs_root *reloc_root;
912 	struct extent_buffer *eb;
913 	struct btrfs_root_item *root_item;
914 	struct btrfs_key root_key;
915 	int ret;
916 
917 	if (root->reloc_root) {
918 		reloc_root = root->reloc_root;
919 		reloc_root->last_trans = trans->transid;
920 		return 0;
921 	}
922 
923 	if (!root->fs_info->reloc_ctl ||
924 	    !root->fs_info->reloc_ctl->create_reloc_root ||
925 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 		return 0;
927 
928 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 	BUG_ON(!root_item);
930 
931 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 	root_key.type = BTRFS_ROOT_ITEM_KEY;
933 	root_key.offset = root->root_key.objectid;
934 
935 	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
936 			      BTRFS_TREE_RELOC_OBJECTID);
937 	BUG_ON(ret);
938 
939 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 	memcpy(root_item, &root->root_item, sizeof(*root_item));
941 	btrfs_set_root_refs(root_item, 1);
942 	btrfs_set_root_bytenr(root_item, eb->start);
943 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 	btrfs_set_root_generation(root_item, trans->transid);
945 	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
946 	root_item->drop_level = 0;
947 
948 	btrfs_tree_unlock(eb);
949 	free_extent_buffer(eb);
950 
951 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
952 				&root_key, root_item);
953 	BUG_ON(ret);
954 	kfree(root_item);
955 
956 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
957 						 &root_key);
958 	BUG_ON(IS_ERR(reloc_root));
959 	reloc_root->last_trans = trans->transid;
960 
961 	__add_reloc_root(reloc_root);
962 	root->reloc_root = reloc_root;
963 	return 0;
964 }
965 
966 /*
967  * update root item of reloc tree
968  */
969 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
970 			    struct btrfs_root *root)
971 {
972 	struct btrfs_root *reloc_root;
973 	struct btrfs_root_item *root_item;
974 	int del = 0;
975 	int ret;
976 
977 	if (!root->reloc_root)
978 		return 0;
979 
980 	reloc_root = root->reloc_root;
981 	root_item = &reloc_root->root_item;
982 
983 	if (btrfs_root_refs(root_item) == 0) {
984 		root->reloc_root = NULL;
985 		del = 1;
986 	}
987 
988 	__update_reloc_root(reloc_root, del);
989 
990 	if (reloc_root->commit_root != reloc_root->node) {
991 		btrfs_set_root_node(root_item, reloc_root->node);
992 		free_extent_buffer(reloc_root->commit_root);
993 		reloc_root->commit_root = btrfs_root_node(reloc_root);
994 	}
995 
996 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
997 				&reloc_root->root_key, root_item);
998 	BUG_ON(ret);
999 	return 0;
1000 }
1001 
1002 /*
1003  * helper to find first cached inode with inode number >= objectid
1004  * in a subvolume
1005  */
1006 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1007 {
1008 	struct rb_node *node;
1009 	struct rb_node *prev;
1010 	struct btrfs_inode *entry;
1011 	struct inode *inode;
1012 
1013 	spin_lock(&root->inode_lock);
1014 again:
1015 	node = root->inode_tree.rb_node;
1016 	prev = NULL;
1017 	while (node) {
1018 		prev = node;
1019 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1020 
1021 		if (objectid < entry->vfs_inode.i_ino)
1022 			node = node->rb_left;
1023 		else if (objectid > entry->vfs_inode.i_ino)
1024 			node = node->rb_right;
1025 		else
1026 			break;
1027 	}
1028 	if (!node) {
1029 		while (prev) {
1030 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1031 			if (objectid <= entry->vfs_inode.i_ino) {
1032 				node = prev;
1033 				break;
1034 			}
1035 			prev = rb_next(prev);
1036 		}
1037 	}
1038 	while (node) {
1039 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1040 		inode = igrab(&entry->vfs_inode);
1041 		if (inode) {
1042 			spin_unlock(&root->inode_lock);
1043 			return inode;
1044 		}
1045 
1046 		objectid = entry->vfs_inode.i_ino + 1;
1047 		if (cond_resched_lock(&root->inode_lock))
1048 			goto again;
1049 
1050 		node = rb_next(node);
1051 	}
1052 	spin_unlock(&root->inode_lock);
1053 	return NULL;
1054 }
1055 
1056 static int in_block_group(u64 bytenr,
1057 			  struct btrfs_block_group_cache *block_group)
1058 {
1059 	if (bytenr >= block_group->key.objectid &&
1060 	    bytenr < block_group->key.objectid + block_group->key.offset)
1061 		return 1;
1062 	return 0;
1063 }
1064 
1065 /*
1066  * get new location of data
1067  */
1068 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1069 			    u64 bytenr, u64 num_bytes)
1070 {
1071 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1072 	struct btrfs_path *path;
1073 	struct btrfs_file_extent_item *fi;
1074 	struct extent_buffer *leaf;
1075 	int ret;
1076 
1077 	path = btrfs_alloc_path();
1078 	if (!path)
1079 		return -ENOMEM;
1080 
1081 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1082 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1083 				       bytenr, 0);
1084 	if (ret < 0)
1085 		goto out;
1086 	if (ret > 0) {
1087 		ret = -ENOENT;
1088 		goto out;
1089 	}
1090 
1091 	leaf = path->nodes[0];
1092 	fi = btrfs_item_ptr(leaf, path->slots[0],
1093 			    struct btrfs_file_extent_item);
1094 
1095 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1096 	       btrfs_file_extent_compression(leaf, fi) ||
1097 	       btrfs_file_extent_encryption(leaf, fi) ||
1098 	       btrfs_file_extent_other_encoding(leaf, fi));
1099 
1100 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1101 		ret = 1;
1102 		goto out;
1103 	}
1104 
1105 	if (new_bytenr)
1106 		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 	ret = 0;
1108 out:
1109 	btrfs_free_path(path);
1110 	return ret;
1111 }
1112 
1113 /*
1114  * update file extent items in the tree leaf to point to
1115  * the new locations.
1116  */
1117 static int replace_file_extents(struct btrfs_trans_handle *trans,
1118 				struct reloc_control *rc,
1119 				struct btrfs_root *root,
1120 				struct extent_buffer *leaf,
1121 				struct list_head *inode_list)
1122 {
1123 	struct btrfs_key key;
1124 	struct btrfs_file_extent_item *fi;
1125 	struct inode *inode = NULL;
1126 	struct inodevec *ivec = NULL;
1127 	u64 parent;
1128 	u64 bytenr;
1129 	u64 new_bytenr;
1130 	u64 num_bytes;
1131 	u64 end;
1132 	u32 nritems;
1133 	u32 i;
1134 	int ret;
1135 	int first = 1;
1136 	int dirty = 0;
1137 
1138 	if (rc->stage != UPDATE_DATA_PTRS)
1139 		return 0;
1140 
1141 	/* reloc trees always use full backref */
1142 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1143 		parent = leaf->start;
1144 	else
1145 		parent = 0;
1146 
1147 	nritems = btrfs_header_nritems(leaf);
1148 	for (i = 0; i < nritems; i++) {
1149 		cond_resched();
1150 		btrfs_item_key_to_cpu(leaf, &key, i);
1151 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1152 			continue;
1153 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1154 		if (btrfs_file_extent_type(leaf, fi) ==
1155 		    BTRFS_FILE_EXTENT_INLINE)
1156 			continue;
1157 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1158 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1159 		if (bytenr == 0)
1160 			continue;
1161 		if (!in_block_group(bytenr, rc->block_group))
1162 			continue;
1163 
1164 		/*
1165 		 * if we are modifying block in fs tree, wait for readpage
1166 		 * to complete and drop the extent cache
1167 		 */
1168 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 			if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 				BUG_ON(!ivec);
1172 				ivec->nr = 0;
1173 				list_add_tail(&ivec->list, inode_list);
1174 			}
1175 			if (first) {
1176 				inode = find_next_inode(root, key.objectid);
1177 				if (inode)
1178 					ivec->inode[ivec->nr++] = inode;
1179 				first = 0;
1180 			} else if (inode && inode->i_ino < key.objectid) {
1181 				inode = find_next_inode(root, key.objectid);
1182 				if (inode)
1183 					ivec->inode[ivec->nr++] = inode;
1184 			}
1185 			if (inode && inode->i_ino == key.objectid) {
1186 				end = key.offset +
1187 				      btrfs_file_extent_num_bytes(leaf, fi);
1188 				WARN_ON(!IS_ALIGNED(key.offset,
1189 						    root->sectorsize));
1190 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1191 				end--;
1192 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1193 						      key.offset, end,
1194 						      GFP_NOFS);
1195 				if (!ret)
1196 					continue;
1197 
1198 				btrfs_drop_extent_cache(inode, key.offset, end,
1199 							1);
1200 				unlock_extent(&BTRFS_I(inode)->io_tree,
1201 					      key.offset, end, GFP_NOFS);
1202 			}
1203 		}
1204 
1205 		ret = get_new_location(rc->data_inode, &new_bytenr,
1206 				       bytenr, num_bytes);
1207 		if (ret > 0)
1208 			continue;
1209 		BUG_ON(ret < 0);
1210 
1211 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1212 		dirty = 1;
1213 
1214 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1215 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1216 					   num_bytes, parent,
1217 					   btrfs_header_owner(leaf),
1218 					   key.objectid, key.offset);
1219 		BUG_ON(ret);
1220 
1221 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1222 					parent, btrfs_header_owner(leaf),
1223 					key.objectid, key.offset);
1224 		BUG_ON(ret);
1225 	}
1226 	if (dirty)
1227 		btrfs_mark_buffer_dirty(leaf);
1228 	return 0;
1229 }
1230 
1231 static noinline_for_stack
1232 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1233 		     struct btrfs_path *path, int level)
1234 {
1235 	struct btrfs_disk_key key1;
1236 	struct btrfs_disk_key key2;
1237 	btrfs_node_key(eb, &key1, slot);
1238 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1239 	return memcmp(&key1, &key2, sizeof(key1));
1240 }
1241 
1242 /*
1243  * try to replace tree blocks in fs tree with the new blocks
1244  * in reloc tree. tree blocks haven't been modified since the
1245  * reloc tree was create can be replaced.
1246  *
1247  * if a block was replaced, level of the block + 1 is returned.
1248  * if no block got replaced, 0 is returned. if there are other
1249  * errors, a negative error number is returned.
1250  */
1251 static int replace_path(struct btrfs_trans_handle *trans,
1252 			struct btrfs_root *dest, struct btrfs_root *src,
1253 			struct btrfs_path *path, struct btrfs_key *next_key,
1254 			struct extent_buffer **leaf,
1255 			int lowest_level, int max_level)
1256 {
1257 	struct extent_buffer *eb;
1258 	struct extent_buffer *parent;
1259 	struct btrfs_key key;
1260 	u64 old_bytenr;
1261 	u64 new_bytenr;
1262 	u64 old_ptr_gen;
1263 	u64 new_ptr_gen;
1264 	u64 last_snapshot;
1265 	u32 blocksize;
1266 	int level;
1267 	int ret;
1268 	int slot;
1269 
1270 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 	BUG_ON(lowest_level > 1 && leaf);
1273 
1274 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 
1276 	slot = path->slots[lowest_level];
1277 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 
1279 	eb = btrfs_lock_root_node(dest);
1280 	btrfs_set_lock_blocking(eb);
1281 	level = btrfs_header_level(eb);
1282 
1283 	if (level < lowest_level) {
1284 		btrfs_tree_unlock(eb);
1285 		free_extent_buffer(eb);
1286 		return 0;
1287 	}
1288 
1289 	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1290 	BUG_ON(ret);
1291 	btrfs_set_lock_blocking(eb);
1292 
1293 	if (next_key) {
1294 		next_key->objectid = (u64)-1;
1295 		next_key->type = (u8)-1;
1296 		next_key->offset = (u64)-1;
1297 	}
1298 
1299 	parent = eb;
1300 	while (1) {
1301 		level = btrfs_header_level(parent);
1302 		BUG_ON(level < lowest_level);
1303 
1304 		ret = btrfs_bin_search(parent, &key, level, &slot);
1305 		if (ret && slot > 0)
1306 			slot--;
1307 
1308 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1309 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1310 
1311 		old_bytenr = btrfs_node_blockptr(parent, slot);
1312 		blocksize = btrfs_level_size(dest, level - 1);
1313 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1314 
1315 		if (level <= max_level) {
1316 			eb = path->nodes[level];
1317 			new_bytenr = btrfs_node_blockptr(eb,
1318 							path->slots[level]);
1319 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1320 							path->slots[level]);
1321 		} else {
1322 			new_bytenr = 0;
1323 			new_ptr_gen = 0;
1324 		}
1325 
1326 		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1327 			WARN_ON(1);
1328 			ret = level;
1329 			break;
1330 		}
1331 
1332 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 		    memcmp_node_keys(parent, slot, path, level)) {
1334 			if (level <= lowest_level && !leaf) {
1335 				ret = 0;
1336 				break;
1337 			}
1338 
1339 			eb = read_tree_block(dest, old_bytenr, blocksize,
1340 					     old_ptr_gen);
1341 			btrfs_tree_lock(eb);
1342 			ret = btrfs_cow_block(trans, dest, eb, parent,
1343 					      slot, &eb);
1344 			BUG_ON(ret);
1345 			btrfs_set_lock_blocking(eb);
1346 
1347 			if (level <= lowest_level) {
1348 				*leaf = eb;
1349 				ret = 0;
1350 				break;
1351 			}
1352 
1353 			btrfs_tree_unlock(parent);
1354 			free_extent_buffer(parent);
1355 
1356 			parent = eb;
1357 			continue;
1358 		}
1359 
1360 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 				      path->slots[level]);
1362 		btrfs_release_path(src, path);
1363 
1364 		path->lowest_level = level;
1365 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1366 		path->lowest_level = 0;
1367 		BUG_ON(ret);
1368 
1369 		/*
1370 		 * swap blocks in fs tree and reloc tree.
1371 		 */
1372 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1373 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1374 		btrfs_mark_buffer_dirty(parent);
1375 
1376 		btrfs_set_node_blockptr(path->nodes[level],
1377 					path->slots[level], old_bytenr);
1378 		btrfs_set_node_ptr_generation(path->nodes[level],
1379 					      path->slots[level], old_ptr_gen);
1380 		btrfs_mark_buffer_dirty(path->nodes[level]);
1381 
1382 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1383 					path->nodes[level]->start,
1384 					src->root_key.objectid, level - 1, 0);
1385 		BUG_ON(ret);
1386 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1387 					0, dest->root_key.objectid, level - 1,
1388 					0);
1389 		BUG_ON(ret);
1390 
1391 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1392 					path->nodes[level]->start,
1393 					src->root_key.objectid, level - 1, 0);
1394 		BUG_ON(ret);
1395 
1396 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1397 					0, dest->root_key.objectid, level - 1,
1398 					0);
1399 		BUG_ON(ret);
1400 
1401 		btrfs_unlock_up_safe(path, 0);
1402 
1403 		ret = level;
1404 		break;
1405 	}
1406 	btrfs_tree_unlock(parent);
1407 	free_extent_buffer(parent);
1408 	return ret;
1409 }
1410 
1411 /*
1412  * helper to find next relocated block in reloc tree
1413  */
1414 static noinline_for_stack
1415 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1416 		       int *level)
1417 {
1418 	struct extent_buffer *eb;
1419 	int i;
1420 	u64 last_snapshot;
1421 	u32 nritems;
1422 
1423 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1424 
1425 	for (i = 0; i < *level; i++) {
1426 		free_extent_buffer(path->nodes[i]);
1427 		path->nodes[i] = NULL;
1428 	}
1429 
1430 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1431 		eb = path->nodes[i];
1432 		nritems = btrfs_header_nritems(eb);
1433 		while (path->slots[i] + 1 < nritems) {
1434 			path->slots[i]++;
1435 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1436 			    last_snapshot)
1437 				continue;
1438 
1439 			*level = i;
1440 			return 0;
1441 		}
1442 		free_extent_buffer(path->nodes[i]);
1443 		path->nodes[i] = NULL;
1444 	}
1445 	return 1;
1446 }
1447 
1448 /*
1449  * walk down reloc tree to find relocated block of lowest level
1450  */
1451 static noinline_for_stack
1452 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1453 			 int *level)
1454 {
1455 	struct extent_buffer *eb = NULL;
1456 	int i;
1457 	u64 bytenr;
1458 	u64 ptr_gen = 0;
1459 	u64 last_snapshot;
1460 	u32 blocksize;
1461 	u32 nritems;
1462 
1463 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1464 
1465 	for (i = *level; i > 0; i--) {
1466 		eb = path->nodes[i];
1467 		nritems = btrfs_header_nritems(eb);
1468 		while (path->slots[i] < nritems) {
1469 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1470 			if (ptr_gen > last_snapshot)
1471 				break;
1472 			path->slots[i]++;
1473 		}
1474 		if (path->slots[i] >= nritems) {
1475 			if (i == *level)
1476 				break;
1477 			*level = i + 1;
1478 			return 0;
1479 		}
1480 		if (i == 1) {
1481 			*level = i;
1482 			return 0;
1483 		}
1484 
1485 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1486 		blocksize = btrfs_level_size(root, i - 1);
1487 		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1488 		BUG_ON(btrfs_header_level(eb) != i - 1);
1489 		path->nodes[i - 1] = eb;
1490 		path->slots[i - 1] = 0;
1491 	}
1492 	return 1;
1493 }
1494 
1495 /*
1496  * invalidate extent cache for file extents whose key in range of
1497  * [min_key, max_key)
1498  */
1499 static int invalidate_extent_cache(struct btrfs_root *root,
1500 				   struct btrfs_key *min_key,
1501 				   struct btrfs_key *max_key)
1502 {
1503 	struct inode *inode = NULL;
1504 	u64 objectid;
1505 	u64 start, end;
1506 
1507 	objectid = min_key->objectid;
1508 	while (1) {
1509 		cond_resched();
1510 		iput(inode);
1511 
1512 		if (objectid > max_key->objectid)
1513 			break;
1514 
1515 		inode = find_next_inode(root, objectid);
1516 		if (!inode)
1517 			break;
1518 
1519 		if (inode->i_ino > max_key->objectid) {
1520 			iput(inode);
1521 			break;
1522 		}
1523 
1524 		objectid = inode->i_ino + 1;
1525 		if (!S_ISREG(inode->i_mode))
1526 			continue;
1527 
1528 		if (unlikely(min_key->objectid == inode->i_ino)) {
1529 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1530 				continue;
1531 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1532 				start = 0;
1533 			else {
1534 				start = min_key->offset;
1535 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1536 			}
1537 		} else {
1538 			start = 0;
1539 		}
1540 
1541 		if (unlikely(max_key->objectid == inode->i_ino)) {
1542 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1543 				continue;
1544 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1545 				end = (u64)-1;
1546 			} else {
1547 				if (max_key->offset == 0)
1548 					continue;
1549 				end = max_key->offset;
1550 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1551 				end--;
1552 			}
1553 		} else {
1554 			end = (u64)-1;
1555 		}
1556 
1557 		/* the lock_extent waits for readpage to complete */
1558 		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1559 		btrfs_drop_extent_cache(inode, start, end, 1);
1560 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1561 	}
1562 	return 0;
1563 }
1564 
1565 static void put_inodes(struct list_head *list)
1566 {
1567 	struct inodevec *ivec;
1568 	while (!list_empty(list)) {
1569 		ivec = list_entry(list->next, struct inodevec, list);
1570 		list_del(&ivec->list);
1571 		while (ivec->nr > 0) {
1572 			ivec->nr--;
1573 			iput(ivec->inode[ivec->nr]);
1574 		}
1575 		kfree(ivec);
1576 	}
1577 }
1578 
1579 static int find_next_key(struct btrfs_path *path, int level,
1580 			 struct btrfs_key *key)
1581 
1582 {
1583 	while (level < BTRFS_MAX_LEVEL) {
1584 		if (!path->nodes[level])
1585 			break;
1586 		if (path->slots[level] + 1 <
1587 		    btrfs_header_nritems(path->nodes[level])) {
1588 			btrfs_node_key_to_cpu(path->nodes[level], key,
1589 					      path->slots[level] + 1);
1590 			return 0;
1591 		}
1592 		level++;
1593 	}
1594 	return 1;
1595 }
1596 
1597 /*
1598  * merge the relocated tree blocks in reloc tree with corresponding
1599  * fs tree.
1600  */
1601 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1602 					       struct btrfs_root *root)
1603 {
1604 	LIST_HEAD(inode_list);
1605 	struct btrfs_key key;
1606 	struct btrfs_key next_key;
1607 	struct btrfs_trans_handle *trans;
1608 	struct btrfs_root *reloc_root;
1609 	struct btrfs_root_item *root_item;
1610 	struct btrfs_path *path;
1611 	struct extent_buffer *leaf = NULL;
1612 	unsigned long nr;
1613 	int level;
1614 	int max_level;
1615 	int replaced = 0;
1616 	int ret;
1617 	int err = 0;
1618 
1619 	path = btrfs_alloc_path();
1620 	if (!path)
1621 		return -ENOMEM;
1622 
1623 	reloc_root = root->reloc_root;
1624 	root_item = &reloc_root->root_item;
1625 
1626 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1627 		level = btrfs_root_level(root_item);
1628 		extent_buffer_get(reloc_root->node);
1629 		path->nodes[level] = reloc_root->node;
1630 		path->slots[level] = 0;
1631 	} else {
1632 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1633 
1634 		level = root_item->drop_level;
1635 		BUG_ON(level == 0);
1636 		path->lowest_level = level;
1637 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1638 		path->lowest_level = 0;
1639 		if (ret < 0) {
1640 			btrfs_free_path(path);
1641 			return ret;
1642 		}
1643 
1644 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1645 				      path->slots[level]);
1646 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1647 
1648 		btrfs_unlock_up_safe(path, 0);
1649 	}
1650 
1651 	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1652 		trans = btrfs_start_transaction(root, 1);
1653 
1654 		leaf = path->nodes[0];
1655 		btrfs_item_key_to_cpu(leaf, &key, 0);
1656 		btrfs_release_path(reloc_root, path);
1657 
1658 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1659 		if (ret < 0) {
1660 			err = ret;
1661 			goto out;
1662 		}
1663 
1664 		leaf = path->nodes[0];
1665 		btrfs_unlock_up_safe(path, 1);
1666 		ret = replace_file_extents(trans, rc, root, leaf,
1667 					   &inode_list);
1668 		if (ret < 0)
1669 			err = ret;
1670 		goto out;
1671 	}
1672 
1673 	memset(&next_key, 0, sizeof(next_key));
1674 
1675 	while (1) {
1676 		leaf = NULL;
1677 		replaced = 0;
1678 		trans = btrfs_start_transaction(root, 1);
1679 		max_level = level;
1680 
1681 		ret = walk_down_reloc_tree(reloc_root, path, &level);
1682 		if (ret < 0) {
1683 			err = ret;
1684 			goto out;
1685 		}
1686 		if (ret > 0)
1687 			break;
1688 
1689 		if (!find_next_key(path, level, &key) &&
1690 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 			ret = 0;
1692 		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 			ret = replace_path(trans, root, reloc_root,
1694 					   path, &next_key, &leaf,
1695 					   level, max_level);
1696 		} else {
1697 			ret = replace_path(trans, root, reloc_root,
1698 					   path, &next_key, NULL,
1699 					   level, max_level);
1700 		}
1701 		if (ret < 0) {
1702 			err = ret;
1703 			goto out;
1704 		}
1705 
1706 		if (ret > 0) {
1707 			level = ret;
1708 			btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 					      path->slots[level]);
1710 			replaced = 1;
1711 		} else if (leaf) {
1712 			/*
1713 			 * no block got replaced, try replacing file extents
1714 			 */
1715 			btrfs_item_key_to_cpu(leaf, &key, 0);
1716 			ret = replace_file_extents(trans, rc, root, leaf,
1717 						   &inode_list);
1718 			btrfs_tree_unlock(leaf);
1719 			free_extent_buffer(leaf);
1720 			BUG_ON(ret < 0);
1721 		}
1722 
1723 		ret = walk_up_reloc_tree(reloc_root, path, &level);
1724 		if (ret > 0)
1725 			break;
1726 
1727 		BUG_ON(level == 0);
1728 		/*
1729 		 * save the merging progress in the drop_progress.
1730 		 * this is OK since root refs == 1 in this case.
1731 		 */
1732 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1733 			       path->slots[level]);
1734 		root_item->drop_level = level;
1735 
1736 		nr = trans->blocks_used;
1737 		btrfs_end_transaction(trans, root);
1738 
1739 		btrfs_btree_balance_dirty(root, nr);
1740 
1741 		/*
1742 		 * put inodes outside transaction, otherwise we may deadlock.
1743 		 */
1744 		put_inodes(&inode_list);
1745 
1746 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 			invalidate_extent_cache(root, &key, &next_key);
1748 	}
1749 
1750 	/*
1751 	 * handle the case only one block in the fs tree need to be
1752 	 * relocated and the block is tree root.
1753 	 */
1754 	leaf = btrfs_lock_root_node(root);
1755 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1756 	btrfs_tree_unlock(leaf);
1757 	free_extent_buffer(leaf);
1758 	if (ret < 0)
1759 		err = ret;
1760 out:
1761 	btrfs_free_path(path);
1762 
1763 	if (err == 0) {
1764 		memset(&root_item->drop_progress, 0,
1765 		       sizeof(root_item->drop_progress));
1766 		root_item->drop_level = 0;
1767 		btrfs_set_root_refs(root_item, 0);
1768 	}
1769 
1770 	nr = trans->blocks_used;
1771 	btrfs_end_transaction(trans, root);
1772 
1773 	btrfs_btree_balance_dirty(root, nr);
1774 
1775 	put_inodes(&inode_list);
1776 
1777 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 		invalidate_extent_cache(root, &key, &next_key);
1779 
1780 	return err;
1781 }
1782 
1783 /*
1784  * callback for the work threads.
1785  * this function merges reloc tree with corresponding fs tree,
1786  * and then drops the reloc tree.
1787  */
1788 static void merge_func(struct btrfs_work *work)
1789 {
1790 	struct btrfs_trans_handle *trans;
1791 	struct btrfs_root *root;
1792 	struct btrfs_root *reloc_root;
1793 	struct async_merge *async;
1794 
1795 	async = container_of(work, struct async_merge, work);
1796 	reloc_root = async->root;
1797 
1798 	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 		root = read_fs_root(reloc_root->fs_info,
1800 				    reloc_root->root_key.offset);
1801 		BUG_ON(IS_ERR(root));
1802 		BUG_ON(root->reloc_root != reloc_root);
1803 
1804 		merge_reloc_root(async->rc, root);
1805 
1806 		trans = btrfs_start_transaction(root, 1);
1807 		btrfs_update_reloc_root(trans, root);
1808 		btrfs_end_transaction(trans, root);
1809 	}
1810 
1811 	btrfs_drop_snapshot(reloc_root, 0);
1812 
1813 	if (atomic_dec_and_test(async->num_pending))
1814 		complete(async->done);
1815 
1816 	kfree(async);
1817 }
1818 
1819 static int merge_reloc_roots(struct reloc_control *rc)
1820 {
1821 	struct async_merge *async;
1822 	struct btrfs_root *root;
1823 	struct completion done;
1824 	atomic_t num_pending;
1825 
1826 	init_completion(&done);
1827 	atomic_set(&num_pending, 1);
1828 
1829 	while (!list_empty(&rc->reloc_roots)) {
1830 		root = list_entry(rc->reloc_roots.next,
1831 				  struct btrfs_root, root_list);
1832 		list_del_init(&root->root_list);
1833 
1834 		async = kmalloc(sizeof(*async), GFP_NOFS);
1835 		BUG_ON(!async);
1836 		async->work.func = merge_func;
1837 		async->work.flags = 0;
1838 		async->rc = rc;
1839 		async->root = root;
1840 		async->done = &done;
1841 		async->num_pending = &num_pending;
1842 		atomic_inc(&num_pending);
1843 		btrfs_queue_worker(&rc->workers, &async->work);
1844 	}
1845 
1846 	if (!atomic_dec_and_test(&num_pending))
1847 		wait_for_completion(&done);
1848 
1849 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 	return 0;
1851 }
1852 
1853 static void free_block_list(struct rb_root *blocks)
1854 {
1855 	struct tree_block *block;
1856 	struct rb_node *rb_node;
1857 	while ((rb_node = rb_first(blocks))) {
1858 		block = rb_entry(rb_node, struct tree_block, rb_node);
1859 		rb_erase(rb_node, blocks);
1860 		kfree(block);
1861 	}
1862 }
1863 
1864 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1865 				      struct btrfs_root *reloc_root)
1866 {
1867 	struct btrfs_root *root;
1868 
1869 	if (reloc_root->last_trans == trans->transid)
1870 		return 0;
1871 
1872 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1873 	BUG_ON(IS_ERR(root));
1874 	BUG_ON(root->reloc_root != reloc_root);
1875 
1876 	return btrfs_record_root_in_trans(trans, root);
1877 }
1878 
1879 /*
1880  * select one tree from trees that references the block.
1881  * for blocks in refernce counted trees, we preper reloc tree.
1882  * if no reloc tree found and reloc_only is true, NULL is returned.
1883  */
1884 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 					    struct backref_node *node,
1886 					    struct backref_edge *edges[],
1887 					    int *nr, int reloc_only)
1888 {
1889 	struct backref_node *next;
1890 	struct btrfs_root *root;
1891 	int index;
1892 	int loop = 0;
1893 again:
1894 	index = 0;
1895 	next = node;
1896 	while (1) {
1897 		cond_resched();
1898 		next = walk_up_backref(next, edges, &index);
1899 		root = next->root;
1900 		if (!root) {
1901 			BUG_ON(!node->old_root);
1902 			goto skip;
1903 		}
1904 
1905 		/* no other choice for non-refernce counted tree */
1906 		if (!root->ref_cows) {
1907 			BUG_ON(reloc_only);
1908 			break;
1909 		}
1910 
1911 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 			record_reloc_root_in_trans(trans, root);
1913 			break;
1914 		}
1915 
1916 		if (loop) {
1917 			btrfs_record_root_in_trans(trans, root);
1918 			break;
1919 		}
1920 
1921 		if (reloc_only || next != node) {
1922 			if (!root->reloc_root)
1923 				btrfs_record_root_in_trans(trans, root);
1924 			root = root->reloc_root;
1925 			/*
1926 			 * if the reloc tree was created in current
1927 			 * transation, there is no node in backref tree
1928 			 * corresponds to the root of the reloc tree.
1929 			 */
1930 			if (btrfs_root_last_snapshot(&root->root_item) ==
1931 			    trans->transid - 1)
1932 				break;
1933 		}
1934 skip:
1935 		root = NULL;
1936 		next = walk_down_backref(edges, &index);
1937 		if (!next || next->level <= node->level)
1938 			break;
1939 	}
1940 
1941 	if (!root && !loop && !reloc_only) {
1942 		loop = 1;
1943 		goto again;
1944 	}
1945 
1946 	if (root)
1947 		*nr = index;
1948 	else
1949 		*nr = 0;
1950 
1951 	return root;
1952 }
1953 
1954 static noinline_for_stack
1955 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 				   struct backref_node *node)
1957 {
1958 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 	int nr;
1960 	return __select_one_root(trans, node, edges, &nr, 0);
1961 }
1962 
1963 static noinline_for_stack
1964 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1965 				     struct backref_node *node,
1966 				     struct backref_edge *edges[], int *nr)
1967 {
1968 	return __select_one_root(trans, node, edges, nr, 1);
1969 }
1970 
1971 static void grab_path_buffers(struct btrfs_path *path,
1972 			      struct backref_node *node,
1973 			      struct backref_edge *edges[], int nr)
1974 {
1975 	int i = 0;
1976 	while (1) {
1977 		drop_node_buffer(node);
1978 		node->eb = path->nodes[node->level];
1979 		BUG_ON(!node->eb);
1980 		if (path->locks[node->level])
1981 			node->locked = 1;
1982 		path->nodes[node->level] = NULL;
1983 		path->locks[node->level] = 0;
1984 
1985 		if (i >= nr)
1986 			break;
1987 
1988 		edges[i]->blockptr = node->eb->start;
1989 		node = edges[i]->node[UPPER];
1990 		i++;
1991 	}
1992 }
1993 
1994 /*
1995  * relocate a block tree, and then update pointers in upper level
1996  * blocks that reference the block to point to the new location.
1997  *
1998  * if called by link_to_upper, the block has already been relocated.
1999  * in that case this function just updates pointers.
2000  */
2001 static int do_relocation(struct btrfs_trans_handle *trans,
2002 			 struct backref_node *node,
2003 			 struct btrfs_key *key,
2004 			 struct btrfs_path *path, int lowest)
2005 {
2006 	struct backref_node *upper;
2007 	struct backref_edge *edge;
2008 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2009 	struct btrfs_root *root;
2010 	struct extent_buffer *eb;
2011 	u32 blocksize;
2012 	u64 bytenr;
2013 	u64 generation;
2014 	int nr;
2015 	int slot;
2016 	int ret;
2017 	int err = 0;
2018 
2019 	BUG_ON(lowest && node->eb);
2020 
2021 	path->lowest_level = node->level + 1;
2022 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 		cond_resched();
2024 		if (node->eb && node->eb->start == edge->blockptr)
2025 			continue;
2026 
2027 		upper = edge->node[UPPER];
2028 		root = select_reloc_root(trans, upper, edges, &nr);
2029 		if (!root)
2030 			continue;
2031 
2032 		if (upper->eb && !upper->locked)
2033 			drop_node_buffer(upper);
2034 
2035 		if (!upper->eb) {
2036 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2037 			if (ret < 0) {
2038 				err = ret;
2039 				break;
2040 			}
2041 			BUG_ON(ret > 0);
2042 
2043 			slot = path->slots[upper->level];
2044 
2045 			btrfs_unlock_up_safe(path, upper->level + 1);
2046 			grab_path_buffers(path, upper, edges, nr);
2047 
2048 			btrfs_release_path(NULL, path);
2049 		} else {
2050 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2051 					       &slot);
2052 			BUG_ON(ret);
2053 		}
2054 
2055 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 		if (!lowest) {
2057 			if (node->eb->start == bytenr) {
2058 				btrfs_tree_unlock(upper->eb);
2059 				upper->locked = 0;
2060 				continue;
2061 			}
2062 		} else {
2063 			BUG_ON(node->bytenr != bytenr);
2064 		}
2065 
2066 		blocksize = btrfs_level_size(root, node->level);
2067 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2068 		eb = read_tree_block(root, bytenr, blocksize, generation);
2069 		btrfs_tree_lock(eb);
2070 		btrfs_set_lock_blocking(eb);
2071 
2072 		if (!node->eb) {
2073 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 					      slot, &eb);
2075 			if (ret < 0) {
2076 				err = ret;
2077 				break;
2078 			}
2079 			btrfs_set_lock_blocking(eb);
2080 			node->eb = eb;
2081 			node->locked = 1;
2082 		} else {
2083 			btrfs_set_node_blockptr(upper->eb, slot,
2084 						node->eb->start);
2085 			btrfs_set_node_ptr_generation(upper->eb, slot,
2086 						      trans->transid);
2087 			btrfs_mark_buffer_dirty(upper->eb);
2088 
2089 			ret = btrfs_inc_extent_ref(trans, root,
2090 						node->eb->start, blocksize,
2091 						upper->eb->start,
2092 						btrfs_header_owner(upper->eb),
2093 						node->level, 0);
2094 			BUG_ON(ret);
2095 
2096 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 			BUG_ON(ret);
2098 		}
2099 		if (!lowest) {
2100 			btrfs_tree_unlock(upper->eb);
2101 			upper->locked = 0;
2102 		}
2103 	}
2104 	path->lowest_level = 0;
2105 	return err;
2106 }
2107 
2108 static int link_to_upper(struct btrfs_trans_handle *trans,
2109 			 struct backref_node *node,
2110 			 struct btrfs_path *path)
2111 {
2112 	struct btrfs_key key;
2113 	if (!node->eb || list_empty(&node->upper))
2114 		return 0;
2115 
2116 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 	return do_relocation(trans, node, &key, path, 0);
2118 }
2119 
2120 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 				struct backref_cache *cache,
2122 				struct btrfs_path *path)
2123 {
2124 	struct backref_node *node;
2125 	int level;
2126 	int ret;
2127 	int err = 0;
2128 
2129 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 		while (!list_empty(&cache->pending[level])) {
2131 			node = list_entry(cache->pending[level].next,
2132 					  struct backref_node, lower);
2133 			BUG_ON(node->level != level);
2134 
2135 			ret = link_to_upper(trans, node, path);
2136 			if (ret < 0)
2137 				err = ret;
2138 			/*
2139 			 * this remove the node from the pending list and
2140 			 * may add some other nodes to the level + 1
2141 			 * pending list
2142 			 */
2143 			remove_backref_node(cache, node);
2144 		}
2145 	}
2146 	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 	return err;
2148 }
2149 
2150 static void mark_block_processed(struct reloc_control *rc,
2151 				 struct backref_node *node)
2152 {
2153 	u32 blocksize;
2154 	if (node->level == 0 ||
2155 	    in_block_group(node->bytenr, rc->block_group)) {
2156 		blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 		set_extent_bits(&rc->processed_blocks, node->bytenr,
2158 				node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 				GFP_NOFS);
2160 	}
2161 	node->processed = 1;
2162 }
2163 
2164 /*
2165  * mark a block and all blocks directly/indirectly reference the block
2166  * as processed.
2167  */
2168 static void update_processed_blocks(struct reloc_control *rc,
2169 				    struct backref_node *node)
2170 {
2171 	struct backref_node *next = node;
2172 	struct backref_edge *edge;
2173 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2174 	int index = 0;
2175 
2176 	while (next) {
2177 		cond_resched();
2178 		while (1) {
2179 			if (next->processed)
2180 				break;
2181 
2182 			mark_block_processed(rc, next);
2183 
2184 			if (list_empty(&next->upper))
2185 				break;
2186 
2187 			edge = list_entry(next->upper.next,
2188 					  struct backref_edge, list[LOWER]);
2189 			edges[index++] = edge;
2190 			next = edge->node[UPPER];
2191 		}
2192 		next = walk_down_backref(edges, &index);
2193 	}
2194 }
2195 
2196 static int tree_block_processed(u64 bytenr, u32 blocksize,
2197 				struct reloc_control *rc)
2198 {
2199 	if (test_range_bit(&rc->processed_blocks, bytenr,
2200 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2201 		return 1;
2202 	return 0;
2203 }
2204 
2205 /*
2206  * check if there are any file extent pointers in the leaf point to
2207  * data require processing
2208  */
2209 static int check_file_extents(struct reloc_control *rc,
2210 			      u64 bytenr, u32 blocksize, u64 ptr_gen)
2211 {
2212 	struct btrfs_key found_key;
2213 	struct btrfs_file_extent_item *fi;
2214 	struct extent_buffer *leaf;
2215 	u32 nritems;
2216 	int i;
2217 	int ret = 0;
2218 
2219 	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220 
2221 	nritems = btrfs_header_nritems(leaf);
2222 	for (i = 0; i < nritems; i++) {
2223 		cond_resched();
2224 		btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 			continue;
2227 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 		if (btrfs_file_extent_type(leaf, fi) ==
2229 		    BTRFS_FILE_EXTENT_INLINE)
2230 			continue;
2231 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 		if (bytenr == 0)
2233 			continue;
2234 		if (in_block_group(bytenr, rc->block_group)) {
2235 			ret = 1;
2236 			break;
2237 		}
2238 	}
2239 	free_extent_buffer(leaf);
2240 	return ret;
2241 }
2242 
2243 /*
2244  * scan child blocks of a given block to find blocks require processing
2245  */
2246 static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 			    struct reloc_control *rc,
2248 			    struct backref_node *node,
2249 			    struct rb_root *blocks)
2250 {
2251 	struct tree_block *block;
2252 	struct rb_node *rb_node;
2253 	u64 bytenr;
2254 	u64 ptr_gen;
2255 	u32 blocksize;
2256 	u32 nritems;
2257 	int i;
2258 	int err = 0;
2259 
2260 	nritems = btrfs_header_nritems(node->eb);
2261 	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 	for (i = 0; i < nritems; i++) {
2263 		cond_resched();
2264 		bytenr = btrfs_node_blockptr(node->eb, i);
2265 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 		if (ptr_gen == trans->transid)
2267 			continue;
2268 		if (!in_block_group(bytenr, rc->block_group) &&
2269 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 			continue;
2271 		if (tree_block_processed(bytenr, blocksize, rc))
2272 			continue;
2273 
2274 		readahead_tree_block(rc->extent_root,
2275 				     bytenr, blocksize, ptr_gen);
2276 	}
2277 
2278 	for (i = 0; i < nritems; i++) {
2279 		cond_resched();
2280 		bytenr = btrfs_node_blockptr(node->eb, i);
2281 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 		if (ptr_gen == trans->transid)
2283 			continue;
2284 		if (!in_block_group(bytenr, rc->block_group) &&
2285 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 			continue;
2287 		if (tree_block_processed(bytenr, blocksize, rc))
2288 			continue;
2289 		if (!in_block_group(bytenr, rc->block_group) &&
2290 		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 			continue;
2292 
2293 		block = kmalloc(sizeof(*block), GFP_NOFS);
2294 		if (!block) {
2295 			err = -ENOMEM;
2296 			break;
2297 		}
2298 		block->bytenr = bytenr;
2299 		btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 		block->level = node->level - 1;
2301 		block->key_ready = 1;
2302 		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 		BUG_ON(rb_node);
2304 	}
2305 	if (err)
2306 		free_block_list(blocks);
2307 	return err;
2308 }
2309 
2310 /*
2311  * find adjacent blocks require processing
2312  */
2313 static noinline_for_stack
2314 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 			struct reloc_control *rc,
2316 			struct backref_cache *cache,
2317 			struct rb_root *blocks, int level,
2318 			struct backref_node **upper)
2319 {
2320 	struct backref_node *node;
2321 	int ret = 0;
2322 
2323 	WARN_ON(!list_empty(&cache->pending[level]));
2324 
2325 	if (list_empty(&cache->pending[level + 1]))
2326 		return 1;
2327 
2328 	node = list_entry(cache->pending[level + 1].next,
2329 			  struct backref_node, lower);
2330 	if (node->eb)
2331 		ret = add_child_blocks(trans, rc, node, blocks);
2332 
2333 	*upper = node;
2334 	return ret;
2335 }
2336 
2337 static int get_tree_block_key(struct reloc_control *rc,
2338 			      struct tree_block *block)
2339 {
2340 	struct extent_buffer *eb;
2341 
2342 	BUG_ON(block->key_ready);
2343 	eb = read_tree_block(rc->extent_root, block->bytenr,
2344 			     block->key.objectid, block->key.offset);
2345 	WARN_ON(btrfs_header_level(eb) != block->level);
2346 	if (block->level == 0)
2347 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2348 	else
2349 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2350 	free_extent_buffer(eb);
2351 	block->key_ready = 1;
2352 	return 0;
2353 }
2354 
2355 static int reada_tree_block(struct reloc_control *rc,
2356 			    struct tree_block *block)
2357 {
2358 	BUG_ON(block->key_ready);
2359 	readahead_tree_block(rc->extent_root, block->bytenr,
2360 			     block->key.objectid, block->key.offset);
2361 	return 0;
2362 }
2363 
2364 /*
2365  * helper function to relocate a tree block
2366  */
2367 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2368 				struct reloc_control *rc,
2369 				struct backref_node *node,
2370 				struct btrfs_key *key,
2371 				struct btrfs_path *path)
2372 {
2373 	struct btrfs_root *root;
2374 	int ret;
2375 
2376 	root = select_one_root(trans, node);
2377 	if (unlikely(!root)) {
2378 		rc->found_old_snapshot = 1;
2379 		update_processed_blocks(rc, node);
2380 		return 0;
2381 	}
2382 
2383 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2384 		ret = do_relocation(trans, node, key, path, 1);
2385 		if (ret < 0)
2386 			goto out;
2387 		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 			ret = replace_file_extents(trans, rc, root,
2389 						   node->eb, NULL);
2390 			if (ret < 0)
2391 				goto out;
2392 		}
2393 		drop_node_buffer(node);
2394 	} else if (!root->ref_cows) {
2395 		path->lowest_level = node->level;
2396 		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 		btrfs_release_path(root, path);
2398 		if (ret < 0)
2399 			goto out;
2400 	} else if (root != node->root) {
2401 		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 	}
2403 
2404 	update_processed_blocks(rc, node);
2405 	ret = 0;
2406 out:
2407 	drop_node_buffer(node);
2408 	return ret;
2409 }
2410 
2411 /*
2412  * relocate a list of blocks
2413  */
2414 static noinline_for_stack
2415 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 			 struct reloc_control *rc, struct rb_root *blocks)
2417 {
2418 	struct backref_cache *cache;
2419 	struct backref_node *node;
2420 	struct btrfs_path *path;
2421 	struct tree_block *block;
2422 	struct rb_node *rb_node;
2423 	int level = -1;
2424 	int ret;
2425 	int err = 0;
2426 
2427 	path = btrfs_alloc_path();
2428 	if (!path)
2429 		return -ENOMEM;
2430 
2431 	cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 	if (!cache) {
2433 		btrfs_free_path(path);
2434 		return -ENOMEM;
2435 	}
2436 
2437 	backref_cache_init(cache);
2438 
2439 	rb_node = rb_first(blocks);
2440 	while (rb_node) {
2441 		block = rb_entry(rb_node, struct tree_block, rb_node);
2442 		if (level == -1)
2443 			level = block->level;
2444 		else
2445 			BUG_ON(level != block->level);
2446 		if (!block->key_ready)
2447 			reada_tree_block(rc, block);
2448 		rb_node = rb_next(rb_node);
2449 	}
2450 
2451 	rb_node = rb_first(blocks);
2452 	while (rb_node) {
2453 		block = rb_entry(rb_node, struct tree_block, rb_node);
2454 		if (!block->key_ready)
2455 			get_tree_block_key(rc, block);
2456 		rb_node = rb_next(rb_node);
2457 	}
2458 
2459 	rb_node = rb_first(blocks);
2460 	while (rb_node) {
2461 		block = rb_entry(rb_node, struct tree_block, rb_node);
2462 
2463 		node = build_backref_tree(rc, cache, &block->key,
2464 					  block->level, block->bytenr);
2465 		if (IS_ERR(node)) {
2466 			err = PTR_ERR(node);
2467 			goto out;
2468 		}
2469 
2470 		ret = relocate_tree_block(trans, rc, node, &block->key,
2471 					  path);
2472 		if (ret < 0) {
2473 			err = ret;
2474 			goto out;
2475 		}
2476 		remove_backref_node(cache, node);
2477 		rb_node = rb_next(rb_node);
2478 	}
2479 
2480 	if (level > 0)
2481 		goto out;
2482 
2483 	free_block_list(blocks);
2484 
2485 	/*
2486 	 * now backrefs of some upper level tree blocks have been cached,
2487 	 * try relocating blocks referenced by these upper level blocks.
2488 	 */
2489 	while (1) {
2490 		struct backref_node *upper = NULL;
2491 		if (trans->transaction->in_commit ||
2492 		    trans->transaction->delayed_refs.flushing)
2493 			break;
2494 
2495 		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2496 					  &upper);
2497 		if (ret < 0)
2498 			err = ret;
2499 		if (ret != 0)
2500 			break;
2501 
2502 		rb_node = rb_first(blocks);
2503 		while (rb_node) {
2504 			block = rb_entry(rb_node, struct tree_block, rb_node);
2505 			if (trans->transaction->in_commit ||
2506 			    trans->transaction->delayed_refs.flushing)
2507 				goto out;
2508 			BUG_ON(!block->key_ready);
2509 			node = build_backref_tree(rc, cache, &block->key,
2510 						  level, block->bytenr);
2511 			if (IS_ERR(node)) {
2512 				err = PTR_ERR(node);
2513 				goto out;
2514 			}
2515 
2516 			ret = relocate_tree_block(trans, rc, node,
2517 						  &block->key, path);
2518 			if (ret < 0) {
2519 				err = ret;
2520 				goto out;
2521 			}
2522 			remove_backref_node(cache, node);
2523 			rb_node = rb_next(rb_node);
2524 		}
2525 		free_block_list(blocks);
2526 
2527 		if (upper) {
2528 			ret = link_to_upper(trans, upper, path);
2529 			if (ret < 0) {
2530 				err = ret;
2531 				break;
2532 			}
2533 			remove_backref_node(cache, upper);
2534 		}
2535 	}
2536 out:
2537 	free_block_list(blocks);
2538 
2539 	ret = finish_pending_nodes(trans, cache, path);
2540 	if (ret < 0)
2541 		err = ret;
2542 
2543 	kfree(cache);
2544 	btrfs_free_path(path);
2545 	return err;
2546 }
2547 
2548 static noinline_for_stack
2549 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2550 			 u64 block_start)
2551 {
2552 	struct btrfs_root *root = BTRFS_I(inode)->root;
2553 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2554 	struct extent_map *em;
2555 	int ret = 0;
2556 
2557 	em = alloc_extent_map(GFP_NOFS);
2558 	if (!em)
2559 		return -ENOMEM;
2560 
2561 	em->start = start;
2562 	em->len = end + 1 - start;
2563 	em->block_len = em->len;
2564 	em->block_start = block_start;
2565 	em->bdev = root->fs_info->fs_devices->latest_bdev;
2566 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
2567 
2568 	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2569 	while (1) {
2570 		write_lock(&em_tree->lock);
2571 		ret = add_extent_mapping(em_tree, em);
2572 		write_unlock(&em_tree->lock);
2573 		if (ret != -EEXIST) {
2574 			free_extent_map(em);
2575 			break;
2576 		}
2577 		btrfs_drop_extent_cache(inode, start, end, 0);
2578 	}
2579 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2580 	return ret;
2581 }
2582 
2583 static int relocate_file_extent_cluster(struct inode *inode,
2584 					struct file_extent_cluster *cluster)
2585 {
2586 	u64 page_start;
2587 	u64 page_end;
2588 	u64 offset = BTRFS_I(inode)->index_cnt;
2589 	unsigned long index;
2590 	unsigned long last_index;
2591 	unsigned int dirty_page = 0;
2592 	struct page *page;
2593 	struct file_ra_state *ra;
2594 	int nr = 0;
2595 	int ret = 0;
2596 
2597 	if (!cluster->nr)
2598 		return 0;
2599 
2600 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
2601 	if (!ra)
2602 		return -ENOMEM;
2603 
2604 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2605 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2606 
2607 	mutex_lock(&inode->i_mutex);
2608 
2609 	i_size_write(inode, cluster->end + 1 - offset);
2610 	ret = setup_extent_mapping(inode, cluster->start - offset,
2611 				   cluster->end - offset, cluster->start);
2612 	if (ret)
2613 		goto out_unlock;
2614 
2615 	file_ra_state_init(ra, inode->i_mapping);
2616 
2617 	WARN_ON(cluster->start != cluster->boundary[0]);
2618 	while (index <= last_index) {
2619 		page = find_lock_page(inode->i_mapping, index);
2620 		if (!page) {
2621 			page_cache_sync_readahead(inode->i_mapping,
2622 						  ra, NULL, index,
2623 						  last_index + 1 - index);
2624 			page = grab_cache_page(inode->i_mapping, index);
2625 			if (!page) {
2626 				ret = -ENOMEM;
2627 				goto out_unlock;
2628 			}
2629 		}
2630 
2631 		if (PageReadahead(page)) {
2632 			page_cache_async_readahead(inode->i_mapping,
2633 						   ra, NULL, page, index,
2634 						   last_index + 1 - index);
2635 		}
2636 
2637 		if (!PageUptodate(page)) {
2638 			btrfs_readpage(NULL, page);
2639 			lock_page(page);
2640 			if (!PageUptodate(page)) {
2641 				unlock_page(page);
2642 				page_cache_release(page);
2643 				ret = -EIO;
2644 				goto out_unlock;
2645 			}
2646 		}
2647 
2648 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2649 		page_end = page_start + PAGE_CACHE_SIZE - 1;
2650 
2651 		lock_extent(&BTRFS_I(inode)->io_tree,
2652 			    page_start, page_end, GFP_NOFS);
2653 
2654 		set_page_extent_mapped(page);
2655 
2656 		if (nr < cluster->nr &&
2657 		    page_start + offset == cluster->boundary[nr]) {
2658 			set_extent_bits(&BTRFS_I(inode)->io_tree,
2659 					page_start, page_end,
2660 					EXTENT_BOUNDARY, GFP_NOFS);
2661 			nr++;
2662 		}
2663 		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 
2665 		set_page_dirty(page);
2666 		dirty_page++;
2667 
2668 		unlock_extent(&BTRFS_I(inode)->io_tree,
2669 			      page_start, page_end, GFP_NOFS);
2670 		unlock_page(page);
2671 		page_cache_release(page);
2672 
2673 		index++;
2674 		if (nr < cluster->nr &&
2675 		    page_end + 1 + offset == cluster->boundary[nr]) {
2676 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 							   dirty_page);
2678 			dirty_page = 0;
2679 		}
2680 	}
2681 	if (dirty_page) {
2682 		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 						   dirty_page);
2684 	}
2685 	WARN_ON(nr != cluster->nr);
2686 out_unlock:
2687 	mutex_unlock(&inode->i_mutex);
2688 	kfree(ra);
2689 	return ret;
2690 }
2691 
2692 static noinline_for_stack
2693 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2694 			 struct file_extent_cluster *cluster)
2695 {
2696 	int ret;
2697 
2698 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2699 		ret = relocate_file_extent_cluster(inode, cluster);
2700 		if (ret)
2701 			return ret;
2702 		cluster->nr = 0;
2703 	}
2704 
2705 	if (!cluster->nr)
2706 		cluster->start = extent_key->objectid;
2707 	else
2708 		BUG_ON(cluster->nr >= MAX_EXTENTS);
2709 	cluster->end = extent_key->objectid + extent_key->offset - 1;
2710 	cluster->boundary[cluster->nr] = extent_key->objectid;
2711 	cluster->nr++;
2712 
2713 	if (cluster->nr >= MAX_EXTENTS) {
2714 		ret = relocate_file_extent_cluster(inode, cluster);
2715 		if (ret)
2716 			return ret;
2717 		cluster->nr = 0;
2718 	}
2719 	return 0;
2720 }
2721 
2722 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2723 static int get_ref_objectid_v0(struct reloc_control *rc,
2724 			       struct btrfs_path *path,
2725 			       struct btrfs_key *extent_key,
2726 			       u64 *ref_objectid, int *path_change)
2727 {
2728 	struct btrfs_key key;
2729 	struct extent_buffer *leaf;
2730 	struct btrfs_extent_ref_v0 *ref0;
2731 	int ret;
2732 	int slot;
2733 
2734 	leaf = path->nodes[0];
2735 	slot = path->slots[0];
2736 	while (1) {
2737 		if (slot >= btrfs_header_nritems(leaf)) {
2738 			ret = btrfs_next_leaf(rc->extent_root, path);
2739 			if (ret < 0)
2740 				return ret;
2741 			BUG_ON(ret > 0);
2742 			leaf = path->nodes[0];
2743 			slot = path->slots[0];
2744 			if (path_change)
2745 				*path_change = 1;
2746 		}
2747 		btrfs_item_key_to_cpu(leaf, &key, slot);
2748 		if (key.objectid != extent_key->objectid)
2749 			return -ENOENT;
2750 
2751 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2752 			slot++;
2753 			continue;
2754 		}
2755 		ref0 = btrfs_item_ptr(leaf, slot,
2756 				struct btrfs_extent_ref_v0);
2757 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2758 		break;
2759 	}
2760 	return 0;
2761 }
2762 #endif
2763 
2764 /*
2765  * helper to add a tree block to the list.
2766  * the major work is getting the generation and level of the block
2767  */
2768 static int add_tree_block(struct reloc_control *rc,
2769 			  struct btrfs_key *extent_key,
2770 			  struct btrfs_path *path,
2771 			  struct rb_root *blocks)
2772 {
2773 	struct extent_buffer *eb;
2774 	struct btrfs_extent_item *ei;
2775 	struct btrfs_tree_block_info *bi;
2776 	struct tree_block *block;
2777 	struct rb_node *rb_node;
2778 	u32 item_size;
2779 	int level = -1;
2780 	int generation;
2781 
2782 	eb =  path->nodes[0];
2783 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
2784 
2785 	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2786 		ei = btrfs_item_ptr(eb, path->slots[0],
2787 				struct btrfs_extent_item);
2788 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2789 		generation = btrfs_extent_generation(eb, ei);
2790 		level = btrfs_tree_block_level(eb, bi);
2791 	} else {
2792 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2793 		u64 ref_owner;
2794 		int ret;
2795 
2796 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2797 		ret = get_ref_objectid_v0(rc, path, extent_key,
2798 					  &ref_owner, NULL);
2799 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2800 		level = (int)ref_owner;
2801 		/* FIXME: get real generation */
2802 		generation = 0;
2803 #else
2804 		BUG();
2805 #endif
2806 	}
2807 
2808 	btrfs_release_path(rc->extent_root, path);
2809 
2810 	BUG_ON(level == -1);
2811 
2812 	block = kmalloc(sizeof(*block), GFP_NOFS);
2813 	if (!block)
2814 		return -ENOMEM;
2815 
2816 	block->bytenr = extent_key->objectid;
2817 	block->key.objectid = extent_key->offset;
2818 	block->key.offset = generation;
2819 	block->level = level;
2820 	block->key_ready = 0;
2821 
2822 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2823 	BUG_ON(rb_node);
2824 
2825 	return 0;
2826 }
2827 
2828 /*
2829  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2830  */
2831 static int __add_tree_block(struct reloc_control *rc,
2832 			    u64 bytenr, u32 blocksize,
2833 			    struct rb_root *blocks)
2834 {
2835 	struct btrfs_path *path;
2836 	struct btrfs_key key;
2837 	int ret;
2838 
2839 	if (tree_block_processed(bytenr, blocksize, rc))
2840 		return 0;
2841 
2842 	if (tree_search(blocks, bytenr))
2843 		return 0;
2844 
2845 	path = btrfs_alloc_path();
2846 	if (!path)
2847 		return -ENOMEM;
2848 
2849 	key.objectid = bytenr;
2850 	key.type = BTRFS_EXTENT_ITEM_KEY;
2851 	key.offset = blocksize;
2852 
2853 	path->search_commit_root = 1;
2854 	path->skip_locking = 1;
2855 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2856 	if (ret < 0)
2857 		goto out;
2858 	BUG_ON(ret);
2859 
2860 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2861 	ret = add_tree_block(rc, &key, path, blocks);
2862 out:
2863 	btrfs_free_path(path);
2864 	return ret;
2865 }
2866 
2867 /*
2868  * helper to check if the block use full backrefs for pointers in it
2869  */
2870 static int block_use_full_backref(struct reloc_control *rc,
2871 				  struct extent_buffer *eb)
2872 {
2873 	struct btrfs_path *path;
2874 	struct btrfs_extent_item *ei;
2875 	struct btrfs_key key;
2876 	u64 flags;
2877 	int ret;
2878 
2879 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2880 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 		return 1;
2882 
2883 	path = btrfs_alloc_path();
2884 	BUG_ON(!path);
2885 
2886 	key.objectid = eb->start;
2887 	key.type = BTRFS_EXTENT_ITEM_KEY;
2888 	key.offset = eb->len;
2889 
2890 	path->search_commit_root = 1;
2891 	path->skip_locking = 1;
2892 	ret = btrfs_search_slot(NULL, rc->extent_root,
2893 				&key, path, 0, 0);
2894 	BUG_ON(ret);
2895 
2896 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 			    struct btrfs_extent_item);
2898 	flags = btrfs_extent_flags(path->nodes[0], ei);
2899 	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 		ret = 1;
2902 	else
2903 		ret = 0;
2904 	btrfs_free_path(path);
2905 	return ret;
2906 }
2907 
2908 /*
2909  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2910  * this function scans fs tree to find blocks reference the data extent
2911  */
2912 static int find_data_references(struct reloc_control *rc,
2913 				struct btrfs_key *extent_key,
2914 				struct extent_buffer *leaf,
2915 				struct btrfs_extent_data_ref *ref,
2916 				struct rb_root *blocks)
2917 {
2918 	struct btrfs_path *path;
2919 	struct tree_block *block;
2920 	struct btrfs_root *root;
2921 	struct btrfs_file_extent_item *fi;
2922 	struct rb_node *rb_node;
2923 	struct btrfs_key key;
2924 	u64 ref_root;
2925 	u64 ref_objectid;
2926 	u64 ref_offset;
2927 	u32 ref_count;
2928 	u32 nritems;
2929 	int err = 0;
2930 	int added = 0;
2931 	int counted;
2932 	int ret;
2933 
2934 	path = btrfs_alloc_path();
2935 	if (!path)
2936 		return -ENOMEM;
2937 
2938 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
2939 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2940 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2941 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
2942 
2943 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
2944 	if (IS_ERR(root)) {
2945 		err = PTR_ERR(root);
2946 		goto out;
2947 	}
2948 
2949 	key.objectid = ref_objectid;
2950 	key.offset = ref_offset;
2951 	key.type = BTRFS_EXTENT_DATA_KEY;
2952 
2953 	path->search_commit_root = 1;
2954 	path->skip_locking = 1;
2955 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2956 	if (ret < 0) {
2957 		err = ret;
2958 		goto out;
2959 	}
2960 
2961 	leaf = path->nodes[0];
2962 	nritems = btrfs_header_nritems(leaf);
2963 	/*
2964 	 * the references in tree blocks that use full backrefs
2965 	 * are not counted in
2966 	 */
2967 	if (block_use_full_backref(rc, leaf))
2968 		counted = 0;
2969 	else
2970 		counted = 1;
2971 	rb_node = tree_search(blocks, leaf->start);
2972 	if (rb_node) {
2973 		if (counted)
2974 			added = 1;
2975 		else
2976 			path->slots[0] = nritems;
2977 	}
2978 
2979 	while (ref_count > 0) {
2980 		while (path->slots[0] >= nritems) {
2981 			ret = btrfs_next_leaf(root, path);
2982 			if (ret < 0) {
2983 				err = ret;
2984 				goto out;
2985 			}
2986 			if (ret > 0) {
2987 				WARN_ON(1);
2988 				goto out;
2989 			}
2990 
2991 			leaf = path->nodes[0];
2992 			nritems = btrfs_header_nritems(leaf);
2993 			added = 0;
2994 
2995 			if (block_use_full_backref(rc, leaf))
2996 				counted = 0;
2997 			else
2998 				counted = 1;
2999 			rb_node = tree_search(blocks, leaf->start);
3000 			if (rb_node) {
3001 				if (counted)
3002 					added = 1;
3003 				else
3004 					path->slots[0] = nritems;
3005 			}
3006 		}
3007 
3008 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3009 		if (key.objectid != ref_objectid ||
3010 		    key.type != BTRFS_EXTENT_DATA_KEY) {
3011 			WARN_ON(1);
3012 			break;
3013 		}
3014 
3015 		fi = btrfs_item_ptr(leaf, path->slots[0],
3016 				    struct btrfs_file_extent_item);
3017 
3018 		if (btrfs_file_extent_type(leaf, fi) ==
3019 		    BTRFS_FILE_EXTENT_INLINE)
3020 			goto next;
3021 
3022 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3023 		    extent_key->objectid)
3024 			goto next;
3025 
3026 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3027 		if (key.offset != ref_offset)
3028 			goto next;
3029 
3030 		if (counted)
3031 			ref_count--;
3032 		if (added)
3033 			goto next;
3034 
3035 		if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3036 			block = kmalloc(sizeof(*block), GFP_NOFS);
3037 			if (!block) {
3038 				err = -ENOMEM;
3039 				break;
3040 			}
3041 			block->bytenr = leaf->start;
3042 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3043 			block->level = 0;
3044 			block->key_ready = 1;
3045 			rb_node = tree_insert(blocks, block->bytenr,
3046 					      &block->rb_node);
3047 			BUG_ON(rb_node);
3048 		}
3049 		if (counted)
3050 			added = 1;
3051 		else
3052 			path->slots[0] = nritems;
3053 next:
3054 		path->slots[0]++;
3055 
3056 	}
3057 out:
3058 	btrfs_free_path(path);
3059 	return err;
3060 }
3061 
3062 /*
3063  * hepler to find all tree blocks that reference a given data extent
3064  */
3065 static noinline_for_stack
3066 int add_data_references(struct reloc_control *rc,
3067 			struct btrfs_key *extent_key,
3068 			struct btrfs_path *path,
3069 			struct rb_root *blocks)
3070 {
3071 	struct btrfs_key key;
3072 	struct extent_buffer *eb;
3073 	struct btrfs_extent_data_ref *dref;
3074 	struct btrfs_extent_inline_ref *iref;
3075 	unsigned long ptr;
3076 	unsigned long end;
3077 	u32 blocksize;
3078 	int ret;
3079 	int err = 0;
3080 
3081 	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 			       extent_key->offset);
3083 	BUG_ON(ret < 0);
3084 	if (ret > 0) {
3085 		/* the relocated data is fragmented */
3086 		rc->extents_skipped++;
3087 		btrfs_release_path(rc->extent_root, path);
3088 		return 0;
3089 	}
3090 
3091 	blocksize = btrfs_level_size(rc->extent_root, 0);
3092 
3093 	eb = path->nodes[0];
3094 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3096 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3097 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3098 		ptr = end;
3099 	else
3100 #endif
3101 		ptr += sizeof(struct btrfs_extent_item);
3102 
3103 	while (ptr < end) {
3104 		iref = (struct btrfs_extent_inline_ref *)ptr;
3105 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3106 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3107 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3108 			ret = __add_tree_block(rc, key.offset, blocksize,
3109 					       blocks);
3110 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3111 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3112 			ret = find_data_references(rc, extent_key,
3113 						   eb, dref, blocks);
3114 		} else {
3115 			BUG();
3116 		}
3117 		ptr += btrfs_extent_inline_ref_size(key.type);
3118 	}
3119 	WARN_ON(ptr > end);
3120 
3121 	while (1) {
3122 		cond_resched();
3123 		eb = path->nodes[0];
3124 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3125 			ret = btrfs_next_leaf(rc->extent_root, path);
3126 			if (ret < 0) {
3127 				err = ret;
3128 				break;
3129 			}
3130 			if (ret > 0)
3131 				break;
3132 			eb = path->nodes[0];
3133 		}
3134 
3135 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3136 		if (key.objectid != extent_key->objectid)
3137 			break;
3138 
3139 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3140 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3141 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3142 #else
3143 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3144 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3145 #endif
3146 			ret = __add_tree_block(rc, key.offset, blocksize,
3147 					       blocks);
3148 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3149 			dref = btrfs_item_ptr(eb, path->slots[0],
3150 					      struct btrfs_extent_data_ref);
3151 			ret = find_data_references(rc, extent_key,
3152 						   eb, dref, blocks);
3153 		} else {
3154 			ret = 0;
3155 		}
3156 		if (ret) {
3157 			err = ret;
3158 			break;
3159 		}
3160 		path->slots[0]++;
3161 	}
3162 	btrfs_release_path(rc->extent_root, path);
3163 	if (err)
3164 		free_block_list(blocks);
3165 	return err;
3166 }
3167 
3168 /*
3169  * hepler to find next unprocessed extent
3170  */
3171 static noinline_for_stack
3172 int find_next_extent(struct btrfs_trans_handle *trans,
3173 		     struct reloc_control *rc, struct btrfs_path *path)
3174 {
3175 	struct btrfs_key key;
3176 	struct extent_buffer *leaf;
3177 	u64 start, end, last;
3178 	int ret;
3179 
3180 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3181 	while (1) {
3182 		cond_resched();
3183 		if (rc->search_start >= last) {
3184 			ret = 1;
3185 			break;
3186 		}
3187 
3188 		key.objectid = rc->search_start;
3189 		key.type = BTRFS_EXTENT_ITEM_KEY;
3190 		key.offset = 0;
3191 
3192 		path->search_commit_root = 1;
3193 		path->skip_locking = 1;
3194 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3195 					0, 0);
3196 		if (ret < 0)
3197 			break;
3198 next:
3199 		leaf = path->nodes[0];
3200 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3201 			ret = btrfs_next_leaf(rc->extent_root, path);
3202 			if (ret != 0)
3203 				break;
3204 			leaf = path->nodes[0];
3205 		}
3206 
3207 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3208 		if (key.objectid >= last) {
3209 			ret = 1;
3210 			break;
3211 		}
3212 
3213 		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3214 		    key.objectid + key.offset <= rc->search_start) {
3215 			path->slots[0]++;
3216 			goto next;
3217 		}
3218 
3219 		ret = find_first_extent_bit(&rc->processed_blocks,
3220 					    key.objectid, &start, &end,
3221 					    EXTENT_DIRTY);
3222 
3223 		if (ret == 0 && start <= key.objectid) {
3224 			btrfs_release_path(rc->extent_root, path);
3225 			rc->search_start = end + 1;
3226 		} else {
3227 			rc->search_start = key.objectid + key.offset;
3228 			return 0;
3229 		}
3230 	}
3231 	btrfs_release_path(rc->extent_root, path);
3232 	return ret;
3233 }
3234 
3235 static void set_reloc_control(struct reloc_control *rc)
3236 {
3237 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3238 	mutex_lock(&fs_info->trans_mutex);
3239 	fs_info->reloc_ctl = rc;
3240 	mutex_unlock(&fs_info->trans_mutex);
3241 }
3242 
3243 static void unset_reloc_control(struct reloc_control *rc)
3244 {
3245 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3246 	mutex_lock(&fs_info->trans_mutex);
3247 	fs_info->reloc_ctl = NULL;
3248 	mutex_unlock(&fs_info->trans_mutex);
3249 }
3250 
3251 static int check_extent_flags(u64 flags)
3252 {
3253 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3254 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3255 		return 1;
3256 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3257 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3258 		return 1;
3259 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3260 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3261 		return 1;
3262 	return 0;
3263 }
3264 
3265 
3266 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267 {
3268 	struct rb_root blocks = RB_ROOT;
3269 	struct btrfs_key key;
3270 	struct file_extent_cluster *cluster;
3271 	struct btrfs_trans_handle *trans = NULL;
3272 	struct btrfs_path *path;
3273 	struct btrfs_extent_item *ei;
3274 	unsigned long nr;
3275 	u64 flags;
3276 	u32 item_size;
3277 	int ret;
3278 	int err = 0;
3279 
3280 	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 	if (!cluster)
3282 		return -ENOMEM;
3283 
3284 	path = btrfs_alloc_path();
3285 	if (!path) {
3286 		kfree(cluster);
3287 		return -ENOMEM;
3288 	}
3289 
3290 	rc->extents_found = 0;
3291 	rc->extents_skipped = 0;
3292 
3293 	rc->search_start = rc->block_group->key.objectid;
3294 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 			  GFP_NOFS);
3296 
3297 	rc->create_reloc_root = 1;
3298 	set_reloc_control(rc);
3299 
3300 	trans = btrfs_start_transaction(rc->extent_root, 1);
3301 	btrfs_commit_transaction(trans, rc->extent_root);
3302 
3303 	while (1) {
3304 		trans = btrfs_start_transaction(rc->extent_root, 1);
3305 
3306 		ret = find_next_extent(trans, rc, path);
3307 		if (ret < 0)
3308 			err = ret;
3309 		if (ret != 0)
3310 			break;
3311 
3312 		rc->extents_found++;
3313 
3314 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 				    struct btrfs_extent_item);
3316 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3317 		item_size = btrfs_item_size_nr(path->nodes[0],
3318 					       path->slots[0]);
3319 		if (item_size >= sizeof(*ei)) {
3320 			flags = btrfs_extent_flags(path->nodes[0], ei);
3321 			ret = check_extent_flags(flags);
3322 			BUG_ON(ret);
3323 
3324 		} else {
3325 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3326 			u64 ref_owner;
3327 			int path_change = 0;
3328 
3329 			BUG_ON(item_size !=
3330 			       sizeof(struct btrfs_extent_item_v0));
3331 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3332 						  &path_change);
3333 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3334 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3335 			else
3336 				flags = BTRFS_EXTENT_FLAG_DATA;
3337 
3338 			if (path_change) {
3339 				btrfs_release_path(rc->extent_root, path);
3340 
3341 				path->search_commit_root = 1;
3342 				path->skip_locking = 1;
3343 				ret = btrfs_search_slot(NULL, rc->extent_root,
3344 							&key, path, 0, 0);
3345 				if (ret < 0) {
3346 					err = ret;
3347 					break;
3348 				}
3349 				BUG_ON(ret > 0);
3350 			}
3351 #else
3352 			BUG();
3353 #endif
3354 		}
3355 
3356 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 			ret = add_tree_block(rc, &key, path, &blocks);
3358 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3359 			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 			ret = add_data_references(rc, &key, path, &blocks);
3361 		} else {
3362 			btrfs_release_path(rc->extent_root, path);
3363 			ret = 0;
3364 		}
3365 		if (ret < 0) {
3366 			err = 0;
3367 			break;
3368 		}
3369 
3370 		if (!RB_EMPTY_ROOT(&blocks)) {
3371 			ret = relocate_tree_blocks(trans, rc, &blocks);
3372 			if (ret < 0) {
3373 				err = ret;
3374 				break;
3375 			}
3376 		}
3377 
3378 		nr = trans->blocks_used;
3379 		btrfs_end_transaction(trans, rc->extent_root);
3380 		trans = NULL;
3381 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 
3383 		if (rc->stage == MOVE_DATA_EXTENTS &&
3384 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 			rc->found_file_extent = 1;
3386 			ret = relocate_data_extent(rc->data_inode,
3387 						   &key, cluster);
3388 			if (ret < 0) {
3389 				err = ret;
3390 				break;
3391 			}
3392 		}
3393 	}
3394 	btrfs_free_path(path);
3395 
3396 	if (trans) {
3397 		nr = trans->blocks_used;
3398 		btrfs_end_transaction(trans, rc->extent_root);
3399 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 	}
3401 
3402 	if (!err) {
3403 		ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3404 		if (ret < 0)
3405 			err = ret;
3406 	}
3407 
3408 	kfree(cluster);
3409 
3410 	rc->create_reloc_root = 0;
3411 	smp_mb();
3412 
3413 	if (rc->extents_found > 0) {
3414 		trans = btrfs_start_transaction(rc->extent_root, 1);
3415 		btrfs_commit_transaction(trans, rc->extent_root);
3416 	}
3417 
3418 	merge_reloc_roots(rc);
3419 
3420 	unset_reloc_control(rc);
3421 
3422 	/* get rid of pinned extents */
3423 	trans = btrfs_start_transaction(rc->extent_root, 1);
3424 	btrfs_commit_transaction(trans, rc->extent_root);
3425 
3426 	return err;
3427 }
3428 
3429 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3430 				 struct btrfs_root *root, u64 objectid)
3431 {
3432 	struct btrfs_path *path;
3433 	struct btrfs_inode_item *item;
3434 	struct extent_buffer *leaf;
3435 	int ret;
3436 
3437 	path = btrfs_alloc_path();
3438 	if (!path)
3439 		return -ENOMEM;
3440 
3441 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3442 	if (ret)
3443 		goto out;
3444 
3445 	leaf = path->nodes[0];
3446 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3447 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3448 	btrfs_set_inode_generation(leaf, item, 1);
3449 	btrfs_set_inode_size(leaf, item, 0);
3450 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3452 	btrfs_mark_buffer_dirty(leaf);
3453 	btrfs_release_path(root, path);
3454 out:
3455 	btrfs_free_path(path);
3456 	return ret;
3457 }
3458 
3459 /*
3460  * helper to create inode for data relocation.
3461  * the inode is in data relocation tree and its link count is 0
3462  */
3463 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3464 					struct btrfs_block_group_cache *group)
3465 {
3466 	struct inode *inode = NULL;
3467 	struct btrfs_trans_handle *trans;
3468 	struct btrfs_root *root;
3469 	struct btrfs_key key;
3470 	unsigned long nr;
3471 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3472 	int err = 0;
3473 
3474 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3475 	if (IS_ERR(root))
3476 		return ERR_CAST(root);
3477 
3478 	trans = btrfs_start_transaction(root, 1);
3479 	BUG_ON(!trans);
3480 
3481 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 	if (err)
3483 		goto out;
3484 
3485 	err = __insert_orphan_inode(trans, root, objectid);
3486 	BUG_ON(err);
3487 
3488 	key.objectid = objectid;
3489 	key.type = BTRFS_INODE_ITEM_KEY;
3490 	key.offset = 0;
3491 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3492 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3493 	BTRFS_I(inode)->index_cnt = group->key.objectid;
3494 
3495 	err = btrfs_orphan_add(trans, inode);
3496 out:
3497 	nr = trans->blocks_used;
3498 	btrfs_end_transaction(trans, root);
3499 
3500 	btrfs_btree_balance_dirty(root, nr);
3501 	if (err) {
3502 		if (inode)
3503 			iput(inode);
3504 		inode = ERR_PTR(err);
3505 	}
3506 	return inode;
3507 }
3508 
3509 /*
3510  * function to relocate all extents in a block group.
3511  */
3512 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 {
3514 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 	struct reloc_control *rc;
3516 	int ret;
3517 	int err = 0;
3518 
3519 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3520 	if (!rc)
3521 		return -ENOMEM;
3522 
3523 	mapping_tree_init(&rc->reloc_root_tree);
3524 	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 	INIT_LIST_HEAD(&rc->reloc_roots);
3526 
3527 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 	BUG_ON(!rc->block_group);
3529 
3530 	btrfs_init_workers(&rc->workers, "relocate",
3531 			   fs_info->thread_pool_size, NULL);
3532 
3533 	rc->extent_root = extent_root;
3534 	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3535 
3536 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 	if (IS_ERR(rc->data_inode)) {
3538 		err = PTR_ERR(rc->data_inode);
3539 		rc->data_inode = NULL;
3540 		goto out;
3541 	}
3542 
3543 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3544 	       (unsigned long long)rc->block_group->key.objectid,
3545 	       (unsigned long long)rc->block_group->flags);
3546 
3547 	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
3548 	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 
3550 	while (1) {
3551 		rc->extents_found = 0;
3552 		rc->extents_skipped = 0;
3553 
3554 		mutex_lock(&fs_info->cleaner_mutex);
3555 
3556 		btrfs_clean_old_snapshots(fs_info->tree_root);
3557 		ret = relocate_block_group(rc);
3558 
3559 		mutex_unlock(&fs_info->cleaner_mutex);
3560 		if (ret < 0) {
3561 			err = ret;
3562 			break;
3563 		}
3564 
3565 		if (rc->extents_found == 0)
3566 			break;
3567 
3568 		printk(KERN_INFO "btrfs: found %llu extents\n",
3569 			(unsigned long long)rc->extents_found);
3570 
3571 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3572 			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3573 			invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 						 0, -1);
3575 			rc->stage = UPDATE_DATA_PTRS;
3576 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3577 			   rc->extents_skipped >= rc->extents_found) {
3578 			iput(rc->data_inode);
3579 			rc->data_inode = create_reloc_inode(fs_info,
3580 							    rc->block_group);
3581 			if (IS_ERR(rc->data_inode)) {
3582 				err = PTR_ERR(rc->data_inode);
3583 				rc->data_inode = NULL;
3584 				break;
3585 			}
3586 			rc->stage = MOVE_DATA_EXTENTS;
3587 			rc->found_file_extent = 0;
3588 		}
3589 	}
3590 
3591 	filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3592 				     rc->block_group->key.objectid,
3593 				     rc->block_group->key.objectid +
3594 				     rc->block_group->key.offset - 1);
3595 
3596 	WARN_ON(rc->block_group->pinned > 0);
3597 	WARN_ON(rc->block_group->reserved > 0);
3598 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599 out:
3600 	iput(rc->data_inode);
3601 	btrfs_stop_workers(&rc->workers);
3602 	btrfs_put_block_group(rc->block_group);
3603 	kfree(rc);
3604 	return err;
3605 }
3606 
3607 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 {
3609 	struct btrfs_trans_handle *trans;
3610 	int ret;
3611 
3612 	trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3613 
3614 	memset(&root->root_item.drop_progress, 0,
3615 		sizeof(root->root_item.drop_progress));
3616 	root->root_item.drop_level = 0;
3617 	btrfs_set_root_refs(&root->root_item, 0);
3618 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
3619 				&root->root_key, &root->root_item);
3620 	BUG_ON(ret);
3621 
3622 	ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3623 	BUG_ON(ret);
3624 	return 0;
3625 }
3626 
3627 /*
3628  * recover relocation interrupted by system crash.
3629  *
3630  * this function resumes merging reloc trees with corresponding fs trees.
3631  * this is important for keeping the sharing of tree blocks
3632  */
3633 int btrfs_recover_relocation(struct btrfs_root *root)
3634 {
3635 	LIST_HEAD(reloc_roots);
3636 	struct btrfs_key key;
3637 	struct btrfs_root *fs_root;
3638 	struct btrfs_root *reloc_root;
3639 	struct btrfs_path *path;
3640 	struct extent_buffer *leaf;
3641 	struct reloc_control *rc = NULL;
3642 	struct btrfs_trans_handle *trans;
3643 	int ret;
3644 	int err = 0;
3645 
3646 	path = btrfs_alloc_path();
3647 	if (!path)
3648 		return -ENOMEM;
3649 
3650 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3651 	key.type = BTRFS_ROOT_ITEM_KEY;
3652 	key.offset = (u64)-1;
3653 
3654 	while (1) {
3655 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3656 					path, 0, 0);
3657 		if (ret < 0) {
3658 			err = ret;
3659 			goto out;
3660 		}
3661 		if (ret > 0) {
3662 			if (path->slots[0] == 0)
3663 				break;
3664 			path->slots[0]--;
3665 		}
3666 		leaf = path->nodes[0];
3667 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3668 		btrfs_release_path(root->fs_info->tree_root, path);
3669 
3670 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3671 		    key.type != BTRFS_ROOT_ITEM_KEY)
3672 			break;
3673 
3674 		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3675 		if (IS_ERR(reloc_root)) {
3676 			err = PTR_ERR(reloc_root);
3677 			goto out;
3678 		}
3679 
3680 		list_add(&reloc_root->root_list, &reloc_roots);
3681 
3682 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3683 			fs_root = read_fs_root(root->fs_info,
3684 					       reloc_root->root_key.offset);
3685 			if (IS_ERR(fs_root)) {
3686 				ret = PTR_ERR(fs_root);
3687 				if (ret != -ENOENT) {
3688 					err = ret;
3689 					goto out;
3690 				}
3691 				mark_garbage_root(reloc_root);
3692 			}
3693 		}
3694 
3695 		if (key.offset == 0)
3696 			break;
3697 
3698 		key.offset--;
3699 	}
3700 	btrfs_release_path(root->fs_info->tree_root, path);
3701 
3702 	if (list_empty(&reloc_roots))
3703 		goto out;
3704 
3705 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3706 	if (!rc) {
3707 		err = -ENOMEM;
3708 		goto out;
3709 	}
3710 
3711 	mapping_tree_init(&rc->reloc_root_tree);
3712 	INIT_LIST_HEAD(&rc->reloc_roots);
3713 	btrfs_init_workers(&rc->workers, "relocate",
3714 			   root->fs_info->thread_pool_size, NULL);
3715 	rc->extent_root = root->fs_info->extent_root;
3716 
3717 	set_reloc_control(rc);
3718 
3719 	while (!list_empty(&reloc_roots)) {
3720 		reloc_root = list_entry(reloc_roots.next,
3721 					struct btrfs_root, root_list);
3722 		list_del(&reloc_root->root_list);
3723 
3724 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3725 			list_add_tail(&reloc_root->root_list,
3726 				      &rc->reloc_roots);
3727 			continue;
3728 		}
3729 
3730 		fs_root = read_fs_root(root->fs_info,
3731 				       reloc_root->root_key.offset);
3732 		BUG_ON(IS_ERR(fs_root));
3733 
3734 		__add_reloc_root(reloc_root);
3735 		fs_root->reloc_root = reloc_root;
3736 	}
3737 
3738 	trans = btrfs_start_transaction(rc->extent_root, 1);
3739 	btrfs_commit_transaction(trans, rc->extent_root);
3740 
3741 	merge_reloc_roots(rc);
3742 
3743 	unset_reloc_control(rc);
3744 
3745 	trans = btrfs_start_transaction(rc->extent_root, 1);
3746 	btrfs_commit_transaction(trans, rc->extent_root);
3747 out:
3748 	if (rc) {
3749 		btrfs_stop_workers(&rc->workers);
3750 		kfree(rc);
3751 	}
3752 	while (!list_empty(&reloc_roots)) {
3753 		reloc_root = list_entry(reloc_roots.next,
3754 					struct btrfs_root, root_list);
3755 		list_del(&reloc_root->root_list);
3756 		free_extent_buffer(reloc_root->node);
3757 		free_extent_buffer(reloc_root->commit_root);
3758 		kfree(reloc_root);
3759 	}
3760 	btrfs_free_path(path);
3761 
3762 	if (err == 0) {
3763 		/* cleanup orphan inode in data relocation tree */
3764 		fs_root = read_fs_root(root->fs_info,
3765 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
3766 		if (IS_ERR(fs_root))
3767 			err = PTR_ERR(fs_root);
3768 		else
3769 			btrfs_orphan_cleanup(fs_root);
3770 	}
3771 	return err;
3772 }
3773 
3774 /*
3775  * helper to add ordered checksum for data relocation.
3776  *
3777  * cloning checksum properly handles the nodatasum extents.
3778  * it also saves CPU time to re-calculate the checksum.
3779  */
3780 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3781 {
3782 	struct btrfs_ordered_sum *sums;
3783 	struct btrfs_sector_sum *sector_sum;
3784 	struct btrfs_ordered_extent *ordered;
3785 	struct btrfs_root *root = BTRFS_I(inode)->root;
3786 	size_t offset;
3787 	int ret;
3788 	u64 disk_bytenr;
3789 	LIST_HEAD(list);
3790 
3791 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3792 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3793 
3794 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3795 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3796 				       disk_bytenr + len - 1, &list);
3797 
3798 	while (!list_empty(&list)) {
3799 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3800 		list_del_init(&sums->list);
3801 
3802 		sector_sum = sums->sums;
3803 		sums->bytenr = ordered->start;
3804 
3805 		offset = 0;
3806 		while (offset < sums->len) {
3807 			sector_sum->bytenr += ordered->start - disk_bytenr;
3808 			sector_sum++;
3809 			offset += root->sectorsize;
3810 		}
3811 
3812 		btrfs_add_ordered_sum(inode, ordered, sums);
3813 	}
3814 	btrfs_put_ordered_extent(ordered);
3815 	return 0;
3816 }
3817