xref: /openbmc/linux/fs/btrfs/relocation.c (revision e8e0929d)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "transaction.h"
27 #include "volumes.h"
28 #include "locking.h"
29 #include "btrfs_inode.h"
30 #include "async-thread.h"
31 
32 /*
33  * backref_node, mapping_node and tree_block start with this
34  */
35 struct tree_entry {
36 	struct rb_node rb_node;
37 	u64 bytenr;
38 };
39 
40 /*
41  * present a tree block in the backref cache
42  */
43 struct backref_node {
44 	struct rb_node rb_node;
45 	u64 bytenr;
46 	/* objectid tree block owner */
47 	u64 owner;
48 	/* list of upper level blocks reference this block */
49 	struct list_head upper;
50 	/* list of child blocks in the cache */
51 	struct list_head lower;
52 	/* NULL if this node is not tree root */
53 	struct btrfs_root *root;
54 	/* extent buffer got by COW the block */
55 	struct extent_buffer *eb;
56 	/* level of tree block */
57 	unsigned int level:8;
58 	/* 1 if the block is root of old snapshot */
59 	unsigned int old_root:1;
60 	/* 1 if no child blocks in the cache */
61 	unsigned int lowest:1;
62 	/* is the extent buffer locked */
63 	unsigned int locked:1;
64 	/* has the block been processed */
65 	unsigned int processed:1;
66 	/* have backrefs of this block been checked */
67 	unsigned int checked:1;
68 };
69 
70 /*
71  * present a block pointer in the backref cache
72  */
73 struct backref_edge {
74 	struct list_head list[2];
75 	struct backref_node *node[2];
76 	u64 blockptr;
77 };
78 
79 #define LOWER	0
80 #define UPPER	1
81 
82 struct backref_cache {
83 	/* red black tree of all backref nodes in the cache */
84 	struct rb_root rb_root;
85 	/* list of backref nodes with no child block in the cache */
86 	struct list_head pending[BTRFS_MAX_LEVEL];
87 	spinlock_t lock;
88 };
89 
90 /*
91  * map address of tree root to tree
92  */
93 struct mapping_node {
94 	struct rb_node rb_node;
95 	u64 bytenr;
96 	void *data;
97 };
98 
99 struct mapping_tree {
100 	struct rb_root rb_root;
101 	spinlock_t lock;
102 };
103 
104 /*
105  * present a tree block to process
106  */
107 struct tree_block {
108 	struct rb_node rb_node;
109 	u64 bytenr;
110 	struct btrfs_key key;
111 	unsigned int level:8;
112 	unsigned int key_ready:1;
113 };
114 
115 /* inode vector */
116 #define INODEVEC_SIZE 16
117 
118 struct inodevec {
119 	struct list_head list;
120 	struct inode *inode[INODEVEC_SIZE];
121 	int nr;
122 };
123 
124 #define MAX_EXTENTS 128
125 
126 struct file_extent_cluster {
127 	u64 start;
128 	u64 end;
129 	u64 boundary[MAX_EXTENTS];
130 	unsigned int nr;
131 };
132 
133 struct reloc_control {
134 	/* block group to relocate */
135 	struct btrfs_block_group_cache *block_group;
136 	/* extent tree */
137 	struct btrfs_root *extent_root;
138 	/* inode for moving data */
139 	struct inode *data_inode;
140 	struct btrfs_workers workers;
141 	/* tree blocks have been processed */
142 	struct extent_io_tree processed_blocks;
143 	/* map start of tree root to corresponding reloc tree */
144 	struct mapping_tree reloc_root_tree;
145 	/* list of reloc trees */
146 	struct list_head reloc_roots;
147 	u64 search_start;
148 	u64 extents_found;
149 	u64 extents_skipped;
150 	int stage;
151 	int create_reloc_root;
152 	unsigned int found_file_extent:1;
153 	unsigned int found_old_snapshot:1;
154 };
155 
156 /* stages of data relocation */
157 #define MOVE_DATA_EXTENTS	0
158 #define UPDATE_DATA_PTRS	1
159 
160 /*
161  * merge reloc tree to corresponding fs tree in worker threads
162  */
163 struct async_merge {
164 	struct btrfs_work work;
165 	struct reloc_control *rc;
166 	struct btrfs_root *root;
167 	struct completion *done;
168 	atomic_t *num_pending;
169 };
170 
171 static void mapping_tree_init(struct mapping_tree *tree)
172 {
173 	tree->rb_root.rb_node = NULL;
174 	spin_lock_init(&tree->lock);
175 }
176 
177 static void backref_cache_init(struct backref_cache *cache)
178 {
179 	int i;
180 	cache->rb_root.rb_node = NULL;
181 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 		INIT_LIST_HEAD(&cache->pending[i]);
183 	spin_lock_init(&cache->lock);
184 }
185 
186 static void backref_node_init(struct backref_node *node)
187 {
188 	memset(node, 0, sizeof(*node));
189 	INIT_LIST_HEAD(&node->upper);
190 	INIT_LIST_HEAD(&node->lower);
191 	RB_CLEAR_NODE(&node->rb_node);
192 }
193 
194 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
195 				   struct rb_node *node)
196 {
197 	struct rb_node **p = &root->rb_node;
198 	struct rb_node *parent = NULL;
199 	struct tree_entry *entry;
200 
201 	while (*p) {
202 		parent = *p;
203 		entry = rb_entry(parent, struct tree_entry, rb_node);
204 
205 		if (bytenr < entry->bytenr)
206 			p = &(*p)->rb_left;
207 		else if (bytenr > entry->bytenr)
208 			p = &(*p)->rb_right;
209 		else
210 			return parent;
211 	}
212 
213 	rb_link_node(node, parent, p);
214 	rb_insert_color(node, root);
215 	return NULL;
216 }
217 
218 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
219 {
220 	struct rb_node *n = root->rb_node;
221 	struct tree_entry *entry;
222 
223 	while (n) {
224 		entry = rb_entry(n, struct tree_entry, rb_node);
225 
226 		if (bytenr < entry->bytenr)
227 			n = n->rb_left;
228 		else if (bytenr > entry->bytenr)
229 			n = n->rb_right;
230 		else
231 			return n;
232 	}
233 	return NULL;
234 }
235 
236 /*
237  * walk up backref nodes until reach node presents tree root
238  */
239 static struct backref_node *walk_up_backref(struct backref_node *node,
240 					    struct backref_edge *edges[],
241 					    int *index)
242 {
243 	struct backref_edge *edge;
244 	int idx = *index;
245 
246 	while (!list_empty(&node->upper)) {
247 		edge = list_entry(node->upper.next,
248 				  struct backref_edge, list[LOWER]);
249 		edges[idx++] = edge;
250 		node = edge->node[UPPER];
251 	}
252 	*index = idx;
253 	return node;
254 }
255 
256 /*
257  * walk down backref nodes to find start of next reference path
258  */
259 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
260 					      int *index)
261 {
262 	struct backref_edge *edge;
263 	struct backref_node *lower;
264 	int idx = *index;
265 
266 	while (idx > 0) {
267 		edge = edges[idx - 1];
268 		lower = edge->node[LOWER];
269 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
270 			idx--;
271 			continue;
272 		}
273 		edge = list_entry(edge->list[LOWER].next,
274 				  struct backref_edge, list[LOWER]);
275 		edges[idx - 1] = edge;
276 		*index = idx;
277 		return edge->node[UPPER];
278 	}
279 	*index = 0;
280 	return NULL;
281 }
282 
283 static void drop_node_buffer(struct backref_node *node)
284 {
285 	if (node->eb) {
286 		if (node->locked) {
287 			btrfs_tree_unlock(node->eb);
288 			node->locked = 0;
289 		}
290 		free_extent_buffer(node->eb);
291 		node->eb = NULL;
292 	}
293 }
294 
295 static void drop_backref_node(struct backref_cache *tree,
296 			      struct backref_node *node)
297 {
298 	BUG_ON(!node->lowest);
299 	BUG_ON(!list_empty(&node->upper));
300 
301 	drop_node_buffer(node);
302 	list_del(&node->lower);
303 
304 	rb_erase(&node->rb_node, &tree->rb_root);
305 	kfree(node);
306 }
307 
308 /*
309  * remove a backref node from the backref cache
310  */
311 static void remove_backref_node(struct backref_cache *cache,
312 				struct backref_node *node)
313 {
314 	struct backref_node *upper;
315 	struct backref_edge *edge;
316 
317 	if (!node)
318 		return;
319 
320 	BUG_ON(!node->lowest);
321 	while (!list_empty(&node->upper)) {
322 		edge = list_entry(node->upper.next, struct backref_edge,
323 				  list[LOWER]);
324 		upper = edge->node[UPPER];
325 		list_del(&edge->list[LOWER]);
326 		list_del(&edge->list[UPPER]);
327 		kfree(edge);
328 		/*
329 		 * add the node to pending list if no other
330 		 * child block cached.
331 		 */
332 		if (list_empty(&upper->lower)) {
333 			list_add_tail(&upper->lower,
334 				      &cache->pending[upper->level]);
335 			upper->lowest = 1;
336 		}
337 	}
338 	drop_backref_node(cache, node);
339 }
340 
341 /*
342  * find reloc tree by address of tree root
343  */
344 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
345 					  u64 bytenr)
346 {
347 	struct rb_node *rb_node;
348 	struct mapping_node *node;
349 	struct btrfs_root *root = NULL;
350 
351 	spin_lock(&rc->reloc_root_tree.lock);
352 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
353 	if (rb_node) {
354 		node = rb_entry(rb_node, struct mapping_node, rb_node);
355 		root = (struct btrfs_root *)node->data;
356 	}
357 	spin_unlock(&rc->reloc_root_tree.lock);
358 	return root;
359 }
360 
361 static int is_cowonly_root(u64 root_objectid)
362 {
363 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
364 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
365 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
366 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
367 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
368 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
369 		return 1;
370 	return 0;
371 }
372 
373 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
374 					u64 root_objectid)
375 {
376 	struct btrfs_key key;
377 
378 	key.objectid = root_objectid;
379 	key.type = BTRFS_ROOT_ITEM_KEY;
380 	if (is_cowonly_root(root_objectid))
381 		key.offset = 0;
382 	else
383 		key.offset = (u64)-1;
384 
385 	return btrfs_read_fs_root_no_name(fs_info, &key);
386 }
387 
388 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
389 static noinline_for_stack
390 struct btrfs_root *find_tree_root(struct reloc_control *rc,
391 				  struct extent_buffer *leaf,
392 				  struct btrfs_extent_ref_v0 *ref0)
393 {
394 	struct btrfs_root *root;
395 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
396 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
397 
398 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
399 
400 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
401 	BUG_ON(IS_ERR(root));
402 
403 	if (root->ref_cows &&
404 	    generation != btrfs_root_generation(&root->root_item))
405 		return NULL;
406 
407 	return root;
408 }
409 #endif
410 
411 static noinline_for_stack
412 int find_inline_backref(struct extent_buffer *leaf, int slot,
413 			unsigned long *ptr, unsigned long *end)
414 {
415 	struct btrfs_extent_item *ei;
416 	struct btrfs_tree_block_info *bi;
417 	u32 item_size;
418 
419 	item_size = btrfs_item_size_nr(leaf, slot);
420 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
421 	if (item_size < sizeof(*ei)) {
422 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
423 		return 1;
424 	}
425 #endif
426 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
427 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
428 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
429 
430 	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
431 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
432 		return 1;
433 	}
434 
435 	bi = (struct btrfs_tree_block_info *)(ei + 1);
436 	*ptr = (unsigned long)(bi + 1);
437 	*end = (unsigned long)ei + item_size;
438 	return 0;
439 }
440 
441 /*
442  * build backref tree for a given tree block. root of the backref tree
443  * corresponds the tree block, leaves of the backref tree correspond
444  * roots of b-trees that reference the tree block.
445  *
446  * the basic idea of this function is check backrefs of a given block
447  * to find upper level blocks that refernece the block, and then check
448  * bakcrefs of these upper level blocks recursively. the recursion stop
449  * when tree root is reached or backrefs for the block is cached.
450  *
451  * NOTE: if we find backrefs for a block are cached, we know backrefs
452  * for all upper level blocks that directly/indirectly reference the
453  * block are also cached.
454  */
455 static struct backref_node *build_backref_tree(struct reloc_control *rc,
456 					       struct backref_cache *cache,
457 					       struct btrfs_key *node_key,
458 					       int level, u64 bytenr)
459 {
460 	struct btrfs_path *path1;
461 	struct btrfs_path *path2;
462 	struct extent_buffer *eb;
463 	struct btrfs_root *root;
464 	struct backref_node *cur;
465 	struct backref_node *upper;
466 	struct backref_node *lower;
467 	struct backref_node *node = NULL;
468 	struct backref_node *exist = NULL;
469 	struct backref_edge *edge;
470 	struct rb_node *rb_node;
471 	struct btrfs_key key;
472 	unsigned long end;
473 	unsigned long ptr;
474 	LIST_HEAD(list);
475 	int ret;
476 	int err = 0;
477 
478 	path1 = btrfs_alloc_path();
479 	path2 = btrfs_alloc_path();
480 	if (!path1 || !path2) {
481 		err = -ENOMEM;
482 		goto out;
483 	}
484 
485 	node = kmalloc(sizeof(*node), GFP_NOFS);
486 	if (!node) {
487 		err = -ENOMEM;
488 		goto out;
489 	}
490 
491 	backref_node_init(node);
492 	node->bytenr = bytenr;
493 	node->owner = 0;
494 	node->level = level;
495 	node->lowest = 1;
496 	cur = node;
497 again:
498 	end = 0;
499 	ptr = 0;
500 	key.objectid = cur->bytenr;
501 	key.type = BTRFS_EXTENT_ITEM_KEY;
502 	key.offset = (u64)-1;
503 
504 	path1->search_commit_root = 1;
505 	path1->skip_locking = 1;
506 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
507 				0, 0);
508 	if (ret < 0) {
509 		err = ret;
510 		goto out;
511 	}
512 	BUG_ON(!ret || !path1->slots[0]);
513 
514 	path1->slots[0]--;
515 
516 	WARN_ON(cur->checked);
517 	if (!list_empty(&cur->upper)) {
518 		/*
519 		 * the backref was added previously when processsing
520 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
521 		 */
522 		BUG_ON(!list_is_singular(&cur->upper));
523 		edge = list_entry(cur->upper.next, struct backref_edge,
524 				  list[LOWER]);
525 		BUG_ON(!list_empty(&edge->list[UPPER]));
526 		exist = edge->node[UPPER];
527 		/*
528 		 * add the upper level block to pending list if we need
529 		 * check its backrefs
530 		 */
531 		if (!exist->checked)
532 			list_add_tail(&edge->list[UPPER], &list);
533 	} else {
534 		exist = NULL;
535 	}
536 
537 	while (1) {
538 		cond_resched();
539 		eb = path1->nodes[0];
540 
541 		if (ptr >= end) {
542 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
543 				ret = btrfs_next_leaf(rc->extent_root, path1);
544 				if (ret < 0) {
545 					err = ret;
546 					goto out;
547 				}
548 				if (ret > 0)
549 					break;
550 				eb = path1->nodes[0];
551 			}
552 
553 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
554 			if (key.objectid != cur->bytenr) {
555 				WARN_ON(exist);
556 				break;
557 			}
558 
559 			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
560 				ret = find_inline_backref(eb, path1->slots[0],
561 							  &ptr, &end);
562 				if (ret)
563 					goto next;
564 			}
565 		}
566 
567 		if (ptr < end) {
568 			/* update key for inline back ref */
569 			struct btrfs_extent_inline_ref *iref;
570 			iref = (struct btrfs_extent_inline_ref *)ptr;
571 			key.type = btrfs_extent_inline_ref_type(eb, iref);
572 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
573 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
574 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
575 		}
576 
577 		if (exist &&
578 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
579 		      exist->owner == key.offset) ||
580 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
581 		      exist->bytenr == key.offset))) {
582 			exist = NULL;
583 			goto next;
584 		}
585 
586 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 			if (key.objectid == key.offset &&
590 			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 				struct btrfs_extent_ref_v0 *ref0;
592 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 						struct btrfs_extent_ref_v0);
594 				root = find_tree_root(rc, eb, ref0);
595 				if (root)
596 					cur->root = root;
597 				else
598 					cur->old_root = 1;
599 				break;
600 			}
601 #else
602 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
603 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
604 #endif
605 			if (key.objectid == key.offset) {
606 				/*
607 				 * only root blocks of reloc trees use
608 				 * backref of this type.
609 				 */
610 				root = find_reloc_root(rc, cur->bytenr);
611 				BUG_ON(!root);
612 				cur->root = root;
613 				break;
614 			}
615 
616 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
617 			if (!edge) {
618 				err = -ENOMEM;
619 				goto out;
620 			}
621 			rb_node = tree_search(&cache->rb_root, key.offset);
622 			if (!rb_node) {
623 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
624 				if (!upper) {
625 					kfree(edge);
626 					err = -ENOMEM;
627 					goto out;
628 				}
629 				backref_node_init(upper);
630 				upper->bytenr = key.offset;
631 				upper->owner = 0;
632 				upper->level = cur->level + 1;
633 				/*
634 				 *  backrefs for the upper level block isn't
635 				 *  cached, add the block to pending list
636 				 */
637 				list_add_tail(&edge->list[UPPER], &list);
638 			} else {
639 				upper = rb_entry(rb_node, struct backref_node,
640 						 rb_node);
641 				INIT_LIST_HEAD(&edge->list[UPPER]);
642 			}
643 			list_add(&edge->list[LOWER], &cur->upper);
644 			edge->node[UPPER] = upper;
645 			edge->node[LOWER] = cur;
646 
647 			goto next;
648 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
649 			goto next;
650 		}
651 
652 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
653 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
654 		if (IS_ERR(root)) {
655 			err = PTR_ERR(root);
656 			goto out;
657 		}
658 
659 		if (btrfs_root_level(&root->root_item) == cur->level) {
660 			/* tree root */
661 			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 			       cur->bytenr);
663 			cur->root = root;
664 			break;
665 		}
666 
667 		level = cur->level + 1;
668 
669 		/*
670 		 * searching the tree to find upper level blocks
671 		 * reference the block.
672 		 */
673 		path2->search_commit_root = 1;
674 		path2->skip_locking = 1;
675 		path2->lowest_level = level;
676 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
677 		path2->lowest_level = 0;
678 		if (ret < 0) {
679 			err = ret;
680 			goto out;
681 		}
682 		if (ret > 0 && path2->slots[level] > 0)
683 			path2->slots[level]--;
684 
685 		eb = path2->nodes[level];
686 		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
687 			cur->bytenr);
688 
689 		lower = cur;
690 		for (; level < BTRFS_MAX_LEVEL; level++) {
691 			if (!path2->nodes[level]) {
692 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 				       lower->bytenr);
694 				lower->root = root;
695 				break;
696 			}
697 
698 			edge = kzalloc(sizeof(*edge), GFP_NOFS);
699 			if (!edge) {
700 				err = -ENOMEM;
701 				goto out;
702 			}
703 
704 			eb = path2->nodes[level];
705 			rb_node = tree_search(&cache->rb_root, eb->start);
706 			if (!rb_node) {
707 				upper = kmalloc(sizeof(*upper), GFP_NOFS);
708 				if (!upper) {
709 					kfree(edge);
710 					err = -ENOMEM;
711 					goto out;
712 				}
713 				backref_node_init(upper);
714 				upper->bytenr = eb->start;
715 				upper->owner = btrfs_header_owner(eb);
716 				upper->level = lower->level + 1;
717 
718 				/*
719 				 * if we know the block isn't shared
720 				 * we can void checking its backrefs.
721 				 */
722 				if (btrfs_block_can_be_shared(root, eb))
723 					upper->checked = 0;
724 				else
725 					upper->checked = 1;
726 
727 				/*
728 				 * add the block to pending list if we
729 				 * need check its backrefs. only block
730 				 * at 'cur->level + 1' is added to the
731 				 * tail of pending list. this guarantees
732 				 * we check backrefs from lower level
733 				 * blocks to upper level blocks.
734 				 */
735 				if (!upper->checked &&
736 				    level == cur->level + 1) {
737 					list_add_tail(&edge->list[UPPER],
738 						      &list);
739 				} else
740 					INIT_LIST_HEAD(&edge->list[UPPER]);
741 			} else {
742 				upper = rb_entry(rb_node, struct backref_node,
743 						 rb_node);
744 				BUG_ON(!upper->checked);
745 				INIT_LIST_HEAD(&edge->list[UPPER]);
746 			}
747 			list_add_tail(&edge->list[LOWER], &lower->upper);
748 			edge->node[UPPER] = upper;
749 			edge->node[LOWER] = lower;
750 
751 			if (rb_node)
752 				break;
753 			lower = upper;
754 			upper = NULL;
755 		}
756 		btrfs_release_path(root, path2);
757 next:
758 		if (ptr < end) {
759 			ptr += btrfs_extent_inline_ref_size(key.type);
760 			if (ptr >= end) {
761 				WARN_ON(ptr > end);
762 				ptr = 0;
763 				end = 0;
764 			}
765 		}
766 		if (ptr >= end)
767 			path1->slots[0]++;
768 	}
769 	btrfs_release_path(rc->extent_root, path1);
770 
771 	cur->checked = 1;
772 	WARN_ON(exist);
773 
774 	/* the pending list isn't empty, take the first block to process */
775 	if (!list_empty(&list)) {
776 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
777 		list_del_init(&edge->list[UPPER]);
778 		cur = edge->node[UPPER];
779 		goto again;
780 	}
781 
782 	/*
783 	 * everything goes well, connect backref nodes and insert backref nodes
784 	 * into the cache.
785 	 */
786 	BUG_ON(!node->checked);
787 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
788 	BUG_ON(rb_node);
789 
790 	list_for_each_entry(edge, &node->upper, list[LOWER])
791 		list_add_tail(&edge->list[UPPER], &list);
792 
793 	while (!list_empty(&list)) {
794 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 		list_del_init(&edge->list[UPPER]);
796 		upper = edge->node[UPPER];
797 
798 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 			if (upper->lowest) {
800 				list_del_init(&upper->lower);
801 				upper->lowest = 0;
802 			}
803 
804 			list_add_tail(&edge->list[UPPER], &upper->lower);
805 			continue;
806 		}
807 
808 		BUG_ON(!upper->checked);
809 		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
810 				      &upper->rb_node);
811 		BUG_ON(rb_node);
812 
813 		list_add_tail(&edge->list[UPPER], &upper->lower);
814 
815 		list_for_each_entry(edge, &upper->upper, list[LOWER])
816 			list_add_tail(&edge->list[UPPER], &list);
817 	}
818 out:
819 	btrfs_free_path(path1);
820 	btrfs_free_path(path2);
821 	if (err) {
822 		INIT_LIST_HEAD(&list);
823 		upper = node;
824 		while (upper) {
825 			if (RB_EMPTY_NODE(&upper->rb_node)) {
826 				list_splice_tail(&upper->upper, &list);
827 				kfree(upper);
828 			}
829 
830 			if (list_empty(&list))
831 				break;
832 
833 			edge = list_entry(list.next, struct backref_edge,
834 					  list[LOWER]);
835 			upper = edge->node[UPPER];
836 			kfree(edge);
837 		}
838 		return ERR_PTR(err);
839 	}
840 	return node;
841 }
842 
843 /*
844  * helper to add 'address of tree root -> reloc tree' mapping
845  */
846 static int __add_reloc_root(struct btrfs_root *root)
847 {
848 	struct rb_node *rb_node;
849 	struct mapping_node *node;
850 	struct reloc_control *rc = root->fs_info->reloc_ctl;
851 
852 	node = kmalloc(sizeof(*node), GFP_NOFS);
853 	BUG_ON(!node);
854 
855 	node->bytenr = root->node->start;
856 	node->data = root;
857 
858 	spin_lock(&rc->reloc_root_tree.lock);
859 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
860 			      node->bytenr, &node->rb_node);
861 	spin_unlock(&rc->reloc_root_tree.lock);
862 	BUG_ON(rb_node);
863 
864 	list_add_tail(&root->root_list, &rc->reloc_roots);
865 	return 0;
866 }
867 
868 /*
869  * helper to update/delete the 'address of tree root -> reloc tree'
870  * mapping
871  */
872 static int __update_reloc_root(struct btrfs_root *root, int del)
873 {
874 	struct rb_node *rb_node;
875 	struct mapping_node *node = NULL;
876 	struct reloc_control *rc = root->fs_info->reloc_ctl;
877 
878 	spin_lock(&rc->reloc_root_tree.lock);
879 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
880 			      root->commit_root->start);
881 	if (rb_node) {
882 		node = rb_entry(rb_node, struct mapping_node, rb_node);
883 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
884 	}
885 	spin_unlock(&rc->reloc_root_tree.lock);
886 
887 	BUG_ON((struct btrfs_root *)node->data != root);
888 
889 	if (!del) {
890 		spin_lock(&rc->reloc_root_tree.lock);
891 		node->bytenr = root->node->start;
892 		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
893 				      node->bytenr, &node->rb_node);
894 		spin_unlock(&rc->reloc_root_tree.lock);
895 		BUG_ON(rb_node);
896 	} else {
897 		list_del_init(&root->root_list);
898 		kfree(node);
899 	}
900 	return 0;
901 }
902 
903 /*
904  * create reloc tree for a given fs tree. reloc tree is just a
905  * snapshot of the fs tree with special root objectid.
906  */
907 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 			  struct btrfs_root *root)
909 {
910 	struct btrfs_root *reloc_root;
911 	struct extent_buffer *eb;
912 	struct btrfs_root_item *root_item;
913 	struct btrfs_key root_key;
914 	int ret;
915 
916 	if (root->reloc_root) {
917 		reloc_root = root->reloc_root;
918 		reloc_root->last_trans = trans->transid;
919 		return 0;
920 	}
921 
922 	if (!root->fs_info->reloc_ctl ||
923 	    !root->fs_info->reloc_ctl->create_reloc_root ||
924 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 		return 0;
926 
927 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 	BUG_ON(!root_item);
929 
930 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 	root_key.type = BTRFS_ROOT_ITEM_KEY;
932 	root_key.offset = root->root_key.objectid;
933 
934 	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
935 			      BTRFS_TREE_RELOC_OBJECTID);
936 	BUG_ON(ret);
937 
938 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 	memcpy(root_item, &root->root_item, sizeof(*root_item));
940 	btrfs_set_root_refs(root_item, 1);
941 	btrfs_set_root_bytenr(root_item, eb->start);
942 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 	btrfs_set_root_generation(root_item, trans->transid);
944 	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
945 	root_item->drop_level = 0;
946 
947 	btrfs_tree_unlock(eb);
948 	free_extent_buffer(eb);
949 
950 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
951 				&root_key, root_item);
952 	BUG_ON(ret);
953 	kfree(root_item);
954 
955 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
956 						 &root_key);
957 	BUG_ON(IS_ERR(reloc_root));
958 	reloc_root->last_trans = trans->transid;
959 
960 	__add_reloc_root(reloc_root);
961 	root->reloc_root = reloc_root;
962 	return 0;
963 }
964 
965 /*
966  * update root item of reloc tree
967  */
968 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
969 			    struct btrfs_root *root)
970 {
971 	struct btrfs_root *reloc_root;
972 	struct btrfs_root_item *root_item;
973 	int del = 0;
974 	int ret;
975 
976 	if (!root->reloc_root)
977 		return 0;
978 
979 	reloc_root = root->reloc_root;
980 	root_item = &reloc_root->root_item;
981 
982 	if (btrfs_root_refs(root_item) == 0) {
983 		root->reloc_root = NULL;
984 		del = 1;
985 	}
986 
987 	__update_reloc_root(reloc_root, del);
988 
989 	if (reloc_root->commit_root != reloc_root->node) {
990 		btrfs_set_root_node(root_item, reloc_root->node);
991 		free_extent_buffer(reloc_root->commit_root);
992 		reloc_root->commit_root = btrfs_root_node(reloc_root);
993 	}
994 
995 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
996 				&reloc_root->root_key, root_item);
997 	BUG_ON(ret);
998 	return 0;
999 }
1000 
1001 /*
1002  * helper to find first cached inode with inode number >= objectid
1003  * in a subvolume
1004  */
1005 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1006 {
1007 	struct rb_node *node;
1008 	struct rb_node *prev;
1009 	struct btrfs_inode *entry;
1010 	struct inode *inode;
1011 
1012 	spin_lock(&root->inode_lock);
1013 again:
1014 	node = root->inode_tree.rb_node;
1015 	prev = NULL;
1016 	while (node) {
1017 		prev = node;
1018 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1019 
1020 		if (objectid < entry->vfs_inode.i_ino)
1021 			node = node->rb_left;
1022 		else if (objectid > entry->vfs_inode.i_ino)
1023 			node = node->rb_right;
1024 		else
1025 			break;
1026 	}
1027 	if (!node) {
1028 		while (prev) {
1029 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1030 			if (objectid <= entry->vfs_inode.i_ino) {
1031 				node = prev;
1032 				break;
1033 			}
1034 			prev = rb_next(prev);
1035 		}
1036 	}
1037 	while (node) {
1038 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1039 		inode = igrab(&entry->vfs_inode);
1040 		if (inode) {
1041 			spin_unlock(&root->inode_lock);
1042 			return inode;
1043 		}
1044 
1045 		objectid = entry->vfs_inode.i_ino + 1;
1046 		if (cond_resched_lock(&root->inode_lock))
1047 			goto again;
1048 
1049 		node = rb_next(node);
1050 	}
1051 	spin_unlock(&root->inode_lock);
1052 	return NULL;
1053 }
1054 
1055 static int in_block_group(u64 bytenr,
1056 			  struct btrfs_block_group_cache *block_group)
1057 {
1058 	if (bytenr >= block_group->key.objectid &&
1059 	    bytenr < block_group->key.objectid + block_group->key.offset)
1060 		return 1;
1061 	return 0;
1062 }
1063 
1064 /*
1065  * get new location of data
1066  */
1067 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1068 			    u64 bytenr, u64 num_bytes)
1069 {
1070 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1071 	struct btrfs_path *path;
1072 	struct btrfs_file_extent_item *fi;
1073 	struct extent_buffer *leaf;
1074 	int ret;
1075 
1076 	path = btrfs_alloc_path();
1077 	if (!path)
1078 		return -ENOMEM;
1079 
1080 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1081 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1082 				       bytenr, 0);
1083 	if (ret < 0)
1084 		goto out;
1085 	if (ret > 0) {
1086 		ret = -ENOENT;
1087 		goto out;
1088 	}
1089 
1090 	leaf = path->nodes[0];
1091 	fi = btrfs_item_ptr(leaf, path->slots[0],
1092 			    struct btrfs_file_extent_item);
1093 
1094 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1095 	       btrfs_file_extent_compression(leaf, fi) ||
1096 	       btrfs_file_extent_encryption(leaf, fi) ||
1097 	       btrfs_file_extent_other_encoding(leaf, fi));
1098 
1099 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1100 		ret = 1;
1101 		goto out;
1102 	}
1103 
1104 	if (new_bytenr)
1105 		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 	ret = 0;
1107 out:
1108 	btrfs_free_path(path);
1109 	return ret;
1110 }
1111 
1112 /*
1113  * update file extent items in the tree leaf to point to
1114  * the new locations.
1115  */
1116 static int replace_file_extents(struct btrfs_trans_handle *trans,
1117 				struct reloc_control *rc,
1118 				struct btrfs_root *root,
1119 				struct extent_buffer *leaf,
1120 				struct list_head *inode_list)
1121 {
1122 	struct btrfs_key key;
1123 	struct btrfs_file_extent_item *fi;
1124 	struct inode *inode = NULL;
1125 	struct inodevec *ivec = NULL;
1126 	u64 parent;
1127 	u64 bytenr;
1128 	u64 new_bytenr;
1129 	u64 num_bytes;
1130 	u64 end;
1131 	u32 nritems;
1132 	u32 i;
1133 	int ret;
1134 	int first = 1;
1135 	int dirty = 0;
1136 
1137 	if (rc->stage != UPDATE_DATA_PTRS)
1138 		return 0;
1139 
1140 	/* reloc trees always use full backref */
1141 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1142 		parent = leaf->start;
1143 	else
1144 		parent = 0;
1145 
1146 	nritems = btrfs_header_nritems(leaf);
1147 	for (i = 0; i < nritems; i++) {
1148 		cond_resched();
1149 		btrfs_item_key_to_cpu(leaf, &key, i);
1150 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1151 			continue;
1152 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1153 		if (btrfs_file_extent_type(leaf, fi) ==
1154 		    BTRFS_FILE_EXTENT_INLINE)
1155 			continue;
1156 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1157 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1158 		if (bytenr == 0)
1159 			continue;
1160 		if (!in_block_group(bytenr, rc->block_group))
1161 			continue;
1162 
1163 		/*
1164 		 * if we are modifying block in fs tree, wait for readpage
1165 		 * to complete and drop the extent cache
1166 		 */
1167 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 			if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 				BUG_ON(!ivec);
1171 				ivec->nr = 0;
1172 				list_add_tail(&ivec->list, inode_list);
1173 			}
1174 			if (first) {
1175 				inode = find_next_inode(root, key.objectid);
1176 				if (inode)
1177 					ivec->inode[ivec->nr++] = inode;
1178 				first = 0;
1179 			} else if (inode && inode->i_ino < key.objectid) {
1180 				inode = find_next_inode(root, key.objectid);
1181 				if (inode)
1182 					ivec->inode[ivec->nr++] = inode;
1183 			}
1184 			if (inode && inode->i_ino == key.objectid) {
1185 				end = key.offset +
1186 				      btrfs_file_extent_num_bytes(leaf, fi);
1187 				WARN_ON(!IS_ALIGNED(key.offset,
1188 						    root->sectorsize));
1189 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1190 				end--;
1191 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1192 						      key.offset, end,
1193 						      GFP_NOFS);
1194 				if (!ret)
1195 					continue;
1196 
1197 				btrfs_drop_extent_cache(inode, key.offset, end,
1198 							1);
1199 				unlock_extent(&BTRFS_I(inode)->io_tree,
1200 					      key.offset, end, GFP_NOFS);
1201 			}
1202 		}
1203 
1204 		ret = get_new_location(rc->data_inode, &new_bytenr,
1205 				       bytenr, num_bytes);
1206 		if (ret > 0)
1207 			continue;
1208 		BUG_ON(ret < 0);
1209 
1210 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1211 		dirty = 1;
1212 
1213 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1214 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1215 					   num_bytes, parent,
1216 					   btrfs_header_owner(leaf),
1217 					   key.objectid, key.offset);
1218 		BUG_ON(ret);
1219 
1220 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1221 					parent, btrfs_header_owner(leaf),
1222 					key.objectid, key.offset);
1223 		BUG_ON(ret);
1224 	}
1225 	if (dirty)
1226 		btrfs_mark_buffer_dirty(leaf);
1227 	return 0;
1228 }
1229 
1230 static noinline_for_stack
1231 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1232 		     struct btrfs_path *path, int level)
1233 {
1234 	struct btrfs_disk_key key1;
1235 	struct btrfs_disk_key key2;
1236 	btrfs_node_key(eb, &key1, slot);
1237 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1238 	return memcmp(&key1, &key2, sizeof(key1));
1239 }
1240 
1241 /*
1242  * try to replace tree blocks in fs tree with the new blocks
1243  * in reloc tree. tree blocks haven't been modified since the
1244  * reloc tree was create can be replaced.
1245  *
1246  * if a block was replaced, level of the block + 1 is returned.
1247  * if no block got replaced, 0 is returned. if there are other
1248  * errors, a negative error number is returned.
1249  */
1250 static int replace_path(struct btrfs_trans_handle *trans,
1251 			struct btrfs_root *dest, struct btrfs_root *src,
1252 			struct btrfs_path *path, struct btrfs_key *next_key,
1253 			struct extent_buffer **leaf,
1254 			int lowest_level, int max_level)
1255 {
1256 	struct extent_buffer *eb;
1257 	struct extent_buffer *parent;
1258 	struct btrfs_key key;
1259 	u64 old_bytenr;
1260 	u64 new_bytenr;
1261 	u64 old_ptr_gen;
1262 	u64 new_ptr_gen;
1263 	u64 last_snapshot;
1264 	u32 blocksize;
1265 	int level;
1266 	int ret;
1267 	int slot;
1268 
1269 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 	BUG_ON(lowest_level > 1 && leaf);
1272 
1273 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 
1275 	slot = path->slots[lowest_level];
1276 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 
1278 	eb = btrfs_lock_root_node(dest);
1279 	btrfs_set_lock_blocking(eb);
1280 	level = btrfs_header_level(eb);
1281 
1282 	if (level < lowest_level) {
1283 		btrfs_tree_unlock(eb);
1284 		free_extent_buffer(eb);
1285 		return 0;
1286 	}
1287 
1288 	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1289 	BUG_ON(ret);
1290 	btrfs_set_lock_blocking(eb);
1291 
1292 	if (next_key) {
1293 		next_key->objectid = (u64)-1;
1294 		next_key->type = (u8)-1;
1295 		next_key->offset = (u64)-1;
1296 	}
1297 
1298 	parent = eb;
1299 	while (1) {
1300 		level = btrfs_header_level(parent);
1301 		BUG_ON(level < lowest_level);
1302 
1303 		ret = btrfs_bin_search(parent, &key, level, &slot);
1304 		if (ret && slot > 0)
1305 			slot--;
1306 
1307 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1308 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1309 
1310 		old_bytenr = btrfs_node_blockptr(parent, slot);
1311 		blocksize = btrfs_level_size(dest, level - 1);
1312 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1313 
1314 		if (level <= max_level) {
1315 			eb = path->nodes[level];
1316 			new_bytenr = btrfs_node_blockptr(eb,
1317 							path->slots[level]);
1318 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1319 							path->slots[level]);
1320 		} else {
1321 			new_bytenr = 0;
1322 			new_ptr_gen = 0;
1323 		}
1324 
1325 		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1326 			WARN_ON(1);
1327 			ret = level;
1328 			break;
1329 		}
1330 
1331 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 		    memcmp_node_keys(parent, slot, path, level)) {
1333 			if (level <= lowest_level && !leaf) {
1334 				ret = 0;
1335 				break;
1336 			}
1337 
1338 			eb = read_tree_block(dest, old_bytenr, blocksize,
1339 					     old_ptr_gen);
1340 			btrfs_tree_lock(eb);
1341 			ret = btrfs_cow_block(trans, dest, eb, parent,
1342 					      slot, &eb);
1343 			BUG_ON(ret);
1344 			btrfs_set_lock_blocking(eb);
1345 
1346 			if (level <= lowest_level) {
1347 				*leaf = eb;
1348 				ret = 0;
1349 				break;
1350 			}
1351 
1352 			btrfs_tree_unlock(parent);
1353 			free_extent_buffer(parent);
1354 
1355 			parent = eb;
1356 			continue;
1357 		}
1358 
1359 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 				      path->slots[level]);
1361 		btrfs_release_path(src, path);
1362 
1363 		path->lowest_level = level;
1364 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1365 		path->lowest_level = 0;
1366 		BUG_ON(ret);
1367 
1368 		/*
1369 		 * swap blocks in fs tree and reloc tree.
1370 		 */
1371 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1372 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1373 		btrfs_mark_buffer_dirty(parent);
1374 
1375 		btrfs_set_node_blockptr(path->nodes[level],
1376 					path->slots[level], old_bytenr);
1377 		btrfs_set_node_ptr_generation(path->nodes[level],
1378 					      path->slots[level], old_ptr_gen);
1379 		btrfs_mark_buffer_dirty(path->nodes[level]);
1380 
1381 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1382 					path->nodes[level]->start,
1383 					src->root_key.objectid, level - 1, 0);
1384 		BUG_ON(ret);
1385 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1386 					0, dest->root_key.objectid, level - 1,
1387 					0);
1388 		BUG_ON(ret);
1389 
1390 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1391 					path->nodes[level]->start,
1392 					src->root_key.objectid, level - 1, 0);
1393 		BUG_ON(ret);
1394 
1395 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1396 					0, dest->root_key.objectid, level - 1,
1397 					0);
1398 		BUG_ON(ret);
1399 
1400 		btrfs_unlock_up_safe(path, 0);
1401 
1402 		ret = level;
1403 		break;
1404 	}
1405 	btrfs_tree_unlock(parent);
1406 	free_extent_buffer(parent);
1407 	return ret;
1408 }
1409 
1410 /*
1411  * helper to find next relocated block in reloc tree
1412  */
1413 static noinline_for_stack
1414 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1415 		       int *level)
1416 {
1417 	struct extent_buffer *eb;
1418 	int i;
1419 	u64 last_snapshot;
1420 	u32 nritems;
1421 
1422 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423 
1424 	for (i = 0; i < *level; i++) {
1425 		free_extent_buffer(path->nodes[i]);
1426 		path->nodes[i] = NULL;
1427 	}
1428 
1429 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1430 		eb = path->nodes[i];
1431 		nritems = btrfs_header_nritems(eb);
1432 		while (path->slots[i] + 1 < nritems) {
1433 			path->slots[i]++;
1434 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1435 			    last_snapshot)
1436 				continue;
1437 
1438 			*level = i;
1439 			return 0;
1440 		}
1441 		free_extent_buffer(path->nodes[i]);
1442 		path->nodes[i] = NULL;
1443 	}
1444 	return 1;
1445 }
1446 
1447 /*
1448  * walk down reloc tree to find relocated block of lowest level
1449  */
1450 static noinline_for_stack
1451 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1452 			 int *level)
1453 {
1454 	struct extent_buffer *eb = NULL;
1455 	int i;
1456 	u64 bytenr;
1457 	u64 ptr_gen = 0;
1458 	u64 last_snapshot;
1459 	u32 blocksize;
1460 	u32 nritems;
1461 
1462 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1463 
1464 	for (i = *level; i > 0; i--) {
1465 		eb = path->nodes[i];
1466 		nritems = btrfs_header_nritems(eb);
1467 		while (path->slots[i] < nritems) {
1468 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1469 			if (ptr_gen > last_snapshot)
1470 				break;
1471 			path->slots[i]++;
1472 		}
1473 		if (path->slots[i] >= nritems) {
1474 			if (i == *level)
1475 				break;
1476 			*level = i + 1;
1477 			return 0;
1478 		}
1479 		if (i == 1) {
1480 			*level = i;
1481 			return 0;
1482 		}
1483 
1484 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1485 		blocksize = btrfs_level_size(root, i - 1);
1486 		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1487 		BUG_ON(btrfs_header_level(eb) != i - 1);
1488 		path->nodes[i - 1] = eb;
1489 		path->slots[i - 1] = 0;
1490 	}
1491 	return 1;
1492 }
1493 
1494 /*
1495  * invalidate extent cache for file extents whose key in range of
1496  * [min_key, max_key)
1497  */
1498 static int invalidate_extent_cache(struct btrfs_root *root,
1499 				   struct btrfs_key *min_key,
1500 				   struct btrfs_key *max_key)
1501 {
1502 	struct inode *inode = NULL;
1503 	u64 objectid;
1504 	u64 start, end;
1505 
1506 	objectid = min_key->objectid;
1507 	while (1) {
1508 		cond_resched();
1509 		iput(inode);
1510 
1511 		if (objectid > max_key->objectid)
1512 			break;
1513 
1514 		inode = find_next_inode(root, objectid);
1515 		if (!inode)
1516 			break;
1517 
1518 		if (inode->i_ino > max_key->objectid) {
1519 			iput(inode);
1520 			break;
1521 		}
1522 
1523 		objectid = inode->i_ino + 1;
1524 		if (!S_ISREG(inode->i_mode))
1525 			continue;
1526 
1527 		if (unlikely(min_key->objectid == inode->i_ino)) {
1528 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1529 				continue;
1530 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1531 				start = 0;
1532 			else {
1533 				start = min_key->offset;
1534 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1535 			}
1536 		} else {
1537 			start = 0;
1538 		}
1539 
1540 		if (unlikely(max_key->objectid == inode->i_ino)) {
1541 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1542 				continue;
1543 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1544 				end = (u64)-1;
1545 			} else {
1546 				if (max_key->offset == 0)
1547 					continue;
1548 				end = max_key->offset;
1549 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1550 				end--;
1551 			}
1552 		} else {
1553 			end = (u64)-1;
1554 		}
1555 
1556 		/* the lock_extent waits for readpage to complete */
1557 		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1558 		btrfs_drop_extent_cache(inode, start, end, 1);
1559 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1560 	}
1561 	return 0;
1562 }
1563 
1564 static int find_next_key(struct btrfs_path *path, int level,
1565 			 struct btrfs_key *key)
1566 
1567 {
1568 	while (level < BTRFS_MAX_LEVEL) {
1569 		if (!path->nodes[level])
1570 			break;
1571 		if (path->slots[level] + 1 <
1572 		    btrfs_header_nritems(path->nodes[level])) {
1573 			btrfs_node_key_to_cpu(path->nodes[level], key,
1574 					      path->slots[level] + 1);
1575 			return 0;
1576 		}
1577 		level++;
1578 	}
1579 	return 1;
1580 }
1581 
1582 /*
1583  * merge the relocated tree blocks in reloc tree with corresponding
1584  * fs tree.
1585  */
1586 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1587 					       struct btrfs_root *root)
1588 {
1589 	LIST_HEAD(inode_list);
1590 	struct btrfs_key key;
1591 	struct btrfs_key next_key;
1592 	struct btrfs_trans_handle *trans;
1593 	struct btrfs_root *reloc_root;
1594 	struct btrfs_root_item *root_item;
1595 	struct btrfs_path *path;
1596 	struct extent_buffer *leaf = NULL;
1597 	unsigned long nr;
1598 	int level;
1599 	int max_level;
1600 	int replaced = 0;
1601 	int ret;
1602 	int err = 0;
1603 
1604 	path = btrfs_alloc_path();
1605 	if (!path)
1606 		return -ENOMEM;
1607 
1608 	reloc_root = root->reloc_root;
1609 	root_item = &reloc_root->root_item;
1610 
1611 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1612 		level = btrfs_root_level(root_item);
1613 		extent_buffer_get(reloc_root->node);
1614 		path->nodes[level] = reloc_root->node;
1615 		path->slots[level] = 0;
1616 	} else {
1617 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1618 
1619 		level = root_item->drop_level;
1620 		BUG_ON(level == 0);
1621 		path->lowest_level = level;
1622 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1623 		path->lowest_level = 0;
1624 		if (ret < 0) {
1625 			btrfs_free_path(path);
1626 			return ret;
1627 		}
1628 
1629 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1630 				      path->slots[level]);
1631 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1632 
1633 		btrfs_unlock_up_safe(path, 0);
1634 	}
1635 
1636 	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1637 		trans = btrfs_start_transaction(root, 1);
1638 
1639 		leaf = path->nodes[0];
1640 		btrfs_item_key_to_cpu(leaf, &key, 0);
1641 		btrfs_release_path(reloc_root, path);
1642 
1643 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1644 		if (ret < 0) {
1645 			err = ret;
1646 			goto out;
1647 		}
1648 
1649 		leaf = path->nodes[0];
1650 		btrfs_unlock_up_safe(path, 1);
1651 		ret = replace_file_extents(trans, rc, root, leaf,
1652 					   &inode_list);
1653 		if (ret < 0)
1654 			err = ret;
1655 		goto out;
1656 	}
1657 
1658 	memset(&next_key, 0, sizeof(next_key));
1659 
1660 	while (1) {
1661 		leaf = NULL;
1662 		replaced = 0;
1663 		trans = btrfs_start_transaction(root, 1);
1664 		max_level = level;
1665 
1666 		ret = walk_down_reloc_tree(reloc_root, path, &level);
1667 		if (ret < 0) {
1668 			err = ret;
1669 			goto out;
1670 		}
1671 		if (ret > 0)
1672 			break;
1673 
1674 		if (!find_next_key(path, level, &key) &&
1675 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1676 			ret = 0;
1677 		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1678 			ret = replace_path(trans, root, reloc_root,
1679 					   path, &next_key, &leaf,
1680 					   level, max_level);
1681 		} else {
1682 			ret = replace_path(trans, root, reloc_root,
1683 					   path, &next_key, NULL,
1684 					   level, max_level);
1685 		}
1686 		if (ret < 0) {
1687 			err = ret;
1688 			goto out;
1689 		}
1690 
1691 		if (ret > 0) {
1692 			level = ret;
1693 			btrfs_node_key_to_cpu(path->nodes[level], &key,
1694 					      path->slots[level]);
1695 			replaced = 1;
1696 		} else if (leaf) {
1697 			/*
1698 			 * no block got replaced, try replacing file extents
1699 			 */
1700 			btrfs_item_key_to_cpu(leaf, &key, 0);
1701 			ret = replace_file_extents(trans, rc, root, leaf,
1702 						   &inode_list);
1703 			btrfs_tree_unlock(leaf);
1704 			free_extent_buffer(leaf);
1705 			BUG_ON(ret < 0);
1706 		}
1707 
1708 		ret = walk_up_reloc_tree(reloc_root, path, &level);
1709 		if (ret > 0)
1710 			break;
1711 
1712 		BUG_ON(level == 0);
1713 		/*
1714 		 * save the merging progress in the drop_progress.
1715 		 * this is OK since root refs == 1 in this case.
1716 		 */
1717 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1718 			       path->slots[level]);
1719 		root_item->drop_level = level;
1720 
1721 		nr = trans->blocks_used;
1722 		btrfs_end_transaction(trans, root);
1723 
1724 		btrfs_btree_balance_dirty(root, nr);
1725 
1726 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1727 			invalidate_extent_cache(root, &key, &next_key);
1728 	}
1729 
1730 	/*
1731 	 * handle the case only one block in the fs tree need to be
1732 	 * relocated and the block is tree root.
1733 	 */
1734 	leaf = btrfs_lock_root_node(root);
1735 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1736 	btrfs_tree_unlock(leaf);
1737 	free_extent_buffer(leaf);
1738 	if (ret < 0)
1739 		err = ret;
1740 out:
1741 	btrfs_free_path(path);
1742 
1743 	if (err == 0) {
1744 		memset(&root_item->drop_progress, 0,
1745 		       sizeof(root_item->drop_progress));
1746 		root_item->drop_level = 0;
1747 		btrfs_set_root_refs(root_item, 0);
1748 	}
1749 
1750 	nr = trans->blocks_used;
1751 	btrfs_end_transaction(trans, root);
1752 
1753 	btrfs_btree_balance_dirty(root, nr);
1754 
1755 	/*
1756 	 * put inodes while we aren't holding the tree locks
1757 	 */
1758 	while (!list_empty(&inode_list)) {
1759 		struct inodevec *ivec;
1760 		ivec = list_entry(inode_list.next, struct inodevec, list);
1761 		list_del(&ivec->list);
1762 		while (ivec->nr > 0) {
1763 			ivec->nr--;
1764 			iput(ivec->inode[ivec->nr]);
1765 		}
1766 		kfree(ivec);
1767 	}
1768 
1769 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1770 		invalidate_extent_cache(root, &key, &next_key);
1771 
1772 	return err;
1773 }
1774 
1775 /*
1776  * callback for the work threads.
1777  * this function merges reloc tree with corresponding fs tree,
1778  * and then drops the reloc tree.
1779  */
1780 static void merge_func(struct btrfs_work *work)
1781 {
1782 	struct btrfs_trans_handle *trans;
1783 	struct btrfs_root *root;
1784 	struct btrfs_root *reloc_root;
1785 	struct async_merge *async;
1786 
1787 	async = container_of(work, struct async_merge, work);
1788 	reloc_root = async->root;
1789 
1790 	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1791 		root = read_fs_root(reloc_root->fs_info,
1792 				    reloc_root->root_key.offset);
1793 		BUG_ON(IS_ERR(root));
1794 		BUG_ON(root->reloc_root != reloc_root);
1795 
1796 		merge_reloc_root(async->rc, root);
1797 
1798 		trans = btrfs_start_transaction(root, 1);
1799 		btrfs_update_reloc_root(trans, root);
1800 		btrfs_end_transaction(trans, root);
1801 	}
1802 
1803 	btrfs_drop_snapshot(reloc_root, 0);
1804 
1805 	if (atomic_dec_and_test(async->num_pending))
1806 		complete(async->done);
1807 
1808 	kfree(async);
1809 }
1810 
1811 static int merge_reloc_roots(struct reloc_control *rc)
1812 {
1813 	struct async_merge *async;
1814 	struct btrfs_root *root;
1815 	struct completion done;
1816 	atomic_t num_pending;
1817 
1818 	init_completion(&done);
1819 	atomic_set(&num_pending, 1);
1820 
1821 	while (!list_empty(&rc->reloc_roots)) {
1822 		root = list_entry(rc->reloc_roots.next,
1823 				  struct btrfs_root, root_list);
1824 		list_del_init(&root->root_list);
1825 
1826 		async = kmalloc(sizeof(*async), GFP_NOFS);
1827 		BUG_ON(!async);
1828 		async->work.func = merge_func;
1829 		async->work.flags = 0;
1830 		async->rc = rc;
1831 		async->root = root;
1832 		async->done = &done;
1833 		async->num_pending = &num_pending;
1834 		atomic_inc(&num_pending);
1835 		btrfs_queue_worker(&rc->workers, &async->work);
1836 	}
1837 
1838 	if (!atomic_dec_and_test(&num_pending))
1839 		wait_for_completion(&done);
1840 
1841 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1842 	return 0;
1843 }
1844 
1845 static void free_block_list(struct rb_root *blocks)
1846 {
1847 	struct tree_block *block;
1848 	struct rb_node *rb_node;
1849 	while ((rb_node = rb_first(blocks))) {
1850 		block = rb_entry(rb_node, struct tree_block, rb_node);
1851 		rb_erase(rb_node, blocks);
1852 		kfree(block);
1853 	}
1854 }
1855 
1856 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1857 				      struct btrfs_root *reloc_root)
1858 {
1859 	struct btrfs_root *root;
1860 
1861 	if (reloc_root->last_trans == trans->transid)
1862 		return 0;
1863 
1864 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1865 	BUG_ON(IS_ERR(root));
1866 	BUG_ON(root->reloc_root != reloc_root);
1867 
1868 	return btrfs_record_root_in_trans(trans, root);
1869 }
1870 
1871 /*
1872  * select one tree from trees that references the block.
1873  * for blocks in refernce counted trees, we preper reloc tree.
1874  * if no reloc tree found and reloc_only is true, NULL is returned.
1875  */
1876 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1877 					    struct backref_node *node,
1878 					    struct backref_edge *edges[],
1879 					    int *nr, int reloc_only)
1880 {
1881 	struct backref_node *next;
1882 	struct btrfs_root *root;
1883 	int index;
1884 	int loop = 0;
1885 again:
1886 	index = 0;
1887 	next = node;
1888 	while (1) {
1889 		cond_resched();
1890 		next = walk_up_backref(next, edges, &index);
1891 		root = next->root;
1892 		if (!root) {
1893 			BUG_ON(!node->old_root);
1894 			goto skip;
1895 		}
1896 
1897 		/* no other choice for non-refernce counted tree */
1898 		if (!root->ref_cows) {
1899 			BUG_ON(reloc_only);
1900 			break;
1901 		}
1902 
1903 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1904 			record_reloc_root_in_trans(trans, root);
1905 			break;
1906 		}
1907 
1908 		if (loop) {
1909 			btrfs_record_root_in_trans(trans, root);
1910 			break;
1911 		}
1912 
1913 		if (reloc_only || next != node) {
1914 			if (!root->reloc_root)
1915 				btrfs_record_root_in_trans(trans, root);
1916 			root = root->reloc_root;
1917 			/*
1918 			 * if the reloc tree was created in current
1919 			 * transation, there is no node in backref tree
1920 			 * corresponds to the root of the reloc tree.
1921 			 */
1922 			if (btrfs_root_last_snapshot(&root->root_item) ==
1923 			    trans->transid - 1)
1924 				break;
1925 		}
1926 skip:
1927 		root = NULL;
1928 		next = walk_down_backref(edges, &index);
1929 		if (!next || next->level <= node->level)
1930 			break;
1931 	}
1932 
1933 	if (!root && !loop && !reloc_only) {
1934 		loop = 1;
1935 		goto again;
1936 	}
1937 
1938 	if (root)
1939 		*nr = index;
1940 	else
1941 		*nr = 0;
1942 
1943 	return root;
1944 }
1945 
1946 static noinline_for_stack
1947 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1948 				   struct backref_node *node)
1949 {
1950 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1951 	int nr;
1952 	return __select_one_root(trans, node, edges, &nr, 0);
1953 }
1954 
1955 static noinline_for_stack
1956 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1957 				     struct backref_node *node,
1958 				     struct backref_edge *edges[], int *nr)
1959 {
1960 	return __select_one_root(trans, node, edges, nr, 1);
1961 }
1962 
1963 static void grab_path_buffers(struct btrfs_path *path,
1964 			      struct backref_node *node,
1965 			      struct backref_edge *edges[], int nr)
1966 {
1967 	int i = 0;
1968 	while (1) {
1969 		drop_node_buffer(node);
1970 		node->eb = path->nodes[node->level];
1971 		BUG_ON(!node->eb);
1972 		if (path->locks[node->level])
1973 			node->locked = 1;
1974 		path->nodes[node->level] = NULL;
1975 		path->locks[node->level] = 0;
1976 
1977 		if (i >= nr)
1978 			break;
1979 
1980 		edges[i]->blockptr = node->eb->start;
1981 		node = edges[i]->node[UPPER];
1982 		i++;
1983 	}
1984 }
1985 
1986 /*
1987  * relocate a block tree, and then update pointers in upper level
1988  * blocks that reference the block to point to the new location.
1989  *
1990  * if called by link_to_upper, the block has already been relocated.
1991  * in that case this function just updates pointers.
1992  */
1993 static int do_relocation(struct btrfs_trans_handle *trans,
1994 			 struct backref_node *node,
1995 			 struct btrfs_key *key,
1996 			 struct btrfs_path *path, int lowest)
1997 {
1998 	struct backref_node *upper;
1999 	struct backref_edge *edge;
2000 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2001 	struct btrfs_root *root;
2002 	struct extent_buffer *eb;
2003 	u32 blocksize;
2004 	u64 bytenr;
2005 	u64 generation;
2006 	int nr;
2007 	int slot;
2008 	int ret;
2009 	int err = 0;
2010 
2011 	BUG_ON(lowest && node->eb);
2012 
2013 	path->lowest_level = node->level + 1;
2014 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2015 		cond_resched();
2016 		if (node->eb && node->eb->start == edge->blockptr)
2017 			continue;
2018 
2019 		upper = edge->node[UPPER];
2020 		root = select_reloc_root(trans, upper, edges, &nr);
2021 		if (!root)
2022 			continue;
2023 
2024 		if (upper->eb && !upper->locked)
2025 			drop_node_buffer(upper);
2026 
2027 		if (!upper->eb) {
2028 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2029 			if (ret < 0) {
2030 				err = ret;
2031 				break;
2032 			}
2033 			BUG_ON(ret > 0);
2034 
2035 			slot = path->slots[upper->level];
2036 
2037 			btrfs_unlock_up_safe(path, upper->level + 1);
2038 			grab_path_buffers(path, upper, edges, nr);
2039 
2040 			btrfs_release_path(NULL, path);
2041 		} else {
2042 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2043 					       &slot);
2044 			BUG_ON(ret);
2045 		}
2046 
2047 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2048 		if (!lowest) {
2049 			if (node->eb->start == bytenr) {
2050 				btrfs_tree_unlock(upper->eb);
2051 				upper->locked = 0;
2052 				continue;
2053 			}
2054 		} else {
2055 			BUG_ON(node->bytenr != bytenr);
2056 		}
2057 
2058 		blocksize = btrfs_level_size(root, node->level);
2059 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2060 		eb = read_tree_block(root, bytenr, blocksize, generation);
2061 		btrfs_tree_lock(eb);
2062 		btrfs_set_lock_blocking(eb);
2063 
2064 		if (!node->eb) {
2065 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2066 					      slot, &eb);
2067 			if (ret < 0) {
2068 				err = ret;
2069 				break;
2070 			}
2071 			btrfs_set_lock_blocking(eb);
2072 			node->eb = eb;
2073 			node->locked = 1;
2074 		} else {
2075 			btrfs_set_node_blockptr(upper->eb, slot,
2076 						node->eb->start);
2077 			btrfs_set_node_ptr_generation(upper->eb, slot,
2078 						      trans->transid);
2079 			btrfs_mark_buffer_dirty(upper->eb);
2080 
2081 			ret = btrfs_inc_extent_ref(trans, root,
2082 						node->eb->start, blocksize,
2083 						upper->eb->start,
2084 						btrfs_header_owner(upper->eb),
2085 						node->level, 0);
2086 			BUG_ON(ret);
2087 
2088 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2089 			BUG_ON(ret);
2090 		}
2091 		if (!lowest) {
2092 			btrfs_tree_unlock(upper->eb);
2093 			upper->locked = 0;
2094 		}
2095 	}
2096 	path->lowest_level = 0;
2097 	return err;
2098 }
2099 
2100 static int link_to_upper(struct btrfs_trans_handle *trans,
2101 			 struct backref_node *node,
2102 			 struct btrfs_path *path)
2103 {
2104 	struct btrfs_key key;
2105 	if (!node->eb || list_empty(&node->upper))
2106 		return 0;
2107 
2108 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2109 	return do_relocation(trans, node, &key, path, 0);
2110 }
2111 
2112 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2113 				struct backref_cache *cache,
2114 				struct btrfs_path *path)
2115 {
2116 	struct backref_node *node;
2117 	int level;
2118 	int ret;
2119 	int err = 0;
2120 
2121 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2122 		while (!list_empty(&cache->pending[level])) {
2123 			node = list_entry(cache->pending[level].next,
2124 					  struct backref_node, lower);
2125 			BUG_ON(node->level != level);
2126 
2127 			ret = link_to_upper(trans, node, path);
2128 			if (ret < 0)
2129 				err = ret;
2130 			/*
2131 			 * this remove the node from the pending list and
2132 			 * may add some other nodes to the level + 1
2133 			 * pending list
2134 			 */
2135 			remove_backref_node(cache, node);
2136 		}
2137 	}
2138 	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2139 	return err;
2140 }
2141 
2142 static void mark_block_processed(struct reloc_control *rc,
2143 				 struct backref_node *node)
2144 {
2145 	u32 blocksize;
2146 	if (node->level == 0 ||
2147 	    in_block_group(node->bytenr, rc->block_group)) {
2148 		blocksize = btrfs_level_size(rc->extent_root, node->level);
2149 		set_extent_bits(&rc->processed_blocks, node->bytenr,
2150 				node->bytenr + blocksize - 1, EXTENT_DIRTY,
2151 				GFP_NOFS);
2152 	}
2153 	node->processed = 1;
2154 }
2155 
2156 /*
2157  * mark a block and all blocks directly/indirectly reference the block
2158  * as processed.
2159  */
2160 static void update_processed_blocks(struct reloc_control *rc,
2161 				    struct backref_node *node)
2162 {
2163 	struct backref_node *next = node;
2164 	struct backref_edge *edge;
2165 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2166 	int index = 0;
2167 
2168 	while (next) {
2169 		cond_resched();
2170 		while (1) {
2171 			if (next->processed)
2172 				break;
2173 
2174 			mark_block_processed(rc, next);
2175 
2176 			if (list_empty(&next->upper))
2177 				break;
2178 
2179 			edge = list_entry(next->upper.next,
2180 					  struct backref_edge, list[LOWER]);
2181 			edges[index++] = edge;
2182 			next = edge->node[UPPER];
2183 		}
2184 		next = walk_down_backref(edges, &index);
2185 	}
2186 }
2187 
2188 static int tree_block_processed(u64 bytenr, u32 blocksize,
2189 				struct reloc_control *rc)
2190 {
2191 	if (test_range_bit(&rc->processed_blocks, bytenr,
2192 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2193 		return 1;
2194 	return 0;
2195 }
2196 
2197 /*
2198  * check if there are any file extent pointers in the leaf point to
2199  * data require processing
2200  */
2201 static int check_file_extents(struct reloc_control *rc,
2202 			      u64 bytenr, u32 blocksize, u64 ptr_gen)
2203 {
2204 	struct btrfs_key found_key;
2205 	struct btrfs_file_extent_item *fi;
2206 	struct extent_buffer *leaf;
2207 	u32 nritems;
2208 	int i;
2209 	int ret = 0;
2210 
2211 	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2212 
2213 	nritems = btrfs_header_nritems(leaf);
2214 	for (i = 0; i < nritems; i++) {
2215 		cond_resched();
2216 		btrfs_item_key_to_cpu(leaf, &found_key, i);
2217 		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2218 			continue;
2219 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2220 		if (btrfs_file_extent_type(leaf, fi) ==
2221 		    BTRFS_FILE_EXTENT_INLINE)
2222 			continue;
2223 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2224 		if (bytenr == 0)
2225 			continue;
2226 		if (in_block_group(bytenr, rc->block_group)) {
2227 			ret = 1;
2228 			break;
2229 		}
2230 	}
2231 	free_extent_buffer(leaf);
2232 	return ret;
2233 }
2234 
2235 /*
2236  * scan child blocks of a given block to find blocks require processing
2237  */
2238 static int add_child_blocks(struct btrfs_trans_handle *trans,
2239 			    struct reloc_control *rc,
2240 			    struct backref_node *node,
2241 			    struct rb_root *blocks)
2242 {
2243 	struct tree_block *block;
2244 	struct rb_node *rb_node;
2245 	u64 bytenr;
2246 	u64 ptr_gen;
2247 	u32 blocksize;
2248 	u32 nritems;
2249 	int i;
2250 	int err = 0;
2251 
2252 	nritems = btrfs_header_nritems(node->eb);
2253 	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2254 	for (i = 0; i < nritems; i++) {
2255 		cond_resched();
2256 		bytenr = btrfs_node_blockptr(node->eb, i);
2257 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2258 		if (ptr_gen == trans->transid)
2259 			continue;
2260 		if (!in_block_group(bytenr, rc->block_group) &&
2261 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2262 			continue;
2263 		if (tree_block_processed(bytenr, blocksize, rc))
2264 			continue;
2265 
2266 		readahead_tree_block(rc->extent_root,
2267 				     bytenr, blocksize, ptr_gen);
2268 	}
2269 
2270 	for (i = 0; i < nritems; i++) {
2271 		cond_resched();
2272 		bytenr = btrfs_node_blockptr(node->eb, i);
2273 		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2274 		if (ptr_gen == trans->transid)
2275 			continue;
2276 		if (!in_block_group(bytenr, rc->block_group) &&
2277 		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2278 			continue;
2279 		if (tree_block_processed(bytenr, blocksize, rc))
2280 			continue;
2281 		if (!in_block_group(bytenr, rc->block_group) &&
2282 		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2283 			continue;
2284 
2285 		block = kmalloc(sizeof(*block), GFP_NOFS);
2286 		if (!block) {
2287 			err = -ENOMEM;
2288 			break;
2289 		}
2290 		block->bytenr = bytenr;
2291 		btrfs_node_key_to_cpu(node->eb, &block->key, i);
2292 		block->level = node->level - 1;
2293 		block->key_ready = 1;
2294 		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2295 		BUG_ON(rb_node);
2296 	}
2297 	if (err)
2298 		free_block_list(blocks);
2299 	return err;
2300 }
2301 
2302 /*
2303  * find adjacent blocks require processing
2304  */
2305 static noinline_for_stack
2306 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2307 			struct reloc_control *rc,
2308 			struct backref_cache *cache,
2309 			struct rb_root *blocks, int level,
2310 			struct backref_node **upper)
2311 {
2312 	struct backref_node *node;
2313 	int ret = 0;
2314 
2315 	WARN_ON(!list_empty(&cache->pending[level]));
2316 
2317 	if (list_empty(&cache->pending[level + 1]))
2318 		return 1;
2319 
2320 	node = list_entry(cache->pending[level + 1].next,
2321 			  struct backref_node, lower);
2322 	if (node->eb)
2323 		ret = add_child_blocks(trans, rc, node, blocks);
2324 
2325 	*upper = node;
2326 	return ret;
2327 }
2328 
2329 static int get_tree_block_key(struct reloc_control *rc,
2330 			      struct tree_block *block)
2331 {
2332 	struct extent_buffer *eb;
2333 
2334 	BUG_ON(block->key_ready);
2335 	eb = read_tree_block(rc->extent_root, block->bytenr,
2336 			     block->key.objectid, block->key.offset);
2337 	WARN_ON(btrfs_header_level(eb) != block->level);
2338 	if (block->level == 0)
2339 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2340 	else
2341 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2342 	free_extent_buffer(eb);
2343 	block->key_ready = 1;
2344 	return 0;
2345 }
2346 
2347 static int reada_tree_block(struct reloc_control *rc,
2348 			    struct tree_block *block)
2349 {
2350 	BUG_ON(block->key_ready);
2351 	readahead_tree_block(rc->extent_root, block->bytenr,
2352 			     block->key.objectid, block->key.offset);
2353 	return 0;
2354 }
2355 
2356 /*
2357  * helper function to relocate a tree block
2358  */
2359 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2360 				struct reloc_control *rc,
2361 				struct backref_node *node,
2362 				struct btrfs_key *key,
2363 				struct btrfs_path *path)
2364 {
2365 	struct btrfs_root *root;
2366 	int ret;
2367 
2368 	root = select_one_root(trans, node);
2369 	if (unlikely(!root)) {
2370 		rc->found_old_snapshot = 1;
2371 		update_processed_blocks(rc, node);
2372 		return 0;
2373 	}
2374 
2375 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2376 		ret = do_relocation(trans, node, key, path, 1);
2377 		if (ret < 0)
2378 			goto out;
2379 		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2380 			ret = replace_file_extents(trans, rc, root,
2381 						   node->eb, NULL);
2382 			if (ret < 0)
2383 				goto out;
2384 		}
2385 		drop_node_buffer(node);
2386 	} else if (!root->ref_cows) {
2387 		path->lowest_level = node->level;
2388 		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2389 		btrfs_release_path(root, path);
2390 		if (ret < 0)
2391 			goto out;
2392 	} else if (root != node->root) {
2393 		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2394 	}
2395 
2396 	update_processed_blocks(rc, node);
2397 	ret = 0;
2398 out:
2399 	drop_node_buffer(node);
2400 	return ret;
2401 }
2402 
2403 /*
2404  * relocate a list of blocks
2405  */
2406 static noinline_for_stack
2407 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2408 			 struct reloc_control *rc, struct rb_root *blocks)
2409 {
2410 	struct backref_cache *cache;
2411 	struct backref_node *node;
2412 	struct btrfs_path *path;
2413 	struct tree_block *block;
2414 	struct rb_node *rb_node;
2415 	int level = -1;
2416 	int ret;
2417 	int err = 0;
2418 
2419 	path = btrfs_alloc_path();
2420 	if (!path)
2421 		return -ENOMEM;
2422 
2423 	cache = kmalloc(sizeof(*cache), GFP_NOFS);
2424 	if (!cache) {
2425 		btrfs_free_path(path);
2426 		return -ENOMEM;
2427 	}
2428 
2429 	backref_cache_init(cache);
2430 
2431 	rb_node = rb_first(blocks);
2432 	while (rb_node) {
2433 		block = rb_entry(rb_node, struct tree_block, rb_node);
2434 		if (level == -1)
2435 			level = block->level;
2436 		else
2437 			BUG_ON(level != block->level);
2438 		if (!block->key_ready)
2439 			reada_tree_block(rc, block);
2440 		rb_node = rb_next(rb_node);
2441 	}
2442 
2443 	rb_node = rb_first(blocks);
2444 	while (rb_node) {
2445 		block = rb_entry(rb_node, struct tree_block, rb_node);
2446 		if (!block->key_ready)
2447 			get_tree_block_key(rc, block);
2448 		rb_node = rb_next(rb_node);
2449 	}
2450 
2451 	rb_node = rb_first(blocks);
2452 	while (rb_node) {
2453 		block = rb_entry(rb_node, struct tree_block, rb_node);
2454 
2455 		node = build_backref_tree(rc, cache, &block->key,
2456 					  block->level, block->bytenr);
2457 		if (IS_ERR(node)) {
2458 			err = PTR_ERR(node);
2459 			goto out;
2460 		}
2461 
2462 		ret = relocate_tree_block(trans, rc, node, &block->key,
2463 					  path);
2464 		if (ret < 0) {
2465 			err = ret;
2466 			goto out;
2467 		}
2468 		remove_backref_node(cache, node);
2469 		rb_node = rb_next(rb_node);
2470 	}
2471 
2472 	if (level > 0)
2473 		goto out;
2474 
2475 	free_block_list(blocks);
2476 
2477 	/*
2478 	 * now backrefs of some upper level tree blocks have been cached,
2479 	 * try relocating blocks referenced by these upper level blocks.
2480 	 */
2481 	while (1) {
2482 		struct backref_node *upper = NULL;
2483 		if (trans->transaction->in_commit ||
2484 		    trans->transaction->delayed_refs.flushing)
2485 			break;
2486 
2487 		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2488 					  &upper);
2489 		if (ret < 0)
2490 			err = ret;
2491 		if (ret != 0)
2492 			break;
2493 
2494 		rb_node = rb_first(blocks);
2495 		while (rb_node) {
2496 			block = rb_entry(rb_node, struct tree_block, rb_node);
2497 			if (trans->transaction->in_commit ||
2498 			    trans->transaction->delayed_refs.flushing)
2499 				goto out;
2500 			BUG_ON(!block->key_ready);
2501 			node = build_backref_tree(rc, cache, &block->key,
2502 						  level, block->bytenr);
2503 			if (IS_ERR(node)) {
2504 				err = PTR_ERR(node);
2505 				goto out;
2506 			}
2507 
2508 			ret = relocate_tree_block(trans, rc, node,
2509 						  &block->key, path);
2510 			if (ret < 0) {
2511 				err = ret;
2512 				goto out;
2513 			}
2514 			remove_backref_node(cache, node);
2515 			rb_node = rb_next(rb_node);
2516 		}
2517 		free_block_list(blocks);
2518 
2519 		if (upper) {
2520 			ret = link_to_upper(trans, upper, path);
2521 			if (ret < 0) {
2522 				err = ret;
2523 				break;
2524 			}
2525 			remove_backref_node(cache, upper);
2526 		}
2527 	}
2528 out:
2529 	free_block_list(blocks);
2530 
2531 	ret = finish_pending_nodes(trans, cache, path);
2532 	if (ret < 0)
2533 		err = ret;
2534 
2535 	kfree(cache);
2536 	btrfs_free_path(path);
2537 	return err;
2538 }
2539 
2540 static noinline_for_stack
2541 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2542 			 u64 block_start)
2543 {
2544 	struct btrfs_root *root = BTRFS_I(inode)->root;
2545 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2546 	struct extent_map *em;
2547 	int ret = 0;
2548 
2549 	em = alloc_extent_map(GFP_NOFS);
2550 	if (!em)
2551 		return -ENOMEM;
2552 
2553 	em->start = start;
2554 	em->len = end + 1 - start;
2555 	em->block_len = em->len;
2556 	em->block_start = block_start;
2557 	em->bdev = root->fs_info->fs_devices->latest_bdev;
2558 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
2559 
2560 	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2561 	while (1) {
2562 		write_lock(&em_tree->lock);
2563 		ret = add_extent_mapping(em_tree, em);
2564 		write_unlock(&em_tree->lock);
2565 		if (ret != -EEXIST) {
2566 			free_extent_map(em);
2567 			break;
2568 		}
2569 		btrfs_drop_extent_cache(inode, start, end, 0);
2570 	}
2571 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2572 	return ret;
2573 }
2574 
2575 static int relocate_file_extent_cluster(struct inode *inode,
2576 					struct file_extent_cluster *cluster)
2577 {
2578 	u64 page_start;
2579 	u64 page_end;
2580 	u64 offset = BTRFS_I(inode)->index_cnt;
2581 	unsigned long index;
2582 	unsigned long last_index;
2583 	unsigned int dirty_page = 0;
2584 	struct page *page;
2585 	struct file_ra_state *ra;
2586 	int nr = 0;
2587 	int ret = 0;
2588 
2589 	if (!cluster->nr)
2590 		return 0;
2591 
2592 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
2593 	if (!ra)
2594 		return -ENOMEM;
2595 
2596 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2597 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2598 
2599 	mutex_lock(&inode->i_mutex);
2600 
2601 	i_size_write(inode, cluster->end + 1 - offset);
2602 	ret = setup_extent_mapping(inode, cluster->start - offset,
2603 				   cluster->end - offset, cluster->start);
2604 	if (ret)
2605 		goto out_unlock;
2606 
2607 	file_ra_state_init(ra, inode->i_mapping);
2608 
2609 	WARN_ON(cluster->start != cluster->boundary[0]);
2610 	while (index <= last_index) {
2611 		page = find_lock_page(inode->i_mapping, index);
2612 		if (!page) {
2613 			page_cache_sync_readahead(inode->i_mapping,
2614 						  ra, NULL, index,
2615 						  last_index + 1 - index);
2616 			page = grab_cache_page(inode->i_mapping, index);
2617 			if (!page) {
2618 				ret = -ENOMEM;
2619 				goto out_unlock;
2620 			}
2621 		}
2622 
2623 		if (PageReadahead(page)) {
2624 			page_cache_async_readahead(inode->i_mapping,
2625 						   ra, NULL, page, index,
2626 						   last_index + 1 - index);
2627 		}
2628 
2629 		if (!PageUptodate(page)) {
2630 			btrfs_readpage(NULL, page);
2631 			lock_page(page);
2632 			if (!PageUptodate(page)) {
2633 				unlock_page(page);
2634 				page_cache_release(page);
2635 				ret = -EIO;
2636 				goto out_unlock;
2637 			}
2638 		}
2639 
2640 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2641 		page_end = page_start + PAGE_CACHE_SIZE - 1;
2642 
2643 		lock_extent(&BTRFS_I(inode)->io_tree,
2644 			    page_start, page_end, GFP_NOFS);
2645 
2646 		set_page_extent_mapped(page);
2647 
2648 		if (nr < cluster->nr &&
2649 		    page_start + offset == cluster->boundary[nr]) {
2650 			set_extent_bits(&BTRFS_I(inode)->io_tree,
2651 					page_start, page_end,
2652 					EXTENT_BOUNDARY, GFP_NOFS);
2653 			nr++;
2654 		}
2655 		btrfs_set_extent_delalloc(inode, page_start, page_end);
2656 
2657 		set_page_dirty(page);
2658 		dirty_page++;
2659 
2660 		unlock_extent(&BTRFS_I(inode)->io_tree,
2661 			      page_start, page_end, GFP_NOFS);
2662 		unlock_page(page);
2663 		page_cache_release(page);
2664 
2665 		index++;
2666 		if (nr < cluster->nr &&
2667 		    page_end + 1 + offset == cluster->boundary[nr]) {
2668 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2669 							   dirty_page);
2670 			dirty_page = 0;
2671 		}
2672 	}
2673 	if (dirty_page) {
2674 		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2675 						   dirty_page);
2676 	}
2677 	WARN_ON(nr != cluster->nr);
2678 out_unlock:
2679 	mutex_unlock(&inode->i_mutex);
2680 	kfree(ra);
2681 	return ret;
2682 }
2683 
2684 static noinline_for_stack
2685 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2686 			 struct file_extent_cluster *cluster)
2687 {
2688 	int ret;
2689 
2690 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2691 		ret = relocate_file_extent_cluster(inode, cluster);
2692 		if (ret)
2693 			return ret;
2694 		cluster->nr = 0;
2695 	}
2696 
2697 	if (!cluster->nr)
2698 		cluster->start = extent_key->objectid;
2699 	else
2700 		BUG_ON(cluster->nr >= MAX_EXTENTS);
2701 	cluster->end = extent_key->objectid + extent_key->offset - 1;
2702 	cluster->boundary[cluster->nr] = extent_key->objectid;
2703 	cluster->nr++;
2704 
2705 	if (cluster->nr >= MAX_EXTENTS) {
2706 		ret = relocate_file_extent_cluster(inode, cluster);
2707 		if (ret)
2708 			return ret;
2709 		cluster->nr = 0;
2710 	}
2711 	return 0;
2712 }
2713 
2714 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2715 static int get_ref_objectid_v0(struct reloc_control *rc,
2716 			       struct btrfs_path *path,
2717 			       struct btrfs_key *extent_key,
2718 			       u64 *ref_objectid, int *path_change)
2719 {
2720 	struct btrfs_key key;
2721 	struct extent_buffer *leaf;
2722 	struct btrfs_extent_ref_v0 *ref0;
2723 	int ret;
2724 	int slot;
2725 
2726 	leaf = path->nodes[0];
2727 	slot = path->slots[0];
2728 	while (1) {
2729 		if (slot >= btrfs_header_nritems(leaf)) {
2730 			ret = btrfs_next_leaf(rc->extent_root, path);
2731 			if (ret < 0)
2732 				return ret;
2733 			BUG_ON(ret > 0);
2734 			leaf = path->nodes[0];
2735 			slot = path->slots[0];
2736 			if (path_change)
2737 				*path_change = 1;
2738 		}
2739 		btrfs_item_key_to_cpu(leaf, &key, slot);
2740 		if (key.objectid != extent_key->objectid)
2741 			return -ENOENT;
2742 
2743 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2744 			slot++;
2745 			continue;
2746 		}
2747 		ref0 = btrfs_item_ptr(leaf, slot,
2748 				struct btrfs_extent_ref_v0);
2749 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2750 		break;
2751 	}
2752 	return 0;
2753 }
2754 #endif
2755 
2756 /*
2757  * helper to add a tree block to the list.
2758  * the major work is getting the generation and level of the block
2759  */
2760 static int add_tree_block(struct reloc_control *rc,
2761 			  struct btrfs_key *extent_key,
2762 			  struct btrfs_path *path,
2763 			  struct rb_root *blocks)
2764 {
2765 	struct extent_buffer *eb;
2766 	struct btrfs_extent_item *ei;
2767 	struct btrfs_tree_block_info *bi;
2768 	struct tree_block *block;
2769 	struct rb_node *rb_node;
2770 	u32 item_size;
2771 	int level = -1;
2772 	int generation;
2773 
2774 	eb =  path->nodes[0];
2775 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
2776 
2777 	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2778 		ei = btrfs_item_ptr(eb, path->slots[0],
2779 				struct btrfs_extent_item);
2780 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2781 		generation = btrfs_extent_generation(eb, ei);
2782 		level = btrfs_tree_block_level(eb, bi);
2783 	} else {
2784 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2785 		u64 ref_owner;
2786 		int ret;
2787 
2788 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2789 		ret = get_ref_objectid_v0(rc, path, extent_key,
2790 					  &ref_owner, NULL);
2791 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2792 		level = (int)ref_owner;
2793 		/* FIXME: get real generation */
2794 		generation = 0;
2795 #else
2796 		BUG();
2797 #endif
2798 	}
2799 
2800 	btrfs_release_path(rc->extent_root, path);
2801 
2802 	BUG_ON(level == -1);
2803 
2804 	block = kmalloc(sizeof(*block), GFP_NOFS);
2805 	if (!block)
2806 		return -ENOMEM;
2807 
2808 	block->bytenr = extent_key->objectid;
2809 	block->key.objectid = extent_key->offset;
2810 	block->key.offset = generation;
2811 	block->level = level;
2812 	block->key_ready = 0;
2813 
2814 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2815 	BUG_ON(rb_node);
2816 
2817 	return 0;
2818 }
2819 
2820 /*
2821  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2822  */
2823 static int __add_tree_block(struct reloc_control *rc,
2824 			    u64 bytenr, u32 blocksize,
2825 			    struct rb_root *blocks)
2826 {
2827 	struct btrfs_path *path;
2828 	struct btrfs_key key;
2829 	int ret;
2830 
2831 	if (tree_block_processed(bytenr, blocksize, rc))
2832 		return 0;
2833 
2834 	if (tree_search(blocks, bytenr))
2835 		return 0;
2836 
2837 	path = btrfs_alloc_path();
2838 	if (!path)
2839 		return -ENOMEM;
2840 
2841 	key.objectid = bytenr;
2842 	key.type = BTRFS_EXTENT_ITEM_KEY;
2843 	key.offset = blocksize;
2844 
2845 	path->search_commit_root = 1;
2846 	path->skip_locking = 1;
2847 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2848 	if (ret < 0)
2849 		goto out;
2850 	BUG_ON(ret);
2851 
2852 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2853 	ret = add_tree_block(rc, &key, path, blocks);
2854 out:
2855 	btrfs_free_path(path);
2856 	return ret;
2857 }
2858 
2859 /*
2860  * helper to check if the block use full backrefs for pointers in it
2861  */
2862 static int block_use_full_backref(struct reloc_control *rc,
2863 				  struct extent_buffer *eb)
2864 {
2865 	struct btrfs_path *path;
2866 	struct btrfs_extent_item *ei;
2867 	struct btrfs_key key;
2868 	u64 flags;
2869 	int ret;
2870 
2871 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2872 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2873 		return 1;
2874 
2875 	path = btrfs_alloc_path();
2876 	BUG_ON(!path);
2877 
2878 	key.objectid = eb->start;
2879 	key.type = BTRFS_EXTENT_ITEM_KEY;
2880 	key.offset = eb->len;
2881 
2882 	path->search_commit_root = 1;
2883 	path->skip_locking = 1;
2884 	ret = btrfs_search_slot(NULL, rc->extent_root,
2885 				&key, path, 0, 0);
2886 	BUG_ON(ret);
2887 
2888 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2889 			    struct btrfs_extent_item);
2890 	flags = btrfs_extent_flags(path->nodes[0], ei);
2891 	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2892 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2893 		ret = 1;
2894 	else
2895 		ret = 0;
2896 	btrfs_free_path(path);
2897 	return ret;
2898 }
2899 
2900 /*
2901  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2902  * this function scans fs tree to find blocks reference the data extent
2903  */
2904 static int find_data_references(struct reloc_control *rc,
2905 				struct btrfs_key *extent_key,
2906 				struct extent_buffer *leaf,
2907 				struct btrfs_extent_data_ref *ref,
2908 				struct rb_root *blocks)
2909 {
2910 	struct btrfs_path *path;
2911 	struct tree_block *block;
2912 	struct btrfs_root *root;
2913 	struct btrfs_file_extent_item *fi;
2914 	struct rb_node *rb_node;
2915 	struct btrfs_key key;
2916 	u64 ref_root;
2917 	u64 ref_objectid;
2918 	u64 ref_offset;
2919 	u32 ref_count;
2920 	u32 nritems;
2921 	int err = 0;
2922 	int added = 0;
2923 	int counted;
2924 	int ret;
2925 
2926 	path = btrfs_alloc_path();
2927 	if (!path)
2928 		return -ENOMEM;
2929 
2930 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
2931 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2932 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2933 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
2934 
2935 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
2936 	if (IS_ERR(root)) {
2937 		err = PTR_ERR(root);
2938 		goto out;
2939 	}
2940 
2941 	key.objectid = ref_objectid;
2942 	key.offset = ref_offset;
2943 	key.type = BTRFS_EXTENT_DATA_KEY;
2944 
2945 	path->search_commit_root = 1;
2946 	path->skip_locking = 1;
2947 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2948 	if (ret < 0) {
2949 		err = ret;
2950 		goto out;
2951 	}
2952 
2953 	leaf = path->nodes[0];
2954 	nritems = btrfs_header_nritems(leaf);
2955 	/*
2956 	 * the references in tree blocks that use full backrefs
2957 	 * are not counted in
2958 	 */
2959 	if (block_use_full_backref(rc, leaf))
2960 		counted = 0;
2961 	else
2962 		counted = 1;
2963 	rb_node = tree_search(blocks, leaf->start);
2964 	if (rb_node) {
2965 		if (counted)
2966 			added = 1;
2967 		else
2968 			path->slots[0] = nritems;
2969 	}
2970 
2971 	while (ref_count > 0) {
2972 		while (path->slots[0] >= nritems) {
2973 			ret = btrfs_next_leaf(root, path);
2974 			if (ret < 0) {
2975 				err = ret;
2976 				goto out;
2977 			}
2978 			if (ret > 0) {
2979 				WARN_ON(1);
2980 				goto out;
2981 			}
2982 
2983 			leaf = path->nodes[0];
2984 			nritems = btrfs_header_nritems(leaf);
2985 			added = 0;
2986 
2987 			if (block_use_full_backref(rc, leaf))
2988 				counted = 0;
2989 			else
2990 				counted = 1;
2991 			rb_node = tree_search(blocks, leaf->start);
2992 			if (rb_node) {
2993 				if (counted)
2994 					added = 1;
2995 				else
2996 					path->slots[0] = nritems;
2997 			}
2998 		}
2999 
3000 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3001 		if (key.objectid != ref_objectid ||
3002 		    key.type != BTRFS_EXTENT_DATA_KEY) {
3003 			WARN_ON(1);
3004 			break;
3005 		}
3006 
3007 		fi = btrfs_item_ptr(leaf, path->slots[0],
3008 				    struct btrfs_file_extent_item);
3009 
3010 		if (btrfs_file_extent_type(leaf, fi) ==
3011 		    BTRFS_FILE_EXTENT_INLINE)
3012 			goto next;
3013 
3014 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3015 		    extent_key->objectid)
3016 			goto next;
3017 
3018 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3019 		if (key.offset != ref_offset)
3020 			goto next;
3021 
3022 		if (counted)
3023 			ref_count--;
3024 		if (added)
3025 			goto next;
3026 
3027 		if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3028 			block = kmalloc(sizeof(*block), GFP_NOFS);
3029 			if (!block) {
3030 				err = -ENOMEM;
3031 				break;
3032 			}
3033 			block->bytenr = leaf->start;
3034 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3035 			block->level = 0;
3036 			block->key_ready = 1;
3037 			rb_node = tree_insert(blocks, block->bytenr,
3038 					      &block->rb_node);
3039 			BUG_ON(rb_node);
3040 		}
3041 		if (counted)
3042 			added = 1;
3043 		else
3044 			path->slots[0] = nritems;
3045 next:
3046 		path->slots[0]++;
3047 
3048 	}
3049 out:
3050 	btrfs_free_path(path);
3051 	return err;
3052 }
3053 
3054 /*
3055  * hepler to find all tree blocks that reference a given data extent
3056  */
3057 static noinline_for_stack
3058 int add_data_references(struct reloc_control *rc,
3059 			struct btrfs_key *extent_key,
3060 			struct btrfs_path *path,
3061 			struct rb_root *blocks)
3062 {
3063 	struct btrfs_key key;
3064 	struct extent_buffer *eb;
3065 	struct btrfs_extent_data_ref *dref;
3066 	struct btrfs_extent_inline_ref *iref;
3067 	unsigned long ptr;
3068 	unsigned long end;
3069 	u32 blocksize;
3070 	int ret;
3071 	int err = 0;
3072 
3073 	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3074 			       extent_key->offset);
3075 	BUG_ON(ret < 0);
3076 	if (ret > 0) {
3077 		/* the relocated data is fragmented */
3078 		rc->extents_skipped++;
3079 		btrfs_release_path(rc->extent_root, path);
3080 		return 0;
3081 	}
3082 
3083 	blocksize = btrfs_level_size(rc->extent_root, 0);
3084 
3085 	eb = path->nodes[0];
3086 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3087 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3088 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3089 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3090 		ptr = end;
3091 	else
3092 #endif
3093 		ptr += sizeof(struct btrfs_extent_item);
3094 
3095 	while (ptr < end) {
3096 		iref = (struct btrfs_extent_inline_ref *)ptr;
3097 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3098 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3099 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3100 			ret = __add_tree_block(rc, key.offset, blocksize,
3101 					       blocks);
3102 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3103 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3104 			ret = find_data_references(rc, extent_key,
3105 						   eb, dref, blocks);
3106 		} else {
3107 			BUG();
3108 		}
3109 		ptr += btrfs_extent_inline_ref_size(key.type);
3110 	}
3111 	WARN_ON(ptr > end);
3112 
3113 	while (1) {
3114 		cond_resched();
3115 		eb = path->nodes[0];
3116 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3117 			ret = btrfs_next_leaf(rc->extent_root, path);
3118 			if (ret < 0) {
3119 				err = ret;
3120 				break;
3121 			}
3122 			if (ret > 0)
3123 				break;
3124 			eb = path->nodes[0];
3125 		}
3126 
3127 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3128 		if (key.objectid != extent_key->objectid)
3129 			break;
3130 
3131 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3132 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3133 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3134 #else
3135 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3136 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3137 #endif
3138 			ret = __add_tree_block(rc, key.offset, blocksize,
3139 					       blocks);
3140 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3141 			dref = btrfs_item_ptr(eb, path->slots[0],
3142 					      struct btrfs_extent_data_ref);
3143 			ret = find_data_references(rc, extent_key,
3144 						   eb, dref, blocks);
3145 		} else {
3146 			ret = 0;
3147 		}
3148 		if (ret) {
3149 			err = ret;
3150 			break;
3151 		}
3152 		path->slots[0]++;
3153 	}
3154 	btrfs_release_path(rc->extent_root, path);
3155 	if (err)
3156 		free_block_list(blocks);
3157 	return err;
3158 }
3159 
3160 /*
3161  * hepler to find next unprocessed extent
3162  */
3163 static noinline_for_stack
3164 int find_next_extent(struct btrfs_trans_handle *trans,
3165 		     struct reloc_control *rc, struct btrfs_path *path)
3166 {
3167 	struct btrfs_key key;
3168 	struct extent_buffer *leaf;
3169 	u64 start, end, last;
3170 	int ret;
3171 
3172 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3173 	while (1) {
3174 		cond_resched();
3175 		if (rc->search_start >= last) {
3176 			ret = 1;
3177 			break;
3178 		}
3179 
3180 		key.objectid = rc->search_start;
3181 		key.type = BTRFS_EXTENT_ITEM_KEY;
3182 		key.offset = 0;
3183 
3184 		path->search_commit_root = 1;
3185 		path->skip_locking = 1;
3186 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3187 					0, 0);
3188 		if (ret < 0)
3189 			break;
3190 next:
3191 		leaf = path->nodes[0];
3192 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3193 			ret = btrfs_next_leaf(rc->extent_root, path);
3194 			if (ret != 0)
3195 				break;
3196 			leaf = path->nodes[0];
3197 		}
3198 
3199 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3200 		if (key.objectid >= last) {
3201 			ret = 1;
3202 			break;
3203 		}
3204 
3205 		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3206 		    key.objectid + key.offset <= rc->search_start) {
3207 			path->slots[0]++;
3208 			goto next;
3209 		}
3210 
3211 		ret = find_first_extent_bit(&rc->processed_blocks,
3212 					    key.objectid, &start, &end,
3213 					    EXTENT_DIRTY);
3214 
3215 		if (ret == 0 && start <= key.objectid) {
3216 			btrfs_release_path(rc->extent_root, path);
3217 			rc->search_start = end + 1;
3218 		} else {
3219 			rc->search_start = key.objectid + key.offset;
3220 			return 0;
3221 		}
3222 	}
3223 	btrfs_release_path(rc->extent_root, path);
3224 	return ret;
3225 }
3226 
3227 static void set_reloc_control(struct reloc_control *rc)
3228 {
3229 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3230 	mutex_lock(&fs_info->trans_mutex);
3231 	fs_info->reloc_ctl = rc;
3232 	mutex_unlock(&fs_info->trans_mutex);
3233 }
3234 
3235 static void unset_reloc_control(struct reloc_control *rc)
3236 {
3237 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3238 	mutex_lock(&fs_info->trans_mutex);
3239 	fs_info->reloc_ctl = NULL;
3240 	mutex_unlock(&fs_info->trans_mutex);
3241 }
3242 
3243 static int check_extent_flags(u64 flags)
3244 {
3245 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3246 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3247 		return 1;
3248 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3249 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3250 		return 1;
3251 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3252 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3253 		return 1;
3254 	return 0;
3255 }
3256 
3257 
3258 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3259 {
3260 	struct rb_root blocks = RB_ROOT;
3261 	struct btrfs_key key;
3262 	struct file_extent_cluster *cluster;
3263 	struct btrfs_trans_handle *trans = NULL;
3264 	struct btrfs_path *path;
3265 	struct btrfs_extent_item *ei;
3266 	unsigned long nr;
3267 	u64 flags;
3268 	u32 item_size;
3269 	int ret;
3270 	int err = 0;
3271 
3272 	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3273 	if (!cluster)
3274 		return -ENOMEM;
3275 
3276 	path = btrfs_alloc_path();
3277 	if (!path)
3278 		return -ENOMEM;
3279 
3280 	rc->extents_found = 0;
3281 	rc->extents_skipped = 0;
3282 
3283 	rc->search_start = rc->block_group->key.objectid;
3284 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3285 			  GFP_NOFS);
3286 
3287 	rc->create_reloc_root = 1;
3288 	set_reloc_control(rc);
3289 
3290 	trans = btrfs_start_transaction(rc->extent_root, 1);
3291 	btrfs_commit_transaction(trans, rc->extent_root);
3292 
3293 	while (1) {
3294 		trans = btrfs_start_transaction(rc->extent_root, 1);
3295 
3296 		ret = find_next_extent(trans, rc, path);
3297 		if (ret < 0)
3298 			err = ret;
3299 		if (ret != 0)
3300 			break;
3301 
3302 		rc->extents_found++;
3303 
3304 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3305 				    struct btrfs_extent_item);
3306 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3307 		item_size = btrfs_item_size_nr(path->nodes[0],
3308 					       path->slots[0]);
3309 		if (item_size >= sizeof(*ei)) {
3310 			flags = btrfs_extent_flags(path->nodes[0], ei);
3311 			ret = check_extent_flags(flags);
3312 			BUG_ON(ret);
3313 
3314 		} else {
3315 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3316 			u64 ref_owner;
3317 			int path_change = 0;
3318 
3319 			BUG_ON(item_size !=
3320 			       sizeof(struct btrfs_extent_item_v0));
3321 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3322 						  &path_change);
3323 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3324 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3325 			else
3326 				flags = BTRFS_EXTENT_FLAG_DATA;
3327 
3328 			if (path_change) {
3329 				btrfs_release_path(rc->extent_root, path);
3330 
3331 				path->search_commit_root = 1;
3332 				path->skip_locking = 1;
3333 				ret = btrfs_search_slot(NULL, rc->extent_root,
3334 							&key, path, 0, 0);
3335 				if (ret < 0) {
3336 					err = ret;
3337 					break;
3338 				}
3339 				BUG_ON(ret > 0);
3340 			}
3341 #else
3342 			BUG();
3343 #endif
3344 		}
3345 
3346 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3347 			ret = add_tree_block(rc, &key, path, &blocks);
3348 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3349 			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3350 			ret = add_data_references(rc, &key, path, &blocks);
3351 		} else {
3352 			btrfs_release_path(rc->extent_root, path);
3353 			ret = 0;
3354 		}
3355 		if (ret < 0) {
3356 			err = 0;
3357 			break;
3358 		}
3359 
3360 		if (!RB_EMPTY_ROOT(&blocks)) {
3361 			ret = relocate_tree_blocks(trans, rc, &blocks);
3362 			if (ret < 0) {
3363 				err = ret;
3364 				break;
3365 			}
3366 		}
3367 
3368 		nr = trans->blocks_used;
3369 		btrfs_end_transaction(trans, rc->extent_root);
3370 		trans = NULL;
3371 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3372 
3373 		if (rc->stage == MOVE_DATA_EXTENTS &&
3374 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3375 			rc->found_file_extent = 1;
3376 			ret = relocate_data_extent(rc->data_inode,
3377 						   &key, cluster);
3378 			if (ret < 0) {
3379 				err = ret;
3380 				break;
3381 			}
3382 		}
3383 	}
3384 	btrfs_free_path(path);
3385 
3386 	if (trans) {
3387 		nr = trans->blocks_used;
3388 		btrfs_end_transaction(trans, rc->extent_root);
3389 		btrfs_btree_balance_dirty(rc->extent_root, nr);
3390 	}
3391 
3392 	if (!err) {
3393 		ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3394 		if (ret < 0)
3395 			err = ret;
3396 	}
3397 
3398 	kfree(cluster);
3399 
3400 	rc->create_reloc_root = 0;
3401 	smp_mb();
3402 
3403 	if (rc->extents_found > 0) {
3404 		trans = btrfs_start_transaction(rc->extent_root, 1);
3405 		btrfs_commit_transaction(trans, rc->extent_root);
3406 	}
3407 
3408 	merge_reloc_roots(rc);
3409 
3410 	unset_reloc_control(rc);
3411 
3412 	/* get rid of pinned extents */
3413 	trans = btrfs_start_transaction(rc->extent_root, 1);
3414 	btrfs_commit_transaction(trans, rc->extent_root);
3415 
3416 	return err;
3417 }
3418 
3419 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3420 				 struct btrfs_root *root, u64 objectid)
3421 {
3422 	struct btrfs_path *path;
3423 	struct btrfs_inode_item *item;
3424 	struct extent_buffer *leaf;
3425 	int ret;
3426 
3427 	path = btrfs_alloc_path();
3428 	if (!path)
3429 		return -ENOMEM;
3430 
3431 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3432 	if (ret)
3433 		goto out;
3434 
3435 	leaf = path->nodes[0];
3436 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3437 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3438 	btrfs_set_inode_generation(leaf, item, 1);
3439 	btrfs_set_inode_size(leaf, item, 0);
3440 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3441 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3442 	btrfs_mark_buffer_dirty(leaf);
3443 	btrfs_release_path(root, path);
3444 out:
3445 	btrfs_free_path(path);
3446 	return ret;
3447 }
3448 
3449 /*
3450  * helper to create inode for data relocation.
3451  * the inode is in data relocation tree and its link count is 0
3452  */
3453 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3454 					struct btrfs_block_group_cache *group)
3455 {
3456 	struct inode *inode = NULL;
3457 	struct btrfs_trans_handle *trans;
3458 	struct btrfs_root *root;
3459 	struct btrfs_key key;
3460 	unsigned long nr;
3461 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3462 	int err = 0;
3463 
3464 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3465 	if (IS_ERR(root))
3466 		return ERR_CAST(root);
3467 
3468 	trans = btrfs_start_transaction(root, 1);
3469 	BUG_ON(!trans);
3470 
3471 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3472 	if (err)
3473 		goto out;
3474 
3475 	err = __insert_orphan_inode(trans, root, objectid);
3476 	BUG_ON(err);
3477 
3478 	key.objectid = objectid;
3479 	key.type = BTRFS_INODE_ITEM_KEY;
3480 	key.offset = 0;
3481 	inode = btrfs_iget(root->fs_info->sb, &key, root);
3482 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3483 	BTRFS_I(inode)->index_cnt = group->key.objectid;
3484 
3485 	err = btrfs_orphan_add(trans, inode);
3486 out:
3487 	nr = trans->blocks_used;
3488 	btrfs_end_transaction(trans, root);
3489 
3490 	btrfs_btree_balance_dirty(root, nr);
3491 	if (err) {
3492 		if (inode)
3493 			iput(inode);
3494 		inode = ERR_PTR(err);
3495 	}
3496 	return inode;
3497 }
3498 
3499 /*
3500  * function to relocate all extents in a block group.
3501  */
3502 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3503 {
3504 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3505 	struct reloc_control *rc;
3506 	int ret;
3507 	int err = 0;
3508 
3509 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3510 	if (!rc)
3511 		return -ENOMEM;
3512 
3513 	mapping_tree_init(&rc->reloc_root_tree);
3514 	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3515 	INIT_LIST_HEAD(&rc->reloc_roots);
3516 
3517 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3518 	BUG_ON(!rc->block_group);
3519 
3520 	btrfs_init_workers(&rc->workers, "relocate",
3521 			   fs_info->thread_pool_size);
3522 
3523 	rc->extent_root = extent_root;
3524 	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3525 
3526 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3527 	if (IS_ERR(rc->data_inode)) {
3528 		err = PTR_ERR(rc->data_inode);
3529 		rc->data_inode = NULL;
3530 		goto out;
3531 	}
3532 
3533 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3534 	       (unsigned long long)rc->block_group->key.objectid,
3535 	       (unsigned long long)rc->block_group->flags);
3536 
3537 	btrfs_start_delalloc_inodes(fs_info->tree_root);
3538 	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3539 
3540 	while (1) {
3541 		rc->extents_found = 0;
3542 		rc->extents_skipped = 0;
3543 
3544 		mutex_lock(&fs_info->cleaner_mutex);
3545 
3546 		btrfs_clean_old_snapshots(fs_info->tree_root);
3547 		ret = relocate_block_group(rc);
3548 
3549 		mutex_unlock(&fs_info->cleaner_mutex);
3550 		if (ret < 0) {
3551 			err = ret;
3552 			break;
3553 		}
3554 
3555 		if (rc->extents_found == 0)
3556 			break;
3557 
3558 		printk(KERN_INFO "btrfs: found %llu extents\n",
3559 			(unsigned long long)rc->extents_found);
3560 
3561 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3562 			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3563 			invalidate_mapping_pages(rc->data_inode->i_mapping,
3564 						 0, -1);
3565 			rc->stage = UPDATE_DATA_PTRS;
3566 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3567 			   rc->extents_skipped >= rc->extents_found) {
3568 			iput(rc->data_inode);
3569 			rc->data_inode = create_reloc_inode(fs_info,
3570 							    rc->block_group);
3571 			if (IS_ERR(rc->data_inode)) {
3572 				err = PTR_ERR(rc->data_inode);
3573 				rc->data_inode = NULL;
3574 				break;
3575 			}
3576 			rc->stage = MOVE_DATA_EXTENTS;
3577 			rc->found_file_extent = 0;
3578 		}
3579 	}
3580 
3581 	filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3582 				     rc->block_group->key.objectid,
3583 				     rc->block_group->key.objectid +
3584 				     rc->block_group->key.offset - 1);
3585 
3586 	WARN_ON(rc->block_group->pinned > 0);
3587 	WARN_ON(rc->block_group->reserved > 0);
3588 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3589 out:
3590 	iput(rc->data_inode);
3591 	btrfs_stop_workers(&rc->workers);
3592 	btrfs_put_block_group(rc->block_group);
3593 	kfree(rc);
3594 	return err;
3595 }
3596 
3597 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3598 {
3599 	struct btrfs_trans_handle *trans;
3600 	int ret;
3601 
3602 	trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3603 
3604 	memset(&root->root_item.drop_progress, 0,
3605 		sizeof(root->root_item.drop_progress));
3606 	root->root_item.drop_level = 0;
3607 	btrfs_set_root_refs(&root->root_item, 0);
3608 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
3609 				&root->root_key, &root->root_item);
3610 	BUG_ON(ret);
3611 
3612 	ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3613 	BUG_ON(ret);
3614 	return 0;
3615 }
3616 
3617 /*
3618  * recover relocation interrupted by system crash.
3619  *
3620  * this function resumes merging reloc trees with corresponding fs trees.
3621  * this is important for keeping the sharing of tree blocks
3622  */
3623 int btrfs_recover_relocation(struct btrfs_root *root)
3624 {
3625 	LIST_HEAD(reloc_roots);
3626 	struct btrfs_key key;
3627 	struct btrfs_root *fs_root;
3628 	struct btrfs_root *reloc_root;
3629 	struct btrfs_path *path;
3630 	struct extent_buffer *leaf;
3631 	struct reloc_control *rc = NULL;
3632 	struct btrfs_trans_handle *trans;
3633 	int ret;
3634 	int err = 0;
3635 
3636 	path = btrfs_alloc_path();
3637 	if (!path)
3638 		return -ENOMEM;
3639 
3640 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3641 	key.type = BTRFS_ROOT_ITEM_KEY;
3642 	key.offset = (u64)-1;
3643 
3644 	while (1) {
3645 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3646 					path, 0, 0);
3647 		if (ret < 0) {
3648 			err = ret;
3649 			goto out;
3650 		}
3651 		if (ret > 0) {
3652 			if (path->slots[0] == 0)
3653 				break;
3654 			path->slots[0]--;
3655 		}
3656 		leaf = path->nodes[0];
3657 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3658 		btrfs_release_path(root->fs_info->tree_root, path);
3659 
3660 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3661 		    key.type != BTRFS_ROOT_ITEM_KEY)
3662 			break;
3663 
3664 		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3665 		if (IS_ERR(reloc_root)) {
3666 			err = PTR_ERR(reloc_root);
3667 			goto out;
3668 		}
3669 
3670 		list_add(&reloc_root->root_list, &reloc_roots);
3671 
3672 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3673 			fs_root = read_fs_root(root->fs_info,
3674 					       reloc_root->root_key.offset);
3675 			if (IS_ERR(fs_root)) {
3676 				ret = PTR_ERR(fs_root);
3677 				if (ret != -ENOENT) {
3678 					err = ret;
3679 					goto out;
3680 				}
3681 				mark_garbage_root(reloc_root);
3682 			}
3683 		}
3684 
3685 		if (key.offset == 0)
3686 			break;
3687 
3688 		key.offset--;
3689 	}
3690 	btrfs_release_path(root->fs_info->tree_root, path);
3691 
3692 	if (list_empty(&reloc_roots))
3693 		goto out;
3694 
3695 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3696 	if (!rc) {
3697 		err = -ENOMEM;
3698 		goto out;
3699 	}
3700 
3701 	mapping_tree_init(&rc->reloc_root_tree);
3702 	INIT_LIST_HEAD(&rc->reloc_roots);
3703 	btrfs_init_workers(&rc->workers, "relocate",
3704 			   root->fs_info->thread_pool_size);
3705 	rc->extent_root = root->fs_info->extent_root;
3706 
3707 	set_reloc_control(rc);
3708 
3709 	while (!list_empty(&reloc_roots)) {
3710 		reloc_root = list_entry(reloc_roots.next,
3711 					struct btrfs_root, root_list);
3712 		list_del(&reloc_root->root_list);
3713 
3714 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3715 			list_add_tail(&reloc_root->root_list,
3716 				      &rc->reloc_roots);
3717 			continue;
3718 		}
3719 
3720 		fs_root = read_fs_root(root->fs_info,
3721 				       reloc_root->root_key.offset);
3722 		BUG_ON(IS_ERR(fs_root));
3723 
3724 		__add_reloc_root(reloc_root);
3725 		fs_root->reloc_root = reloc_root;
3726 	}
3727 
3728 	trans = btrfs_start_transaction(rc->extent_root, 1);
3729 	btrfs_commit_transaction(trans, rc->extent_root);
3730 
3731 	merge_reloc_roots(rc);
3732 
3733 	unset_reloc_control(rc);
3734 
3735 	trans = btrfs_start_transaction(rc->extent_root, 1);
3736 	btrfs_commit_transaction(trans, rc->extent_root);
3737 out:
3738 	if (rc) {
3739 		btrfs_stop_workers(&rc->workers);
3740 		kfree(rc);
3741 	}
3742 	while (!list_empty(&reloc_roots)) {
3743 		reloc_root = list_entry(reloc_roots.next,
3744 					struct btrfs_root, root_list);
3745 		list_del(&reloc_root->root_list);
3746 		free_extent_buffer(reloc_root->node);
3747 		free_extent_buffer(reloc_root->commit_root);
3748 		kfree(reloc_root);
3749 	}
3750 	btrfs_free_path(path);
3751 
3752 	if (err == 0) {
3753 		/* cleanup orphan inode in data relocation tree */
3754 		fs_root = read_fs_root(root->fs_info,
3755 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
3756 		if (IS_ERR(fs_root))
3757 			err = PTR_ERR(fs_root);
3758 	}
3759 	return err;
3760 }
3761 
3762 /*
3763  * helper to add ordered checksum for data relocation.
3764  *
3765  * cloning checksum properly handles the nodatasum extents.
3766  * it also saves CPU time to re-calculate the checksum.
3767  */
3768 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3769 {
3770 	struct btrfs_ordered_sum *sums;
3771 	struct btrfs_sector_sum *sector_sum;
3772 	struct btrfs_ordered_extent *ordered;
3773 	struct btrfs_root *root = BTRFS_I(inode)->root;
3774 	size_t offset;
3775 	int ret;
3776 	u64 disk_bytenr;
3777 	LIST_HEAD(list);
3778 
3779 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3780 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3781 
3782 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3783 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3784 				       disk_bytenr + len - 1, &list);
3785 
3786 	while (!list_empty(&list)) {
3787 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3788 		list_del_init(&sums->list);
3789 
3790 		sector_sum = sums->sums;
3791 		sums->bytenr = ordered->start;
3792 
3793 		offset = 0;
3794 		while (offset < sums->len) {
3795 			sector_sum->bytenr += ordered->start - disk_bytenr;
3796 			sector_sum++;
3797 			offset += root->sectorsize;
3798 		}
3799 
3800 		btrfs_add_ordered_sum(inode, ordered, sums);
3801 	}
3802 	btrfs_put_ordered_extent(ordered);
3803 	return 0;
3804 }
3805