xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision 75eff68ea6959efd999b125fc3dcf73931a3b30b)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/version.h>
23 #include "compat.h"
24 #include "hash.h"
25 #include "crc32c.h"
26 #include "ctree.h"
27 #include "disk-io.h"
28 #include "print-tree.h"
29 #include "transaction.h"
30 #include "volumes.h"
31 #include "locking.h"
32 #include "ref-cache.h"
33 #include "compat.h"
34 
35 #define PENDING_EXTENT_INSERT 0
36 #define PENDING_EXTENT_DELETE 1
37 #define PENDING_BACKREF_UPDATE 2
38 
39 struct pending_extent_op {
40 	int type;
41 	u64 bytenr;
42 	u64 num_bytes;
43 	u64 parent;
44 	u64 orig_parent;
45 	u64 generation;
46 	u64 orig_generation;
47 	int level;
48 	struct list_head list;
49 	int del;
50 };
51 
52 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
53 				 btrfs_root *extent_root, int all);
54 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
55 			       btrfs_root *extent_root, int all);
56 static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 			  struct btrfs_root *root,
58 			  u64 bytenr, u64 num_bytes, int is_data);
59 static int update_block_group(struct btrfs_trans_handle *trans,
60 			      struct btrfs_root *root,
61 			      u64 bytenr, u64 num_bytes, int alloc,
62 			      int mark_free);
63 
64 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65 {
66 	return (cache->flags & bits) == bits;
67 }
68 
69 /*
70  * this adds the block group to the fs_info rb tree for the block group
71  * cache
72  */
73 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 				struct btrfs_block_group_cache *block_group)
75 {
76 	struct rb_node **p;
77 	struct rb_node *parent = NULL;
78 	struct btrfs_block_group_cache *cache;
79 
80 	spin_lock(&info->block_group_cache_lock);
81 	p = &info->block_group_cache_tree.rb_node;
82 
83 	while (*p) {
84 		parent = *p;
85 		cache = rb_entry(parent, struct btrfs_block_group_cache,
86 				 cache_node);
87 		if (block_group->key.objectid < cache->key.objectid) {
88 			p = &(*p)->rb_left;
89 		} else if (block_group->key.objectid > cache->key.objectid) {
90 			p = &(*p)->rb_right;
91 		} else {
92 			spin_unlock(&info->block_group_cache_lock);
93 			return -EEXIST;
94 		}
95 	}
96 
97 	rb_link_node(&block_group->cache_node, parent, p);
98 	rb_insert_color(&block_group->cache_node,
99 			&info->block_group_cache_tree);
100 	spin_unlock(&info->block_group_cache_lock);
101 
102 	return 0;
103 }
104 
105 /*
106  * This will return the block group at or after bytenr if contains is 0, else
107  * it will return the block group that contains the bytenr
108  */
109 static struct btrfs_block_group_cache *
110 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 			      int contains)
112 {
113 	struct btrfs_block_group_cache *cache, *ret = NULL;
114 	struct rb_node *n;
115 	u64 end, start;
116 
117 	spin_lock(&info->block_group_cache_lock);
118 	n = info->block_group_cache_tree.rb_node;
119 
120 	while (n) {
121 		cache = rb_entry(n, struct btrfs_block_group_cache,
122 				 cache_node);
123 		end = cache->key.objectid + cache->key.offset - 1;
124 		start = cache->key.objectid;
125 
126 		if (bytenr < start) {
127 			if (!contains && (!ret || start < ret->key.objectid))
128 				ret = cache;
129 			n = n->rb_left;
130 		} else if (bytenr > start) {
131 			if (contains && bytenr <= end) {
132 				ret = cache;
133 				break;
134 			}
135 			n = n->rb_right;
136 		} else {
137 			ret = cache;
138 			break;
139 		}
140 	}
141 	if (ret)
142 		atomic_inc(&ret->count);
143 	spin_unlock(&info->block_group_cache_lock);
144 
145 	return ret;
146 }
147 
148 /*
149  * this is only called by cache_block_group, since we could have freed extents
150  * we need to check the pinned_extents for any extents that can't be used yet
151  * since their free space will be released as soon as the transaction commits.
152  */
153 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 			      struct btrfs_fs_info *info, u64 start, u64 end)
155 {
156 	u64 extent_start, extent_end, size;
157 	int ret;
158 
159 	mutex_lock(&info->pinned_mutex);
160 	while (start < end) {
161 		ret = find_first_extent_bit(&info->pinned_extents, start,
162 					    &extent_start, &extent_end,
163 					    EXTENT_DIRTY);
164 		if (ret)
165 			break;
166 
167 		if (extent_start == start) {
168 			start = extent_end + 1;
169 		} else if (extent_start > start && extent_start < end) {
170 			size = extent_start - start;
171 			ret = btrfs_add_free_space(block_group, start,
172 						   size);
173 			BUG_ON(ret);
174 			start = extent_end + 1;
175 		} else {
176 			break;
177 		}
178 	}
179 
180 	if (start < end) {
181 		size = end - start;
182 		ret = btrfs_add_free_space(block_group, start, size);
183 		BUG_ON(ret);
184 	}
185 	mutex_unlock(&info->pinned_mutex);
186 
187 	return 0;
188 }
189 
190 static int remove_sb_from_cache(struct btrfs_root *root,
191 				struct btrfs_block_group_cache *cache)
192 {
193 	u64 bytenr;
194 	u64 *logical;
195 	int stripe_len;
196 	int i, nr, ret;
197 
198 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 		bytenr = btrfs_sb_offset(i);
200 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 				       cache->key.objectid, bytenr, 0,
202 				       &logical, &nr, &stripe_len);
203 		BUG_ON(ret);
204 		while (nr--) {
205 			btrfs_remove_free_space(cache, logical[nr],
206 						stripe_len);
207 		}
208 		kfree(logical);
209 	}
210 	return 0;
211 }
212 
213 static int cache_block_group(struct btrfs_root *root,
214 			     struct btrfs_block_group_cache *block_group)
215 {
216 	struct btrfs_path *path;
217 	int ret = 0;
218 	struct btrfs_key key;
219 	struct extent_buffer *leaf;
220 	int slot;
221 	u64 last;
222 
223 	if (!block_group)
224 		return 0;
225 
226 	root = root->fs_info->extent_root;
227 
228 	if (block_group->cached)
229 		return 0;
230 
231 	path = btrfs_alloc_path();
232 	if (!path)
233 		return -ENOMEM;
234 
235 	path->reada = 2;
236 	/*
237 	 * we get into deadlocks with paths held by callers of this function.
238 	 * since the alloc_mutex is protecting things right now, just
239 	 * skip the locking here
240 	 */
241 	path->skip_locking = 1;
242 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 	key.objectid = last;
244 	key.offset = 0;
245 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 	if (ret < 0)
248 		goto err;
249 
250 	while(1) {
251 		leaf = path->nodes[0];
252 		slot = path->slots[0];
253 		if (slot >= btrfs_header_nritems(leaf)) {
254 			ret = btrfs_next_leaf(root, path);
255 			if (ret < 0)
256 				goto err;
257 			if (ret == 0)
258 				continue;
259 			else
260 				break;
261 		}
262 		btrfs_item_key_to_cpu(leaf, &key, slot);
263 		if (key.objectid < block_group->key.objectid)
264 			goto next;
265 
266 		if (key.objectid >= block_group->key.objectid +
267 		    block_group->key.offset)
268 			break;
269 
270 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 			add_new_free_space(block_group, root->fs_info, last,
272 					   key.objectid);
273 
274 			last = key.objectid + key.offset;
275 		}
276 next:
277 		path->slots[0]++;
278 	}
279 
280 	add_new_free_space(block_group, root->fs_info, last,
281 			   block_group->key.objectid +
282 			   block_group->key.offset);
283 
284 	remove_sb_from_cache(root, block_group);
285 	block_group->cached = 1;
286 	ret = 0;
287 err:
288 	btrfs_free_path(path);
289 	return ret;
290 }
291 
292 /*
293  * return the block group that starts at or after bytenr
294  */
295 static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
296 						       btrfs_fs_info *info,
297 							 u64 bytenr)
298 {
299 	struct btrfs_block_group_cache *cache;
300 
301 	cache = block_group_cache_tree_search(info, bytenr, 0);
302 
303 	return cache;
304 }
305 
306 /*
307  * return the block group that contains teh given bytenr
308  */
309 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
310 							 btrfs_fs_info *info,
311 							 u64 bytenr)
312 {
313 	struct btrfs_block_group_cache *cache;
314 
315 	cache = block_group_cache_tree_search(info, bytenr, 1);
316 
317 	return cache;
318 }
319 
320 static inline void put_block_group(struct btrfs_block_group_cache *cache)
321 {
322 	if (atomic_dec_and_test(&cache->count))
323 		kfree(cache);
324 }
325 
326 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
327 						  u64 flags)
328 {
329 	struct list_head *head = &info->space_info;
330 	struct list_head *cur;
331 	struct btrfs_space_info *found;
332 	list_for_each(cur, head) {
333 		found = list_entry(cur, struct btrfs_space_info, list);
334 		if (found->flags == flags)
335 			return found;
336 	}
337 	return NULL;
338 }
339 
340 static u64 div_factor(u64 num, int factor)
341 {
342 	if (factor == 10)
343 		return num;
344 	num *= factor;
345 	do_div(num, 10);
346 	return num;
347 }
348 
349 u64 btrfs_find_block_group(struct btrfs_root *root,
350 			   u64 search_start, u64 search_hint, int owner)
351 {
352 	struct btrfs_block_group_cache *cache;
353 	u64 used;
354 	u64 last = max(search_hint, search_start);
355 	u64 group_start = 0;
356 	int full_search = 0;
357 	int factor = 9;
358 	int wrapped = 0;
359 again:
360 	while (1) {
361 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
362 		if (!cache)
363 			break;
364 
365 		spin_lock(&cache->lock);
366 		last = cache->key.objectid + cache->key.offset;
367 		used = btrfs_block_group_used(&cache->item);
368 
369 		if ((full_search || !cache->ro) &&
370 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
371 			if (used + cache->pinned + cache->reserved <
372 			    div_factor(cache->key.offset, factor)) {
373 				group_start = cache->key.objectid;
374 				spin_unlock(&cache->lock);
375 				put_block_group(cache);
376 				goto found;
377 			}
378 		}
379 		spin_unlock(&cache->lock);
380 		put_block_group(cache);
381 		cond_resched();
382 	}
383 	if (!wrapped) {
384 		last = search_start;
385 		wrapped = 1;
386 		goto again;
387 	}
388 	if (!full_search && factor < 10) {
389 		last = search_start;
390 		full_search = 1;
391 		factor = 10;
392 		goto again;
393 	}
394 found:
395 	return group_start;
396 }
397 
398 /* simple helper to search for an existing extent at a given offset */
399 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
400 {
401 	int ret;
402 	struct btrfs_key key;
403 	struct btrfs_path *path;
404 
405 	path = btrfs_alloc_path();
406 	BUG_ON(!path);
407 	key.objectid = start;
408 	key.offset = len;
409 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
410 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
411 				0, 0);
412 	btrfs_free_path(path);
413 	return ret;
414 }
415 
416 /*
417  * Back reference rules.  Back refs have three main goals:
418  *
419  * 1) differentiate between all holders of references to an extent so that
420  *    when a reference is dropped we can make sure it was a valid reference
421  *    before freeing the extent.
422  *
423  * 2) Provide enough information to quickly find the holders of an extent
424  *    if we notice a given block is corrupted or bad.
425  *
426  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
427  *    maintenance.  This is actually the same as #2, but with a slightly
428  *    different use case.
429  *
430  * File extents can be referenced by:
431  *
432  * - multiple snapshots, subvolumes, or different generations in one subvol
433  * - different files inside a single subvolume
434  * - different offsets inside a file (bookend extents in file.c)
435  *
436  * The extent ref structure has fields for:
437  *
438  * - Objectid of the subvolume root
439  * - Generation number of the tree holding the reference
440  * - objectid of the file holding the reference
441  * - number of references holding by parent node (alway 1 for tree blocks)
442  *
443  * Btree leaf may hold multiple references to a file extent. In most cases,
444  * these references are from same file and the corresponding offsets inside
445  * the file are close together.
446  *
447  * When a file extent is allocated the fields are filled in:
448  *     (root_key.objectid, trans->transid, inode objectid, 1)
449  *
450  * When a leaf is cow'd new references are added for every file extent found
451  * in the leaf.  It looks similar to the create case, but trans->transid will
452  * be different when the block is cow'd.
453  *
454  *     (root_key.objectid, trans->transid, inode objectid,
455  *      number of references in the leaf)
456  *
457  * When a file extent is removed either during snapshot deletion or
458  * file truncation, we find the corresponding back reference and check
459  * the following fields:
460  *
461  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
462  *      inode objectid)
463  *
464  * Btree extents can be referenced by:
465  *
466  * - Different subvolumes
467  * - Different generations of the same subvolume
468  *
469  * When a tree block is created, back references are inserted:
470  *
471  * (root->root_key.objectid, trans->transid, level, 1)
472  *
473  * When a tree block is cow'd, new back references are added for all the
474  * blocks it points to. If the tree block isn't in reference counted root,
475  * the old back references are removed. These new back references are of
476  * the form (trans->transid will have increased since creation):
477  *
478  * (root->root_key.objectid, trans->transid, level, 1)
479  *
480  * When a backref is in deleting, the following fields are checked:
481  *
482  * if backref was for a tree root:
483  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
484  * else
485  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
486  *
487  * Back Reference Key composing:
488  *
489  * The key objectid corresponds to the first byte in the extent, the key
490  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
491  * byte of parent extent. If a extent is tree root, the key offset is set
492  * to the key objectid.
493  */
494 
495 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
496 					  struct btrfs_root *root,
497 					  struct btrfs_path *path,
498 					  u64 bytenr, u64 parent,
499 					  u64 ref_root, u64 ref_generation,
500 					  u64 owner_objectid, int del)
501 {
502 	struct btrfs_key key;
503 	struct btrfs_extent_ref *ref;
504 	struct extent_buffer *leaf;
505 	u64 ref_objectid;
506 	int ret;
507 
508 	key.objectid = bytenr;
509 	key.type = BTRFS_EXTENT_REF_KEY;
510 	key.offset = parent;
511 
512 	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
513 	if (ret < 0)
514 		goto out;
515 	if (ret > 0) {
516 		ret = -ENOENT;
517 		goto out;
518 	}
519 
520 	leaf = path->nodes[0];
521 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
522 	ref_objectid = btrfs_ref_objectid(leaf, ref);
523 	if (btrfs_ref_root(leaf, ref) != ref_root ||
524 	    btrfs_ref_generation(leaf, ref) != ref_generation ||
525 	    (ref_objectid != owner_objectid &&
526 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
527 		ret = -EIO;
528 		WARN_ON(1);
529 		goto out;
530 	}
531 	ret = 0;
532 out:
533 	return ret;
534 }
535 
536 /*
537  * updates all the backrefs that are pending on update_list for the
538  * extent_root
539  */
540 static int noinline update_backrefs(struct btrfs_trans_handle *trans,
541 				    struct btrfs_root *extent_root,
542 				    struct btrfs_path *path,
543 				    struct list_head *update_list)
544 {
545 	struct btrfs_key key;
546 	struct btrfs_extent_ref *ref;
547 	struct btrfs_fs_info *info = extent_root->fs_info;
548 	struct pending_extent_op *op;
549 	struct extent_buffer *leaf;
550 	int ret = 0;
551 	struct list_head *cur = update_list->next;
552 	u64 ref_objectid;
553 	u64 ref_root = extent_root->root_key.objectid;
554 
555 	op = list_entry(cur, struct pending_extent_op, list);
556 
557 search:
558 	key.objectid = op->bytenr;
559 	key.type = BTRFS_EXTENT_REF_KEY;
560 	key.offset = op->orig_parent;
561 
562 	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
563 	BUG_ON(ret);
564 
565 	leaf = path->nodes[0];
566 
567 loop:
568 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
569 
570 	ref_objectid = btrfs_ref_objectid(leaf, ref);
571 
572 	if (btrfs_ref_root(leaf, ref) != ref_root ||
573 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
574 	    (ref_objectid != op->level &&
575 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
576 		printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
577 		       "owner %u\n", op->bytenr, op->orig_parent,
578 		       ref_root, op->level);
579 		btrfs_print_leaf(extent_root, leaf);
580 		BUG();
581 	}
582 
583 	key.objectid = op->bytenr;
584 	key.offset = op->parent;
585 	key.type = BTRFS_EXTENT_REF_KEY;
586 	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
587 	BUG_ON(ret);
588 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
589 	btrfs_set_ref_generation(leaf, ref, op->generation);
590 
591 	cur = cur->next;
592 
593 	list_del_init(&op->list);
594 	unlock_extent(&info->extent_ins, op->bytenr,
595 		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
596 	kfree(op);
597 
598 	if (cur == update_list) {
599 		btrfs_mark_buffer_dirty(path->nodes[0]);
600 		btrfs_release_path(extent_root, path);
601 		goto out;
602 	}
603 
604 	op = list_entry(cur, struct pending_extent_op, list);
605 
606 	path->slots[0]++;
607 	while (path->slots[0] < btrfs_header_nritems(leaf)) {
608 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
609 		if (key.objectid == op->bytenr &&
610 		    key.type == BTRFS_EXTENT_REF_KEY)
611 			goto loop;
612 		path->slots[0]++;
613 	}
614 
615 	btrfs_mark_buffer_dirty(path->nodes[0]);
616 	btrfs_release_path(extent_root, path);
617 	goto search;
618 
619 out:
620 	return 0;
621 }
622 
623 static int noinline insert_extents(struct btrfs_trans_handle *trans,
624 				   struct btrfs_root *extent_root,
625 				   struct btrfs_path *path,
626 				   struct list_head *insert_list, int nr)
627 {
628 	struct btrfs_key *keys;
629 	u32 *data_size;
630 	struct pending_extent_op *op;
631 	struct extent_buffer *leaf;
632 	struct list_head *cur = insert_list->next;
633 	struct btrfs_fs_info *info = extent_root->fs_info;
634 	u64 ref_root = extent_root->root_key.objectid;
635 	int i = 0, last = 0, ret;
636 	int total = nr * 2;
637 
638 	if (!nr)
639 		return 0;
640 
641 	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
642 	if (!keys)
643 		return -ENOMEM;
644 
645 	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
646 	if (!data_size) {
647 		kfree(keys);
648 		return -ENOMEM;
649 	}
650 
651 	list_for_each_entry(op, insert_list, list) {
652 		keys[i].objectid = op->bytenr;
653 		keys[i].offset = op->num_bytes;
654 		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
655 		data_size[i] = sizeof(struct btrfs_extent_item);
656 		i++;
657 
658 		keys[i].objectid = op->bytenr;
659 		keys[i].offset = op->parent;
660 		keys[i].type = BTRFS_EXTENT_REF_KEY;
661 		data_size[i] = sizeof(struct btrfs_extent_ref);
662 		i++;
663 	}
664 
665 	op = list_entry(cur, struct pending_extent_op, list);
666 	i = 0;
667 	while (i < total) {
668 		int c;
669 		ret = btrfs_insert_some_items(trans, extent_root, path,
670 					      keys+i, data_size+i, total-i);
671 		BUG_ON(ret < 0);
672 
673 		if (last && ret > 1)
674 			BUG();
675 
676 		leaf = path->nodes[0];
677 		for (c = 0; c < ret; c++) {
678 			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
679 
680 			/*
681 			 * if the first item we inserted was a backref, then
682 			 * the EXTENT_ITEM will be the odd c's, else it will
683 			 * be the even c's
684 			 */
685 			if ((ref_first && (c % 2)) ||
686 			    (!ref_first && !(c % 2))) {
687 				struct btrfs_extent_item *itm;
688 
689 				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
690 						     struct btrfs_extent_item);
691 				btrfs_set_extent_refs(path->nodes[0], itm, 1);
692 				op->del++;
693 			} else {
694 				struct btrfs_extent_ref *ref;
695 
696 				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
697 						     struct btrfs_extent_ref);
698 				btrfs_set_ref_root(leaf, ref, ref_root);
699 				btrfs_set_ref_generation(leaf, ref,
700 							 op->generation);
701 				btrfs_set_ref_objectid(leaf, ref, op->level);
702 				btrfs_set_ref_num_refs(leaf, ref, 1);
703 				op->del++;
704 			}
705 
706 			/*
707 			 * using del to see when its ok to free up the
708 			 * pending_extent_op.  In the case where we insert the
709 			 * last item on the list in order to help do batching
710 			 * we need to not free the extent op until we actually
711 			 * insert the extent_item
712 			 */
713 			if (op->del == 2) {
714 				unlock_extent(&info->extent_ins, op->bytenr,
715 					      op->bytenr + op->num_bytes - 1,
716 					      GFP_NOFS);
717 				cur = cur->next;
718 				list_del_init(&op->list);
719 				kfree(op);
720 				if (cur != insert_list)
721 					op = list_entry(cur,
722 						struct pending_extent_op,
723 						list);
724 			}
725 		}
726 		btrfs_mark_buffer_dirty(leaf);
727 		btrfs_release_path(extent_root, path);
728 
729 		/*
730 		 * Ok backref's and items usually go right next to eachother,
731 		 * but if we could only insert 1 item that means that we
732 		 * inserted on the end of a leaf, and we have no idea what may
733 		 * be on the next leaf so we just play it safe.  In order to
734 		 * try and help this case we insert the last thing on our
735 		 * insert list so hopefully it will end up being the last
736 		 * thing on the leaf and everything else will be before it,
737 		 * which will let us insert a whole bunch of items at the same
738 		 * time.
739 		 */
740 		if (ret == 1 && !last && (i + ret < total)) {
741 			/*
742 			 * last: where we will pick up the next time around
743 			 * i: our current key to insert, will be total - 1
744 			 * cur: the current op we are screwing with
745 			 * op: duh
746 			 */
747 			last = i + ret;
748 			i = total - 1;
749 			cur = insert_list->prev;
750 			op = list_entry(cur, struct pending_extent_op, list);
751 		} else if (last) {
752 			/*
753 			 * ok we successfully inserted the last item on the
754 			 * list, lets reset everything
755 			 *
756 			 * i: our current key to insert, so where we left off
757 			 *    last time
758 			 * last: done with this
759 			 * cur: the op we are messing with
760 			 * op: duh
761 			 * total: since we inserted the last key, we need to
762 			 *        decrement total so we dont overflow
763 			 */
764 			i = last;
765 			last = 0;
766 			total--;
767 			if (i < total) {
768 				cur = insert_list->next;
769 				op = list_entry(cur, struct pending_extent_op,
770 						list);
771 			}
772 		} else {
773 			i += ret;
774 		}
775 
776 		cond_resched();
777 	}
778 	ret = 0;
779 	kfree(keys);
780 	kfree(data_size);
781 	return ret;
782 }
783 
784 static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
785 					  struct btrfs_root *root,
786 					  struct btrfs_path *path,
787 					  u64 bytenr, u64 parent,
788 					  u64 ref_root, u64 ref_generation,
789 					  u64 owner_objectid)
790 {
791 	struct btrfs_key key;
792 	struct extent_buffer *leaf;
793 	struct btrfs_extent_ref *ref;
794 	u32 num_refs;
795 	int ret;
796 
797 	key.objectid = bytenr;
798 	key.type = BTRFS_EXTENT_REF_KEY;
799 	key.offset = parent;
800 
801 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
802 	if (ret == 0) {
803 		leaf = path->nodes[0];
804 		ref = btrfs_item_ptr(leaf, path->slots[0],
805 				     struct btrfs_extent_ref);
806 		btrfs_set_ref_root(leaf, ref, ref_root);
807 		btrfs_set_ref_generation(leaf, ref, ref_generation);
808 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
809 		btrfs_set_ref_num_refs(leaf, ref, 1);
810 	} else if (ret == -EEXIST) {
811 		u64 existing_owner;
812 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
813 		leaf = path->nodes[0];
814 		ref = btrfs_item_ptr(leaf, path->slots[0],
815 				     struct btrfs_extent_ref);
816 		if (btrfs_ref_root(leaf, ref) != ref_root ||
817 		    btrfs_ref_generation(leaf, ref) != ref_generation) {
818 			ret = -EIO;
819 			WARN_ON(1);
820 			goto out;
821 		}
822 
823 		num_refs = btrfs_ref_num_refs(leaf, ref);
824 		BUG_ON(num_refs == 0);
825 		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
826 
827 		existing_owner = btrfs_ref_objectid(leaf, ref);
828 		if (existing_owner != owner_objectid &&
829 		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
830 			btrfs_set_ref_objectid(leaf, ref,
831 					BTRFS_MULTIPLE_OBJECTIDS);
832 		}
833 		ret = 0;
834 	} else {
835 		goto out;
836 	}
837 	btrfs_mark_buffer_dirty(path->nodes[0]);
838 out:
839 	btrfs_release_path(root, path);
840 	return ret;
841 }
842 
843 static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
844 					  struct btrfs_root *root,
845 					  struct btrfs_path *path)
846 {
847 	struct extent_buffer *leaf;
848 	struct btrfs_extent_ref *ref;
849 	u32 num_refs;
850 	int ret = 0;
851 
852 	leaf = path->nodes[0];
853 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
854 	num_refs = btrfs_ref_num_refs(leaf, ref);
855 	BUG_ON(num_refs == 0);
856 	num_refs -= 1;
857 	if (num_refs == 0) {
858 		ret = btrfs_del_item(trans, root, path);
859 	} else {
860 		btrfs_set_ref_num_refs(leaf, ref, num_refs);
861 		btrfs_mark_buffer_dirty(leaf);
862 	}
863 	btrfs_release_path(root, path);
864 	return ret;
865 }
866 
867 #ifdef BIO_RW_DISCARD
868 static void btrfs_issue_discard(struct block_device *bdev,
869 				u64 start, u64 len)
870 {
871 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
872 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
873 #else
874 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
875 #endif
876 }
877 #endif
878 
879 static int noinline free_extents(struct btrfs_trans_handle *trans,
880 				 struct btrfs_root *extent_root,
881 				 struct list_head *del_list)
882 {
883 	struct btrfs_fs_info *info = extent_root->fs_info;
884 	struct btrfs_path *path;
885 	struct btrfs_key key, found_key;
886 	struct extent_buffer *leaf;
887 	struct list_head *cur;
888 	struct pending_extent_op *op;
889 	struct btrfs_extent_item *ei;
890 	int ret, num_to_del, extent_slot = 0, found_extent = 0;
891 	u32 refs;
892 	u64 bytes_freed = 0;
893 
894 	path = btrfs_alloc_path();
895 	if (!path)
896 		return -ENOMEM;
897 	path->reada = 1;
898 
899 search:
900 	/* search for the backref for the current ref we want to delete */
901 	cur = del_list->next;
902 	op = list_entry(cur, struct pending_extent_op, list);
903 	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
904 				    op->orig_parent,
905 				    extent_root->root_key.objectid,
906 				    op->orig_generation, op->level, 1);
907 	if (ret) {
908 		printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
909 		       "owner %u\n", op->bytenr,
910 		       extent_root->root_key.objectid, op->orig_generation,
911 		       op->level);
912 		btrfs_print_leaf(extent_root, path->nodes[0]);
913 		WARN_ON(1);
914 		goto out;
915 	}
916 
917 	extent_slot = path->slots[0];
918 	num_to_del = 1;
919 	found_extent = 0;
920 
921 	/*
922 	 * if we aren't the first item on the leaf we can move back one and see
923 	 * if our ref is right next to our extent item
924 	 */
925 	if (likely(extent_slot)) {
926 		extent_slot--;
927 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
928 				      extent_slot);
929 		if (found_key.objectid == op->bytenr &&
930 		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
931 		    found_key.offset == op->num_bytes) {
932 			num_to_del++;
933 			found_extent = 1;
934 		}
935 	}
936 
937 	/*
938 	 * if we didn't find the extent we need to delete the backref and then
939 	 * search for the extent item key so we can update its ref count
940 	 */
941 	if (!found_extent) {
942 		key.objectid = op->bytenr;
943 		key.type = BTRFS_EXTENT_ITEM_KEY;
944 		key.offset = op->num_bytes;
945 
946 		ret = remove_extent_backref(trans, extent_root, path);
947 		BUG_ON(ret);
948 		btrfs_release_path(extent_root, path);
949 		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
950 		BUG_ON(ret);
951 		extent_slot = path->slots[0];
952 	}
953 
954 	/* this is where we update the ref count for the extent */
955 	leaf = path->nodes[0];
956 	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
957 	refs = btrfs_extent_refs(leaf, ei);
958 	BUG_ON(refs == 0);
959 	refs--;
960 	btrfs_set_extent_refs(leaf, ei, refs);
961 
962 	btrfs_mark_buffer_dirty(leaf);
963 
964 	/*
965 	 * This extent needs deleting.  The reason cur_slot is extent_slot +
966 	 * num_to_del is because extent_slot points to the slot where the extent
967 	 * is, and if the backref was not right next to the extent we will be
968 	 * deleting at least 1 item, and will want to start searching at the
969 	 * slot directly next to extent_slot.  However if we did find the
970 	 * backref next to the extent item them we will be deleting at least 2
971 	 * items and will want to start searching directly after the ref slot
972 	 */
973 	if (!refs) {
974 		struct list_head *pos, *n, *end;
975 		int cur_slot = extent_slot+num_to_del;
976 		u64 super_used;
977 		u64 root_used;
978 
979 		path->slots[0] = extent_slot;
980 		bytes_freed = op->num_bytes;
981 
982 		mutex_lock(&info->pinned_mutex);
983 		ret = pin_down_bytes(trans, extent_root, op->bytenr,
984 				     op->num_bytes, op->level >=
985 				     BTRFS_FIRST_FREE_OBJECTID);
986 		mutex_unlock(&info->pinned_mutex);
987 		BUG_ON(ret < 0);
988 		op->del = ret;
989 
990 		/*
991 		 * we need to see if we can delete multiple things at once, so
992 		 * start looping through the list of extents we are wanting to
993 		 * delete and see if their extent/backref's are right next to
994 		 * eachother and the extents only have 1 ref
995 		 */
996 		for (pos = cur->next; pos != del_list; pos = pos->next) {
997 			struct pending_extent_op *tmp;
998 
999 			tmp = list_entry(pos, struct pending_extent_op, list);
1000 
1001 			/* we only want to delete extent+ref at this stage */
1002 			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1003 				break;
1004 
1005 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1006 			if (found_key.objectid != tmp->bytenr ||
1007 			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1008 			    found_key.offset != tmp->num_bytes)
1009 				break;
1010 
1011 			/* check to make sure this extent only has one ref */
1012 			ei = btrfs_item_ptr(leaf, cur_slot,
1013 					    struct btrfs_extent_item);
1014 			if (btrfs_extent_refs(leaf, ei) != 1)
1015 				break;
1016 
1017 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1018 			if (found_key.objectid != tmp->bytenr ||
1019 			    found_key.type != BTRFS_EXTENT_REF_KEY ||
1020 			    found_key.offset != tmp->orig_parent)
1021 				break;
1022 
1023 			/*
1024 			 * the ref is right next to the extent, we can set the
1025 			 * ref count to 0 since we will delete them both now
1026 			 */
1027 			btrfs_set_extent_refs(leaf, ei, 0);
1028 
1029 			/* pin down the bytes for this extent */
1030 			mutex_lock(&info->pinned_mutex);
1031 			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1032 					     tmp->num_bytes, tmp->level >=
1033 					     BTRFS_FIRST_FREE_OBJECTID);
1034 			mutex_unlock(&info->pinned_mutex);
1035 			BUG_ON(ret < 0);
1036 
1037 			/*
1038 			 * use the del field to tell if we need to go ahead and
1039 			 * free up the extent when we delete the item or not.
1040 			 */
1041 			tmp->del = ret;
1042 			bytes_freed += tmp->num_bytes;
1043 
1044 			num_to_del += 2;
1045 			cur_slot += 2;
1046 		}
1047 		end = pos;
1048 
1049 		/* update the free space counters */
1050 		spin_lock(&info->delalloc_lock);
1051 		super_used = btrfs_super_bytes_used(&info->super_copy);
1052 		btrfs_set_super_bytes_used(&info->super_copy,
1053 					   super_used - bytes_freed);
1054 		spin_unlock(&info->delalloc_lock);
1055 
1056 		root_used = btrfs_root_used(&extent_root->root_item);
1057 		btrfs_set_root_used(&extent_root->root_item,
1058 				    root_used - bytes_freed);
1059 
1060 		/* delete the items */
1061 		ret = btrfs_del_items(trans, extent_root, path,
1062 				      path->slots[0], num_to_del);
1063 		BUG_ON(ret);
1064 
1065 		/*
1066 		 * loop through the extents we deleted and do the cleanup work
1067 		 * on them
1068 		 */
1069 		for (pos = cur, n = pos->next; pos != end;
1070 		     pos = n, n = pos->next) {
1071 			struct pending_extent_op *tmp;
1072 #ifdef BIO_RW_DISCARD
1073 			u64 map_length;
1074 			struct btrfs_multi_bio *multi = NULL;
1075 #endif
1076 			tmp = list_entry(pos, struct pending_extent_op, list);
1077 
1078 			/*
1079 			 * remember tmp->del tells us wether or not we pinned
1080 			 * down the extent
1081 			 */
1082 			ret = update_block_group(trans, extent_root,
1083 						 tmp->bytenr, tmp->num_bytes, 0,
1084 						 tmp->del);
1085 			BUG_ON(ret);
1086 
1087 #ifdef BIO_RW_DISCARD
1088 			map_length = tmp->num_bytes;
1089 			ret = btrfs_map_block(&info->mapping_tree, READ,
1090 					      tmp->bytenr, &map_length, &multi,
1091 					      0);
1092 			if (!ret) {
1093 				struct btrfs_bio_stripe *stripe;
1094 				int i;
1095 
1096 				stripe = multi->stripes;
1097 
1098 				if (map_length > tmp->num_bytes)
1099 					map_length = tmp->num_bytes;
1100 
1101 				for (i = 0; i < multi->num_stripes;
1102 				     i++, stripe++)
1103 					btrfs_issue_discard(stripe->dev->bdev,
1104 							    stripe->physical,
1105 							    map_length);
1106 				kfree(multi);
1107 			}
1108 #endif
1109 			list_del_init(&tmp->list);
1110 			unlock_extent(&info->extent_ins, tmp->bytenr,
1111 				      tmp->bytenr + tmp->num_bytes - 1,
1112 				      GFP_NOFS);
1113 			kfree(tmp);
1114 		}
1115 	} else if (refs && found_extent) {
1116 		/*
1117 		 * the ref and extent were right next to eachother, but the
1118 		 * extent still has a ref, so just free the backref and keep
1119 		 * going
1120 		 */
1121 		ret = remove_extent_backref(trans, extent_root, path);
1122 		BUG_ON(ret);
1123 
1124 		list_del_init(&op->list);
1125 		unlock_extent(&info->extent_ins, op->bytenr,
1126 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1127 		kfree(op);
1128 	} else {
1129 		/*
1130 		 * the extent has multiple refs and the backref we were looking
1131 		 * for was not right next to it, so just unlock and go next,
1132 		 * we're good to go
1133 		 */
1134 		list_del_init(&op->list);
1135 		unlock_extent(&info->extent_ins, op->bytenr,
1136 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1137 		kfree(op);
1138 	}
1139 
1140 	btrfs_release_path(extent_root, path);
1141 	if (!list_empty(del_list))
1142 		goto search;
1143 
1144 out:
1145 	btrfs_free_path(path);
1146 	return ret;
1147 }
1148 
1149 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1150 				     struct btrfs_root *root, u64 bytenr,
1151 				     u64 orig_parent, u64 parent,
1152 				     u64 orig_root, u64 ref_root,
1153 				     u64 orig_generation, u64 ref_generation,
1154 				     u64 owner_objectid)
1155 {
1156 	int ret;
1157 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1158 	struct btrfs_path *path;
1159 
1160 	if (root == root->fs_info->extent_root) {
1161 		struct pending_extent_op *extent_op;
1162 		u64 num_bytes;
1163 
1164 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1165 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
1166 		mutex_lock(&root->fs_info->extent_ins_mutex);
1167 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1168 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1169 			u64 priv;
1170 			ret = get_state_private(&root->fs_info->extent_ins,
1171 						bytenr, &priv);
1172 			BUG_ON(ret);
1173 			extent_op = (struct pending_extent_op *)
1174 							(unsigned long)priv;
1175 			BUG_ON(extent_op->parent != orig_parent);
1176 			BUG_ON(extent_op->generation != orig_generation);
1177 
1178 			extent_op->parent = parent;
1179 			extent_op->generation = ref_generation;
1180 		} else {
1181 			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1182 			BUG_ON(!extent_op);
1183 
1184 			extent_op->type = PENDING_BACKREF_UPDATE;
1185 			extent_op->bytenr = bytenr;
1186 			extent_op->num_bytes = num_bytes;
1187 			extent_op->parent = parent;
1188 			extent_op->orig_parent = orig_parent;
1189 			extent_op->generation = ref_generation;
1190 			extent_op->orig_generation = orig_generation;
1191 			extent_op->level = (int)owner_objectid;
1192 			INIT_LIST_HEAD(&extent_op->list);
1193 			extent_op->del = 0;
1194 
1195 			set_extent_bits(&root->fs_info->extent_ins,
1196 					bytenr, bytenr + num_bytes - 1,
1197 					EXTENT_WRITEBACK, GFP_NOFS);
1198 			set_state_private(&root->fs_info->extent_ins,
1199 					  bytenr, (unsigned long)extent_op);
1200 		}
1201 		mutex_unlock(&root->fs_info->extent_ins_mutex);
1202 		return 0;
1203 	}
1204 
1205 	path = btrfs_alloc_path();
1206 	if (!path)
1207 		return -ENOMEM;
1208 	ret = lookup_extent_backref(trans, extent_root, path,
1209 				    bytenr, orig_parent, orig_root,
1210 				    orig_generation, owner_objectid, 1);
1211 	if (ret)
1212 		goto out;
1213 	ret = remove_extent_backref(trans, extent_root, path);
1214 	if (ret)
1215 		goto out;
1216 	ret = insert_extent_backref(trans, extent_root, path, bytenr,
1217 				    parent, ref_root, ref_generation,
1218 				    owner_objectid);
1219 	BUG_ON(ret);
1220 	finish_current_insert(trans, extent_root, 0);
1221 	del_pending_extents(trans, extent_root, 0);
1222 out:
1223 	btrfs_free_path(path);
1224 	return ret;
1225 }
1226 
1227 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1228 			    struct btrfs_root *root, u64 bytenr,
1229 			    u64 orig_parent, u64 parent,
1230 			    u64 ref_root, u64 ref_generation,
1231 			    u64 owner_objectid)
1232 {
1233 	int ret;
1234 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1235 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1236 		return 0;
1237 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1238 					parent, ref_root, ref_root,
1239 					ref_generation, ref_generation,
1240 					owner_objectid);
1241 	return ret;
1242 }
1243 
1244 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1245 				  struct btrfs_root *root, u64 bytenr,
1246 				  u64 orig_parent, u64 parent,
1247 				  u64 orig_root, u64 ref_root,
1248 				  u64 orig_generation, u64 ref_generation,
1249 				  u64 owner_objectid)
1250 {
1251 	struct btrfs_path *path;
1252 	int ret;
1253 	struct btrfs_key key;
1254 	struct extent_buffer *l;
1255 	struct btrfs_extent_item *item;
1256 	u32 refs;
1257 
1258 	path = btrfs_alloc_path();
1259 	if (!path)
1260 		return -ENOMEM;
1261 
1262 	path->reada = 1;
1263 	key.objectid = bytenr;
1264 	key.type = BTRFS_EXTENT_ITEM_KEY;
1265 	key.offset = (u64)-1;
1266 
1267 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1268 				0, 1);
1269 	if (ret < 0)
1270 		return ret;
1271 	BUG_ON(ret == 0 || path->slots[0] == 0);
1272 
1273 	path->slots[0]--;
1274 	l = path->nodes[0];
1275 
1276 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1277 	if (key.objectid != bytenr) {
1278 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1279 		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
1280 		BUG();
1281 	}
1282 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1283 
1284 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1285 	refs = btrfs_extent_refs(l, item);
1286 	btrfs_set_extent_refs(l, item, refs + 1);
1287 	btrfs_mark_buffer_dirty(path->nodes[0]);
1288 
1289 	btrfs_release_path(root->fs_info->extent_root, path);
1290 
1291 	path->reada = 1;
1292 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1293 				    path, bytenr, parent,
1294 				    ref_root, ref_generation,
1295 				    owner_objectid);
1296 	BUG_ON(ret);
1297 	finish_current_insert(trans, root->fs_info->extent_root, 0);
1298 	del_pending_extents(trans, root->fs_info->extent_root, 0);
1299 
1300 	btrfs_free_path(path);
1301 	return 0;
1302 }
1303 
1304 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1305 			 struct btrfs_root *root,
1306 			 u64 bytenr, u64 num_bytes, u64 parent,
1307 			 u64 ref_root, u64 ref_generation,
1308 			 u64 owner_objectid)
1309 {
1310 	int ret;
1311 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1312 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1313 		return 0;
1314 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1315 				     0, ref_root, 0, ref_generation,
1316 				     owner_objectid);
1317 	return ret;
1318 }
1319 
1320 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1321 			 struct btrfs_root *root)
1322 {
1323 	finish_current_insert(trans, root->fs_info->extent_root, 1);
1324 	del_pending_extents(trans, root->fs_info->extent_root, 1);
1325 	return 0;
1326 }
1327 
1328 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1329 			    struct btrfs_root *root, u64 bytenr,
1330 			    u64 num_bytes, u32 *refs)
1331 {
1332 	struct btrfs_path *path;
1333 	int ret;
1334 	struct btrfs_key key;
1335 	struct extent_buffer *l;
1336 	struct btrfs_extent_item *item;
1337 
1338 	WARN_ON(num_bytes < root->sectorsize);
1339 	path = btrfs_alloc_path();
1340 	path->reada = 1;
1341 	key.objectid = bytenr;
1342 	key.offset = num_bytes;
1343 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1344 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1345 				0, 0);
1346 	if (ret < 0)
1347 		goto out;
1348 	if (ret != 0) {
1349 		btrfs_print_leaf(root, path->nodes[0]);
1350 		printk("failed to find block number %Lu\n", bytenr);
1351 		BUG();
1352 	}
1353 	l = path->nodes[0];
1354 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1355 	*refs = btrfs_extent_refs(l, item);
1356 out:
1357 	btrfs_free_path(path);
1358 	return 0;
1359 }
1360 
1361 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1362 			  struct btrfs_root *root, u64 objectid, u64 bytenr)
1363 {
1364 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1365 	struct btrfs_path *path;
1366 	struct extent_buffer *leaf;
1367 	struct btrfs_extent_ref *ref_item;
1368 	struct btrfs_key key;
1369 	struct btrfs_key found_key;
1370 	u64 ref_root;
1371 	u64 last_snapshot;
1372 	u32 nritems;
1373 	int ret;
1374 
1375 	key.objectid = bytenr;
1376 	key.offset = (u64)-1;
1377 	key.type = BTRFS_EXTENT_ITEM_KEY;
1378 
1379 	path = btrfs_alloc_path();
1380 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1381 	if (ret < 0)
1382 		goto out;
1383 	BUG_ON(ret == 0);
1384 
1385 	ret = -ENOENT;
1386 	if (path->slots[0] == 0)
1387 		goto out;
1388 
1389 	path->slots[0]--;
1390 	leaf = path->nodes[0];
1391 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1392 
1393 	if (found_key.objectid != bytenr ||
1394 	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
1395 		goto out;
1396 
1397 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1398 	while (1) {
1399 		leaf = path->nodes[0];
1400 		nritems = btrfs_header_nritems(leaf);
1401 		if (path->slots[0] >= nritems) {
1402 			ret = btrfs_next_leaf(extent_root, path);
1403 			if (ret < 0)
1404 				goto out;
1405 			if (ret == 0)
1406 				continue;
1407 			break;
1408 		}
1409 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1410 		if (found_key.objectid != bytenr)
1411 			break;
1412 
1413 		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1414 			path->slots[0]++;
1415 			continue;
1416 		}
1417 
1418 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
1419 					  struct btrfs_extent_ref);
1420 		ref_root = btrfs_ref_root(leaf, ref_item);
1421 		if ((ref_root != root->root_key.objectid &&
1422 		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1423 		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
1424 			ret = 1;
1425 			goto out;
1426 		}
1427 		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1428 			ret = 1;
1429 			goto out;
1430 		}
1431 
1432 		path->slots[0]++;
1433 	}
1434 	ret = 0;
1435 out:
1436 	btrfs_free_path(path);
1437 	return ret;
1438 }
1439 
1440 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1441 		    struct extent_buffer *buf, u32 nr_extents)
1442 {
1443 	struct btrfs_key key;
1444 	struct btrfs_file_extent_item *fi;
1445 	u64 root_gen;
1446 	u32 nritems;
1447 	int i;
1448 	int level;
1449 	int ret = 0;
1450 	int shared = 0;
1451 
1452 	if (!root->ref_cows)
1453 		return 0;
1454 
1455 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1456 		shared = 0;
1457 		root_gen = root->root_key.offset;
1458 	} else {
1459 		shared = 1;
1460 		root_gen = trans->transid - 1;
1461 	}
1462 
1463 	level = btrfs_header_level(buf);
1464 	nritems = btrfs_header_nritems(buf);
1465 
1466 	if (level == 0) {
1467 		struct btrfs_leaf_ref *ref;
1468 		struct btrfs_extent_info *info;
1469 
1470 		ref = btrfs_alloc_leaf_ref(root, nr_extents);
1471 		if (!ref) {
1472 			ret = -ENOMEM;
1473 			goto out;
1474 		}
1475 
1476 		ref->root_gen = root_gen;
1477 		ref->bytenr = buf->start;
1478 		ref->owner = btrfs_header_owner(buf);
1479 		ref->generation = btrfs_header_generation(buf);
1480 		ref->nritems = nr_extents;
1481 		info = ref->extents;
1482 
1483 		for (i = 0; nr_extents > 0 && i < nritems; i++) {
1484 			u64 disk_bytenr;
1485 			btrfs_item_key_to_cpu(buf, &key, i);
1486 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1487 				continue;
1488 			fi = btrfs_item_ptr(buf, i,
1489 					    struct btrfs_file_extent_item);
1490 			if (btrfs_file_extent_type(buf, fi) ==
1491 			    BTRFS_FILE_EXTENT_INLINE)
1492 				continue;
1493 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1494 			if (disk_bytenr == 0)
1495 				continue;
1496 
1497 			info->bytenr = disk_bytenr;
1498 			info->num_bytes =
1499 				btrfs_file_extent_disk_num_bytes(buf, fi);
1500 			info->objectid = key.objectid;
1501 			info->offset = key.offset;
1502 			info++;
1503 		}
1504 
1505 		ret = btrfs_add_leaf_ref(root, ref, shared);
1506 		if (ret == -EEXIST && shared) {
1507 			struct btrfs_leaf_ref *old;
1508 			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1509 			BUG_ON(!old);
1510 			btrfs_remove_leaf_ref(root, old);
1511 			btrfs_free_leaf_ref(root, old);
1512 			ret = btrfs_add_leaf_ref(root, ref, shared);
1513 		}
1514 		WARN_ON(ret);
1515 		btrfs_free_leaf_ref(root, ref);
1516 	}
1517 out:
1518 	return ret;
1519 }
1520 
1521 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1522 		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
1523 		  u32 *nr_extents)
1524 {
1525 	u64 bytenr;
1526 	u64 ref_root;
1527 	u64 orig_root;
1528 	u64 ref_generation;
1529 	u64 orig_generation;
1530 	u32 nritems;
1531 	u32 nr_file_extents = 0;
1532 	struct btrfs_key key;
1533 	struct btrfs_file_extent_item *fi;
1534 	int i;
1535 	int level;
1536 	int ret = 0;
1537 	int faili = 0;
1538 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1539 			    u64, u64, u64, u64, u64, u64, u64, u64);
1540 
1541 	ref_root = btrfs_header_owner(buf);
1542 	ref_generation = btrfs_header_generation(buf);
1543 	orig_root = btrfs_header_owner(orig_buf);
1544 	orig_generation = btrfs_header_generation(orig_buf);
1545 
1546 	nritems = btrfs_header_nritems(buf);
1547 	level = btrfs_header_level(buf);
1548 
1549 	if (root->ref_cows) {
1550 		process_func = __btrfs_inc_extent_ref;
1551 	} else {
1552 		if (level == 0 &&
1553 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1554 			goto out;
1555 		if (level != 0 &&
1556 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1557 			goto out;
1558 		process_func = __btrfs_update_extent_ref;
1559 	}
1560 
1561 	for (i = 0; i < nritems; i++) {
1562 		cond_resched();
1563 		if (level == 0) {
1564 			btrfs_item_key_to_cpu(buf, &key, i);
1565 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1566 				continue;
1567 			fi = btrfs_item_ptr(buf, i,
1568 					    struct btrfs_file_extent_item);
1569 			if (btrfs_file_extent_type(buf, fi) ==
1570 			    BTRFS_FILE_EXTENT_INLINE)
1571 				continue;
1572 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1573 			if (bytenr == 0)
1574 				continue;
1575 
1576 			nr_file_extents++;
1577 
1578 			ret = process_func(trans, root, bytenr,
1579 					   orig_buf->start, buf->start,
1580 					   orig_root, ref_root,
1581 					   orig_generation, ref_generation,
1582 					   key.objectid);
1583 
1584 			if (ret) {
1585 				faili = i;
1586 				WARN_ON(1);
1587 				goto fail;
1588 			}
1589 		} else {
1590 			bytenr = btrfs_node_blockptr(buf, i);
1591 			ret = process_func(trans, root, bytenr,
1592 					   orig_buf->start, buf->start,
1593 					   orig_root, ref_root,
1594 					   orig_generation, ref_generation,
1595 					   level - 1);
1596 			if (ret) {
1597 				faili = i;
1598 				WARN_ON(1);
1599 				goto fail;
1600 			}
1601 		}
1602 	}
1603 out:
1604 	if (nr_extents) {
1605 		if (level == 0)
1606 			*nr_extents = nr_file_extents;
1607 		else
1608 			*nr_extents = nritems;
1609 	}
1610 	return 0;
1611 fail:
1612 	WARN_ON(1);
1613 	return ret;
1614 }
1615 
1616 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1617 		     struct btrfs_root *root, struct extent_buffer *orig_buf,
1618 		     struct extent_buffer *buf, int start_slot, int nr)
1619 
1620 {
1621 	u64 bytenr;
1622 	u64 ref_root;
1623 	u64 orig_root;
1624 	u64 ref_generation;
1625 	u64 orig_generation;
1626 	struct btrfs_key key;
1627 	struct btrfs_file_extent_item *fi;
1628 	int i;
1629 	int ret;
1630 	int slot;
1631 	int level;
1632 
1633 	BUG_ON(start_slot < 0);
1634 	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1635 
1636 	ref_root = btrfs_header_owner(buf);
1637 	ref_generation = btrfs_header_generation(buf);
1638 	orig_root = btrfs_header_owner(orig_buf);
1639 	orig_generation = btrfs_header_generation(orig_buf);
1640 	level = btrfs_header_level(buf);
1641 
1642 	if (!root->ref_cows) {
1643 		if (level == 0 &&
1644 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1645 			return 0;
1646 		if (level != 0 &&
1647 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1648 			return 0;
1649 	}
1650 
1651 	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1652 		cond_resched();
1653 		if (level == 0) {
1654 			btrfs_item_key_to_cpu(buf, &key, slot);
1655 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1656 				continue;
1657 			fi = btrfs_item_ptr(buf, slot,
1658 					    struct btrfs_file_extent_item);
1659 			if (btrfs_file_extent_type(buf, fi) ==
1660 			    BTRFS_FILE_EXTENT_INLINE)
1661 				continue;
1662 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1663 			if (bytenr == 0)
1664 				continue;
1665 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1666 					    orig_buf->start, buf->start,
1667 					    orig_root, ref_root,
1668 					    orig_generation, ref_generation,
1669 					    key.objectid);
1670 			if (ret)
1671 				goto fail;
1672 		} else {
1673 			bytenr = btrfs_node_blockptr(buf, slot);
1674 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1675 					    orig_buf->start, buf->start,
1676 					    orig_root, ref_root,
1677 					    orig_generation, ref_generation,
1678 					    level - 1);
1679 			if (ret)
1680 				goto fail;
1681 		}
1682 	}
1683 	return 0;
1684 fail:
1685 	WARN_ON(1);
1686 	return -1;
1687 }
1688 
1689 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1690 				 struct btrfs_root *root,
1691 				 struct btrfs_path *path,
1692 				 struct btrfs_block_group_cache *cache)
1693 {
1694 	int ret;
1695 	int pending_ret;
1696 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1697 	unsigned long bi;
1698 	struct extent_buffer *leaf;
1699 
1700 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1701 	if (ret < 0)
1702 		goto fail;
1703 	BUG_ON(ret);
1704 
1705 	leaf = path->nodes[0];
1706 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1707 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1708 	btrfs_mark_buffer_dirty(leaf);
1709 	btrfs_release_path(extent_root, path);
1710 fail:
1711 	finish_current_insert(trans, extent_root, 0);
1712 	pending_ret = del_pending_extents(trans, extent_root, 0);
1713 	if (ret)
1714 		return ret;
1715 	if (pending_ret)
1716 		return pending_ret;
1717 	return 0;
1718 
1719 }
1720 
1721 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1722 				   struct btrfs_root *root)
1723 {
1724 	struct btrfs_block_group_cache *cache, *entry;
1725 	struct rb_node *n;
1726 	int err = 0;
1727 	int werr = 0;
1728 	struct btrfs_path *path;
1729 	u64 last = 0;
1730 
1731 	path = btrfs_alloc_path();
1732 	if (!path)
1733 		return -ENOMEM;
1734 
1735 	while(1) {
1736 		cache = NULL;
1737 		spin_lock(&root->fs_info->block_group_cache_lock);
1738 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
1739 		     n; n = rb_next(n)) {
1740 			entry = rb_entry(n, struct btrfs_block_group_cache,
1741 					 cache_node);
1742 			if (entry->dirty) {
1743 				cache = entry;
1744 				break;
1745 			}
1746 		}
1747 		spin_unlock(&root->fs_info->block_group_cache_lock);
1748 
1749 		if (!cache)
1750 			break;
1751 
1752 		cache->dirty = 0;
1753 		last += cache->key.offset;
1754 
1755 		err = write_one_cache_group(trans, root,
1756 					    path, cache);
1757 		/*
1758 		 * if we fail to write the cache group, we want
1759 		 * to keep it marked dirty in hopes that a later
1760 		 * write will work
1761 		 */
1762 		if (err) {
1763 			werr = err;
1764 			continue;
1765 		}
1766 	}
1767 	btrfs_free_path(path);
1768 	return werr;
1769 }
1770 
1771 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1772 {
1773 	struct btrfs_block_group_cache *block_group;
1774 	int readonly = 0;
1775 
1776 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1777 	if (!block_group || block_group->ro)
1778 		readonly = 1;
1779 	if (block_group)
1780 		put_block_group(block_group);
1781 	return readonly;
1782 }
1783 
1784 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1785 			     u64 total_bytes, u64 bytes_used,
1786 			     struct btrfs_space_info **space_info)
1787 {
1788 	struct btrfs_space_info *found;
1789 
1790 	found = __find_space_info(info, flags);
1791 	if (found) {
1792 		spin_lock(&found->lock);
1793 		found->total_bytes += total_bytes;
1794 		found->bytes_used += bytes_used;
1795 		found->full = 0;
1796 		spin_unlock(&found->lock);
1797 		*space_info = found;
1798 		return 0;
1799 	}
1800 	found = kzalloc(sizeof(*found), GFP_NOFS);
1801 	if (!found)
1802 		return -ENOMEM;
1803 
1804 	list_add(&found->list, &info->space_info);
1805 	INIT_LIST_HEAD(&found->block_groups);
1806 	init_rwsem(&found->groups_sem);
1807 	spin_lock_init(&found->lock);
1808 	found->flags = flags;
1809 	found->total_bytes = total_bytes;
1810 	found->bytes_used = bytes_used;
1811 	found->bytes_pinned = 0;
1812 	found->bytes_reserved = 0;
1813 	found->bytes_readonly = 0;
1814 	found->full = 0;
1815 	found->force_alloc = 0;
1816 	*space_info = found;
1817 	return 0;
1818 }
1819 
1820 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1821 {
1822 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1823 				   BTRFS_BLOCK_GROUP_RAID1 |
1824 				   BTRFS_BLOCK_GROUP_RAID10 |
1825 				   BTRFS_BLOCK_GROUP_DUP);
1826 	if (extra_flags) {
1827 		if (flags & BTRFS_BLOCK_GROUP_DATA)
1828 			fs_info->avail_data_alloc_bits |= extra_flags;
1829 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
1830 			fs_info->avail_metadata_alloc_bits |= extra_flags;
1831 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1832 			fs_info->avail_system_alloc_bits |= extra_flags;
1833 	}
1834 }
1835 
1836 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1837 {
1838 	spin_lock(&cache->space_info->lock);
1839 	spin_lock(&cache->lock);
1840 	if (!cache->ro) {
1841 		cache->space_info->bytes_readonly += cache->key.offset -
1842 					btrfs_block_group_used(&cache->item);
1843 		cache->ro = 1;
1844 	}
1845 	spin_unlock(&cache->lock);
1846 	spin_unlock(&cache->space_info->lock);
1847 }
1848 
1849 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1850 {
1851 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
1852 
1853 	if (num_devices == 1)
1854 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1855 	if (num_devices < 4)
1856 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1857 
1858 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1859 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1860 		      BTRFS_BLOCK_GROUP_RAID10))) {
1861 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
1862 	}
1863 
1864 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1865 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1866 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1867 	}
1868 
1869 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1870 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1871 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
1872 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
1873 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1874 	return flags;
1875 }
1876 
1877 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1878 			  struct btrfs_root *extent_root, u64 alloc_bytes,
1879 			  u64 flags, int force)
1880 {
1881 	struct btrfs_space_info *space_info;
1882 	u64 thresh;
1883 	int ret = 0;
1884 
1885 	mutex_lock(&extent_root->fs_info->chunk_mutex);
1886 
1887 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
1888 
1889 	space_info = __find_space_info(extent_root->fs_info, flags);
1890 	if (!space_info) {
1891 		ret = update_space_info(extent_root->fs_info, flags,
1892 					0, 0, &space_info);
1893 		BUG_ON(ret);
1894 	}
1895 	BUG_ON(!space_info);
1896 
1897 	spin_lock(&space_info->lock);
1898 	if (space_info->force_alloc) {
1899 		force = 1;
1900 		space_info->force_alloc = 0;
1901 	}
1902 	if (space_info->full) {
1903 		spin_unlock(&space_info->lock);
1904 		goto out;
1905 	}
1906 
1907 	thresh = space_info->total_bytes - space_info->bytes_readonly;
1908 	thresh = div_factor(thresh, 6);
1909 	if (!force &&
1910 	   (space_info->bytes_used + space_info->bytes_pinned +
1911 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
1912 		spin_unlock(&space_info->lock);
1913 		goto out;
1914 	}
1915 	spin_unlock(&space_info->lock);
1916 
1917 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
1918 	if (ret) {
1919 printk("space info full %Lu\n", flags);
1920 		space_info->full = 1;
1921 	}
1922 out:
1923 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
1924 	return ret;
1925 }
1926 
1927 static int update_block_group(struct btrfs_trans_handle *trans,
1928 			      struct btrfs_root *root,
1929 			      u64 bytenr, u64 num_bytes, int alloc,
1930 			      int mark_free)
1931 {
1932 	struct btrfs_block_group_cache *cache;
1933 	struct btrfs_fs_info *info = root->fs_info;
1934 	u64 total = num_bytes;
1935 	u64 old_val;
1936 	u64 byte_in_group;
1937 
1938 	while(total) {
1939 		cache = btrfs_lookup_block_group(info, bytenr);
1940 		if (!cache)
1941 			return -1;
1942 		byte_in_group = bytenr - cache->key.objectid;
1943 		WARN_ON(byte_in_group > cache->key.offset);
1944 
1945 		spin_lock(&cache->space_info->lock);
1946 		spin_lock(&cache->lock);
1947 		cache->dirty = 1;
1948 		old_val = btrfs_block_group_used(&cache->item);
1949 		num_bytes = min(total, cache->key.offset - byte_in_group);
1950 		if (alloc) {
1951 			old_val += num_bytes;
1952 			cache->space_info->bytes_used += num_bytes;
1953 			if (cache->ro)
1954 				cache->space_info->bytes_readonly -= num_bytes;
1955 			btrfs_set_block_group_used(&cache->item, old_val);
1956 			spin_unlock(&cache->lock);
1957 			spin_unlock(&cache->space_info->lock);
1958 		} else {
1959 			old_val -= num_bytes;
1960 			cache->space_info->bytes_used -= num_bytes;
1961 			if (cache->ro)
1962 				cache->space_info->bytes_readonly += num_bytes;
1963 			btrfs_set_block_group_used(&cache->item, old_val);
1964 			spin_unlock(&cache->lock);
1965 			spin_unlock(&cache->space_info->lock);
1966 			if (mark_free) {
1967 				int ret;
1968 				ret = btrfs_add_free_space(cache, bytenr,
1969 							   num_bytes);
1970 				WARN_ON(ret);
1971 			}
1972 		}
1973 		put_block_group(cache);
1974 		total -= num_bytes;
1975 		bytenr += num_bytes;
1976 	}
1977 	return 0;
1978 }
1979 
1980 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1981 {
1982 	struct btrfs_block_group_cache *cache;
1983 	u64 bytenr;
1984 
1985 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1986 	if (!cache)
1987 		return 0;
1988 
1989 	bytenr = cache->key.objectid;
1990 	put_block_group(cache);
1991 
1992 	return bytenr;
1993 }
1994 
1995 int btrfs_update_pinned_extents(struct btrfs_root *root,
1996 				u64 bytenr, u64 num, int pin)
1997 {
1998 	u64 len;
1999 	struct btrfs_block_group_cache *cache;
2000 	struct btrfs_fs_info *fs_info = root->fs_info;
2001 
2002 	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2003 	if (pin) {
2004 		set_extent_dirty(&fs_info->pinned_extents,
2005 				bytenr, bytenr + num - 1, GFP_NOFS);
2006 	} else {
2007 		clear_extent_dirty(&fs_info->pinned_extents,
2008 				bytenr, bytenr + num - 1, GFP_NOFS);
2009 	}
2010 	while (num > 0) {
2011 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2012 		BUG_ON(!cache);
2013 		len = min(num, cache->key.offset -
2014 			  (bytenr - cache->key.objectid));
2015 		if (pin) {
2016 			spin_lock(&cache->space_info->lock);
2017 			spin_lock(&cache->lock);
2018 			cache->pinned += len;
2019 			cache->space_info->bytes_pinned += len;
2020 			spin_unlock(&cache->lock);
2021 			spin_unlock(&cache->space_info->lock);
2022 			fs_info->total_pinned += len;
2023 		} else {
2024 			spin_lock(&cache->space_info->lock);
2025 			spin_lock(&cache->lock);
2026 			cache->pinned -= len;
2027 			cache->space_info->bytes_pinned -= len;
2028 			spin_unlock(&cache->lock);
2029 			spin_unlock(&cache->space_info->lock);
2030 			fs_info->total_pinned -= len;
2031 			if (cache->cached)
2032 				btrfs_add_free_space(cache, bytenr, len);
2033 		}
2034 		put_block_group(cache);
2035 		bytenr += len;
2036 		num -= len;
2037 	}
2038 	return 0;
2039 }
2040 
2041 static int update_reserved_extents(struct btrfs_root *root,
2042 				   u64 bytenr, u64 num, int reserve)
2043 {
2044 	u64 len;
2045 	struct btrfs_block_group_cache *cache;
2046 	struct btrfs_fs_info *fs_info = root->fs_info;
2047 
2048 	while (num > 0) {
2049 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2050 		BUG_ON(!cache);
2051 		len = min(num, cache->key.offset -
2052 			  (bytenr - cache->key.objectid));
2053 
2054 		spin_lock(&cache->space_info->lock);
2055 		spin_lock(&cache->lock);
2056 		if (reserve) {
2057 			cache->reserved += len;
2058 			cache->space_info->bytes_reserved += len;
2059 		} else {
2060 			cache->reserved -= len;
2061 			cache->space_info->bytes_reserved -= len;
2062 		}
2063 		spin_unlock(&cache->lock);
2064 		spin_unlock(&cache->space_info->lock);
2065 		put_block_group(cache);
2066 		bytenr += len;
2067 		num -= len;
2068 	}
2069 	return 0;
2070 }
2071 
2072 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2073 {
2074 	u64 last = 0;
2075 	u64 start;
2076 	u64 end;
2077 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2078 	int ret;
2079 
2080 	mutex_lock(&root->fs_info->pinned_mutex);
2081 	while(1) {
2082 		ret = find_first_extent_bit(pinned_extents, last,
2083 					    &start, &end, EXTENT_DIRTY);
2084 		if (ret)
2085 			break;
2086 		set_extent_dirty(copy, start, end, GFP_NOFS);
2087 		last = end + 1;
2088 	}
2089 	mutex_unlock(&root->fs_info->pinned_mutex);
2090 	return 0;
2091 }
2092 
2093 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2094 			       struct btrfs_root *root,
2095 			       struct extent_io_tree *unpin)
2096 {
2097 	u64 start;
2098 	u64 end;
2099 	int ret;
2100 
2101 	mutex_lock(&root->fs_info->pinned_mutex);
2102 	while(1) {
2103 		ret = find_first_extent_bit(unpin, 0, &start, &end,
2104 					    EXTENT_DIRTY);
2105 		if (ret)
2106 			break;
2107 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2108 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
2109 		if (need_resched()) {
2110 			mutex_unlock(&root->fs_info->pinned_mutex);
2111 			cond_resched();
2112 			mutex_lock(&root->fs_info->pinned_mutex);
2113 		}
2114 	}
2115 	mutex_unlock(&root->fs_info->pinned_mutex);
2116 	return 0;
2117 }
2118 
2119 static int finish_current_insert(struct btrfs_trans_handle *trans,
2120 				 struct btrfs_root *extent_root, int all)
2121 {
2122 	u64 start;
2123 	u64 end;
2124 	u64 priv;
2125 	u64 search = 0;
2126 	u64 skipped = 0;
2127 	struct btrfs_fs_info *info = extent_root->fs_info;
2128 	struct btrfs_path *path;
2129 	struct pending_extent_op *extent_op, *tmp;
2130 	struct list_head insert_list, update_list;
2131 	int ret;
2132 	int num_inserts = 0, max_inserts;
2133 
2134 	path = btrfs_alloc_path();
2135 	INIT_LIST_HEAD(&insert_list);
2136 	INIT_LIST_HEAD(&update_list);
2137 
2138 	max_inserts = extent_root->leafsize /
2139 		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2140 		 sizeof(struct btrfs_extent_ref) +
2141 		 sizeof(struct btrfs_extent_item));
2142 again:
2143 	mutex_lock(&info->extent_ins_mutex);
2144 	while (1) {
2145 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
2146 					    &end, EXTENT_WRITEBACK);
2147 		if (ret) {
2148 			if (skipped && all && !num_inserts) {
2149 				skipped = 0;
2150 				search = 0;
2151 				continue;
2152 			}
2153 			mutex_unlock(&info->extent_ins_mutex);
2154 			break;
2155 		}
2156 
2157 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2158 		if (!ret) {
2159 			skipped = 1;
2160 			search = end + 1;
2161 			if (need_resched()) {
2162 				mutex_unlock(&info->extent_ins_mutex);
2163 				cond_resched();
2164 				mutex_lock(&info->extent_ins_mutex);
2165 			}
2166 			continue;
2167 		}
2168 
2169 		ret = get_state_private(&info->extent_ins, start, &priv);
2170 		BUG_ON(ret);
2171 		extent_op = (struct pending_extent_op *)(unsigned long) priv;
2172 
2173 		if (extent_op->type == PENDING_EXTENT_INSERT) {
2174 			num_inserts++;
2175 			list_add_tail(&extent_op->list, &insert_list);
2176 			search = end + 1;
2177 			if (num_inserts == max_inserts) {
2178 				mutex_unlock(&info->extent_ins_mutex);
2179 				break;
2180 			}
2181 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2182 			list_add_tail(&extent_op->list, &update_list);
2183 			search = end + 1;
2184 		} else {
2185 			BUG();
2186 		}
2187 	}
2188 
2189 	/*
2190 	 * process the update list, clear the writeback bit for it, and if
2191 	 * somebody marked this thing for deletion then just unlock it and be
2192 	 * done, the free_extents will handle it
2193 	 */
2194 	mutex_lock(&info->extent_ins_mutex);
2195 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2196 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2197 				  extent_op->bytenr + extent_op->num_bytes - 1,
2198 				  EXTENT_WRITEBACK, GFP_NOFS);
2199 		if (extent_op->del) {
2200 			list_del_init(&extent_op->list);
2201 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2202 				      extent_op->bytenr + extent_op->num_bytes
2203 				      - 1, GFP_NOFS);
2204 			kfree(extent_op);
2205 		}
2206 	}
2207 	mutex_unlock(&info->extent_ins_mutex);
2208 
2209 	/*
2210 	 * still have things left on the update list, go ahead an update
2211 	 * everything
2212 	 */
2213 	if (!list_empty(&update_list)) {
2214 		ret = update_backrefs(trans, extent_root, path, &update_list);
2215 		BUG_ON(ret);
2216 	}
2217 
2218 	/*
2219 	 * if no inserts need to be done, but we skipped some extents and we
2220 	 * need to make sure everything is cleaned then reset everything and
2221 	 * go back to the beginning
2222 	 */
2223 	if (!num_inserts && all && skipped) {
2224 		search = 0;
2225 		skipped = 0;
2226 		INIT_LIST_HEAD(&update_list);
2227 		INIT_LIST_HEAD(&insert_list);
2228 		goto again;
2229 	} else if (!num_inserts) {
2230 		goto out;
2231 	}
2232 
2233 	/*
2234 	 * process the insert extents list.  Again if we are deleting this
2235 	 * extent, then just unlock it, pin down the bytes if need be, and be
2236 	 * done with it.  Saves us from having to actually insert the extent
2237 	 * into the tree and then subsequently come along and delete it
2238 	 */
2239 	mutex_lock(&info->extent_ins_mutex);
2240 	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2241 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2242 				  extent_op->bytenr + extent_op->num_bytes - 1,
2243 				  EXTENT_WRITEBACK, GFP_NOFS);
2244 		if (extent_op->del) {
2245 			list_del_init(&extent_op->list);
2246 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2247 				      extent_op->bytenr + extent_op->num_bytes
2248 				      - 1, GFP_NOFS);
2249 
2250 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2251 			ret = pin_down_bytes(trans, extent_root,
2252 					     extent_op->bytenr,
2253 					     extent_op->num_bytes, 0);
2254 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2255 
2256 			ret = update_block_group(trans, extent_root,
2257 						 extent_op->bytenr,
2258 						 extent_op->num_bytes,
2259 						 0, ret > 0);
2260 			BUG_ON(ret);
2261 			kfree(extent_op);
2262 			num_inserts--;
2263 		}
2264 	}
2265 	mutex_unlock(&info->extent_ins_mutex);
2266 
2267 	ret = insert_extents(trans, extent_root, path, &insert_list,
2268 			     num_inserts);
2269 	BUG_ON(ret);
2270 
2271 	/*
2272 	 * if we broke out of the loop in order to insert stuff because we hit
2273 	 * the maximum number of inserts at a time we can handle, then loop
2274 	 * back and pick up where we left off
2275 	 */
2276 	if (num_inserts == max_inserts) {
2277 		INIT_LIST_HEAD(&insert_list);
2278 		INIT_LIST_HEAD(&update_list);
2279 		num_inserts = 0;
2280 		goto again;
2281 	}
2282 
2283 	/*
2284 	 * again, if we need to make absolutely sure there are no more pending
2285 	 * extent operations left and we know that we skipped some, go back to
2286 	 * the beginning and do it all again
2287 	 */
2288 	if (all && skipped) {
2289 		INIT_LIST_HEAD(&insert_list);
2290 		INIT_LIST_HEAD(&update_list);
2291 		search = 0;
2292 		skipped = 0;
2293 		num_inserts = 0;
2294 		goto again;
2295 	}
2296 out:
2297 	btrfs_free_path(path);
2298 	return 0;
2299 }
2300 
2301 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2302 			  struct btrfs_root *root,
2303 			  u64 bytenr, u64 num_bytes, int is_data)
2304 {
2305 	int err = 0;
2306 	struct extent_buffer *buf;
2307 
2308 	if (is_data)
2309 		goto pinit;
2310 
2311 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2312 	if (!buf)
2313 		goto pinit;
2314 
2315 	/* we can reuse a block if it hasn't been written
2316 	 * and it is from this transaction.  We can't
2317 	 * reuse anything from the tree log root because
2318 	 * it has tiny sub-transactions.
2319 	 */
2320 	if (btrfs_buffer_uptodate(buf, 0) &&
2321 	    btrfs_try_tree_lock(buf)) {
2322 		u64 header_owner = btrfs_header_owner(buf);
2323 		u64 header_transid = btrfs_header_generation(buf);
2324 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2325 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2326 		    header_transid == trans->transid &&
2327 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2328 			clean_tree_block(NULL, root, buf);
2329 			btrfs_tree_unlock(buf);
2330 			free_extent_buffer(buf);
2331 			return 1;
2332 		}
2333 		btrfs_tree_unlock(buf);
2334 	}
2335 	free_extent_buffer(buf);
2336 pinit:
2337 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2338 
2339 	BUG_ON(err < 0);
2340 	return 0;
2341 }
2342 
2343 /*
2344  * remove an extent from the root, returns 0 on success
2345  */
2346 static int __free_extent(struct btrfs_trans_handle *trans,
2347 			 struct btrfs_root *root,
2348 			 u64 bytenr, u64 num_bytes, u64 parent,
2349 			 u64 root_objectid, u64 ref_generation,
2350 			 u64 owner_objectid, int pin, int mark_free)
2351 {
2352 	struct btrfs_path *path;
2353 	struct btrfs_key key;
2354 	struct btrfs_fs_info *info = root->fs_info;
2355 	struct btrfs_root *extent_root = info->extent_root;
2356 	struct extent_buffer *leaf;
2357 	int ret;
2358 	int extent_slot = 0;
2359 	int found_extent = 0;
2360 	int num_to_del = 1;
2361 	struct btrfs_extent_item *ei;
2362 	u32 refs;
2363 
2364 	key.objectid = bytenr;
2365 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2366 	key.offset = num_bytes;
2367 	path = btrfs_alloc_path();
2368 	if (!path)
2369 		return -ENOMEM;
2370 
2371 	path->reada = 1;
2372 	ret = lookup_extent_backref(trans, extent_root, path,
2373 				    bytenr, parent, root_objectid,
2374 				    ref_generation, owner_objectid, 1);
2375 	if (ret == 0) {
2376 		struct btrfs_key found_key;
2377 		extent_slot = path->slots[0];
2378 		while(extent_slot > 0) {
2379 			extent_slot--;
2380 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2381 					      extent_slot);
2382 			if (found_key.objectid != bytenr)
2383 				break;
2384 			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2385 			    found_key.offset == num_bytes) {
2386 				found_extent = 1;
2387 				break;
2388 			}
2389 			if (path->slots[0] - extent_slot > 5)
2390 				break;
2391 		}
2392 		if (!found_extent) {
2393 			ret = remove_extent_backref(trans, extent_root, path);
2394 			BUG_ON(ret);
2395 			btrfs_release_path(extent_root, path);
2396 			ret = btrfs_search_slot(trans, extent_root,
2397 						&key, path, -1, 1);
2398 			if (ret) {
2399 				printk(KERN_ERR "umm, got %d back from search"
2400 				       ", was looking for %Lu\n", ret,
2401 				       bytenr);
2402 				btrfs_print_leaf(extent_root, path->nodes[0]);
2403 			}
2404 			BUG_ON(ret);
2405 			extent_slot = path->slots[0];
2406 		}
2407 	} else {
2408 		btrfs_print_leaf(extent_root, path->nodes[0]);
2409 		WARN_ON(1);
2410 		printk("Unable to find ref byte nr %Lu root %Lu "
2411 		       "gen %Lu owner %Lu\n", bytenr,
2412 		       root_objectid, ref_generation, owner_objectid);
2413 	}
2414 
2415 	leaf = path->nodes[0];
2416 	ei = btrfs_item_ptr(leaf, extent_slot,
2417 			    struct btrfs_extent_item);
2418 	refs = btrfs_extent_refs(leaf, ei);
2419 	BUG_ON(refs == 0);
2420 	refs -= 1;
2421 	btrfs_set_extent_refs(leaf, ei, refs);
2422 
2423 	btrfs_mark_buffer_dirty(leaf);
2424 
2425 	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2426 		struct btrfs_extent_ref *ref;
2427 		ref = btrfs_item_ptr(leaf, path->slots[0],
2428 				     struct btrfs_extent_ref);
2429 		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2430 		/* if the back ref and the extent are next to each other
2431 		 * they get deleted below in one shot
2432 		 */
2433 		path->slots[0] = extent_slot;
2434 		num_to_del = 2;
2435 	} else if (found_extent) {
2436 		/* otherwise delete the extent back ref */
2437 		ret = remove_extent_backref(trans, extent_root, path);
2438 		BUG_ON(ret);
2439 		/* if refs are 0, we need to setup the path for deletion */
2440 		if (refs == 0) {
2441 			btrfs_release_path(extent_root, path);
2442 			ret = btrfs_search_slot(trans, extent_root, &key, path,
2443 						-1, 1);
2444 			BUG_ON(ret);
2445 		}
2446 	}
2447 
2448 	if (refs == 0) {
2449 		u64 super_used;
2450 		u64 root_used;
2451 #ifdef BIO_RW_DISCARD
2452 		u64 map_length = num_bytes;
2453 		struct btrfs_multi_bio *multi = NULL;
2454 #endif
2455 
2456 		if (pin) {
2457 			mutex_lock(&root->fs_info->pinned_mutex);
2458 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2459 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2460 			mutex_unlock(&root->fs_info->pinned_mutex);
2461 			if (ret > 0)
2462 				mark_free = 1;
2463 			BUG_ON(ret < 0);
2464 		}
2465 		/* block accounting for super block */
2466 		spin_lock(&info->delalloc_lock);
2467 		super_used = btrfs_super_bytes_used(&info->super_copy);
2468 		btrfs_set_super_bytes_used(&info->super_copy,
2469 					   super_used - num_bytes);
2470 		spin_unlock(&info->delalloc_lock);
2471 
2472 		/* block accounting for root item */
2473 		root_used = btrfs_root_used(&root->root_item);
2474 		btrfs_set_root_used(&root->root_item,
2475 					   root_used - num_bytes);
2476 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2477 				      num_to_del);
2478 		BUG_ON(ret);
2479 		btrfs_release_path(extent_root, path);
2480 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2481 					 mark_free);
2482 		BUG_ON(ret);
2483 
2484 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2485 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2486 			BUG_ON(ret);
2487 		}
2488 
2489 #ifdef BIO_RW_DISCARD
2490 		/* Tell the block device(s) that the sectors can be discarded */
2491 		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2492 				      bytenr, &map_length, &multi, 0);
2493 		if (!ret) {
2494 			struct btrfs_bio_stripe *stripe = multi->stripes;
2495 			int i;
2496 
2497 			if (map_length > num_bytes)
2498 				map_length = num_bytes;
2499 
2500 			for (i = 0; i < multi->num_stripes; i++, stripe++) {
2501 				btrfs_issue_discard(stripe->dev->bdev,
2502 						    stripe->physical,
2503 						     map_length);
2504 			}
2505 			kfree(multi);
2506 		}
2507 #endif
2508 	}
2509 	btrfs_free_path(path);
2510 	finish_current_insert(trans, extent_root, 0);
2511 	return ret;
2512 }
2513 
2514 /*
2515  * find all the blocks marked as pending in the radix tree and remove
2516  * them from the extent map
2517  */
2518 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
2519 			       btrfs_root *extent_root, int all)
2520 {
2521 	int ret;
2522 	int err = 0;
2523 	u64 start;
2524 	u64 end;
2525 	u64 priv;
2526 	u64 search = 0;
2527 	int nr = 0, skipped = 0;
2528 	struct extent_io_tree *pending_del;
2529 	struct extent_io_tree *extent_ins;
2530 	struct pending_extent_op *extent_op;
2531 	struct btrfs_fs_info *info = extent_root->fs_info;
2532 	struct list_head delete_list;
2533 
2534 	INIT_LIST_HEAD(&delete_list);
2535 	extent_ins = &extent_root->fs_info->extent_ins;
2536 	pending_del = &extent_root->fs_info->pending_del;
2537 
2538 again:
2539 	mutex_lock(&info->extent_ins_mutex);
2540 	while(1) {
2541 		ret = find_first_extent_bit(pending_del, search, &start, &end,
2542 					    EXTENT_WRITEBACK);
2543 		if (ret) {
2544 			if (all && skipped && !nr) {
2545 				search = 0;
2546 				continue;
2547 			}
2548 			mutex_unlock(&info->extent_ins_mutex);
2549 			break;
2550 		}
2551 
2552 		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2553 		if (!ret) {
2554 			search = end+1;
2555 			skipped = 1;
2556 
2557 			if (need_resched()) {
2558 				mutex_unlock(&info->extent_ins_mutex);
2559 				cond_resched();
2560 				mutex_lock(&info->extent_ins_mutex);
2561 			}
2562 
2563 			continue;
2564 		}
2565 		BUG_ON(ret < 0);
2566 
2567 		ret = get_state_private(pending_del, start, &priv);
2568 		BUG_ON(ret);
2569 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
2570 
2571 		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2572 				  GFP_NOFS);
2573 		if (!test_range_bit(extent_ins, start, end,
2574 				    EXTENT_WRITEBACK, 0)) {
2575 			list_add_tail(&extent_op->list, &delete_list);
2576 			nr++;
2577 		} else {
2578 			kfree(extent_op);
2579 
2580 			ret = get_state_private(&info->extent_ins, start,
2581 						&priv);
2582 			BUG_ON(ret);
2583 			extent_op = (struct pending_extent_op *)
2584 						(unsigned long)priv;
2585 
2586 			clear_extent_bits(&info->extent_ins, start, end,
2587 					  EXTENT_WRITEBACK, GFP_NOFS);
2588 
2589 			if (extent_op->type == PENDING_BACKREF_UPDATE) {
2590 				list_add_tail(&extent_op->list, &delete_list);
2591 				search = end + 1;
2592 				nr++;
2593 				continue;
2594 			}
2595 
2596 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2597 			ret = pin_down_bytes(trans, extent_root, start,
2598 					     end + 1 - start, 0);
2599 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2600 
2601 			ret = update_block_group(trans, extent_root, start,
2602 						end + 1 - start, 0, ret > 0);
2603 
2604 			unlock_extent(extent_ins, start, end, GFP_NOFS);
2605 			BUG_ON(ret);
2606 			kfree(extent_op);
2607 		}
2608 		if (ret)
2609 			err = ret;
2610 
2611 		search = end + 1;
2612 
2613 		if (need_resched()) {
2614 			mutex_unlock(&info->extent_ins_mutex);
2615 			cond_resched();
2616 			mutex_lock(&info->extent_ins_mutex);
2617 		}
2618 	}
2619 
2620 	if (nr) {
2621 		ret = free_extents(trans, extent_root, &delete_list);
2622 		BUG_ON(ret);
2623 	}
2624 
2625 	if (all && skipped) {
2626 		INIT_LIST_HEAD(&delete_list);
2627 		search = 0;
2628 		nr = 0;
2629 		goto again;
2630 	}
2631 
2632 	return err;
2633 }
2634 
2635 /*
2636  * remove an extent from the root, returns 0 on success
2637  */
2638 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2639 			       struct btrfs_root *root,
2640 			       u64 bytenr, u64 num_bytes, u64 parent,
2641 			       u64 root_objectid, u64 ref_generation,
2642 			       u64 owner_objectid, int pin)
2643 {
2644 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2645 	int pending_ret;
2646 	int ret;
2647 
2648 	WARN_ON(num_bytes < root->sectorsize);
2649 	if (root == extent_root) {
2650 		struct pending_extent_op *extent_op = NULL;
2651 
2652 		mutex_lock(&root->fs_info->extent_ins_mutex);
2653 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2654 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2655 			u64 priv;
2656 			ret = get_state_private(&root->fs_info->extent_ins,
2657 						bytenr, &priv);
2658 			BUG_ON(ret);
2659 			extent_op = (struct pending_extent_op *)
2660 						(unsigned long)priv;
2661 
2662 			extent_op->del = 1;
2663 			if (extent_op->type == PENDING_EXTENT_INSERT) {
2664 				mutex_unlock(&root->fs_info->extent_ins_mutex);
2665 				return 0;
2666 			}
2667 		}
2668 
2669 		if (extent_op) {
2670 			ref_generation = extent_op->orig_generation;
2671 			parent = extent_op->orig_parent;
2672 		}
2673 
2674 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2675 		BUG_ON(!extent_op);
2676 
2677 		extent_op->type = PENDING_EXTENT_DELETE;
2678 		extent_op->bytenr = bytenr;
2679 		extent_op->num_bytes = num_bytes;
2680 		extent_op->parent = parent;
2681 		extent_op->orig_parent = parent;
2682 		extent_op->generation = ref_generation;
2683 		extent_op->orig_generation = ref_generation;
2684 		extent_op->level = (int)owner_objectid;
2685 		INIT_LIST_HEAD(&extent_op->list);
2686 		extent_op->del = 0;
2687 
2688 		set_extent_bits(&root->fs_info->pending_del,
2689 				bytenr, bytenr + num_bytes - 1,
2690 				EXTENT_WRITEBACK, GFP_NOFS);
2691 		set_state_private(&root->fs_info->pending_del,
2692 				  bytenr, (unsigned long)extent_op);
2693 		mutex_unlock(&root->fs_info->extent_ins_mutex);
2694 		return 0;
2695 	}
2696 	/* if metadata always pin */
2697 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2698 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2699 			struct btrfs_block_group_cache *cache;
2700 
2701 			/* btrfs_free_reserved_extent */
2702 			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2703 			BUG_ON(!cache);
2704 			btrfs_add_free_space(cache, bytenr, num_bytes);
2705 			put_block_group(cache);
2706 			update_reserved_extents(root, bytenr, num_bytes, 0);
2707 			return 0;
2708 		}
2709 		pin = 1;
2710 	}
2711 
2712 	/* if data pin when any transaction has committed this */
2713 	if (ref_generation != trans->transid)
2714 		pin = 1;
2715 
2716 	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2717 			    root_objectid, ref_generation,
2718 			    owner_objectid, pin, pin == 0);
2719 
2720 	finish_current_insert(trans, root->fs_info->extent_root, 0);
2721 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2722 	return ret ? ret : pending_ret;
2723 }
2724 
2725 int btrfs_free_extent(struct btrfs_trans_handle *trans,
2726 		      struct btrfs_root *root,
2727 		      u64 bytenr, u64 num_bytes, u64 parent,
2728 		      u64 root_objectid, u64 ref_generation,
2729 		      u64 owner_objectid, int pin)
2730 {
2731 	int ret;
2732 
2733 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2734 				  root_objectid, ref_generation,
2735 				  owner_objectid, pin);
2736 	return ret;
2737 }
2738 
2739 static u64 stripe_align(struct btrfs_root *root, u64 val)
2740 {
2741 	u64 mask = ((u64)root->stripesize - 1);
2742 	u64 ret = (val + mask) & ~mask;
2743 	return ret;
2744 }
2745 
2746 /*
2747  * walks the btree of allocated extents and find a hole of a given size.
2748  * The key ins is changed to record the hole:
2749  * ins->objectid == block start
2750  * ins->flags = BTRFS_EXTENT_ITEM_KEY
2751  * ins->offset == number of blocks
2752  * Any available blocks before search_start are skipped.
2753  */
2754 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
2755 				     struct btrfs_root *orig_root,
2756 				     u64 num_bytes, u64 empty_size,
2757 				     u64 search_start, u64 search_end,
2758 				     u64 hint_byte, struct btrfs_key *ins,
2759 				     u64 exclude_start, u64 exclude_nr,
2760 				     int data)
2761 {
2762 	int ret = 0;
2763 	struct btrfs_root * root = orig_root->fs_info->extent_root;
2764 	u64 total_needed = num_bytes;
2765 	u64 *last_ptr = NULL;
2766 	u64 last_wanted = 0;
2767 	struct btrfs_block_group_cache *block_group = NULL;
2768 	int chunk_alloc_done = 0;
2769 	int empty_cluster = 2 * 1024 * 1024;
2770 	int allowed_chunk_alloc = 0;
2771 	struct list_head *head = NULL, *cur = NULL;
2772 	int loop = 0;
2773 	int extra_loop = 0;
2774 	struct btrfs_space_info *space_info;
2775 
2776 	WARN_ON(num_bytes < root->sectorsize);
2777 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2778 	ins->objectid = 0;
2779 	ins->offset = 0;
2780 
2781 	if (orig_root->ref_cows || empty_size)
2782 		allowed_chunk_alloc = 1;
2783 
2784 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
2785 		last_ptr = &root->fs_info->last_alloc;
2786 		empty_cluster = 64 * 1024;
2787 	}
2788 
2789 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2790 		last_ptr = &root->fs_info->last_data_alloc;
2791 
2792 	if (last_ptr) {
2793 		if (*last_ptr) {
2794 			hint_byte = *last_ptr;
2795 			last_wanted = *last_ptr;
2796 		} else
2797 			empty_size += empty_cluster;
2798 	} else {
2799 		empty_cluster = 0;
2800 	}
2801 	search_start = max(search_start, first_logical_byte(root, 0));
2802 	search_start = max(search_start, hint_byte);
2803 
2804 	if (last_wanted && search_start != last_wanted) {
2805 		last_wanted = 0;
2806 		empty_size += empty_cluster;
2807 	}
2808 
2809 	total_needed += empty_size;
2810 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2811 	if (!block_group)
2812 		block_group = btrfs_lookup_first_block_group(root->fs_info,
2813 							     search_start);
2814 	space_info = __find_space_info(root->fs_info, data);
2815 
2816 	down_read(&space_info->groups_sem);
2817 	while (1) {
2818 		struct btrfs_free_space *free_space;
2819 		/*
2820 		 * the only way this happens if our hint points to a block
2821 		 * group thats not of the proper type, while looping this
2822 		 * should never happen
2823 		 */
2824 		if (empty_size)
2825 			extra_loop = 1;
2826 
2827 		if (!block_group)
2828 			goto new_group_no_lock;
2829 
2830 		if (unlikely(!block_group->cached)) {
2831 			mutex_lock(&block_group->cache_mutex);
2832 			ret = cache_block_group(root, block_group);
2833 			mutex_unlock(&block_group->cache_mutex);
2834 			if (ret)
2835 				break;
2836 		}
2837 
2838 		mutex_lock(&block_group->alloc_mutex);
2839 		if (unlikely(!block_group_bits(block_group, data)))
2840 			goto new_group;
2841 
2842 		if (unlikely(block_group->ro))
2843 			goto new_group;
2844 
2845 		free_space = btrfs_find_free_space(block_group, search_start,
2846 						   total_needed);
2847 		if (free_space) {
2848 			u64 start = block_group->key.objectid;
2849 			u64 end = block_group->key.objectid +
2850 				block_group->key.offset;
2851 
2852 			search_start = stripe_align(root, free_space->offset);
2853 
2854 			/* move on to the next group */
2855 			if (search_start + num_bytes >= search_end)
2856 				goto new_group;
2857 
2858 			/* move on to the next group */
2859 			if (search_start + num_bytes > end)
2860 				goto new_group;
2861 
2862 			if (last_wanted && search_start != last_wanted) {
2863 				total_needed += empty_cluster;
2864 				empty_size += empty_cluster;
2865 				last_wanted = 0;
2866 				/*
2867 				 * if search_start is still in this block group
2868 				 * then we just re-search this block group
2869 				 */
2870 				if (search_start >= start &&
2871 				    search_start < end) {
2872 					mutex_unlock(&block_group->alloc_mutex);
2873 					continue;
2874 				}
2875 
2876 				/* else we go to the next block group */
2877 				goto new_group;
2878 			}
2879 
2880 			if (exclude_nr > 0 &&
2881 			    (search_start + num_bytes > exclude_start &&
2882 			     search_start < exclude_start + exclude_nr)) {
2883 				search_start = exclude_start + exclude_nr;
2884 				/*
2885 				 * if search_start is still in this block group
2886 				 * then we just re-search this block group
2887 				 */
2888 				if (search_start >= start &&
2889 				    search_start < end) {
2890 					mutex_unlock(&block_group->alloc_mutex);
2891 					last_wanted = 0;
2892 					continue;
2893 				}
2894 
2895 				/* else we go to the next block group */
2896 				goto new_group;
2897 			}
2898 
2899 			ins->objectid = search_start;
2900 			ins->offset = num_bytes;
2901 
2902 			btrfs_remove_free_space_lock(block_group, search_start,
2903 						     num_bytes);
2904 			/* we are all good, lets return */
2905 			mutex_unlock(&block_group->alloc_mutex);
2906 			break;
2907 		}
2908 new_group:
2909 		mutex_unlock(&block_group->alloc_mutex);
2910 		put_block_group(block_group);
2911 		block_group = NULL;
2912 new_group_no_lock:
2913 		/* don't try to compare new allocations against the
2914 		 * last allocation any more
2915 		 */
2916 		last_wanted = 0;
2917 
2918 		/*
2919 		 * Here's how this works.
2920 		 * loop == 0: we were searching a block group via a hint
2921 		 *		and didn't find anything, so we start at
2922 		 *		the head of the block groups and keep searching
2923 		 * loop == 1: we're searching through all of the block groups
2924 		 *		if we hit the head again we have searched
2925 		 *		all of the block groups for this space and we
2926 		 *		need to try and allocate, if we cant error out.
2927 		 * loop == 2: we allocated more space and are looping through
2928 		 *		all of the block groups again.
2929 		 */
2930 		if (loop == 0) {
2931 			head = &space_info->block_groups;
2932 			cur = head->next;
2933 			loop++;
2934 		} else if (loop == 1 && cur == head) {
2935 			int keep_going;
2936 
2937 			/* at this point we give up on the empty_size
2938 			 * allocations and just try to allocate the min
2939 			 * space.
2940 			 *
2941 			 * The extra_loop field was set if an empty_size
2942 			 * allocation was attempted above, and if this
2943 			 * is try we need to try the loop again without
2944 			 * the additional empty_size.
2945 			 */
2946 			total_needed -= empty_size;
2947 			empty_size = 0;
2948 			keep_going = extra_loop;
2949 			loop++;
2950 
2951 			if (allowed_chunk_alloc && !chunk_alloc_done) {
2952 				up_read(&space_info->groups_sem);
2953 				ret = do_chunk_alloc(trans, root, num_bytes +
2954 						     2 * 1024 * 1024, data, 1);
2955 				down_read(&space_info->groups_sem);
2956 				if (ret < 0)
2957 					goto loop_check;
2958 				head = &space_info->block_groups;
2959 				/*
2960 				 * we've allocated a new chunk, keep
2961 				 * trying
2962 				 */
2963 				keep_going = 1;
2964 				chunk_alloc_done = 1;
2965 			} else if (!allowed_chunk_alloc) {
2966 				space_info->force_alloc = 1;
2967 			}
2968 loop_check:
2969 			if (keep_going) {
2970 				cur = head->next;
2971 				extra_loop = 0;
2972 			} else {
2973 				break;
2974 			}
2975 		} else if (cur == head) {
2976 			break;
2977 		}
2978 
2979 		block_group = list_entry(cur, struct btrfs_block_group_cache,
2980 					 list);
2981 		atomic_inc(&block_group->count);
2982 
2983 		search_start = block_group->key.objectid;
2984 		cur = cur->next;
2985 	}
2986 
2987 	/* we found what we needed */
2988 	if (ins->objectid) {
2989 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
2990 			trans->block_group = block_group->key.objectid;
2991 
2992 		if (last_ptr)
2993 			*last_ptr = ins->objectid + ins->offset;
2994 		ret = 0;
2995 	} else if (!ret) {
2996 		printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
2997 		       " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
2998 		       loop, allowed_chunk_alloc);
2999 		ret = -ENOSPC;
3000 	}
3001 	if (block_group)
3002 		put_block_group(block_group);
3003 
3004 	up_read(&space_info->groups_sem);
3005 	return ret;
3006 }
3007 
3008 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3009 {
3010 	struct btrfs_block_group_cache *cache;
3011 	struct list_head *l;
3012 
3013 	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
3014 	       info->total_bytes - info->bytes_used - info->bytes_pinned -
3015 	       info->bytes_reserved, (info->full) ? "" : "not ");
3016 
3017 	down_read(&info->groups_sem);
3018 	list_for_each(l, &info->block_groups) {
3019 		cache = list_entry(l, struct btrfs_block_group_cache, list);
3020 		spin_lock(&cache->lock);
3021 		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
3022 		       "%Lu pinned %Lu reserved\n",
3023 		       cache->key.objectid, cache->key.offset,
3024 		       btrfs_block_group_used(&cache->item),
3025 		       cache->pinned, cache->reserved);
3026 		btrfs_dump_free_space(cache, bytes);
3027 		spin_unlock(&cache->lock);
3028 	}
3029 	up_read(&info->groups_sem);
3030 }
3031 
3032 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3033 				  struct btrfs_root *root,
3034 				  u64 num_bytes, u64 min_alloc_size,
3035 				  u64 empty_size, u64 hint_byte,
3036 				  u64 search_end, struct btrfs_key *ins,
3037 				  u64 data)
3038 {
3039 	int ret;
3040 	u64 search_start = 0;
3041 	u64 alloc_profile;
3042 	struct btrfs_fs_info *info = root->fs_info;
3043 
3044 	if (data) {
3045 		alloc_profile = info->avail_data_alloc_bits &
3046 			        info->data_alloc_profile;
3047 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3048 	} else if (root == root->fs_info->chunk_root) {
3049 		alloc_profile = info->avail_system_alloc_bits &
3050 			        info->system_alloc_profile;
3051 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3052 	} else {
3053 		alloc_profile = info->avail_metadata_alloc_bits &
3054 			        info->metadata_alloc_profile;
3055 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3056 	}
3057 again:
3058 	data = btrfs_reduce_alloc_profile(root, data);
3059 	/*
3060 	 * the only place that sets empty_size is btrfs_realloc_node, which
3061 	 * is not called recursively on allocations
3062 	 */
3063 	if (empty_size || root->ref_cows) {
3064 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3065 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3066 				     2 * 1024 * 1024,
3067 				     BTRFS_BLOCK_GROUP_METADATA |
3068 				     (info->metadata_alloc_profile &
3069 				      info->avail_metadata_alloc_bits), 0);
3070 		}
3071 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3072 				     num_bytes + 2 * 1024 * 1024, data, 0);
3073 	}
3074 
3075 	WARN_ON(num_bytes < root->sectorsize);
3076 	ret = find_free_extent(trans, root, num_bytes, empty_size,
3077 			       search_start, search_end, hint_byte, ins,
3078 			       trans->alloc_exclude_start,
3079 			       trans->alloc_exclude_nr, data);
3080 
3081 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3082 		num_bytes = num_bytes >> 1;
3083 		num_bytes = num_bytes & ~(root->sectorsize - 1);
3084 		num_bytes = max(num_bytes, min_alloc_size);
3085 		do_chunk_alloc(trans, root->fs_info->extent_root,
3086 			       num_bytes, data, 1);
3087 		goto again;
3088 	}
3089 	if (ret) {
3090 		struct btrfs_space_info *sinfo;
3091 
3092 		sinfo = __find_space_info(root->fs_info, data);
3093 		printk("allocation failed flags %Lu, wanted %Lu\n",
3094 		       data, num_bytes);
3095 		dump_space_info(sinfo, num_bytes);
3096 		BUG();
3097 	}
3098 
3099 	return ret;
3100 }
3101 
3102 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3103 {
3104 	struct btrfs_block_group_cache *cache;
3105 
3106 	cache = btrfs_lookup_block_group(root->fs_info, start);
3107 	if (!cache) {
3108 		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
3109 		return -ENOSPC;
3110 	}
3111 	btrfs_add_free_space(cache, start, len);
3112 	put_block_group(cache);
3113 	update_reserved_extents(root, start, len, 0);
3114 	return 0;
3115 }
3116 
3117 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3118 				  struct btrfs_root *root,
3119 				  u64 num_bytes, u64 min_alloc_size,
3120 				  u64 empty_size, u64 hint_byte,
3121 				  u64 search_end, struct btrfs_key *ins,
3122 				  u64 data)
3123 {
3124 	int ret;
3125 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3126 				     empty_size, hint_byte, search_end, ins,
3127 				     data);
3128 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3129 	return ret;
3130 }
3131 
3132 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3133 					 struct btrfs_root *root, u64 parent,
3134 					 u64 root_objectid, u64 ref_generation,
3135 					 u64 owner, struct btrfs_key *ins)
3136 {
3137 	int ret;
3138 	int pending_ret;
3139 	u64 super_used;
3140 	u64 root_used;
3141 	u64 num_bytes = ins->offset;
3142 	u32 sizes[2];
3143 	struct btrfs_fs_info *info = root->fs_info;
3144 	struct btrfs_root *extent_root = info->extent_root;
3145 	struct btrfs_extent_item *extent_item;
3146 	struct btrfs_extent_ref *ref;
3147 	struct btrfs_path *path;
3148 	struct btrfs_key keys[2];
3149 
3150 	if (parent == 0)
3151 		parent = ins->objectid;
3152 
3153 	/* block accounting for super block */
3154 	spin_lock(&info->delalloc_lock);
3155 	super_used = btrfs_super_bytes_used(&info->super_copy);
3156 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3157 	spin_unlock(&info->delalloc_lock);
3158 
3159 	/* block accounting for root item */
3160 	root_used = btrfs_root_used(&root->root_item);
3161 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3162 
3163 	if (root == extent_root) {
3164 		struct pending_extent_op *extent_op;
3165 
3166 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3167 		BUG_ON(!extent_op);
3168 
3169 		extent_op->type = PENDING_EXTENT_INSERT;
3170 		extent_op->bytenr = ins->objectid;
3171 		extent_op->num_bytes = ins->offset;
3172 		extent_op->parent = parent;
3173 		extent_op->orig_parent = 0;
3174 		extent_op->generation = ref_generation;
3175 		extent_op->orig_generation = 0;
3176 		extent_op->level = (int)owner;
3177 		INIT_LIST_HEAD(&extent_op->list);
3178 		extent_op->del = 0;
3179 
3180 		mutex_lock(&root->fs_info->extent_ins_mutex);
3181 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3182 				ins->objectid + ins->offset - 1,
3183 				EXTENT_WRITEBACK, GFP_NOFS);
3184 		set_state_private(&root->fs_info->extent_ins,
3185 				  ins->objectid, (unsigned long)extent_op);
3186 		mutex_unlock(&root->fs_info->extent_ins_mutex);
3187 		goto update_block;
3188 	}
3189 
3190 	memcpy(&keys[0], ins, sizeof(*ins));
3191 	keys[1].objectid = ins->objectid;
3192 	keys[1].type = BTRFS_EXTENT_REF_KEY;
3193 	keys[1].offset = parent;
3194 	sizes[0] = sizeof(*extent_item);
3195 	sizes[1] = sizeof(*ref);
3196 
3197 	path = btrfs_alloc_path();
3198 	BUG_ON(!path);
3199 
3200 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3201 				       sizes, 2);
3202 	BUG_ON(ret);
3203 
3204 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3205 				     struct btrfs_extent_item);
3206 	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3207 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3208 			     struct btrfs_extent_ref);
3209 
3210 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3211 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3212 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3213 	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3214 
3215 	btrfs_mark_buffer_dirty(path->nodes[0]);
3216 
3217 	trans->alloc_exclude_start = 0;
3218 	trans->alloc_exclude_nr = 0;
3219 	btrfs_free_path(path);
3220 	finish_current_insert(trans, extent_root, 0);
3221 	pending_ret = del_pending_extents(trans, extent_root, 0);
3222 
3223 	if (ret)
3224 		goto out;
3225 	if (pending_ret) {
3226 		ret = pending_ret;
3227 		goto out;
3228 	}
3229 
3230 update_block:
3231 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
3232 	if (ret) {
3233 		printk("update block group failed for %Lu %Lu\n",
3234 		       ins->objectid, ins->offset);
3235 		BUG();
3236 	}
3237 out:
3238 	return ret;
3239 }
3240 
3241 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3242 				struct btrfs_root *root, u64 parent,
3243 				u64 root_objectid, u64 ref_generation,
3244 				u64 owner, struct btrfs_key *ins)
3245 {
3246 	int ret;
3247 
3248 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3249 		return 0;
3250 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3251 					    ref_generation, owner, ins);
3252 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
3253 	return ret;
3254 }
3255 
3256 /*
3257  * this is used by the tree logging recovery code.  It records that
3258  * an extent has been allocated and makes sure to clear the free
3259  * space cache bits as well
3260  */
3261 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3262 				struct btrfs_root *root, u64 parent,
3263 				u64 root_objectid, u64 ref_generation,
3264 				u64 owner, struct btrfs_key *ins)
3265 {
3266 	int ret;
3267 	struct btrfs_block_group_cache *block_group;
3268 
3269 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3270 	mutex_lock(&block_group->cache_mutex);
3271 	cache_block_group(root, block_group);
3272 	mutex_unlock(&block_group->cache_mutex);
3273 
3274 	ret = btrfs_remove_free_space(block_group, ins->objectid,
3275 				      ins->offset);
3276 	BUG_ON(ret);
3277 	put_block_group(block_group);
3278 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3279 					    ref_generation, owner, ins);
3280 	return ret;
3281 }
3282 
3283 /*
3284  * finds a free extent and does all the dirty work required for allocation
3285  * returns the key for the extent through ins, and a tree buffer for
3286  * the first block of the extent through buf.
3287  *
3288  * returns 0 if everything worked, non-zero otherwise.
3289  */
3290 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3291 		       struct btrfs_root *root,
3292 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
3293 		       u64 root_objectid, u64 ref_generation,
3294 		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
3295 		       u64 search_end, struct btrfs_key *ins, u64 data)
3296 {
3297 	int ret;
3298 
3299 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
3300 				     min_alloc_size, empty_size, hint_byte,
3301 				     search_end, ins, data);
3302 	BUG_ON(ret);
3303 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3304 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3305 					root_objectid, ref_generation,
3306 					owner_objectid, ins);
3307 		BUG_ON(ret);
3308 
3309 	} else {
3310 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
3311 	}
3312 	return ret;
3313 }
3314 
3315 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3316 					    struct btrfs_root *root,
3317 					    u64 bytenr, u32 blocksize)
3318 {
3319 	struct extent_buffer *buf;
3320 
3321 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3322 	if (!buf)
3323 		return ERR_PTR(-ENOMEM);
3324 	btrfs_set_header_generation(buf, trans->transid);
3325 	btrfs_tree_lock(buf);
3326 	clean_tree_block(trans, root, buf);
3327 	btrfs_set_buffer_uptodate(buf);
3328 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3329 		set_extent_dirty(&root->dirty_log_pages, buf->start,
3330 			 buf->start + buf->len - 1, GFP_NOFS);
3331 	} else {
3332 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3333 			 buf->start + buf->len - 1, GFP_NOFS);
3334 	}
3335 	trans->blocks_used++;
3336 	return buf;
3337 }
3338 
3339 /*
3340  * helper function to allocate a block for a given tree
3341  * returns the tree buffer or NULL.
3342  */
3343 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3344 					     struct btrfs_root *root,
3345 					     u32 blocksize, u64 parent,
3346 					     u64 root_objectid,
3347 					     u64 ref_generation,
3348 					     int level,
3349 					     u64 hint,
3350 					     u64 empty_size)
3351 {
3352 	struct btrfs_key ins;
3353 	int ret;
3354 	struct extent_buffer *buf;
3355 
3356 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3357 				 root_objectid, ref_generation, level,
3358 				 empty_size, hint, (u64)-1, &ins, 0);
3359 	if (ret) {
3360 		BUG_ON(ret > 0);
3361 		return ERR_PTR(ret);
3362 	}
3363 
3364 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3365 	return buf;
3366 }
3367 
3368 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3369 			struct btrfs_root *root, struct extent_buffer *leaf)
3370 {
3371 	u64 leaf_owner;
3372 	u64 leaf_generation;
3373 	struct btrfs_key key;
3374 	struct btrfs_file_extent_item *fi;
3375 	int i;
3376 	int nritems;
3377 	int ret;
3378 
3379 	BUG_ON(!btrfs_is_leaf(leaf));
3380 	nritems = btrfs_header_nritems(leaf);
3381 	leaf_owner = btrfs_header_owner(leaf);
3382 	leaf_generation = btrfs_header_generation(leaf);
3383 
3384 	for (i = 0; i < nritems; i++) {
3385 		u64 disk_bytenr;
3386 		cond_resched();
3387 
3388 		btrfs_item_key_to_cpu(leaf, &key, i);
3389 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3390 			continue;
3391 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3392 		if (btrfs_file_extent_type(leaf, fi) ==
3393 		    BTRFS_FILE_EXTENT_INLINE)
3394 			continue;
3395 		/*
3396 		 * FIXME make sure to insert a trans record that
3397 		 * repeats the snapshot del on crash
3398 		 */
3399 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3400 		if (disk_bytenr == 0)
3401 			continue;
3402 
3403 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
3404 				btrfs_file_extent_disk_num_bytes(leaf, fi),
3405 				leaf->start, leaf_owner, leaf_generation,
3406 				key.objectid, 0);
3407 		BUG_ON(ret);
3408 
3409 		atomic_inc(&root->fs_info->throttle_gen);
3410 		wake_up(&root->fs_info->transaction_throttle);
3411 		cond_resched();
3412 	}
3413 	return 0;
3414 }
3415 
3416 static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3417 					struct btrfs_root *root,
3418 					struct btrfs_leaf_ref *ref)
3419 {
3420 	int i;
3421 	int ret;
3422 	struct btrfs_extent_info *info = ref->extents;
3423 
3424 	for (i = 0; i < ref->nritems; i++) {
3425 		ret = __btrfs_free_extent(trans, root, info->bytenr,
3426 					  info->num_bytes, ref->bytenr,
3427 					  ref->owner, ref->generation,
3428 					  info->objectid, 0);
3429 
3430 		atomic_inc(&root->fs_info->throttle_gen);
3431 		wake_up(&root->fs_info->transaction_throttle);
3432 		cond_resched();
3433 
3434 		BUG_ON(ret);
3435 		info++;
3436 	}
3437 
3438 	return 0;
3439 }
3440 
3441 static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
3442 			      u32 *refs)
3443 {
3444 	int ret;
3445 
3446 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3447 	BUG_ON(ret);
3448 
3449 #if 0 // some debugging code in case we see problems here
3450 	/* if the refs count is one, it won't get increased again.  But
3451 	 * if the ref count is > 1, someone may be decreasing it at
3452 	 * the same time we are.
3453 	 */
3454 	if (*refs != 1) {
3455 		struct extent_buffer *eb = NULL;
3456 		eb = btrfs_find_create_tree_block(root, start, len);
3457 		if (eb)
3458 			btrfs_tree_lock(eb);
3459 
3460 		mutex_lock(&root->fs_info->alloc_mutex);
3461 		ret = lookup_extent_ref(NULL, root, start, len, refs);
3462 		BUG_ON(ret);
3463 		mutex_unlock(&root->fs_info->alloc_mutex);
3464 
3465 		if (eb) {
3466 			btrfs_tree_unlock(eb);
3467 			free_extent_buffer(eb);
3468 		}
3469 		if (*refs == 1) {
3470 			printk("block %llu went down to one during drop_snap\n",
3471 			       (unsigned long long)start);
3472 		}
3473 
3474 	}
3475 #endif
3476 
3477 	cond_resched();
3478 	return ret;
3479 }
3480 
3481 /*
3482  * helper function for drop_snapshot, this walks down the tree dropping ref
3483  * counts as it goes.
3484  */
3485 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
3486 				   struct btrfs_root *root,
3487 				   struct btrfs_path *path, int *level)
3488 {
3489 	u64 root_owner;
3490 	u64 root_gen;
3491 	u64 bytenr;
3492 	u64 ptr_gen;
3493 	struct extent_buffer *next;
3494 	struct extent_buffer *cur;
3495 	struct extent_buffer *parent;
3496 	struct btrfs_leaf_ref *ref;
3497 	u32 blocksize;
3498 	int ret;
3499 	u32 refs;
3500 
3501 	WARN_ON(*level < 0);
3502 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
3503 	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3504 				path->nodes[*level]->len, &refs);
3505 	BUG_ON(ret);
3506 	if (refs > 1)
3507 		goto out;
3508 
3509 	/*
3510 	 * walk down to the last node level and free all the leaves
3511 	 */
3512 	while(*level >= 0) {
3513 		WARN_ON(*level < 0);
3514 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
3515 		cur = path->nodes[*level];
3516 
3517 		if (btrfs_header_level(cur) != *level)
3518 			WARN_ON(1);
3519 
3520 		if (path->slots[*level] >=
3521 		    btrfs_header_nritems(cur))
3522 			break;
3523 		if (*level == 0) {
3524 			ret = btrfs_drop_leaf_ref(trans, root, cur);
3525 			BUG_ON(ret);
3526 			break;
3527 		}
3528 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3529 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3530 		blocksize = btrfs_level_size(root, *level - 1);
3531 
3532 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3533 		BUG_ON(ret);
3534 		if (refs != 1) {
3535 			parent = path->nodes[*level];
3536 			root_owner = btrfs_header_owner(parent);
3537 			root_gen = btrfs_header_generation(parent);
3538 			path->slots[*level]++;
3539 
3540 			ret = __btrfs_free_extent(trans, root, bytenr,
3541 						blocksize, parent->start,
3542 						root_owner, root_gen,
3543 						*level - 1, 1);
3544 			BUG_ON(ret);
3545 
3546 			atomic_inc(&root->fs_info->throttle_gen);
3547 			wake_up(&root->fs_info->transaction_throttle);
3548 			cond_resched();
3549 
3550 			continue;
3551 		}
3552 		/*
3553 		 * at this point, we have a single ref, and since the
3554 		 * only place referencing this extent is a dead root
3555 		 * the reference count should never go higher.
3556 		 * So, we don't need to check it again
3557 		 */
3558 		if (*level == 1) {
3559 			ref = btrfs_lookup_leaf_ref(root, bytenr);
3560 			if (ref && ref->generation != ptr_gen) {
3561 				btrfs_free_leaf_ref(root, ref);
3562 				ref = NULL;
3563 			}
3564 			if (ref) {
3565 				ret = cache_drop_leaf_ref(trans, root, ref);
3566 				BUG_ON(ret);
3567 				btrfs_remove_leaf_ref(root, ref);
3568 				btrfs_free_leaf_ref(root, ref);
3569 				*level = 0;
3570 				break;
3571 			}
3572 			if (printk_ratelimit()) {
3573 				printk("leaf ref miss for bytenr %llu\n",
3574 				       (unsigned long long)bytenr);
3575 			}
3576 		}
3577 		next = btrfs_find_tree_block(root, bytenr, blocksize);
3578 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3579 			free_extent_buffer(next);
3580 
3581 			next = read_tree_block(root, bytenr, blocksize,
3582 					       ptr_gen);
3583 			cond_resched();
3584 #if 0
3585 			/*
3586 			 * this is a debugging check and can go away
3587 			 * the ref should never go all the way down to 1
3588 			 * at this point
3589 			 */
3590 			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3591 						&refs);
3592 			BUG_ON(ret);
3593 			WARN_ON(refs != 1);
3594 #endif
3595 		}
3596 		WARN_ON(*level <= 0);
3597 		if (path->nodes[*level-1])
3598 			free_extent_buffer(path->nodes[*level-1]);
3599 		path->nodes[*level-1] = next;
3600 		*level = btrfs_header_level(next);
3601 		path->slots[*level] = 0;
3602 		cond_resched();
3603 	}
3604 out:
3605 	WARN_ON(*level < 0);
3606 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
3607 
3608 	if (path->nodes[*level] == root->node) {
3609 		parent = path->nodes[*level];
3610 		bytenr = path->nodes[*level]->start;
3611 	} else {
3612 		parent = path->nodes[*level + 1];
3613 		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3614 	}
3615 
3616 	blocksize = btrfs_level_size(root, *level);
3617 	root_owner = btrfs_header_owner(parent);
3618 	root_gen = btrfs_header_generation(parent);
3619 
3620 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3621 				  parent->start, root_owner, root_gen,
3622 				  *level, 1);
3623 	free_extent_buffer(path->nodes[*level]);
3624 	path->nodes[*level] = NULL;
3625 	*level += 1;
3626 	BUG_ON(ret);
3627 
3628 	cond_resched();
3629 	return 0;
3630 }
3631 
3632 /*
3633  * helper function for drop_subtree, this function is similar to
3634  * walk_down_tree. The main difference is that it checks reference
3635  * counts while tree blocks are locked.
3636  */
3637 static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
3638 				      struct btrfs_root *root,
3639 				      struct btrfs_path *path, int *level)
3640 {
3641 	struct extent_buffer *next;
3642 	struct extent_buffer *cur;
3643 	struct extent_buffer *parent;
3644 	u64 bytenr;
3645 	u64 ptr_gen;
3646 	u32 blocksize;
3647 	u32 refs;
3648 	int ret;
3649 
3650 	cur = path->nodes[*level];
3651 	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3652 				      &refs);
3653 	BUG_ON(ret);
3654 	if (refs > 1)
3655 		goto out;
3656 
3657 	while (*level >= 0) {
3658 		cur = path->nodes[*level];
3659 		if (*level == 0) {
3660 			ret = btrfs_drop_leaf_ref(trans, root, cur);
3661 			BUG_ON(ret);
3662 			clean_tree_block(trans, root, cur);
3663 			break;
3664 		}
3665 		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3666 			clean_tree_block(trans, root, cur);
3667 			break;
3668 		}
3669 
3670 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3671 		blocksize = btrfs_level_size(root, *level - 1);
3672 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3673 
3674 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3675 		btrfs_tree_lock(next);
3676 
3677 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3678 					      &refs);
3679 		BUG_ON(ret);
3680 		if (refs > 1) {
3681 			parent = path->nodes[*level];
3682 			ret = btrfs_free_extent(trans, root, bytenr,
3683 					blocksize, parent->start,
3684 					btrfs_header_owner(parent),
3685 					btrfs_header_generation(parent),
3686 					*level - 1, 1);
3687 			BUG_ON(ret);
3688 			path->slots[*level]++;
3689 			btrfs_tree_unlock(next);
3690 			free_extent_buffer(next);
3691 			continue;
3692 		}
3693 
3694 		*level = btrfs_header_level(next);
3695 		path->nodes[*level] = next;
3696 		path->slots[*level] = 0;
3697 		path->locks[*level] = 1;
3698 		cond_resched();
3699 	}
3700 out:
3701 	parent = path->nodes[*level + 1];
3702 	bytenr = path->nodes[*level]->start;
3703 	blocksize = path->nodes[*level]->len;
3704 
3705 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3706 			parent->start, btrfs_header_owner(parent),
3707 			btrfs_header_generation(parent), *level, 1);
3708 	BUG_ON(ret);
3709 
3710 	if (path->locks[*level]) {
3711 		btrfs_tree_unlock(path->nodes[*level]);
3712 		path->locks[*level] = 0;
3713 	}
3714 	free_extent_buffer(path->nodes[*level]);
3715 	path->nodes[*level] = NULL;
3716 	*level += 1;
3717 	cond_resched();
3718 	return 0;
3719 }
3720 
3721 /*
3722  * helper for dropping snapshots.  This walks back up the tree in the path
3723  * to find the first node higher up where we haven't yet gone through
3724  * all the slots
3725  */
3726 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3727 				 struct btrfs_root *root,
3728 				 struct btrfs_path *path,
3729 				 int *level, int max_level)
3730 {
3731 	u64 root_owner;
3732 	u64 root_gen;
3733 	struct btrfs_root_item *root_item = &root->root_item;
3734 	int i;
3735 	int slot;
3736 	int ret;
3737 
3738 	for (i = *level; i < max_level && path->nodes[i]; i++) {
3739 		slot = path->slots[i];
3740 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3741 			struct extent_buffer *node;
3742 			struct btrfs_disk_key disk_key;
3743 			node = path->nodes[i];
3744 			path->slots[i]++;
3745 			*level = i;
3746 			WARN_ON(*level == 0);
3747 			btrfs_node_key(node, &disk_key, path->slots[i]);
3748 			memcpy(&root_item->drop_progress,
3749 			       &disk_key, sizeof(disk_key));
3750 			root_item->drop_level = i;
3751 			return 0;
3752 		} else {
3753 			struct extent_buffer *parent;
3754 			if (path->nodes[*level] == root->node)
3755 				parent = path->nodes[*level];
3756 			else
3757 				parent = path->nodes[*level + 1];
3758 
3759 			root_owner = btrfs_header_owner(parent);
3760 			root_gen = btrfs_header_generation(parent);
3761 
3762 			clean_tree_block(trans, root, path->nodes[*level]);
3763 			ret = btrfs_free_extent(trans, root,
3764 						path->nodes[*level]->start,
3765 						path->nodes[*level]->len,
3766 						parent->start, root_owner,
3767 						root_gen, *level, 1);
3768 			BUG_ON(ret);
3769 			if (path->locks[*level]) {
3770 				btrfs_tree_unlock(path->nodes[*level]);
3771 				path->locks[*level] = 0;
3772 			}
3773 			free_extent_buffer(path->nodes[*level]);
3774 			path->nodes[*level] = NULL;
3775 			*level = i + 1;
3776 		}
3777 	}
3778 	return 1;
3779 }
3780 
3781 /*
3782  * drop the reference count on the tree rooted at 'snap'.  This traverses
3783  * the tree freeing any blocks that have a ref count of zero after being
3784  * decremented.
3785  */
3786 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3787 			*root)
3788 {
3789 	int ret = 0;
3790 	int wret;
3791 	int level;
3792 	struct btrfs_path *path;
3793 	int i;
3794 	int orig_level;
3795 	struct btrfs_root_item *root_item = &root->root_item;
3796 
3797 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3798 	path = btrfs_alloc_path();
3799 	BUG_ON(!path);
3800 
3801 	level = btrfs_header_level(root->node);
3802 	orig_level = level;
3803 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3804 		path->nodes[level] = root->node;
3805 		extent_buffer_get(root->node);
3806 		path->slots[level] = 0;
3807 	} else {
3808 		struct btrfs_key key;
3809 		struct btrfs_disk_key found_key;
3810 		struct extent_buffer *node;
3811 
3812 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3813 		level = root_item->drop_level;
3814 		path->lowest_level = level;
3815 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3816 		if (wret < 0) {
3817 			ret = wret;
3818 			goto out;
3819 		}
3820 		node = path->nodes[level];
3821 		btrfs_node_key(node, &found_key, path->slots[level]);
3822 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3823 			       sizeof(found_key)));
3824 		/*
3825 		 * unlock our path, this is safe because only this
3826 		 * function is allowed to delete this snapshot
3827 		 */
3828 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3829 			if (path->nodes[i] && path->locks[i]) {
3830 				path->locks[i] = 0;
3831 				btrfs_tree_unlock(path->nodes[i]);
3832 			}
3833 		}
3834 	}
3835 	while(1) {
3836 		wret = walk_down_tree(trans, root, path, &level);
3837 		if (wret > 0)
3838 			break;
3839 		if (wret < 0)
3840 			ret = wret;
3841 
3842 		wret = walk_up_tree(trans, root, path, &level,
3843 				    BTRFS_MAX_LEVEL);
3844 		if (wret > 0)
3845 			break;
3846 		if (wret < 0)
3847 			ret = wret;
3848 		if (trans->transaction->in_commit) {
3849 			ret = -EAGAIN;
3850 			break;
3851 		}
3852 		atomic_inc(&root->fs_info->throttle_gen);
3853 		wake_up(&root->fs_info->transaction_throttle);
3854 	}
3855 	for (i = 0; i <= orig_level; i++) {
3856 		if (path->nodes[i]) {
3857 			free_extent_buffer(path->nodes[i]);
3858 			path->nodes[i] = NULL;
3859 		}
3860 	}
3861 out:
3862 	btrfs_free_path(path);
3863 	return ret;
3864 }
3865 
3866 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3867 			struct btrfs_root *root,
3868 			struct extent_buffer *node,
3869 			struct extent_buffer *parent)
3870 {
3871 	struct btrfs_path *path;
3872 	int level;
3873 	int parent_level;
3874 	int ret = 0;
3875 	int wret;
3876 
3877 	path = btrfs_alloc_path();
3878 	BUG_ON(!path);
3879 
3880 	BUG_ON(!btrfs_tree_locked(parent));
3881 	parent_level = btrfs_header_level(parent);
3882 	extent_buffer_get(parent);
3883 	path->nodes[parent_level] = parent;
3884 	path->slots[parent_level] = btrfs_header_nritems(parent);
3885 
3886 	BUG_ON(!btrfs_tree_locked(node));
3887 	level = btrfs_header_level(node);
3888 	extent_buffer_get(node);
3889 	path->nodes[level] = node;
3890 	path->slots[level] = 0;
3891 
3892 	while (1) {
3893 		wret = walk_down_subtree(trans, root, path, &level);
3894 		if (wret < 0)
3895 			ret = wret;
3896 		if (wret != 0)
3897 			break;
3898 
3899 		wret = walk_up_tree(trans, root, path, &level, parent_level);
3900 		if (wret < 0)
3901 			ret = wret;
3902 		if (wret != 0)
3903 			break;
3904 	}
3905 
3906 	btrfs_free_path(path);
3907 	return ret;
3908 }
3909 
3910 static unsigned long calc_ra(unsigned long start, unsigned long last,
3911 			     unsigned long nr)
3912 {
3913 	return min(last, start + nr - 1);
3914 }
3915 
3916 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
3917 					 u64 len)
3918 {
3919 	u64 page_start;
3920 	u64 page_end;
3921 	unsigned long first_index;
3922 	unsigned long last_index;
3923 	unsigned long i;
3924 	struct page *page;
3925 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3926 	struct file_ra_state *ra;
3927 	struct btrfs_ordered_extent *ordered;
3928 	unsigned int total_read = 0;
3929 	unsigned int total_dirty = 0;
3930 	int ret = 0;
3931 
3932 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3933 
3934 	mutex_lock(&inode->i_mutex);
3935 	first_index = start >> PAGE_CACHE_SHIFT;
3936 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3937 
3938 	/* make sure the dirty trick played by the caller work */
3939 	ret = invalidate_inode_pages2_range(inode->i_mapping,
3940 					    first_index, last_index);
3941 	if (ret)
3942 		goto out_unlock;
3943 
3944 	file_ra_state_init(ra, inode->i_mapping);
3945 
3946 	for (i = first_index ; i <= last_index; i++) {
3947 		if (total_read % ra->ra_pages == 0) {
3948 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3949 				       calc_ra(i, last_index, ra->ra_pages));
3950 		}
3951 		total_read++;
3952 again:
3953 		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3954 			BUG_ON(1);
3955 		page = grab_cache_page(inode->i_mapping, i);
3956 		if (!page) {
3957 			ret = -ENOMEM;
3958 			goto out_unlock;
3959 		}
3960 		if (!PageUptodate(page)) {
3961 			btrfs_readpage(NULL, page);
3962 			lock_page(page);
3963 			if (!PageUptodate(page)) {
3964 				unlock_page(page);
3965 				page_cache_release(page);
3966 				ret = -EIO;
3967 				goto out_unlock;
3968 			}
3969 		}
3970 		wait_on_page_writeback(page);
3971 
3972 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3973 		page_end = page_start + PAGE_CACHE_SIZE - 1;
3974 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3975 
3976 		ordered = btrfs_lookup_ordered_extent(inode, page_start);
3977 		if (ordered) {
3978 			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3979 			unlock_page(page);
3980 			page_cache_release(page);
3981 			btrfs_start_ordered_extent(inode, ordered, 1);
3982 			btrfs_put_ordered_extent(ordered);
3983 			goto again;
3984 		}
3985 		set_page_extent_mapped(page);
3986 
3987 		btrfs_set_extent_delalloc(inode, page_start, page_end);
3988 		if (i == first_index)
3989 			set_extent_bits(io_tree, page_start, page_end,
3990 					EXTENT_BOUNDARY, GFP_NOFS);
3991 
3992 		set_page_dirty(page);
3993 		total_dirty++;
3994 
3995 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3996 		unlock_page(page);
3997 		page_cache_release(page);
3998 	}
3999 
4000 out_unlock:
4001 	kfree(ra);
4002 	mutex_unlock(&inode->i_mutex);
4003 	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4004 	return ret;
4005 }
4006 
4007 static int noinline relocate_data_extent(struct inode *reloc_inode,
4008 					 struct btrfs_key *extent_key,
4009 					 u64 offset)
4010 {
4011 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4012 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4013 	struct extent_map *em;
4014 	u64 start = extent_key->objectid - offset;
4015 	u64 end = start + extent_key->offset - 1;
4016 
4017 	em = alloc_extent_map(GFP_NOFS);
4018 	BUG_ON(!em || IS_ERR(em));
4019 
4020 	em->start = start;
4021 	em->len = extent_key->offset;
4022 	em->block_len = extent_key->offset;
4023 	em->block_start = extent_key->objectid;
4024 	em->bdev = root->fs_info->fs_devices->latest_bdev;
4025 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
4026 
4027 	/* setup extent map to cheat btrfs_readpage */
4028 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4029 	while (1) {
4030 		int ret;
4031 		spin_lock(&em_tree->lock);
4032 		ret = add_extent_mapping(em_tree, em);
4033 		spin_unlock(&em_tree->lock);
4034 		if (ret != -EEXIST) {
4035 			free_extent_map(em);
4036 			break;
4037 		}
4038 		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4039 	}
4040 	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4041 
4042 	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4043 }
4044 
4045 struct btrfs_ref_path {
4046 	u64 extent_start;
4047 	u64 nodes[BTRFS_MAX_LEVEL];
4048 	u64 root_objectid;
4049 	u64 root_generation;
4050 	u64 owner_objectid;
4051 	u32 num_refs;
4052 	int lowest_level;
4053 	int current_level;
4054 	int shared_level;
4055 
4056 	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4057 	u64 new_nodes[BTRFS_MAX_LEVEL];
4058 };
4059 
4060 struct disk_extent {
4061 	u64 ram_bytes;
4062 	u64 disk_bytenr;
4063 	u64 disk_num_bytes;
4064 	u64 offset;
4065 	u64 num_bytes;
4066 	u8 compression;
4067 	u8 encryption;
4068 	u16 other_encoding;
4069 };
4070 
4071 static int is_cowonly_root(u64 root_objectid)
4072 {
4073 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4074 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4075 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4076 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4077 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4078 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4079 		return 1;
4080 	return 0;
4081 }
4082 
4083 static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
4084 				    struct btrfs_root *extent_root,
4085 				    struct btrfs_ref_path *ref_path,
4086 				    int first_time)
4087 {
4088 	struct extent_buffer *leaf;
4089 	struct btrfs_path *path;
4090 	struct btrfs_extent_ref *ref;
4091 	struct btrfs_key key;
4092 	struct btrfs_key found_key;
4093 	u64 bytenr;
4094 	u32 nritems;
4095 	int level;
4096 	int ret = 1;
4097 
4098 	path = btrfs_alloc_path();
4099 	if (!path)
4100 		return -ENOMEM;
4101 
4102 	if (first_time) {
4103 		ref_path->lowest_level = -1;
4104 		ref_path->current_level = -1;
4105 		ref_path->shared_level = -1;
4106 		goto walk_up;
4107 	}
4108 walk_down:
4109 	level = ref_path->current_level - 1;
4110 	while (level >= -1) {
4111 		u64 parent;
4112 		if (level < ref_path->lowest_level)
4113 			break;
4114 
4115 		if (level >= 0) {
4116 			bytenr = ref_path->nodes[level];
4117 		} else {
4118 			bytenr = ref_path->extent_start;
4119 		}
4120 		BUG_ON(bytenr == 0);
4121 
4122 		parent = ref_path->nodes[level + 1];
4123 		ref_path->nodes[level + 1] = 0;
4124 		ref_path->current_level = level;
4125 		BUG_ON(parent == 0);
4126 
4127 		key.objectid = bytenr;
4128 		key.offset = parent + 1;
4129 		key.type = BTRFS_EXTENT_REF_KEY;
4130 
4131 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4132 		if (ret < 0)
4133 			goto out;
4134 		BUG_ON(ret == 0);
4135 
4136 		leaf = path->nodes[0];
4137 		nritems = btrfs_header_nritems(leaf);
4138 		if (path->slots[0] >= nritems) {
4139 			ret = btrfs_next_leaf(extent_root, path);
4140 			if (ret < 0)
4141 				goto out;
4142 			if (ret > 0)
4143 				goto next;
4144 			leaf = path->nodes[0];
4145 		}
4146 
4147 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4148 		if (found_key.objectid == bytenr &&
4149 		    found_key.type == BTRFS_EXTENT_REF_KEY) {
4150 			if (level < ref_path->shared_level)
4151 				ref_path->shared_level = level;
4152 			goto found;
4153 		}
4154 next:
4155 		level--;
4156 		btrfs_release_path(extent_root, path);
4157 		cond_resched();
4158 	}
4159 	/* reached lowest level */
4160 	ret = 1;
4161 	goto out;
4162 walk_up:
4163 	level = ref_path->current_level;
4164 	while (level < BTRFS_MAX_LEVEL - 1) {
4165 		u64 ref_objectid;
4166 		if (level >= 0) {
4167 			bytenr = ref_path->nodes[level];
4168 		} else {
4169 			bytenr = ref_path->extent_start;
4170 		}
4171 		BUG_ON(bytenr == 0);
4172 
4173 		key.objectid = bytenr;
4174 		key.offset = 0;
4175 		key.type = BTRFS_EXTENT_REF_KEY;
4176 
4177 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4178 		if (ret < 0)
4179 			goto out;
4180 
4181 		leaf = path->nodes[0];
4182 		nritems = btrfs_header_nritems(leaf);
4183 		if (path->slots[0] >= nritems) {
4184 			ret = btrfs_next_leaf(extent_root, path);
4185 			if (ret < 0)
4186 				goto out;
4187 			if (ret > 0) {
4188 				/* the extent was freed by someone */
4189 				if (ref_path->lowest_level == level)
4190 					goto out;
4191 				btrfs_release_path(extent_root, path);
4192 				goto walk_down;
4193 			}
4194 			leaf = path->nodes[0];
4195 		}
4196 
4197 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4198 		if (found_key.objectid != bytenr ||
4199 				found_key.type != BTRFS_EXTENT_REF_KEY) {
4200 			/* the extent was freed by someone */
4201 			if (ref_path->lowest_level == level) {
4202 				ret = 1;
4203 				goto out;
4204 			}
4205 			btrfs_release_path(extent_root, path);
4206 			goto walk_down;
4207 		}
4208 found:
4209 		ref = btrfs_item_ptr(leaf, path->slots[0],
4210 				struct btrfs_extent_ref);
4211 		ref_objectid = btrfs_ref_objectid(leaf, ref);
4212 		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4213 			if (first_time) {
4214 				level = (int)ref_objectid;
4215 				BUG_ON(level >= BTRFS_MAX_LEVEL);
4216 				ref_path->lowest_level = level;
4217 				ref_path->current_level = level;
4218 				ref_path->nodes[level] = bytenr;
4219 			} else {
4220 				WARN_ON(ref_objectid != level);
4221 			}
4222 		} else {
4223 			WARN_ON(level != -1);
4224 		}
4225 		first_time = 0;
4226 
4227 		if (ref_path->lowest_level == level) {
4228 			ref_path->owner_objectid = ref_objectid;
4229 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4230 		}
4231 
4232 		/*
4233 		 * the block is tree root or the block isn't in reference
4234 		 * counted tree.
4235 		 */
4236 		if (found_key.objectid == found_key.offset ||
4237 		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4238 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4239 			ref_path->root_generation =
4240 				btrfs_ref_generation(leaf, ref);
4241 			if (level < 0) {
4242 				/* special reference from the tree log */
4243 				ref_path->nodes[0] = found_key.offset;
4244 				ref_path->current_level = 0;
4245 			}
4246 			ret = 0;
4247 			goto out;
4248 		}
4249 
4250 		level++;
4251 		BUG_ON(ref_path->nodes[level] != 0);
4252 		ref_path->nodes[level] = found_key.offset;
4253 		ref_path->current_level = level;
4254 
4255 		/*
4256 		 * the reference was created in the running transaction,
4257 		 * no need to continue walking up.
4258 		 */
4259 		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4260 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4261 			ref_path->root_generation =
4262 				btrfs_ref_generation(leaf, ref);
4263 			ret = 0;
4264 			goto out;
4265 		}
4266 
4267 		btrfs_release_path(extent_root, path);
4268 		cond_resched();
4269 	}
4270 	/* reached max tree level, but no tree root found. */
4271 	BUG();
4272 out:
4273 	btrfs_free_path(path);
4274 	return ret;
4275 }
4276 
4277 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4278 				struct btrfs_root *extent_root,
4279 				struct btrfs_ref_path *ref_path,
4280 				u64 extent_start)
4281 {
4282 	memset(ref_path, 0, sizeof(*ref_path));
4283 	ref_path->extent_start = extent_start;
4284 
4285 	return __next_ref_path(trans, extent_root, ref_path, 1);
4286 }
4287 
4288 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4289 			       struct btrfs_root *extent_root,
4290 			       struct btrfs_ref_path *ref_path)
4291 {
4292 	return __next_ref_path(trans, extent_root, ref_path, 0);
4293 }
4294 
4295 static int noinline get_new_locations(struct inode *reloc_inode,
4296 				      struct btrfs_key *extent_key,
4297 				      u64 offset, int no_fragment,
4298 				      struct disk_extent **extents,
4299 				      int *nr_extents)
4300 {
4301 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4302 	struct btrfs_path *path;
4303 	struct btrfs_file_extent_item *fi;
4304 	struct extent_buffer *leaf;
4305 	struct disk_extent *exts = *extents;
4306 	struct btrfs_key found_key;
4307 	u64 cur_pos;
4308 	u64 last_byte;
4309 	u32 nritems;
4310 	int nr = 0;
4311 	int max = *nr_extents;
4312 	int ret;
4313 
4314 	WARN_ON(!no_fragment && *extents);
4315 	if (!exts) {
4316 		max = 1;
4317 		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4318 		if (!exts)
4319 			return -ENOMEM;
4320 	}
4321 
4322 	path = btrfs_alloc_path();
4323 	BUG_ON(!path);
4324 
4325 	cur_pos = extent_key->objectid - offset;
4326 	last_byte = extent_key->objectid + extent_key->offset;
4327 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4328 				       cur_pos, 0);
4329 	if (ret < 0)
4330 		goto out;
4331 	if (ret > 0) {
4332 		ret = -ENOENT;
4333 		goto out;
4334 	}
4335 
4336 	while (1) {
4337 		leaf = path->nodes[0];
4338 		nritems = btrfs_header_nritems(leaf);
4339 		if (path->slots[0] >= nritems) {
4340 			ret = btrfs_next_leaf(root, path);
4341 			if (ret < 0)
4342 				goto out;
4343 			if (ret > 0)
4344 				break;
4345 			leaf = path->nodes[0];
4346 		}
4347 
4348 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4349 		if (found_key.offset != cur_pos ||
4350 		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
4351 		    found_key.objectid != reloc_inode->i_ino)
4352 			break;
4353 
4354 		fi = btrfs_item_ptr(leaf, path->slots[0],
4355 				    struct btrfs_file_extent_item);
4356 		if (btrfs_file_extent_type(leaf, fi) !=
4357 		    BTRFS_FILE_EXTENT_REG ||
4358 		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4359 			break;
4360 
4361 		if (nr == max) {
4362 			struct disk_extent *old = exts;
4363 			max *= 2;
4364 			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4365 			memcpy(exts, old, sizeof(*exts) * nr);
4366 			if (old != *extents)
4367 				kfree(old);
4368 		}
4369 
4370 		exts[nr].disk_bytenr =
4371 			btrfs_file_extent_disk_bytenr(leaf, fi);
4372 		exts[nr].disk_num_bytes =
4373 			btrfs_file_extent_disk_num_bytes(leaf, fi);
4374 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4375 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4376 		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4377 		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4378 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4379 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4380 									   fi);
4381 		BUG_ON(exts[nr].offset > 0);
4382 		BUG_ON(exts[nr].compression || exts[nr].encryption);
4383 		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4384 
4385 		cur_pos += exts[nr].num_bytes;
4386 		nr++;
4387 
4388 		if (cur_pos + offset >= last_byte)
4389 			break;
4390 
4391 		if (no_fragment) {
4392 			ret = 1;
4393 			goto out;
4394 		}
4395 		path->slots[0]++;
4396 	}
4397 
4398 	WARN_ON(cur_pos + offset > last_byte);
4399 	if (cur_pos + offset < last_byte) {
4400 		ret = -ENOENT;
4401 		goto out;
4402 	}
4403 	ret = 0;
4404 out:
4405 	btrfs_free_path(path);
4406 	if (ret) {
4407 		if (exts != *extents)
4408 			kfree(exts);
4409 	} else {
4410 		*extents = exts;
4411 		*nr_extents = nr;
4412 	}
4413 	return ret;
4414 }
4415 
4416 static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
4417 					struct btrfs_root *root,
4418 					struct btrfs_path *path,
4419 					struct btrfs_key *extent_key,
4420 					struct btrfs_key *leaf_key,
4421 					struct btrfs_ref_path *ref_path,
4422 					struct disk_extent *new_extents,
4423 					int nr_extents)
4424 {
4425 	struct extent_buffer *leaf;
4426 	struct btrfs_file_extent_item *fi;
4427 	struct inode *inode = NULL;
4428 	struct btrfs_key key;
4429 	u64 lock_start = 0;
4430 	u64 lock_end = 0;
4431 	u64 num_bytes;
4432 	u64 ext_offset;
4433 	u64 first_pos;
4434 	u32 nritems;
4435 	int nr_scaned = 0;
4436 	int extent_locked = 0;
4437 	int extent_type;
4438 	int ret;
4439 
4440 	memcpy(&key, leaf_key, sizeof(key));
4441 	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4442 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4443 		if (key.objectid < ref_path->owner_objectid ||
4444 		    (key.objectid == ref_path->owner_objectid &&
4445 		     key.type < BTRFS_EXTENT_DATA_KEY)) {
4446 			key.objectid = ref_path->owner_objectid;
4447 			key.type = BTRFS_EXTENT_DATA_KEY;
4448 			key.offset = 0;
4449 		}
4450 	}
4451 
4452 	while (1) {
4453 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4454 		if (ret < 0)
4455 			goto out;
4456 
4457 		leaf = path->nodes[0];
4458 		nritems = btrfs_header_nritems(leaf);
4459 next:
4460 		if (extent_locked && ret > 0) {
4461 			/*
4462 			 * the file extent item was modified by someone
4463 			 * before the extent got locked.
4464 			 */
4465 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4466 				      lock_end, GFP_NOFS);
4467 			extent_locked = 0;
4468 		}
4469 
4470 		if (path->slots[0] >= nritems) {
4471 			if (++nr_scaned > 2)
4472 				break;
4473 
4474 			BUG_ON(extent_locked);
4475 			ret = btrfs_next_leaf(root, path);
4476 			if (ret < 0)
4477 				goto out;
4478 			if (ret > 0)
4479 				break;
4480 			leaf = path->nodes[0];
4481 			nritems = btrfs_header_nritems(leaf);
4482 		}
4483 
4484 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4485 
4486 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4487 			if ((key.objectid > ref_path->owner_objectid) ||
4488 			    (key.objectid == ref_path->owner_objectid &&
4489 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
4490 			    (key.offset >= first_pos + extent_key->offset))
4491 				break;
4492 		}
4493 
4494 		if (inode && key.objectid != inode->i_ino) {
4495 			BUG_ON(extent_locked);
4496 			btrfs_release_path(root, path);
4497 			mutex_unlock(&inode->i_mutex);
4498 			iput(inode);
4499 			inode = NULL;
4500 			continue;
4501 		}
4502 
4503 		if (key.type != BTRFS_EXTENT_DATA_KEY) {
4504 			path->slots[0]++;
4505 			ret = 1;
4506 			goto next;
4507 		}
4508 		fi = btrfs_item_ptr(leaf, path->slots[0],
4509 				    struct btrfs_file_extent_item);
4510 		extent_type = btrfs_file_extent_type(leaf, fi);
4511 		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4512 		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4513 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4514 		     extent_key->objectid)) {
4515 			path->slots[0]++;
4516 			ret = 1;
4517 			goto next;
4518 		}
4519 
4520 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4521 		ext_offset = btrfs_file_extent_offset(leaf, fi);
4522 
4523 		if (first_pos > key.offset - ext_offset)
4524 			first_pos = key.offset - ext_offset;
4525 
4526 		if (!extent_locked) {
4527 			lock_start = key.offset;
4528 			lock_end = lock_start + num_bytes - 1;
4529 		} else {
4530 			if (lock_start > key.offset ||
4531 			    lock_end + 1 < key.offset + num_bytes) {
4532 				unlock_extent(&BTRFS_I(inode)->io_tree,
4533 					      lock_start, lock_end, GFP_NOFS);
4534 				extent_locked = 0;
4535 			}
4536 		}
4537 
4538 		if (!inode) {
4539 			btrfs_release_path(root, path);
4540 
4541 			inode = btrfs_iget_locked(root->fs_info->sb,
4542 						  key.objectid, root);
4543 			if (inode->i_state & I_NEW) {
4544 				BTRFS_I(inode)->root = root;
4545 				BTRFS_I(inode)->location.objectid =
4546 					key.objectid;
4547 				BTRFS_I(inode)->location.type =
4548 					BTRFS_INODE_ITEM_KEY;
4549 				BTRFS_I(inode)->location.offset = 0;
4550 				btrfs_read_locked_inode(inode);
4551 				unlock_new_inode(inode);
4552 			}
4553 			/*
4554 			 * some code call btrfs_commit_transaction while
4555 			 * holding the i_mutex, so we can't use mutex_lock
4556 			 * here.
4557 			 */
4558 			if (is_bad_inode(inode) ||
4559 			    !mutex_trylock(&inode->i_mutex)) {
4560 				iput(inode);
4561 				inode = NULL;
4562 				key.offset = (u64)-1;
4563 				goto skip;
4564 			}
4565 		}
4566 
4567 		if (!extent_locked) {
4568 			struct btrfs_ordered_extent *ordered;
4569 
4570 			btrfs_release_path(root, path);
4571 
4572 			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4573 				    lock_end, GFP_NOFS);
4574 			ordered = btrfs_lookup_first_ordered_extent(inode,
4575 								    lock_end);
4576 			if (ordered &&
4577 			    ordered->file_offset <= lock_end &&
4578 			    ordered->file_offset + ordered->len > lock_start) {
4579 				unlock_extent(&BTRFS_I(inode)->io_tree,
4580 					      lock_start, lock_end, GFP_NOFS);
4581 				btrfs_start_ordered_extent(inode, ordered, 1);
4582 				btrfs_put_ordered_extent(ordered);
4583 				key.offset += num_bytes;
4584 				goto skip;
4585 			}
4586 			if (ordered)
4587 				btrfs_put_ordered_extent(ordered);
4588 
4589 			extent_locked = 1;
4590 			continue;
4591 		}
4592 
4593 		if (nr_extents == 1) {
4594 			/* update extent pointer in place */
4595 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
4596 						new_extents[0].disk_bytenr);
4597 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4598 						new_extents[0].disk_num_bytes);
4599 			btrfs_mark_buffer_dirty(leaf);
4600 
4601 			btrfs_drop_extent_cache(inode, key.offset,
4602 						key.offset + num_bytes - 1, 0);
4603 
4604 			ret = btrfs_inc_extent_ref(trans, root,
4605 						new_extents[0].disk_bytenr,
4606 						new_extents[0].disk_num_bytes,
4607 						leaf->start,
4608 						root->root_key.objectid,
4609 						trans->transid,
4610 						key.objectid);
4611 			BUG_ON(ret);
4612 
4613 			ret = btrfs_free_extent(trans, root,
4614 						extent_key->objectid,
4615 						extent_key->offset,
4616 						leaf->start,
4617 						btrfs_header_owner(leaf),
4618 						btrfs_header_generation(leaf),
4619 						key.objectid, 0);
4620 			BUG_ON(ret);
4621 
4622 			btrfs_release_path(root, path);
4623 			key.offset += num_bytes;
4624 		} else {
4625 			BUG_ON(1);
4626 #if 0
4627 			u64 alloc_hint;
4628 			u64 extent_len;
4629 			int i;
4630 			/*
4631 			 * drop old extent pointer at first, then insert the
4632 			 * new pointers one bye one
4633 			 */
4634 			btrfs_release_path(root, path);
4635 			ret = btrfs_drop_extents(trans, root, inode, key.offset,
4636 						 key.offset + num_bytes,
4637 						 key.offset, &alloc_hint);
4638 			BUG_ON(ret);
4639 
4640 			for (i = 0; i < nr_extents; i++) {
4641 				if (ext_offset >= new_extents[i].num_bytes) {
4642 					ext_offset -= new_extents[i].num_bytes;
4643 					continue;
4644 				}
4645 				extent_len = min(new_extents[i].num_bytes -
4646 						 ext_offset, num_bytes);
4647 
4648 				ret = btrfs_insert_empty_item(trans, root,
4649 							      path, &key,
4650 							      sizeof(*fi));
4651 				BUG_ON(ret);
4652 
4653 				leaf = path->nodes[0];
4654 				fi = btrfs_item_ptr(leaf, path->slots[0],
4655 						struct btrfs_file_extent_item);
4656 				btrfs_set_file_extent_generation(leaf, fi,
4657 							trans->transid);
4658 				btrfs_set_file_extent_type(leaf, fi,
4659 							BTRFS_FILE_EXTENT_REG);
4660 				btrfs_set_file_extent_disk_bytenr(leaf, fi,
4661 						new_extents[i].disk_bytenr);
4662 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4663 						new_extents[i].disk_num_bytes);
4664 				btrfs_set_file_extent_ram_bytes(leaf, fi,
4665 						new_extents[i].ram_bytes);
4666 
4667 				btrfs_set_file_extent_compression(leaf, fi,
4668 						new_extents[i].compression);
4669 				btrfs_set_file_extent_encryption(leaf, fi,
4670 						new_extents[i].encryption);
4671 				btrfs_set_file_extent_other_encoding(leaf, fi,
4672 						new_extents[i].other_encoding);
4673 
4674 				btrfs_set_file_extent_num_bytes(leaf, fi,
4675 							extent_len);
4676 				ext_offset += new_extents[i].offset;
4677 				btrfs_set_file_extent_offset(leaf, fi,
4678 							ext_offset);
4679 				btrfs_mark_buffer_dirty(leaf);
4680 
4681 				btrfs_drop_extent_cache(inode, key.offset,
4682 						key.offset + extent_len - 1, 0);
4683 
4684 				ret = btrfs_inc_extent_ref(trans, root,
4685 						new_extents[i].disk_bytenr,
4686 						new_extents[i].disk_num_bytes,
4687 						leaf->start,
4688 						root->root_key.objectid,
4689 						trans->transid, key.objectid);
4690 				BUG_ON(ret);
4691 				btrfs_release_path(root, path);
4692 
4693 				inode_add_bytes(inode, extent_len);
4694 
4695 				ext_offset = 0;
4696 				num_bytes -= extent_len;
4697 				key.offset += extent_len;
4698 
4699 				if (num_bytes == 0)
4700 					break;
4701 			}
4702 			BUG_ON(i >= nr_extents);
4703 #endif
4704 		}
4705 
4706 		if (extent_locked) {
4707 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4708 				      lock_end, GFP_NOFS);
4709 			extent_locked = 0;
4710 		}
4711 skip:
4712 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4713 		    key.offset >= first_pos + extent_key->offset)
4714 			break;
4715 
4716 		cond_resched();
4717 	}
4718 	ret = 0;
4719 out:
4720 	btrfs_release_path(root, path);
4721 	if (inode) {
4722 		mutex_unlock(&inode->i_mutex);
4723 		if (extent_locked) {
4724 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4725 				      lock_end, GFP_NOFS);
4726 		}
4727 		iput(inode);
4728 	}
4729 	return ret;
4730 }
4731 
4732 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4733 			       struct btrfs_root *root,
4734 			       struct extent_buffer *buf, u64 orig_start)
4735 {
4736 	int level;
4737 	int ret;
4738 
4739 	BUG_ON(btrfs_header_generation(buf) != trans->transid);
4740 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4741 
4742 	level = btrfs_header_level(buf);
4743 	if (level == 0) {
4744 		struct btrfs_leaf_ref *ref;
4745 		struct btrfs_leaf_ref *orig_ref;
4746 
4747 		orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4748 		if (!orig_ref)
4749 			return -ENOENT;
4750 
4751 		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4752 		if (!ref) {
4753 			btrfs_free_leaf_ref(root, orig_ref);
4754 			return -ENOMEM;
4755 		}
4756 
4757 		ref->nritems = orig_ref->nritems;
4758 		memcpy(ref->extents, orig_ref->extents,
4759 			sizeof(ref->extents[0]) * ref->nritems);
4760 
4761 		btrfs_free_leaf_ref(root, orig_ref);
4762 
4763 		ref->root_gen = trans->transid;
4764 		ref->bytenr = buf->start;
4765 		ref->owner = btrfs_header_owner(buf);
4766 		ref->generation = btrfs_header_generation(buf);
4767 		ret = btrfs_add_leaf_ref(root, ref, 0);
4768 		WARN_ON(ret);
4769 		btrfs_free_leaf_ref(root, ref);
4770 	}
4771 	return 0;
4772 }
4773 
4774 static int noinline invalidate_extent_cache(struct btrfs_root *root,
4775 					struct extent_buffer *leaf,
4776 					struct btrfs_block_group_cache *group,
4777 					struct btrfs_root *target_root)
4778 {
4779 	struct btrfs_key key;
4780 	struct inode *inode = NULL;
4781 	struct btrfs_file_extent_item *fi;
4782 	u64 num_bytes;
4783 	u64 skip_objectid = 0;
4784 	u32 nritems;
4785 	u32 i;
4786 
4787 	nritems = btrfs_header_nritems(leaf);
4788 	for (i = 0; i < nritems; i++) {
4789 		btrfs_item_key_to_cpu(leaf, &key, i);
4790 		if (key.objectid == skip_objectid ||
4791 		    key.type != BTRFS_EXTENT_DATA_KEY)
4792 			continue;
4793 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4794 		if (btrfs_file_extent_type(leaf, fi) ==
4795 		    BTRFS_FILE_EXTENT_INLINE)
4796 			continue;
4797 		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4798 			continue;
4799 		if (!inode || inode->i_ino != key.objectid) {
4800 			iput(inode);
4801 			inode = btrfs_ilookup(target_root->fs_info->sb,
4802 					      key.objectid, target_root, 1);
4803 		}
4804 		if (!inode) {
4805 			skip_objectid = key.objectid;
4806 			continue;
4807 		}
4808 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4809 
4810 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4811 			    key.offset + num_bytes - 1, GFP_NOFS);
4812 		btrfs_drop_extent_cache(inode, key.offset,
4813 					key.offset + num_bytes - 1, 1);
4814 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4815 			      key.offset + num_bytes - 1, GFP_NOFS);
4816 		cond_resched();
4817 	}
4818 	iput(inode);
4819 	return 0;
4820 }
4821 
4822 static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4823 					struct btrfs_root *root,
4824 					struct extent_buffer *leaf,
4825 					struct btrfs_block_group_cache *group,
4826 					struct inode *reloc_inode)
4827 {
4828 	struct btrfs_key key;
4829 	struct btrfs_key extent_key;
4830 	struct btrfs_file_extent_item *fi;
4831 	struct btrfs_leaf_ref *ref;
4832 	struct disk_extent *new_extent;
4833 	u64 bytenr;
4834 	u64 num_bytes;
4835 	u32 nritems;
4836 	u32 i;
4837 	int ext_index;
4838 	int nr_extent;
4839 	int ret;
4840 
4841 	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4842 	BUG_ON(!new_extent);
4843 
4844 	ref = btrfs_lookup_leaf_ref(root, leaf->start);
4845 	BUG_ON(!ref);
4846 
4847 	ext_index = -1;
4848 	nritems = btrfs_header_nritems(leaf);
4849 	for (i = 0; i < nritems; i++) {
4850 		btrfs_item_key_to_cpu(leaf, &key, i);
4851 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4852 			continue;
4853 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4854 		if (btrfs_file_extent_type(leaf, fi) ==
4855 		    BTRFS_FILE_EXTENT_INLINE)
4856 			continue;
4857 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4858 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4859 		if (bytenr == 0)
4860 			continue;
4861 
4862 		ext_index++;
4863 		if (bytenr >= group->key.objectid + group->key.offset ||
4864 		    bytenr + num_bytes <= group->key.objectid)
4865 			continue;
4866 
4867 		extent_key.objectid = bytenr;
4868 		extent_key.offset = num_bytes;
4869 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4870 		nr_extent = 1;
4871 		ret = get_new_locations(reloc_inode, &extent_key,
4872 					group->key.objectid, 1,
4873 					&new_extent, &nr_extent);
4874 		if (ret > 0)
4875 			continue;
4876 		BUG_ON(ret < 0);
4877 
4878 		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4879 		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4880 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4881 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4882 
4883 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
4884 						new_extent->disk_bytenr);
4885 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4886 						new_extent->disk_num_bytes);
4887 		btrfs_mark_buffer_dirty(leaf);
4888 
4889 		ret = btrfs_inc_extent_ref(trans, root,
4890 					new_extent->disk_bytenr,
4891 					new_extent->disk_num_bytes,
4892 					leaf->start,
4893 					root->root_key.objectid,
4894 					trans->transid, key.objectid);
4895 		BUG_ON(ret);
4896 		ret = btrfs_free_extent(trans, root,
4897 					bytenr, num_bytes, leaf->start,
4898 					btrfs_header_owner(leaf),
4899 					btrfs_header_generation(leaf),
4900 					key.objectid, 0);
4901 		BUG_ON(ret);
4902 		cond_resched();
4903 	}
4904 	kfree(new_extent);
4905 	BUG_ON(ext_index + 1 != ref->nritems);
4906 	btrfs_free_leaf_ref(root, ref);
4907 	return 0;
4908 }
4909 
4910 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4911 			  struct btrfs_root *root)
4912 {
4913 	struct btrfs_root *reloc_root;
4914 	int ret;
4915 
4916 	if (root->reloc_root) {
4917 		reloc_root = root->reloc_root;
4918 		root->reloc_root = NULL;
4919 		list_add(&reloc_root->dead_list,
4920 			 &root->fs_info->dead_reloc_roots);
4921 
4922 		btrfs_set_root_bytenr(&reloc_root->root_item,
4923 				      reloc_root->node->start);
4924 		btrfs_set_root_level(&root->root_item,
4925 				     btrfs_header_level(reloc_root->node));
4926 		memset(&reloc_root->root_item.drop_progress, 0,
4927 			sizeof(struct btrfs_disk_key));
4928 		reloc_root->root_item.drop_level = 0;
4929 
4930 		ret = btrfs_update_root(trans, root->fs_info->tree_root,
4931 					&reloc_root->root_key,
4932 					&reloc_root->root_item);
4933 		BUG_ON(ret);
4934 	}
4935 	return 0;
4936 }
4937 
4938 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4939 {
4940 	struct btrfs_trans_handle *trans;
4941 	struct btrfs_root *reloc_root;
4942 	struct btrfs_root *prev_root = NULL;
4943 	struct list_head dead_roots;
4944 	int ret;
4945 	unsigned long nr;
4946 
4947 	INIT_LIST_HEAD(&dead_roots);
4948 	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4949 
4950 	while (!list_empty(&dead_roots)) {
4951 		reloc_root = list_entry(dead_roots.prev,
4952 					struct btrfs_root, dead_list);
4953 		list_del_init(&reloc_root->dead_list);
4954 
4955 		BUG_ON(reloc_root->commit_root != NULL);
4956 		while (1) {
4957 			trans = btrfs_join_transaction(root, 1);
4958 			BUG_ON(!trans);
4959 
4960 			mutex_lock(&root->fs_info->drop_mutex);
4961 			ret = btrfs_drop_snapshot(trans, reloc_root);
4962 			if (ret != -EAGAIN)
4963 				break;
4964 			mutex_unlock(&root->fs_info->drop_mutex);
4965 
4966 			nr = trans->blocks_used;
4967 			ret = btrfs_end_transaction(trans, root);
4968 			BUG_ON(ret);
4969 			btrfs_btree_balance_dirty(root, nr);
4970 		}
4971 
4972 		free_extent_buffer(reloc_root->node);
4973 
4974 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
4975 				     &reloc_root->root_key);
4976 		BUG_ON(ret);
4977 		mutex_unlock(&root->fs_info->drop_mutex);
4978 
4979 		nr = trans->blocks_used;
4980 		ret = btrfs_end_transaction(trans, root);
4981 		BUG_ON(ret);
4982 		btrfs_btree_balance_dirty(root, nr);
4983 
4984 		kfree(prev_root);
4985 		prev_root = reloc_root;
4986 	}
4987 	if (prev_root) {
4988 		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4989 		kfree(prev_root);
4990 	}
4991 	return 0;
4992 }
4993 
4994 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
4995 {
4996 	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
4997 	return 0;
4998 }
4999 
5000 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5001 {
5002 	struct btrfs_root *reloc_root;
5003 	struct btrfs_trans_handle *trans;
5004 	struct btrfs_key location;
5005 	int found;
5006 	int ret;
5007 
5008 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5009 	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5010 	BUG_ON(ret);
5011 	found = !list_empty(&root->fs_info->dead_reloc_roots);
5012 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5013 
5014 	if (found) {
5015 		trans = btrfs_start_transaction(root, 1);
5016 		BUG_ON(!trans);
5017 		ret = btrfs_commit_transaction(trans, root);
5018 		BUG_ON(ret);
5019 	}
5020 
5021 	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5022 	location.offset = (u64)-1;
5023 	location.type = BTRFS_ROOT_ITEM_KEY;
5024 
5025 	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5026 	BUG_ON(!reloc_root);
5027 	btrfs_orphan_cleanup(reloc_root);
5028 	return 0;
5029 }
5030 
5031 static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
5032 				    struct btrfs_root *root)
5033 {
5034 	struct btrfs_root *reloc_root;
5035 	struct extent_buffer *eb;
5036 	struct btrfs_root_item *root_item;
5037 	struct btrfs_key root_key;
5038 	int ret;
5039 
5040 	BUG_ON(!root->ref_cows);
5041 	if (root->reloc_root)
5042 		return 0;
5043 
5044 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5045 	BUG_ON(!root_item);
5046 
5047 	ret = btrfs_copy_root(trans, root, root->commit_root,
5048 			      &eb, BTRFS_TREE_RELOC_OBJECTID);
5049 	BUG_ON(ret);
5050 
5051 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5052 	root_key.offset = root->root_key.objectid;
5053 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5054 
5055 	memcpy(root_item, &root->root_item, sizeof(root_item));
5056 	btrfs_set_root_refs(root_item, 0);
5057 	btrfs_set_root_bytenr(root_item, eb->start);
5058 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
5059 	btrfs_set_root_generation(root_item, trans->transid);
5060 
5061 	btrfs_tree_unlock(eb);
5062 	free_extent_buffer(eb);
5063 
5064 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5065 				&root_key, root_item);
5066 	BUG_ON(ret);
5067 	kfree(root_item);
5068 
5069 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5070 						 &root_key);
5071 	BUG_ON(!reloc_root);
5072 	reloc_root->last_trans = trans->transid;
5073 	reloc_root->commit_root = NULL;
5074 	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5075 
5076 	root->reloc_root = reloc_root;
5077 	return 0;
5078 }
5079 
5080 /*
5081  * Core function of space balance.
5082  *
5083  * The idea is using reloc trees to relocate tree blocks in reference
5084  * counted roots. There is one reloc tree for each subvol, and all
5085  * reloc trees share same root key objectid. Reloc trees are snapshots
5086  * of the latest committed roots of subvols (root->commit_root).
5087  *
5088  * To relocate a tree block referenced by a subvol, there are two steps.
5089  * COW the block through subvol's reloc tree, then update block pointer
5090  * in the subvol to point to the new block. Since all reloc trees share
5091  * same root key objectid, doing special handing for tree blocks owned
5092  * by them is easy. Once a tree block has been COWed in one reloc tree,
5093  * we can use the resulting new block directly when the same block is
5094  * required to COW again through other reloc trees. By this way, relocated
5095  * tree blocks are shared between reloc trees, so they are also shared
5096  * between subvols.
5097  */
5098 static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
5099 				      struct btrfs_root *root,
5100 				      struct btrfs_path *path,
5101 				      struct btrfs_key *first_key,
5102 				      struct btrfs_ref_path *ref_path,
5103 				      struct btrfs_block_group_cache *group,
5104 				      struct inode *reloc_inode)
5105 {
5106 	struct btrfs_root *reloc_root;
5107 	struct extent_buffer *eb = NULL;
5108 	struct btrfs_key *keys;
5109 	u64 *nodes;
5110 	int level;
5111 	int shared_level;
5112 	int lowest_level = 0;
5113 	int ret;
5114 
5115 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5116 		lowest_level = ref_path->owner_objectid;
5117 
5118 	if (!root->ref_cows) {
5119 		path->lowest_level = lowest_level;
5120 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5121 		BUG_ON(ret < 0);
5122 		path->lowest_level = 0;
5123 		btrfs_release_path(root, path);
5124 		return 0;
5125 	}
5126 
5127 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5128 	ret = init_reloc_tree(trans, root);
5129 	BUG_ON(ret);
5130 	reloc_root = root->reloc_root;
5131 
5132 	shared_level = ref_path->shared_level;
5133 	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5134 
5135 	keys = ref_path->node_keys;
5136 	nodes = ref_path->new_nodes;
5137 	memset(&keys[shared_level + 1], 0,
5138 	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5139 	memset(&nodes[shared_level + 1], 0,
5140 	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5141 
5142 	if (nodes[lowest_level] == 0) {
5143 		path->lowest_level = lowest_level;
5144 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5145 					0, 1);
5146 		BUG_ON(ret);
5147 		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5148 			eb = path->nodes[level];
5149 			if (!eb || eb == reloc_root->node)
5150 				break;
5151 			nodes[level] = eb->start;
5152 			if (level == 0)
5153 				btrfs_item_key_to_cpu(eb, &keys[level], 0);
5154 			else
5155 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
5156 		}
5157 		if (nodes[0] &&
5158 		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5159 			eb = path->nodes[0];
5160 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
5161 						      group, reloc_inode);
5162 			BUG_ON(ret);
5163 		}
5164 		btrfs_release_path(reloc_root, path);
5165 	} else {
5166 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5167 				       lowest_level);
5168 		BUG_ON(ret);
5169 	}
5170 
5171 	/*
5172 	 * replace tree blocks in the fs tree with tree blocks in
5173 	 * the reloc tree.
5174 	 */
5175 	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5176 	BUG_ON(ret < 0);
5177 
5178 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5179 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5180 					0, 0);
5181 		BUG_ON(ret);
5182 		extent_buffer_get(path->nodes[0]);
5183 		eb = path->nodes[0];
5184 		btrfs_release_path(reloc_root, path);
5185 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
5186 		BUG_ON(ret);
5187 		free_extent_buffer(eb);
5188 	}
5189 
5190 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5191 	path->lowest_level = 0;
5192 	return 0;
5193 }
5194 
5195 static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
5196 					struct btrfs_root *root,
5197 					struct btrfs_path *path,
5198 					struct btrfs_key *first_key,
5199 					struct btrfs_ref_path *ref_path)
5200 {
5201 	int ret;
5202 
5203 	ret = relocate_one_path(trans, root, path, first_key,
5204 				ref_path, NULL, NULL);
5205 	BUG_ON(ret);
5206 
5207 	if (root == root->fs_info->extent_root)
5208 		btrfs_extent_post_op(trans, root);
5209 
5210 	return 0;
5211 }
5212 
5213 static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
5214 				    struct btrfs_root *extent_root,
5215 				    struct btrfs_path *path,
5216 				    struct btrfs_key *extent_key)
5217 {
5218 	int ret;
5219 
5220 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5221 	if (ret)
5222 		goto out;
5223 	ret = btrfs_del_item(trans, extent_root, path);
5224 out:
5225 	btrfs_release_path(extent_root, path);
5226 	return ret;
5227 }
5228 
5229 static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
5230 						struct btrfs_ref_path *ref_path)
5231 {
5232 	struct btrfs_key root_key;
5233 
5234 	root_key.objectid = ref_path->root_objectid;
5235 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5236 	if (is_cowonly_root(ref_path->root_objectid))
5237 		root_key.offset = 0;
5238 	else
5239 		root_key.offset = (u64)-1;
5240 
5241 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
5242 }
5243 
5244 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
5245 					struct btrfs_path *path,
5246 					struct btrfs_key *extent_key,
5247 					struct btrfs_block_group_cache *group,
5248 					struct inode *reloc_inode, int pass)
5249 {
5250 	struct btrfs_trans_handle *trans;
5251 	struct btrfs_root *found_root;
5252 	struct btrfs_ref_path *ref_path = NULL;
5253 	struct disk_extent *new_extents = NULL;
5254 	int nr_extents = 0;
5255 	int loops;
5256 	int ret;
5257 	int level;
5258 	struct btrfs_key first_key;
5259 	u64 prev_block = 0;
5260 
5261 
5262 	trans = btrfs_start_transaction(extent_root, 1);
5263 	BUG_ON(!trans);
5264 
5265 	if (extent_key->objectid == 0) {
5266 		ret = del_extent_zero(trans, extent_root, path, extent_key);
5267 		goto out;
5268 	}
5269 
5270 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5271 	if (!ref_path) {
5272 	       ret = -ENOMEM;
5273 	       goto out;
5274 	}
5275 
5276 	for (loops = 0; ; loops++) {
5277 		if (loops == 0) {
5278 			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5279 						   extent_key->objectid);
5280 		} else {
5281 			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5282 		}
5283 		if (ret < 0)
5284 			goto out;
5285 		if (ret > 0)
5286 			break;
5287 
5288 		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5289 		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5290 			continue;
5291 
5292 		found_root = read_ref_root(extent_root->fs_info, ref_path);
5293 		BUG_ON(!found_root);
5294 		/*
5295 		 * for reference counted tree, only process reference paths
5296 		 * rooted at the latest committed root.
5297 		 */
5298 		if (found_root->ref_cows &&
5299 		    ref_path->root_generation != found_root->root_key.offset)
5300 			continue;
5301 
5302 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5303 			if (pass == 0) {
5304 				/*
5305 				 * copy data extents to new locations
5306 				 */
5307 				u64 group_start = group->key.objectid;
5308 				ret = relocate_data_extent(reloc_inode,
5309 							   extent_key,
5310 							   group_start);
5311 				if (ret < 0)
5312 					goto out;
5313 				break;
5314 			}
5315 			level = 0;
5316 		} else {
5317 			level = ref_path->owner_objectid;
5318 		}
5319 
5320 		if (prev_block != ref_path->nodes[level]) {
5321 			struct extent_buffer *eb;
5322 			u64 block_start = ref_path->nodes[level];
5323 			u64 block_size = btrfs_level_size(found_root, level);
5324 
5325 			eb = read_tree_block(found_root, block_start,
5326 					     block_size, 0);
5327 			btrfs_tree_lock(eb);
5328 			BUG_ON(level != btrfs_header_level(eb));
5329 
5330 			if (level == 0)
5331 				btrfs_item_key_to_cpu(eb, &first_key, 0);
5332 			else
5333 				btrfs_node_key_to_cpu(eb, &first_key, 0);
5334 
5335 			btrfs_tree_unlock(eb);
5336 			free_extent_buffer(eb);
5337 			prev_block = block_start;
5338 		}
5339 
5340 		btrfs_record_root_in_trans(found_root);
5341 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5342 			/*
5343 			 * try to update data extent references while
5344 			 * keeping metadata shared between snapshots.
5345 			 */
5346 			if (pass == 1) {
5347 				ret = relocate_one_path(trans, found_root,
5348 						path, &first_key, ref_path,
5349 						group, reloc_inode);
5350 				if (ret < 0)
5351 					goto out;
5352 				continue;
5353 			}
5354 			/*
5355 			 * use fallback method to process the remaining
5356 			 * references.
5357 			 */
5358 			if (!new_extents) {
5359 				u64 group_start = group->key.objectid;
5360 				new_extents = kmalloc(sizeof(*new_extents),
5361 						      GFP_NOFS);
5362 				nr_extents = 1;
5363 				ret = get_new_locations(reloc_inode,
5364 							extent_key,
5365 							group_start, 1,
5366 							&new_extents,
5367 							&nr_extents);
5368 				if (ret)
5369 					goto out;
5370 			}
5371 			ret = replace_one_extent(trans, found_root,
5372 						path, extent_key,
5373 						&first_key, ref_path,
5374 						new_extents, nr_extents);
5375 		} else {
5376 			ret = relocate_tree_block(trans, found_root, path,
5377 						  &first_key, ref_path);
5378 		}
5379 		if (ret < 0)
5380 			goto out;
5381 	}
5382 	ret = 0;
5383 out:
5384 	btrfs_end_transaction(trans, extent_root);
5385 	kfree(new_extents);
5386 	kfree(ref_path);
5387 	return ret;
5388 }
5389 
5390 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5391 {
5392 	u64 num_devices;
5393 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5394 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5395 
5396 	num_devices = root->fs_info->fs_devices->rw_devices;
5397 	if (num_devices == 1) {
5398 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5399 		stripped = flags & ~stripped;
5400 
5401 		/* turn raid0 into single device chunks */
5402 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
5403 			return stripped;
5404 
5405 		/* turn mirroring into duplication */
5406 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5407 			     BTRFS_BLOCK_GROUP_RAID10))
5408 			return stripped | BTRFS_BLOCK_GROUP_DUP;
5409 		return flags;
5410 	} else {
5411 		/* they already had raid on here, just return */
5412 		if (flags & stripped)
5413 			return flags;
5414 
5415 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5416 		stripped = flags & ~stripped;
5417 
5418 		/* switch duplicated blocks with raid1 */
5419 		if (flags & BTRFS_BLOCK_GROUP_DUP)
5420 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
5421 
5422 		/* turn single device chunks into raid0 */
5423 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
5424 	}
5425 	return flags;
5426 }
5427 
5428 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5429 		     struct btrfs_block_group_cache *shrink_block_group,
5430 		     int force)
5431 {
5432 	struct btrfs_trans_handle *trans;
5433 	u64 new_alloc_flags;
5434 	u64 calc;
5435 
5436 	spin_lock(&shrink_block_group->lock);
5437 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5438 		spin_unlock(&shrink_block_group->lock);
5439 
5440 		trans = btrfs_start_transaction(root, 1);
5441 		spin_lock(&shrink_block_group->lock);
5442 
5443 		new_alloc_flags = update_block_group_flags(root,
5444 						   shrink_block_group->flags);
5445 		if (new_alloc_flags != shrink_block_group->flags) {
5446 			calc =
5447 			     btrfs_block_group_used(&shrink_block_group->item);
5448 		} else {
5449 			calc = shrink_block_group->key.offset;
5450 		}
5451 		spin_unlock(&shrink_block_group->lock);
5452 
5453 		do_chunk_alloc(trans, root->fs_info->extent_root,
5454 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
5455 
5456 		btrfs_end_transaction(trans, root);
5457 	} else
5458 		spin_unlock(&shrink_block_group->lock);
5459 	return 0;
5460 }
5461 
5462 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5463 				 struct btrfs_root *root,
5464 				 u64 objectid, u64 size)
5465 {
5466 	struct btrfs_path *path;
5467 	struct btrfs_inode_item *item;
5468 	struct extent_buffer *leaf;
5469 	int ret;
5470 
5471 	path = btrfs_alloc_path();
5472 	if (!path)
5473 		return -ENOMEM;
5474 
5475 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5476 	if (ret)
5477 		goto out;
5478 
5479 	leaf = path->nodes[0];
5480 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5481 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5482 	btrfs_set_inode_generation(leaf, item, 1);
5483 	btrfs_set_inode_size(leaf, item, size);
5484 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5485 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5486 	btrfs_mark_buffer_dirty(leaf);
5487 	btrfs_release_path(root, path);
5488 out:
5489 	btrfs_free_path(path);
5490 	return ret;
5491 }
5492 
5493 static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
5494 					struct btrfs_block_group_cache *group)
5495 {
5496 	struct inode *inode = NULL;
5497 	struct btrfs_trans_handle *trans;
5498 	struct btrfs_root *root;
5499 	struct btrfs_key root_key;
5500 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5501 	int err = 0;
5502 
5503 	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5504 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5505 	root_key.offset = (u64)-1;
5506 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5507 	if (IS_ERR(root))
5508 		return ERR_CAST(root);
5509 
5510 	trans = btrfs_start_transaction(root, 1);
5511 	BUG_ON(!trans);
5512 
5513 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5514 	if (err)
5515 		goto out;
5516 
5517 	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5518 	BUG_ON(err);
5519 
5520 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5521 				       group->key.offset, 0, group->key.offset,
5522 				       0, 0, 0);
5523 	BUG_ON(err);
5524 
5525 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5526 	if (inode->i_state & I_NEW) {
5527 		BTRFS_I(inode)->root = root;
5528 		BTRFS_I(inode)->location.objectid = objectid;
5529 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5530 		BTRFS_I(inode)->location.offset = 0;
5531 		btrfs_read_locked_inode(inode);
5532 		unlock_new_inode(inode);
5533 		BUG_ON(is_bad_inode(inode));
5534 	} else {
5535 		BUG_ON(1);
5536 	}
5537 	BTRFS_I(inode)->index_cnt = group->key.objectid;
5538 
5539 	err = btrfs_orphan_add(trans, inode);
5540 out:
5541 	btrfs_end_transaction(trans, root);
5542 	if (err) {
5543 		if (inode)
5544 			iput(inode);
5545 		inode = ERR_PTR(err);
5546 	}
5547 	return inode;
5548 }
5549 
5550 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5551 {
5552 
5553 	struct btrfs_ordered_sum *sums;
5554 	struct btrfs_sector_sum *sector_sum;
5555 	struct btrfs_ordered_extent *ordered;
5556 	struct btrfs_root *root = BTRFS_I(inode)->root;
5557 	struct list_head list;
5558 	size_t offset;
5559 	int ret;
5560 	u64 disk_bytenr;
5561 
5562 	INIT_LIST_HEAD(&list);
5563 
5564 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5565 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5566 
5567 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5568 	ret = btrfs_lookup_csums_range(root, disk_bytenr,
5569 				       disk_bytenr + len - 1, &list);
5570 
5571 	while (!list_empty(&list)) {
5572 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5573 		list_del_init(&sums->list);
5574 
5575 		sector_sum = sums->sums;
5576 		sums->bytenr = ordered->start;
5577 
5578 		offset = 0;
5579 		while (offset < sums->len) {
5580 			sector_sum->bytenr += ordered->start - disk_bytenr;
5581 			sector_sum++;
5582 			offset += root->sectorsize;
5583 		}
5584 
5585 		btrfs_add_ordered_sum(inode, ordered, sums);
5586 	}
5587 	btrfs_put_ordered_extent(ordered);
5588 	return 0;
5589 }
5590 
5591 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5592 {
5593 	struct btrfs_trans_handle *trans;
5594 	struct btrfs_path *path;
5595 	struct btrfs_fs_info *info = root->fs_info;
5596 	struct extent_buffer *leaf;
5597 	struct inode *reloc_inode;
5598 	struct btrfs_block_group_cache *block_group;
5599 	struct btrfs_key key;
5600 	u64 skipped;
5601 	u64 cur_byte;
5602 	u64 total_found;
5603 	u32 nritems;
5604 	int ret;
5605 	int progress;
5606 	int pass = 0;
5607 
5608 	root = root->fs_info->extent_root;
5609 
5610 	block_group = btrfs_lookup_block_group(info, group_start);
5611 	BUG_ON(!block_group);
5612 
5613 	printk("btrfs relocating block group %llu flags %llu\n",
5614 	       (unsigned long long)block_group->key.objectid,
5615 	       (unsigned long long)block_group->flags);
5616 
5617 	path = btrfs_alloc_path();
5618 	BUG_ON(!path);
5619 
5620 	reloc_inode = create_reloc_inode(info, block_group);
5621 	BUG_ON(IS_ERR(reloc_inode));
5622 
5623 	__alloc_chunk_for_shrink(root, block_group, 1);
5624 	set_block_group_readonly(block_group);
5625 
5626 	btrfs_start_delalloc_inodes(info->tree_root);
5627 	btrfs_wait_ordered_extents(info->tree_root, 0);
5628 again:
5629 	skipped = 0;
5630 	total_found = 0;
5631 	progress = 0;
5632 	key.objectid = block_group->key.objectid;
5633 	key.offset = 0;
5634 	key.type = 0;
5635 	cur_byte = key.objectid;
5636 
5637 	trans = btrfs_start_transaction(info->tree_root, 1);
5638 	btrfs_commit_transaction(trans, info->tree_root);
5639 
5640 	mutex_lock(&root->fs_info->cleaner_mutex);
5641 	btrfs_clean_old_snapshots(info->tree_root);
5642 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5643 	mutex_unlock(&root->fs_info->cleaner_mutex);
5644 
5645 	while(1) {
5646 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5647 		if (ret < 0)
5648 			goto out;
5649 next:
5650 		leaf = path->nodes[0];
5651 		nritems = btrfs_header_nritems(leaf);
5652 		if (path->slots[0] >= nritems) {
5653 			ret = btrfs_next_leaf(root, path);
5654 			if (ret < 0)
5655 				goto out;
5656 			if (ret == 1) {
5657 				ret = 0;
5658 				break;
5659 			}
5660 			leaf = path->nodes[0];
5661 			nritems = btrfs_header_nritems(leaf);
5662 		}
5663 
5664 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5665 
5666 		if (key.objectid >= block_group->key.objectid +
5667 		    block_group->key.offset)
5668 			break;
5669 
5670 		if (progress && need_resched()) {
5671 			btrfs_release_path(root, path);
5672 			cond_resched();
5673 			progress = 0;
5674 			continue;
5675 		}
5676 		progress = 1;
5677 
5678 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5679 		    key.objectid + key.offset <= cur_byte) {
5680 			path->slots[0]++;
5681 			goto next;
5682 		}
5683 
5684 		total_found++;
5685 		cur_byte = key.objectid + key.offset;
5686 		btrfs_release_path(root, path);
5687 
5688 		__alloc_chunk_for_shrink(root, block_group, 0);
5689 		ret = relocate_one_extent(root, path, &key, block_group,
5690 					  reloc_inode, pass);
5691 		BUG_ON(ret < 0);
5692 		if (ret > 0)
5693 			skipped++;
5694 
5695 		key.objectid = cur_byte;
5696 		key.type = 0;
5697 		key.offset = 0;
5698 	}
5699 
5700 	btrfs_release_path(root, path);
5701 
5702 	if (pass == 0) {
5703 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5704 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5705 		WARN_ON(reloc_inode->i_mapping->nrpages);
5706 	}
5707 
5708 	if (total_found > 0) {
5709 		printk("btrfs found %llu extents in pass %d\n",
5710 		       (unsigned long long)total_found, pass);
5711 		pass++;
5712 		if (total_found == skipped && pass > 2) {
5713 			iput(reloc_inode);
5714 			reloc_inode = create_reloc_inode(info, block_group);
5715 			pass = 0;
5716 		}
5717 		goto again;
5718 	}
5719 
5720 	/* delete reloc_inode */
5721 	iput(reloc_inode);
5722 
5723 	/* unpin extents in this range */
5724 	trans = btrfs_start_transaction(info->tree_root, 1);
5725 	btrfs_commit_transaction(trans, info->tree_root);
5726 
5727 	spin_lock(&block_group->lock);
5728 	WARN_ON(block_group->pinned > 0);
5729 	WARN_ON(block_group->reserved > 0);
5730 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5731 	spin_unlock(&block_group->lock);
5732 	put_block_group(block_group);
5733 	ret = 0;
5734 out:
5735 	btrfs_free_path(path);
5736 	return ret;
5737 }
5738 
5739 static int find_first_block_group(struct btrfs_root *root,
5740 		struct btrfs_path *path, struct btrfs_key *key)
5741 {
5742 	int ret = 0;
5743 	struct btrfs_key found_key;
5744 	struct extent_buffer *leaf;
5745 	int slot;
5746 
5747 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5748 	if (ret < 0)
5749 		goto out;
5750 
5751 	while(1) {
5752 		slot = path->slots[0];
5753 		leaf = path->nodes[0];
5754 		if (slot >= btrfs_header_nritems(leaf)) {
5755 			ret = btrfs_next_leaf(root, path);
5756 			if (ret == 0)
5757 				continue;
5758 			if (ret < 0)
5759 				goto out;
5760 			break;
5761 		}
5762 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5763 
5764 		if (found_key.objectid >= key->objectid &&
5765 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5766 			ret = 0;
5767 			goto out;
5768 		}
5769 		path->slots[0]++;
5770 	}
5771 	ret = -ENOENT;
5772 out:
5773 	return ret;
5774 }
5775 
5776 int btrfs_free_block_groups(struct btrfs_fs_info *info)
5777 {
5778 	struct btrfs_block_group_cache *block_group;
5779 	struct rb_node *n;
5780 
5781 	spin_lock(&info->block_group_cache_lock);
5782 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5783 		block_group = rb_entry(n, struct btrfs_block_group_cache,
5784 				       cache_node);
5785 		rb_erase(&block_group->cache_node,
5786 			 &info->block_group_cache_tree);
5787 		spin_unlock(&info->block_group_cache_lock);
5788 
5789 		btrfs_remove_free_space_cache(block_group);
5790 		down_write(&block_group->space_info->groups_sem);
5791 		list_del(&block_group->list);
5792 		up_write(&block_group->space_info->groups_sem);
5793 
5794 		WARN_ON(atomic_read(&block_group->count) != 1);
5795 		kfree(block_group);
5796 
5797 		spin_lock(&info->block_group_cache_lock);
5798 	}
5799 	spin_unlock(&info->block_group_cache_lock);
5800 	return 0;
5801 }
5802 
5803 int btrfs_read_block_groups(struct btrfs_root *root)
5804 {
5805 	struct btrfs_path *path;
5806 	int ret;
5807 	struct btrfs_block_group_cache *cache;
5808 	struct btrfs_fs_info *info = root->fs_info;
5809 	struct btrfs_space_info *space_info;
5810 	struct btrfs_key key;
5811 	struct btrfs_key found_key;
5812 	struct extent_buffer *leaf;
5813 
5814 	root = info->extent_root;
5815 	key.objectid = 0;
5816 	key.offset = 0;
5817 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5818 	path = btrfs_alloc_path();
5819 	if (!path)
5820 		return -ENOMEM;
5821 
5822 	while(1) {
5823 		ret = find_first_block_group(root, path, &key);
5824 		if (ret > 0) {
5825 			ret = 0;
5826 			goto error;
5827 		}
5828 		if (ret != 0)
5829 			goto error;
5830 
5831 		leaf = path->nodes[0];
5832 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5833 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
5834 		if (!cache) {
5835 			ret = -ENOMEM;
5836 			break;
5837 		}
5838 
5839 		atomic_set(&cache->count, 1);
5840 		spin_lock_init(&cache->lock);
5841 		mutex_init(&cache->alloc_mutex);
5842 		mutex_init(&cache->cache_mutex);
5843 		INIT_LIST_HEAD(&cache->list);
5844 		read_extent_buffer(leaf, &cache->item,
5845 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
5846 				   sizeof(cache->item));
5847 		memcpy(&cache->key, &found_key, sizeof(found_key));
5848 
5849 		key.objectid = found_key.objectid + found_key.offset;
5850 		btrfs_release_path(root, path);
5851 		cache->flags = btrfs_block_group_flags(&cache->item);
5852 
5853 		ret = update_space_info(info, cache->flags, found_key.offset,
5854 					btrfs_block_group_used(&cache->item),
5855 					&space_info);
5856 		BUG_ON(ret);
5857 		cache->space_info = space_info;
5858 		down_write(&space_info->groups_sem);
5859 		list_add_tail(&cache->list, &space_info->block_groups);
5860 		up_write(&space_info->groups_sem);
5861 
5862 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
5863 		BUG_ON(ret);
5864 
5865 		set_avail_alloc_bits(root->fs_info, cache->flags);
5866 		if (btrfs_chunk_readonly(root, cache->key.objectid))
5867 			set_block_group_readonly(cache);
5868 	}
5869 	ret = 0;
5870 error:
5871 	btrfs_free_path(path);
5872 	return ret;
5873 }
5874 
5875 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5876 			   struct btrfs_root *root, u64 bytes_used,
5877 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
5878 			   u64 size)
5879 {
5880 	int ret;
5881 	struct btrfs_root *extent_root;
5882 	struct btrfs_block_group_cache *cache;
5883 
5884 	extent_root = root->fs_info->extent_root;
5885 
5886 	root->fs_info->last_trans_new_blockgroup = trans->transid;
5887 
5888 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
5889 	if (!cache)
5890 		return -ENOMEM;
5891 
5892 	cache->key.objectid = chunk_offset;
5893 	cache->key.offset = size;
5894 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5895 	atomic_set(&cache->count, 1);
5896 	spin_lock_init(&cache->lock);
5897 	mutex_init(&cache->alloc_mutex);
5898 	mutex_init(&cache->cache_mutex);
5899 	INIT_LIST_HEAD(&cache->list);
5900 
5901 	btrfs_set_block_group_used(&cache->item, bytes_used);
5902 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5903 	cache->flags = type;
5904 	btrfs_set_block_group_flags(&cache->item, type);
5905 
5906 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5907 				&cache->space_info);
5908 	BUG_ON(ret);
5909 	down_write(&cache->space_info->groups_sem);
5910 	list_add_tail(&cache->list, &cache->space_info->block_groups);
5911 	up_write(&cache->space_info->groups_sem);
5912 
5913 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
5914 	BUG_ON(ret);
5915 
5916 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5917 				sizeof(cache->item));
5918 	BUG_ON(ret);
5919 
5920 	finish_current_insert(trans, extent_root, 0);
5921 	ret = del_pending_extents(trans, extent_root, 0);
5922 	BUG_ON(ret);
5923 	set_avail_alloc_bits(extent_root->fs_info, type);
5924 
5925 	return 0;
5926 }
5927 
5928 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5929 			     struct btrfs_root *root, u64 group_start)
5930 {
5931 	struct btrfs_path *path;
5932 	struct btrfs_block_group_cache *block_group;
5933 	struct btrfs_key key;
5934 	int ret;
5935 
5936 	root = root->fs_info->extent_root;
5937 
5938 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5939 	BUG_ON(!block_group);
5940 	BUG_ON(!block_group->ro);
5941 
5942 	memcpy(&key, &block_group->key, sizeof(key));
5943 
5944 	path = btrfs_alloc_path();
5945 	BUG_ON(!path);
5946 
5947 	btrfs_remove_free_space_cache(block_group);
5948 	rb_erase(&block_group->cache_node,
5949 		 &root->fs_info->block_group_cache_tree);
5950 	down_write(&block_group->space_info->groups_sem);
5951 	list_del(&block_group->list);
5952 	up_write(&block_group->space_info->groups_sem);
5953 
5954 	spin_lock(&block_group->space_info->lock);
5955 	block_group->space_info->total_bytes -= block_group->key.offset;
5956 	block_group->space_info->bytes_readonly -= block_group->key.offset;
5957 	spin_unlock(&block_group->space_info->lock);
5958 	block_group->space_info->full = 0;
5959 
5960 	put_block_group(block_group);
5961 	put_block_group(block_group);
5962 
5963 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5964 	if (ret > 0)
5965 		ret = -EIO;
5966 	if (ret < 0)
5967 		goto out;
5968 
5969 	ret = btrfs_del_item(trans, root, path);
5970 out:
5971 	btrfs_free_path(path);
5972 	return ret;
5973 }
5974