xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include "compat.h"
25 #include "hash.h"
26 #include "crc32c.h"
27 #include "ctree.h"
28 #include "disk-io.h"
29 #include "print-tree.h"
30 #include "transaction.h"
31 #include "volumes.h"
32 #include "locking.h"
33 #include "ref-cache.h"
34 
35 #define PENDING_EXTENT_INSERT 0
36 #define PENDING_EXTENT_DELETE 1
37 #define PENDING_BACKREF_UPDATE 2
38 
39 struct pending_extent_op {
40 	int type;
41 	u64 bytenr;
42 	u64 num_bytes;
43 	u64 parent;
44 	u64 orig_parent;
45 	u64 generation;
46 	u64 orig_generation;
47 	int level;
48 	struct list_head list;
49 	int del;
50 };
51 
52 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 					 struct btrfs_root *root, u64 parent,
54 					 u64 root_objectid, u64 ref_generation,
55 					 u64 owner, struct btrfs_key *ins,
56 					 int ref_mod);
57 static int update_reserved_extents(struct btrfs_root *root,
58 				   u64 bytenr, u64 num, int reserve);
59 static int update_block_group(struct btrfs_trans_handle *trans,
60 			      struct btrfs_root *root,
61 			      u64 bytenr, u64 num_bytes, int alloc,
62 			      int mark_free);
63 static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
64 					struct btrfs_root *root,
65 					u64 bytenr, u64 num_bytes, u64 parent,
66 					u64 root_objectid, u64 ref_generation,
67 					u64 owner_objectid, int pin,
68 					int ref_to_drop);
69 
70 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
71 			  struct btrfs_root *extent_root, u64 alloc_bytes,
72 			  u64 flags, int force);
73 
74 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
75 {
76 	return (cache->flags & bits) == bits;
77 }
78 
79 /*
80  * this adds the block group to the fs_info rb tree for the block group
81  * cache
82  */
83 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
84 				struct btrfs_block_group_cache *block_group)
85 {
86 	struct rb_node **p;
87 	struct rb_node *parent = NULL;
88 	struct btrfs_block_group_cache *cache;
89 
90 	spin_lock(&info->block_group_cache_lock);
91 	p = &info->block_group_cache_tree.rb_node;
92 
93 	while (*p) {
94 		parent = *p;
95 		cache = rb_entry(parent, struct btrfs_block_group_cache,
96 				 cache_node);
97 		if (block_group->key.objectid < cache->key.objectid) {
98 			p = &(*p)->rb_left;
99 		} else if (block_group->key.objectid > cache->key.objectid) {
100 			p = &(*p)->rb_right;
101 		} else {
102 			spin_unlock(&info->block_group_cache_lock);
103 			return -EEXIST;
104 		}
105 	}
106 
107 	rb_link_node(&block_group->cache_node, parent, p);
108 	rb_insert_color(&block_group->cache_node,
109 			&info->block_group_cache_tree);
110 	spin_unlock(&info->block_group_cache_lock);
111 
112 	return 0;
113 }
114 
115 /*
116  * This will return the block group at or after bytenr if contains is 0, else
117  * it will return the block group that contains the bytenr
118  */
119 static struct btrfs_block_group_cache *
120 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
121 			      int contains)
122 {
123 	struct btrfs_block_group_cache *cache, *ret = NULL;
124 	struct rb_node *n;
125 	u64 end, start;
126 
127 	spin_lock(&info->block_group_cache_lock);
128 	n = info->block_group_cache_tree.rb_node;
129 
130 	while (n) {
131 		cache = rb_entry(n, struct btrfs_block_group_cache,
132 				 cache_node);
133 		end = cache->key.objectid + cache->key.offset - 1;
134 		start = cache->key.objectid;
135 
136 		if (bytenr < start) {
137 			if (!contains && (!ret || start < ret->key.objectid))
138 				ret = cache;
139 			n = n->rb_left;
140 		} else if (bytenr > start) {
141 			if (contains && bytenr <= end) {
142 				ret = cache;
143 				break;
144 			}
145 			n = n->rb_right;
146 		} else {
147 			ret = cache;
148 			break;
149 		}
150 	}
151 	if (ret)
152 		atomic_inc(&ret->count);
153 	spin_unlock(&info->block_group_cache_lock);
154 
155 	return ret;
156 }
157 
158 /*
159  * this is only called by cache_block_group; since we could have freed extents,
160  * we need to check the pinned_extents for any extents that can't be used yet
161  * because their free space will be released as soon as the transaction commits.
162  */
163 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
164 			      struct btrfs_fs_info *info, u64 start, u64 end)
165 {
166 	u64 extent_start, extent_end, size;
167 	int ret;
168 
169 	mutex_lock(&info->pinned_mutex);
170 	while (start < end) {
171 		ret = find_first_extent_bit(&info->pinned_extents, start,
172 					    &extent_start, &extent_end,
173 					    EXTENT_DIRTY);
174 		if (ret)
175 			break;
176 
177 		if (extent_start == start) {
178 			start = extent_end + 1;
179 		} else if (extent_start > start && extent_start < end) {
180 			size = extent_start - start;
181 			ret = btrfs_add_free_space(block_group, start,
182 						   size);
183 			BUG_ON(ret);
184 			start = extent_end + 1;
185 		} else {
186 			break;
187 		}
188 	}
189 
190 	if (start < end) {
191 		size = end - start;
192 		ret = btrfs_add_free_space(block_group, start, size);
193 		BUG_ON(ret);
194 	}
195 	mutex_unlock(&info->pinned_mutex);
196 
197 	return 0;
198 }
199 
200 static int remove_sb_from_cache(struct btrfs_root *root,
201 				struct btrfs_block_group_cache *cache)
202 {
203 	u64 bytenr;
204 	u64 *logical;
205 	int stripe_len;
206 	int i, nr, ret;
207 
208 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
209 		bytenr = btrfs_sb_offset(i);
210 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
211 				       cache->key.objectid, bytenr, 0,
212 				       &logical, &nr, &stripe_len);
213 		BUG_ON(ret);
214 		while (nr--) {
215 			btrfs_remove_free_space(cache, logical[nr],
216 						stripe_len);
217 		}
218 		kfree(logical);
219 	}
220 	return 0;
221 }
222 
223 static int cache_block_group(struct btrfs_root *root,
224 			     struct btrfs_block_group_cache *block_group)
225 {
226 	struct btrfs_path *path;
227 	int ret = 0;
228 	struct btrfs_key key;
229 	struct extent_buffer *leaf;
230 	int slot;
231 	u64 last;
232 
233 	if (!block_group)
234 		return 0;
235 
236 	root = root->fs_info->extent_root;
237 
238 	if (block_group->cached)
239 		return 0;
240 
241 	path = btrfs_alloc_path();
242 	if (!path)
243 		return -ENOMEM;
244 
245 	path->reada = 2;
246 	/*
247 	 * we get into deadlocks with paths held by callers of this function.
248 	 * since the alloc_mutex is protecting things right now, just
249 	 * skip the locking here
250 	 */
251 	path->skip_locking = 1;
252 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
253 	key.objectid = last;
254 	key.offset = 0;
255 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
256 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
257 	if (ret < 0)
258 		goto err;
259 
260 	while (1) {
261 		leaf = path->nodes[0];
262 		slot = path->slots[0];
263 		if (slot >= btrfs_header_nritems(leaf)) {
264 			ret = btrfs_next_leaf(root, path);
265 			if (ret < 0)
266 				goto err;
267 			if (ret == 0)
268 				continue;
269 			else
270 				break;
271 		}
272 		btrfs_item_key_to_cpu(leaf, &key, slot);
273 		if (key.objectid < block_group->key.objectid)
274 			goto next;
275 
276 		if (key.objectid >= block_group->key.objectid +
277 		    block_group->key.offset)
278 			break;
279 
280 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
281 			add_new_free_space(block_group, root->fs_info, last,
282 					   key.objectid);
283 
284 			last = key.objectid + key.offset;
285 		}
286 next:
287 		path->slots[0]++;
288 	}
289 
290 	add_new_free_space(block_group, root->fs_info, last,
291 			   block_group->key.objectid +
292 			   block_group->key.offset);
293 
294 	remove_sb_from_cache(root, block_group);
295 	block_group->cached = 1;
296 	ret = 0;
297 err:
298 	btrfs_free_path(path);
299 	return ret;
300 }
301 
302 /*
303  * return the block group that starts at or after bytenr
304  */
305 static struct btrfs_block_group_cache *
306 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
307 {
308 	struct btrfs_block_group_cache *cache;
309 
310 	cache = block_group_cache_tree_search(info, bytenr, 0);
311 
312 	return cache;
313 }
314 
315 /*
316  * return the block group that contains the given bytenr
317  */
318 struct btrfs_block_group_cache *btrfs_lookup_block_group(
319 						 struct btrfs_fs_info *info,
320 						 u64 bytenr)
321 {
322 	struct btrfs_block_group_cache *cache;
323 
324 	cache = block_group_cache_tree_search(info, bytenr, 1);
325 
326 	return cache;
327 }
328 
329 static inline void put_block_group(struct btrfs_block_group_cache *cache)
330 {
331 	if (atomic_dec_and_test(&cache->count))
332 		kfree(cache);
333 }
334 
335 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
336 						  u64 flags)
337 {
338 	struct list_head *head = &info->space_info;
339 	struct btrfs_space_info *found;
340 
341 	rcu_read_lock();
342 	list_for_each_entry_rcu(found, head, list) {
343 		if (found->flags == flags) {
344 			rcu_read_unlock();
345 			return found;
346 		}
347 	}
348 	rcu_read_unlock();
349 	return NULL;
350 }
351 
352 /*
353  * after adding space to the filesystem, we need to clear the full flags
354  * on all the space infos.
355  */
356 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
357 {
358 	struct list_head *head = &info->space_info;
359 	struct btrfs_space_info *found;
360 
361 	rcu_read_lock();
362 	list_for_each_entry_rcu(found, head, list)
363 		found->full = 0;
364 	rcu_read_unlock();
365 }
366 
367 static u64 div_factor(u64 num, int factor)
368 {
369 	if (factor == 10)
370 		return num;
371 	num *= factor;
372 	do_div(num, 10);
373 	return num;
374 }
375 
376 u64 btrfs_find_block_group(struct btrfs_root *root,
377 			   u64 search_start, u64 search_hint, int owner)
378 {
379 	struct btrfs_block_group_cache *cache;
380 	u64 used;
381 	u64 last = max(search_hint, search_start);
382 	u64 group_start = 0;
383 	int full_search = 0;
384 	int factor = 9;
385 	int wrapped = 0;
386 again:
387 	while (1) {
388 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
389 		if (!cache)
390 			break;
391 
392 		spin_lock(&cache->lock);
393 		last = cache->key.objectid + cache->key.offset;
394 		used = btrfs_block_group_used(&cache->item);
395 
396 		if ((full_search || !cache->ro) &&
397 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
398 			if (used + cache->pinned + cache->reserved <
399 			    div_factor(cache->key.offset, factor)) {
400 				group_start = cache->key.objectid;
401 				spin_unlock(&cache->lock);
402 				put_block_group(cache);
403 				goto found;
404 			}
405 		}
406 		spin_unlock(&cache->lock);
407 		put_block_group(cache);
408 		cond_resched();
409 	}
410 	if (!wrapped) {
411 		last = search_start;
412 		wrapped = 1;
413 		goto again;
414 	}
415 	if (!full_search && factor < 10) {
416 		last = search_start;
417 		full_search = 1;
418 		factor = 10;
419 		goto again;
420 	}
421 found:
422 	return group_start;
423 }
424 
425 /* simple helper to search for an existing extent at a given offset */
426 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
427 {
428 	int ret;
429 	struct btrfs_key key;
430 	struct btrfs_path *path;
431 
432 	path = btrfs_alloc_path();
433 	BUG_ON(!path);
434 	key.objectid = start;
435 	key.offset = len;
436 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
437 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
438 				0, 0);
439 	btrfs_free_path(path);
440 	return ret;
441 }
442 
443 /*
444  * Back reference rules.  Back refs have three main goals:
445  *
446  * 1) differentiate between all holders of references to an extent so that
447  *    when a reference is dropped we can make sure it was a valid reference
448  *    before freeing the extent.
449  *
450  * 2) Provide enough information to quickly find the holders of an extent
451  *    if we notice a given block is corrupted or bad.
452  *
453  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
454  *    maintenance.  This is actually the same as #2, but with a slightly
455  *    different use case.
456  *
457  * File extents can be referenced by:
458  *
459  * - multiple snapshots, subvolumes, or different generations in one subvol
460  * - different files inside a single subvolume
461  * - different offsets inside a file (bookend extents in file.c)
462  *
463  * The extent ref structure has fields for:
464  *
465  * - Objectid of the subvolume root
466  * - Generation number of the tree holding the reference
467  * - objectid of the file holding the reference
468  * - number of references held by the parent node (always 1 for tree blocks)
469  *
470  * A btree leaf may hold multiple references to a file extent. In most cases,
471  * these references are from the same file and the corresponding offsets inside
472  * the file are close together.
473  *
474  * When a file extent is allocated, the fields are filled in:
475  *     (root_key.objectid, trans->transid, inode objectid, 1)
476  *
477  * When a leaf is cow'd, new references are added for every file extent found
478  * in the leaf.  It looks similar to the create case, but trans->transid will
479  * be different when the block is cow'd.
480  *
481  *     (root_key.objectid, trans->transid, inode objectid,
482  *      number of references in the leaf)
483  *
484  * When a file extent is removed either during snapshot deletion or
485  * file truncation, we find the corresponding back reference and check
486  * the following fields:
487  *
488  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
489  *      inode objectid)
490  *
491  * Btree extents can be referenced by:
492  *
493  * - Different subvolumes
494  * - Different generations of the same subvolume
495  *
496  * When a tree block is created, back references are inserted:
497  *
498  * (root->root_key.objectid, trans->transid, level, 1)
499  *
500  * When a tree block is cow'd, new back references are added for all the
501  * blocks it points to. If the tree block isn't in a reference counted root,
502  * the old back references are removed. These new back references are of
503  * the form (trans->transid will have increased since creation):
504  *
505  * (root->root_key.objectid, trans->transid, level, 1)
506  *
507  * When a backref is being deleted, the following fields are checked:
508  *
509  * if backref was for a tree root:
510  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
511  * else
512  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
513  *
514  * Back Reference Key composing:
515  *
516  * The key objectid corresponds to the first byte in the extent, the key
517  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
518  * byte of the parent extent. If an extent is a tree root, the key offset is set
519  * to the key objectid.
520  */
521 
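/*
 * Editorial sketch (not part of the original file): composing a back
 * reference key according to the rules above.  The helper name is
 * hypothetical; the real code builds these keys inline, e.g. in
 * lookup_extent_backref() below.
 */
static inline void compose_extent_ref_key(struct btrfs_key *key,
					  u64 extent_start, u64 parent)
{
	key->objectid = extent_start;	/* first byte of the extent */
	key->type = BTRFS_EXTENT_REF_KEY;
	/* first byte of the parent extent, or extent_start for a tree root */
	key->offset = parent;
}
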
522 static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
523 					  struct btrfs_root *root,
524 					  struct btrfs_path *path,
525 					  u64 bytenr, u64 parent,
526 					  u64 ref_root, u64 ref_generation,
527 					  u64 owner_objectid, int del)
528 {
529 	struct btrfs_key key;
530 	struct btrfs_extent_ref *ref;
531 	struct extent_buffer *leaf;
532 	u64 ref_objectid;
533 	int ret;
534 
535 	key.objectid = bytenr;
536 	key.type = BTRFS_EXTENT_REF_KEY;
537 	key.offset = parent;
538 
539 	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
540 	if (ret < 0)
541 		goto out;
542 	if (ret > 0) {
543 		ret = -ENOENT;
544 		goto out;
545 	}
546 
547 	leaf = path->nodes[0];
548 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
549 	ref_objectid = btrfs_ref_objectid(leaf, ref);
550 	if (btrfs_ref_root(leaf, ref) != ref_root ||
551 	    btrfs_ref_generation(leaf, ref) != ref_generation ||
552 	    (ref_objectid != owner_objectid &&
553 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
554 		ret = -EIO;
555 		WARN_ON(1);
556 		goto out;
557 	}
558 	ret = 0;
559 out:
560 	return ret;
561 }
562 
563 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
564 					  struct btrfs_root *root,
565 					  struct btrfs_path *path,
566 					  u64 bytenr, u64 parent,
567 					  u64 ref_root, u64 ref_generation,
568 					  u64 owner_objectid,
569 					  int refs_to_add)
570 {
571 	struct btrfs_key key;
572 	struct extent_buffer *leaf;
573 	struct btrfs_extent_ref *ref;
574 	u32 num_refs;
575 	int ret;
576 
577 	key.objectid = bytenr;
578 	key.type = BTRFS_EXTENT_REF_KEY;
579 	key.offset = parent;
580 
581 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
582 	if (ret == 0) {
583 		leaf = path->nodes[0];
584 		ref = btrfs_item_ptr(leaf, path->slots[0],
585 				     struct btrfs_extent_ref);
586 		btrfs_set_ref_root(leaf, ref, ref_root);
587 		btrfs_set_ref_generation(leaf, ref, ref_generation);
588 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
589 		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
590 	} else if (ret == -EEXIST) {
591 		u64 existing_owner;
592 
593 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
594 		leaf = path->nodes[0];
595 		ref = btrfs_item_ptr(leaf, path->slots[0],
596 				     struct btrfs_extent_ref);
597 		if (btrfs_ref_root(leaf, ref) != ref_root ||
598 		    btrfs_ref_generation(leaf, ref) != ref_generation) {
599 			ret = -EIO;
600 			WARN_ON(1);
601 			goto out;
602 		}
603 
604 		num_refs = btrfs_ref_num_refs(leaf, ref);
605 		BUG_ON(num_refs == 0);
606 		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
607 
608 		existing_owner = btrfs_ref_objectid(leaf, ref);
609 		if (existing_owner != owner_objectid &&
610 		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
611 			btrfs_set_ref_objectid(leaf, ref,
612 					BTRFS_MULTIPLE_OBJECTIDS);
613 		}
614 		ret = 0;
615 	} else {
616 		goto out;
617 	}
618 	btrfs_unlock_up_safe(path, 1);
619 	btrfs_mark_buffer_dirty(path->nodes[0]);
620 out:
621 	btrfs_release_path(root, path);
622 	return ret;
623 }
624 
625 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
626 					  struct btrfs_root *root,
627 					  struct btrfs_path *path,
628 					  int refs_to_drop)
629 {
630 	struct extent_buffer *leaf;
631 	struct btrfs_extent_ref *ref;
632 	u32 num_refs;
633 	int ret = 0;
634 
635 	leaf = path->nodes[0];
636 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
637 	num_refs = btrfs_ref_num_refs(leaf, ref);
638 	BUG_ON(num_refs < refs_to_drop);
639 	num_refs -= refs_to_drop;
640 	if (num_refs == 0) {
641 		ret = btrfs_del_item(trans, root, path);
642 	} else {
643 		btrfs_set_ref_num_refs(leaf, ref, num_refs);
644 		btrfs_mark_buffer_dirty(leaf);
645 	}
646 	btrfs_release_path(root, path);
647 	return ret;
648 }
649 
650 #ifdef BIO_RW_DISCARD
651 static void btrfs_issue_discard(struct block_device *bdev,
652 				u64 start, u64 len)
653 {
654 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
655 }
656 #endif
657 
658 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
659 				u64 num_bytes)
660 {
661 #ifdef BIO_RW_DISCARD
662 	int ret;
663 	u64 map_length = num_bytes;
664 	struct btrfs_multi_bio *multi = NULL;
665 
666 	/* Tell the block device(s) that the sectors can be discarded */
667 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
668 			      bytenr, &map_length, &multi, 0);
669 	if (!ret) {
670 		struct btrfs_bio_stripe *stripe = multi->stripes;
671 		int i;
672 
673 		if (map_length > num_bytes)
674 			map_length = num_bytes;
675 
676 		for (i = 0; i < multi->num_stripes; i++, stripe++) {
677 			btrfs_issue_discard(stripe->dev->bdev,
678 					    stripe->physical,
679 					    map_length);
680 		}
681 		kfree(multi);
682 	}
683 
684 	return ret;
685 #else
686 	return 0;
687 #endif
688 }
689 
690 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
691 				     struct btrfs_root *root, u64 bytenr,
692 				     u64 num_bytes,
693 				     u64 orig_parent, u64 parent,
694 				     u64 orig_root, u64 ref_root,
695 				     u64 orig_generation, u64 ref_generation,
696 				     u64 owner_objectid)
697 {
698 	int ret;
699 	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
700 
701 	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
702 				       orig_parent, parent, orig_root,
703 				       ref_root, orig_generation,
704 				       ref_generation, owner_objectid, pin);
705 	BUG_ON(ret);
706 	return ret;
707 }
708 
709 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
710 			    struct btrfs_root *root, u64 bytenr,
711 			    u64 num_bytes, u64 orig_parent, u64 parent,
712 			    u64 ref_root, u64 ref_generation,
713 			    u64 owner_objectid)
714 {
715 	int ret;
716 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
717 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
718 		return 0;
719 
720 	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
721 					orig_parent, parent, ref_root,
722 					ref_root, ref_generation,
723 					ref_generation, owner_objectid);
724 	return ret;
725 }
726 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
727 				  struct btrfs_root *root, u64 bytenr,
728 				  u64 num_bytes,
729 				  u64 orig_parent, u64 parent,
730 				  u64 orig_root, u64 ref_root,
731 				  u64 orig_generation, u64 ref_generation,
732 				  u64 owner_objectid)
733 {
734 	int ret;
735 
736 	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
737 				    ref_generation, owner_objectid,
738 				    BTRFS_ADD_DELAYED_REF, 0);
739 	BUG_ON(ret);
740 	return ret;
741 }
742 
743 static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
744 			  struct btrfs_root *root, u64 bytenr,
745 			  u64 num_bytes, u64 parent, u64 ref_root,
746 			  u64 ref_generation, u64 owner_objectid,
747 			  int refs_to_add)
748 {
749 	struct btrfs_path *path;
750 	int ret;
751 	struct btrfs_key key;
752 	struct extent_buffer *l;
753 	struct btrfs_extent_item *item;
754 	u32 refs;
755 
756 	path = btrfs_alloc_path();
757 	if (!path)
758 		return -ENOMEM;
759 
760 	path->reada = 1;
761 	path->leave_spinning = 1;
762 	key.objectid = bytenr;
763 	key.type = BTRFS_EXTENT_ITEM_KEY;
764 	key.offset = num_bytes;
765 
766 	/* first find the extent item and update its reference count */
767 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
768 				path, 0, 1);
769 	if (ret < 0) {
770 		btrfs_set_path_blocking(path);
771 		return ret;
772 	}
773 
774 	if (ret > 0) {
775 		WARN_ON(1);
776 		btrfs_free_path(path);
777 		return -EIO;
778 	}
779 	l = path->nodes[0];
780 
781 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
782 	if (key.objectid != bytenr) {
783 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
784 		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
785 		       (unsigned long long)bytenr,
786 		       (unsigned long long)key.objectid);
787 		BUG();
788 	}
789 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
790 
791 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
792 
793 	refs = btrfs_extent_refs(l, item);
794 	btrfs_set_extent_refs(l, item, refs + refs_to_add);
795 	btrfs_unlock_up_safe(path, 1);
796 
797 	btrfs_mark_buffer_dirty(path->nodes[0]);
798 
799 	btrfs_release_path(root->fs_info->extent_root, path);
800 
801 	path->reada = 1;
802 	path->leave_spinning = 1;
803 
804 	/* now insert the actual backref */
805 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
806 				    path, bytenr, parent,
807 				    ref_root, ref_generation,
808 				    owner_objectid, refs_to_add);
809 	BUG_ON(ret);
810 	btrfs_free_path(path);
811 	return 0;
812 }
813 
814 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
815 			 struct btrfs_root *root,
816 			 u64 bytenr, u64 num_bytes, u64 parent,
817 			 u64 ref_root, u64 ref_generation,
818 			 u64 owner_objectid)
819 {
820 	int ret;
821 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
822 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
823 		return 0;
824 
825 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
826 				     0, ref_root, 0, ref_generation,
827 				     owner_objectid);
828 	return ret;
829 }
830 
831 static int drop_delayed_ref(struct btrfs_trans_handle *trans,
832 					struct btrfs_root *root,
833 					struct btrfs_delayed_ref_node *node)
834 {
835 	int ret = 0;
836 	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
837 
838 	BUG_ON(node->ref_mod == 0);
839 	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
840 				  node->parent, ref->root, ref->generation,
841 				  ref->owner_objectid, ref->pin, node->ref_mod);
842 
843 	return ret;
844 }
845 
846 /* helper function to actually process a single delayed ref entry */
847 static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
848 					struct btrfs_root *root,
849 					struct btrfs_delayed_ref_node *node,
850 					int insert_reserved)
851 {
852 	int ret;
853 	struct btrfs_delayed_ref *ref;
854 
855 	if (node->parent == (u64)-1) {
856 		struct btrfs_delayed_ref_head *head;
857 		/*
858 		 * we've hit the end of the chain and we were supposed
859 		 * to insert this extent into the tree.  But, it got
860 		 * deleted before we ever needed to insert it, so all
861 		 * we have to do is clean up the accounting
862 		 */
863 		if (insert_reserved) {
864 			update_reserved_extents(root, node->bytenr,
865 						node->num_bytes, 0);
866 		}
867 		head = btrfs_delayed_node_to_head(node);
868 		mutex_unlock(&head->mutex);
869 		return 0;
870 	}
871 
872 	ref = btrfs_delayed_node_to_ref(node);
873 	if (ref->action == BTRFS_ADD_DELAYED_REF) {
874 		if (insert_reserved) {
875 			struct btrfs_key ins;
876 
877 			ins.objectid = node->bytenr;
878 			ins.offset = node->num_bytes;
879 			ins.type = BTRFS_EXTENT_ITEM_KEY;
880 
881 			/* record the full extent allocation */
882 			ret = __btrfs_alloc_reserved_extent(trans, root,
883 					node->parent, ref->root,
884 					ref->generation, ref->owner_objectid,
885 					&ins, node->ref_mod);
886 			update_reserved_extents(root, node->bytenr,
887 						node->num_bytes, 0);
888 		} else {
889 			/* just add one backref */
890 			ret = add_extent_ref(trans, root, node->bytenr,
891 				     node->num_bytes,
892 				     node->parent, ref->root, ref->generation,
893 				     ref->owner_objectid, node->ref_mod);
894 		}
895 		BUG_ON(ret);
896 	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
897 		WARN_ON(insert_reserved);
898 		ret = drop_delayed_ref(trans, root, node);
899 	}
900 	return 0;
901 }
902 
903 static noinline struct btrfs_delayed_ref_node *
904 select_delayed_ref(struct btrfs_delayed_ref_head *head)
905 {
906 	struct rb_node *node;
907 	struct btrfs_delayed_ref_node *ref;
908 	int action = BTRFS_ADD_DELAYED_REF;
909 again:
910 	/*
911 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
912 	 * this prevents the ref count from going down to zero while
913 	 * there are still pending delayed refs.
914 	 */
915 	node = rb_prev(&head->node.rb_node);
916 	while (1) {
917 		if (!node)
918 			break;
919 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
920 				rb_node);
921 		if (ref->bytenr != head->node.bytenr)
922 			break;
923 		if (btrfs_delayed_node_to_ref(ref)->action == action)
924 			return ref;
925 		node = rb_prev(node);
926 	}
927 	if (action == BTRFS_ADD_DELAYED_REF) {
928 		action = BTRFS_DROP_DELAYED_REF;
929 		goto again;
930 	}
931 	return NULL;
932 }
933 
934 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
935 				       struct btrfs_root *root,
936 				       struct list_head *cluster)
937 {
938 	struct btrfs_delayed_ref_root *delayed_refs;
939 	struct btrfs_delayed_ref_node *ref;
940 	struct btrfs_delayed_ref_head *locked_ref = NULL;
941 	int ret;
942 	int count = 0;
943 	int must_insert_reserved = 0;
944 
945 	delayed_refs = &trans->transaction->delayed_refs;
946 	while (1) {
947 		if (!locked_ref) {
948 			/* pick a new head ref from the cluster list */
949 			if (list_empty(cluster))
950 				break;
951 
952 			locked_ref = list_entry(cluster->next,
953 				     struct btrfs_delayed_ref_head, cluster);
954 
955 			/* grab the lock that says we are going to process
956 			 * all the refs for this head */
957 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
958 
959 			/*
960 			 * we may have dropped the spin lock to get the head
961 			 * mutex lock, and that might have given someone else
962 			 * time to free the head.  If that's true, it has been
963 			 * removed from our list and we can move on.
964 			 */
965 			if (ret == -EAGAIN) {
966 				locked_ref = NULL;
967 				count++;
968 				continue;
969 			}
970 		}
971 
972 		/*
973 		 * record the must insert reserved flag before we
974 		 * drop the spin lock.
975 		 */
976 		must_insert_reserved = locked_ref->must_insert_reserved;
977 		locked_ref->must_insert_reserved = 0;
978 
979 		/*
980 		 * locked_ref is the head node, so we have to go one
981 		 * node back for any delayed ref updates
982 		 */
983 		ref = select_delayed_ref(locked_ref);
984 		if (!ref) {
985 			/* All delayed refs have been processed, go ahead
986 			 * and send the head node to run_one_delayed_ref,
987 			 * so that any accounting fixes can happen
988 			 */
989 			ref = &locked_ref->node;
990 			list_del_init(&locked_ref->cluster);
991 			locked_ref = NULL;
992 		}
993 
994 		ref->in_tree = 0;
995 		rb_erase(&ref->rb_node, &delayed_refs->root);
996 		delayed_refs->num_entries--;
997 		spin_unlock(&delayed_refs->lock);
998 
999 		ret = run_one_delayed_ref(trans, root, ref,
1000 					  must_insert_reserved);
1001 		BUG_ON(ret);
1002 		btrfs_put_delayed_ref(ref);
1003 
1004 		count++;
1005 		cond_resched();
1006 		spin_lock(&delayed_refs->lock);
1007 	}
1008 	return count;
1009 }
1010 
1011 /*
1012  * this starts processing the delayed reference count updates and
1013  * extent insertions we have queued up so far.  count can be
1014  * 0, which means to process everything in the tree at the start
1015  * of the run (but not newly added entries), or it can be some target
1016  * number you'd like to process.
1017  */
1018 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1019 			   struct btrfs_root *root, unsigned long count)
1020 {
1021 	struct rb_node *node;
1022 	struct btrfs_delayed_ref_root *delayed_refs;
1023 	struct btrfs_delayed_ref_node *ref;
1024 	struct list_head cluster;
1025 	int ret;
1026 	int run_all = count == (unsigned long)-1;
1027 	int run_most = 0;
1028 
1029 	if (root == root->fs_info->extent_root)
1030 		root = root->fs_info->tree_root;
1031 
1032 	delayed_refs = &trans->transaction->delayed_refs;
1033 	INIT_LIST_HEAD(&cluster);
1034 again:
1035 	spin_lock(&delayed_refs->lock);
1036 	if (count == 0) {
1037 		count = delayed_refs->num_entries * 2;
1038 		run_most = 1;
1039 	}
1040 	while (1) {
1041 		if (!(run_all || run_most) &&
1042 		    delayed_refs->num_heads_ready < 64)
1043 			break;
1044 
1045 		/*
1046 		 * go find something we can process in the rbtree.  We start at
1047 		 * the beginning of the tree, and then build a cluster
1048 		 * of refs to process starting at the first one we are able to
1049 		 * lock
1050 		 */
1051 		ret = btrfs_find_ref_cluster(trans, &cluster,
1052 					     delayed_refs->run_delayed_start);
1053 		if (ret)
1054 			break;
1055 
1056 		ret = run_clustered_refs(trans, root, &cluster);
1057 		BUG_ON(ret < 0);
1058 
1059 		count -= min_t(unsigned long, ret, count);
1060 
1061 		if (count == 0)
1062 			break;
1063 	}
1064 
1065 	if (run_all) {
1066 		node = rb_first(&delayed_refs->root);
1067 		if (!node)
1068 			goto out;
1069 		count = (unsigned long)-1;
1070 
1071 		while (node) {
1072 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
1073 				       rb_node);
1074 			if (btrfs_delayed_ref_is_head(ref)) {
1075 				struct btrfs_delayed_ref_head *head;
1076 
1077 				head = btrfs_delayed_node_to_head(ref);
1078 				atomic_inc(&ref->refs);
1079 
1080 				spin_unlock(&delayed_refs->lock);
1081 				mutex_lock(&head->mutex);
1082 				mutex_unlock(&head->mutex);
1083 
1084 				btrfs_put_delayed_ref(ref);
1085 				cond_resched();
1086 				goto again;
1087 			}
1088 			node = rb_next(node);
1089 		}
1090 		spin_unlock(&delayed_refs->lock);
1091 		schedule_timeout(1);
1092 		goto again;
1093 	}
1094 out:
1095 	spin_unlock(&delayed_refs->lock);
1096 	return 0;
1097 }
1098 
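/*
 * Editorial note (not from the original file): illustrative call patterns
 * for btrfs_run_delayed_refs() above.  A count of 0 processes everything
 * queued at the start of the run, while (unsigned long)-1 keeps looping
 * until the delayed ref tree is completely drained:
 *
 *	btrfs_run_delayed_refs(trans, root, 0);
 *	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 */
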
1099 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1100 			  struct btrfs_root *root, u64 objectid, u64 bytenr)
1101 {
1102 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1103 	struct btrfs_path *path;
1104 	struct extent_buffer *leaf;
1105 	struct btrfs_extent_ref *ref_item;
1106 	struct btrfs_key key;
1107 	struct btrfs_key found_key;
1108 	u64 ref_root;
1109 	u64 last_snapshot;
1110 	u32 nritems;
1111 	int ret;
1112 
1113 	key.objectid = bytenr;
1114 	key.offset = (u64)-1;
1115 	key.type = BTRFS_EXTENT_ITEM_KEY;
1116 
1117 	path = btrfs_alloc_path();
1118 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1119 	if (ret < 0)
1120 		goto out;
1121 	BUG_ON(ret == 0);
1122 
1123 	ret = -ENOENT;
1124 	if (path->slots[0] == 0)
1125 		goto out;
1126 
1127 	path->slots[0]--;
1128 	leaf = path->nodes[0];
1129 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1130 
1131 	if (found_key.objectid != bytenr ||
1132 	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
1133 		goto out;
1134 
1135 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1136 	while (1) {
1137 		leaf = path->nodes[0];
1138 		nritems = btrfs_header_nritems(leaf);
1139 		if (path->slots[0] >= nritems) {
1140 			ret = btrfs_next_leaf(extent_root, path);
1141 			if (ret < 0)
1142 				goto out;
1143 			if (ret == 0)
1144 				continue;
1145 			break;
1146 		}
1147 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1148 		if (found_key.objectid != bytenr)
1149 			break;
1150 
1151 		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1152 			path->slots[0]++;
1153 			continue;
1154 		}
1155 
1156 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
1157 					  struct btrfs_extent_ref);
1158 		ref_root = btrfs_ref_root(leaf, ref_item);
1159 		if ((ref_root != root->root_key.objectid &&
1160 		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1161 		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
1162 			ret = 1;
1163 			goto out;
1164 		}
1165 		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1166 			ret = 1;
1167 			goto out;
1168 		}
1169 
1170 		path->slots[0]++;
1171 	}
1172 	ret = 0;
1173 out:
1174 	btrfs_free_path(path);
1175 	return ret;
1176 }
1177 
1178 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1179 		    struct extent_buffer *buf, u32 nr_extents)
1180 {
1181 	struct btrfs_key key;
1182 	struct btrfs_file_extent_item *fi;
1183 	u64 root_gen;
1184 	u32 nritems;
1185 	int i;
1186 	int level;
1187 	int ret = 0;
1188 	int shared = 0;
1189 
1190 	if (!root->ref_cows)
1191 		return 0;
1192 
1193 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1194 		shared = 0;
1195 		root_gen = root->root_key.offset;
1196 	} else {
1197 		shared = 1;
1198 		root_gen = trans->transid - 1;
1199 	}
1200 
1201 	level = btrfs_header_level(buf);
1202 	nritems = btrfs_header_nritems(buf);
1203 
1204 	if (level == 0) {
1205 		struct btrfs_leaf_ref *ref;
1206 		struct btrfs_extent_info *info;
1207 
1208 		ref = btrfs_alloc_leaf_ref(root, nr_extents);
1209 		if (!ref) {
1210 			ret = -ENOMEM;
1211 			goto out;
1212 		}
1213 
1214 		ref->root_gen = root_gen;
1215 		ref->bytenr = buf->start;
1216 		ref->owner = btrfs_header_owner(buf);
1217 		ref->generation = btrfs_header_generation(buf);
1218 		ref->nritems = nr_extents;
1219 		info = ref->extents;
1220 
1221 		for (i = 0; nr_extents > 0 && i < nritems; i++) {
1222 			u64 disk_bytenr;
1223 			btrfs_item_key_to_cpu(buf, &key, i);
1224 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1225 				continue;
1226 			fi = btrfs_item_ptr(buf, i,
1227 					    struct btrfs_file_extent_item);
1228 			if (btrfs_file_extent_type(buf, fi) ==
1229 			    BTRFS_FILE_EXTENT_INLINE)
1230 				continue;
1231 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1232 			if (disk_bytenr == 0)
1233 				continue;
1234 
1235 			info->bytenr = disk_bytenr;
1236 			info->num_bytes =
1237 				btrfs_file_extent_disk_num_bytes(buf, fi);
1238 			info->objectid = key.objectid;
1239 			info->offset = key.offset;
1240 			info++;
1241 		}
1242 
1243 		ret = btrfs_add_leaf_ref(root, ref, shared);
1244 		if (ret == -EEXIST && shared) {
1245 			struct btrfs_leaf_ref *old;
1246 			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1247 			BUG_ON(!old);
1248 			btrfs_remove_leaf_ref(root, old);
1249 			btrfs_free_leaf_ref(root, old);
1250 			ret = btrfs_add_leaf_ref(root, ref, shared);
1251 		}
1252 		WARN_ON(ret);
1253 		btrfs_free_leaf_ref(root, ref);
1254 	}
1255 out:
1256 	return ret;
1257 }
1258 
1259 /* when a block goes through cow, we update the reference counts of
1260  * everything that block points to.  The internal pointers of the block
1261  * can be in just about any order, and it is likely to have clusters of
1262  * things that are close together and clusters of things that are not.
1263  *
1264  * To help reduce the seeks that come with updating all of these reference
1265  * counts, sort them by byte number before actual updates are done.
1266  *
1267  * struct refsort is used to match byte number to slot in the btree block.
1268  * we sort based on the byte number and then use the slot to actually
1269  * find the item.
1270  *
1271  * struct refsort is smaller than struct btrfs_item and smaller than
1272  * struct btrfs_key_ptr.  Since we're currently limited to the page size
1273  * for a btree block, there's no way for a kmalloc of refsorts for a
1274  * single node to be bigger than a page.
1275  */
1276 struct refsort {
1277 	u64 bytenr;
1278 	u32 slot;
1279 };
1280 
1281 /*
1282  * for passing into sort()
1283  */
1284 static int refsort_cmp(const void *a_void, const void *b_void)
1285 {
1286 	const struct refsort *a = a_void;
1287 	const struct refsort *b = b_void;
1288 
1289 	if (a->bytenr < b->bytenr)
1290 		return -1;
1291 	if (a->bytenr > b->bytenr)
1292 		return 1;
1293 	return 0;
1294 }
1295 
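/*
 * Editorial sketch (not part of the original file): refsort entries are
 * filled with (bytenr, slot) pairs and ordered with the library sort()
 * from linux/sort.h, which is exactly what btrfs_inc_ref() does below.
 * The helper name is hypothetical.
 */
static inline void sort_refs_by_bytenr(struct refsort *sorted, u32 count)
{
	sort(sorted, count, sizeof(struct refsort), refsort_cmp, NULL);
}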
1296 
1297 noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1298 			   struct btrfs_root *root,
1299 			   struct extent_buffer *orig_buf,
1300 			   struct extent_buffer *buf, u32 *nr_extents)
1301 {
1302 	u64 bytenr;
1303 	u64 ref_root;
1304 	u64 orig_root;
1305 	u64 ref_generation;
1306 	u64 orig_generation;
1307 	struct refsort *sorted;
1308 	u32 nritems;
1309 	u32 nr_file_extents = 0;
1310 	struct btrfs_key key;
1311 	struct btrfs_file_extent_item *fi;
1312 	int i;
1313 	int level;
1314 	int ret = 0;
1315 	int faili = 0;
1316 	int refi = 0;
1317 	int slot;
1318 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1319 			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
1320 
1321 	ref_root = btrfs_header_owner(buf);
1322 	ref_generation = btrfs_header_generation(buf);
1323 	orig_root = btrfs_header_owner(orig_buf);
1324 	orig_generation = btrfs_header_generation(orig_buf);
1325 
1326 	nritems = btrfs_header_nritems(buf);
1327 	level = btrfs_header_level(buf);
1328 
1329 	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1330 	BUG_ON(!sorted);
1331 
1332 	if (root->ref_cows) {
1333 		process_func = __btrfs_inc_extent_ref;
1334 	} else {
1335 		if (level == 0 &&
1336 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1337 			goto out;
1338 		if (level != 0 &&
1339 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1340 			goto out;
1341 		process_func = __btrfs_update_extent_ref;
1342 	}
1343 
1344 	/*
1345 	 * we make two passes through the items.  In the first pass we
1346 	 * only record the byte number and slot.  Then we sort based on
1347 	 * byte number and do the actual work based on the sorted results
1348 	 */
1349 	for (i = 0; i < nritems; i++) {
1350 		cond_resched();
1351 		if (level == 0) {
1352 			btrfs_item_key_to_cpu(buf, &key, i);
1353 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1354 				continue;
1355 			fi = btrfs_item_ptr(buf, i,
1356 					    struct btrfs_file_extent_item);
1357 			if (btrfs_file_extent_type(buf, fi) ==
1358 			    BTRFS_FILE_EXTENT_INLINE)
1359 				continue;
1360 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1361 			if (bytenr == 0)
1362 				continue;
1363 
1364 			nr_file_extents++;
1365 			sorted[refi].bytenr = bytenr;
1366 			sorted[refi].slot = i;
1367 			refi++;
1368 		} else {
1369 			bytenr = btrfs_node_blockptr(buf, i);
1370 			sorted[refi].bytenr = bytenr;
1371 			sorted[refi].slot = i;
1372 			refi++;
1373 		}
1374 	}
1375 	/*
1376 	 * if refi == 0, we didn't actually put anything into the sorted
1377 	 * array and we're done
1378 	 */
1379 	if (refi == 0)
1380 		goto out;
1381 
1382 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1383 
1384 	for (i = 0; i < refi; i++) {
1385 		cond_resched();
1386 		slot = sorted[i].slot;
1387 		bytenr = sorted[i].bytenr;
1388 
1389 		if (level == 0) {
1390 			btrfs_item_key_to_cpu(buf, &key, slot);
1391 			fi = btrfs_item_ptr(buf, slot,
1392 					    struct btrfs_file_extent_item);
1393 
1394 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1395 			if (bytenr == 0)
1396 				continue;
1397 
1398 			ret = process_func(trans, root, bytenr,
1399 				   btrfs_file_extent_disk_num_bytes(buf, fi),
1400 				   orig_buf->start, buf->start,
1401 				   orig_root, ref_root,
1402 				   orig_generation, ref_generation,
1403 				   key.objectid);
1404 
1405 			if (ret) {
1406 				faili = slot;
1407 				WARN_ON(1);
1408 				goto fail;
1409 			}
1410 		} else {
1411 			ret = process_func(trans, root, bytenr, buf->len,
1412 					   orig_buf->start, buf->start,
1413 					   orig_root, ref_root,
1414 					   orig_generation, ref_generation,
1415 					   level - 1);
1416 			if (ret) {
1417 				faili = slot;
1418 				WARN_ON(1);
1419 				goto fail;
1420 			}
1421 		}
1422 	}
1423 out:
1424 	kfree(sorted);
1425 	if (nr_extents) {
1426 		if (level == 0)
1427 			*nr_extents = nr_file_extents;
1428 		else
1429 			*nr_extents = nritems;
1430 	}
1431 	return 0;
1432 fail:
1433 	kfree(sorted);
1434 	WARN_ON(1);
1435 	return ret;
1436 }
1437 
1438 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1439 		     struct btrfs_root *root, struct extent_buffer *orig_buf,
1440 		     struct extent_buffer *buf, int start_slot, int nr)
1441 
1442 {
1443 	u64 bytenr;
1444 	u64 ref_root;
1445 	u64 orig_root;
1446 	u64 ref_generation;
1447 	u64 orig_generation;
1448 	struct btrfs_key key;
1449 	struct btrfs_file_extent_item *fi;
1450 	int i;
1451 	int ret;
1452 	int slot;
1453 	int level;
1454 
1455 	BUG_ON(start_slot < 0);
1456 	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1457 
1458 	ref_root = btrfs_header_owner(buf);
1459 	ref_generation = btrfs_header_generation(buf);
1460 	orig_root = btrfs_header_owner(orig_buf);
1461 	orig_generation = btrfs_header_generation(orig_buf);
1462 	level = btrfs_header_level(buf);
1463 
1464 	if (!root->ref_cows) {
1465 		if (level == 0 &&
1466 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1467 			return 0;
1468 		if (level != 0 &&
1469 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1470 			return 0;
1471 	}
1472 
1473 	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1474 		cond_resched();
1475 		if (level == 0) {
1476 			btrfs_item_key_to_cpu(buf, &key, slot);
1477 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1478 				continue;
1479 			fi = btrfs_item_ptr(buf, slot,
1480 					    struct btrfs_file_extent_item);
1481 			if (btrfs_file_extent_type(buf, fi) ==
1482 			    BTRFS_FILE_EXTENT_INLINE)
1483 				continue;
1484 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1485 			if (bytenr == 0)
1486 				continue;
1487 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1488 				    btrfs_file_extent_disk_num_bytes(buf, fi),
1489 				    orig_buf->start, buf->start,
1490 				    orig_root, ref_root, orig_generation,
1491 				    ref_generation, key.objectid);
1492 			if (ret)
1493 				goto fail;
1494 		} else {
1495 			bytenr = btrfs_node_blockptr(buf, slot);
1496 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1497 					    buf->len, orig_buf->start,
1498 					    buf->start, orig_root, ref_root,
1499 					    orig_generation, ref_generation,
1500 					    level - 1);
1501 			if (ret)
1502 				goto fail;
1503 		}
1504 	}
1505 	return 0;
1506 fail:
1507 	WARN_ON(1);
1508 	return -1;
1509 }
1510 
1511 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1512 				 struct btrfs_root *root,
1513 				 struct btrfs_path *path,
1514 				 struct btrfs_block_group_cache *cache)
1515 {
1516 	int ret;
1517 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1518 	unsigned long bi;
1519 	struct extent_buffer *leaf;
1520 
1521 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1522 	if (ret < 0)
1523 		goto fail;
1524 	BUG_ON(ret);
1525 
1526 	leaf = path->nodes[0];
1527 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1528 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1529 	btrfs_mark_buffer_dirty(leaf);
1530 	btrfs_release_path(extent_root, path);
1531 fail:
1532 	if (ret)
1533 		return ret;
1534 	return 0;
1535 
1536 }
1537 
1538 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1539 				   struct btrfs_root *root)
1540 {
1541 	struct btrfs_block_group_cache *cache, *entry;
1542 	struct rb_node *n;
1543 	int err = 0;
1544 	int werr = 0;
1545 	struct btrfs_path *path;
1546 	u64 last = 0;
1547 
1548 	path = btrfs_alloc_path();
1549 	if (!path)
1550 		return -ENOMEM;
1551 
1552 	while (1) {
1553 		cache = NULL;
1554 		spin_lock(&root->fs_info->block_group_cache_lock);
1555 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
1556 		     n; n = rb_next(n)) {
1557 			entry = rb_entry(n, struct btrfs_block_group_cache,
1558 					 cache_node);
1559 			if (entry->dirty) {
1560 				cache = entry;
1561 				break;
1562 			}
1563 		}
1564 		spin_unlock(&root->fs_info->block_group_cache_lock);
1565 
1566 		if (!cache)
1567 			break;
1568 
1569 		cache->dirty = 0;
1570 		last += cache->key.offset;
1571 
1572 		err = write_one_cache_group(trans, root,
1573 					    path, cache);
1574 		/*
1575 		 * if we fail to write the cache group, we want
1576 		 * to keep it marked dirty in hopes that a later
1577 		 * write will work
1578 		 */
1579 		if (err) {
1580 			werr = err;
1581 			continue;
1582 		}
1583 	}
1584 	btrfs_free_path(path);
1585 	return werr;
1586 }
1587 
1588 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1589 {
1590 	struct btrfs_block_group_cache *block_group;
1591 	int readonly = 0;
1592 
1593 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1594 	if (!block_group || block_group->ro)
1595 		readonly = 1;
1596 	if (block_group)
1597 		put_block_group(block_group);
1598 	return readonly;
1599 }
1600 
1601 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1602 			     u64 total_bytes, u64 bytes_used,
1603 			     struct btrfs_space_info **space_info)
1604 {
1605 	struct btrfs_space_info *found;
1606 
1607 	found = __find_space_info(info, flags);
1608 	if (found) {
1609 		spin_lock(&found->lock);
1610 		found->total_bytes += total_bytes;
1611 		found->bytes_used += bytes_used;
1612 		found->full = 0;
1613 		spin_unlock(&found->lock);
1614 		*space_info = found;
1615 		return 0;
1616 	}
1617 	found = kzalloc(sizeof(*found), GFP_NOFS);
1618 	if (!found)
1619 		return -ENOMEM;
1620 
1621 	INIT_LIST_HEAD(&found->block_groups);
1622 	init_rwsem(&found->groups_sem);
1623 	spin_lock_init(&found->lock);
1624 	found->flags = flags;
1625 	found->total_bytes = total_bytes;
1626 	found->bytes_used = bytes_used;
1627 	found->bytes_pinned = 0;
1628 	found->bytes_reserved = 0;
1629 	found->bytes_readonly = 0;
1630 	found->bytes_delalloc = 0;
1631 	found->full = 0;
1632 	found->force_alloc = 0;
1633 	*space_info = found;
1634 	list_add_rcu(&found->list, &info->space_info);
1635 	return 0;
1636 }
1637 
1638 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1639 {
1640 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1641 				   BTRFS_BLOCK_GROUP_RAID1 |
1642 				   BTRFS_BLOCK_GROUP_RAID10 |
1643 				   BTRFS_BLOCK_GROUP_DUP);
1644 	if (extra_flags) {
1645 		if (flags & BTRFS_BLOCK_GROUP_DATA)
1646 			fs_info->avail_data_alloc_bits |= extra_flags;
1647 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
1648 			fs_info->avail_metadata_alloc_bits |= extra_flags;
1649 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1650 			fs_info->avail_system_alloc_bits |= extra_flags;
1651 	}
1652 }
1653 
1654 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1655 {
1656 	spin_lock(&cache->space_info->lock);
1657 	spin_lock(&cache->lock);
1658 	if (!cache->ro) {
1659 		cache->space_info->bytes_readonly += cache->key.offset -
1660 					btrfs_block_group_used(&cache->item);
1661 		cache->ro = 1;
1662 	}
1663 	spin_unlock(&cache->lock);
1664 	spin_unlock(&cache->space_info->lock);
1665 }
1666 
1667 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1668 {
1669 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
1670 
1671 	if (num_devices == 1)
1672 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1673 	if (num_devices < 4)
1674 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1675 
1676 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1677 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1678 		      BTRFS_BLOCK_GROUP_RAID10))) {
1679 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
1680 	}
1681 
1682 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1683 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1684 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1685 	}
1686 
1687 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1688 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1689 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
1690 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
1691 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1692 	return flags;
1693 }
1694 
1695 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1696 {
1697 	struct btrfs_fs_info *info = root->fs_info;
1698 	u64 alloc_profile;
1699 
1700 	if (data) {
1701 		alloc_profile = info->avail_data_alloc_bits &
1702 			info->data_alloc_profile;
1703 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1704 	} else if (root == root->fs_info->chunk_root) {
1705 		alloc_profile = info->avail_system_alloc_bits &
1706 			info->system_alloc_profile;
1707 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1708 	} else {
1709 		alloc_profile = info->avail_metadata_alloc_bits &
1710 			info->metadata_alloc_profile;
1711 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1712 	}
1713 
1714 	return btrfs_reduce_alloc_profile(root, data);
1715 }
1716 
1717 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
1718 {
1719 	u64 alloc_target;
1720 
1721 	alloc_target = btrfs_get_alloc_profile(root, 1);
1722 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
1723 						       alloc_target);
1724 }
1725 
1726 /*
1727  * for now this just makes sure we have at least 5% of our metadata space free
1728  * for use.
1729  */
1730 int btrfs_check_metadata_free_space(struct btrfs_root *root)
1731 {
1732 	struct btrfs_fs_info *info = root->fs_info;
1733 	struct btrfs_space_info *meta_sinfo;
1734 	u64 alloc_target, thresh;
1735 	int committed = 0, ret;
1736 
1737 	/* get the space info for where the metadata will live */
1738 	alloc_target = btrfs_get_alloc_profile(root, 0);
1739 	meta_sinfo = __find_space_info(info, alloc_target);
1740 
1741 again:
1742 	spin_lock(&meta_sinfo->lock);
1743 	if (!meta_sinfo->full)
1744 		thresh = meta_sinfo->total_bytes * 80;
1745 	else
1746 		thresh = meta_sinfo->total_bytes * 95;
1747 
1748 	do_div(thresh, 100);
1749 
1750 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
1751 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
1752 		struct btrfs_trans_handle *trans;
1753 		if (!meta_sinfo->full) {
1754 			meta_sinfo->force_alloc = 1;
1755 			spin_unlock(&meta_sinfo->lock);
1756 
1757 			trans = btrfs_start_transaction(root, 1);
1758 			if (!trans)
1759 				return -ENOMEM;
1760 
1761 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1762 					     2 * 1024 * 1024, alloc_target, 0);
1763 			btrfs_end_transaction(trans, root);
1764 			goto again;
1765 		}
1766 		spin_unlock(&meta_sinfo->lock);
1767 
1768 		if (!committed) {
1769 			committed = 1;
1770 			trans = btrfs_join_transaction(root, 1);
1771 			if (!trans)
1772 				return -ENOMEM;
1773 			ret = btrfs_commit_transaction(trans, root);
1774 			if (ret)
1775 				return ret;
1776 			goto again;
1777 		}
1778 		return -ENOSPC;
1779 	}
1780 	spin_unlock(&meta_sinfo->lock);
1781 
1782 	return 0;
1783 }
1784 
1785 /*
1786  * This will check the space that the inode allocates from to make sure we have
1787  * enough space for bytes.
1788  */
1789 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1790 				u64 bytes)
1791 {
1792 	struct btrfs_space_info *data_sinfo;
1793 	int ret = 0, committed = 0;
1794 
1795 	/* make sure bytes are sectorsize aligned */
1796 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1797 
1798 	data_sinfo = BTRFS_I(inode)->space_info;
1799 again:
1800 	/* make sure we have enough space to handle the data first */
1801 	spin_lock(&data_sinfo->lock);
1802 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
1803 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
1804 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
1805 	    data_sinfo->bytes_may_use < bytes) {
1806 		struct btrfs_trans_handle *trans;
1807 
1808 		/*
1809 		 * if we don't have enough free bytes in this space then we need
1810 		 * to alloc a new chunk.
1811 		 */
1812 		if (!data_sinfo->full) {
1813 			u64 alloc_target;
1814 
1815 			data_sinfo->force_alloc = 1;
1816 			spin_unlock(&data_sinfo->lock);
1817 
1818 			alloc_target = btrfs_get_alloc_profile(root, 1);
1819 			trans = btrfs_start_transaction(root, 1);
1820 			if (!trans)
1821 				return -ENOMEM;
1822 
1823 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1824 					     bytes + 2 * 1024 * 1024,
1825 					     alloc_target, 0);
1826 			btrfs_end_transaction(trans, root);
1827 			if (ret)
1828 				return ret;
1829 			goto again;
1830 		}
1831 		spin_unlock(&data_sinfo->lock);
1832 
1833 		/* commit the current transaction and try again */
1834 		if (!committed) {
1835 			committed = 1;
1836 			trans = btrfs_join_transaction(root, 1);
1837 			if (!trans)
1838 				return -ENOMEM;
1839 			ret = btrfs_commit_transaction(trans, root);
1840 			if (ret)
1841 				return ret;
1842 			goto again;
1843 		}
1844 
1845 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1846 		       ", %llu bytes_used, %llu bytes_reserved, "
1847 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
1848 		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
1849 		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
1850 		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
1851 		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
1852 		return -ENOSPC;
1853 	}
1854 	data_sinfo->bytes_may_use += bytes;
1855 	BTRFS_I(inode)->reserved_bytes += bytes;
1856 	spin_unlock(&data_sinfo->lock);
1857 
1858 	return btrfs_check_metadata_free_space(root);
1859 }
1860 
1861 /*
1862  * if there was an error for whatever reason after calling
1863  * btrfs_check_data_free_space, call this so we can cleanup the counters.
1864  */
1865 void btrfs_free_reserved_data_space(struct btrfs_root *root,
1866 				    struct inode *inode, u64 bytes)
1867 {
1868 	struct btrfs_space_info *data_sinfo;
1869 
1870 	/* make sure bytes are sectorsize aligned */
1871 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1872 
1873 	data_sinfo = BTRFS_I(inode)->space_info;
1874 	spin_lock(&data_sinfo->lock);
1875 	data_sinfo->bytes_may_use -= bytes;
1876 	BTRFS_I(inode)->reserved_bytes -= bytes;
1877 	spin_unlock(&data_sinfo->lock);
1878 }
1879 
1880 /* called when we are adding a delalloc extent to the inode's io_tree */
1881 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1882 				  u64 bytes)
1883 {
1884 	struct btrfs_space_info *data_sinfo;
1885 
1886 	/* get the space info for where this inode will be storing its data */
1887 	data_sinfo = BTRFS_I(inode)->space_info;
1888 
1889 	/* make sure we have enough space to handle the data first */
1890 	spin_lock(&data_sinfo->lock);
1891 	data_sinfo->bytes_delalloc += bytes;
1892 
1893 	/*
1894 	 * we are adding a delalloc extent without calling
1895 	 * btrfs_check_data_free_space first.  This happens on a weird
1896 	 * writepage condition, but shouldn't hurt our accounting
1897 	 */
1898 	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
1899 		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
1900 		BTRFS_I(inode)->reserved_bytes = 0;
1901 	} else {
1902 		data_sinfo->bytes_may_use -= bytes;
1903 		BTRFS_I(inode)->reserved_bytes -= bytes;
1904 	}
1905 
1906 	spin_unlock(&data_sinfo->lock);
1907 }
1908 
1909 /* called when we are clearing a delalloc extent from the inode's io_tree */
1910 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1911 			      u64 bytes)
1912 {
1913 	struct btrfs_space_info *info;
1914 
1915 	info = BTRFS_I(inode)->space_info;
1916 
1917 	spin_lock(&info->lock);
1918 	info->bytes_delalloc -= bytes;
1919 	spin_unlock(&info->lock);
1920 }
1921 
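/*
 * allocate a new chunk for the given profile if this space_info is getting
 * full: unless 'force' (or space_info->force_alloc) is set, nothing happens
 * while used + pinned + reserved + alloc_bytes stays under roughly 60% of
 * the writable space.  chunk_mutex serializes allocations, and ->full is set
 * once btrfs_alloc_chunk fails so we stop retrying.
 */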
1922 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1923 			  struct btrfs_root *extent_root, u64 alloc_bytes,
1924 			  u64 flags, int force)
1925 {
1926 	struct btrfs_space_info *space_info;
1927 	u64 thresh;
1928 	int ret = 0;
1929 
1930 	mutex_lock(&extent_root->fs_info->chunk_mutex);
1931 
1932 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
1933 
1934 	space_info = __find_space_info(extent_root->fs_info, flags);
1935 	if (!space_info) {
1936 		ret = update_space_info(extent_root->fs_info, flags,
1937 					0, 0, &space_info);
1938 		BUG_ON(ret);
1939 	}
1940 	BUG_ON(!space_info);
1941 
1942 	spin_lock(&space_info->lock);
1943 	if (space_info->force_alloc) {
1944 		force = 1;
1945 		space_info->force_alloc = 0;
1946 	}
1947 	if (space_info->full) {
1948 		spin_unlock(&space_info->lock);
1949 		goto out;
1950 	}
1951 
1952 	thresh = space_info->total_bytes - space_info->bytes_readonly;
1953 	thresh = div_factor(thresh, 6);
1954 	if (!force &&
1955 	   (space_info->bytes_used + space_info->bytes_pinned +
1956 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
1957 		spin_unlock(&space_info->lock);
1958 		goto out;
1959 	}
1960 	spin_unlock(&space_info->lock);
1961 
1962 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
1963 	if (ret)
1964 		space_info->full = 1;
1965 out:
1966 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
1967 	return ret;
1968 }
1969 
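/*
 * adjust the used-byte accounting for every block group that covers
 * [bytenr, bytenr + num_bytes).  'alloc' selects allocation vs. free; when
 * freeing with 'mark_free' set the range is also discarded and handed back
 * to the block group's free space cache.
 */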
1970 static int update_block_group(struct btrfs_trans_handle *trans,
1971 			      struct btrfs_root *root,
1972 			      u64 bytenr, u64 num_bytes, int alloc,
1973 			      int mark_free)
1974 {
1975 	struct btrfs_block_group_cache *cache;
1976 	struct btrfs_fs_info *info = root->fs_info;
1977 	u64 total = num_bytes;
1978 	u64 old_val;
1979 	u64 byte_in_group;
1980 
1981 	while (total) {
1982 		cache = btrfs_lookup_block_group(info, bytenr);
1983 		if (!cache)
1984 			return -1;
1985 		byte_in_group = bytenr - cache->key.objectid;
1986 		WARN_ON(byte_in_group > cache->key.offset);
1987 
1988 		spin_lock(&cache->space_info->lock);
1989 		spin_lock(&cache->lock);
1990 		cache->dirty = 1;
1991 		old_val = btrfs_block_group_used(&cache->item);
1992 		num_bytes = min(total, cache->key.offset - byte_in_group);
1993 		if (alloc) {
1994 			old_val += num_bytes;
1995 			cache->space_info->bytes_used += num_bytes;
1996 			if (cache->ro)
1997 				cache->space_info->bytes_readonly -= num_bytes;
1998 			btrfs_set_block_group_used(&cache->item, old_val);
1999 			spin_unlock(&cache->lock);
2000 			spin_unlock(&cache->space_info->lock);
2001 		} else {
2002 			old_val -= num_bytes;
2003 			cache->space_info->bytes_used -= num_bytes;
2004 			if (cache->ro)
2005 				cache->space_info->bytes_readonly += num_bytes;
2006 			btrfs_set_block_group_used(&cache->item, old_val);
2007 			spin_unlock(&cache->lock);
2008 			spin_unlock(&cache->space_info->lock);
2009 			if (mark_free) {
2010 				int ret;
2011 
2012 				ret = btrfs_discard_extent(root, bytenr,
2013 							   num_bytes);
2014 				WARN_ON(ret);
2015 
2016 				ret = btrfs_add_free_space(cache, bytenr,
2017 							   num_bytes);
2018 				WARN_ON(ret);
2019 			}
2020 		}
2021 		put_block_group(cache);
2022 		total -= num_bytes;
2023 		bytenr += num_bytes;
2024 	}
2025 	return 0;
2026 }
2027 
2028 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2029 {
2030 	struct btrfs_block_group_cache *cache;
2031 	u64 bytenr;
2032 
2033 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2034 	if (!cache)
2035 		return 0;
2036 
2037 	bytenr = cache->key.objectid;
2038 	put_block_group(cache);
2039 
2040 	return bytenr;
2041 }
2042 
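/*
 * pin or unpin [bytenr, bytenr + num) in the pinned_extents io_tree and keep
 * the per-block-group and space_info pinned counters in sync.  Must be
 * called with pinned_mutex held; the mutex is dropped here after the extent
 * bits are updated.  Unpinned space goes back to the free space cache if the
 * block group has already been cached.
 */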
2043 int btrfs_update_pinned_extents(struct btrfs_root *root,
2044 				u64 bytenr, u64 num, int pin)
2045 {
2046 	u64 len;
2047 	struct btrfs_block_group_cache *cache;
2048 	struct btrfs_fs_info *fs_info = root->fs_info;
2049 
2050 	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2051 	if (pin) {
2052 		set_extent_dirty(&fs_info->pinned_extents,
2053 				bytenr, bytenr + num - 1, GFP_NOFS);
2054 	} else {
2055 		clear_extent_dirty(&fs_info->pinned_extents,
2056 				bytenr, bytenr + num - 1, GFP_NOFS);
2057 	}
2058 	mutex_unlock(&root->fs_info->pinned_mutex);
2059 
2060 	while (num > 0) {
2061 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2062 		BUG_ON(!cache);
2063 		len = min(num, cache->key.offset -
2064 			  (bytenr - cache->key.objectid));
2065 		if (pin) {
2066 			spin_lock(&cache->space_info->lock);
2067 			spin_lock(&cache->lock);
2068 			cache->pinned += len;
2069 			cache->space_info->bytes_pinned += len;
2070 			spin_unlock(&cache->lock);
2071 			spin_unlock(&cache->space_info->lock);
2072 			fs_info->total_pinned += len;
2073 		} else {
2074 			spin_lock(&cache->space_info->lock);
2075 			spin_lock(&cache->lock);
2076 			cache->pinned -= len;
2077 			cache->space_info->bytes_pinned -= len;
2078 			spin_unlock(&cache->lock);
2079 			spin_unlock(&cache->space_info->lock);
2080 			fs_info->total_pinned -= len;
2081 			if (cache->cached)
2082 				btrfs_add_free_space(cache, bytenr, len);
2083 		}
2084 		put_block_group(cache);
2085 		bytenr += len;
2086 		num -= len;
2087 	}
2088 	return 0;
2089 }
2090 
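/*
 * walk the block groups covering [bytenr, bytenr + num) and add the range to
 * (or remove it from) their reserved byte counts, mirroring the change into
 * the owning space_info.
 */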
2091 static int update_reserved_extents(struct btrfs_root *root,
2092 				   u64 bytenr, u64 num, int reserve)
2093 {
2094 	u64 len;
2095 	struct btrfs_block_group_cache *cache;
2096 	struct btrfs_fs_info *fs_info = root->fs_info;
2097 
2098 	while (num > 0) {
2099 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2100 		BUG_ON(!cache);
2101 		len = min(num, cache->key.offset -
2102 			  (bytenr - cache->key.objectid));
2103 
2104 		spin_lock(&cache->space_info->lock);
2105 		spin_lock(&cache->lock);
2106 		if (reserve) {
2107 			cache->reserved += len;
2108 			cache->space_info->bytes_reserved += len;
2109 		} else {
2110 			cache->reserved -= len;
2111 			cache->space_info->bytes_reserved -= len;
2112 		}
2113 		spin_unlock(&cache->lock);
2114 		spin_unlock(&cache->space_info->lock);
2115 		put_block_group(cache);
2116 		bytenr += len;
2117 		num -= len;
2118 	}
2119 	return 0;
2120 }
2121 
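/*
 * snapshot the currently pinned extent ranges into 'copy' so the caller can
 * later unpin exactly this set (see btrfs_finish_extent_commit), even if
 * more extents get pinned in the meantime.
 */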
2122 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2123 {
2124 	u64 last = 0;
2125 	u64 start;
2126 	u64 end;
2127 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2128 	int ret;
2129 
2130 	mutex_lock(&root->fs_info->pinned_mutex);
2131 	while (1) {
2132 		ret = find_first_extent_bit(pinned_extents, last,
2133 					    &start, &end, EXTENT_DIRTY);
2134 		if (ret)
2135 			break;
2136 		set_extent_dirty(copy, start, end, GFP_NOFS);
2137 		last = end + 1;
2138 	}
2139 	mutex_unlock(&root->fs_info->pinned_mutex);
2140 	return 0;
2141 }
2142 
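/*
 * after a commit, walk the 'unpin' copy of the pinned extents: discard each
 * range, unpin it (returning the space to the free space cache) and clear it
 * from the copy.
 */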
2143 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2144 			       struct btrfs_root *root,
2145 			       struct extent_io_tree *unpin)
2146 {
2147 	u64 start;
2148 	u64 end;
2149 	int ret;
2150 
2151 	while (1) {
2152 		mutex_lock(&root->fs_info->pinned_mutex);
2153 		ret = find_first_extent_bit(unpin, 0, &start, &end,
2154 					    EXTENT_DIRTY);
2155 		if (ret)
2156 			break;
2157 
2158 		ret = btrfs_discard_extent(root, start, end + 1 - start);
2159 
2160 		/* unlocks the pinned mutex */
2161 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2162 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
2163 
2164 		cond_resched();
2165 	}
2166 	mutex_unlock(&root->fs_info->pinned_mutex);
2167 	return ret;
2168 }
2169 
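/*
 * decide what to do with the space backing a freed extent.  A tree block that
 * was allocated in the running transaction, never written, and isn't owned by
 * the log or reloc trees can be reused right away: return 1 with the locked
 * buffer in *must_clean.  Everything else (including all data extents) gets
 * pinned until the transaction commits.
 */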
2170 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2171 			  struct btrfs_root *root,
2172 			  struct btrfs_path *path,
2173 			  u64 bytenr, u64 num_bytes, int is_data,
2174 			  struct extent_buffer **must_clean)
2175 {
2176 	int err = 0;
2177 	struct extent_buffer *buf;
2178 
2179 	if (is_data)
2180 		goto pinit;
2181 
2182 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2183 	if (!buf)
2184 		goto pinit;
2185 
2186 	/* we can reuse a block if it hasn't been written
2187 	 * and it is from this transaction.  We can't
2188 	 * reuse anything from the tree log root because
2189 	 * it has tiny sub-transactions.
2190 	 */
2191 	if (btrfs_buffer_uptodate(buf, 0) &&
2192 	    btrfs_try_tree_lock(buf)) {
2193 		u64 header_owner = btrfs_header_owner(buf);
2194 		u64 header_transid = btrfs_header_generation(buf);
2195 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2196 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2197 		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2198 		    header_transid == trans->transid &&
2199 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2200 			*must_clean = buf;
2201 			return 1;
2202 		}
2203 		btrfs_tree_unlock(buf);
2204 	}
2205 	free_extent_buffer(buf);
2206 pinit:
2207 	btrfs_set_path_blocking(path);
2208 	mutex_lock(&root->fs_info->pinned_mutex);
2209 	/* unlocks the pinned mutex */
2210 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2211 
2212 	BUG_ON(err < 0);
2213 	return 0;
2214 }
2215 
2216 /*
2217  * remove an extent from the root, returns 0 on success
2218  */
2219 static int __free_extent(struct btrfs_trans_handle *trans,
2220 			 struct btrfs_root *root,
2221 			 u64 bytenr, u64 num_bytes, u64 parent,
2222 			 u64 root_objectid, u64 ref_generation,
2223 			 u64 owner_objectid, int pin, int mark_free,
2224 			 int refs_to_drop)
2225 {
2226 	struct btrfs_path *path;
2227 	struct btrfs_key key;
2228 	struct btrfs_fs_info *info = root->fs_info;
2229 	struct btrfs_root *extent_root = info->extent_root;
2230 	struct extent_buffer *leaf;
2231 	int ret;
2232 	int extent_slot = 0;
2233 	int found_extent = 0;
2234 	int num_to_del = 1;
2235 	struct btrfs_extent_item *ei;
2236 	u32 refs;
2237 
2238 	key.objectid = bytenr;
2239 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2240 	key.offset = num_bytes;
2241 	path = btrfs_alloc_path();
2242 	if (!path)
2243 		return -ENOMEM;
2244 
2245 	path->reada = 1;
2246 	path->leave_spinning = 1;
2247 	ret = lookup_extent_backref(trans, extent_root, path,
2248 				    bytenr, parent, root_objectid,
2249 				    ref_generation, owner_objectid, 1);
2250 	if (ret == 0) {
2251 		struct btrfs_key found_key;
2252 		extent_slot = path->slots[0];
2253 		while (extent_slot > 0) {
2254 			extent_slot--;
2255 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2256 					      extent_slot);
2257 			if (found_key.objectid != bytenr)
2258 				break;
2259 			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2260 			    found_key.offset == num_bytes) {
2261 				found_extent = 1;
2262 				break;
2263 			}
2264 			if (path->slots[0] - extent_slot > 5)
2265 				break;
2266 		}
2267 		if (!found_extent) {
2268 			ret = remove_extent_backref(trans, extent_root, path,
2269 						    refs_to_drop);
2270 			BUG_ON(ret);
2271 			btrfs_release_path(extent_root, path);
2272 			path->leave_spinning = 1;
2273 			ret = btrfs_search_slot(trans, extent_root,
2274 						&key, path, -1, 1);
2275 			if (ret) {
2276 				printk(KERN_ERR "umm, got %d back from search"
2277 				       ", was looking for %llu\n", ret,
2278 				       (unsigned long long)bytenr);
2279 				btrfs_print_leaf(extent_root, path->nodes[0]);
2280 			}
2281 			BUG_ON(ret);
2282 			extent_slot = path->slots[0];
2283 		}
2284 	} else {
2285 		btrfs_print_leaf(extent_root, path->nodes[0]);
2286 		WARN_ON(1);
2287 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2288 		       "parent %llu root %llu gen %llu owner %llu\n",
2289 		       (unsigned long long)bytenr,
2290 		       (unsigned long long)parent,
2291 		       (unsigned long long)root_objectid,
2292 		       (unsigned long long)ref_generation,
2293 		       (unsigned long long)owner_objectid);
2294 	}
2295 
2296 	leaf = path->nodes[0];
2297 	ei = btrfs_item_ptr(leaf, extent_slot,
2298 			    struct btrfs_extent_item);
2299 	refs = btrfs_extent_refs(leaf, ei);
2300 
2301 	/*
2302 	 * we're not allowed to delete the extent item if there
2303 	 * are other delayed ref updates pending
2304 	 */
2305 
2306 	BUG_ON(refs < refs_to_drop);
2307 	refs -= refs_to_drop;
2308 	btrfs_set_extent_refs(leaf, ei, refs);
2309 	btrfs_mark_buffer_dirty(leaf);
2310 
2311 	if (refs == 0 && found_extent &&
2312 	    path->slots[0] == extent_slot + 1) {
2313 		struct btrfs_extent_ref *ref;
2314 		ref = btrfs_item_ptr(leaf, path->slots[0],
2315 				     struct btrfs_extent_ref);
2316 		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2317 		/* if the back ref and the extent are next to each other
2318 		 * they get deleted below in one shot
2319 		 */
2320 		path->slots[0] = extent_slot;
2321 		num_to_del = 2;
2322 	} else if (found_extent) {
2323 		/* otherwise delete the extent back ref */
2324 		ret = remove_extent_backref(trans, extent_root, path,
2325 					    refs_to_drop);
2326 		BUG_ON(ret);
2327 		/* if refs are 0, we need to setup the path for deletion */
2328 		if (refs == 0) {
2329 			btrfs_release_path(extent_root, path);
2330 			path->leave_spinning = 1;
2331 			ret = btrfs_search_slot(trans, extent_root, &key, path,
2332 						-1, 1);
2333 			BUG_ON(ret);
2334 		}
2335 	}
2336 
2337 	if (refs == 0) {
2338 		u64 super_used;
2339 		u64 root_used;
2340 		struct extent_buffer *must_clean = NULL;
2341 
2342 		if (pin) {
2343 			ret = pin_down_bytes(trans, root, path,
2344 				bytenr, num_bytes,
2345 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2346 				&must_clean);
2347 			if (ret > 0)
2348 				mark_free = 1;
2349 			BUG_ON(ret < 0);
2350 		}
2351 
2352 		/* block accounting for super block */
2353 		spin_lock(&info->delalloc_lock);
2354 		super_used = btrfs_super_bytes_used(&info->super_copy);
2355 		btrfs_set_super_bytes_used(&info->super_copy,
2356 					   super_used - num_bytes);
2357 
2358 		/* block accounting for root item */
2359 		root_used = btrfs_root_used(&root->root_item);
2360 		btrfs_set_root_used(&root->root_item,
2361 					   root_used - num_bytes);
2362 		spin_unlock(&info->delalloc_lock);
2363 
2364 		/*
2365 		 * it is going to be very rare for someone to be waiting
2366 		 * on the block we're freeing.  del_items might need to
2367 		 * schedule, so rather than get fancy, just force it
2368 		 * to blocking here
2369 		 */
2370 		if (must_clean)
2371 			btrfs_set_lock_blocking(must_clean);
2372 
2373 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2374 				      num_to_del);
2375 		BUG_ON(ret);
2376 		btrfs_release_path(extent_root, path);
2377 
2378 		if (must_clean) {
2379 			clean_tree_block(NULL, root, must_clean);
2380 			btrfs_tree_unlock(must_clean);
2381 			free_extent_buffer(must_clean);
2382 		}
2383 
2384 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2385 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2386 			BUG_ON(ret);
2387 		} else {
2388 			invalidate_mapping_pages(info->btree_inode->i_mapping,
2389 			     bytenr >> PAGE_CACHE_SHIFT,
2390 			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2391 		}
2392 
2393 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2394 					 mark_free);
2395 		BUG_ON(ret);
2396 	}
2397 	btrfs_free_path(path);
2398 	return ret;
2399 }
2400 
2401 /*
2402  * remove an extent from the root, returns 0 on success
2403  */
2404 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2405 					struct btrfs_root *root,
2406 					u64 bytenr, u64 num_bytes, u64 parent,
2407 					u64 root_objectid, u64 ref_generation,
2408 					u64 owner_objectid, int pin,
2409 					int refs_to_drop)
2410 {
2411 	WARN_ON(num_bytes < root->sectorsize);
2412 
2413 	/*
2414 	 * if this is metadata, always pin it.  if this is data, only pin it when
2415 	 * an earlier transaction may have committed it (ref_generation != trans->transid)
2416 	 */
2417 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2418 	    ref_generation != trans->transid)
2419 		pin = 1;
2420 
2423 
2424 	return __free_extent(trans, root, bytenr, num_bytes, parent,
2425 			    root_objectid, ref_generation,
2426 			    owner_objectid, pin, pin == 0, refs_to_drop);
2427 }
2428 
2429 /*
2430  * when we free an extent, it is possible (and likely) that we free the last
2431  * delayed ref for that extent as well.  This searches the delayed ref tree for
2432  * a given extent, and if there are no other delayed refs to be processed, it
2433  * removes it from the tree.
2434  */
2435 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2436 				      struct btrfs_root *root, u64 bytenr)
2437 {
2438 	struct btrfs_delayed_ref_head *head;
2439 	struct btrfs_delayed_ref_root *delayed_refs;
2440 	struct btrfs_delayed_ref_node *ref;
2441 	struct rb_node *node;
2442 	int ret;
2443 
2444 	delayed_refs = &trans->transaction->delayed_refs;
2445 	spin_lock(&delayed_refs->lock);
2446 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2447 	if (!head)
2448 		goto out;
2449 
2450 	node = rb_prev(&head->node.rb_node);
2451 	if (!node)
2452 		goto out;
2453 
2454 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2455 
2456 	/* there are still entries for this ref, we can't drop it */
2457 	if (ref->bytenr == bytenr)
2458 		goto out;
2459 
2460 	/*
2461 	 * waiting for the lock here would deadlock.  If someone else has it
2462 	 * locked they are already in the process of dropping it anyway
2463 	 */
2464 	if (!mutex_trylock(&head->mutex))
2465 		goto out;
2466 
2467 	/*
2468 	 * at this point we have a head with no other entries.  Go
2469 	 * ahead and process it.
2470 	 */
2471 	head->node.in_tree = 0;
2472 	rb_erase(&head->node.rb_node, &delayed_refs->root);
2473 
2474 	delayed_refs->num_entries--;
2475 
2476 	/*
2477 	 * we don't take a ref on the node because we're removing it from the
2478 	 * tree, so we just steal the ref the tree was holding.
2479 	 */
2480 	delayed_refs->num_heads--;
2481 	if (list_empty(&head->cluster))
2482 		delayed_refs->num_heads_ready--;
2483 
2484 	list_del_init(&head->cluster);
2485 	spin_unlock(&delayed_refs->lock);
2486 
2487 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2488 				  &head->node, head->must_insert_reserved);
2489 	BUG_ON(ret);
2490 	btrfs_put_delayed_ref(&head->node);
2491 	return 0;
2492 out:
2493 	spin_unlock(&delayed_refs->lock);
2494 	return 0;
2495 }
2496 
2497 int btrfs_free_extent(struct btrfs_trans_handle *trans,
2498 		      struct btrfs_root *root,
2499 		      u64 bytenr, u64 num_bytes, u64 parent,
2500 		      u64 root_objectid, u64 ref_generation,
2501 		      u64 owner_objectid, int pin)
2502 {
2503 	int ret;
2504 
2505 	/*
2506 	 * tree log blocks never actually go into the extent allocation
2507 	 * tree, just update pinning info and exit early.
2508 	 *
2509 	 * data extents referenced by the tree log do need to have
2510 	 * their reference counts bumped.
2511 	 */
2512 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2513 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2514 		mutex_lock(&root->fs_info->pinned_mutex);
2515 
2516 		/* unlocks the pinned mutex */
2517 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2518 		update_reserved_extents(root, bytenr, num_bytes, 0);
2519 		ret = 0;
2520 	} else {
2521 		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2522 				       root_objectid, ref_generation,
2523 				       owner_objectid,
2524 				       BTRFS_DROP_DELAYED_REF, 1);
2525 		BUG_ON(ret);
2526 		ret = check_ref_cleanup(trans, root, bytenr);
2527 		BUG_ON(ret);
2528 	}
2529 	return ret;
2530 }
2531 
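/*
 * round 'val' up to the next stripe boundary, e.g. with a 64K stripesize a
 * value of 65537 becomes 131072.
 */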
2532 static u64 stripe_align(struct btrfs_root *root, u64 val)
2533 {
2534 	u64 mask = ((u64)root->stripesize - 1);
2535 	u64 ret = (val + mask) & ~mask;
2536 	return ret;
2537 }
2538 
2539 /*
2540  * walks the btree of allocated extents and finds a hole of a given size.
2541  * The key ins is changed to record the hole:
2542  * ins->objectid == block start
2543  * ins->flags = BTRFS_EXTENT_ITEM_KEY
2544  * ins->offset == number of blocks
2545  * Any available blocks before search_start are skipped.
2546  */
2547 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2548 				     struct btrfs_root *orig_root,
2549 				     u64 num_bytes, u64 empty_size,
2550 				     u64 search_start, u64 search_end,
2551 				     u64 hint_byte, struct btrfs_key *ins,
2552 				     u64 exclude_start, u64 exclude_nr,
2553 				     int data)
2554 {
2555 	int ret = 0;
2556 	struct btrfs_root *root = orig_root->fs_info->extent_root;
2557 	u64 total_needed = num_bytes;
2558 	u64 *last_ptr = NULL;
2559 	u64 last_wanted = 0;
2560 	struct btrfs_block_group_cache *block_group = NULL;
2561 	int chunk_alloc_done = 0;
2562 	int empty_cluster = 2 * 1024 * 1024;
2563 	int allowed_chunk_alloc = 0;
2564 	struct list_head *head = NULL, *cur = NULL;
2565 	int loop = 0;
2566 	int extra_loop = 0;
2567 	struct btrfs_space_info *space_info;
2568 
2569 	WARN_ON(num_bytes < root->sectorsize);
2570 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2571 	ins->objectid = 0;
2572 	ins->offset = 0;
2573 
2574 	if (orig_root->ref_cows || empty_size)
2575 		allowed_chunk_alloc = 1;
2576 
2577 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
2578 		last_ptr = &root->fs_info->last_alloc;
2579 		if (!btrfs_test_opt(root, SSD))
2580 			empty_cluster = 64 * 1024;
2581 	}
2582 
2583 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2584 		last_ptr = &root->fs_info->last_data_alloc;
2585 
2586 	if (last_ptr) {
2587 		if (*last_ptr) {
2588 			hint_byte = *last_ptr;
2589 			last_wanted = *last_ptr;
2590 		} else
2591 			empty_size += empty_cluster;
2592 	} else {
2593 		empty_cluster = 0;
2594 	}
2595 	search_start = max(search_start, first_logical_byte(root, 0));
2596 	search_start = max(search_start, hint_byte);
2597 
2598 	if (last_wanted && search_start != last_wanted) {
2599 		last_wanted = 0;
2600 		empty_size += empty_cluster;
2601 	}
2602 
2603 	total_needed += empty_size;
2604 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2605 	if (!block_group)
2606 		block_group = btrfs_lookup_first_block_group(root->fs_info,
2607 							     search_start);
2608 	space_info = __find_space_info(root->fs_info, data);
2609 
2610 	down_read(&space_info->groups_sem);
2611 	while (1) {
2612 		struct btrfs_free_space *free_space;
2613 		/*
2614 		 * the only way this happens is if our hint points to a block
2615 		 * group that's not of the proper type; while looping this
2616 		 * should never happen
2617 		 */
2618 		if (empty_size)
2619 			extra_loop = 1;
2620 
2621 		if (!block_group)
2622 			goto new_group_no_lock;
2623 
2624 		if (unlikely(!block_group->cached)) {
2625 			mutex_lock(&block_group->cache_mutex);
2626 			ret = cache_block_group(root, block_group);
2627 			mutex_unlock(&block_group->cache_mutex);
2628 			if (ret)
2629 				break;
2630 		}
2631 
2632 		mutex_lock(&block_group->alloc_mutex);
2633 		if (unlikely(!block_group_bits(block_group, data)))
2634 			goto new_group;
2635 
2636 		if (unlikely(block_group->ro))
2637 			goto new_group;
2638 
2639 		free_space = btrfs_find_free_space(block_group, search_start,
2640 						   total_needed);
2641 		if (free_space) {
2642 			u64 start = block_group->key.objectid;
2643 			u64 end = block_group->key.objectid +
2644 				block_group->key.offset;
2645 
2646 			search_start = stripe_align(root, free_space->offset);
2647 
2648 			/* move on to the next group */
2649 			if (search_start + num_bytes >= search_end)
2650 				goto new_group;
2651 
2652 			/* move on to the next group */
2653 			if (search_start + num_bytes > end)
2654 				goto new_group;
2655 
2656 			if (last_wanted && search_start != last_wanted) {
2657 				total_needed += empty_cluster;
2658 				empty_size += empty_cluster;
2659 				last_wanted = 0;
2660 				/*
2661 				 * if search_start is still in this block group
2662 				 * then we just re-search this block group
2663 				 */
2664 				if (search_start >= start &&
2665 				    search_start < end) {
2666 					mutex_unlock(&block_group->alloc_mutex);
2667 					continue;
2668 				}
2669 
2670 				/* else we go to the next block group */
2671 				goto new_group;
2672 			}
2673 
2674 			if (exclude_nr > 0 &&
2675 			    (search_start + num_bytes > exclude_start &&
2676 			     search_start < exclude_start + exclude_nr)) {
2677 				search_start = exclude_start + exclude_nr;
2678 				/*
2679 				 * if search_start is still in this block group
2680 				 * then we just re-search this block group
2681 				 */
2682 				if (search_start >= start &&
2683 				    search_start < end) {
2684 					mutex_unlock(&block_group->alloc_mutex);
2685 					last_wanted = 0;
2686 					continue;
2687 				}
2688 
2689 				/* else we go to the next block group */
2690 				goto new_group;
2691 			}
2692 
2693 			ins->objectid = search_start;
2694 			ins->offset = num_bytes;
2695 
2696 			btrfs_remove_free_space_lock(block_group, search_start,
2697 						     num_bytes);
2698 			/* we are all good, lets return */
2699 			mutex_unlock(&block_group->alloc_mutex);
2700 			break;
2701 		}
2702 new_group:
2703 		mutex_unlock(&block_group->alloc_mutex);
2704 		put_block_group(block_group);
2705 		block_group = NULL;
2706 new_group_no_lock:
2707 		/* don't try to compare new allocations against the
2708 		 * last allocation any more
2709 		 */
2710 		last_wanted = 0;
2711 
2712 		/*
2713 		 * Here's how this works.
2714 		 * loop == 0: we were searching a block group via a hint
2715 		 *		and didn't find anything, so we start at
2716 		 *		the head of the block groups and keep searching
2717 		 * loop == 1: we're searching through all of the block groups
2718 		 *		if we hit the head again we have searched
2719 		 *		all of the block groups for this space and we
2720 	 *		need to try to allocate; if we can't, error out.
2721 		 * loop == 2: we allocated more space and are looping through
2722 		 *		all of the block groups again.
2723 		 */
2724 		if (loop == 0) {
2725 			head = &space_info->block_groups;
2726 			cur = head->next;
2727 			loop++;
2728 		} else if (loop == 1 && cur == head) {
2729 			int keep_going;
2730 
2731 			/* at this point we give up on the empty_size
2732 			 * allocations and just try to allocate the min
2733 			 * space.
2734 			 *
2735 			 * The extra_loop field was set if an empty_size
2736 			 * allocation was attempted above, and if this
2737 			 * is set we need to try the loop again without
2738 			 * the additional empty_size.
2739 			 */
2740 			total_needed -= empty_size;
2741 			empty_size = 0;
2742 			keep_going = extra_loop;
2743 			loop++;
2744 
2745 			if (allowed_chunk_alloc && !chunk_alloc_done) {
2746 				up_read(&space_info->groups_sem);
2747 				ret = do_chunk_alloc(trans, root, num_bytes +
2748 						     2 * 1024 * 1024, data, 1);
2749 				down_read(&space_info->groups_sem);
2750 				if (ret < 0)
2751 					goto loop_check;
2752 				head = &space_info->block_groups;
2753 				/*
2754 				 * we've allocated a new chunk, keep
2755 				 * trying
2756 				 */
2757 				keep_going = 1;
2758 				chunk_alloc_done = 1;
2759 			} else if (!allowed_chunk_alloc) {
2760 				space_info->force_alloc = 1;
2761 			}
2762 loop_check:
2763 			if (keep_going) {
2764 				cur = head->next;
2765 				extra_loop = 0;
2766 			} else {
2767 				break;
2768 			}
2769 		} else if (cur == head) {
2770 			break;
2771 		}
2772 
2773 		block_group = list_entry(cur, struct btrfs_block_group_cache,
2774 					 list);
2775 		atomic_inc(&block_group->count);
2776 
2777 		search_start = block_group->key.objectid;
2778 		cur = cur->next;
2779 	}
2780 
2781 	/* we found what we needed */
2782 	if (ins->objectid) {
2783 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
2784 			trans->block_group = block_group->key.objectid;
2785 
2786 		if (last_ptr)
2787 			*last_ptr = ins->objectid + ins->offset;
2788 		ret = 0;
2789 	} else if (!ret) {
2790 		printk(KERN_ERR "btrfs searching for %llu bytes, "
2791 		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
2792 		       (unsigned long long)total_needed,
2793 		       (unsigned long long)num_bytes,
2794 		       loop, allowed_chunk_alloc);
2795 		ret = -ENOSPC;
2796 	}
2797 	if (block_group)
2798 		put_block_group(block_group);
2799 
2800 	up_read(&space_info->groups_sem);
2801 	return ret;
2802 }
2803 
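/*
 * debugging helper for allocation failures: print the counters of a
 * space_info and the free space state of each of its block groups.
 */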
2804 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2805 {
2806 	struct btrfs_block_group_cache *cache;
2807 
2808 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
2809 	       (unsigned long long)(info->total_bytes - info->bytes_used -
2810 				    info->bytes_pinned - info->bytes_reserved),
2811 	       (info->full) ? "" : "not ");
2812 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
2813 	       " may_use=%llu, used=%llu\n", info->total_bytes,
2814 	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
2815 	       info->bytes_used);
2816 
2817 	down_read(&info->groups_sem);
2818 	list_for_each_entry(cache, &info->block_groups, list) {
2819 		spin_lock(&cache->lock);
2820 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
2821 		       "%llu pinned %llu reserved\n",
2822 		       (unsigned long long)cache->key.objectid,
2823 		       (unsigned long long)cache->key.offset,
2824 		       (unsigned long long)btrfs_block_group_used(&cache->item),
2825 		       (unsigned long long)cache->pinned,
2826 		       (unsigned long long)cache->reserved);
2827 		btrfs_dump_free_space(cache, bytes);
2828 		spin_unlock(&cache->lock);
2829 	}
2830 	up_read(&info->groups_sem);
2831 }
2832 
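/*
 * reserve an extent of at least min_alloc_size bytes.  Chunk allocation may
 * be attempted up front, and on -ENOSPC the request is halved (never below
 * min_alloc_size), another chunk allocation is forced, and the search is
 * retried.  If the minimum size still can't be found we dump the space info
 * and BUG.
 */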
2833 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2834 				  struct btrfs_root *root,
2835 				  u64 num_bytes, u64 min_alloc_size,
2836 				  u64 empty_size, u64 hint_byte,
2837 				  u64 search_end, struct btrfs_key *ins,
2838 				  u64 data)
2839 {
2840 	int ret;
2841 	u64 search_start = 0;
2842 	struct btrfs_fs_info *info = root->fs_info;
2843 
2844 	data = btrfs_get_alloc_profile(root, data);
2845 again:
2846 	/*
2847 	 * the only place that sets empty_size is btrfs_realloc_node, which
2848 	 * is not called recursively on allocations
2849 	 */
2850 	if (empty_size || root->ref_cows) {
2851 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2852 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2853 				     2 * 1024 * 1024,
2854 				     BTRFS_BLOCK_GROUP_METADATA |
2855 				     (info->metadata_alloc_profile &
2856 				      info->avail_metadata_alloc_bits), 0);
2857 		}
2858 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2859 				     num_bytes + 2 * 1024 * 1024, data, 0);
2860 	}
2861 
2862 	WARN_ON(num_bytes < root->sectorsize);
2863 	ret = find_free_extent(trans, root, num_bytes, empty_size,
2864 			       search_start, search_end, hint_byte, ins,
2865 			       trans->alloc_exclude_start,
2866 			       trans->alloc_exclude_nr, data);
2867 
2868 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2869 		num_bytes = num_bytes >> 1;
2870 		num_bytes = num_bytes & ~(root->sectorsize - 1);
2871 		num_bytes = max(num_bytes, min_alloc_size);
2872 		do_chunk_alloc(trans, root->fs_info->extent_root,
2873 			       num_bytes, data, 1);
2874 		goto again;
2875 	}
2876 	if (ret) {
2877 		struct btrfs_space_info *sinfo;
2878 
2879 		sinfo = __find_space_info(root->fs_info, data);
2880 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
2881 		       "wanted %llu\n", (unsigned long long)data,
2882 		       (unsigned long long)num_bytes);
2883 		dump_space_info(sinfo, num_bytes);
2884 		BUG();
2885 	}
2886 
2887 	return ret;
2888 }
2889 
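/*
 * give back an extent that was reserved but ended up unused: discard it,
 * return it to the free space cache and drop its reservation.
 */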
2890 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2891 {
2892 	struct btrfs_block_group_cache *cache;
2893 	int ret = 0;
2894 
2895 	cache = btrfs_lookup_block_group(root->fs_info, start);
2896 	if (!cache) {
2897 		printk(KERN_ERR "Unable to find block group for %llu\n",
2898 		       (unsigned long long)start);
2899 		return -ENOSPC;
2900 	}
2901 
2902 	ret = btrfs_discard_extent(root, start, len);
2903 
2904 	btrfs_add_free_space(cache, start, len);
2905 	put_block_group(cache);
2906 	update_reserved_extents(root, start, len, 0);
2907 
2908 	return ret;
2909 }
2910 
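/*
 * reserve an extent for the caller: find the space and mark it reserved in
 * the block group accounting.  The extent item itself is inserted later,
 * via the delayed ref code.
 */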
2911 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2912 				  struct btrfs_root *root,
2913 				  u64 num_bytes, u64 min_alloc_size,
2914 				  u64 empty_size, u64 hint_byte,
2915 				  u64 search_end, struct btrfs_key *ins,
2916 				  u64 data)
2917 {
2918 	int ret;
2919 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2920 				     empty_size, hint_byte, search_end, ins,
2921 				     data);
2922 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
2923 	return ret;
2924 }
2925 
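/*
 * insert the extent item and its initial backref for an extent that was
 * reserved earlier, update the super block and root item byte counts, and
 * account the bytes as used in the block group.
 */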
2926 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2927 					 struct btrfs_root *root, u64 parent,
2928 					 u64 root_objectid, u64 ref_generation,
2929 					 u64 owner, struct btrfs_key *ins,
2930 					 int ref_mod)
2931 {
2932 	int ret;
2933 	u64 super_used;
2934 	u64 root_used;
2935 	u64 num_bytes = ins->offset;
2936 	u32 sizes[2];
2937 	struct btrfs_fs_info *info = root->fs_info;
2938 	struct btrfs_root *extent_root = info->extent_root;
2939 	struct btrfs_extent_item *extent_item;
2940 	struct btrfs_extent_ref *ref;
2941 	struct btrfs_path *path;
2942 	struct btrfs_key keys[2];
2943 
2944 	if (parent == 0)
2945 		parent = ins->objectid;
2946 
2947 	/* block accounting for super block */
2948 	spin_lock(&info->delalloc_lock);
2949 	super_used = btrfs_super_bytes_used(&info->super_copy);
2950 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2951 
2952 	/* block accounting for root item */
2953 	root_used = btrfs_root_used(&root->root_item);
2954 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2955 	spin_unlock(&info->delalloc_lock);
2956 
2957 	memcpy(&keys[0], ins, sizeof(*ins));
2958 	keys[1].objectid = ins->objectid;
2959 	keys[1].type = BTRFS_EXTENT_REF_KEY;
2960 	keys[1].offset = parent;
2961 	sizes[0] = sizeof(*extent_item);
2962 	sizes[1] = sizeof(*ref);
2963 
2964 	path = btrfs_alloc_path();
2965 	BUG_ON(!path);
2966 
2967 	path->leave_spinning = 1;
2968 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2969 				       sizes, 2);
2970 	BUG_ON(ret);
2971 
2972 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2973 				     struct btrfs_extent_item);
2974 	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
2975 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2976 			     struct btrfs_extent_ref);
2977 
2978 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2979 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2980 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2981 	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
2982 
2983 	btrfs_mark_buffer_dirty(path->nodes[0]);
2984 
2985 	trans->alloc_exclude_start = 0;
2986 	trans->alloc_exclude_nr = 0;
2987 	btrfs_free_path(path);
2988 
2989 	if (ret)
2990 		goto out;
2991 
2992 	ret = update_block_group(trans, root, ins->objectid,
2993 				 ins->offset, 1, 0);
2994 	if (ret) {
2995 		printk(KERN_ERR "btrfs update block group failed for %llu "
2996 		       "%llu\n", (unsigned long long)ins->objectid,
2997 		       (unsigned long long)ins->offset);
2998 		BUG();
2999 	}
3000 out:
3001 	return ret;
3002 }
3003 
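/*
 * record a reserved extent by queueing a delayed ref for it.  Tree log
 * extents are skipped here because log blocks never go into the extent
 * allocation tree.
 */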
3004 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3005 				struct btrfs_root *root, u64 parent,
3006 				u64 root_objectid, u64 ref_generation,
3007 				u64 owner, struct btrfs_key *ins)
3008 {
3009 	int ret;
3010 
3011 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3012 		return 0;
3013 
3014 	ret = btrfs_add_delayed_ref(trans, ins->objectid,
3015 				    ins->offset, parent, root_objectid,
3016 				    ref_generation, owner,
3017 				    BTRFS_ADD_DELAYED_EXTENT, 0);
3018 	BUG_ON(ret);
3019 	return ret;
3020 }
3021 
3022 /*
3023  * this is used by the tree logging recovery code.  It records that
3024  * an extent has been allocated and makes sure to clear the free
3025  * space cache bits as well
3026  */
3027 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3028 				struct btrfs_root *root, u64 parent,
3029 				u64 root_objectid, u64 ref_generation,
3030 				u64 owner, struct btrfs_key *ins)
3031 {
3032 	int ret;
3033 	struct btrfs_block_group_cache *block_group;
3034 
3035 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3036 	mutex_lock(&block_group->cache_mutex);
3037 	cache_block_group(root, block_group);
3038 	mutex_unlock(&block_group->cache_mutex);
3039 
3040 	ret = btrfs_remove_free_space(block_group, ins->objectid,
3041 				      ins->offset);
3042 	BUG_ON(ret);
3043 	put_block_group(block_group);
3044 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3045 					    ref_generation, owner, ins, 1);
3046 	return ret;
3047 }
3048 
3049 /*
3050  * finds a free extent and does all the dirty work required for allocation:
3051  * the extent is reserved, a delayed ref is queued for it (except for tree
3052  * log extents), and the chosen location is returned through ins.
3053  *
3054  * returns 0 if everything worked, non-zero otherwise.
3055  */
3056 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3057 		       struct btrfs_root *root,
3058 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
3059 		       u64 root_objectid, u64 ref_generation,
3060 		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
3061 		       u64 search_end, struct btrfs_key *ins, u64 data)
3062 {
3063 	int ret;
3064 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
3065 				     min_alloc_size, empty_size, hint_byte,
3066 				     search_end, ins, data);
3067 	BUG_ON(ret);
3068 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3069 		ret = btrfs_add_delayed_ref(trans, ins->objectid,
3070 					    ins->offset, parent, root_objectid,
3071 					    ref_generation, owner_objectid,
3072 					    BTRFS_ADD_DELAYED_EXTENT, 0);
3073 		BUG_ON(ret);
3074 	}
3075 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3076 	return ret;
3077 }
3078 
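/*
 * set up a freshly allocated tree block: stamp the transid, lock it, mark it
 * uptodate and dirty in the proper io_tree (log vs. transaction) and return
 * it locked (blocking) for the caller.
 */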
3079 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3080 					    struct btrfs_root *root,
3081 					    u64 bytenr, u32 blocksize,
3082 					    int level)
3083 {
3084 	struct extent_buffer *buf;
3085 
3086 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3087 	if (!buf)
3088 		return ERR_PTR(-ENOMEM);
3089 	btrfs_set_header_generation(buf, trans->transid);
3090 	btrfs_set_buffer_lockdep_class(buf, level);
3091 	btrfs_tree_lock(buf);
3092 	clean_tree_block(trans, root, buf);
3093 
3094 	btrfs_set_lock_blocking(buf);
3095 	btrfs_set_buffer_uptodate(buf);
3096 
3097 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3098 		set_extent_dirty(&root->dirty_log_pages, buf->start,
3099 			 buf->start + buf->len - 1, GFP_NOFS);
3100 	} else {
3101 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3102 			 buf->start + buf->len - 1, GFP_NOFS);
3103 	}
3104 	trans->blocks_used++;
3105 	/* this returns a buffer locked for blocking */
3106 	return buf;
3107 }
3108 
3109 /*
3110  * helper function to allocate a block for a given tree
3111  * returns the tree buffer or an ERR_PTR on failure.
3112  */
3113 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3114 					     struct btrfs_root *root,
3115 					     u32 blocksize, u64 parent,
3116 					     u64 root_objectid,
3117 					     u64 ref_generation,
3118 					     int level,
3119 					     u64 hint,
3120 					     u64 empty_size)
3121 {
3122 	struct btrfs_key ins;
3123 	int ret;
3124 	struct extent_buffer *buf;
3125 
3126 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3127 				 root_objectid, ref_generation, level,
3128 				 empty_size, hint, (u64)-1, &ins, 0);
3129 	if (ret) {
3130 		BUG_ON(ret > 0);
3131 		return ERR_PTR(ret);
3132 	}
3133 
3134 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3135 				    blocksize, level);
3136 	return buf;
3137 }
3138 
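/*
 * drop the extent references held by every file extent item in 'leaf'.  The
 * references are collected first, sorted by disk bytenr and then freed in
 * order so the extent allocation tree is walked mostly sequentially.
 */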
3139 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3140 			struct btrfs_root *root, struct extent_buffer *leaf)
3141 {
3142 	u64 leaf_owner;
3143 	u64 leaf_generation;
3144 	struct refsort *sorted;
3145 	struct btrfs_key key;
3146 	struct btrfs_file_extent_item *fi;
3147 	int i;
3148 	int nritems;
3149 	int ret;
3150 	int refi = 0;
3151 	int slot;
3152 
3153 	BUG_ON(!btrfs_is_leaf(leaf));
3154 	nritems = btrfs_header_nritems(leaf);
3155 	leaf_owner = btrfs_header_owner(leaf);
3156 	leaf_generation = btrfs_header_generation(leaf);
3157 
3158 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3159 	/* we do this loop twice.  The first time we build a list
3160 	 * of the extents we have a reference on, then we sort the list
3161 	 * by bytenr.  The second time around we actually do the
3162 	 * extent freeing.
3163 	 */
3164 	for (i = 0; i < nritems; i++) {
3165 		u64 disk_bytenr;
3166 		cond_resched();
3167 
3168 		btrfs_item_key_to_cpu(leaf, &key, i);
3169 
3170 		/* only extents have references, skip everything else */
3171 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3172 			continue;
3173 
3174 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3175 
3176 		/* inline extents live in the btree, they don't have refs */
3177 		if (btrfs_file_extent_type(leaf, fi) ==
3178 		    BTRFS_FILE_EXTENT_INLINE)
3179 			continue;
3180 
3181 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3182 
3183 		/* holes don't have refs */
3184 		if (disk_bytenr == 0)
3185 			continue;
3186 
3187 		sorted[refi].bytenr = disk_bytenr;
3188 		sorted[refi].slot = i;
3189 		refi++;
3190 	}
3191 
3192 	if (refi == 0)
3193 		goto out;
3194 
3195 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3196 
3197 	for (i = 0; i < refi; i++) {
3198 		u64 disk_bytenr;
3199 
3200 		disk_bytenr = sorted[i].bytenr;
3201 		slot = sorted[i].slot;
3202 
3203 		cond_resched();
3204 
3205 		btrfs_item_key_to_cpu(leaf, &key, slot);
3206 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3207 			continue;
3208 
3209 		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3210 
3211 		ret = btrfs_free_extent(trans, root, disk_bytenr,
3212 				btrfs_file_extent_disk_num_bytes(leaf, fi),
3213 				leaf->start, leaf_owner, leaf_generation,
3214 				key.objectid, 0);
3215 		BUG_ON(ret);
3216 
3217 		atomic_inc(&root->fs_info->throttle_gen);
3218 		wake_up(&root->fs_info->transaction_throttle);
3219 		cond_resched();
3220 	}
3221 out:
3222 	kfree(sorted);
3223 	return 0;
3224 }
3225 
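/*
 * like btrfs_drop_leaf_ref, but works from a cached btrfs_leaf_ref so the
 * leaf itself never has to be read from disk.
 */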
3226 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3227 					struct btrfs_root *root,
3228 					struct btrfs_leaf_ref *ref)
3229 {
3230 	int i;
3231 	int ret;
3232 	struct btrfs_extent_info *info;
3233 	struct refsort *sorted;
3234 
3235 	if (ref->nritems == 0)
3236 		return 0;
3237 
3238 	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3239 	for (i = 0; i < ref->nritems; i++) {
3240 		sorted[i].bytenr = ref->extents[i].bytenr;
3241 		sorted[i].slot = i;
3242 	}
3243 	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3244 
3245 	/*
3246 	 * the items in the ref were sorted when the ref was inserted
3247 	 * into the ref cache, so this is already in order
3248 	 */
3249 	for (i = 0; i < ref->nritems; i++) {
3250 		info = ref->extents + sorted[i].slot;
3251 		ret = btrfs_free_extent(trans, root, info->bytenr,
3252 					  info->num_bytes, ref->bytenr,
3253 					  ref->owner, ref->generation,
3254 					  info->objectid, 0);
3255 
3256 		atomic_inc(&root->fs_info->throttle_gen);
3257 		wake_up(&root->fs_info->transaction_throttle);
3258 		cond_resched();
3259 
3260 		BUG_ON(ret);
3261 		info++;
3262 	}
3263 
3264 	kfree(sorted);
3265 	return 0;
3266 }
3267 
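/*
 * look up the reference count on an extent while dropping a snapshot; just a
 * thin wrapper around btrfs_lookup_extent_ref plus a cond_resched.
 */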
3268 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3269 				     struct btrfs_root *root, u64 start,
3270 				     u64 len, u32 *refs)
3271 {
3272 	int ret;
3273 
3274 	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3275 	BUG_ON(ret);
3276 
3277 #if 0 /* some debugging code in case we see problems here */
3278 	/* if the refs count is one, it won't get increased again.  But
3279 	 * if the ref count is > 1, someone may be decreasing it at
3280 	 * the same time we are.
3281 	 */
3282 	if (*refs != 1) {
3283 		struct extent_buffer *eb = NULL;
3284 		eb = btrfs_find_create_tree_block(root, start, len);
3285 		if (eb)
3286 			btrfs_tree_lock(eb);
3287 
3288 		mutex_lock(&root->fs_info->alloc_mutex);
3289 		ret = lookup_extent_ref(NULL, root, start, len, refs);
3290 		BUG_ON(ret);
3291 		mutex_unlock(&root->fs_info->alloc_mutex);
3292 
3293 		if (eb) {
3294 			btrfs_tree_unlock(eb);
3295 			free_extent_buffer(eb);
3296 		}
3297 		if (*refs == 1) {
3298 			printk(KERN_ERR "btrfs block %llu went down to one "
3299 			       "during drop_snap\n", (unsigned long long)start);
3300 		}
3301 
3302 	}
3303 #endif
3304 
3305 	cond_resched();
3306 	return ret;
3307 }
3308 
3309 /*
3310  * this is used while deleting old snapshots, and it drops the refs
3311  * on a whole subtree starting from a level 1 node.
3312  *
3313  * The idea is to sort all the leaf pointers, and then drop the
3314  * ref on all the leaves in order.  Most of the time the leaves
3315  * will have ref cache entries, so no leaf IOs will be required to
3316  * find the extents they have references on.
3317  *
3318  * For each leaf, any references it has are also dropped in order
3319  *
3320  * This ends up dropping the references in something close to optimal
3321  * order for reading and modifying the extent allocation tree.
3322  */
3323 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3324 					struct btrfs_root *root,
3325 					struct btrfs_path *path)
3326 {
3327 	u64 bytenr;
3328 	u64 root_owner;
3329 	u64 root_gen;
3330 	struct extent_buffer *eb = path->nodes[1];
3331 	struct extent_buffer *leaf;
3332 	struct btrfs_leaf_ref *ref;
3333 	struct refsort *sorted = NULL;
3334 	int nritems = btrfs_header_nritems(eb);
3335 	int ret;
3336 	int i;
3337 	int refi = 0;
3338 	int slot = path->slots[1];
3339 	u32 blocksize = btrfs_level_size(root, 0);
3340 	u32 refs;
3341 
3342 	if (nritems == 0)
3343 		goto out;
3344 
3345 	root_owner = btrfs_header_owner(eb);
3346 	root_gen = btrfs_header_generation(eb);
3347 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3348 
3349 	/*
3350 	 * step one, sort all the leaf pointers so we don't scribble
3351 	 * randomly into the extent allocation tree
3352 	 */
3353 	for (i = slot; i < nritems; i++) {
3354 		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3355 		sorted[refi].slot = i;
3356 		refi++;
3357 	}
3358 
3359 	/*
3360 	 * nritems won't be zero, but if we're picking up drop_snapshot
3361 	 * after a crash, slot might be > 0, so double check things
3362 	 * just in case.
3363 	 */
3364 	if (refi == 0)
3365 		goto out;
3366 
3367 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3368 
3369 	/*
3370 	 * the first loop frees everything the leaves point to
3371 	 */
3372 	for (i = 0; i < refi; i++) {
3373 		u64 ptr_gen;
3374 
3375 		bytenr = sorted[i].bytenr;
3376 
3377 		/*
3378 		 * check the reference count on this leaf.  If it is > 1
3379 		 * we just decrement it below and don't update any
3380 		 * of the refs the leaf points to.
3381 		 */
3382 		ret = drop_snap_lookup_refcount(trans, root, bytenr,
3383 						blocksize, &refs);
3384 		BUG_ON(ret);
3385 		if (refs != 1)
3386 			continue;
3387 
3388 		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3389 
3390 		/*
3391 		 * the leaf only had one reference, which means the
3392 		 * only thing pointing to this leaf is the snapshot
3393 		 * we're deleting.  It isn't possible for the reference
3394 		 * count to increase again later
3395 		 *
3396 		 * The reference cache is checked for the leaf,
3397 		 * and if found we'll be able to drop any refs held by
3398 		 * the leaf without needing to read it in.
3399 		 */
3400 		ref = btrfs_lookup_leaf_ref(root, bytenr);
3401 		if (ref && ref->generation != ptr_gen) {
3402 			btrfs_free_leaf_ref(root, ref);
3403 			ref = NULL;
3404 		}
3405 		if (ref) {
3406 			ret = cache_drop_leaf_ref(trans, root, ref);
3407 			BUG_ON(ret);
3408 			btrfs_remove_leaf_ref(root, ref);
3409 			btrfs_free_leaf_ref(root, ref);
3410 		} else {
3411 			/*
3412 			 * the leaf wasn't in the reference cache, so
3413 			 * we have to read it.
3414 			 */
3415 			leaf = read_tree_block(root, bytenr, blocksize,
3416 					       ptr_gen);
3417 			ret = btrfs_drop_leaf_ref(trans, root, leaf);
3418 			BUG_ON(ret);
3419 			free_extent_buffer(leaf);
3420 		}
3421 		atomic_inc(&root->fs_info->throttle_gen);
3422 		wake_up(&root->fs_info->transaction_throttle);
3423 		cond_resched();
3424 	}
3425 
3426 	/*
3427 	 * run through the loop again to free the refs on the leaves.
3428 	 * This is faster than doing it in the loop above because
3429 	 * the leaves are likely to be clustered together.  We end up
3430 	 * working in nice chunks on the extent allocation tree.
3431 	 */
3432 	for (i = 0; i < refi; i++) {
3433 		bytenr = sorted[i].bytenr;
3434 		ret = btrfs_free_extent(trans, root, bytenr,
3435 					blocksize, eb->start,
3436 					root_owner, root_gen, 0, 1);
3437 		BUG_ON(ret);
3438 
3439 		atomic_inc(&root->fs_info->throttle_gen);
3440 		wake_up(&root->fs_info->transaction_throttle);
3441 		cond_resched();
3442 	}
3443 out:
3444 	kfree(sorted);
3445 
3446 	/*
3447 	 * update the path to show we've processed the entire level 1
3448 	 * node.  This will get saved into the root's drop_snapshot_progress
3449 	 * field so these drops are not repeated again if this transaction
3450 	 * commits.
3451 	 */
3452 	path->slots[1] = nritems;
3453 	return 0;
3454 }
3455 
3456 /*
3457  * helper function for drop_snapshot, this walks down the tree dropping ref
3458  * counts as it goes.
3459  */
3460 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3461 				   struct btrfs_root *root,
3462 				   struct btrfs_path *path, int *level)
3463 {
3464 	u64 root_owner;
3465 	u64 root_gen;
3466 	u64 bytenr;
3467 	u64 ptr_gen;
3468 	struct extent_buffer *next;
3469 	struct extent_buffer *cur;
3470 	struct extent_buffer *parent;
3471 	u32 blocksize;
3472 	int ret;
3473 	u32 refs;
3474 
3475 	WARN_ON(*level < 0);
3476 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
3477 	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3478 				path->nodes[*level]->len, &refs);
3479 	BUG_ON(ret);
3480 	if (refs > 1)
3481 		goto out;
3482 
3483 	/*
3484 	 * walk down to the last node level and free all the leaves
3485 	 */
3486 	while (*level >= 0) {
3487 		WARN_ON(*level < 0);
3488 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
3489 		cur = path->nodes[*level];
3490 
3491 		if (btrfs_header_level(cur) != *level)
3492 			WARN_ON(1);
3493 
3494 		if (path->slots[*level] >=
3495 		    btrfs_header_nritems(cur))
3496 			break;
3497 
3498 		/* the new code goes down to level 1 and does all the
3499 		 * leaves pointed to by that node in bulk.  So, this check
3500 		 * for level 0 will always be false.
3501 		 *
3502 		 * But, the disk format allows the drop_snapshot_progress
3503 		 * field in the root to leave things in a state where
3504 		 * a leaf will need cleaning up here.  If someone crashes
3505 		 * with the old code and then boots with the new code,
3506 		 * we might find a leaf here.
3507 		 */
3508 		if (*level == 0) {
3509 			ret = btrfs_drop_leaf_ref(trans, root, cur);
3510 			BUG_ON(ret);
3511 			break;
3512 		}
3513 
3514 		/*
3515 		 * once we get to level one, process the whole node
3516 		 * at once, including everything below it.
3517 		 */
3518 		if (*level == 1) {
3519 			ret = drop_level_one_refs(trans, root, path);
3520 			BUG_ON(ret);
3521 			break;
3522 		}
3523 
3524 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3525 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3526 		blocksize = btrfs_level_size(root, *level - 1);
3527 
3528 		ret = drop_snap_lookup_refcount(trans, root, bytenr,
3529 						blocksize, &refs);
3530 		BUG_ON(ret);
3531 
3532 		/*
3533 		 * if there is more than one reference, we don't need
3534 		 * to read that node to drop any references it has.  We
3535 		 * just drop the ref we hold on that node and move on to the
3536 		 * next slot in this level.
3537 		 */
3538 		if (refs != 1) {
3539 			parent = path->nodes[*level];
3540 			root_owner = btrfs_header_owner(parent);
3541 			root_gen = btrfs_header_generation(parent);
3542 			path->slots[*level]++;
3543 
3544 			ret = btrfs_free_extent(trans, root, bytenr,
3545 						blocksize, parent->start,
3546 						root_owner, root_gen,
3547 						*level - 1, 1);
3548 			BUG_ON(ret);
3549 
3550 			atomic_inc(&root->fs_info->throttle_gen);
3551 			wake_up(&root->fs_info->transaction_throttle);
3552 			cond_resched();
3553 
3554 			continue;
3555 		}
3556 
3557 		/*
3558 		 * we need to keep freeing things in the next level down.
3559 		 * read the block and loop around to process it
3560 		 */
3561 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3562 		WARN_ON(*level <= 0);
3563 		if (path->nodes[*level-1])
3564 			free_extent_buffer(path->nodes[*level-1]);
3565 		path->nodes[*level-1] = next;
3566 		*level = btrfs_header_level(next);
3567 		path->slots[*level] = 0;
3568 		cond_resched();
3569 	}
3570 out:
3571 	WARN_ON(*level < 0);
3572 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
3573 
3574 	if (path->nodes[*level] == root->node) {
3575 		parent = path->nodes[*level];
3576 		bytenr = path->nodes[*level]->start;
3577 	} else {
3578 		parent = path->nodes[*level + 1];
3579 		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3580 	}
3581 
3582 	blocksize = btrfs_level_size(root, *level);
3583 	root_owner = btrfs_header_owner(parent);
3584 	root_gen = btrfs_header_generation(parent);
3585 
3586 	/*
3587 	 * cleanup and free the reference on the last node
3588 	 * we processed
3589 	 */
3590 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3591 				  parent->start, root_owner, root_gen,
3592 				  *level, 1);
3593 	free_extent_buffer(path->nodes[*level]);
3594 	path->nodes[*level] = NULL;
3595 
3596 	*level += 1;
3597 	BUG_ON(ret);
3598 
3599 	cond_resched();
3600 	return 0;
3601 }
3602 
3603 /*
3604  * helper function for drop_subtree, this function is similar to
3605  * walk_down_tree. The main difference is that it checks reference
3606  * counts while tree blocks are locked.
3607  */
3608 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3609 				      struct btrfs_root *root,
3610 				      struct btrfs_path *path, int *level)
3611 {
3612 	struct extent_buffer *next;
3613 	struct extent_buffer *cur;
3614 	struct extent_buffer *parent;
3615 	u64 bytenr;
3616 	u64 ptr_gen;
3617 	u32 blocksize;
3618 	u32 refs;
3619 	int ret;
3620 
3621 	cur = path->nodes[*level];
3622 	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3623 				      &refs);
3624 	BUG_ON(ret);
3625 	if (refs > 1)
3626 		goto out;
3627 
3628 	while (*level >= 0) {
3629 		cur = path->nodes[*level];
3630 		if (*level == 0) {
3631 			ret = btrfs_drop_leaf_ref(trans, root, cur);
3632 			BUG_ON(ret);
3633 			clean_tree_block(trans, root, cur);
3634 			break;
3635 		}
3636 		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3637 			clean_tree_block(trans, root, cur);
3638 			break;
3639 		}
3640 
3641 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3642 		blocksize = btrfs_level_size(root, *level - 1);
3643 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3644 
3645 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3646 		btrfs_tree_lock(next);
3647 		btrfs_set_lock_blocking(next);
3648 
3649 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3650 					      &refs);
3651 		BUG_ON(ret);
3652 		if (refs > 1) {
3653 			parent = path->nodes[*level];
3654 			ret = btrfs_free_extent(trans, root, bytenr,
3655 					blocksize, parent->start,
3656 					btrfs_header_owner(parent),
3657 					btrfs_header_generation(parent),
3658 					*level - 1, 1);
3659 			BUG_ON(ret);
3660 			path->slots[*level]++;
3661 			btrfs_tree_unlock(next);
3662 			free_extent_buffer(next);
3663 			continue;
3664 		}
3665 
3666 		*level = btrfs_header_level(next);
3667 		path->nodes[*level] = next;
3668 		path->slots[*level] = 0;
3669 		path->locks[*level] = 1;
3670 		cond_resched();
3671 	}
3672 out:
3673 	parent = path->nodes[*level + 1];
3674 	bytenr = path->nodes[*level]->start;
3675 	blocksize = path->nodes[*level]->len;
3676 
3677 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3678 			parent->start, btrfs_header_owner(parent),
3679 			btrfs_header_generation(parent), *level, 1);
3680 	BUG_ON(ret);
3681 
3682 	if (path->locks[*level]) {
3683 		btrfs_tree_unlock(path->nodes[*level]);
3684 		path->locks[*level] = 0;
3685 	}
3686 	free_extent_buffer(path->nodes[*level]);
3687 	path->nodes[*level] = NULL;
3688 	*level += 1;
3689 	cond_resched();
3690 	return 0;
3691 }
3692 
3693 /*
3694  * helper for dropping snapshots.  This walks back up the tree in the path
3695  * to find the first node higher up where we haven't yet gone through
3696  * all the slots
3697  */
3698 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3699 				 struct btrfs_root *root,
3700 				 struct btrfs_path *path,
3701 				 int *level, int max_level)
3702 {
3703 	u64 root_owner;
3704 	u64 root_gen;
3705 	struct btrfs_root_item *root_item = &root->root_item;
3706 	int i;
3707 	int slot;
3708 	int ret;
3709 
3710 	for (i = *level; i < max_level && path->nodes[i]; i++) {
3711 		slot = path->slots[i];
3712 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3713 			struct extent_buffer *node;
3714 			struct btrfs_disk_key disk_key;
3715 
3716 			/*
3717 			 * there is more work to do in this level.
3718 			 * Update the drop_progress marker to reflect
3719 			 * the work we've done so far, and then bump
3720 			 * the slot number
3721 			 */
3722 			node = path->nodes[i];
3723 			path->slots[i]++;
3724 			*level = i;
3725 			WARN_ON(*level == 0);
3726 			btrfs_node_key(node, &disk_key, path->slots[i]);
3727 			memcpy(&root_item->drop_progress,
3728 			       &disk_key, sizeof(disk_key));
3729 			root_item->drop_level = i;
3730 			return 0;
3731 		} else {
3732 			struct extent_buffer *parent;
3733 
3734 			/*
3735 			 * this whole node is done, free our reference
3736 			 * on it and go up one level
3737 			 */
3738 			if (path->nodes[*level] == root->node)
3739 				parent = path->nodes[*level];
3740 			else
3741 				parent = path->nodes[*level + 1];
3742 
3743 			root_owner = btrfs_header_owner(parent);
3744 			root_gen = btrfs_header_generation(parent);
3745 
3746 			clean_tree_block(trans, root, path->nodes[*level]);
3747 			ret = btrfs_free_extent(trans, root,
3748 						path->nodes[*level]->start,
3749 						path->nodes[*level]->len,
3750 						parent->start, root_owner,
3751 						root_gen, *level, 1);
3752 			BUG_ON(ret);
3753 			if (path->locks[*level]) {
3754 				btrfs_tree_unlock(path->nodes[*level]);
3755 				path->locks[*level] = 0;
3756 			}
3757 			free_extent_buffer(path->nodes[*level]);
3758 			path->nodes[*level] = NULL;
3759 			*level = i + 1;
3760 		}
3761 	}
3762 	return 1;
3763 }
3764 
3765 /*
3766  * drop the reference count on the tree rooted at 'snap'.  This traverses
3767  * the tree freeing any blocks that have a ref count of zero after being
3768  * decremented.
3769  */
3770 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3771 			*root)
3772 {
3773 	int ret = 0;
3774 	int wret;
3775 	int level;
3776 	struct btrfs_path *path;
3777 	int i;
3778 	int orig_level;
3779 	int update_count;
3780 	struct btrfs_root_item *root_item = &root->root_item;
3781 
3782 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3783 	path = btrfs_alloc_path();
3784 	BUG_ON(!path);
3785 
3786 	level = btrfs_header_level(root->node);
3787 	orig_level = level;
3788 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3789 		path->nodes[level] = root->node;
3790 		extent_buffer_get(root->node);
3791 		path->slots[level] = 0;
3792 	} else {
3793 		struct btrfs_key key;
3794 		struct btrfs_disk_key found_key;
3795 		struct extent_buffer *node;
3796 
3797 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3798 		level = root_item->drop_level;
3799 		path->lowest_level = level;
3800 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3801 		if (wret < 0) {
3802 			ret = wret;
3803 			goto out;
3804 		}
3805 		node = path->nodes[level];
3806 		btrfs_node_key(node, &found_key, path->slots[level]);
3807 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3808 			       sizeof(found_key)));
3809 		/*
3810 		 * unlock our path, this is safe because only this
3811 		 * function is allowed to delete this snapshot
3812 		 */
3813 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3814 			if (path->nodes[i] && path->locks[i]) {
3815 				path->locks[i] = 0;
3816 				btrfs_tree_unlock(path->nodes[i]);
3817 			}
3818 		}
3819 	}
3820 	while (1) {
3821 		unsigned long update;
3822 		wret = walk_down_tree(trans, root, path, &level);
3823 		if (wret > 0)
3824 			break;
3825 		if (wret < 0)
3826 			ret = wret;
3827 
3828 		wret = walk_up_tree(trans, root, path, &level,
3829 				    BTRFS_MAX_LEVEL);
3830 		if (wret > 0)
3831 			break;
3832 		if (wret < 0)
3833 			ret = wret;
3834 		if (trans->transaction->in_commit ||
3835 		    trans->transaction->delayed_refs.flushing) {
3836 			ret = -EAGAIN;
3837 			break;
3838 		}
3839 		atomic_inc(&root->fs_info->throttle_gen);
3840 		wake_up(&root->fs_info->transaction_throttle);
3841 		for (update_count = 0; update_count < 16; update_count++) {
3842 			update = trans->delayed_ref_updates;
3843 			trans->delayed_ref_updates = 0;
3844 			if (update)
3845 				btrfs_run_delayed_refs(trans, root, update);
3846 			else
3847 				break;
3848 		}
3849 	}
3850 	for (i = 0; i <= orig_level; i++) {
3851 		if (path->nodes[i]) {
3852 			free_extent_buffer(path->nodes[i]);
3853 			path->nodes[i] = NULL;
3854 		}
3855 	}
3856 out:
3857 	btrfs_free_path(path);
3858 	return ret;
3859 }
3860 
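/*
 * Drop all references held by the subtree rooted at 'node'; 'parent' is the
 * block that points to 'node' and both must be locked by the caller.  In
 * outline: walk_down_subtree() frees shared children without descending
 * into them, and walk_up_tree() frees each block once all of its slots have
 * been processed, until the walk climbs back up to parent's level.
 */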
3861 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3862 			struct btrfs_root *root,
3863 			struct extent_buffer *node,
3864 			struct extent_buffer *parent)
3865 {
3866 	struct btrfs_path *path;
3867 	int level;
3868 	int parent_level;
3869 	int ret = 0;
3870 	int wret;
3871 
3872 	path = btrfs_alloc_path();
3873 	BUG_ON(!path);
3874 
3875 	btrfs_assert_tree_locked(parent);
3876 	parent_level = btrfs_header_level(parent);
3877 	extent_buffer_get(parent);
3878 	path->nodes[parent_level] = parent;
3879 	path->slots[parent_level] = btrfs_header_nritems(parent);
3880 
3881 	btrfs_assert_tree_locked(node);
3882 	level = btrfs_header_level(node);
3883 	extent_buffer_get(node);
3884 	path->nodes[level] = node;
3885 	path->slots[level] = 0;
3886 
3887 	while (1) {
3888 		wret = walk_down_subtree(trans, root, path, &level);
3889 		if (wret < 0)
3890 			ret = wret;
3891 		if (wret != 0)
3892 			break;
3893 
3894 		wret = walk_up_tree(trans, root, path, &level, parent_level);
3895 		if (wret < 0)
3896 			ret = wret;
3897 		if (wret != 0)
3898 			break;
3899 	}
3900 
3901 	btrfs_free_path(path);
3902 	return ret;
3903 }
3904 
3905 static unsigned long calc_ra(unsigned long start, unsigned long last,
3906 			     unsigned long nr)
3907 {
3908 	return min(last, start + nr - 1);
3909 }
3910 
3911 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3912 					 u64 len)
3913 {
3914 	u64 page_start;
3915 	u64 page_end;
3916 	unsigned long first_index;
3917 	unsigned long last_index;
3918 	unsigned long i;
3919 	struct page *page;
3920 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3921 	struct file_ra_state *ra;
3922 	struct btrfs_ordered_extent *ordered;
3923 	unsigned int total_read = 0;
3924 	unsigned int total_dirty = 0;
3925 	int ret = 0;
3926 
3927 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3928 
3929 	mutex_lock(&inode->i_mutex);
3930 	first_index = start >> PAGE_CACHE_SHIFT;
3931 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3932 
3933 	/* make sure the dirty trick played by the caller work */
3934 	/* make sure the dirty trick played by the caller works */
3935 					    first_index, last_index);
3936 	if (ret)
3937 		goto out_unlock;
3938 
3939 	file_ra_state_init(ra, inode->i_mapping);
3940 
3941 	for (i = first_index; i <= last_index; i++) {
3942 		if (total_read % ra->ra_pages == 0) {
3943 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3944 				       calc_ra(i, last_index, ra->ra_pages));
3945 		}
3946 		total_read++;
3947 again:
3948 		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3949 			BUG_ON(1);
3950 		page = grab_cache_page(inode->i_mapping, i);
3951 		if (!page) {
3952 			ret = -ENOMEM;
3953 			goto out_unlock;
3954 		}
3955 		if (!PageUptodate(page)) {
3956 			btrfs_readpage(NULL, page);
3957 			lock_page(page);
3958 			if (!PageUptodate(page)) {
3959 				unlock_page(page);
3960 				page_cache_release(page);
3961 				ret = -EIO;
3962 				goto out_unlock;
3963 			}
3964 		}
3965 		wait_on_page_writeback(page);
3966 
3967 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3968 		page_end = page_start + PAGE_CACHE_SIZE - 1;
3969 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3970 
3971 		ordered = btrfs_lookup_ordered_extent(inode, page_start);
3972 		if (ordered) {
3973 			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3974 			unlock_page(page);
3975 			page_cache_release(page);
3976 			btrfs_start_ordered_extent(inode, ordered, 1);
3977 			btrfs_put_ordered_extent(ordered);
3978 			goto again;
3979 		}
3980 		set_page_extent_mapped(page);
3981 
3982 		if (i == first_index)
3983 			set_extent_bits(io_tree, page_start, page_end,
3984 					EXTENT_BOUNDARY, GFP_NOFS);
3985 		btrfs_set_extent_delalloc(inode, page_start, page_end);
3986 
3987 		set_page_dirty(page);
3988 		total_dirty++;
3989 
3990 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3991 		unlock_page(page);
3992 		page_cache_release(page);
3993 	}
3994 
3995 out_unlock:
3996 	kfree(ra);
3997 	mutex_unlock(&inode->i_mutex);
3998 	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
3999 	return ret;
4000 }
4001 
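/*
 * Rough picture of the data relocation trick used below: a pinned extent
 * map is inserted so btrfs_readpage() reads the pages of the reloc inode
 * straight from the old extent (extent_key->objectid), then
 * relocate_inode_pages() marks those pages delalloc and dirty.  When
 * writeback runs, new space is allocated for the dirty range, which copies
 * the data to its new location.
 */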
4002 static noinline int relocate_data_extent(struct inode *reloc_inode,
4003 					 struct btrfs_key *extent_key,
4004 					 u64 offset)
4005 {
4006 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4007 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4008 	struct extent_map *em;
4009 	u64 start = extent_key->objectid - offset;
4010 	u64 end = start + extent_key->offset - 1;
4011 
4012 	em = alloc_extent_map(GFP_NOFS);
4013 	BUG_ON(!em || IS_ERR(em));
4014 
4015 	em->start = start;
4016 	em->len = extent_key->offset;
4017 	em->block_len = extent_key->offset;
4018 	em->block_start = extent_key->objectid;
4019 	em->bdev = root->fs_info->fs_devices->latest_bdev;
4020 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
4021 
4022 	/* set up the extent map to cheat btrfs_readpage */
4023 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4024 	while (1) {
4025 		int ret;
4026 		spin_lock(&em_tree->lock);
4027 		ret = add_extent_mapping(em_tree, em);
4028 		spin_unlock(&em_tree->lock);
4029 		if (ret != -EEXIST) {
4030 			free_extent_map(em);
4031 			break;
4032 		}
4033 		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4034 	}
4035 	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4036 
4037 	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4038 }
4039 
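/*
 * btrfs_ref_path describes one chain of back references from an extent up
 * to a tree root: nodes[] holds the bytenr of the tree block at each level
 * along the path, root_objectid/root_generation identify the root the walk
 * ended at, and owner_objectid/num_refs come from the lowest ref found.
 * node_keys[] and new_nodes[] are scratch space for relocate_one_path().
 *
 * A caller drives the walk roughly like this (sketch, error handling
 * trimmed; see relocate_one_extent() below for the real loop):
 *
 *	ret = btrfs_first_ref_path(trans, extent_root, ref_path, bytenr);
 *	while (ret == 0) {
 *		process(ref_path);
 *		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
 *	}
 */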
4040 struct btrfs_ref_path {
4041 	u64 extent_start;
4042 	u64 nodes[BTRFS_MAX_LEVEL];
4043 	u64 root_objectid;
4044 	u64 root_generation;
4045 	u64 owner_objectid;
4046 	u32 num_refs;
4047 	int lowest_level;
4048 	int current_level;
4049 	int shared_level;
4050 
4051 	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4052 	u64 new_nodes[BTRFS_MAX_LEVEL];
4053 };
4054 
4055 struct disk_extent {
4056 	u64 ram_bytes;
4057 	u64 disk_bytenr;
4058 	u64 disk_num_bytes;
4059 	u64 offset;
4060 	u64 num_bytes;
4061 	u8 compression;
4062 	u8 encryption;
4063 	u16 other_encoding;
4064 };
4065 
4066 static int is_cowonly_root(u64 root_objectid)
4067 {
4068 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4069 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4070 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4071 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4072 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4073 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4074 		return 1;
4075 	return 0;
4076 }
4077 
4078 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4079 				    struct btrfs_root *extent_root,
4080 				    struct btrfs_ref_path *ref_path,
4081 				    int first_time)
4082 {
4083 	struct extent_buffer *leaf;
4084 	struct btrfs_path *path;
4085 	struct btrfs_extent_ref *ref;
4086 	struct btrfs_key key;
4087 	struct btrfs_key found_key;
4088 	u64 bytenr;
4089 	u32 nritems;
4090 	int level;
4091 	int ret = 1;
4092 
4093 	path = btrfs_alloc_path();
4094 	if (!path)
4095 		return -ENOMEM;
4096 
4097 	if (first_time) {
4098 		ref_path->lowest_level = -1;
4099 		ref_path->current_level = -1;
4100 		ref_path->shared_level = -1;
4101 		goto walk_up;
4102 	}
4103 walk_down:
4104 	level = ref_path->current_level - 1;
4105 	while (level >= -1) {
4106 		u64 parent;
4107 		if (level < ref_path->lowest_level)
4108 			break;
4109 
4110 		if (level >= 0)
4111 			bytenr = ref_path->nodes[level];
4112 		else
4113 			bytenr = ref_path->extent_start;
4114 		BUG_ON(bytenr == 0);
4115 
4116 		parent = ref_path->nodes[level + 1];
4117 		ref_path->nodes[level + 1] = 0;
4118 		ref_path->current_level = level;
4119 		BUG_ON(parent == 0);
4120 
4121 		key.objectid = bytenr;
4122 		key.offset = parent + 1;
4123 		key.type = BTRFS_EXTENT_REF_KEY;
4124 
4125 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4126 		if (ret < 0)
4127 			goto out;
4128 		BUG_ON(ret == 0);
4129 
4130 		leaf = path->nodes[0];
4131 		nritems = btrfs_header_nritems(leaf);
4132 		if (path->slots[0] >= nritems) {
4133 			ret = btrfs_next_leaf(extent_root, path);
4134 			if (ret < 0)
4135 				goto out;
4136 			if (ret > 0)
4137 				goto next;
4138 			leaf = path->nodes[0];
4139 		}
4140 
4141 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4142 		if (found_key.objectid == bytenr &&
4143 		    found_key.type == BTRFS_EXTENT_REF_KEY) {
4144 			if (level < ref_path->shared_level)
4145 				ref_path->shared_level = level;
4146 			goto found;
4147 		}
4148 next:
4149 		level--;
4150 		btrfs_release_path(extent_root, path);
4151 		cond_resched();
4152 	}
4153 	/* reached lowest level */
4154 	ret = 1;
4155 	goto out;
4156 walk_up:
4157 	level = ref_path->current_level;
4158 	while (level < BTRFS_MAX_LEVEL - 1) {
4159 		u64 ref_objectid;
4160 
4161 		if (level >= 0)
4162 			bytenr = ref_path->nodes[level];
4163 		else
4164 			bytenr = ref_path->extent_start;
4165 
4166 		BUG_ON(bytenr == 0);
4167 
4168 		key.objectid = bytenr;
4169 		key.offset = 0;
4170 		key.type = BTRFS_EXTENT_REF_KEY;
4171 
4172 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4173 		if (ret < 0)
4174 			goto out;
4175 
4176 		leaf = path->nodes[0];
4177 		nritems = btrfs_header_nritems(leaf);
4178 		if (path->slots[0] >= nritems) {
4179 			ret = btrfs_next_leaf(extent_root, path);
4180 			if (ret < 0)
4181 				goto out;
4182 			if (ret > 0) {
4183 				/* the extent was freed by someone */
4184 				if (ref_path->lowest_level == level)
4185 					goto out;
4186 				btrfs_release_path(extent_root, path);
4187 				goto walk_down;
4188 			}
4189 			leaf = path->nodes[0];
4190 		}
4191 
4192 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4193 		if (found_key.objectid != bytenr ||
4194 				found_key.type != BTRFS_EXTENT_REF_KEY) {
4195 			/* the extent was freed by someone */
4196 			if (ref_path->lowest_level == level) {
4197 				ret = 1;
4198 				goto out;
4199 			}
4200 			btrfs_release_path(extent_root, path);
4201 			goto walk_down;
4202 		}
4203 found:
4204 		ref = btrfs_item_ptr(leaf, path->slots[0],
4205 				struct btrfs_extent_ref);
4206 		ref_objectid = btrfs_ref_objectid(leaf, ref);
4207 		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4208 			if (first_time) {
4209 				level = (int)ref_objectid;
4210 				BUG_ON(level >= BTRFS_MAX_LEVEL);
4211 				ref_path->lowest_level = level;
4212 				ref_path->current_level = level;
4213 				ref_path->nodes[level] = bytenr;
4214 			} else {
4215 				WARN_ON(ref_objectid != level);
4216 			}
4217 		} else {
4218 			WARN_ON(level != -1);
4219 		}
4220 		first_time = 0;
4221 
4222 		if (ref_path->lowest_level == level) {
4223 			ref_path->owner_objectid = ref_objectid;
4224 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4225 		}
4226 
4227 		/*
4228 		 * the block is a tree root or the block isn't in a
4229 		 * reference counted tree.
4230 		 */
4231 		if (found_key.objectid == found_key.offset ||
4232 		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4233 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4234 			ref_path->root_generation =
4235 				btrfs_ref_generation(leaf, ref);
4236 			if (level < 0) {
4237 				/* special reference from the tree log */
4238 				ref_path->nodes[0] = found_key.offset;
4239 				ref_path->current_level = 0;
4240 			}
4241 			ret = 0;
4242 			goto out;
4243 		}
4244 
4245 		level++;
4246 		BUG_ON(ref_path->nodes[level] != 0);
4247 		ref_path->nodes[level] = found_key.offset;
4248 		ref_path->current_level = level;
4249 
4250 		/*
4251 		 * the reference was created in the running transaction,
4252 		 * no need to continue walking up.
4253 		 */
4254 		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4255 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4256 			ref_path->root_generation =
4257 				btrfs_ref_generation(leaf, ref);
4258 			ret = 0;
4259 			goto out;
4260 		}
4261 
4262 		btrfs_release_path(extent_root, path);
4263 		cond_resched();
4264 	}
4265 	/* reached max tree level, but no tree root found. */
4266 	BUG();
4267 out:
4268 	btrfs_free_path(path);
4269 	return ret;
4270 }
4271 
4272 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4273 				struct btrfs_root *extent_root,
4274 				struct btrfs_ref_path *ref_path,
4275 				u64 extent_start)
4276 {
4277 	memset(ref_path, 0, sizeof(*ref_path));
4278 	ref_path->extent_start = extent_start;
4279 
4280 	return __next_ref_path(trans, extent_root, ref_path, 1);
4281 }
4282 
4283 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4284 			       struct btrfs_root *extent_root,
4285 			       struct btrfs_ref_path *ref_path)
4286 {
4287 	return __next_ref_path(trans, extent_root, ref_path, 0);
4288 }
4289 
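/*
 * Roughly: collect the file extents of the reloc inode that cover the data
 * extent described by 'extent_key'.  On success *extents/*nr_extents point
 * at an array of disk_extent (allocated here if the caller passed none)
 * that the caller must eventually free.  With no_fragment set, needing more
 * than one new location makes this return 1 so the caller can fall back to
 * a different strategy.
 */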
4290 static noinline int get_new_locations(struct inode *reloc_inode,
4291 				      struct btrfs_key *extent_key,
4292 				      u64 offset, int no_fragment,
4293 				      struct disk_extent **extents,
4294 				      int *nr_extents)
4295 {
4296 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4297 	struct btrfs_path *path;
4298 	struct btrfs_file_extent_item *fi;
4299 	struct extent_buffer *leaf;
4300 	struct disk_extent *exts = *extents;
4301 	struct btrfs_key found_key;
4302 	u64 cur_pos;
4303 	u64 last_byte;
4304 	u32 nritems;
4305 	int nr = 0;
4306 	int max = *nr_extents;
4307 	int ret;
4308 
4309 	WARN_ON(!no_fragment && *extents);
4310 	if (!exts) {
4311 		max = 1;
4312 		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4313 		if (!exts)
4314 			return -ENOMEM;
4315 	}
4316 
4317 	path = btrfs_alloc_path();
4318 	BUG_ON(!path);
4319 
4320 	cur_pos = extent_key->objectid - offset;
4321 	last_byte = extent_key->objectid + extent_key->offset;
4322 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4323 				       cur_pos, 0);
4324 	if (ret < 0)
4325 		goto out;
4326 	if (ret > 0) {
4327 		ret = -ENOENT;
4328 		goto out;
4329 	}
4330 
4331 	while (1) {
4332 		leaf = path->nodes[0];
4333 		nritems = btrfs_header_nritems(leaf);
4334 		if (path->slots[0] >= nritems) {
4335 			ret = btrfs_next_leaf(root, path);
4336 			if (ret < 0)
4337 				goto out;
4338 			if (ret > 0)
4339 				break;
4340 			leaf = path->nodes[0];
4341 		}
4342 
4343 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4344 		if (found_key.offset != cur_pos ||
4345 		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
4346 		    found_key.objectid != reloc_inode->i_ino)
4347 			break;
4348 
4349 		fi = btrfs_item_ptr(leaf, path->slots[0],
4350 				    struct btrfs_file_extent_item);
4351 		if (btrfs_file_extent_type(leaf, fi) !=
4352 		    BTRFS_FILE_EXTENT_REG ||
4353 		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4354 			break;
4355 
4356 		if (nr == max) {
4357 			struct disk_extent *old = exts;
4358 			max *= 2;
4359 			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4360 			memcpy(exts, old, sizeof(*exts) * nr);
4361 			if (old != *extents)
4362 				kfree(old);
4363 		}
4364 
4365 		exts[nr].disk_bytenr =
4366 			btrfs_file_extent_disk_bytenr(leaf, fi);
4367 		exts[nr].disk_num_bytes =
4368 			btrfs_file_extent_disk_num_bytes(leaf, fi);
4369 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4370 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4371 		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4372 		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4373 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4374 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4375 									   fi);
4376 		BUG_ON(exts[nr].offset > 0);
4377 		BUG_ON(exts[nr].compression || exts[nr].encryption);
4378 		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4379 
4380 		cur_pos += exts[nr].num_bytes;
4381 		nr++;
4382 
4383 		if (cur_pos + offset >= last_byte)
4384 			break;
4385 
4386 		if (no_fragment) {
4387 			ret = 1;
4388 			goto out;
4389 		}
4390 		path->slots[0]++;
4391 	}
4392 
4393 	BUG_ON(cur_pos + offset > last_byte);
4394 	if (cur_pos + offset < last_byte) {
4395 		ret = -ENOENT;
4396 		goto out;
4397 	}
4398 	ret = 0;
4399 out:
4400 	btrfs_free_path(path);
4401 	if (ret) {
4402 		if (exts != *extents)
4403 			kfree(exts);
4404 	} else {
4405 		*extents = exts;
4406 		*nr_extents = nr;
4407 	}
4408 	return ret;
4409 }
4410 
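/*
 * Rewrite file extent items that point at the extent being relocated
 * ('extent_key') so they point at its new copy instead.  The walk starts at
 * 'leaf_key', takes i_mutex and an extent range lock on each file it
 * touches, updates the item in place when there is a single new location,
 * and moves the extent reference counts accordingly.  This is only a
 * summary; the locking dance below is the authoritative part.
 */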
4411 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4412 					struct btrfs_root *root,
4413 					struct btrfs_path *path,
4414 					struct btrfs_key *extent_key,
4415 					struct btrfs_key *leaf_key,
4416 					struct btrfs_ref_path *ref_path,
4417 					struct disk_extent *new_extents,
4418 					int nr_extents)
4419 {
4420 	struct extent_buffer *leaf;
4421 	struct btrfs_file_extent_item *fi;
4422 	struct inode *inode = NULL;
4423 	struct btrfs_key key;
4424 	u64 lock_start = 0;
4425 	u64 lock_end = 0;
4426 	u64 num_bytes;
4427 	u64 ext_offset;
4428 	u64 search_end = (u64)-1;
4429 	u32 nritems;
4430 	int nr_scaned = 0;
4431 	int extent_locked = 0;
4432 	int extent_type;
4433 	int ret;
4434 
4435 	memcpy(&key, leaf_key, sizeof(key));
4436 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4437 		if (key.objectid < ref_path->owner_objectid ||
4438 		    (key.objectid == ref_path->owner_objectid &&
4439 		     key.type < BTRFS_EXTENT_DATA_KEY)) {
4440 			key.objectid = ref_path->owner_objectid;
4441 			key.type = BTRFS_EXTENT_DATA_KEY;
4442 			key.offset = 0;
4443 		}
4444 	}
4445 
4446 	while (1) {
4447 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4448 		if (ret < 0)
4449 			goto out;
4450 
4451 		leaf = path->nodes[0];
4452 		nritems = btrfs_header_nritems(leaf);
4453 next:
4454 		if (extent_locked && ret > 0) {
4455 			/*
4456 			 * the file extent item was modified by someone
4457 			 * before the extent got locked.
4458 			 */
4459 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4460 				      lock_end, GFP_NOFS);
4461 			extent_locked = 0;
4462 		}
4463 
4464 		if (path->slots[0] >= nritems) {
4465 			if (++nr_scaned > 2)
4466 				break;
4467 
4468 			BUG_ON(extent_locked);
4469 			ret = btrfs_next_leaf(root, path);
4470 			if (ret < 0)
4471 				goto out;
4472 			if (ret > 0)
4473 				break;
4474 			leaf = path->nodes[0];
4475 			nritems = btrfs_header_nritems(leaf);
4476 		}
4477 
4478 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4479 
4480 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4481 			if ((key.objectid > ref_path->owner_objectid) ||
4482 			    (key.objectid == ref_path->owner_objectid &&
4483 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
4484 			    key.offset >= search_end)
4485 				break;
4486 		}
4487 
4488 		if (inode && key.objectid != inode->i_ino) {
4489 			BUG_ON(extent_locked);
4490 			btrfs_release_path(root, path);
4491 			mutex_unlock(&inode->i_mutex);
4492 			iput(inode);
4493 			inode = NULL;
4494 			continue;
4495 		}
4496 
4497 		if (key.type != BTRFS_EXTENT_DATA_KEY) {
4498 			path->slots[0]++;
4499 			ret = 1;
4500 			goto next;
4501 		}
4502 		fi = btrfs_item_ptr(leaf, path->slots[0],
4503 				    struct btrfs_file_extent_item);
4504 		extent_type = btrfs_file_extent_type(leaf, fi);
4505 		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4506 		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4507 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4508 		     extent_key->objectid)) {
4509 			path->slots[0]++;
4510 			ret = 1;
4511 			goto next;
4512 		}
4513 
4514 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4515 		ext_offset = btrfs_file_extent_offset(leaf, fi);
4516 
4517 		if (search_end == (u64)-1) {
4518 			search_end = key.offset - ext_offset +
4519 				btrfs_file_extent_ram_bytes(leaf, fi);
4520 		}
4521 
4522 		if (!extent_locked) {
4523 			lock_start = key.offset;
4524 			lock_end = lock_start + num_bytes - 1;
4525 		} else {
4526 			if (lock_start > key.offset ||
4527 			    lock_end + 1 < key.offset + num_bytes) {
4528 				unlock_extent(&BTRFS_I(inode)->io_tree,
4529 					      lock_start, lock_end, GFP_NOFS);
4530 				extent_locked = 0;
4531 			}
4532 		}
4533 
4534 		if (!inode) {
4535 			btrfs_release_path(root, path);
4536 
4537 			inode = btrfs_iget_locked(root->fs_info->sb,
4538 						  key.objectid, root);
4539 			if (inode->i_state & I_NEW) {
4540 				BTRFS_I(inode)->root = root;
4541 				BTRFS_I(inode)->location.objectid =
4542 					key.objectid;
4543 				BTRFS_I(inode)->location.type =
4544 					BTRFS_INODE_ITEM_KEY;
4545 				BTRFS_I(inode)->location.offset = 0;
4546 				btrfs_read_locked_inode(inode);
4547 				unlock_new_inode(inode);
4548 			}
4549 			/*
4550 			 * some code calls btrfs_commit_transaction while
4551 			 * holding the i_mutex, so we can't use mutex_lock
4552 			 * here.
4553 			 */
4554 			if (is_bad_inode(inode) ||
4555 			    !mutex_trylock(&inode->i_mutex)) {
4556 				iput(inode);
4557 				inode = NULL;
4558 				key.offset = (u64)-1;
4559 				goto skip;
4560 			}
4561 		}
4562 
4563 		if (!extent_locked) {
4564 			struct btrfs_ordered_extent *ordered;
4565 
4566 			btrfs_release_path(root, path);
4567 
4568 			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4569 				    lock_end, GFP_NOFS);
4570 			ordered = btrfs_lookup_first_ordered_extent(inode,
4571 								    lock_end);
4572 			if (ordered &&
4573 			    ordered->file_offset <= lock_end &&
4574 			    ordered->file_offset + ordered->len > lock_start) {
4575 				unlock_extent(&BTRFS_I(inode)->io_tree,
4576 					      lock_start, lock_end, GFP_NOFS);
4577 				btrfs_start_ordered_extent(inode, ordered, 1);
4578 				btrfs_put_ordered_extent(ordered);
4579 				key.offset += num_bytes;
4580 				goto skip;
4581 			}
4582 			if (ordered)
4583 				btrfs_put_ordered_extent(ordered);
4584 
4585 			extent_locked = 1;
4586 			continue;
4587 		}
4588 
4589 		if (nr_extents == 1) {
4590 			/* update extent pointer in place */
4591 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
4592 						new_extents[0].disk_bytenr);
4593 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4594 						new_extents[0].disk_num_bytes);
4595 			btrfs_mark_buffer_dirty(leaf);
4596 
4597 			btrfs_drop_extent_cache(inode, key.offset,
4598 						key.offset + num_bytes - 1, 0);
4599 
4600 			ret = btrfs_inc_extent_ref(trans, root,
4601 						new_extents[0].disk_bytenr,
4602 						new_extents[0].disk_num_bytes,
4603 						leaf->start,
4604 						root->root_key.objectid,
4605 						trans->transid,
4606 						key.objectid);
4607 			BUG_ON(ret);
4608 
4609 			ret = btrfs_free_extent(trans, root,
4610 						extent_key->objectid,
4611 						extent_key->offset,
4612 						leaf->start,
4613 						btrfs_header_owner(leaf),
4614 						btrfs_header_generation(leaf),
4615 						key.objectid, 0);
4616 			BUG_ON(ret);
4617 
4618 			btrfs_release_path(root, path);
4619 			key.offset += num_bytes;
4620 		} else {
4621 			BUG_ON(1);
4622 #if 0
4623 			u64 alloc_hint;
4624 			u64 extent_len;
4625 			int i;
4626 			/*
4627 			 * drop the old extent pointer first, then insert the
4628 			 * new pointers one by one
4629 			 */
4630 			btrfs_release_path(root, path);
4631 			ret = btrfs_drop_extents(trans, root, inode, key.offset,
4632 						 key.offset + num_bytes,
4633 						 key.offset, &alloc_hint);
4634 			BUG_ON(ret);
4635 
4636 			for (i = 0; i < nr_extents; i++) {
4637 				if (ext_offset >= new_extents[i].num_bytes) {
4638 					ext_offset -= new_extents[i].num_bytes;
4639 					continue;
4640 				}
4641 				extent_len = min(new_extents[i].num_bytes -
4642 						 ext_offset, num_bytes);
4643 
4644 				ret = btrfs_insert_empty_item(trans, root,
4645 							      path, &key,
4646 							      sizeof(*fi));
4647 				BUG_ON(ret);
4648 
4649 				leaf = path->nodes[0];
4650 				fi = btrfs_item_ptr(leaf, path->slots[0],
4651 						struct btrfs_file_extent_item);
4652 				btrfs_set_file_extent_generation(leaf, fi,
4653 							trans->transid);
4654 				btrfs_set_file_extent_type(leaf, fi,
4655 							BTRFS_FILE_EXTENT_REG);
4656 				btrfs_set_file_extent_disk_bytenr(leaf, fi,
4657 						new_extents[i].disk_bytenr);
4658 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4659 						new_extents[i].disk_num_bytes);
4660 				btrfs_set_file_extent_ram_bytes(leaf, fi,
4661 						new_extents[i].ram_bytes);
4662 
4663 				btrfs_set_file_extent_compression(leaf, fi,
4664 						new_extents[i].compression);
4665 				btrfs_set_file_extent_encryption(leaf, fi,
4666 						new_extents[i].encryption);
4667 				btrfs_set_file_extent_other_encoding(leaf, fi,
4668 						new_extents[i].other_encoding);
4669 
4670 				btrfs_set_file_extent_num_bytes(leaf, fi,
4671 							extent_len);
4672 				ext_offset += new_extents[i].offset;
4673 				btrfs_set_file_extent_offset(leaf, fi,
4674 							ext_offset);
4675 				btrfs_mark_buffer_dirty(leaf);
4676 
4677 				btrfs_drop_extent_cache(inode, key.offset,
4678 						key.offset + extent_len - 1, 0);
4679 
4680 				ret = btrfs_inc_extent_ref(trans, root,
4681 						new_extents[i].disk_bytenr,
4682 						new_extents[i].disk_num_bytes,
4683 						leaf->start,
4684 						root->root_key.objectid,
4685 						trans->transid, key.objectid);
4686 				BUG_ON(ret);
4687 				btrfs_release_path(root, path);
4688 
4689 				inode_add_bytes(inode, extent_len);
4690 
4691 				ext_offset = 0;
4692 				num_bytes -= extent_len;
4693 				key.offset += extent_len;
4694 
4695 				if (num_bytes == 0)
4696 					break;
4697 			}
4698 			BUG_ON(i >= nr_extents);
4699 #endif
4700 		}
4701 
4702 		if (extent_locked) {
4703 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4704 				      lock_end, GFP_NOFS);
4705 			extent_locked = 0;
4706 		}
4707 skip:
4708 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4709 		    key.offset >= search_end)
4710 			break;
4711 
4712 		cond_resched();
4713 	}
4714 	ret = 0;
4715 out:
4716 	btrfs_release_path(root, path);
4717 	if (inode) {
4718 		mutex_unlock(&inode->i_mutex);
4719 		if (extent_locked) {
4720 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4721 				      lock_end, GFP_NOFS);
4722 		}
4723 		iput(inode);
4724 	}
4725 	return ret;
4726 }
4727 
4728 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4729 			       struct btrfs_root *root,
4730 			       struct extent_buffer *buf, u64 orig_start)
4731 {
4732 	int level;
4733 	int ret;
4734 
4735 	BUG_ON(btrfs_header_generation(buf) != trans->transid);
4736 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4737 
4738 	level = btrfs_header_level(buf);
4739 	if (level == 0) {
4740 		struct btrfs_leaf_ref *ref;
4741 		struct btrfs_leaf_ref *orig_ref;
4742 
4743 		orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4744 		if (!orig_ref)
4745 			return -ENOENT;
4746 
4747 		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4748 		if (!ref) {
4749 			btrfs_free_leaf_ref(root, orig_ref);
4750 			return -ENOMEM;
4751 		}
4752 
4753 		ref->nritems = orig_ref->nritems;
4754 		memcpy(ref->extents, orig_ref->extents,
4755 			sizeof(ref->extents[0]) * ref->nritems);
4756 
4757 		btrfs_free_leaf_ref(root, orig_ref);
4758 
4759 		ref->root_gen = trans->transid;
4760 		ref->bytenr = buf->start;
4761 		ref->owner = btrfs_header_owner(buf);
4762 		ref->generation = btrfs_header_generation(buf);
4763 
4764 		ret = btrfs_add_leaf_ref(root, ref, 0);
4765 		WARN_ON(ret);
4766 		btrfs_free_leaf_ref(root, ref);
4767 	}
4768 	return 0;
4769 }
4770 
4771 static noinline int invalidate_extent_cache(struct btrfs_root *root,
4772 					struct extent_buffer *leaf,
4773 					struct btrfs_block_group_cache *group,
4774 					struct btrfs_root *target_root)
4775 {
4776 	struct btrfs_key key;
4777 	struct inode *inode = NULL;
4778 	struct btrfs_file_extent_item *fi;
4779 	u64 num_bytes;
4780 	u64 skip_objectid = 0;
4781 	u32 nritems;
4782 	u32 i;
4783 
4784 	nritems = btrfs_header_nritems(leaf);
4785 	for (i = 0; i < nritems; i++) {
4786 		btrfs_item_key_to_cpu(leaf, &key, i);
4787 		if (key.objectid == skip_objectid ||
4788 		    key.type != BTRFS_EXTENT_DATA_KEY)
4789 			continue;
4790 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4791 		if (btrfs_file_extent_type(leaf, fi) ==
4792 		    BTRFS_FILE_EXTENT_INLINE)
4793 			continue;
4794 		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4795 			continue;
4796 		if (!inode || inode->i_ino != key.objectid) {
4797 			iput(inode);
4798 			inode = btrfs_ilookup(target_root->fs_info->sb,
4799 					      key.objectid, target_root, 1);
4800 		}
4801 		if (!inode) {
4802 			skip_objectid = key.objectid;
4803 			continue;
4804 		}
4805 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4806 
4807 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4808 			    key.offset + num_bytes - 1, GFP_NOFS);
4809 		btrfs_drop_extent_cache(inode, key.offset,
4810 					key.offset + num_bytes - 1, 1);
4811 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4812 			      key.offset + num_bytes - 1, GFP_NOFS);
4813 		cond_resched();
4814 	}
4815 	iput(inode);
4816 	return 0;
4817 }
4818 
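/*
 * For a leaf that has been COWed into the reloc tree, repoint every data
 * extent that lives inside the block group being relocated at its new copy
 * (found via get_new_locations() on the reloc inode), and keep the cached
 * leaf ref in sync.  Reference counts move from the old extent to the new
 * one as each item is rewritten.  Sketch only; see the loop below.
 */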
4819 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4820 					struct btrfs_root *root,
4821 					struct extent_buffer *leaf,
4822 					struct btrfs_block_group_cache *group,
4823 					struct inode *reloc_inode)
4824 {
4825 	struct btrfs_key key;
4826 	struct btrfs_key extent_key;
4827 	struct btrfs_file_extent_item *fi;
4828 	struct btrfs_leaf_ref *ref;
4829 	struct disk_extent *new_extent;
4830 	u64 bytenr;
4831 	u64 num_bytes;
4832 	u32 nritems;
4833 	u32 i;
4834 	int ext_index;
4835 	int nr_extent;
4836 	int ret;
4837 
4838 	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4839 	BUG_ON(!new_extent);
4840 
4841 	ref = btrfs_lookup_leaf_ref(root, leaf->start);
4842 	BUG_ON(!ref);
4843 
4844 	ext_index = -1;
4845 	nritems = btrfs_header_nritems(leaf);
4846 	for (i = 0; i < nritems; i++) {
4847 		btrfs_item_key_to_cpu(leaf, &key, i);
4848 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4849 			continue;
4850 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4851 		if (btrfs_file_extent_type(leaf, fi) ==
4852 		    BTRFS_FILE_EXTENT_INLINE)
4853 			continue;
4854 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4855 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4856 		if (bytenr == 0)
4857 			continue;
4858 
4859 		ext_index++;
4860 		if (bytenr >= group->key.objectid + group->key.offset ||
4861 		    bytenr + num_bytes <= group->key.objectid)
4862 			continue;
4863 
4864 		extent_key.objectid = bytenr;
4865 		extent_key.offset = num_bytes;
4866 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4867 		nr_extent = 1;
4868 		ret = get_new_locations(reloc_inode, &extent_key,
4869 					group->key.objectid, 1,
4870 					&new_extent, &nr_extent);
4871 		if (ret > 0)
4872 			continue;
4873 		BUG_ON(ret < 0);
4874 
4875 		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4876 		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4877 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4878 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4879 
4880 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
4881 						new_extent->disk_bytenr);
4882 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4883 						new_extent->disk_num_bytes);
4884 		btrfs_mark_buffer_dirty(leaf);
4885 
4886 		ret = btrfs_inc_extent_ref(trans, root,
4887 					new_extent->disk_bytenr,
4888 					new_extent->disk_num_bytes,
4889 					leaf->start,
4890 					root->root_key.objectid,
4891 					trans->transid, key.objectid);
4892 		BUG_ON(ret);
4893 
4894 		ret = btrfs_free_extent(trans, root,
4895 					bytenr, num_bytes, leaf->start,
4896 					btrfs_header_owner(leaf),
4897 					btrfs_header_generation(leaf),
4898 					key.objectid, 0);
4899 		BUG_ON(ret);
4900 		cond_resched();
4901 	}
4902 	kfree(new_extent);
4903 	BUG_ON(ext_index + 1 != ref->nritems);
4904 	btrfs_free_leaf_ref(root, ref);
4905 	return 0;
4906 }
4907 
4908 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4909 			  struct btrfs_root *root)
4910 {
4911 	struct btrfs_root *reloc_root;
4912 	int ret;
4913 
4914 	if (root->reloc_root) {
4915 		reloc_root = root->reloc_root;
4916 		root->reloc_root = NULL;
4917 		list_add(&reloc_root->dead_list,
4918 			 &root->fs_info->dead_reloc_roots);
4919 
4920 		btrfs_set_root_bytenr(&reloc_root->root_item,
4921 				      reloc_root->node->start);
4922 		btrfs_set_root_level(&root->root_item,
4923 				     btrfs_header_level(reloc_root->node));
4924 		memset(&reloc_root->root_item.drop_progress, 0,
4925 			sizeof(struct btrfs_disk_key));
4926 		reloc_root->root_item.drop_level = 0;
4927 
4928 		ret = btrfs_update_root(trans, root->fs_info->tree_root,
4929 					&reloc_root->root_key,
4930 					&reloc_root->root_item);
4931 		BUG_ON(ret);
4932 	}
4933 	return 0;
4934 }
4935 
4936 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4937 {
4938 	struct btrfs_trans_handle *trans;
4939 	struct btrfs_root *reloc_root;
4940 	struct btrfs_root *prev_root = NULL;
4941 	struct list_head dead_roots;
4942 	int ret;
4943 	unsigned long nr;
4944 
4945 	INIT_LIST_HEAD(&dead_roots);
4946 	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4947 
4948 	while (!list_empty(&dead_roots)) {
4949 		reloc_root = list_entry(dead_roots.prev,
4950 					struct btrfs_root, dead_list);
4951 		list_del_init(&reloc_root->dead_list);
4952 
4953 		BUG_ON(reloc_root->commit_root != NULL);
4954 		while (1) {
4955 			trans = btrfs_join_transaction(root, 1);
4956 			BUG_ON(!trans);
4957 
4958 			mutex_lock(&root->fs_info->drop_mutex);
4959 			ret = btrfs_drop_snapshot(trans, reloc_root);
4960 			if (ret != -EAGAIN)
4961 				break;
4962 			mutex_unlock(&root->fs_info->drop_mutex);
4963 
4964 			nr = trans->blocks_used;
4965 			ret = btrfs_end_transaction(trans, root);
4966 			BUG_ON(ret);
4967 			btrfs_btree_balance_dirty(root, nr);
4968 		}
4969 
4970 		free_extent_buffer(reloc_root->node);
4971 
4972 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
4973 				     &reloc_root->root_key);
4974 		BUG_ON(ret);
4975 		mutex_unlock(&root->fs_info->drop_mutex);
4976 
4977 		nr = trans->blocks_used;
4978 		ret = btrfs_end_transaction(trans, root);
4979 		BUG_ON(ret);
4980 		btrfs_btree_balance_dirty(root, nr);
4981 
4982 		kfree(prev_root);
4983 		prev_root = reloc_root;
4984 	}
4985 	if (prev_root) {
4986 		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4987 		kfree(prev_root);
4988 	}
4989 	return 0;
4990 }
4991 
4992 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
4993 {
4994 	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
4995 	return 0;
4996 }
4997 
4998 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
4999 {
5000 	struct btrfs_root *reloc_root;
5001 	struct btrfs_trans_handle *trans;
5002 	struct btrfs_key location;
5003 	int found;
5004 	int ret;
5005 
5006 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5007 	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5008 	BUG_ON(ret);
5009 	found = !list_empty(&root->fs_info->dead_reloc_roots);
5010 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5011 
5012 	if (found) {
5013 		trans = btrfs_start_transaction(root, 1);
5014 		BUG_ON(!trans);
5015 		ret = btrfs_commit_transaction(trans, root);
5016 		BUG_ON(ret);
5017 	}
5018 
5019 	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5020 	location.offset = (u64)-1;
5021 	location.type = BTRFS_ROOT_ITEM_KEY;
5022 
5023 	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5024 	BUG_ON(!reloc_root);
5025 	btrfs_orphan_cleanup(reloc_root);
5026 	return 0;
5027 }
5028 
5029 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5030 				    struct btrfs_root *root)
5031 {
5032 	struct btrfs_root *reloc_root;
5033 	struct extent_buffer *eb;
5034 	struct btrfs_root_item *root_item;
5035 	struct btrfs_key root_key;
5036 	int ret;
5037 
5038 	BUG_ON(!root->ref_cows);
5039 	if (root->reloc_root)
5040 		return 0;
5041 
5042 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5043 	BUG_ON(!root_item);
5044 
5045 	ret = btrfs_copy_root(trans, root, root->commit_root,
5046 			      &eb, BTRFS_TREE_RELOC_OBJECTID);
5047 	BUG_ON(ret);
5048 
5049 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5050 	root_key.offset = root->root_key.objectid;
5051 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5052 
5053 	memcpy(root_item, &root->root_item, sizeof(*root_item));
5054 	btrfs_set_root_refs(root_item, 0);
5055 	btrfs_set_root_bytenr(root_item, eb->start);
5056 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
5057 	btrfs_set_root_generation(root_item, trans->transid);
5058 
5059 	btrfs_tree_unlock(eb);
5060 	free_extent_buffer(eb);
5061 
5062 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5063 				&root_key, root_item);
5064 	BUG_ON(ret);
5065 	kfree(root_item);
5066 
5067 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5068 						 &root_key);
5069 	BUG_ON(!reloc_root);
5070 	reloc_root->last_trans = trans->transid;
5071 	reloc_root->commit_root = NULL;
5072 	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5073 
5074 	root->reloc_root = reloc_root;
5075 	return 0;
5076 }
5077 
5078 /*
5079  * Core function of space balance.
5080  *
5081  * The idea is using reloc trees to relocate tree blocks in reference
5082  * The idea is to use reloc trees to relocate tree blocks in reference
5083  * counted roots. There is one reloc tree for each subvol, and all
5084  * reloc trees share the same root key objectid. Reloc trees are snapshots
5085  *
5086  * To relocate a tree block referenced by a subvol, there are two steps.
5087  * COW the block through subvol's reloc tree, then update block pointer
5088  * in the subvol to point to the new block. Since all reloc trees share
5089  * same root key objectid, doing special handling for tree blocks owned
5090  * by them is easy. Once a tree block has been COWed in one reloc tree,
5091  * we can use the resulting new block directly when the same block is
5092  * required to COW again through other reloc trees. In this way, relocated
5093  * tree blocks are shared between reloc trees, so they are also shared
5094  * between subvols.
5095  */
5096 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5097 				      struct btrfs_root *root,
5098 				      struct btrfs_path *path,
5099 				      struct btrfs_key *first_key,
5100 				      struct btrfs_ref_path *ref_path,
5101 				      struct btrfs_block_group_cache *group,
5102 				      struct inode *reloc_inode)
5103 {
5104 	struct btrfs_root *reloc_root;
5105 	struct extent_buffer *eb = NULL;
5106 	struct btrfs_key *keys;
5107 	u64 *nodes;
5108 	int level;
5109 	int shared_level;
5110 	int lowest_level = 0;
5111 	int ret;
5112 
5113 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5114 		lowest_level = ref_path->owner_objectid;
5115 
5116 	if (!root->ref_cows) {
5117 		path->lowest_level = lowest_level;
5118 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5119 		BUG_ON(ret < 0);
5120 		path->lowest_level = 0;
5121 		btrfs_release_path(root, path);
5122 		return 0;
5123 	}
5124 
5125 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5126 	ret = init_reloc_tree(trans, root);
5127 	BUG_ON(ret);
5128 	reloc_root = root->reloc_root;
5129 
5130 	shared_level = ref_path->shared_level;
5131 	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5132 
5133 	keys = ref_path->node_keys;
5134 	nodes = ref_path->new_nodes;
5135 	memset(&keys[shared_level + 1], 0,
5136 	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5137 	memset(&nodes[shared_level + 1], 0,
5138 	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5139 
5140 	if (nodes[lowest_level] == 0) {
5141 		path->lowest_level = lowest_level;
5142 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5143 					0, 1);
5144 		BUG_ON(ret);
5145 		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5146 			eb = path->nodes[level];
5147 			if (!eb || eb == reloc_root->node)
5148 				break;
5149 			nodes[level] = eb->start;
5150 			if (level == 0)
5151 				btrfs_item_key_to_cpu(eb, &keys[level], 0);
5152 			else
5153 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
5154 		}
5155 		if (nodes[0] &&
5156 		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5157 			eb = path->nodes[0];
5158 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
5159 						      group, reloc_inode);
5160 			BUG_ON(ret);
5161 		}
5162 		btrfs_release_path(reloc_root, path);
5163 	} else {
5164 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5165 				       lowest_level);
5166 		BUG_ON(ret);
5167 	}
5168 
5169 	/*
5170 	 * replace tree blocks in the fs tree with tree blocks in
5171 	 * the reloc tree.
5172 	 */
5173 	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5174 	BUG_ON(ret < 0);
5175 
5176 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5177 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5178 					0, 0);
5179 		BUG_ON(ret);
5180 		extent_buffer_get(path->nodes[0]);
5181 		eb = path->nodes[0];
5182 		btrfs_release_path(reloc_root, path);
5183 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
5184 		BUG_ON(ret);
5185 		free_extent_buffer(eb);
5186 	}
5187 
5188 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5189 	path->lowest_level = 0;
5190 	return 0;
5191 }
5192 
5193 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
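/*
 * Relocate a tree block reference: this is the metadata-only case, so it
 * simply drives relocate_one_path() with no block group or reloc inode.
 */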
5194 					struct btrfs_root *root,
5195 					struct btrfs_path *path,
5196 					struct btrfs_key *first_key,
5197 					struct btrfs_ref_path *ref_path)
5198 {
5199 	int ret;
5200 
5201 	ret = relocate_one_path(trans, root, path, first_key,
5202 				ref_path, NULL, NULL);
5203 	BUG_ON(ret);
5204 
5205 	return 0;
5206 }
5207 
5208 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5209 				    struct btrfs_root *extent_root,
5210 				    struct btrfs_path *path,
5211 				    struct btrfs_key *extent_key)
5212 {
5213 	int ret;
5214 
5215 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5216 	if (ret)
5217 		goto out;
5218 	ret = btrfs_del_item(trans, extent_root, path);
5219 out:
5220 	btrfs_release_path(extent_root, path);
5221 	return ret;
5222 }
5223 
5224 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5225 						struct btrfs_ref_path *ref_path)
5226 {
5227 	struct btrfs_key root_key;
5228 
5229 	root_key.objectid = ref_path->root_objectid;
5230 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5231 	if (is_cowonly_root(ref_path->root_objectid))
5232 		root_key.offset = 0;
5233 	else
5234 		root_key.offset = (u64)-1;
5235 
5236 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
5237 }
5238 
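/*
 * Move one extent out of the block group being shrunk.  Loose outline of
 * the loop below: walk every back reference path of the extent with
 * btrfs_first_ref_path()/btrfs_next_ref_path(); for data extents, pass 0
 * copies the data into the reloc inode via relocate_data_extent(), pass 1
 * updates the references through the reloc trees (relocate_one_path()),
 * and anything left over is rewritten directly by replace_one_extent();
 * for tree blocks, relocate_tree_block() is used.
 */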
5239 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5240 					struct btrfs_path *path,
5241 					struct btrfs_key *extent_key,
5242 					struct btrfs_block_group_cache *group,
5243 					struct inode *reloc_inode, int pass)
5244 {
5245 	struct btrfs_trans_handle *trans;
5246 	struct btrfs_root *found_root;
5247 	struct btrfs_ref_path *ref_path = NULL;
5248 	struct disk_extent *new_extents = NULL;
5249 	int nr_extents = 0;
5250 	int loops;
5251 	int ret;
5252 	int level;
5253 	struct btrfs_key first_key;
5254 	u64 prev_block = 0;
5255 
5256 
5257 	trans = btrfs_start_transaction(extent_root, 1);
5258 	BUG_ON(!trans);
5259 
5260 	if (extent_key->objectid == 0) {
5261 		ret = del_extent_zero(trans, extent_root, path, extent_key);
5262 		goto out;
5263 	}
5264 
5265 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5266 	if (!ref_path) {
5267 		ret = -ENOMEM;
5268 		goto out;
5269 	}
5270 
5271 	for (loops = 0; ; loops++) {
5272 		if (loops == 0) {
5273 			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5274 						   extent_key->objectid);
5275 		} else {
5276 			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5277 		}
5278 		if (ret < 0)
5279 			goto out;
5280 		if (ret > 0)
5281 			break;
5282 
5283 		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5284 		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5285 			continue;
5286 
5287 		found_root = read_ref_root(extent_root->fs_info, ref_path);
5288 		BUG_ON(!found_root);
5289 		/*
5290 		 * for a reference counted tree, only process reference paths
5291 		 * rooted at the latest committed root.
5292 		 */
5293 		if (found_root->ref_cows &&
5294 		    ref_path->root_generation != found_root->root_key.offset)
5295 			continue;
5296 
5297 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5298 			if (pass == 0) {
5299 				/*
5300 				 * copy data extents to new locations
5301 				 */
5302 				u64 group_start = group->key.objectid;
5303 				ret = relocate_data_extent(reloc_inode,
5304 							   extent_key,
5305 							   group_start);
5306 				if (ret < 0)
5307 					goto out;
5308 				break;
5309 			}
5310 			level = 0;
5311 		} else {
5312 			level = ref_path->owner_objectid;
5313 		}
5314 
5315 		if (prev_block != ref_path->nodes[level]) {
5316 			struct extent_buffer *eb;
5317 			u64 block_start = ref_path->nodes[level];
5318 			u64 block_size = btrfs_level_size(found_root, level);
5319 
5320 			eb = read_tree_block(found_root, block_start,
5321 					     block_size, 0);
5322 			btrfs_tree_lock(eb);
5323 			BUG_ON(level != btrfs_header_level(eb));
5324 
5325 			if (level == 0)
5326 				btrfs_item_key_to_cpu(eb, &first_key, 0);
5327 			else
5328 				btrfs_node_key_to_cpu(eb, &first_key, 0);
5329 
5330 			btrfs_tree_unlock(eb);
5331 			free_extent_buffer(eb);
5332 			prev_block = block_start;
5333 		}
5334 
5335 		mutex_lock(&extent_root->fs_info->trans_mutex);
5336 		btrfs_record_root_in_trans(found_root);
5337 		mutex_unlock(&extent_root->fs_info->trans_mutex);
5338 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5339 			/*
5340 			 * try to update data extent references while
5341 			 * keeping metadata shared between snapshots.
5342 			 */
5343 			if (pass == 1) {
5344 				ret = relocate_one_path(trans, found_root,
5345 						path, &first_key, ref_path,
5346 						group, reloc_inode);
5347 				if (ret < 0)
5348 					goto out;
5349 				continue;
5350 			}
5351 			/*
5352 			 * use fallback method to process the remaining
5353 			 * references.
5354 			 */
5355 			if (!new_extents) {
5356 				u64 group_start = group->key.objectid;
5357 				new_extents = kmalloc(sizeof(*new_extents),
5358 						      GFP_NOFS);
5359 				nr_extents = 1;
5360 				ret = get_new_locations(reloc_inode,
5361 							extent_key,
5362 							group_start, 1,
5363 							&new_extents,
5364 							&nr_extents);
5365 				if (ret)
5366 					goto out;
5367 			}
5368 			ret = replace_one_extent(trans, found_root,
5369 						path, extent_key,
5370 						&first_key, ref_path,
5371 						new_extents, nr_extents);
5372 		} else {
5373 			ret = relocate_tree_block(trans, found_root, path,
5374 						  &first_key, ref_path);
5375 		}
5376 		if (ret < 0)
5377 			goto out;
5378 	}
5379 	ret = 0;
5380 out:
5381 	btrfs_end_transaction(trans, extent_root);
5382 	kfree(new_extents);
5383 	kfree(ref_path);
5384 	return ret;
5385 }
5386 
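/*
 * Decide which raid profile newly allocated chunks should use while the
 * old block group is drained.  A rough summary of the mapping implemented
 * below:
 *
 *	single rw device:   RAID0 -> single,  RAID1/RAID10 -> DUP
 *	multiple devices:   DUP -> RAID1,  single -> RAID0,
 *	                    existing RAID0/1/10 -> left unchanged
 */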
5387 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5388 {
5389 	u64 num_devices;
5390 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5391 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5392 
5393 	num_devices = root->fs_info->fs_devices->rw_devices;
5394 	if (num_devices == 1) {
5395 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5396 		stripped = flags & ~stripped;
5397 
5398 		/* turn raid0 into single device chunks */
5399 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
5400 			return stripped;
5401 
5402 		/* turn mirroring into duplication */
5403 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5404 			     BTRFS_BLOCK_GROUP_RAID10))
5405 			return stripped | BTRFS_BLOCK_GROUP_DUP;
5406 		return flags;
5407 	} else {
5408 		/* they already had raid on here, just return */
5409 		if (flags & stripped)
5410 			return flags;
5411 
5412 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 		stripped = flags & ~stripped;
5414 
5415 		/* switch duplicated blocks with raid1 */
5416 		if (flags & BTRFS_BLOCK_GROUP_DUP)
5417 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
5418 
5419 		/* turn single device chunks into raid0 */
5420 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
5421 	}
5422 	return flags;
5423 }
5424 
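/*
 * Make sure there is somewhere to relocate into: if the block group being
 * shrunk still has bytes in use, force allocation of a new chunk (with the
 * profile chosen by update_block_group_flags()) large enough to hold them.
 */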
5425 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5426 		     struct btrfs_block_group_cache *shrink_block_group,
5427 		     int force)
5428 {
5429 	struct btrfs_trans_handle *trans;
5430 	u64 new_alloc_flags;
5431 	u64 calc;
5432 
5433 	spin_lock(&shrink_block_group->lock);
5434 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5435 		spin_unlock(&shrink_block_group->lock);
5436 
5437 		trans = btrfs_start_transaction(root, 1);
5438 		spin_lock(&shrink_block_group->lock);
5439 
5440 		new_alloc_flags = update_block_group_flags(root,
5441 						   shrink_block_group->flags);
5442 		if (new_alloc_flags != shrink_block_group->flags) {
5443 			calc =
5444 			     btrfs_block_group_used(&shrink_block_group->item);
5445 		} else {
5446 			calc = shrink_block_group->key.offset;
5447 		}
5448 		spin_unlock(&shrink_block_group->lock);
5449 
5450 		do_chunk_alloc(trans, root->fs_info->extent_root,
5451 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
5452 
5453 		btrfs_end_transaction(trans, root);
5454 	} else
5455 		spin_unlock(&shrink_block_group->lock);
5456 	return 0;
5457 }
5458 
5459 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5460 				 struct btrfs_root *root,
5461 				 u64 objectid, u64 size)
5462 {
5463 	struct btrfs_path *path;
5464 	struct btrfs_inode_item *item;
5465 	struct extent_buffer *leaf;
5466 	int ret;
5467 
5468 	path = btrfs_alloc_path();
5469 	if (!path)
5470 		return -ENOMEM;
5471 
5472 	path->leave_spinning = 1;
5473 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5474 	if (ret)
5475 		goto out;
5476 
5477 	leaf = path->nodes[0];
5478 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5479 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5480 	btrfs_set_inode_generation(leaf, item, 1);
5481 	btrfs_set_inode_size(leaf, item, size);
5482 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5483 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5484 	btrfs_mark_buffer_dirty(leaf);
5485 	btrfs_release_path(root, path);
5486 out:
5487 	btrfs_free_path(path);
5488 	return ret;
5489 }
5490 
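/*
 * create an inode in the data relocation tree to receive the data
 * copied out of a block group.  the inode covers a single file extent
 * item spanning the size of the group and is added to the orphan
 * list so its space is reclaimed once relocation is done.
 */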
5491 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5492 					struct btrfs_block_group_cache *group)
5493 {
5494 	struct inode *inode = NULL;
5495 	struct btrfs_trans_handle *trans;
5496 	struct btrfs_root *root;
5497 	struct btrfs_key root_key;
5498 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5499 	int err = 0;
5500 
5501 	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5502 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5503 	root_key.offset = (u64)-1;
5504 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5505 	if (IS_ERR(root))
5506 		return ERR_CAST(root);
5507 
5508 	trans = btrfs_start_transaction(root, 1);
5509 	BUG_ON(!trans);
5510 
5511 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5512 	if (err)
5513 		goto out;
5514 
5515 	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5516 	BUG_ON(err);
5517 
5518 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5519 				       group->key.offset, 0, group->key.offset,
5520 				       0, 0, 0);
5521 	BUG_ON(err);
5522 
5523 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5524 	if (inode->i_state & I_NEW) {
5525 		BTRFS_I(inode)->root = root;
5526 		BTRFS_I(inode)->location.objectid = objectid;
5527 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5528 		BTRFS_I(inode)->location.offset = 0;
5529 		btrfs_read_locked_inode(inode);
5530 		unlock_new_inode(inode);
5531 		BUG_ON(is_bad_inode(inode));
5532 	} else {
5533 		BUG_ON(1);
5534 	}
5535 	BTRFS_I(inode)->index_cnt = group->key.objectid;
5536 
5537 	err = btrfs_orphan_add(trans, inode);
5538 out:
5539 	btrfs_end_transaction(trans, root);
5540 	if (err) {
5541 		if (inode)
5542 			iput(inode);
5543 		inode = ERR_PTR(err);
5544 	}
5545 	return inode;
5546 }
5547 
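/*
 * copy the checksums that covered an extent at its old location
 * inside the block group being relocated over to the ordered extent
 * writing it out at its new location.  each sector sum's bytenr is
 * shifted by the distance the extent moved.
 */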
5548 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5549 {
5551 	struct btrfs_ordered_sum *sums;
5552 	struct btrfs_sector_sum *sector_sum;
5553 	struct btrfs_ordered_extent *ordered;
5554 	struct btrfs_root *root = BTRFS_I(inode)->root;
5555 	struct list_head list;
5556 	size_t offset;
5557 	int ret;
5558 	u64 disk_bytenr;
5559 
5560 	INIT_LIST_HEAD(&list);
5561 
5562 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5563 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5564 
5565 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5566 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5567 				       disk_bytenr + len - 1, &list);
5568 
5569 	while (!list_empty(&list)) {
5570 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5571 		list_del_init(&sums->list);
5572 
5573 		sector_sum = sums->sums;
5574 		sums->bytenr = ordered->start;
5575 
5576 		offset = 0;
5577 		while (offset < sums->len) {
5578 			sector_sum->bytenr += ordered->start - disk_bytenr;
5579 			sector_sum++;
5580 			offset += root->sectorsize;
5581 		}
5582 
5583 		btrfs_add_ordered_sum(inode, ordered, sums);
5584 	}
5585 	btrfs_put_ordered_extent(ordered);
5586 	return 0;
5587 }
5588 
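/*
 * move every extent out of the block group starting at group_start.
 * the group is marked read-only and a relocation inode is created to
 * hold the data, then the extent tree is scanned in passes: each pass
 * relocates what it can and counts what it had to skip, and we stop
 * once a pass finds nothing left inside the group.
 */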
5589 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5590 {
5591 	struct btrfs_trans_handle *trans;
5592 	struct btrfs_path *path;
5593 	struct btrfs_fs_info *info = root->fs_info;
5594 	struct extent_buffer *leaf;
5595 	struct inode *reloc_inode;
5596 	struct btrfs_block_group_cache *block_group;
5597 	struct btrfs_key key;
5598 	u64 skipped;
5599 	u64 cur_byte;
5600 	u64 total_found;
5601 	u32 nritems;
5602 	int ret;
5603 	int progress;
5604 	int pass = 0;
5605 
5606 	root = root->fs_info->extent_root;
5607 
5608 	block_group = btrfs_lookup_block_group(info, group_start);
5609 	BUG_ON(!block_group);
5610 
5611 	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5612 	       (unsigned long long)block_group->key.objectid,
5613 	       (unsigned long long)block_group->flags);
5614 
5615 	path = btrfs_alloc_path();
5616 	BUG_ON(!path);
5617 
5618 	reloc_inode = create_reloc_inode(info, block_group);
5619 	BUG_ON(IS_ERR(reloc_inode));
5620 
5621 	__alloc_chunk_for_shrink(root, block_group, 1);
5622 	set_block_group_readonly(block_group);
5623 
5624 	btrfs_start_delalloc_inodes(info->tree_root);
5625 	btrfs_wait_ordered_extents(info->tree_root, 0);
5626 again:
5627 	skipped = 0;
5628 	total_found = 0;
5629 	progress = 0;
5630 	key.objectid = block_group->key.objectid;
5631 	key.offset = 0;
5632 	key.type = 0;
5633 	cur_byte = key.objectid;
5634 
5635 	trans = btrfs_start_transaction(info->tree_root, 1);
5636 	btrfs_commit_transaction(trans, info->tree_root);
5637 
5638 	mutex_lock(&root->fs_info->cleaner_mutex);
5639 	btrfs_clean_old_snapshots(info->tree_root);
5640 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5641 	mutex_unlock(&root->fs_info->cleaner_mutex);
5642 
5643 	trans = btrfs_start_transaction(info->tree_root, 1);
5644 	btrfs_commit_transaction(trans, info->tree_root);
5645 
5646 	while (1) {
5647 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5648 		if (ret < 0)
5649 			goto out;
5650 next:
5651 		leaf = path->nodes[0];
5652 		nritems = btrfs_header_nritems(leaf);
5653 		if (path->slots[0] >= nritems) {
5654 			ret = btrfs_next_leaf(root, path);
5655 			if (ret < 0)
5656 				goto out;
5657 			if (ret == 1) {
5658 				ret = 0;
5659 				break;
5660 			}
5661 			leaf = path->nodes[0];
5662 			nritems = btrfs_header_nritems(leaf);
5663 		}
5664 
5665 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5666 
5667 		if (key.objectid >= block_group->key.objectid +
5668 		    block_group->key.offset)
5669 			break;
5670 
5671 		if (progress && need_resched()) {
5672 			btrfs_release_path(root, path);
5673 			cond_resched();
5674 			progress = 0;
5675 			continue;
5676 		}
5677 		progress = 1;
5678 
5679 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5680 		    key.objectid + key.offset <= cur_byte) {
5681 			path->slots[0]++;
5682 			goto next;
5683 		}
5684 
5685 		total_found++;
5686 		cur_byte = key.objectid + key.offset;
5687 		btrfs_release_path(root, path);
5688 
5689 		__alloc_chunk_for_shrink(root, block_group, 0);
5690 		ret = relocate_one_extent(root, path, &key, block_group,
5691 					  reloc_inode, pass);
5692 		BUG_ON(ret < 0);
5693 		if (ret > 0)
5694 			skipped++;
5695 
5696 		key.objectid = cur_byte;
5697 		key.type = 0;
5698 		key.offset = 0;
5699 	}
5700 
5701 	btrfs_release_path(root, path);
5702 
5703 	if (pass == 0) {
5704 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5705 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5706 	}
5707 
5708 	if (total_found > 0) {
5709 		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5710 		       (unsigned long long)total_found, pass);
5711 		pass++;
5712 		if (total_found == skipped && pass > 2) {
5713 			iput(reloc_inode);
5714 			reloc_inode = create_reloc_inode(info, block_group);
5715 			pass = 0;
5716 		}
5717 		goto again;
5718 	}
5719 
5720 	/* delete reloc_inode */
5721 	iput(reloc_inode);
5722 
5723 	/* unpin extents in this range */
5724 	trans = btrfs_start_transaction(info->tree_root, 1);
5725 	btrfs_commit_transaction(trans, info->tree_root);
5726 
5727 	spin_lock(&block_group->lock);
5728 	WARN_ON(block_group->pinned > 0);
5729 	WARN_ON(block_group->reserved > 0);
5730 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5731 	spin_unlock(&block_group->lock);
5732 	put_block_group(block_group);
5733 	ret = 0;
5734 out:
5735 	btrfs_free_path(path);
5736 	return ret;
5737 }
5738 
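/*
 * find the first block group item at or after *key, leaving the path
 * pointing at it.  returns 0 when one is found and -ENOENT when the
 * extent tree has no more block group items.
 */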
5739 static int find_first_block_group(struct btrfs_root *root,
5740 		struct btrfs_path *path, struct btrfs_key *key)
5741 {
5742 	int ret = 0;
5743 	struct btrfs_key found_key;
5744 	struct extent_buffer *leaf;
5745 	int slot;
5746 
5747 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5748 	if (ret < 0)
5749 		goto out;
5750 
5751 	while (1) {
5752 		slot = path->slots[0];
5753 		leaf = path->nodes[0];
5754 		if (slot >= btrfs_header_nritems(leaf)) {
5755 			ret = btrfs_next_leaf(root, path);
5756 			if (ret == 0)
5757 				continue;
5758 			if (ret < 0)
5759 				goto out;
5760 			break;
5761 		}
5762 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5763 
5764 		if (found_key.objectid >= key->objectid &&
5765 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5766 			ret = 0;
5767 			goto out;
5768 		}
5769 		path->slots[0]++;
5770 	}
5771 	ret = -ENOENT;
5772 out:
5773 	return ret;
5774 }
5775 
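/*
 * tear down the in-memory block group cache at unmount: unlink every
 * block group from the rb tree and its space_info, drop its free
 * space cache, then free the space_info structs themselves.
 */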
5776 int btrfs_free_block_groups(struct btrfs_fs_info *info)
5777 {
5778 	struct btrfs_block_group_cache *block_group;
5779 	struct btrfs_space_info *space_info;
5780 	struct rb_node *n;
5781 
5782 	spin_lock(&info->block_group_cache_lock);
5783 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5784 		block_group = rb_entry(n, struct btrfs_block_group_cache,
5785 				       cache_node);
5786 		rb_erase(&block_group->cache_node,
5787 			 &info->block_group_cache_tree);
5788 		spin_unlock(&info->block_group_cache_lock);
5789 
5790 		btrfs_remove_free_space_cache(block_group);
5791 		down_write(&block_group->space_info->groups_sem);
5792 		list_del(&block_group->list);
5793 		up_write(&block_group->space_info->groups_sem);
5794 
5795 		WARN_ON(atomic_read(&block_group->count) != 1);
5796 		kfree(block_group);
5797 
5798 		spin_lock(&info->block_group_cache_lock);
5799 	}
5800 	spin_unlock(&info->block_group_cache_lock);
5801 
5802 	/* now that all the block groups are freed, go through and
5803 	 * free all the space_info structs.  This is only called during
5804 	 * the final stages of unmount, and so we know nobody is
5805 	 * using them.  We call synchronize_rcu() once before we start,
5806 	 * just to be on the safe side.
5807 	 */
5808 	synchronize_rcu();
5809 
5810 	while (!list_empty(&info->space_info)) {
5811 		space_info = list_entry(info->space_info.next,
5812 					struct btrfs_space_info,
5813 					list);
5814 
5815 		list_del(&space_info->list);
5816 		kfree(space_info);
5817 	}
5818 	return 0;
5819 }
5820 
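/*
 * read every block group item out of the extent tree at mount time
 * and build the in-memory cache for it, wiring each group into its
 * space_info and marking it read-only if its chunk is read-only.
 */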
5821 int btrfs_read_block_groups(struct btrfs_root *root)
5822 {
5823 	struct btrfs_path *path;
5824 	int ret;
5825 	struct btrfs_block_group_cache *cache;
5826 	struct btrfs_fs_info *info = root->fs_info;
5827 	struct btrfs_space_info *space_info;
5828 	struct btrfs_key key;
5829 	struct btrfs_key found_key;
5830 	struct extent_buffer *leaf;
5831 
5832 	root = info->extent_root;
5833 	key.objectid = 0;
5834 	key.offset = 0;
5835 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5836 	path = btrfs_alloc_path();
5837 	if (!path)
5838 		return -ENOMEM;
5839 
5840 	while (1) {
5841 		ret = find_first_block_group(root, path, &key);
5842 		if (ret > 0) {
5843 			ret = 0;
5844 			goto error;
5845 		}
5846 		if (ret != 0)
5847 			goto error;
5848 
5849 		leaf = path->nodes[0];
5850 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5851 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
5852 		if (!cache) {
5853 			ret = -ENOMEM;
5854 			break;
5855 		}
5856 
5857 		atomic_set(&cache->count, 1);
5858 		spin_lock_init(&cache->lock);
5859 		mutex_init(&cache->alloc_mutex);
5860 		mutex_init(&cache->cache_mutex);
5861 		INIT_LIST_HEAD(&cache->list);
5862 		read_extent_buffer(leaf, &cache->item,
5863 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
5864 				   sizeof(cache->item));
5865 		memcpy(&cache->key, &found_key, sizeof(found_key));
5866 
5867 		key.objectid = found_key.objectid + found_key.offset;
5868 		btrfs_release_path(root, path);
5869 		cache->flags = btrfs_block_group_flags(&cache->item);
5870 
5871 		ret = update_space_info(info, cache->flags, found_key.offset,
5872 					btrfs_block_group_used(&cache->item),
5873 					&space_info);
5874 		BUG_ON(ret);
5875 		cache->space_info = space_info;
5876 		down_write(&space_info->groups_sem);
5877 		list_add_tail(&cache->list, &space_info->block_groups);
5878 		up_write(&space_info->groups_sem);
5879 
5880 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
5881 		BUG_ON(ret);
5882 
5883 		set_avail_alloc_bits(root->fs_info, cache->flags);
5884 		if (btrfs_chunk_readonly(root, cache->key.objectid))
5885 			set_block_group_readonly(cache);
5886 	}
5887 	ret = 0;
5888 error:
5889 	btrfs_free_path(path);
5890 	return ret;
5891 }
5892 
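/*
 * create the in-memory cache entry and the on-disk block group item
 * for a newly allocated chunk, and account the new space in the
 * matching space_info.
 */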
5893 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5894 			   struct btrfs_root *root, u64 bytes_used,
5895 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
5896 			   u64 size)
5897 {
5898 	int ret;
5899 	struct btrfs_root *extent_root;
5900 	struct btrfs_block_group_cache *cache;
5901 
5902 	extent_root = root->fs_info->extent_root;
5903 
5904 	root->fs_info->last_trans_log_full_commit = trans->transid;
5905 
5906 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
5907 	if (!cache)
5908 		return -ENOMEM;
5909 
5910 	cache->key.objectid = chunk_offset;
5911 	cache->key.offset = size;
5912 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5913 	atomic_set(&cache->count, 1);
5914 	spin_lock_init(&cache->lock);
5915 	mutex_init(&cache->alloc_mutex);
5916 	mutex_init(&cache->cache_mutex);
5917 	INIT_LIST_HEAD(&cache->list);
5918 
5919 	btrfs_set_block_group_used(&cache->item, bytes_used);
5920 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5921 	cache->flags = type;
5922 	btrfs_set_block_group_flags(&cache->item, type);
5923 
5924 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5925 				&cache->space_info);
5926 	BUG_ON(ret);
5927 	down_write(&cache->space_info->groups_sem);
5928 	list_add_tail(&cache->list, &cache->space_info->block_groups);
5929 	up_write(&cache->space_info->groups_sem);
5930 
5931 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
5932 	BUG_ON(ret);
5933 
5934 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5935 				sizeof(cache->item));
5936 	BUG_ON(ret);
5937 
5938 	set_avail_alloc_bits(extent_root->fs_info, type);
5939 
5940 	return 0;
5941 }
5942 
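/*
 * remove an empty, read-only block group: unlink it from the cache
 * rb tree and its space_info, give back the space it accounted for
 * and delete its block group item from the extent tree.
 */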
5943 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5944 			     struct btrfs_root *root, u64 group_start)
5945 {
5946 	struct btrfs_path *path;
5947 	struct btrfs_block_group_cache *block_group;
5948 	struct btrfs_key key;
5949 	int ret;
5950 
5951 	root = root->fs_info->extent_root;
5952 
5953 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5954 	BUG_ON(!block_group);
5955 	BUG_ON(!block_group->ro);
5956 
5957 	memcpy(&key, &block_group->key, sizeof(key));
5958 
5959 	path = btrfs_alloc_path();
5960 	BUG_ON(!path);
5961 
5962 	spin_lock(&root->fs_info->block_group_cache_lock);
5963 	rb_erase(&block_group->cache_node,
5964 		 &root->fs_info->block_group_cache_tree);
5965 	spin_unlock(&root->fs_info->block_group_cache_lock);
5966 	btrfs_remove_free_space_cache(block_group);
5967 	down_write(&block_group->space_info->groups_sem);
5968 	list_del(&block_group->list);
5969 	up_write(&block_group->space_info->groups_sem);
5970 
5971 	spin_lock(&block_group->space_info->lock);
5972 	block_group->space_info->total_bytes -= block_group->key.offset;
5973 	block_group->space_info->bytes_readonly -= block_group->key.offset;
5974 	spin_unlock(&block_group->space_info->lock);
5975 	block_group->space_info->full = 0;
5976 
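	/*
	 * drop both the reference taken by btrfs_lookup_block_group()
	 * above and the cache's original reference
	 */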
5977 	put_block_group(block_group);
5978 	put_block_group(block_group);
5979 
5980 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5981 	if (ret > 0)
5982 		ret = -EIO;
5983 	if (ret < 0)
5984 		goto out;
5985 
5986 	ret = btrfs_del_item(trans, root, path);
5987 out:
5988 	btrfs_free_path(path);
5989 	return ret;
5990 }
5991