/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes, u64 parent,
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
				struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	/* Make sure we see the most recent value of ->cached. */
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		BUG_ON(ret); /* -ENOMEM */
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		BUG_ON(ret); /* -ENOMEM */

		while (nr--) {
			cache->bytes_super += stripe_len;
			ret = add_excluded_extent(root, logical[nr],
						  stripe_len);
			BUG_ON(ret); /* -ENOMEM */
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check pinned_extents for any extents that can't be
 * used yet, because their free space will not be released until the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
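
/*
 * Worked example for add_new_free_space() (illustrative numbers only, not
 * taken from a real trace): caching the range [0, 100) while a pinned
 * extent covers bytes 30-49.  The loop above adds [0, 30) (30 bytes) as
 * free space and advances start to 50; the tail then adds [50, 100)
 * (50 bytes), so the function returns a total of 80 bytes added.
 */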

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = 0;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 1;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched() ||
			    btrfs_next_leaf(extent_root, path)) {
				caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->extent_commit_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				goto again;
			}
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
out:
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	caching_ctl->work.func = caching_thread;

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and another thread then starts
	 * a transaction commit that tries to do an allocation while the
	 * first thread is still loading the space cache info.  The previous
	 * loop should have kept us from choosing this block group, but if
	 * we've moved to the state where we will wait on caching block
	 * groups we need to first check if we're doing a fast load here, so
	 * we can wait for it to finish; otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	/*
	 * We can't do the read from the on-disk cache during a commit since
	 * we need to have the normal tree locking.  Also if we are currently
	 * trying to allocate blocks for the tree root we can't do the fast
	 * caching since we likely hold important locks.
	 */
	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
			}
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->extent_commit_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->extent_commit_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
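
/*
 * Summary of the caching state machine driven above (descriptive comment
 * only): a block group starts out BTRFS_CACHE_NO.  cache_block_group()
 * moves it to BTRFS_CACHE_FAST while the free space cache is loaded from
 * disk; a successful load goes straight to BTRFS_CACHE_FINISHED.  If the
 * fast load fails, the state falls back to BTRFS_CACHE_NO (load_cache_only)
 * or to BTRFS_CACHE_STARTED with caching_thread() queued to walk the extent
 * tree, which sets BTRFS_CACHE_FINISHED when it completes.
 */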

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}
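
/*
 * Worked example (illustration only): div_factor(1000, 9) == 900 and
 * div_factor_fine(1000, 55) == 550, i.e. the helpers scale num by
 * factor/10 and factor/100 respectively, using do_div() for 64bit math.
 */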

u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				btrfs_put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

740  * helper function to lookup reference count and flags of extent.
741  *
742  * the head node for delayed ref is used to store the sum of all the
743  * reference count modifications queued up in the rbtree. the head
744  * node may also store the extent flags to set. This way you can check
745  * to see what the reference count and extent flags would be if all of
746  * the delayed refs are not processed.
747  */
748 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
749 			     struct btrfs_root *root, u64 bytenr,
750 			     u64 num_bytes, u64 *refs, u64 *flags)
751 {
752 	struct btrfs_delayed_ref_head *head;
753 	struct btrfs_delayed_ref_root *delayed_refs;
754 	struct btrfs_path *path;
755 	struct btrfs_extent_item *ei;
756 	struct extent_buffer *leaf;
757 	struct btrfs_key key;
758 	u32 item_size;
759 	u64 num_refs;
760 	u64 extent_flags;
761 	int ret;
762 
763 	path = btrfs_alloc_path();
764 	if (!path)
765 		return -ENOMEM;
766 
767 	key.objectid = bytenr;
768 	key.type = BTRFS_EXTENT_ITEM_KEY;
769 	key.offset = num_bytes;
770 	if (!trans) {
771 		path->skip_locking = 1;
772 		path->search_commit_root = 1;
773 	}
774 again:
775 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
776 				&key, path, 0, 0);
777 	if (ret < 0)
778 		goto out_free;
779 
780 	if (ret == 0) {
781 		leaf = path->nodes[0];
782 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
783 		if (item_size >= sizeof(*ei)) {
784 			ei = btrfs_item_ptr(leaf, path->slots[0],
785 					    struct btrfs_extent_item);
786 			num_refs = btrfs_extent_refs(leaf, ei);
787 			extent_flags = btrfs_extent_flags(leaf, ei);
788 		} else {
789 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
790 			struct btrfs_extent_item_v0 *ei0;
791 			BUG_ON(item_size != sizeof(*ei0));
792 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
793 					     struct btrfs_extent_item_v0);
794 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
795 			/* FIXME: this isn't correct for data */
796 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
797 #else
798 			BUG();
799 #endif
800 		}
801 		BUG_ON(num_refs == 0);
802 	} else {
803 		num_refs = 0;
804 		extent_flags = 0;
805 		ret = 0;
806 	}
807 
808 	if (!trans)
809 		goto out;
810 
811 	delayed_refs = &trans->transaction->delayed_refs;
812 	spin_lock(&delayed_refs->lock);
813 	head = btrfs_find_delayed_ref_head(trans, bytenr);
814 	if (head) {
815 		if (!mutex_trylock(&head->mutex)) {
816 			atomic_inc(&head->node.refs);
817 			spin_unlock(&delayed_refs->lock);
818 
819 			btrfs_release_path(path);
820 
821 			/*
822 			 * Mutex was contended, block until it's released and try
823 			 * again
824 			 */
825 			mutex_lock(&head->mutex);
826 			mutex_unlock(&head->mutex);
827 			btrfs_put_delayed_ref(&head->node);
828 			goto again;
829 		}
830 		if (head->extent_op && head->extent_op->update_flags)
831 			extent_flags |= head->extent_op->flags_to_set;
832 		else
833 			BUG_ON(num_refs == 0);
834 
835 		num_refs += head->node.ref_mod;
836 		mutex_unlock(&head->mutex);
837 	}
838 	spin_unlock(&delayed_refs->lock);
839 out:
840 	WARN_ON(num_refs == 0);
841 	if (refs)
842 		*refs = num_refs;
843 	if (flags)
844 		*flags = extent_flags;
845 out_free:
846 	btrfs_free_path(path);
847 	return ret;
848 }
849 
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually full back refs are generic, and
 * can be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead.  Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
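
/*
 * Worked example of key composition (illustrative values only): a data
 * extent at bytenr X referenced by subvolume root 5, inode 257, file
 * offset 0 gets the implicit back ref key
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent referenced through a shared (full) back ref from
 * a leaf at bytenr P gets
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, P)
 *
 * Tree blocks follow the same pattern with BTRFS_TREE_BLOCK_REF_KEY
 * (offset = owner tree objectid) and BTRFS_SHARED_BLOCK_REF_KEY
 * (offset = parent block bytenr).
 */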

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(trans, root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
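
/*
 * Note on the layout above (descriptive only): high_crc covers
 * root_objectid while low_crc covers owner and offset.  The final value,
 * ((u64)high_crc << 31) ^ (u64)low_crc, places high_crc in bits 31-62 and
 * low_crc in bits 0-31, so bit 31 XORs one bit from each crc.  This hash
 * is stored on disk as a key offset, so it must never be changed.
 */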

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

/*
 * find the key right after the current path position, walking up from
 * @level toward the root.  Returns 0 and fills in @key if one is found,
 * 1 otherwise.
 */
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	}
	BUG_ON(ret); /* Corruption */

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(trans, root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}
1661 
1662 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1663 				 struct btrfs_root *root,
1664 				 struct btrfs_path *path,
1665 				 struct btrfs_extent_inline_ref **ref_ret,
1666 				 u64 bytenr, u64 num_bytes, u64 parent,
1667 				 u64 root_objectid, u64 owner, u64 offset)
1668 {
1669 	int ret;
1670 
1671 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1672 					   bytenr, num_bytes, parent,
1673 					   root_objectid, owner, offset, 0);
1674 	if (ret != -ENOENT)
1675 		return ret;
1676 
1677 	btrfs_release_path(path);
1678 	*ref_ret = NULL;
1679 
1680 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1681 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1682 					    root_objectid);
1683 	} else {
1684 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1685 					     root_objectid, owner, offset);
1686 	}
1687 	return ret;
1688 }
1689 
1690 /*
1691  * helper to update/remove inline back ref
1692  */
1693 static noinline_for_stack
1694 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1695 				  struct btrfs_root *root,
1696 				  struct btrfs_path *path,
1697 				  struct btrfs_extent_inline_ref *iref,
1698 				  int refs_to_mod,
1699 				  struct btrfs_delayed_extent_op *extent_op)
1700 {
1701 	struct extent_buffer *leaf;
1702 	struct btrfs_extent_item *ei;
1703 	struct btrfs_extent_data_ref *dref = NULL;
1704 	struct btrfs_shared_data_ref *sref = NULL;
1705 	unsigned long ptr;
1706 	unsigned long end;
1707 	u32 item_size;
1708 	int size;
1709 	int type;
1710 	u64 refs;
1711 
1712 	leaf = path->nodes[0];
1713 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1714 	refs = btrfs_extent_refs(leaf, ei);
1715 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1716 	refs += refs_to_mod;
1717 	btrfs_set_extent_refs(leaf, ei, refs);
1718 	if (extent_op)
1719 		__run_delayed_extent_op(extent_op, leaf, ei);
1720 
1721 	type = btrfs_extent_inline_ref_type(leaf, iref);
1722 
1723 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1724 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1725 		refs = btrfs_extent_data_ref_count(leaf, dref);
1726 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1727 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1728 		refs = btrfs_shared_data_ref_count(leaf, sref);
1729 	} else {
1730 		refs = 1;
1731 		BUG_ON(refs_to_mod != -1);
1732 	}
1733 
1734 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1735 	refs += refs_to_mod;
1736 
1737 	if (refs > 0) {
1738 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1739 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1740 		else
1741 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1742 	} else {
1743 		size =  btrfs_extent_inline_ref_size(type);
1744 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1745 		ptr = (unsigned long)iref;
1746 		end = (unsigned long)ei + item_size;
1747 		if (ptr + size < end)
1748 			memmove_extent_buffer(leaf, ptr, ptr + size,
1749 					      end - ptr - size);
1750 		item_size -= size;
1751 		btrfs_truncate_item(trans, root, path, item_size, 1);
1752 	}
1753 	btrfs_mark_buffer_dirty(leaf);
1754 }
1755 
1756 static noinline_for_stack
1757 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1758 				 struct btrfs_root *root,
1759 				 struct btrfs_path *path,
1760 				 u64 bytenr, u64 num_bytes, u64 parent,
1761 				 u64 root_objectid, u64 owner,
1762 				 u64 offset, int refs_to_add,
1763 				 struct btrfs_delayed_extent_op *extent_op)
1764 {
1765 	struct btrfs_extent_inline_ref *iref;
1766 	int ret;
1767 
1768 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1769 					   bytenr, num_bytes, parent,
1770 					   root_objectid, owner, offset, 1);
1771 	if (ret == 0) {
1772 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1773 		update_inline_extent_backref(trans, root, path, iref,
1774 					     refs_to_add, extent_op);
1775 	} else if (ret == -ENOENT) {
1776 		setup_inline_extent_backref(trans, root, path, iref, parent,
1777 					    root_objectid, owner, offset,
1778 					    refs_to_add, extent_op);
1779 		ret = 0;
1780 	}
1781 	return ret;
1782 }
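
/*
 * Return conventions, roughly: lookup_inline_extent_backref() comes
 * back with 0 when the inline ref already exists (we only bump its
 * count), -ENOENT when a new inline ref can still be inserted, and
 * -EAGAIN when the ref will not fit inline and must be stored as a
 * separate keyed item, which callers handle themselves.
 */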
1783 
1784 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1785 				 struct btrfs_root *root,
1786 				 struct btrfs_path *path,
1787 				 u64 bytenr, u64 parent, u64 root_objectid,
1788 				 u64 owner, u64 offset, int refs_to_add)
1789 {
1790 	int ret;
1791 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1792 		BUG_ON(refs_to_add != 1);
1793 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1794 					    parent, root_objectid);
1795 	} else {
1796 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1797 					     parent, root_objectid,
1798 					     owner, offset, refs_to_add);
1799 	}
1800 	return ret;
1801 }
1802 
1803 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1804 				 struct btrfs_root *root,
1805 				 struct btrfs_path *path,
1806 				 struct btrfs_extent_inline_ref *iref,
1807 				 int refs_to_drop, int is_data)
1808 {
1809 	int ret = 0;
1810 
1811 	BUG_ON(!is_data && refs_to_drop != 1);
1812 	if (iref) {
1813 		update_inline_extent_backref(trans, root, path, iref,
1814 					     -refs_to_drop, NULL);
1815 	} else if (is_data) {
1816 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1817 	} else {
1818 		ret = btrfs_del_item(trans, root, path);
1819 	}
1820 	return ret;
1821 }
1822 
1823 static int btrfs_issue_discard(struct block_device *bdev,
1824 				u64 start, u64 len)
1825 {
1826 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1827 }
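
/*
 * The ">> 9" above converts byte offsets into 512-byte sector counts,
 * since blkdev_issue_discard() works in sectors.  For example
 * (illustrative numbers), a 1MB extent of 1048576 bytes becomes
 * 1048576 >> 9 = 2048 sectors.
 */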
1828 
1829 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1830 				u64 num_bytes, u64 *actual_bytes)
1831 {
1832 	int ret;
1833 	u64 discarded_bytes = 0;
1834 	struct btrfs_bio *bbio = NULL;
1835 
1836 
1837 	/* Tell the block device(s) that the sectors can be discarded */
1838 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1839 			      bytenr, &num_bytes, &bbio, 0);
1840 	/* Error condition is -ENOMEM */
1841 	if (!ret) {
1842 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1843 		int i;
1844 
1845 
1846 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1847 			if (!stripe->dev->can_discard)
1848 				continue;
1849 
1850 			ret = btrfs_issue_discard(stripe->dev->bdev,
1851 						  stripe->physical,
1852 						  stripe->length);
1853 			if (!ret)
1854 				discarded_bytes += stripe->length;
1855 			else if (ret != -EOPNOTSUPP)
1856 				break; /* Logic errors or -ENOMEM, or -EIO; it's unclear how -EIO could happen here */
1857 
1858 			/*
1859 			 * If we get back EOPNOTSUPP for some reason, just
1860 			 * ignore the return value so we don't break callers
1861 			 * of discard_extent.
1862 			 */
1863 			ret = 0;
1864 		}
1865 		kfree(bbio);
1866 	}
1867 
1868 	if (actual_bytes)
1869 		*actual_bytes = discarded_bytes;
1870 
1871 
1872 	return ret;
1873 }
1874 
1875 /* Can return -ENOMEM */
1876 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1877 			 struct btrfs_root *root,
1878 			 u64 bytenr, u64 num_bytes, u64 parent,
1879 			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1880 {
1881 	int ret;
1882 	struct btrfs_fs_info *fs_info = root->fs_info;
1883 
1884 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1885 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1886 
1887 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1888 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1889 					num_bytes,
1890 					parent, root_objectid, (int)owner,
1891 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1892 	} else {
1893 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1894 					num_bytes,
1895 					parent, root_objectid, owner, offset,
1896 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1897 	}
1898 	return ret;
1899 }
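
/*
 * A note on the owner test above: for tree blocks the owner is the
 * block's level, which is always below BTRFS_FIRST_FREE_OBJECTID,
 * while data extents use the inode number as the owner, so the
 * comparison cleanly separates metadata refs from data refs.
 */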
1900 
1901 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1902 				  struct btrfs_root *root,
1903 				  u64 bytenr, u64 num_bytes,
1904 				  u64 parent, u64 root_objectid,
1905 				  u64 owner, u64 offset, int refs_to_add,
1906 				  struct btrfs_delayed_extent_op *extent_op)
1907 {
1908 	struct btrfs_path *path;
1909 	struct extent_buffer *leaf;
1910 	struct btrfs_extent_item *item;
1911 	u64 refs;
1912 	int ret;
1913 	int err = 0;
1914 
1915 	path = btrfs_alloc_path();
1916 	if (!path)
1917 		return -ENOMEM;
1918 
1919 	path->reada = 1;
1920 	path->leave_spinning = 1;
1921 	/* this will set up the path even if it fails to insert the back ref */
1922 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1923 					   path, bytenr, num_bytes, parent,
1924 					   root_objectid, owner, offset,
1925 					   refs_to_add, extent_op);
1926 	if (ret == 0)
1927 		goto out;
1928 
1929 	if (ret != -EAGAIN) {
1930 		err = ret;
1931 		goto out;
1932 	}
1933 
1934 	leaf = path->nodes[0];
1935 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1936 	refs = btrfs_extent_refs(leaf, item);
1937 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1938 	if (extent_op)
1939 		__run_delayed_extent_op(extent_op, leaf, item);
1940 
1941 	btrfs_mark_buffer_dirty(leaf);
1942 	btrfs_release_path(path);
1943 
1944 	path->reada = 1;
1945 	path->leave_spinning = 1;
1946 
1947 	/* now insert the actual backref */
1948 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1949 				    path, bytenr, parent, root_objectid,
1950 				    owner, offset, refs_to_add);
1951 	if (ret)
1952 		btrfs_abort_transaction(trans, root, ret);
1953 out:
1954 	btrfs_free_path(path);
1955 	return err;
1956 }
1957 
1958 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1959 				struct btrfs_root *root,
1960 				struct btrfs_delayed_ref_node *node,
1961 				struct btrfs_delayed_extent_op *extent_op,
1962 				int insert_reserved)
1963 {
1964 	int ret = 0;
1965 	struct btrfs_delayed_data_ref *ref;
1966 	struct btrfs_key ins;
1967 	u64 parent = 0;
1968 	u64 ref_root = 0;
1969 	u64 flags = 0;
1970 
1971 	ins.objectid = node->bytenr;
1972 	ins.offset = node->num_bytes;
1973 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1974 
1975 	ref = btrfs_delayed_node_to_data_ref(node);
1976 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1977 		parent = ref->parent;
1978 	else
1979 		ref_root = ref->root;
1980 
1981 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1982 		if (extent_op) {
1983 			BUG_ON(extent_op->update_key);
1984 			flags |= extent_op->flags_to_set;
1985 		}
1986 		ret = alloc_reserved_file_extent(trans, root,
1987 						 parent, ref_root, flags,
1988 						 ref->objectid, ref->offset,
1989 						 &ins, node->ref_mod);
1990 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1991 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1992 					     node->num_bytes, parent,
1993 					     ref_root, ref->objectid,
1994 					     ref->offset, node->ref_mod,
1995 					     extent_op);
1996 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1997 		ret = __btrfs_free_extent(trans, root, node->bytenr,
1998 					  node->num_bytes, parent,
1999 					  ref_root, ref->objectid,
2000 					  ref->offset, node->ref_mod,
2001 					  extent_op);
2002 	} else {
2003 		BUG();
2004 	}
2005 	return ret;
2006 }
2007 
2008 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2009 				    struct extent_buffer *leaf,
2010 				    struct btrfs_extent_item *ei)
2011 {
2012 	u64 flags = btrfs_extent_flags(leaf, ei);
2013 	if (extent_op->update_flags) {
2014 		flags |= extent_op->flags_to_set;
2015 		btrfs_set_extent_flags(leaf, ei, flags);
2016 	}
2017 
2018 	if (extent_op->update_key) {
2019 		struct btrfs_tree_block_info *bi;
2020 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2021 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2022 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2023 	}
2024 }
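
/*
 * Layout assumed by the cast above, sketched: a tree block's extent
 * item is followed directly by a btrfs_tree_block_info holding the
 * first key and the level, and only then by the inline refs:
 *
 *   [ btrfs_extent_item | btrfs_tree_block_info | inline refs ... ]
 *
 * so "(ei + 1)" lands exactly on the tree block info.
 */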
2025 
2026 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2027 				 struct btrfs_root *root,
2028 				 struct btrfs_delayed_ref_node *node,
2029 				 struct btrfs_delayed_extent_op *extent_op)
2030 {
2031 	struct btrfs_key key;
2032 	struct btrfs_path *path;
2033 	struct btrfs_extent_item *ei;
2034 	struct extent_buffer *leaf;
2035 	u32 item_size;
2036 	int ret;
2037 	int err = 0;
2038 
2039 	if (trans->aborted)
2040 		return 0;
2041 
2042 	path = btrfs_alloc_path();
2043 	if (!path)
2044 		return -ENOMEM;
2045 
2046 	key.objectid = node->bytenr;
2047 	key.type = BTRFS_EXTENT_ITEM_KEY;
2048 	key.offset = node->num_bytes;
2049 
2050 	path->reada = 1;
2051 	path->leave_spinning = 1;
2052 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2053 				path, 0, 1);
2054 	if (ret < 0) {
2055 		err = ret;
2056 		goto out;
2057 	}
2058 	if (ret > 0) {
2059 		err = -EIO;
2060 		goto out;
2061 	}
2062 
2063 	leaf = path->nodes[0];
2064 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2065 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2066 	if (item_size < sizeof(*ei)) {
2067 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2068 					     path, (u64)-1, 0);
2069 		if (ret < 0) {
2070 			err = ret;
2071 			goto out;
2072 		}
2073 		leaf = path->nodes[0];
2074 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2075 	}
2076 #endif
2077 	BUG_ON(item_size < sizeof(*ei));
2078 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2079 	__run_delayed_extent_op(extent_op, leaf, ei);
2080 
2081 	btrfs_mark_buffer_dirty(leaf);
2082 out:
2083 	btrfs_free_path(path);
2084 	return err;
2085 }
2086 
2087 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2088 				struct btrfs_root *root,
2089 				struct btrfs_delayed_ref_node *node,
2090 				struct btrfs_delayed_extent_op *extent_op,
2091 				int insert_reserved)
2092 {
2093 	int ret = 0;
2094 	struct btrfs_delayed_tree_ref *ref;
2095 	struct btrfs_key ins;
2096 	u64 parent = 0;
2097 	u64 ref_root = 0;
2098 
2099 	ins.objectid = node->bytenr;
2100 	ins.offset = node->num_bytes;
2101 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2102 
2103 	ref = btrfs_delayed_node_to_tree_ref(node);
2104 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2105 		parent = ref->parent;
2106 	else
2107 		ref_root = ref->root;
2108 
2109 	BUG_ON(node->ref_mod != 1);
2110 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2111 		BUG_ON(!extent_op || !extent_op->update_flags ||
2112 		       !extent_op->update_key);
2113 		ret = alloc_reserved_tree_block(trans, root,
2114 						parent, ref_root,
2115 						extent_op->flags_to_set,
2116 						&extent_op->key,
2117 						ref->level, &ins);
2118 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2119 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2120 					     node->num_bytes, parent, ref_root,
2121 					     ref->level, 0, 1, extent_op);
2122 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2123 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2124 					  node->num_bytes, parent, ref_root,
2125 					  ref->level, 0, 1, extent_op);
2126 	} else {
2127 		BUG();
2128 	}
2129 	return ret;
2130 }
2131 
2132 /* helper function to actually process a single delayed ref entry */
2133 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2134 			       struct btrfs_root *root,
2135 			       struct btrfs_delayed_ref_node *node,
2136 			       struct btrfs_delayed_extent_op *extent_op,
2137 			       int insert_reserved)
2138 {
2139 	int ret = 0;
2140 
2141 	if (trans->aborted)
2142 		return 0;
2143 
2144 	if (btrfs_delayed_ref_is_head(node)) {
2145 		struct btrfs_delayed_ref_head *head;
2146 		/*
2147 		 * we've hit the end of the chain and we were supposed
2148 		 * to insert this extent into the tree.  But it got
2149 		 * deleted before we ever needed to insert it, so all
2150 		 * we have to do is clean up the accounting
2151 		 */
2152 		BUG_ON(extent_op);
2153 		head = btrfs_delayed_node_to_head(node);
2154 		if (insert_reserved) {
2155 			btrfs_pin_extent(root, node->bytenr,
2156 					 node->num_bytes, 1);
2157 			if (head->is_data) {
2158 				ret = btrfs_del_csums(trans, root,
2159 						      node->bytenr,
2160 						      node->num_bytes);
2161 			}
2162 		}
2163 		mutex_unlock(&head->mutex);
2164 		return ret;
2165 	}
2166 
2167 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2168 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2169 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2170 					   insert_reserved);
2171 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2172 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2173 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2174 					   insert_reserved);
2175 	else
2176 		BUG();
2177 	return ret;
2178 }
2179 
2180 static noinline struct btrfs_delayed_ref_node *
2181 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2182 {
2183 	struct rb_node *node;
2184 	struct btrfs_delayed_ref_node *ref;
2185 	int action = BTRFS_ADD_DELAYED_REF;
2186 again:
2187 	/*
2188 	 * Select delayed refs of type BTRFS_ADD_DELAYED_REF first.
2189 	 * This prevents the ref count from going down to zero while
2190 	 * there are still pending delayed refs.
2191 	 */
2192 	node = rb_prev(&head->node.rb_node);
2193 	while (1) {
2194 		if (!node)
2195 			break;
2196 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2197 				rb_node);
2198 		if (ref->bytenr != head->node.bytenr)
2199 			break;
2200 		if (ref->action == action)
2201 			return ref;
2202 		node = rb_prev(node);
2203 	}
2204 	if (action == BTRFS_ADD_DELAYED_REF) {
2205 		action = BTRFS_DROP_DELAYED_REF;
2206 		goto again;
2207 	}
2208 	return NULL;
2209 }
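
/*
 * Worked example (illustrative counts): suppose an extent has a ref
 * count of 1 and its head carries one pending DROP and one pending
 * ADD.  Playing the DROP first would take the count to 0 and free the
 * extent with an ADD still queued; playing the ADD first moves the
 * count to 2 and the later DROP safely returns it to 1.
 */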
2210 
2211 /*
2212  * Returns the number of refs processed on success, or <0 on failure,
2213  * in which case the caller will abort the transaction.
2214  */
2215 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2216 				       struct btrfs_root *root,
2217 				       struct list_head *cluster)
2218 {
2219 	struct btrfs_delayed_ref_root *delayed_refs;
2220 	struct btrfs_delayed_ref_node *ref;
2221 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2222 	struct btrfs_delayed_extent_op *extent_op;
2223 	struct btrfs_fs_info *fs_info = root->fs_info;
2224 	int ret;
2225 	int count = 0;
2226 	int must_insert_reserved = 0;
2227 
2228 	delayed_refs = &trans->transaction->delayed_refs;
2229 	while (1) {
2230 		if (!locked_ref) {
2231 			/* pick a new head ref from the cluster list */
2232 			if (list_empty(cluster))
2233 				break;
2234 
2235 			locked_ref = list_entry(cluster->next,
2236 				     struct btrfs_delayed_ref_head, cluster);
2237 
2238 			/* grab the lock that says we are going to process
2239 			 * all the refs for this head */
2240 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2241 
2242 			/*
2243 			 * we may have dropped the spin lock to get the head
2244 			 * mutex lock, and that might have given someone else
2245 			 * time to free the head.  If that's true, it has been
2246 			 * removed from our list and we can move on.
2247 			 */
2248 			if (ret == -EAGAIN) {
2249 				locked_ref = NULL;
2250 				count++;
2251 				continue;
2252 			}
2253 		}
2254 
2255 		/*
2256 		 * We need to try and merge add/drops of the same ref since we
2257 		 * can run into issues with relocate dropping the implicit ref
2258 		 * and then it being added back again before the drop can
2259 		 * finish.  If we merged anything we need to re-loop so we can
2260 		 * get a good ref.
2261 		 */
2262 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2263 					 locked_ref);
2264 
2265 		/*
2266 		 * locked_ref is the head node, so we have to go one
2267 		 * node back for any delayed ref updates
2268 		 */
2269 		ref = select_delayed_ref(locked_ref);
2270 
2271 		if (ref && ref->seq &&
2272 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2273 			/*
2274 			 * there are still refs with lower seq numbers in the
2275 			 * process of being added. Don't run this ref yet.
2276 			 */
2277 			list_del_init(&locked_ref->cluster);
2278 			mutex_unlock(&locked_ref->mutex);
2279 			locked_ref = NULL;
2280 			delayed_refs->num_heads_ready++;
2281 			spin_unlock(&delayed_refs->lock);
2282 			cond_resched();
2283 			spin_lock(&delayed_refs->lock);
2284 			continue;
2285 		}
2286 
2287 		/*
2288 		 * record the must insert reserved flag before we
2289 		 * drop the spin lock.
2290 		 */
2291 		must_insert_reserved = locked_ref->must_insert_reserved;
2292 		locked_ref->must_insert_reserved = 0;
2293 
2294 		extent_op = locked_ref->extent_op;
2295 		locked_ref->extent_op = NULL;
2296 
2297 		if (!ref) {
2298 			/* All delayed refs have been processed.  Go ahead
2299 			 * and send the head node to run_one_delayed_ref,
2300 			 * so that any accounting fixes can happen
2301 			 */
2302 			ref = &locked_ref->node;
2303 
2304 			if (extent_op && must_insert_reserved) {
2305 				kfree(extent_op);
2306 				extent_op = NULL;
2307 			}
2308 
2309 			if (extent_op) {
2310 				spin_unlock(&delayed_refs->lock);
2311 
2312 				ret = run_delayed_extent_op(trans, root,
2313 							    ref, extent_op);
2314 				kfree(extent_op);
2315 
2316 				if (ret) {
2317 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 					spin_lock(&delayed_refs->lock);
2319 					return ret;
2320 				}
2321 
2322 				goto next;
2323 			}
2324 
2325 			list_del_init(&locked_ref->cluster);
2326 			locked_ref = NULL;
2327 		}
2328 
2329 		ref->in_tree = 0;
2330 		rb_erase(&ref->rb_node, &delayed_refs->root);
2331 		delayed_refs->num_entries--;
2332 		if (locked_ref) {
2333 			/*
2334 			 * when we play the delayed ref, also correct the
2335 			 * ref_mod on head
2336 			 */
2337 			switch (ref->action) {
2338 			case BTRFS_ADD_DELAYED_REF:
2339 			case BTRFS_ADD_DELAYED_EXTENT:
2340 				locked_ref->node.ref_mod -= ref->ref_mod;
2341 				break;
2342 			case BTRFS_DROP_DELAYED_REF:
2343 				locked_ref->node.ref_mod += ref->ref_mod;
2344 				break;
2345 			default:
2346 				WARN_ON(1);
2347 			}
2348 		}
2349 		spin_unlock(&delayed_refs->lock);
2350 
2351 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2352 					  must_insert_reserved);
2353 
2354 		btrfs_put_delayed_ref(ref);
2355 		kfree(extent_op);
2356 		count++;
2357 
2358 		if (ret) {
2359 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 			spin_lock(&delayed_refs->lock);
2361 			return ret;
2362 		}
2363 
2364 next:
2365 		cond_resched();
2366 		spin_lock(&delayed_refs->lock);
2367 	}
2368 	return count;
2369 }
2370 
2371 #ifdef SCRAMBLE_DELAYED_REFS
2372 /*
2373  * Normally delayed refs get processed in ascending bytenr order. This
2374  * correlates in most cases to the order added. To expose dependencies on this
2375  * order, we start to process the tree in the middle instead of the beginning.
2376  */
2377 static u64 find_middle(struct rb_root *root)
2378 {
2379 	struct rb_node *n = root->rb_node;
2380 	struct btrfs_delayed_ref_node *entry;
2381 	int alt = 1;
2382 	u64 middle;
2383 	u64 first = 0, last = 0;
2384 
2385 	n = rb_first(root);
2386 	if (n) {
2387 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2388 		first = entry->bytenr;
2389 	}
2390 	n = rb_last(root);
2391 	if (n) {
2392 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2393 		last = entry->bytenr;
2394 	}
2395 	n = root->rb_node;
2396 
2397 	while (n) {
2398 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2399 		WARN_ON(!entry->in_tree);
2400 
2401 		middle = entry->bytenr;
2402 
2403 		if (alt)
2404 			n = n->rb_left;
2405 		else
2406 			n = n->rb_right;
2407 
2408 		alt = 1 - alt;
2409 	}
2410 	return middle;
2411 }
2412 #endif
2413 
2414 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2415 					 struct btrfs_fs_info *fs_info)
2416 {
2417 	struct qgroup_update *qgroup_update;
2418 	int ret = 0;
2419 
2420 	if (list_empty(&trans->qgroup_ref_list) !=
2421 	    !trans->delayed_ref_elem.seq) {
2422 		/* list without seq or seq without list */
2423 		printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2424 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
2425 			trans->delayed_ref_elem.seq);
2426 		BUG();
2427 	}
2428 
2429 	if (!trans->delayed_ref_elem.seq)
2430 		return 0;
2431 
2432 	while (!list_empty(&trans->qgroup_ref_list)) {
2433 		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2434 						 struct qgroup_update, list);
2435 		list_del(&qgroup_update->list);
2436 		if (!ret)
2437 			ret = btrfs_qgroup_account_ref(
2438 					trans, fs_info, qgroup_update->node,
2439 					qgroup_update->extent_op);
2440 		kfree(qgroup_update);
2441 	}
2442 
2443 	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2444 
2445 	return ret;
2446 }
2447 
2448 /*
2449  * this starts processing the delayed reference count updates and
2450  * extent insertions we have queued up so far.  count can be
2451  * 0, which means to process everything in the tree at the start
2452  * of the run (but not newly added entries), or it can be some target
2453  * number you'd like to process.
2454  *
2455  * Returns 0 on success or if called with an aborted transaction
2456  * Returns <0 on error and aborts the transaction
2457  */
2458 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2459 			   struct btrfs_root *root, unsigned long count)
2460 {
2461 	struct rb_node *node;
2462 	struct btrfs_delayed_ref_root *delayed_refs;
2463 	struct btrfs_delayed_ref_node *ref;
2464 	struct list_head cluster;
2465 	int ret;
2466 	u64 delayed_start;
2467 	int run_all = count == (unsigned long)-1;
2468 	int run_most = 0;
2469 	int loops;
2470 
2471 	/* We'll clean this up in btrfs_cleanup_transaction */
2472 	if (trans->aborted)
2473 		return 0;
2474 
2475 	if (root == root->fs_info->extent_root)
2476 		root = root->fs_info->tree_root;
2477 
2478 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2479 
2480 	delayed_refs = &trans->transaction->delayed_refs;
2481 	INIT_LIST_HEAD(&cluster);
2482 again:
2483 	loops = 0;
2484 	spin_lock(&delayed_refs->lock);
2485 
2486 #ifdef SCRAMBLE_DELAYED_REFS
2487 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2488 #endif
2489 
2490 	if (count == 0) {
2491 		count = delayed_refs->num_entries * 2;
2492 		run_most = 1;
2493 	}
2494 	while (1) {
2495 		if (!(run_all || run_most) &&
2496 		    delayed_refs->num_heads_ready < 64)
2497 			break;
2498 
2499 		/*
2500 		 * go find something we can process in the rbtree.  We start at
2501 		 * the beginning of the tree, and then build a cluster
2502 		 * of refs to process starting at the first one we are able to
2503 		 * lock
2504 		 */
2505 		delayed_start = delayed_refs->run_delayed_start;
2506 		ret = btrfs_find_ref_cluster(trans, &cluster,
2507 					     delayed_refs->run_delayed_start);
2508 		if (ret)
2509 			break;
2510 
2511 		ret = run_clustered_refs(trans, root, &cluster);
2512 		if (ret < 0) {
2513 			spin_unlock(&delayed_refs->lock);
2514 			btrfs_abort_transaction(trans, root, ret);
2515 			return ret;
2516 		}
2517 
2518 		count -= min_t(unsigned long, ret, count);
2519 
2520 		if (count == 0)
2521 			break;
2522 
2523 		if (delayed_start >= delayed_refs->run_delayed_start) {
2524 			if (loops == 0) {
2525 				/*
2526 				 * btrfs_find_ref_cluster looped.  Let's do one
2527 				 * more cycle.  If we don't run any delayed refs
2528 				 * during that cycle (because all of them are
2529 				 * blocked), bail out.
2530 				 */
2531 				loops = 1;
2532 			} else {
2533 				/*
2534 				 * no runnable refs left, stop trying
2535 				 */
2536 				BUG_ON(run_all);
2537 				break;
2538 			}
2539 		}
2540 		if (ret) {
2541 			/* refs were run, let's reset staleness detection */
2542 			loops = 0;
2543 		}
2544 	}
2545 
2546 	if (run_all) {
2547 		if (!list_empty(&trans->new_bgs)) {
2548 			spin_unlock(&delayed_refs->lock);
2549 			btrfs_create_pending_block_groups(trans, root);
2550 			spin_lock(&delayed_refs->lock);
2551 		}
2552 
2553 		node = rb_first(&delayed_refs->root);
2554 		if (!node)
2555 			goto out;
2556 		count = (unsigned long)-1;
2557 
2558 		while (node) {
2559 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2560 				       rb_node);
2561 			if (btrfs_delayed_ref_is_head(ref)) {
2562 				struct btrfs_delayed_ref_head *head;
2563 
2564 				head = btrfs_delayed_node_to_head(ref);
2565 				atomic_inc(&ref->refs);
2566 
2567 				spin_unlock(&delayed_refs->lock);
2568 				/*
2569 				 * Mutex was contended, block until it's
2570 				 * released and try again
2571 				 */
2572 				mutex_lock(&head->mutex);
2573 				mutex_unlock(&head->mutex);
2574 
2575 				btrfs_put_delayed_ref(ref);
2576 				cond_resched();
2577 				goto again;
2578 			}
2579 			node = rb_next(node);
2580 		}
2581 		spin_unlock(&delayed_refs->lock);
2582 		schedule_timeout(1);
2583 		goto again;
2584 	}
2585 out:
2586 	spin_unlock(&delayed_refs->lock);
2587 	assert_qgroups_uptodate(trans);
2588 	return 0;
2589 }
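
/*
 * Usage, as a sketch: most callers pass a small target or 0, where 0
 * expands to roughly twice the entries queued at the start of the run,
 * while transaction commit passes (unsigned long)-1 so the rb-tree is
 * drained completely, including heads that show up mid-run.
 */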
2590 
2591 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2592 				struct btrfs_root *root,
2593 				u64 bytenr, u64 num_bytes, u64 flags,
2594 				int is_data)
2595 {
2596 	struct btrfs_delayed_extent_op *extent_op;
2597 	int ret;
2598 
2599 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2600 	if (!extent_op)
2601 		return -ENOMEM;
2602 
2603 	extent_op->flags_to_set = flags;
2604 	extent_op->update_flags = 1;
2605 	extent_op->update_key = 0;
2606 	extent_op->is_data = is_data ? 1 : 0;
2607 
2608 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2609 					  num_bytes, extent_op);
2610 	if (ret)
2611 		kfree(extent_op);
2612 	return ret;
2613 }
2614 
2615 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2616 				      struct btrfs_root *root,
2617 				      struct btrfs_path *path,
2618 				      u64 objectid, u64 offset, u64 bytenr)
2619 {
2620 	struct btrfs_delayed_ref_head *head;
2621 	struct btrfs_delayed_ref_node *ref;
2622 	struct btrfs_delayed_data_ref *data_ref;
2623 	struct btrfs_delayed_ref_root *delayed_refs;
2624 	struct rb_node *node;
2625 	int ret = 0;
2626 
2627 	ret = -ENOENT;
2628 	delayed_refs = &trans->transaction->delayed_refs;
2629 	spin_lock(&delayed_refs->lock);
2630 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2631 	if (!head)
2632 		goto out;
2633 
2634 	if (!mutex_trylock(&head->mutex)) {
2635 		atomic_inc(&head->node.refs);
2636 		spin_unlock(&delayed_refs->lock);
2637 
2638 		btrfs_release_path(path);
2639 
2640 		/*
2641 		 * Mutex was contended, block until it's released and let
2642 		 * caller try again
2643 		 */
2644 		mutex_lock(&head->mutex);
2645 		mutex_unlock(&head->mutex);
2646 		btrfs_put_delayed_ref(&head->node);
2647 		return -EAGAIN;
2648 	}
2649 
2650 	node = rb_prev(&head->node.rb_node);
2651 	if (!node)
2652 		goto out_unlock;
2653 
2654 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2655 
2656 	if (ref->bytenr != bytenr)
2657 		goto out_unlock;
2658 
2659 	ret = 1;
2660 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2661 		goto out_unlock;
2662 
2663 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2664 
2665 	node = rb_prev(node);
2666 	if (node) {
2667 		int seq = ref->seq;
2668 
2669 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2670 		if (ref->bytenr == bytenr && ref->seq == seq)
2671 			goto out_unlock;
2672 	}
2673 
2674 	if (data_ref->root != root->root_key.objectid ||
2675 	    data_ref->objectid != objectid || data_ref->offset != offset)
2676 		goto out_unlock;
2677 
2678 	ret = 0;
2679 out_unlock:
2680 	mutex_unlock(&head->mutex);
2681 out:
2682 	spin_unlock(&delayed_refs->lock);
2683 	return ret;
2684 }
2685 
2686 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2687 					struct btrfs_root *root,
2688 					struct btrfs_path *path,
2689 					u64 objectid, u64 offset, u64 bytenr)
2690 {
2691 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2692 	struct extent_buffer *leaf;
2693 	struct btrfs_extent_data_ref *ref;
2694 	struct btrfs_extent_inline_ref *iref;
2695 	struct btrfs_extent_item *ei;
2696 	struct btrfs_key key;
2697 	u32 item_size;
2698 	int ret;
2699 
2700 	key.objectid = bytenr;
2701 	key.offset = (u64)-1;
2702 	key.type = BTRFS_EXTENT_ITEM_KEY;
2703 
2704 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2705 	if (ret < 0)
2706 		goto out;
2707 	BUG_ON(ret == 0); /* Corruption */
2708 
2709 	ret = -ENOENT;
2710 	if (path->slots[0] == 0)
2711 		goto out;
2712 
2713 	path->slots[0]--;
2714 	leaf = path->nodes[0];
2715 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2716 
2717 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2718 		goto out;
2719 
2720 	ret = 1;
2721 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2722 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2723 	if (item_size < sizeof(*ei)) {
2724 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2725 		goto out;
2726 	}
2727 #endif
2728 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2729 
2730 	if (item_size != sizeof(*ei) +
2731 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2732 		goto out;
2733 
2734 	if (btrfs_extent_generation(leaf, ei) <=
2735 	    btrfs_root_last_snapshot(&root->root_item))
2736 		goto out;
2737 
2738 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2739 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2740 	    BTRFS_EXTENT_DATA_REF_KEY)
2741 		goto out;
2742 
2743 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2744 	if (btrfs_extent_refs(leaf, ei) !=
2745 	    btrfs_extent_data_ref_count(leaf, ref) ||
2746 	    btrfs_extent_data_ref_root(leaf, ref) !=
2747 	    root->root_key.objectid ||
2748 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2749 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2750 		goto out;
2751 
2752 	ret = 0;
2753 out:
2754 	return ret;
2755 }
2756 
2757 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2758 			  struct btrfs_root *root,
2759 			  u64 objectid, u64 offset, u64 bytenr)
2760 {
2761 	struct btrfs_path *path;
2762 	int ret;
2763 	int ret2;
2764 
2765 	path = btrfs_alloc_path();
2766 	if (!path)
2767 		return -ENOMEM;
2768 
2769 	do {
2770 		ret = check_committed_ref(trans, root, path, objectid,
2771 					  offset, bytenr);
2772 		if (ret && ret != -ENOENT)
2773 			goto out;
2774 
2775 		ret2 = check_delayed_ref(trans, root, path, objectid,
2776 					 offset, bytenr);
2777 	} while (ret2 == -EAGAIN);
2778 
2779 	if (ret2 && ret2 != -ENOENT) {
2780 		ret = ret2;
2781 		goto out;
2782 	}
2783 
2784 	if (ret != -ENOENT || ret2 != -ENOENT)
2785 		ret = 0;
2786 out:
2787 	btrfs_free_path(path);
2788 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2789 		WARN_ON(ret > 0);
2790 	return ret;
2791 }
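
/*
 * Semantics, roughly: returning 0 means neither the committed extent
 * tree nor the delayed ref queues show another holder of this extent,
 * so e.g. nodatacow may safely overwrite in place.  Any nonzero value,
 * whether 1 or a negative errno, tells the caller to assume the extent
 * is cross referenced and fall back to COW.
 */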
2792 
2793 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2794 			   struct btrfs_root *root,
2795 			   struct extent_buffer *buf,
2796 			   int full_backref, int inc, int for_cow)
2797 {
2798 	u64 bytenr;
2799 	u64 num_bytes;
2800 	u64 parent;
2801 	u64 ref_root;
2802 	u32 nritems;
2803 	struct btrfs_key key;
2804 	struct btrfs_file_extent_item *fi;
2805 	int i;
2806 	int level;
2807 	int ret = 0;
2808 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2809 			    u64, u64, u64, u64, u64, u64, int);
2810 
2811 	ref_root = btrfs_header_owner(buf);
2812 	nritems = btrfs_header_nritems(buf);
2813 	level = btrfs_header_level(buf);
2814 
2815 	if (!root->ref_cows && level == 0)
2816 		return 0;
2817 
2818 	if (inc)
2819 		process_func = btrfs_inc_extent_ref;
2820 	else
2821 		process_func = btrfs_free_extent;
2822 
2823 	if (full_backref)
2824 		parent = buf->start;
2825 	else
2826 		parent = 0;
2827 
2828 	for (i = 0; i < nritems; i++) {
2829 		if (level == 0) {
2830 			btrfs_item_key_to_cpu(buf, &key, i);
2831 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2832 				continue;
2833 			fi = btrfs_item_ptr(buf, i,
2834 					    struct btrfs_file_extent_item);
2835 			if (btrfs_file_extent_type(buf, fi) ==
2836 			    BTRFS_FILE_EXTENT_INLINE)
2837 				continue;
2838 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2839 			if (bytenr == 0)
2840 				continue;
2841 
2842 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2843 			key.offset -= btrfs_file_extent_offset(buf, fi);
2844 			ret = process_func(trans, root, bytenr, num_bytes,
2845 					   parent, ref_root, key.objectid,
2846 					   key.offset, for_cow);
2847 			if (ret)
2848 				goto fail;
2849 		} else {
2850 			bytenr = btrfs_node_blockptr(buf, i);
2851 			num_bytes = btrfs_level_size(root, level - 1);
2852 			ret = process_func(trans, root, bytenr, num_bytes,
2853 					   parent, ref_root, level - 1, 0,
2854 					   for_cow);
2855 			if (ret)
2856 				goto fail;
2857 		}
2858 	}
2859 	return 0;
2860 fail:
2861 	return ret;
2862 }
2863 
2864 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2865 		  struct extent_buffer *buf, int full_backref, int for_cow)
2866 {
2867 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2868 }
2869 
2870 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2871 		  struct extent_buffer *buf, int full_backref, int for_cow)
2872 {
2873 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2874 }
2875 
2876 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2877 				 struct btrfs_root *root,
2878 				 struct btrfs_path *path,
2879 				 struct btrfs_block_group_cache *cache)
2880 {
2881 	int ret;
2882 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2883 	unsigned long bi;
2884 	struct extent_buffer *leaf;
2885 
2886 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2887 	if (ret < 0)
2888 		goto fail;
2889 	BUG_ON(ret); /* Corruption */
2890 
2891 	leaf = path->nodes[0];
2892 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2893 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2894 	btrfs_mark_buffer_dirty(leaf);
2895 	btrfs_release_path(path);
2896 fail:
2897 	if (ret) {
2898 		btrfs_abort_transaction(trans, root, ret);
2899 		return ret;
2900 	}
2901 	return 0;
2902 
2903 }
2904 
2905 static struct btrfs_block_group_cache *
2906 next_block_group(struct btrfs_root *root,
2907 		 struct btrfs_block_group_cache *cache)
2908 {
2909 	struct rb_node *node;
2910 	spin_lock(&root->fs_info->block_group_cache_lock);
2911 	node = rb_next(&cache->cache_node);
2912 	btrfs_put_block_group(cache);
2913 	if (node) {
2914 		cache = rb_entry(node, struct btrfs_block_group_cache,
2915 				 cache_node);
2916 		btrfs_get_block_group(cache);
2917 	} else
2918 		cache = NULL;
2919 	spin_unlock(&root->fs_info->block_group_cache_lock);
2920 	return cache;
2921 }
2922 
2923 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2924 			    struct btrfs_trans_handle *trans,
2925 			    struct btrfs_path *path)
2926 {
2927 	struct btrfs_root *root = block_group->fs_info->tree_root;
2928 	struct inode *inode = NULL;
2929 	u64 alloc_hint = 0;
2930 	int dcs = BTRFS_DC_ERROR;
2931 	int num_pages = 0;
2932 	int retries = 0;
2933 	int ret = 0;
2934 
2935 	/*
2936 	 * If this block group is smaller than 100 megs, don't bother
2937 	 * caching it.
2938 	 */
2939 	if (block_group->key.offset < (100 * 1024 * 1024)) {
2940 		spin_lock(&block_group->lock);
2941 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2942 		spin_unlock(&block_group->lock);
2943 		return 0;
2944 	}
2945 
2946 again:
2947 	inode = lookup_free_space_inode(root, block_group, path);
2948 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2949 		ret = PTR_ERR(inode);
2950 		btrfs_release_path(path);
2951 		goto out;
2952 	}
2953 
2954 	if (IS_ERR(inode)) {
2955 		BUG_ON(retries);
2956 		retries++;
2957 
2958 		if (block_group->ro)
2959 			goto out_free;
2960 
2961 		ret = create_free_space_inode(root, trans, block_group, path);
2962 		if (ret)
2963 			goto out_free;
2964 		goto again;
2965 	}
2966 
2967 	/* We've already setup this transaction, go ahead and exit */
2968 	if (block_group->cache_generation == trans->transid &&
2969 	    i_size_read(inode)) {
2970 		dcs = BTRFS_DC_SETUP;
2971 		goto out_put;
2972 	}
2973 
2974 	/*
2975 	 * We want to set the generation to 0, that way if anything goes wrong
2976 	 * from here on out we know not to trust this cache when we load up next
2977 	 * time.
2978 	 */
2979 	BTRFS_I(inode)->generation = 0;
2980 	ret = btrfs_update_inode(trans, root, inode);
2981 	WARN_ON(ret);
2982 
2983 	if (i_size_read(inode) > 0) {
2984 		ret = btrfs_truncate_free_space_cache(root, trans, path,
2985 						      inode);
2986 		if (ret)
2987 			goto out_put;
2988 	}
2989 
2990 	spin_lock(&block_group->lock);
2991 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
2992 	    !btrfs_test_opt(root, SPACE_CACHE)) {
2993 		/*
2994 		 * don't bother trying to write stuff out _if_
2995 		 * a) we're not cached,
2996 		 * b) we're mounted with the nospace_cache option.
2997 		 */
2998 		dcs = BTRFS_DC_WRITTEN;
2999 		spin_unlock(&block_group->lock);
3000 		goto out_put;
3001 	}
3002 	spin_unlock(&block_group->lock);
3003 
3004 	/*
3005 	 * Try to preallocate enough space based on how big the block group is.
3006 	 * Keep in mind this has to include any pinned space which could end up
3007 	 * taking up quite a bit since it's not folded into the other space
3008 	 * cache.
3009 	 */
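	/*
	 * Worked example with illustrative numbers: assuming 4K pages,
	 * a 1GB block group yields div64_u64(1GB, 256MB) = 4, then
	 * 4 * 16 = 64 pages, i.e. 64 * 4096 = 256KB preallocated for
	 * the cache file.
	 */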
3010 	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3011 	if (!num_pages)
3012 		num_pages = 1;
3013 
3014 	num_pages *= 16;
3015 	num_pages *= PAGE_CACHE_SIZE;
3016 
3017 	ret = btrfs_check_data_free_space(inode, num_pages);
3018 	if (ret)
3019 		goto out_put;
3020 
3021 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3022 					      num_pages, num_pages,
3023 					      &alloc_hint);
3024 	if (!ret)
3025 		dcs = BTRFS_DC_SETUP;
3026 	btrfs_free_reserved_data_space(inode, num_pages);
3027 
3028 out_put:
3029 	iput(inode);
3030 out_free:
3031 	btrfs_release_path(path);
3032 out:
3033 	spin_lock(&block_group->lock);
3034 	if (!ret && dcs == BTRFS_DC_SETUP)
3035 		block_group->cache_generation = trans->transid;
3036 	block_group->disk_cache_state = dcs;
3037 	spin_unlock(&block_group->lock);
3038 
3039 	return ret;
3040 }
3041 
3042 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3043 				   struct btrfs_root *root)
3044 {
3045 	struct btrfs_block_group_cache *cache;
3046 	int err = 0;
3047 	struct btrfs_path *path;
3048 	u64 last = 0;
3049 
3050 	path = btrfs_alloc_path();
3051 	if (!path)
3052 		return -ENOMEM;
3053 
3054 again:
3055 	while (1) {
3056 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3057 		while (cache) {
3058 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3059 				break;
3060 			cache = next_block_group(root, cache);
3061 		}
3062 		if (!cache) {
3063 			if (last == 0)
3064 				break;
3065 			last = 0;
3066 			continue;
3067 		}
3068 		err = cache_save_setup(cache, trans, path);
3069 		last = cache->key.objectid + cache->key.offset;
3070 		btrfs_put_block_group(cache);
3071 	}
3072 
3073 	while (1) {
3074 		if (last == 0) {
3075 			err = btrfs_run_delayed_refs(trans, root,
3076 						     (unsigned long)-1);
3077 			if (err) /* File system offline */
3078 				goto out;
3079 		}
3080 
3081 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3082 		while (cache) {
3083 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3084 				btrfs_put_block_group(cache);
3085 				goto again;
3086 			}
3087 
3088 			if (cache->dirty)
3089 				break;
3090 			cache = next_block_group(root, cache);
3091 		}
3092 		if (!cache) {
3093 			if (last == 0)
3094 				break;
3095 			last = 0;
3096 			continue;
3097 		}
3098 
3099 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
3100 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3101 		cache->dirty = 0;
3102 		last = cache->key.objectid + cache->key.offset;
3103 
3104 		err = write_one_cache_group(trans, root, path, cache);
3105 		if (err) /* File system offline */
3106 			goto out;
3107 
3108 		btrfs_put_block_group(cache);
3109 	}
3110 
3111 	while (1) {
3112 		/*
3113 		 * This may not be needed since we're just marking our
3114 		 * preallocated extent as written, but it can't hurt just
3115 		 * in case.
3116 		 */
3117 		if (last == 0) {
3118 			err = btrfs_run_delayed_refs(trans, root,
3119 						     (unsigned long)-1);
3120 			if (err) /* File system offline */
3121 				goto out;
3122 		}
3123 
3124 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3125 		while (cache) {
3126 			/*
3127 			 * Really this shouldn't happen, but it could if we
3128 			 * couldn't write the entire preallocated extent and
3129 			 * splitting the extent resulted in a new block.
3130 			 */
3131 			if (cache->dirty) {
3132 				btrfs_put_block_group(cache);
3133 				goto again;
3134 			}
3135 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3136 				break;
3137 			cache = next_block_group(root, cache);
3138 		}
3139 		if (!cache) {
3140 			if (last == 0)
3141 				break;
3142 			last = 0;
3143 			continue;
3144 		}
3145 
3146 		err = btrfs_write_out_cache(root, trans, cache, path);
3147 
3148 		/*
3149 		 * If we didn't have an error then the cache state is still
3150 		 * NEED_WRITE, so we can set it to WRITTEN.
3151 		 */
3152 		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3153 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
3154 		last = cache->key.objectid + cache->key.offset;
3155 		btrfs_put_block_group(cache);
3156 	}
3157 out:
3158 
3159 	btrfs_free_path(path);
3160 	return err;
3161 }
3162 
3163 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3164 {
3165 	struct btrfs_block_group_cache *block_group;
3166 	int readonly = 0;
3167 
3168 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3169 	if (!block_group || block_group->ro)
3170 		readonly = 1;
3171 	if (block_group)
3172 		btrfs_put_block_group(block_group);
3173 	return readonly;
3174 }
3175 
3176 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3177 			     u64 total_bytes, u64 bytes_used,
3178 			     struct btrfs_space_info **space_info)
3179 {
3180 	struct btrfs_space_info *found;
3181 	int i;
3182 	int factor;
3183 
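	/*
	 * DUP, RAID1 and RAID10 store two copies of every byte, so
	 * their raw disk footprint is double the logical size; e.g.
	 * 1GB used in a RAID1 group accounts for 2GB of disk_used.
	 */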
3184 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3185 		     BTRFS_BLOCK_GROUP_RAID10))
3186 		factor = 2;
3187 	else
3188 		factor = 1;
3189 
3190 	found = __find_space_info(info, flags);
3191 	if (found) {
3192 		spin_lock(&found->lock);
3193 		found->total_bytes += total_bytes;
3194 		found->disk_total += total_bytes * factor;
3195 		found->bytes_used += bytes_used;
3196 		found->disk_used += bytes_used * factor;
3197 		found->full = 0;
3198 		spin_unlock(&found->lock);
3199 		*space_info = found;
3200 		return 0;
3201 	}
3202 	found = kzalloc(sizeof(*found), GFP_NOFS);
3203 	if (!found)
3204 		return -ENOMEM;
3205 
3206 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3207 		INIT_LIST_HEAD(&found->block_groups[i]);
3208 	init_rwsem(&found->groups_sem);
3209 	spin_lock_init(&found->lock);
3210 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3211 	found->total_bytes = total_bytes;
3212 	found->disk_total = total_bytes * factor;
3213 	found->bytes_used = bytes_used;
3214 	found->disk_used = bytes_used * factor;
3215 	found->bytes_pinned = 0;
3216 	found->bytes_reserved = 0;
3217 	found->bytes_readonly = 0;
3218 	found->bytes_may_use = 0;
3219 	found->full = 0;
3220 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3221 	found->chunk_alloc = 0;
3222 	found->flush = 0;
3223 	init_waitqueue_head(&found->wait);
3224 	*space_info = found;
3225 	list_add_rcu(&found->list, &info->space_info);
3226 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 		info->data_sinfo = found;
3228 	return 0;
3229 }
3230 
3231 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3232 {
3233 	u64 extra_flags = chunk_to_extended(flags) &
3234 				BTRFS_EXTENDED_PROFILE_MASK;
3235 
3236 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3237 		fs_info->avail_data_alloc_bits |= extra_flags;
3238 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
3239 		fs_info->avail_metadata_alloc_bits |= extra_flags;
3240 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3241 		fs_info->avail_system_alloc_bits |= extra_flags;
3242 }
3243 
3244 /*
3245  * returns target flags in extended format or 0 if restripe for this
3246  * chunk_type is not in progress
3247  *
3248  * should be called with either volume_mutex or balance_lock held
3249  */
3250 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3251 {
3252 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3253 	u64 target = 0;
3254 
3255 	if (!bctl)
3256 		return 0;
3257 
3258 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
3259 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3260 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3261 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3262 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3263 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3264 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3265 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3266 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3267 	}
3268 
3269 	return target;
3270 }
3271 
3272 /*
3273  * @flags: available profiles in extended format (see ctree.h)
3274  *
3275  * Returns reduced profile in chunk format.  If profile changing is in
3276  * progress (either running or paused) picks the target profile (if it's
3277  * already available), otherwise falls back to plain reducing.
3278  */
3279 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3280 {
3281 	/*
3282 	 * we add in the count of missing devices because we want
3283 	 * to make sure that any RAID levels on a degraded FS
3284 	 * continue to be honored.
3285 	 */
3286 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3287 		root->fs_info->fs_devices->missing_devices;
3288 	u64 target;
3289 
3290 	/*
3291 	 * See if restripe for this chunk_type is in progress; if so,
3292 	 * try to reduce to the target profile.
3293 	 */
3294 	spin_lock(&root->fs_info->balance_lock);
3295 	target = get_restripe_target(root->fs_info, flags);
3296 	if (target) {
3297 		/* pick target profile only if it's already available */
3298 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3299 			spin_unlock(&root->fs_info->balance_lock);
3300 			return extended_to_chunk(target);
3301 		}
3302 	}
3303 	spin_unlock(&root->fs_info->balance_lock);
3304 
3305 	if (num_devices == 1)
3306 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3307 	if (num_devices < 4)
3308 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3309 
3310 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3311 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3312 		      BTRFS_BLOCK_GROUP_RAID10))) {
3313 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
3314 	}
3315 
3316 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3317 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3318 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3319 	}
3320 
3321 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3322 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3323 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
3324 	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
3325 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3326 	}
3327 
3328 	return extended_to_chunk(flags);
3329 }
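
/*
 * Reduction example: on a one-device filesystem a profile carrying
 * RAID1|RAID0 loses both bits in the num_devices == 1 check, and the
 * empty result maps to the plain single profile.  With RAID1 and DUP
 * both set, DUP is dropped in favor of RAID1.
 */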
3330 
3331 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3332 {
3333 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3334 		flags |= root->fs_info->avail_data_alloc_bits;
3335 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3336 		flags |= root->fs_info->avail_system_alloc_bits;
3337 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3338 		flags |= root->fs_info->avail_metadata_alloc_bits;
3339 
3340 	return btrfs_reduce_alloc_profile(root, flags);
3341 }
3342 
3343 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3344 {
3345 	u64 flags;
3346 
3347 	if (data)
3348 		flags = BTRFS_BLOCK_GROUP_DATA;
3349 	else if (root == root->fs_info->chunk_root)
3350 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3351 	else
3352 		flags = BTRFS_BLOCK_GROUP_METADATA;
3353 
3354 	return get_alloc_profile(root, flags);
3355 }
3356 
3357 /*
3358  * This will check the space that the inode allocates from to make sure we have
3359  * enough space for bytes.
3360  */
3361 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3362 {
3363 	struct btrfs_space_info *data_sinfo;
3364 	struct btrfs_root *root = BTRFS_I(inode)->root;
3365 	struct btrfs_fs_info *fs_info = root->fs_info;
3366 	u64 used;
3367 	int ret = 0, committed = 0, alloc_chunk = 1;
3368 
3369 	/* make sure bytes are sectorsize aligned */
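	/*
	 * (Round up to the next multiple of sectorsize; e.g. with a
	 * 4096 byte sectorsize, 5000 becomes (5000 + 4095) & ~4095 =
	 * 8192.)
	 */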
3370 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3371 
3372 	if (root == root->fs_info->tree_root ||
3373 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3374 		alloc_chunk = 0;
3375 		committed = 1;
3376 	}
3377 
3378 	data_sinfo = fs_info->data_sinfo;
3379 	if (!data_sinfo)
3380 		goto alloc;
3381 
3382 again:
3383 	/* make sure we have enough space to handle the data first */
3384 	spin_lock(&data_sinfo->lock);
3385 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3386 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3387 		data_sinfo->bytes_may_use;
3388 
3389 	if (used + bytes > data_sinfo->total_bytes) {
3390 		struct btrfs_trans_handle *trans;
3391 
3392 		/*
3393 		 * if we don't have enough free bytes in this space then we need
3394 		 * to alloc a new chunk.
3395 		 */
3396 		if (!data_sinfo->full && alloc_chunk) {
3397 			u64 alloc_target;
3398 
3399 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3400 			spin_unlock(&data_sinfo->lock);
3401 alloc:
3402 			alloc_target = btrfs_get_alloc_profile(root, 1);
3403 			trans = btrfs_join_transaction(root);
3404 			if (IS_ERR(trans))
3405 				return PTR_ERR(trans);
3406 
3407 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3408 					     alloc_target,
3409 					     CHUNK_ALLOC_NO_FORCE);
3410 			btrfs_end_transaction(trans, root);
3411 			if (ret < 0) {
3412 				if (ret != -ENOSPC)
3413 					return ret;
3414 				else
3415 					goto commit_trans;
3416 			}
3417 
3418 			if (!data_sinfo)
3419 				data_sinfo = fs_info->data_sinfo;
3420 
3421 			goto again;
3422 		}
3423 
3424 		/*
3425 		 * If we have less pinned bytes than we want to allocate then
3426 		 * don't bother committing the transaction, it won't help us.
3427 		 */
3428 		if (data_sinfo->bytes_pinned < bytes)
3429 			committed = 1;
3430 		spin_unlock(&data_sinfo->lock);
3431 
3432 		/* commit the current transaction and try again */
3433 commit_trans:
3434 		if (!committed &&
3435 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3436 			committed = 1;
3437 			trans = btrfs_join_transaction(root);
3438 			if (IS_ERR(trans))
3439 				return PTR_ERR(trans);
3440 			ret = btrfs_commit_transaction(trans, root);
3441 			if (ret)
3442 				return ret;
3443 			goto again;
3444 		}
3445 
3446 		return -ENOSPC;
3447 	}
3448 	data_sinfo->bytes_may_use += bytes;
3449 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3450 				      data_sinfo->flags, bytes, 1);
3451 	spin_unlock(&data_sinfo->lock);
3452 
3453 	return 0;
3454 }
3455 
3456 /*
3457  * Called if we need to clear a data reservation for this inode.
3458  */
3459 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3460 {
3461 	struct btrfs_root *root = BTRFS_I(inode)->root;
3462 	struct btrfs_space_info *data_sinfo;
3463 
3464 	/* make sure bytes are sectorsize aligned */
3465 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3466 
3467 	data_sinfo = root->fs_info->data_sinfo;
3468 	spin_lock(&data_sinfo->lock);
3469 	data_sinfo->bytes_may_use -= bytes;
3470 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3471 				      data_sinfo->flags, bytes, 0);
3472 	spin_unlock(&data_sinfo->lock);
3473 }
3474 
3475 static void force_metadata_allocation(struct btrfs_fs_info *info)
3476 {
3477 	struct list_head *head = &info->space_info;
3478 	struct btrfs_space_info *found;
3479 
3480 	rcu_read_lock();
3481 	list_for_each_entry_rcu(found, head, list) {
3482 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3483 			found->force_alloc = CHUNK_ALLOC_FORCE;
3484 	}
3485 	rcu_read_unlock();
3486 }
3487 
3488 static int should_alloc_chunk(struct btrfs_root *root,
3489 			      struct btrfs_space_info *sinfo, int force)
3490 {
3491 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3492 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3493 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3494 	u64 thresh;
3495 
3496 	if (force == CHUNK_ALLOC_FORCE)
3497 		return 1;
3498 
3499 	/*
3500 	 * We need to take into account the global rsv because for all intents
3501 	 * and purposes it's used space.  Don't worry about locking the
3502 	 * global_rsv, it doesn't change except when the transaction commits.
3503 	 */
3504 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505 		num_allocated += global_rsv->size;
3506 
3507 	/*
3508 	 * in limited mode, we want to have some free space up to
3509 	 * about 1% of the FS size.
3510 	 */
3511 	if (force == CHUNK_ALLOC_LIMITED) {
3512 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3513 		thresh = max_t(u64, 64 * 1024 * 1024,
3514 			       div_factor_fine(thresh, 1));
3515 
3516 		if (num_bytes - num_allocated < thresh)
3517 			return 1;
3518 	}
3519 
3520 	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3521 		return 0;
3522 	return 1;
3523 }
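
/*
 * Thresholds with illustrative sizes: in CHUNK_ALLOC_LIMITED mode on a
 * 1TB filesystem the headroom target is max(64MB, 1% of 1TB), roughly
 * 10GB, and allocation triggers when free room drops below it.  In the
 * default case a chunk is allocated once used space plus 2MB of slack
 * reaches 80% of the writable total, per div_factor(num_bytes, 8).
 */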
3524 
3525 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3526 {
3527 	u64 num_dev;
3528 
3529 	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3530 	    type & BTRFS_BLOCK_GROUP_RAID0)
3531 		num_dev = root->fs_info->fs_devices->rw_devices;
3532 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
3533 		num_dev = 2;
3534 	else
3535 		num_dev = 1;	/* DUP or single */
3536 
3537 	/* metadata for updating devices and chunk tree */
3538 	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3539 }
3540 
3541 static void check_system_chunk(struct btrfs_trans_handle *trans,
3542 			       struct btrfs_root *root, u64 type)
3543 {
3544 	struct btrfs_space_info *info;
3545 	u64 left;
3546 	u64 thresh;
3547 
3548 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3549 	spin_lock(&info->lock);
3550 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3551 		info->bytes_reserved - info->bytes_readonly;
3552 	spin_unlock(&info->lock);
3553 
3554 	thresh = get_system_chunk_thresh(root, type);
3555 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3556 		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3557 		       left, thresh, type);
3558 		dump_space_info(info, 0, 0);
3559 	}
3560 
3561 	if (left < thresh) {
3562 		u64 flags;
3563 
3564 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3565 		btrfs_alloc_chunk(trans, root, flags);
3566 	}
3567 }
3568 
3569 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3570 			  struct btrfs_root *extent_root, u64 flags, int force)
3571 {
3572 	struct btrfs_space_info *space_info;
3573 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3574 	int wait_for_alloc = 0;
3575 	int ret = 0;
3576 
3577 	space_info = __find_space_info(extent_root->fs_info, flags);
3578 	if (!space_info) {
3579 		ret = update_space_info(extent_root->fs_info, flags,
3580 					0, 0, &space_info);
3581 		BUG_ON(ret); /* -ENOMEM */
3582 	}
3583 	BUG_ON(!space_info); /* Logic error */
3584 
3585 again:
3586 	spin_lock(&space_info->lock);
3587 	if (force < space_info->force_alloc)
3588 		force = space_info->force_alloc;
3589 	if (space_info->full) {
3590 		spin_unlock(&space_info->lock);
3591 		return 0;
3592 	}
3593 
3594 	if (!should_alloc_chunk(extent_root, space_info, force)) {
3595 		spin_unlock(&space_info->lock);
3596 		return 0;
3597 	} else if (space_info->chunk_alloc) {
3598 		wait_for_alloc = 1;
3599 	} else {
3600 		space_info->chunk_alloc = 1;
3601 	}
3602 
3603 	spin_unlock(&space_info->lock);
3604 
3605 	mutex_lock(&fs_info->chunk_mutex);
3606 
3607 	/*
3608 	 * The chunk_mutex is held throughout the entirety of a chunk
3609 	 * allocation, so once we've acquired the chunk_mutex we know that the
3610 	 * other guy is done and we need to recheck and see if we should
3611 	 * allocate.
3612 	 */
3613 	if (wait_for_alloc) {
3614 		mutex_unlock(&fs_info->chunk_mutex);
3615 		wait_for_alloc = 0;
3616 		goto again;
3617 	}
3618 
3619 	/*
3620 	 * If we have mixed data/metadata chunks we want to make sure we keep
3621 	 * allocating mixed chunks instead of individual chunks.
3622 	 */
3623 	if (btrfs_mixed_space_info(space_info))
3624 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3625 
3626 	/*
3627 	 * if we're doing a data chunk, go ahead and make sure that
3628 	 * we keep a reasonable number of metadata chunks allocated in the
3629 	 * FS as well.
3630 	 */
3631 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3632 		fs_info->data_chunk_allocations++;
3633 		if (!(fs_info->data_chunk_allocations %
3634 		      fs_info->metadata_ratio))
3635 			force_metadata_allocation(fs_info);
3636 	}
3637 
3638 	/*
3639 	 * Check if we have enough space in SYSTEM chunk because we may need
3640 	 * to update devices.
3641 	 */
3642 	check_system_chunk(trans, extent_root, flags);
3643 
3644 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3645 	if (ret < 0 && ret != -ENOSPC)
3646 		goto out;
3647 
3648 	spin_lock(&space_info->lock);
3649 	if (ret)
3650 		space_info->full = 1;
3651 	else
3652 		ret = 1;
3653 
3654 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3655 	space_info->chunk_alloc = 0;
3656 	spin_unlock(&space_info->lock);
3657 out:
3658 	mutex_unlock(&fs_info->chunk_mutex);
3659 	return ret;
3660 }
3661 
3662 static int can_overcommit(struct btrfs_root *root,
3663 			  struct btrfs_space_info *space_info, u64 bytes,
3664 			  int flush)
3665 {
3666 	u64 profile = btrfs_get_alloc_profile(root, 0);
3667 	u64 avail;
3668 	u64 used;
3669 
3670 	used = space_info->bytes_used + space_info->bytes_reserved +
3671 		space_info->bytes_pinned + space_info->bytes_readonly +
3672 		space_info->bytes_may_use;
3673 
3674 	spin_lock(&root->fs_info->free_chunk_lock);
3675 	avail = root->fs_info->free_chunk_space;
3676 	spin_unlock(&root->fs_info->free_chunk_lock);
3677 
3678 	/*
3679 	 * If we have dup, raid1 or raid10 then only half of the free
3680 	 * space is actually usable.
3681 	 */
3682 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683 		       BTRFS_BLOCK_GROUP_RAID1 |
3684 		       BTRFS_BLOCK_GROUP_RAID10))
3685 		avail >>= 1;
3686 
3687 	/*
3688 	 * If we aren't flushing don't let us overcommit too much, say
3689 	 * 1/8th of the space.  If we can flush, let it overcommit up to
3690 	 * 1/2 of the space.
3691 	 */
3692 	if (flush)
3693 		avail >>= 3;
3694 	else
3695 		avail >>= 1;
3696 
3697 	if (used + bytes < space_info->total_bytes + avail)
3698 		return 1;
3699 	return 0;
3700 }
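
/*
 * Editor's worked example (hypothetical numbers): suppose 8GiB of
 * unallocated chunk space and a RAID1 profile.  Mirroring halves the
 * usable space to 4GiB; with flushing allowed that is shifted down to
 * 512MiB (>> 3), so the reservation is granted only while
 * used + bytes stays below total_bytes + 512MiB.
 */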
3701 
3702 /*
3703  * shrink metadata reservation for delalloc
3704  */
3705 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3706 			    bool wait_ordered)
3707 {
3708 	struct btrfs_block_rsv *block_rsv;
3709 	struct btrfs_space_info *space_info;
3710 	struct btrfs_trans_handle *trans;
3711 	u64 delalloc_bytes;
3712 	u64 max_reclaim;
3713 	long time_left;
3714 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 	int loops = 0;
3716 
3717 	trans = (struct btrfs_trans_handle *)current->journal_info;
3718 	block_rsv = &root->fs_info->delalloc_block_rsv;
3719 	space_info = block_rsv->space_info;
3720 
3721 	smp_mb();
3722 	delalloc_bytes = root->fs_info->delalloc_bytes;
3723 	if (delalloc_bytes == 0) {
3724 		if (trans)
3725 			return;
3726 		btrfs_wait_ordered_extents(root, 0);
3727 		return;
3728 	}
3729 
3730 	while (delalloc_bytes && loops < 3) {
3731 		max_reclaim = min(delalloc_bytes, to_reclaim);
3732 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3734 					       WB_REASON_FS_FREE_SPACE);
3735 
3736 		/*
3737 		 * We need to wait for the async pages to actually start before
3738 		 * we do anything.
3739 		 */
3740 		wait_event(root->fs_info->async_submit_wait,
3741 			   !atomic_read(&root->fs_info->async_delalloc_pages));
3742 
3743 		spin_lock(&space_info->lock);
3744 		if (can_overcommit(root, space_info, orig, !trans)) {
3745 			spin_unlock(&space_info->lock);
3746 			break;
3747 		}
3748 		spin_unlock(&space_info->lock);
3749 
3750 		loops++;
3751 		if (wait_ordered && !trans) {
3752 			btrfs_wait_ordered_extents(root, 0);
3753 		} else {
3754 			time_left = schedule_timeout_killable(1);
3755 			if (time_left)
3756 				break;
3757 		}
3758 		smp_mb();
3759 		delalloc_bytes = root->fs_info->delalloc_bytes;
3760 	}
3761 }
3762 
3763 /**
3764  * may_commit_transaction - possibly commit the transaction if it's ok to
3765  * @root - the root we're allocating for
3766  * @bytes - the number of bytes we want to reserve
3767  * @force - force the commit
3768  *
3769  * This will check to make sure that committing the transaction will actually
3770  * get us somewhere and then commit the transaction if it does.  Otherwise it
3771  * will return -ENOSPC.
3772  */
3773 static int may_commit_transaction(struct btrfs_root *root,
3774 				  struct btrfs_space_info *space_info,
3775 				  u64 bytes, int force)
3776 {
3777 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3778 	struct btrfs_trans_handle *trans;
3779 
3780 	trans = (struct btrfs_trans_handle *)current->journal_info;
3781 	if (trans)
3782 		return -EAGAIN;
3783 
3784 	if (force)
3785 		goto commit;
3786 
3787 	/* See if there is enough pinned space to make this reservation */
3788 	spin_lock(&space_info->lock);
3789 	if (space_info->bytes_pinned >= bytes) {
3790 		spin_unlock(&space_info->lock);
3791 		goto commit;
3792 	}
3793 	spin_unlock(&space_info->lock);
3794 
3795 	/*
3796 	 * See if there is some space in the delayed insertion reservation for
3797 	 * this reservation.
3798 	 */
3799 	if (space_info != delayed_rsv->space_info)
3800 		return -ENOSPC;
3801 
3802 	spin_lock(&space_info->lock);
3803 	spin_lock(&delayed_rsv->lock);
3804 	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3805 		spin_unlock(&delayed_rsv->lock);
3806 		spin_unlock(&space_info->lock);
3807 		return -ENOSPC;
3808 	}
3809 	spin_unlock(&delayed_rsv->lock);
3810 	spin_unlock(&space_info->lock);
3811 
3812 commit:
3813 	trans = btrfs_join_transaction(root);
3814 	if (IS_ERR(trans))
3815 		return -ENOSPC;
3816 
3817 	return btrfs_commit_transaction(trans, root);
3818 }
3819 
3820 enum flush_state {
3821 	FLUSH_DELAYED_ITEMS_NR	=	1,
3822 	FLUSH_DELAYED_ITEMS	=	2,
3823 	FLUSH_DELALLOC		=	3,
3824 	FLUSH_DELALLOC_WAIT	=	4,
3825 	ALLOC_CHUNK		=	5,
3826 	COMMIT_TRANS		=	6,
3827 };
3828 
3829 static int flush_space(struct btrfs_root *root,
3830 		       struct btrfs_space_info *space_info, u64 num_bytes,
3831 		       u64 orig_bytes, int state)
3832 {
3833 	struct btrfs_trans_handle *trans;
3834 	int nr;
3835 	int ret = 0;
3836 
3837 	switch (state) {
3838 	case FLUSH_DELAYED_ITEMS_NR:
3839 	case FLUSH_DELAYED_ITEMS:
3840 		if (state == FLUSH_DELAYED_ITEMS_NR) {
3841 			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3842 
3843 			nr = (int)div64_u64(num_bytes, bytes);
3844 			if (!nr)
3845 				nr = 1;
3846 			nr *= 2;
3847 		} else {
3848 			nr = -1;
3849 		}
3850 		trans = btrfs_join_transaction(root);
3851 		if (IS_ERR(trans)) {
3852 			ret = PTR_ERR(trans);
3853 			break;
3854 		}
3855 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
3856 		btrfs_end_transaction(trans, root);
3857 		break;
3858 	case FLUSH_DELALLOC:
3859 	case FLUSH_DELALLOC_WAIT:
3860 		shrink_delalloc(root, num_bytes, orig_bytes,
3861 				state == FLUSH_DELALLOC_WAIT);
3862 		break;
3863 	case ALLOC_CHUNK:
3864 		trans = btrfs_join_transaction(root);
3865 		if (IS_ERR(trans)) {
3866 			ret = PTR_ERR(trans);
3867 			break;
3868 		}
3869 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870 				     btrfs_get_alloc_profile(root, 0),
3871 				     CHUNK_ALLOC_NO_FORCE);
3872 		btrfs_end_transaction(trans, root);
3873 		if (ret == -ENOSPC)
3874 			ret = 0;
3875 		break;
3876 	case COMMIT_TRANS:
3877 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3878 		break;
3879 	default:
3880 		ret = -ENOSPC;
3881 		break;
3882 	}
3883 
3884 	return ret;
3885 }
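
/*
 * Editor's note (a sketch of the control flow, not original text):
 * reserve_metadata_bytes() below walks these states in order --
 * delayed items (a bounded batch, then all of them), delalloc
 * writeback (async, then waiting on ordered extents), a chunk
 * allocation and finally a transaction commit -- retrying the
 * reservation after each step until it succeeds or the states are
 * exhausted.
 */
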
3886 /**
3887  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3888  * @root - the root we're allocating for
3889  * @block_rsv - the block_rsv we're allocating for
3890  * @orig_bytes - the number of bytes we want
3891  * @flush - whether or not we can flush to make our reservation
3892  *
3893  * This will reserve orig_bytes number of bytes from the space info associated
3894  * with the block_rsv.  If there is not enough space it will make an attempt to
3895  * flush out space to make room.  It will do this by flushing delalloc if
3896  * possible or committing the transaction.  If flush is 0 then no attempts to
3897  * regain reservations will be made and this will fail if there is not enough
3898  * space already.
3899  */
3900 static int reserve_metadata_bytes(struct btrfs_root *root,
3901 				  struct btrfs_block_rsv *block_rsv,
3902 				  u64 orig_bytes, int flush)
3903 {
3904 	struct btrfs_space_info *space_info = block_rsv->space_info;
3905 	u64 used;
3906 	u64 num_bytes = orig_bytes;
3907 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
3908 	int ret = 0;
3909 	bool flushing = false;
3910 
3911 again:
3912 	ret = 0;
3913 	spin_lock(&space_info->lock);
3914 	/*
3915 	 * We only want to wait if somebody other than us is flushing and we are
3916 	 * actually allowed to flush.
3917 	 */
3918 	while (flush && !flushing && space_info->flush) {
3919 		spin_unlock(&space_info->lock);
3920 		/*
3921 		 * If we have a trans handle we can't wait because the flusher
3922 		 * may have to commit the transaction, which would mean we would
3923 		 * deadlock since we are waiting for the flusher to finish, but
3924 		 * hold the current transaction open.
3925 		 */
3926 		if (current->journal_info)
3927 			return -EAGAIN;
3928 		ret = wait_event_killable(space_info->wait, !space_info->flush);
3929 		/* Must have been killed, return */
3930 		if (ret)
3931 			return -EINTR;
3932 
3933 		spin_lock(&space_info->lock);
3934 	}
3935 
3936 	ret = -ENOSPC;
3937 	used = space_info->bytes_used + space_info->bytes_reserved +
3938 		space_info->bytes_pinned + space_info->bytes_readonly +
3939 		space_info->bytes_may_use;
3940 
3941 	/*
3942 	 * The idea here is that if we've not already over-reserved the block group
3943 	 * then we can go ahead and save our reservation first and then start
3944 	 * flushing if we need to.  Otherwise if we've already overcommitted
3945 	 * let's start flushing stuff first and then come back and try to make
3946 	 * our reservation.
3947 	 */
3948 	if (used <= space_info->total_bytes) {
3949 		if (used + orig_bytes <= space_info->total_bytes) {
3950 			space_info->bytes_may_use += orig_bytes;
3951 			trace_btrfs_space_reservation(root->fs_info,
3952 				"space_info", space_info->flags, orig_bytes, 1);
3953 			ret = 0;
3954 		} else {
3955 			/*
3956 			 * Ok set num_bytes to orig_bytes since we aren't
3957 			 * overcommitted, this way we only try and reclaim what
3958 			 * we need.
3959 			 */
3960 			num_bytes = orig_bytes;
3961 		}
3962 	} else {
3963 		/*
3964 		 * Ok we're over committed, set num_bytes to the overcommitted
3965 		 * amount plus the amount of bytes that we need for this
3966 		 * reservation.
3967 		 */
3968 		num_bytes = used - space_info->total_bytes +
3969 			(orig_bytes * 2);
3970 	}
3971 
3972 	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3973 		space_info->bytes_may_use += orig_bytes;
3974 		trace_btrfs_space_reservation(root->fs_info, "space_info",
3975 					      space_info->flags, orig_bytes,
3976 					      1);
3977 		ret = 0;
3978 	}
3979 
3980 	/*
3981 	 * Couldn't make our reservation, save our place so while we're trying
3982 	 * to reclaim space we can actually use it instead of somebody else
3983 	 * stealing it from us.
3984 	 */
3985 	if (ret && flush) {
3986 		flushing = true;
3987 		space_info->flush = 1;
3988 	}
3989 
3990 	spin_unlock(&space_info->lock);
3991 
3992 	if (!ret || !flush)
3993 		goto out;
3994 
3995 	ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 			  flush_state);
3997 	flush_state++;
3998 	if (!ret)
3999 		goto again;
4000 	else if (flush_state <= COMMIT_TRANS)
4001 		goto again;
4002 
4003 out:
4004 	if (flushing) {
4005 		spin_lock(&space_info->lock);
4006 		space_info->flush = 0;
4007 		wake_up_all(&space_info->wait);
4008 		spin_unlock(&space_info->lock);
4009 	}
4010 	return ret;
4011 }
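
/*
 * Editor's usage sketch (illustrative only; the names are the helpers
 * from this file): a typical caller pairs reserve_metadata_bytes()
 * with block_rsv_add_bytes() exactly as __block_rsv_add() below does:
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 *	if (!ret)
 *		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 *
 * so a successful reservation is immediately recorded in the rsv and
 * a failure (-ENOSPC, -EINTR or -EAGAIN) leaves the accounting
 * untouched.
 */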
4012 
4013 static struct btrfs_block_rsv *get_block_rsv(
4014 					const struct btrfs_trans_handle *trans,
4015 					const struct btrfs_root *root)
4016 {
4017 	struct btrfs_block_rsv *block_rsv = NULL;
4018 
4019 	if (root->ref_cows)
4020 		block_rsv = trans->block_rsv;
4021 
4022 	if (root == root->fs_info->csum_root && trans->adding_csums)
4023 		block_rsv = trans->block_rsv;
4024 
4025 	if (!block_rsv)
4026 		block_rsv = root->block_rsv;
4027 
4028 	if (!block_rsv)
4029 		block_rsv = &root->fs_info->empty_block_rsv;
4030 
4031 	return block_rsv;
4032 }
4033 
4034 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4035 			       u64 num_bytes)
4036 {
4037 	int ret = -ENOSPC;
4038 	spin_lock(&block_rsv->lock);
4039 	if (block_rsv->reserved >= num_bytes) {
4040 		block_rsv->reserved -= num_bytes;
4041 		if (block_rsv->reserved < block_rsv->size)
4042 			block_rsv->full = 0;
4043 		ret = 0;
4044 	}
4045 	spin_unlock(&block_rsv->lock);
4046 	return ret;
4047 }
4048 
4049 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4050 				u64 num_bytes, int update_size)
4051 {
4052 	spin_lock(&block_rsv->lock);
4053 	block_rsv->reserved += num_bytes;
4054 	if (update_size)
4055 		block_rsv->size += num_bytes;
4056 	else if (block_rsv->reserved >= block_rsv->size)
4057 		block_rsv->full = 1;
4058 	spin_unlock(&block_rsv->lock);
4059 }
4060 
4061 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4062 				    struct btrfs_block_rsv *block_rsv,
4063 				    struct btrfs_block_rsv *dest, u64 num_bytes)
4064 {
4065 	struct btrfs_space_info *space_info = block_rsv->space_info;
4066 
4067 	spin_lock(&block_rsv->lock);
4068 	if (num_bytes == (u64)-1)
4069 		num_bytes = block_rsv->size;
4070 	block_rsv->size -= num_bytes;
4071 	if (block_rsv->reserved >= block_rsv->size) {
4072 		num_bytes = block_rsv->reserved - block_rsv->size;
4073 		block_rsv->reserved = block_rsv->size;
4074 		block_rsv->full = 1;
4075 	} else {
4076 		num_bytes = 0;
4077 	}
4078 	spin_unlock(&block_rsv->lock);
4079 
4080 	if (num_bytes > 0) {
4081 		if (dest) {
4082 			spin_lock(&dest->lock);
4083 			if (!dest->full) {
4084 				u64 bytes_to_add;
4085 
4086 				bytes_to_add = dest->size - dest->reserved;
4087 				bytes_to_add = min(num_bytes, bytes_to_add);
4088 				dest->reserved += bytes_to_add;
4089 				if (dest->reserved >= dest->size)
4090 					dest->full = 1;
4091 				num_bytes -= bytes_to_add;
4092 			}
4093 			spin_unlock(&dest->lock);
4094 		}
4095 		if (num_bytes) {
4096 			spin_lock(&space_info->lock);
4097 			space_info->bytes_may_use -= num_bytes;
4098 			trace_btrfs_space_reservation(fs_info, "space_info",
4099 					space_info->flags, num_bytes, 0);
4100 			space_info->reservation_progress++;
4101 			spin_unlock(&space_info->lock);
4102 		}
4103 	}
4104 }
4105 
4106 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4107 				   struct btrfs_block_rsv *dst, u64 num_bytes)
4108 {
4109 	int ret;
4110 
4111 	ret = block_rsv_use_bytes(src, num_bytes);
4112 	if (ret)
4113 		return ret;
4114 
4115 	block_rsv_add_bytes(dst, num_bytes, 1);
4116 	return 0;
4117 }
4118 
4119 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4120 {
4121 	memset(rsv, 0, sizeof(*rsv));
4122 	spin_lock_init(&rsv->lock);
4123 	rsv->type = type;
4124 }
4125 
4126 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127 					      unsigned short type)
4128 {
4129 	struct btrfs_block_rsv *block_rsv;
4130 	struct btrfs_fs_info *fs_info = root->fs_info;
4131 
4132 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4133 	if (!block_rsv)
4134 		return NULL;
4135 
4136 	btrfs_init_block_rsv(block_rsv, type);
4137 	block_rsv->space_info = __find_space_info(fs_info,
4138 						  BTRFS_BLOCK_GROUP_METADATA);
4139 	return block_rsv;
4140 }
4141 
4142 void btrfs_free_block_rsv(struct btrfs_root *root,
4143 			  struct btrfs_block_rsv *rsv)
4144 {
4145 	if (!rsv)
4146 		return;
4147 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4148 	kfree(rsv);
4149 }
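
/*
 * Editor's example of the block_rsv lifecycle (a sketch built from the
 * helpers in this file; BTRFS_BLOCK_RSV_TEMP is assumed to be one of
 * the valid rsv types):
 *
 *	struct btrfs_block_rsv *rsv;
 *	int ret;
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes);
 *	... use the reserved space ...
 *	btrfs_free_block_rsv(root, rsv);
 *
 * btrfs_free_block_rsv() releases whatever is still reserved before
 * freeing, so error paths need not release explicitly.
 */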
4150 
4151 static inline int __block_rsv_add(struct btrfs_root *root,
4152 				  struct btrfs_block_rsv *block_rsv,
4153 				  u64 num_bytes, int flush)
4154 {
4155 	int ret;
4156 
4157 	if (num_bytes == 0)
4158 		return 0;
4159 
4160 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4161 	if (!ret) {
4162 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
4163 		return 0;
4164 	}
4165 
4166 	return ret;
4167 }
4168 
4169 int btrfs_block_rsv_add(struct btrfs_root *root,
4170 			struct btrfs_block_rsv *block_rsv,
4171 			u64 num_bytes)
4172 {
4173 	return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174 }
4175 
4176 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 				struct btrfs_block_rsv *block_rsv,
4178 				u64 num_bytes)
4179 {
4180 	return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181 }
4182 
4183 int btrfs_block_rsv_check(struct btrfs_root *root,
4184 			  struct btrfs_block_rsv *block_rsv, int min_factor)
4185 {
4186 	u64 num_bytes = 0;
4187 	int ret = -ENOSPC;
4188 
4189 	if (!block_rsv)
4190 		return 0;
4191 
4192 	spin_lock(&block_rsv->lock);
4193 	num_bytes = div_factor(block_rsv->size, min_factor);
4194 	if (block_rsv->reserved >= num_bytes)
4195 		ret = 0;
4196 	spin_unlock(&block_rsv->lock);
4197 
4198 	return ret;
4199 }
4200 
4201 static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4202 					   struct btrfs_block_rsv *block_rsv,
4203 					   u64 min_reserved, int flush)
4204 {
4205 	u64 num_bytes = 0;
4206 	int ret = -ENOSPC;
4207 
4208 	if (!block_rsv)
4209 		return 0;
4210 
4211 	spin_lock(&block_rsv->lock);
4212 	num_bytes = min_reserved;
4213 	if (block_rsv->reserved >= num_bytes)
4214 		ret = 0;
4215 	else
4216 		num_bytes -= block_rsv->reserved;
4217 	spin_unlock(&block_rsv->lock);
4218 
4219 	if (!ret)
4220 		return 0;
4221 
4222 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4223 	if (!ret) {
4224 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
4225 		return 0;
4226 	}
4227 
4228 	return ret;
4229 }
4230 
4231 int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 			   struct btrfs_block_rsv *block_rsv,
4233 			   u64 min_reserved)
4234 {
4235 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236 }
4237 
4238 int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 				   struct btrfs_block_rsv *block_rsv,
4240 				   u64 min_reserved)
4241 {
4242 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243 }
4244 
4245 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 			    struct btrfs_block_rsv *dst_rsv,
4247 			    u64 num_bytes)
4248 {
4249 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4250 }
4251 
4252 void btrfs_block_rsv_release(struct btrfs_root *root,
4253 			     struct btrfs_block_rsv *block_rsv,
4254 			     u64 num_bytes)
4255 {
4256 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4257 	if (global_rsv->full || global_rsv == block_rsv ||
4258 	    block_rsv->space_info != global_rsv->space_info)
4259 		global_rsv = NULL;
4260 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4261 				num_bytes);
4262 }
4263 
4264 /*
4265  * helper to calculate size of global block reservation.
4266  * the desired value is sum of space used by extent tree,
4267  * checksum tree and root tree
4268  */
4269 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4270 {
4271 	struct btrfs_space_info *sinfo;
4272 	u64 num_bytes;
4273 	u64 meta_used;
4274 	u64 data_used;
4275 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4276 
4277 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4278 	spin_lock(&sinfo->lock);
4279 	data_used = sinfo->bytes_used;
4280 	spin_unlock(&sinfo->lock);
4281 
4282 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4283 	spin_lock(&sinfo->lock);
4284 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4285 		data_used = 0;
4286 	meta_used = sinfo->bytes_used;
4287 	spin_unlock(&sinfo->lock);
4288 
4289 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4290 		    csum_size * 2;
4291 	num_bytes += div64_u64(data_used + meta_used, 50);
4292 
4293 	if (num_bytes * 3 > meta_used)
4294 		num_bytes = div64_u64(meta_used, 3);
4295 
4296 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4297 }
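
/*
 * Editor's worked example (hypothetical sizes): with 100GiB of data
 * and 2GiB of metadata in use, 4KiB blocks and 4-byte csums, the csum
 * term is (100GiB / 4KiB) * 4 * 2 = 200MiB and the 2% term adds about
 * another 2GiB.  That sum is more than a third of the metadata in
 * use, so the result is clamped to 2GiB / 3 before being aligned up.
 */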
4298 
4299 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4300 {
4301 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4302 	struct btrfs_space_info *sinfo = block_rsv->space_info;
4303 	u64 num_bytes;
4304 
4305 	num_bytes = calc_global_metadata_size(fs_info);
4306 
4307 	spin_lock(&sinfo->lock);
4308 	spin_lock(&block_rsv->lock);
4309 
4310 	block_rsv->size = num_bytes;
4311 
4312 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4313 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
4314 		    sinfo->bytes_may_use;
4315 
4316 	if (sinfo->total_bytes > num_bytes) {
4317 		num_bytes = sinfo->total_bytes - num_bytes;
4318 		block_rsv->reserved += num_bytes;
4319 		sinfo->bytes_may_use += num_bytes;
4320 		trace_btrfs_space_reservation(fs_info, "space_info",
4321 				      sinfo->flags, num_bytes, 1);
4322 	}
4323 
4324 	if (block_rsv->reserved >= block_rsv->size) {
4325 		num_bytes = block_rsv->reserved - block_rsv->size;
4326 		sinfo->bytes_may_use -= num_bytes;
4327 		trace_btrfs_space_reservation(fs_info, "space_info",
4328 				      sinfo->flags, num_bytes, 0);
4329 		sinfo->reservation_progress++;
4330 		block_rsv->reserved = block_rsv->size;
4331 		block_rsv->full = 1;
4332 	}
4333 
4334 	spin_unlock(&block_rsv->lock);
4335 	spin_unlock(&sinfo->lock);
4336 }
4337 
4338 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4339 {
4340 	struct btrfs_space_info *space_info;
4341 
4342 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4343 	fs_info->chunk_block_rsv.space_info = space_info;
4344 
4345 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4346 	fs_info->global_block_rsv.space_info = space_info;
4347 	fs_info->delalloc_block_rsv.space_info = space_info;
4348 	fs_info->trans_block_rsv.space_info = space_info;
4349 	fs_info->empty_block_rsv.space_info = space_info;
4350 	fs_info->delayed_block_rsv.space_info = space_info;
4351 
4352 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4353 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4354 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4355 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4356 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4357 
4358 	update_global_block_rsv(fs_info);
4359 }
4360 
4361 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4362 {
4363 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4364 				(u64)-1);
4365 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4366 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4367 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4368 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4369 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4370 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4371 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4372 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4373 }
4374 
4375 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4376 				  struct btrfs_root *root)
4377 {
4378 	if (!trans->block_rsv)
4379 		return;
4380 
4381 	if (!trans->bytes_reserved)
4382 		return;
4383 
4384 	trace_btrfs_space_reservation(root->fs_info, "transaction",
4385 				      trans->transid, trans->bytes_reserved, 0);
4386 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4387 	trans->bytes_reserved = 0;
4388 }
4389 
4390 /* Can only return 0 or -ENOSPC */
4391 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4392 				  struct inode *inode)
4393 {
4394 	struct btrfs_root *root = BTRFS_I(inode)->root;
4395 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4396 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4397 
4398 	/*
4399 	 * We need to hold space in order to delete our orphan item once we've
4400 	 * added it, so this takes the reservation now and releases it later,
4401 	 * when we are truly done with the orphan item.
4402 	 */
4403 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4404 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4405 				      btrfs_ino(inode), num_bytes, 1);
4406 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4407 }
4408 
4409 void btrfs_orphan_release_metadata(struct inode *inode)
4410 {
4411 	struct btrfs_root *root = BTRFS_I(inode)->root;
4412 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4413 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4414 				      btrfs_ino(inode), num_bytes, 0);
4415 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4416 }
4417 
4418 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4419 				struct btrfs_pending_snapshot *pending)
4420 {
4421 	struct btrfs_root *root = pending->root;
4422 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4423 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4424 	/*
4425 	 * two for root back/forward refs, two for directory entries,
4426 	 * one for root of the snapshot and one for parent inode.
4427 	 */
4428 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4429 	dst_rsv->space_info = src_rsv->space_info;
4430 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4431 }
4432 
4433 /**
4434  * drop_outstanding_extent - drop an outstanding extent
4435  * @inode: the inode we're dropping the extent for
4436  *
4437  * This is called when we are freeing up an outstanding extent, either
4438  * after an error or after an extent is written.  This will return the number of
4439  * reserved extents that need to be freed.  This must be called with
4440  * BTRFS_I(inode)->lock held.
4441  */
4442 static unsigned drop_outstanding_extent(struct inode *inode)
4443 {
4444 	unsigned drop_inode_space = 0;
4445 	unsigned dropped_extents = 0;
4446 
4447 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4448 	BTRFS_I(inode)->outstanding_extents--;
4449 
4450 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4451 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4452 			       &BTRFS_I(inode)->runtime_flags))
4453 		drop_inode_space = 1;
4454 
4455 	/*
4456 	 * If we have at least as many outstanding extents as we have
4457 	 * reserved then we need to leave the reserved extents count alone.
4458 	 */
4459 	if (BTRFS_I(inode)->outstanding_extents >=
4460 	    BTRFS_I(inode)->reserved_extents)
4461 		return drop_inode_space;
4462 
4463 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4464 		BTRFS_I(inode)->outstanding_extents;
4465 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4466 	return dropped_extents + drop_inode_space;
4467 }
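
/*
 * Editor's example (not original text): if an inode has 3 outstanding
 * extents but 5 reserved, dropping one extent leaves 2 outstanding and
 * trims reserved_extents down to 2, returning 3 so the caller can free
 * the metadata that was held for the surplus reservations.
 */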
4468 
4469 /**
4470  * calc_csum_metadata_size - return the amount of metadata space that must be
4471  *	reserved/freed for the given bytes.
4472  * @inode: the inode we're manipulating
4473  * @num_bytes: the number of bytes in question
4474  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4475  *
4476  * This adjusts the number of csum_bytes in the inode and then returns the
4477  * correct amount of metadata that must either be reserved or freed.  We
4478  * calculate how many checksums we can fit into one leaf and then divide the
4479  * number of bytes that will need to be checksumed by this value to figure out
4480  * how many checksums will be required.  If we are adding bytes then the number
4481  * may go up and we will return the number of additional bytes that must be
4482  * reserved.  If it is going down we will return the number of bytes that must
4483  * be freed.
4484  *
4485  * This must be called with BTRFS_I(inode)->lock held.
4486  */
4487 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4488 				   int reserve)
4489 {
4490 	struct btrfs_root *root = BTRFS_I(inode)->root;
4491 	u64 csum_size;
4492 	int num_csums_per_leaf;
4493 	int num_csums;
4494 	int old_csums;
4495 
4496 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4497 	    BTRFS_I(inode)->csum_bytes == 0)
4498 		return 0;
4499 
4500 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4501 	if (reserve)
4502 		BTRFS_I(inode)->csum_bytes += num_bytes;
4503 	else
4504 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4505 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4506 	num_csums_per_leaf = (int)div64_u64(csum_size,
4507 					    sizeof(struct btrfs_csum_item) +
4508 					    sizeof(struct btrfs_disk_key));
4509 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4510 	num_csums = num_csums + num_csums_per_leaf - 1;
4511 	num_csums = num_csums / num_csums_per_leaf;
4512 
4513 	old_csums = old_csums + num_csums_per_leaf - 1;
4514 	old_csums = old_csums / num_csums_per_leaf;
4515 
4516 	/* No change, no need to reserve more */
4517 	if (old_csums == num_csums)
4518 		return 0;
4519 
4520 	if (reserve)
4521 		return btrfs_calc_trans_metadata_size(root,
4522 						      num_csums - old_csums);
4523 
4524 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4525 }
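
/*
 * Editor's example (hypothetical numbers): with 4KiB sectors, adding
 * 1MiB of new data grows csum_bytes by 256 blocks worth of csums.  If
 * that pushes the projected csum count across a leaf boundary -- say
 * from one leaf to two -- the function returns the metadata size of
 * one extra item to reserve; if the leaf count is unchanged it
 * returns 0.
 */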
4526 
4527 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4528 {
4529 	struct btrfs_root *root = BTRFS_I(inode)->root;
4530 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4531 	u64 to_reserve = 0;
4532 	u64 csum_bytes;
4533 	unsigned nr_extents = 0;
4534 	int extra_reserve = 0;
4535 	int flush = 1;
4536 	int ret;
4537 
4538 	/* Need to be holding the i_mutex here if we aren't free space cache */
4539 	if (btrfs_is_free_space_inode(inode))
4540 		flush = 0;
4541 
4542 	if (flush && btrfs_transaction_in_commit(root->fs_info))
4543 		schedule_timeout(1);
4544 
4545 	mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4546 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 
4548 	spin_lock(&BTRFS_I(inode)->lock);
4549 	BTRFS_I(inode)->outstanding_extents++;
4550 
4551 	if (BTRFS_I(inode)->outstanding_extents >
4552 	    BTRFS_I(inode)->reserved_extents)
4553 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4554 			BTRFS_I(inode)->reserved_extents;
4555 
4556 	/*
4557 	 * Add an item to reserve for updating the inode when we complete the
4558 	 * delalloc io.
4559 	 */
4560 	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4561 		      &BTRFS_I(inode)->runtime_flags)) {
4562 		nr_extents++;
4563 		extra_reserve = 1;
4564 	}
4565 
4566 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4567 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4568 	csum_bytes = BTRFS_I(inode)->csum_bytes;
4569 	spin_unlock(&BTRFS_I(inode)->lock);
4570 
4571 	if (root->fs_info->quota_enabled) {
4572 		ret = btrfs_qgroup_reserve(root, num_bytes +
4573 					   nr_extents * root->leafsize);
4574 		if (ret) {
4575 			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 			return ret;
4577 		}
4578 	}
4579 
4580 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4581 	if (ret) {
4582 		u64 to_free = 0;
4583 		unsigned dropped;
4584 
4585 		spin_lock(&BTRFS_I(inode)->lock);
4586 		dropped = drop_outstanding_extent(inode);
4587 		/*
4588 		 * If the inode's csum_bytes is the same as the original
4589 		 * csum_bytes then we know we haven't raced with any free()ers
4590 		 * so we can just reduce our inode's csum bytes and carry on.
4591 		 * Otherwise we have to do the normal free thing to account for
4592 		 * the case that the free side didn't free up its reserve
4593 		 * because of this outstanding reservation.
4594 		 */
4595 		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4596 			calc_csum_metadata_size(inode, num_bytes, 0);
4597 		else
4598 			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4599 		spin_unlock(&BTRFS_I(inode)->lock);
4600 		if (dropped)
4601 			to_free += btrfs_calc_trans_metadata_size(root, dropped);
4602 
4603 		if (to_free) {
4604 			btrfs_block_rsv_release(root, block_rsv, to_free);
4605 			trace_btrfs_space_reservation(root->fs_info,
4606 						      "delalloc",
4607 						      btrfs_ino(inode),
4608 						      to_free, 0);
4609 		}
4610 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 		return ret;
4612 	}
4613 
4614 	spin_lock(&BTRFS_I(inode)->lock);
4615 	if (extra_reserve) {
4616 		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4617 			&BTRFS_I(inode)->runtime_flags);
4618 		nr_extents--;
4619 	}
4620 	BTRFS_I(inode)->reserved_extents += nr_extents;
4621 	spin_unlock(&BTRFS_I(inode)->lock);
4622 	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 
4624 	if (to_reserve)
4625 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
4626 					      btrfs_ino(inode), to_reserve, 1);
4627 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4628 
4629 	return 0;
4630 }
4631 
4632 /**
4633  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4634  * @inode: the inode to release the reservation for
4635  * @num_bytes: the number of bytes we're releasing
4636  *
4637  * This will release the metadata reservation for an inode.  This can be called
4638  * once we complete IO for a given set of bytes to release their metadata
4639  * reservations.
4640  */
4641 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4642 {
4643 	struct btrfs_root *root = BTRFS_I(inode)->root;
4644 	u64 to_free = 0;
4645 	unsigned dropped;
4646 
4647 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4648 	spin_lock(&BTRFS_I(inode)->lock);
4649 	dropped = drop_outstanding_extent(inode);
4650 
4651 	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4652 	spin_unlock(&BTRFS_I(inode)->lock);
4653 	if (dropped > 0)
4654 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4655 
4656 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
4657 				      btrfs_ino(inode), to_free, 0);
4658 	if (root->fs_info->quota_enabled) {
4659 		btrfs_qgroup_free(root, num_bytes +
4660 					dropped * root->leafsize);
4661 	}
4662 
4663 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4664 				to_free);
4665 }
4666 
4667 /**
4668  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4669  * @inode: inode we're writing to
4670  * @num_bytes: the number of bytes we want to allocate
4671  *
4672  * This will do the following things
4673  *
4674  * o reserve space in the data space info for num_bytes
4675  * o reserve space in the metadata space info based on number of outstanding
4676  *   extents and how much csums will be needed
4677  * o add to the inode's ->delalloc_bytes
4678  * o add it to the fs_info's delalloc inodes list.
4679  *
4680  * This will return 0 for success and -ENOSPC if there is no space left.
4681  */
4682 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4683 {
4684 	int ret;
4685 
4686 	ret = btrfs_check_data_free_space(inode, num_bytes);
4687 	if (ret)
4688 		return ret;
4689 
4690 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4691 	if (ret) {
4692 		btrfs_free_reserved_data_space(inode, num_bytes);
4693 		return ret;
4694 	}
4695 
4696 	return 0;
4697 }
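
/*
 * Editor's usage sketch (illustrative; do_the_write() is a
 * hypothetical placeholder for the caller's work): reserve before
 * dirtying pages and unwind on failure --
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write();
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, num_bytes);
 *
 * On success the reservation is consumed as the delalloc IO completes.
 */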
4698 
4699 /**
4700  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4701  * @inode: inode we're releasing space for
4702  * @num_bytes: the number of bytes we want to free up
4703  *
4704  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4705  * called in the case that we don't need the metadata AND data reservations
4706  * anymore, e.g. if there is an error or we insert an inline extent.
4707  *
4708  * This function will release the metadata space that was not used and will
4709  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4710  * list if there are no delalloc bytes left.
4711  */
4712 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4713 {
4714 	btrfs_delalloc_release_metadata(inode, num_bytes);
4715 	btrfs_free_reserved_data_space(inode, num_bytes);
4716 }
4717 
4718 static int update_block_group(struct btrfs_trans_handle *trans,
4719 			      struct btrfs_root *root,
4720 			      u64 bytenr, u64 num_bytes, int alloc)
4721 {
4722 	struct btrfs_block_group_cache *cache = NULL;
4723 	struct btrfs_fs_info *info = root->fs_info;
4724 	u64 total = num_bytes;
4725 	u64 old_val;
4726 	u64 byte_in_group;
4727 	int factor;
4728 
4729 	/* block accounting for super block */
4730 	spin_lock(&info->delalloc_lock);
4731 	old_val = btrfs_super_bytes_used(info->super_copy);
4732 	if (alloc)
4733 		old_val += num_bytes;
4734 	else
4735 		old_val -= num_bytes;
4736 	btrfs_set_super_bytes_used(info->super_copy, old_val);
4737 	spin_unlock(&info->delalloc_lock);
4738 
4739 	while (total) {
4740 		cache = btrfs_lookup_block_group(info, bytenr);
4741 		if (!cache)
4742 			return -ENOENT;
4743 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4744 				    BTRFS_BLOCK_GROUP_RAID1 |
4745 				    BTRFS_BLOCK_GROUP_RAID10))
4746 			factor = 2;
4747 		else
4748 			factor = 1;
4749 		/*
4750 		 * If this block group has free space cache written out, we
4751 		 * need to make sure to load it if we are removing space.  This
4752 		 * is because we need the unpinning stage to actually add the
4753 		 * space back to the block group, otherwise we will leak space.
4754 		 */
4755 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
4756 			cache_block_group(cache, trans, NULL, 1);
4757 
4758 		byte_in_group = bytenr - cache->key.objectid;
4759 		WARN_ON(byte_in_group > cache->key.offset);
4760 
4761 		spin_lock(&cache->space_info->lock);
4762 		spin_lock(&cache->lock);
4763 
4764 		if (btrfs_test_opt(root, SPACE_CACHE) &&
4765 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
4766 			cache->disk_cache_state = BTRFS_DC_CLEAR;
4767 
4768 		cache->dirty = 1;
4769 		old_val = btrfs_block_group_used(&cache->item);
4770 		num_bytes = min(total, cache->key.offset - byte_in_group);
4771 		if (alloc) {
4772 			old_val += num_bytes;
4773 			btrfs_set_block_group_used(&cache->item, old_val);
4774 			cache->reserved -= num_bytes;
4775 			cache->space_info->bytes_reserved -= num_bytes;
4776 			cache->space_info->bytes_used += num_bytes;
4777 			cache->space_info->disk_used += num_bytes * factor;
4778 			spin_unlock(&cache->lock);
4779 			spin_unlock(&cache->space_info->lock);
4780 		} else {
4781 			old_val -= num_bytes;
4782 			btrfs_set_block_group_used(&cache->item, old_val);
4783 			cache->pinned += num_bytes;
4784 			cache->space_info->bytes_pinned += num_bytes;
4785 			cache->space_info->bytes_used -= num_bytes;
4786 			cache->space_info->disk_used -= num_bytes * factor;
4787 			spin_unlock(&cache->lock);
4788 			spin_unlock(&cache->space_info->lock);
4789 
4790 			set_extent_dirty(info->pinned_extents,
4791 					 bytenr, bytenr + num_bytes - 1,
4792 					 GFP_NOFS | __GFP_NOFAIL);
4793 		}
4794 		btrfs_put_block_group(cache);
4795 		total -= num_bytes;
4796 		bytenr += num_bytes;
4797 	}
4798 	return 0;
4799 }
4800 
4801 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4802 {
4803 	struct btrfs_block_group_cache *cache;
4804 	u64 bytenr;
4805 
4806 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4807 	if (!cache)
4808 		return 0;
4809 
4810 	bytenr = cache->key.objectid;
4811 	btrfs_put_block_group(cache);
4812 
4813 	return bytenr;
4814 }
4815 
4816 static int pin_down_extent(struct btrfs_root *root,
4817 			   struct btrfs_block_group_cache *cache,
4818 			   u64 bytenr, u64 num_bytes, int reserved)
4819 {
4820 	spin_lock(&cache->space_info->lock);
4821 	spin_lock(&cache->lock);
4822 	cache->pinned += num_bytes;
4823 	cache->space_info->bytes_pinned += num_bytes;
4824 	if (reserved) {
4825 		cache->reserved -= num_bytes;
4826 		cache->space_info->bytes_reserved -= num_bytes;
4827 	}
4828 	spin_unlock(&cache->lock);
4829 	spin_unlock(&cache->space_info->lock);
4830 
4831 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4832 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4833 	return 0;
4834 }
4835 
4836 /*
4837  * this function must be called within transaction
4838  */
4839 int btrfs_pin_extent(struct btrfs_root *root,
4840 		     u64 bytenr, u64 num_bytes, int reserved)
4841 {
4842 	struct btrfs_block_group_cache *cache;
4843 
4844 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4845 	BUG_ON(!cache); /* Logic error */
4846 
4847 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4848 
4849 	btrfs_put_block_group(cache);
4850 	return 0;
4851 }
4852 
4853 /*
4854  * this function must be called within transaction
4855  */
4856 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4857 				    struct btrfs_root *root,
4858 				    u64 bytenr, u64 num_bytes)
4859 {
4860 	struct btrfs_block_group_cache *cache;
4861 
4862 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4863 	BUG_ON(!cache); /* Logic error */
4864 
4865 	/*
4866 	 * pull in the free space cache (if any) so that our pin
4867 	 * removes the free space from the cache.  We have load_only set
4868 	 * to one because the slow code to read in the free extents does check
4869 	 * the pinned extents.
4870 	 */
4871 	cache_block_group(cache, trans, root, 1);
4872 
4873 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
4874 
4875 	/* remove us from the free space cache (if we're there at all) */
4876 	btrfs_remove_free_space(cache, bytenr, num_bytes);
4877 	btrfs_put_block_group(cache);
4878 	return 0;
4879 }
4880 
4881 /**
4882  * btrfs_update_reserved_bytes - update the block_group and space info counters
4883  * @cache:	The cache we are manipulating
4884  * @num_bytes:	The number of bytes in question
4885  * @reserve:	One of the reservation enums
4886  *
4887  * This is called by the allocator when it reserves space, or by somebody who is
4888  * freeing space that was never actually used on disk.  For example if you
4889  * reserve some space for a new leaf in transaction A and before transaction A
4890  * commits you free that leaf, you call this with reserve set to 0 in order to
4891  * clear the reservation.
4892  *
4893  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4894  * ENOSPC accounting.  For data we handle the reservation through clearing the
4895  * delalloc bits in the io_tree.  We have to do this since we could end up
4896  * allocating less disk space for the amount of data we have reserved in the
4897  * case of compression.
4898  *
4899  * If this is a reservation and the block group has become read only we cannot
4900  * make the reservation and return -EAGAIN, otherwise this function always
4901  * succeeds.
4902  */
4903 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4904 				       u64 num_bytes, int reserve)
4905 {
4906 	struct btrfs_space_info *space_info = cache->space_info;
4907 	int ret = 0;
4908 
4909 	spin_lock(&space_info->lock);
4910 	spin_lock(&cache->lock);
4911 	if (reserve != RESERVE_FREE) {
4912 		if (cache->ro) {
4913 			ret = -EAGAIN;
4914 		} else {
4915 			cache->reserved += num_bytes;
4916 			space_info->bytes_reserved += num_bytes;
4917 			if (reserve == RESERVE_ALLOC) {
4918 				trace_btrfs_space_reservation(cache->fs_info,
4919 						"space_info", space_info->flags,
4920 						num_bytes, 0);
4921 				space_info->bytes_may_use -= num_bytes;
4922 			}
4923 		}
4924 	} else {
4925 		if (cache->ro)
4926 			space_info->bytes_readonly += num_bytes;
4927 		cache->reserved -= num_bytes;
4928 		space_info->bytes_reserved -= num_bytes;
4929 		space_info->reservation_progress++;
4930 	}
4931 	spin_unlock(&cache->lock);
4932 	spin_unlock(&space_info->lock);
4933 	return ret;
4934 }
4935 
4936 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4937 				struct btrfs_root *root)
4938 {
4939 	struct btrfs_fs_info *fs_info = root->fs_info;
4940 	struct btrfs_caching_control *next;
4941 	struct btrfs_caching_control *caching_ctl;
4942 	struct btrfs_block_group_cache *cache;
4943 
4944 	down_write(&fs_info->extent_commit_sem);
4945 
4946 	list_for_each_entry_safe(caching_ctl, next,
4947 				 &fs_info->caching_block_groups, list) {
4948 		cache = caching_ctl->block_group;
4949 		if (block_group_cache_done(cache)) {
4950 			cache->last_byte_to_unpin = (u64)-1;
4951 			list_del_init(&caching_ctl->list);
4952 			put_caching_control(caching_ctl);
4953 		} else {
4954 			cache->last_byte_to_unpin = caching_ctl->progress;
4955 		}
4956 	}
4957 
4958 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4959 		fs_info->pinned_extents = &fs_info->freed_extents[1];
4960 	else
4961 		fs_info->pinned_extents = &fs_info->freed_extents[0];
4962 
4963 	up_write(&fs_info->extent_commit_sem);
4964 
4965 	update_global_block_rsv(fs_info);
4966 }
4967 
4968 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969 {
4970 	struct btrfs_fs_info *fs_info = root->fs_info;
4971 	struct btrfs_block_group_cache *cache = NULL;
4972 	u64 len;
4973 
4974 	while (start <= end) {
4975 		if (!cache ||
4976 		    start >= cache->key.objectid + cache->key.offset) {
4977 			if (cache)
4978 				btrfs_put_block_group(cache);
4979 			cache = btrfs_lookup_block_group(fs_info, start);
4980 			BUG_ON(!cache); /* Logic error */
4981 		}
4982 
4983 		len = cache->key.objectid + cache->key.offset - start;
4984 		len = min(len, end + 1 - start);
4985 
4986 		if (start < cache->last_byte_to_unpin) {
4987 			len = min(len, cache->last_byte_to_unpin - start);
4988 			btrfs_add_free_space(cache, start, len);
4989 		}
4990 
4991 		start += len;
4992 
4993 		spin_lock(&cache->space_info->lock);
4994 		spin_lock(&cache->lock);
4995 		cache->pinned -= len;
4996 		cache->space_info->bytes_pinned -= len;
4997 		if (cache->ro)
4998 			cache->space_info->bytes_readonly += len;
4999 		spin_unlock(&cache->lock);
5000 		spin_unlock(&cache->space_info->lock);
5001 	}
5002 
5003 	if (cache)
5004 		btrfs_put_block_group(cache);
5005 	return 0;
5006 }
5007 
5008 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5009 			       struct btrfs_root *root)
5010 {
5011 	struct btrfs_fs_info *fs_info = root->fs_info;
5012 	struct extent_io_tree *unpin;
5013 	u64 start;
5014 	u64 end;
5015 	int ret;
5016 
5017 	if (trans->aborted)
5018 		return 0;
5019 
5020 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5021 		unpin = &fs_info->freed_extents[1];
5022 	else
5023 		unpin = &fs_info->freed_extents[0];
5024 
5025 	while (1) {
5026 		ret = find_first_extent_bit(unpin, 0, &start, &end,
5027 					    EXTENT_DIRTY, NULL);
5028 		if (ret)
5029 			break;
5030 
5031 		if (btrfs_test_opt(root, DISCARD))
5032 			ret = btrfs_discard_extent(root, start,
5033 						   end + 1 - start, NULL);
5034 
5035 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
5036 		unpin_extent_range(root, start, end);
5037 		cond_resched();
5038 	}
5039 
5040 	return 0;
5041 }
5042 
5043 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5044 				struct btrfs_root *root,
5045 				u64 bytenr, u64 num_bytes, u64 parent,
5046 				u64 root_objectid, u64 owner_objectid,
5047 				u64 owner_offset, int refs_to_drop,
5048 				struct btrfs_delayed_extent_op *extent_op)
5049 {
5050 	struct btrfs_key key;
5051 	struct btrfs_path *path;
5052 	struct btrfs_fs_info *info = root->fs_info;
5053 	struct btrfs_root *extent_root = info->extent_root;
5054 	struct extent_buffer *leaf;
5055 	struct btrfs_extent_item *ei;
5056 	struct btrfs_extent_inline_ref *iref;
5057 	int ret;
5058 	int is_data;
5059 	int extent_slot = 0;
5060 	int found_extent = 0;
5061 	int num_to_del = 1;
5062 	u32 item_size;
5063 	u64 refs;
5064 
5065 	path = btrfs_alloc_path();
5066 	if (!path)
5067 		return -ENOMEM;
5068 
5069 	path->reada = 1;
5070 	path->leave_spinning = 1;
5071 
5072 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5073 	BUG_ON(!is_data && refs_to_drop != 1);
5074 
5075 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
5076 				    bytenr, num_bytes, parent,
5077 				    root_objectid, owner_objectid,
5078 				    owner_offset);
5079 	if (ret == 0) {
5080 		extent_slot = path->slots[0];
5081 		while (extent_slot >= 0) {
5082 			btrfs_item_key_to_cpu(path->nodes[0], &key,
5083 					      extent_slot);
5084 			if (key.objectid != bytenr)
5085 				break;
5086 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5087 			    key.offset == num_bytes) {
5088 				found_extent = 1;
5089 				break;
5090 			}
5091 			if (path->slots[0] - extent_slot > 5)
5092 				break;
5093 			extent_slot--;
5094 		}
5095 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5096 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5097 		if (found_extent && item_size < sizeof(*ei))
5098 			found_extent = 0;
5099 #endif
5100 		if (!found_extent) {
5101 			BUG_ON(iref);
5102 			ret = remove_extent_backref(trans, extent_root, path,
5103 						    NULL, refs_to_drop,
5104 						    is_data);
5105 			if (ret) {
5106 				btrfs_abort_transaction(trans, extent_root, ret);
5107 				goto out;
5108 			}
5109 			btrfs_release_path(path);
5110 			path->leave_spinning = 1;
5111 
5112 			key.objectid = bytenr;
5113 			key.type = BTRFS_EXTENT_ITEM_KEY;
5114 			key.offset = num_bytes;
5115 
5116 			ret = btrfs_search_slot(trans, extent_root,
5117 						&key, path, -1, 1);
5118 			if (ret) {
5119 				printk(KERN_ERR "umm, got %d back from search"
5120 				       ", was looking for %llu\n", ret,
5121 				       (unsigned long long)bytenr);
5122 				if (ret > 0)
5123 					btrfs_print_leaf(extent_root,
5124 							 path->nodes[0]);
5125 			}
5126 			if (ret < 0) {
5127 				btrfs_abort_transaction(trans, extent_root, ret);
5128 				goto out;
5129 			}
5130 			extent_slot = path->slots[0];
5131 		}
5132 	} else if (ret == -ENOENT) {
5133 		btrfs_print_leaf(extent_root, path->nodes[0]);
5134 		WARN_ON(1);
5135 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5136 		       "parent %llu root %llu  owner %llu offset %llu\n",
5137 		       (unsigned long long)bytenr,
5138 		       (unsigned long long)parent,
5139 		       (unsigned long long)root_objectid,
5140 		       (unsigned long long)owner_objectid,
5141 		       (unsigned long long)owner_offset);
5142 	} else {
5143 		btrfs_abort_transaction(trans, extent_root, ret);
5144 		goto out;
5145 	}
5146 
5147 	leaf = path->nodes[0];
5148 	item_size = btrfs_item_size_nr(leaf, extent_slot);
5149 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5150 	if (item_size < sizeof(*ei)) {
5151 		BUG_ON(found_extent || extent_slot != path->slots[0]);
5152 		ret = convert_extent_item_v0(trans, extent_root, path,
5153 					     owner_objectid, 0);
5154 		if (ret < 0) {
5155 			btrfs_abort_transaction(trans, extent_root, ret);
5156 			goto out;
5157 		}
5158 
5159 		btrfs_release_path(path);
5160 		path->leave_spinning = 1;
5161 
5162 		key.objectid = bytenr;
5163 		key.type = BTRFS_EXTENT_ITEM_KEY;
5164 		key.offset = num_bytes;
5165 
5166 		ret = btrfs_search_slot(trans, extent_root, &key, path,
5167 					-1, 1);
5168 		if (ret) {
5169 			printk(KERN_ERR "umm, got %d back from search"
5170 			       ", was looking for %llu\n", ret,
5171 			       (unsigned long long)bytenr);
5172 			btrfs_print_leaf(extent_root, path->nodes[0]);
5173 		}
5174 		if (ret < 0) {
5175 			btrfs_abort_transaction(trans, extent_root, ret);
5176 			goto out;
5177 		}
5178 
5179 		extent_slot = path->slots[0];
5180 		leaf = path->nodes[0];
5181 		item_size = btrfs_item_size_nr(leaf, extent_slot);
5182 	}
5183 #endif
5184 	BUG_ON(item_size < sizeof(*ei));
5185 	ei = btrfs_item_ptr(leaf, extent_slot,
5186 			    struct btrfs_extent_item);
5187 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5188 		struct btrfs_tree_block_info *bi;
5189 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5190 		bi = (struct btrfs_tree_block_info *)(ei + 1);
5191 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5192 	}
5193 
5194 	refs = btrfs_extent_refs(leaf, ei);
5195 	BUG_ON(refs < refs_to_drop);
5196 	refs -= refs_to_drop;
5197 
5198 	if (refs > 0) {
5199 		if (extent_op)
5200 			__run_delayed_extent_op(extent_op, leaf, ei);
5201 		/*
5202 		 * In the case of inline back ref, reference count will
5203 		 * be updated by remove_extent_backref
5204 		 */
5205 		if (iref) {
5206 			BUG_ON(!found_extent);
5207 		} else {
5208 			btrfs_set_extent_refs(leaf, ei, refs);
5209 			btrfs_mark_buffer_dirty(leaf);
5210 		}
5211 		if (found_extent) {
5212 			ret = remove_extent_backref(trans, extent_root, path,
5213 						    iref, refs_to_drop,
5214 						    is_data);
5215 			if (ret) {
5216 				btrfs_abort_transaction(trans, extent_root, ret);
5217 				goto out;
5218 			}
5219 		}
5220 	} else {
5221 		if (found_extent) {
5222 			BUG_ON(is_data && refs_to_drop !=
5223 			       extent_data_ref_count(root, path, iref));
5224 			if (iref) {
5225 				BUG_ON(path->slots[0] != extent_slot);
5226 			} else {
5227 				BUG_ON(path->slots[0] != extent_slot + 1);
5228 				path->slots[0] = extent_slot;
5229 				num_to_del = 2;
5230 			}
5231 		}
5232 
5233 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5234 				      num_to_del);
5235 		if (ret) {
5236 			btrfs_abort_transaction(trans, extent_root, ret);
5237 			goto out;
5238 		}
5239 		btrfs_release_path(path);
5240 
5241 		if (is_data) {
5242 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5243 			if (ret) {
5244 				btrfs_abort_transaction(trans, extent_root, ret);
5245 				goto out;
5246 			}
5247 		}
5248 
5249 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5250 		if (ret) {
5251 			btrfs_abort_transaction(trans, extent_root, ret);
5252 			goto out;
5253 		}
5254 	}
5255 out:
5256 	btrfs_free_path(path);
5257 	return ret;
5258 }
5259 
5260 /*
5261  * when we free a block, it is possible (and likely) that we free the last
5262  * delayed ref for that extent as well.  This searches the delayed ref tree for
5263  * a given extent, and if there are no other delayed refs to be processed, it
5264  * removes it from the tree.
5265  */
5266 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5267 				      struct btrfs_root *root, u64 bytenr)
5268 {
5269 	struct btrfs_delayed_ref_head *head;
5270 	struct btrfs_delayed_ref_root *delayed_refs;
5271 	struct btrfs_delayed_ref_node *ref;
5272 	struct rb_node *node;
5273 	int ret = 0;
5274 
5275 	delayed_refs = &trans->transaction->delayed_refs;
5276 	spin_lock(&delayed_refs->lock);
5277 	head = btrfs_find_delayed_ref_head(trans, bytenr);
5278 	if (!head)
5279 		goto out;
5280 
5281 	node = rb_prev(&head->node.rb_node);
5282 	if (!node)
5283 		goto out;
5284 
5285 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5286 
5287 	/* there are still entries for this ref, we can't drop it */
5288 	if (ref->bytenr == bytenr)
5289 		goto out;
5290 
5291 	if (head->extent_op) {
5292 		if (!head->must_insert_reserved)
5293 			goto out;
5294 		kfree(head->extent_op);
5295 		head->extent_op = NULL;
5296 	}
5297 
5298 	/*
5299 	 * waiting for the lock here would deadlock.  If someone else has it
5300 	 * locked they are already in the process of dropping it anyway
5301 	 */
5302 	if (!mutex_trylock(&head->mutex))
5303 		goto out;
5304 
5305 	/*
5306 	 * at this point we have a head with no other entries.  Go
5307 	 * ahead and process it.
5308 	 */
5309 	head->node.in_tree = 0;
5310 	rb_erase(&head->node.rb_node, &delayed_refs->root);
5311 
5312 	delayed_refs->num_entries--;
5313 
5314 	/*
5315 	 * we don't take a ref on the node because we're removing it from the
5316 	 * tree, so we just steal the ref the tree was holding.
5317 	 */
5318 	delayed_refs->num_heads--;
5319 	if (list_empty(&head->cluster))
5320 		delayed_refs->num_heads_ready--;
5321 
5322 	list_del_init(&head->cluster);
5323 	spin_unlock(&delayed_refs->lock);
5324 
5325 	BUG_ON(head->extent_op);
5326 	if (head->must_insert_reserved)
5327 		ret = 1;
5328 
5329 	mutex_unlock(&head->mutex);
5330 	btrfs_put_delayed_ref(&head->node);
5331 	return ret;
5332 out:
5333 	spin_unlock(&delayed_refs->lock);
5334 	return 0;
5335 }
5336 
5337 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5338 			   struct btrfs_root *root,
5339 			   struct extent_buffer *buf,
5340 			   u64 parent, int last_ref)
5341 {
5342 	struct btrfs_block_group_cache *cache = NULL;
5343 	int ret;
5344 
5345 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5346 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5347 					buf->start, buf->len,
5348 					parent, root->root_key.objectid,
5349 					btrfs_header_level(buf),
5350 					BTRFS_DROP_DELAYED_REF, NULL, 0);
5351 		BUG_ON(ret); /* -ENOMEM */
5352 	}
5353 
5354 	if (!last_ref)
5355 		return;
5356 
5357 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5358 
5359 	if (btrfs_header_generation(buf) == trans->transid) {
5360 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5361 			ret = check_ref_cleanup(trans, root, buf->start);
5362 			if (!ret)
5363 				goto out;
5364 		}
5365 
5366 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5367 			pin_down_extent(root, cache, buf->start, buf->len, 1);
5368 			goto out;
5369 		}
5370 
5371 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5372 
5373 		btrfs_add_free_space(cache, buf->start, buf->len);
5374 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5375 	}
5376 out:
5377 	/*
5378 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5379 	 * anymore.
5380 	 */
5381 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5382 	btrfs_put_block_group(cache);
5383 }
5384 
5385 /* Can return -ENOMEM */
5386 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5387 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5388 		      u64 owner, u64 offset, int for_cow)
5389 {
5390 	int ret;
5391 	struct btrfs_fs_info *fs_info = root->fs_info;
5392 
5393 	/*
5394 	 * tree log blocks never actually go into the extent allocation
5395 	 * tree, just update pinning info and exit early.
5396 	 */
5397 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5398 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5399 		/* unlocks the pinned mutex */
5400 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
5401 		ret = 0;
5402 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5403 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5404 					num_bytes,
5405 					parent, root_objectid, (int)owner,
5406 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5407 	} else {
5408 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5409 						num_bytes,
5410 						parent, root_objectid, owner,
5411 						offset, BTRFS_DROP_DELAYED_REF,
5412 						NULL, for_cow);
5413 	}
5414 	return ret;
5415 }
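/*
 * Rough decision table for the branches above:
 *
 *	root_objectid == BTRFS_TREE_LOG_OBJECTID  -> pin only, no delayed ref
 *	owner < BTRFS_FIRST_FREE_OBJECTID         -> delayed tree ref
 *	owner >= BTRFS_FIRST_FREE_OBJECTID        -> delayed data ref
 */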
5416 
5417 static u64 stripe_align(struct btrfs_root *root, u64 val)
5418 {
5419 	u64 mask = ((u64)root->stripesize - 1);
5420 	u64 ret = (val + mask) & ~mask;
5421 	return ret;
5422 }
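/*
 * Worked example of the round-up above, assuming stripesize is a power of
 * two (which the mask trick requires): with stripesize 65536, mask is
 * 0xffff, so
 *
 *	stripe_align(root, 0x12345) == (0x12345 + 0xffff) & ~0xffff == 0x20000
 *	stripe_align(root, 0x20000) == 0x20000 (already aligned)
 */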
5423 
5424 /*
5425  * when we wait for progress in the block group caching, it's because
5426  * our allocation attempt failed at least once.  So, we must sleep
5427  * and let some progress happen before we try again.
5428  *
5429  * This function will sleep at least once waiting for new free space to
5430  * show up, and then it will check the block group free space numbers
5431  * for our min num_bytes.  Another option is to have it go ahead
5432  * and look in the rbtree for a free extent of a given size, but this
5433  * is a good start.
5434  */
5435 static noinline int
5436 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5437 				u64 num_bytes)
5438 {
5439 	struct btrfs_caching_control *caching_ctl;
5440 	DEFINE_WAIT(wait);
5441 
5442 	caching_ctl = get_caching_control(cache);
5443 	if (!caching_ctl)
5444 		return 0;
5445 
5446 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5447 		   (cache->free_space_ctl->free_space >= num_bytes));
5448 
5449 	put_caching_control(caching_ctl);
5450 	return 0;
5451 }
5452 
5453 static noinline int
5454 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5455 {
5456 	struct btrfs_caching_control *caching_ctl;
5457 	DEFINE_WAIT(wait);
5458 
5459 	caching_ctl = get_caching_control(cache);
5460 	if (!caching_ctl)
5461 		return 0;
5462 
5463 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
5464 
5465 	put_caching_control(caching_ctl);
5466 	return 0;
5467 }
5468 
5469 static int __get_block_group_index(u64 flags)
5470 {
5471 	int index;
5472 
5473 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
5474 		index = 0;
5475 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5476 		index = 1;
5477 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
5478 		index = 2;
5479 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5480 		index = 3;
5481 	else
5482 		index = 4;
5483 
5484 	return index;
5485 }
5486 
5487 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488 {
5489 	return __get_block_group_index(cache->flags);
5490 }
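/*
 * The index picks which space_info->block_groups[] list to search, so for
 * example a RAID1 group always lands on list 1 and a plain single-profile
 * group falls through to list 4:
 *
 *	RAID10 -> 0, RAID1 -> 1, DUP -> 2, RAID0 -> 3, single -> 4
 */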
5491 
5492 enum btrfs_loop_type {
5493 	LOOP_CACHING_NOWAIT = 0,
5494 	LOOP_CACHING_WAIT = 1,
5495 	LOOP_ALLOC_CHUNK = 2,
5496 	LOOP_NO_EMPTY_SIZE = 3,
5497 };
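/*
 * find_free_extent() walks these stages in order.  For example, when every
 * block group is still caching, an allocation roughly progresses:
 *
 *	LOOP_CACHING_NOWAIT -> scan without blocking on the caching threads
 *	LOOP_CACHING_WAIT   -> rescan, waiting for caching progress
 *	LOOP_ALLOC_CHUNK    -> force a new chunk allocation and rescan
 *	LOOP_NO_EMPTY_SIZE  -> last try with empty_size/empty_cluster == 0
 *
 * Only a failure at the final stage produces -ENOSPC.
 */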
5498 
5499 /*
5500  * walks the btree of allocated extents and finds a hole of a given size.
5501  * The key ins is changed to record the hole:
5502  * ins->objectid == block start
5503  * ins->flags == BTRFS_EXTENT_ITEM_KEY
5504  * ins->offset == number of blocks
5505  * Any available blocks before search_start are skipped.
5506  */
5507 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5508 				     struct btrfs_root *orig_root,
5509 				     u64 num_bytes, u64 empty_size,
5510 				     u64 hint_byte, struct btrfs_key *ins,
5511 				     u64 data)
5512 {
5513 	int ret = 0;
5514 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5515 	struct btrfs_free_cluster *last_ptr = NULL;
5516 	struct btrfs_block_group_cache *block_group = NULL;
5517 	struct btrfs_block_group_cache *used_block_group;
5518 	u64 search_start = 0;
5519 	int empty_cluster = 2 * 1024 * 1024;
5520 	struct btrfs_space_info *space_info;
5521 	int loop = 0;
5522 	int index = 0;
5523 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5524 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5525 	bool found_uncached_bg = false;
5526 	bool failed_cluster_refill = false;
5527 	bool failed_alloc = false;
5528 	bool use_cluster = true;
5529 	bool have_caching_bg = false;
5530 
5531 	WARN_ON(num_bytes < root->sectorsize);
5532 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5533 	ins->objectid = 0;
5534 	ins->offset = 0;
5535 
5536 	trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5537 
5538 	space_info = __find_space_info(root->fs_info, data);
5539 	if (!space_info) {
5540 		printk(KERN_ERR "No space info for %llu\n", data);
5541 		return -ENOSPC;
5542 	}
5543 
5544 	/*
5545 	 * If the space info is for both data and metadata it means we have a
5546 	 * small filesystem and we can't use the clustering stuff.
5547 	 */
5548 	if (btrfs_mixed_space_info(space_info))
5549 		use_cluster = false;
5550 
5551 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5552 		last_ptr = &root->fs_info->meta_alloc_cluster;
5553 		if (!btrfs_test_opt(root, SSD))
5554 			empty_cluster = 64 * 1024;
5555 	}
5556 
5557 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5558 	    btrfs_test_opt(root, SSD)) {
5559 		last_ptr = &root->fs_info->data_alloc_cluster;
5560 	}
5561 
5562 	if (last_ptr) {
5563 		spin_lock(&last_ptr->lock);
5564 		if (last_ptr->block_group)
5565 			hint_byte = last_ptr->window_start;
5566 		spin_unlock(&last_ptr->lock);
5567 	}
5568 
5569 	search_start = max(search_start, first_logical_byte(root, 0));
5570 	search_start = max(search_start, hint_byte);
5571 
5572 	if (!last_ptr)
5573 		empty_cluster = 0;
5574 
5575 	if (search_start == hint_byte) {
5576 		block_group = btrfs_lookup_block_group(root->fs_info,
5577 						       search_start);
5578 		used_block_group = block_group;
5579 		/*
5580 		 * we don't want to use the block group if it doesn't match our
5581 		 * allocation bits, or if it's not cached.
5582 		 *
5583 		 * However if we are re-searching with an ideal block group
5584 		 * picked out then we don't care that the block group is cached.
5585 		 */
5586 		if (block_group && block_group_bits(block_group, data) &&
5587 		    block_group->cached != BTRFS_CACHE_NO) {
5588 			down_read(&space_info->groups_sem);
5589 			if (list_empty(&block_group->list) ||
5590 			    block_group->ro) {
5591 				/*
5592 				 * someone is removing this block group,
5593 				 * we can't jump into the have_block_group
5594 				 * target because our list pointers are not
5595 				 * valid
5596 				 */
5597 				btrfs_put_block_group(block_group);
5598 				up_read(&space_info->groups_sem);
5599 			} else {
5600 				index = get_block_group_index(block_group);
5601 				goto have_block_group;
5602 			}
5603 		} else if (block_group) {
5604 			btrfs_put_block_group(block_group);
5605 		}
5606 	}
5607 search:
5608 	have_caching_bg = false;
5609 	down_read(&space_info->groups_sem);
5610 	list_for_each_entry(block_group, &space_info->block_groups[index],
5611 			    list) {
5612 		u64 offset;
5613 		int cached;
5614 
5615 		used_block_group = block_group;
5616 		btrfs_get_block_group(block_group);
5617 		search_start = block_group->key.objectid;
5618 
5619 		/*
5620 		 * this can happen if we end up cycling through all the
5621 		 * raid types, but we want to make sure we only allocate
5622 		 * for the proper type.
5623 		 */
5624 		if (!block_group_bits(block_group, data)) {
5625 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
5626 				    BTRFS_BLOCK_GROUP_RAID1 |
5627 				    BTRFS_BLOCK_GROUP_RAID10;
5628 
5629 			/*
5630 			 * if they asked for extra copies and this block group
5631 			 * doesn't provide them, bail.  This does allow us to
5632 			 * fill raid0 from raid1.
5633 			 */
5634 			if ((data & extra) && !(block_group->flags & extra))
5635 				goto loop;
5636 		}
5637 
5638 have_block_group:
5639 		cached = block_group_cache_done(block_group);
5640 		if (unlikely(!cached)) {
5641 			found_uncached_bg = true;
5642 			ret = cache_block_group(block_group, trans,
5643 						orig_root, 0);
5644 			BUG_ON(ret < 0);
5645 			ret = 0;
5646 		}
5647 
5648 		if (unlikely(block_group->ro))
5649 			goto loop;
5650 
5651 		/*
5652 		 * OK, we want to try and use the cluster allocator, so
5653 		 * let's look there
5654 		 */
5655 		if (last_ptr) {
5656 			/*
5657 			 * the refill lock keeps out other
5658 			 * people trying to start a new cluster
5659 			 */
5660 			spin_lock(&last_ptr->refill_lock);
5661 			used_block_group = last_ptr->block_group;
5662 			if (used_block_group != block_group &&
5663 			    (!used_block_group ||
5664 			     used_block_group->ro ||
5665 			     !block_group_bits(used_block_group, data))) {
5666 				used_block_group = block_group;
5667 				goto refill_cluster;
5668 			}
5669 
5670 			if (used_block_group != block_group)
5671 				btrfs_get_block_group(used_block_group);
5672 
5673 			offset = btrfs_alloc_from_cluster(used_block_group,
5674 			  last_ptr, num_bytes, used_block_group->key.objectid);
5675 			if (offset) {
5676 				/* we have a block, we're done */
5677 				spin_unlock(&last_ptr->refill_lock);
5678 				trace_btrfs_reserve_extent_cluster(root,
5679 					block_group, search_start, num_bytes);
5680 				goto checks;
5681 			}
5682 
5683 			WARN_ON(last_ptr->block_group != used_block_group);
5684 			if (used_block_group != block_group) {
5685 				btrfs_put_block_group(used_block_group);
5686 				used_block_group = block_group;
5687 			}
5688 refill_cluster:
5689 			BUG_ON(used_block_group != block_group);
5690 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
5691 			 * set up a new cluster, so let's just skip it
5692 			 * and let the allocator find whatever block
5693 			 * it can find.  If we reach this point, we
5694 			 * will have tried the cluster allocator
5695 			 * plenty of times and not have found
5696 			 * anything, so we are likely way too
5697 			 * fragmented for the clustering stuff to find
5698 			 * anything.
5699 			 *
5700 			 * However, if the cluster is taken from the
5701 			 * current block group, release the cluster
5702 			 * first, so that we stand a better chance of
5703 			 * succeeding in the unclustered
5704 			 * allocation.  */
5705 			if (loop >= LOOP_NO_EMPTY_SIZE &&
5706 			    last_ptr->block_group != block_group) {
5707 				spin_unlock(&last_ptr->refill_lock);
5708 				goto unclustered_alloc;
5709 			}
5710 
5711 			/*
5712 			 * this cluster didn't work out, free it and
5713 			 * start over
5714 			 */
5715 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5716 
5717 			if (loop >= LOOP_NO_EMPTY_SIZE) {
5718 				spin_unlock(&last_ptr->refill_lock);
5719 				goto unclustered_alloc;
5720 			}
5721 
5722 			/* allocate a cluster in this block group */
5723 			ret = btrfs_find_space_cluster(trans, root,
5724 					       block_group, last_ptr,
5725 					       search_start, num_bytes,
5726 					       empty_cluster + empty_size);
5727 			if (ret == 0) {
5728 				/*
5729 				 * now pull our allocation out of this
5730 				 * cluster
5731 				 */
5732 				offset = btrfs_alloc_from_cluster(block_group,
5733 						  last_ptr, num_bytes,
5734 						  search_start);
5735 				if (offset) {
5736 					/* we found one, proceed */
5737 					spin_unlock(&last_ptr->refill_lock);
5738 					trace_btrfs_reserve_extent_cluster(root,
5739 						block_group, search_start,
5740 						num_bytes);
5741 					goto checks;
5742 				}
5743 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
5744 				   && !failed_cluster_refill) {
5745 				spin_unlock(&last_ptr->refill_lock);
5746 
5747 				failed_cluster_refill = true;
5748 				wait_block_group_cache_progress(block_group,
5749 				       num_bytes + empty_cluster + empty_size);
5750 				goto have_block_group;
5751 			}
5752 
5753 			/*
5754 			 * at this point we either didn't find a cluster
5755 			 * or we weren't able to allocate a block from our
5756 			 * cluster.  Free the cluster we've been trying
5757 			 * to use, and go to the next block group
5758 			 */
5759 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5760 			spin_unlock(&last_ptr->refill_lock);
5761 			goto loop;
5762 		}
5763 
5764 unclustered_alloc:
5765 		spin_lock(&block_group->free_space_ctl->tree_lock);
5766 		if (cached &&
5767 		    block_group->free_space_ctl->free_space <
5768 		    num_bytes + empty_cluster + empty_size) {
5769 			spin_unlock(&block_group->free_space_ctl->tree_lock);
5770 			goto loop;
5771 		}
5772 		spin_unlock(&block_group->free_space_ctl->tree_lock);
5773 
5774 		offset = btrfs_find_space_for_alloc(block_group, search_start,
5775 						    num_bytes, empty_size);
5776 		/*
5777 		 * If we didn't find a chunk, and we haven't failed on this
5778 		 * block group before, and this block group is in the middle of
5779 		 * caching and we are ok with waiting, then go ahead and wait
5780 		 * for progress to be made, and set failed_alloc to true.
5781 		 *
5782 		 * If failed_alloc is true then we've already waited on this
5783 		 * block group once and should move on to the next block group.
5784 		 */
5785 		if (!offset && !failed_alloc && !cached &&
5786 		    loop > LOOP_CACHING_NOWAIT) {
5787 			wait_block_group_cache_progress(block_group,
5788 						num_bytes + empty_size);
5789 			failed_alloc = true;
5790 			goto have_block_group;
5791 		} else if (!offset) {
5792 			if (!cached)
5793 				have_caching_bg = true;
5794 			goto loop;
5795 		}
5796 checks:
5797 		search_start = stripe_align(root, offset);
5798 
5799 		/* move on to the next group */
5800 		if (search_start + num_bytes >
5801 		    used_block_group->key.objectid + used_block_group->key.offset) {
5802 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5803 			goto loop;
5804 		}
5805 
5806 		if (offset < search_start)
5807 			btrfs_add_free_space(used_block_group, offset,
5808 					     search_start - offset);
5809 		BUG_ON(offset > search_start);
5810 
5811 		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5812 						  alloc_type);
5813 		if (ret == -EAGAIN) {
5814 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5815 			goto loop;
5816 		}
5817 
5818 		/* we are all good, let's return */
5819 		ins->objectid = search_start;
5820 		ins->offset = num_bytes;
5821 
5822 		trace_btrfs_reserve_extent(orig_root, block_group,
5823 					   search_start, num_bytes);
5824 		if (used_block_group != block_group)
5825 			btrfs_put_block_group(used_block_group);
5826 		btrfs_put_block_group(block_group);
5827 		break;
5828 loop:
5829 		failed_cluster_refill = false;
5830 		failed_alloc = false;
5831 		BUG_ON(index != get_block_group_index(block_group));
5832 		if (used_block_group != block_group)
5833 			btrfs_put_block_group(used_block_group);
5834 		btrfs_put_block_group(block_group);
5835 	}
5836 	up_read(&space_info->groups_sem);
5837 
5838 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5839 		goto search;
5840 
5841 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5842 		goto search;
5843 
5844 	/*
5845 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5846 	 *			caching kthreads as we move along
5847 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5848 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5849 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5850 	 *			again
5851 	 */
5852 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5853 		index = 0;
5854 		loop++;
5855 		if (loop == LOOP_ALLOC_CHUNK) {
5856 			ret = do_chunk_alloc(trans, root, data,
5857 					     CHUNK_ALLOC_FORCE);
5858 			/*
5859 			 * Do not bail out on ENOSPC since we
5860 			 * can do more things.
5861 			 */
5862 			if (ret < 0 && ret != -ENOSPC) {
5863 				btrfs_abort_transaction(trans,
5864 							root, ret);
5865 				goto out;
5866 			}
5867 		}
5868 
5869 		if (loop == LOOP_NO_EMPTY_SIZE) {
5870 			empty_size = 0;
5871 			empty_cluster = 0;
5872 		}
5873 
5874 		goto search;
5875 	} else if (!ins->objectid) {
5876 		ret = -ENOSPC;
5877 	} else if (ins->objectid) {
5878 		ret = 0;
5879 	}
5880 out:
5881 
5882 	return ret;
5883 }
5884 
5885 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5886 			    int dump_block_groups)
5887 {
5888 	struct btrfs_block_group_cache *cache;
5889 	int index = 0;
5890 
5891 	spin_lock(&info->lock);
5892 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5893 	       (unsigned long long)info->flags,
5894 	       (unsigned long long)(info->total_bytes - info->bytes_used -
5895 				    info->bytes_pinned - info->bytes_reserved -
5896 				    info->bytes_readonly),
5897 	       (info->full) ? "" : "not ");
5898 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5899 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
5900 	       (unsigned long long)info->total_bytes,
5901 	       (unsigned long long)info->bytes_used,
5902 	       (unsigned long long)info->bytes_pinned,
5903 	       (unsigned long long)info->bytes_reserved,
5904 	       (unsigned long long)info->bytes_may_use,
5905 	       (unsigned long long)info->bytes_readonly);
5906 	spin_unlock(&info->lock);
5907 
5908 	if (!dump_block_groups)
5909 		return;
5910 
5911 	down_read(&info->groups_sem);
5912 again:
5913 	list_for_each_entry(cache, &info->block_groups[index], list) {
5914 		spin_lock(&cache->lock);
5915 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5916 		       (unsigned long long)cache->key.objectid,
5917 		       (unsigned long long)cache->key.offset,
5918 		       (unsigned long long)btrfs_block_group_used(&cache->item),
5919 		       (unsigned long long)cache->pinned,
5920 		       (unsigned long long)cache->reserved,
5921 		       cache->ro ? "[readonly]" : "");
5922 		btrfs_dump_free_space(cache, bytes);
5923 		spin_unlock(&cache->lock);
5924 	}
5925 	if (++index < BTRFS_NR_RAID_TYPES)
5926 		goto again;
5927 	up_read(&info->groups_sem);
5928 }
5929 
5930 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5931 			 struct btrfs_root *root,
5932 			 u64 num_bytes, u64 min_alloc_size,
5933 			 u64 empty_size, u64 hint_byte,
5934 			 struct btrfs_key *ins, u64 data)
5935 {
5936 	bool final_tried = false;
5937 	int ret;
5938 
5939 	data = btrfs_get_alloc_profile(root, data);
5940 again:
5941 	WARN_ON(num_bytes < root->sectorsize);
5942 	ret = find_free_extent(trans, root, num_bytes, empty_size,
5943 			       hint_byte, ins, data);
5944 
5945 	if (ret == -ENOSPC) {
5946 		if (!final_tried) {
5947 			num_bytes = num_bytes >> 1;
5948 			num_bytes = num_bytes & ~(root->sectorsize - 1);
5949 			num_bytes = max(num_bytes, min_alloc_size);
5950 			if (num_bytes == min_alloc_size)
5951 				final_tried = true;
5952 			goto again;
5953 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
5954 			struct btrfs_space_info *sinfo;
5955 
5956 			sinfo = __find_space_info(root->fs_info, data);
5957 			printk(KERN_ERR "btrfs allocation failed flags %llu, "
5958 			       "wanted %llu\n", (unsigned long long)data,
5959 			       (unsigned long long)num_bytes);
5960 			if (sinfo)
5961 				dump_space_info(sinfo, num_bytes, 1);
5962 		}
5963 	}
5964 
5965 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5966 
5967 	return ret;
5968 }
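/*
 * Example of the -ENOSPC retry above, assuming sectorsize 4096: a request
 * with num_bytes == 1M and min_alloc_size == 256K is retried at 512K and
 * then at 256K; since 256K == min_alloc_size, that last pass sets
 * final_tried and either succeeds or returns -ENOSPC for good.
 */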
5969 
5970 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5971 					u64 start, u64 len, int pin)
5972 {
5973 	struct btrfs_block_group_cache *cache;
5974 	int ret = 0;
5975 
5976 	cache = btrfs_lookup_block_group(root->fs_info, start);
5977 	if (!cache) {
5978 		printk(KERN_ERR "Unable to find block group for %llu\n",
5979 		       (unsigned long long)start);
5980 		return -ENOSPC;
5981 	}
5982 
5983 	if (btrfs_test_opt(root, DISCARD))
5984 		ret = btrfs_discard_extent(root, start, len, NULL);
5985 
5986 	if (pin)
5987 		pin_down_extent(root, cache, start, len, 1);
5988 	else {
5989 		btrfs_add_free_space(cache, start, len);
5990 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5991 	}
5992 	btrfs_put_block_group(cache);
5993 
5994 	trace_btrfs_reserved_extent_free(root, start, len);
5995 
5996 	return ret;
5997 }
5998 
5999 int btrfs_free_reserved_extent(struct btrfs_root *root,
6000 					u64 start, u64 len)
6001 {
6002 	return __btrfs_free_reserved_extent(root, start, len, 0);
6003 }
6004 
6005 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6006 				       u64 start, u64 len)
6007 {
6008 	return __btrfs_free_reserved_extent(root, start, len, 1);
6009 }
6010 
6011 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6012 				      struct btrfs_root *root,
6013 				      u64 parent, u64 root_objectid,
6014 				      u64 flags, u64 owner, u64 offset,
6015 				      struct btrfs_key *ins, int ref_mod)
6016 {
6017 	int ret;
6018 	struct btrfs_fs_info *fs_info = root->fs_info;
6019 	struct btrfs_extent_item *extent_item;
6020 	struct btrfs_extent_inline_ref *iref;
6021 	struct btrfs_path *path;
6022 	struct extent_buffer *leaf;
6023 	int type;
6024 	u32 size;
6025 
6026 	if (parent > 0)
6027 		type = BTRFS_SHARED_DATA_REF_KEY;
6028 	else
6029 		type = BTRFS_EXTENT_DATA_REF_KEY;
6030 
6031 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6032 
6033 	path = btrfs_alloc_path();
6034 	if (!path)
6035 		return -ENOMEM;
6036 
6037 	path->leave_spinning = 1;
6038 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6039 				      ins, size);
6040 	if (ret) {
6041 		btrfs_free_path(path);
6042 		return ret;
6043 	}
6044 
6045 	leaf = path->nodes[0];
6046 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6047 				     struct btrfs_extent_item);
6048 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6049 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6050 	btrfs_set_extent_flags(leaf, extent_item,
6051 			       flags | BTRFS_EXTENT_FLAG_DATA);
6052 
6053 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6054 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
6055 	if (parent > 0) {
6056 		struct btrfs_shared_data_ref *ref;
6057 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
6058 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6059 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6060 	} else {
6061 		struct btrfs_extent_data_ref *ref;
6062 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6063 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6064 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6065 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6066 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6067 	}
6068 
6069 	btrfs_mark_buffer_dirty(path->nodes[0]);
6070 	btrfs_free_path(path);
6071 
6072 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6073 	if (ret) { /* -ENOENT, logic error */
6074 		printk(KERN_ERR "btrfs update block group failed for %llu "
6075 		       "%llu\n", (unsigned long long)ins->objectid,
6076 		       (unsigned long long)ins->offset);
6077 		BUG();
6078 	}
6079 	return ret;
6080 }
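/*
 * Shape of the extent item inserted above, for both inline ref types:
 *
 *	parent > 0:
 *	  [btrfs_extent_item][iref: SHARED_DATA_REF, offset = parent]
 *	  [btrfs_shared_data_ref: count]
 *
 *	parent == 0:
 *	  [btrfs_extent_item][iref: EXTENT_DATA_REF]
 *	  [btrfs_extent_data_ref: root, objectid, offset, count]
 */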
6081 
6082 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6083 				     struct btrfs_root *root,
6084 				     u64 parent, u64 root_objectid,
6085 				     u64 flags, struct btrfs_disk_key *key,
6086 				     int level, struct btrfs_key *ins)
6087 {
6088 	int ret;
6089 	struct btrfs_fs_info *fs_info = root->fs_info;
6090 	struct btrfs_extent_item *extent_item;
6091 	struct btrfs_tree_block_info *block_info;
6092 	struct btrfs_extent_inline_ref *iref;
6093 	struct btrfs_path *path;
6094 	struct extent_buffer *leaf;
6095 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6096 
6097 	path = btrfs_alloc_path();
6098 	if (!path)
6099 		return -ENOMEM;
6100 
6101 	path->leave_spinning = 1;
6102 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6103 				      ins, size);
6104 	if (ret) {
6105 		btrfs_free_path(path);
6106 		return ret;
6107 	}
6108 
6109 	leaf = path->nodes[0];
6110 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6111 				     struct btrfs_extent_item);
6112 	btrfs_set_extent_refs(leaf, extent_item, 1);
6113 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6114 	btrfs_set_extent_flags(leaf, extent_item,
6115 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6116 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6117 
6118 	btrfs_set_tree_block_key(leaf, block_info, key);
6119 	btrfs_set_tree_block_level(leaf, block_info, level);
6120 
6121 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6122 	if (parent > 0) {
6123 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6124 		btrfs_set_extent_inline_ref_type(leaf, iref,
6125 						 BTRFS_SHARED_BLOCK_REF_KEY);
6126 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6127 	} else {
6128 		btrfs_set_extent_inline_ref_type(leaf, iref,
6129 						 BTRFS_TREE_BLOCK_REF_KEY);
6130 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6131 	}
6132 
6133 	btrfs_mark_buffer_dirty(leaf);
6134 	btrfs_free_path(path);
6135 
6136 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6137 	if (ret) { /* -ENOENT, logic error */
6138 		printk(KERN_ERR "btrfs update block group failed for %llu "
6139 		       "%llu\n", (unsigned long long)ins->objectid,
6140 		       (unsigned long long)ins->offset);
6141 		BUG();
6142 	}
6143 	return ret;
6144 }
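/*
 * The tree block variant above differs from the data one by the
 * btrfs_tree_block_info wedged between the extent item and the inline ref:
 *
 *	[btrfs_extent_item][btrfs_tree_block_info: key, level]
 *	[iref: SHARED_BLOCK_REF(parent) or TREE_BLOCK_REF(root_objectid)]
 */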
6145 
6146 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6147 				     struct btrfs_root *root,
6148 				     u64 root_objectid, u64 owner,
6149 				     u64 offset, struct btrfs_key *ins)
6150 {
6151 	int ret;
6152 
6153 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6154 
6155 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6156 					 ins->offset, 0,
6157 					 root_objectid, owner, offset,
6158 					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6159 	return ret;
6160 }
6161 
6162 /*
6163  * this is used by the tree logging recovery code.  It records that
6164  * an extent has been allocated and makes sure to clear the free
6165  * space cache bits as well
6166  */
6167 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6168 				   struct btrfs_root *root,
6169 				   u64 root_objectid, u64 owner, u64 offset,
6170 				   struct btrfs_key *ins)
6171 {
6172 	int ret;
6173 	struct btrfs_block_group_cache *block_group;
6174 	struct btrfs_caching_control *caching_ctl;
6175 	u64 start = ins->objectid;
6176 	u64 num_bytes = ins->offset;
6177 
6178 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6179 	cache_block_group(block_group, trans, NULL, 0);
6180 	caching_ctl = get_caching_control(block_group);
6181 
6182 	if (!caching_ctl) {
6183 		BUG_ON(!block_group_cache_done(block_group));
6184 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6185 		BUG_ON(ret); /* -ENOMEM */
6186 	} else {
6187 		mutex_lock(&caching_ctl->mutex);
6188 
6189 		if (start >= caching_ctl->progress) {
6190 			ret = add_excluded_extent(root, start, num_bytes);
6191 			BUG_ON(ret); /* -ENOMEM */
6192 		} else if (start + num_bytes <= caching_ctl->progress) {
6193 			ret = btrfs_remove_free_space(block_group,
6194 						      start, num_bytes);
6195 			BUG_ON(ret); /* -ENOMEM */
6196 		} else {
6197 			num_bytes = caching_ctl->progress - start;
6198 			ret = btrfs_remove_free_space(block_group,
6199 						      start, num_bytes);
6200 			BUG_ON(ret); /* -ENOMEM */
6201 
6202 			start = caching_ctl->progress;
6203 			num_bytes = ins->objectid + ins->offset -
6204 				    caching_ctl->progress;
6205 			ret = add_excluded_extent(root, start, num_bytes);
6206 			BUG_ON(ret); /* -ENOMEM */
6207 		}
6208 
6209 		mutex_unlock(&caching_ctl->mutex);
6210 		put_caching_control(caching_ctl);
6211 	}
6212 
6213 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6214 					  RESERVE_ALLOC_NO_ACCOUNT);
6215 	BUG_ON(ret); /* logic error */
6216 	btrfs_put_block_group(block_group);
6217 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6218 					 0, owner, offset, ins, 1);
6219 	return ret;
6220 }
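/*
 * Sketch of the caching_ctl->progress handling above for a logged extent
 * that straddles the caching cursor P:
 *
 *	start ............ P ............ start + num_bytes
 *	[ remove_free_space ][ add_excluded_extent    ]
 *
 * The half already scanned is taken out of the free space cache directly;
 * the unscanned tail is excluded so the caching thread will skip it.
 */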
6221 
6222 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6223 					    struct btrfs_root *root,
6224 					    u64 bytenr, u32 blocksize,
6225 					    int level)
6226 {
6227 	struct extent_buffer *buf;
6228 
6229 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6230 	if (!buf)
6231 		return ERR_PTR(-ENOMEM);
6232 	btrfs_set_header_generation(buf, trans->transid);
6233 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6234 	btrfs_tree_lock(buf);
6235 	clean_tree_block(trans, root, buf);
6236 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6237 
6238 	btrfs_set_lock_blocking(buf);
6239 	btrfs_set_buffer_uptodate(buf);
6240 
6241 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6242 		/*
6243 		 * we allow two log transactions at a time, use different
6244 		 * EXTENT bits to differentiate dirty pages.
6245 		 */
6246 		if (root->log_transid % 2 == 0)
6247 			set_extent_dirty(&root->dirty_log_pages, buf->start,
6248 					buf->start + buf->len - 1, GFP_NOFS);
6249 		else
6250 			set_extent_new(&root->dirty_log_pages, buf->start,
6251 					buf->start + buf->len - 1, GFP_NOFS);
6252 	} else {
6253 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6254 			 buf->start + buf->len - 1, GFP_NOFS);
6255 	}
6256 	trans->blocks_used++;
6257 	/* this returns a buffer locked for blocking */
6258 	return buf;
6259 }
6260 
6261 static struct btrfs_block_rsv *
6262 use_block_rsv(struct btrfs_trans_handle *trans,
6263 	      struct btrfs_root *root, u32 blocksize)
6264 {
6265 	struct btrfs_block_rsv *block_rsv;
6266 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6267 	int ret;
6268 
6269 	block_rsv = get_block_rsv(trans, root);
6270 
6271 	if (block_rsv->size == 0) {
6272 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6273 		/*
6274 		 * If we couldn't reserve metadata bytes, try and use some from
6275 		 * the global reserve.
6276 		 */
6277 		if (ret && block_rsv != global_rsv) {
6278 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6279 			if (!ret)
6280 				return global_rsv;
6281 			return ERR_PTR(ret);
6282 		} else if (ret) {
6283 			return ERR_PTR(ret);
6284 		}
6285 		return block_rsv;
6286 	}
6287 
6288 	ret = block_rsv_use_bytes(block_rsv, blocksize);
6289 	if (!ret)
6290 		return block_rsv;
6291 	if (ret && !block_rsv->failfast) {
6292 		static DEFINE_RATELIMIT_STATE(_rs,
6293 				DEFAULT_RATELIMIT_INTERVAL,
6294 				/*DEFAULT_RATELIMIT_BURST*/ 2);
6295 		if (__ratelimit(&_rs)) {
6296 			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
6297 			WARN_ON(1);
6298 		}
6299 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6300 		if (!ret) {
6301 			return block_rsv;
6302 		} else if (ret && block_rsv != global_rsv) {
6303 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6304 			if (!ret)
6305 				return global_rsv;
6306 		}
6307 	}
6308 
6309 	return ERR_PTR(-ENOSPC);
6310 }
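/*
 * Condensed fallback order tried above for a blocksize reservation:
 *
 *	1) bytes already held in the root's block_rsv
 *	2) reserve_metadata_bytes() to refill that rsv
 *	3) stealing from the global reserve as a last resort
 *
 * Only when all three fail do we return ERR_PTR(-ENOSPC).
 */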
6311 
6312 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6313 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
6314 {
6315 	block_rsv_add_bytes(block_rsv, blocksize, 0);
6316 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6317 }
6318 
6319 /*
6320  * finds a free extent and does all the dirty work required for allocation;
6321  * returns the key for the extent through ins, and a tree buffer for
6322  * the first block of the extent through buf.
6323  *
6324  * returns the tree buffer or NULL.
6325  */
6326 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6327 					struct btrfs_root *root, u32 blocksize,
6328 					u64 parent, u64 root_objectid,
6329 					struct btrfs_disk_key *key, int level,
6330 					u64 hint, u64 empty_size)
6331 {
6332 	struct btrfs_key ins;
6333 	struct btrfs_block_rsv *block_rsv;
6334 	struct extent_buffer *buf;
6335 	u64 flags = 0;
6336 	int ret;
6337 
6338 
6339 	block_rsv = use_block_rsv(trans, root, blocksize);
6340 	if (IS_ERR(block_rsv))
6341 		return ERR_CAST(block_rsv);
6342 
6343 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6344 				   empty_size, hint, &ins, 0);
6345 	if (ret) {
6346 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6347 		return ERR_PTR(ret);
6348 	}
6349 
6350 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6351 				    blocksize, level);
6352 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6353 
6354 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6355 		if (parent == 0)
6356 			parent = ins.objectid;
6357 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6358 	} else
6359 		BUG_ON(parent > 0);
6360 
6361 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6362 		struct btrfs_delayed_extent_op *extent_op;
6363 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
6364 		BUG_ON(!extent_op); /* -ENOMEM */
6365 		if (key)
6366 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
6367 		else
6368 			memset(&extent_op->key, 0, sizeof(extent_op->key));
6369 		extent_op->flags_to_set = flags;
6370 		extent_op->update_key = 1;
6371 		extent_op->update_flags = 1;
6372 		extent_op->is_data = 0;
6373 
6374 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6375 					ins.objectid,
6376 					ins.offset, parent, root_objectid,
6377 					level, BTRFS_ADD_DELAYED_EXTENT,
6378 					extent_op, 0);
6379 		BUG_ON(ret); /* -ENOMEM */
6380 	}
6381 	return buf;
6382 }
6383 
6384 struct walk_control {
6385 	u64 refs[BTRFS_MAX_LEVEL];
6386 	u64 flags[BTRFS_MAX_LEVEL];
6387 	struct btrfs_key update_progress;
6388 	int stage;
6389 	int level;
6390 	int shared_level;
6391 	int update_ref;
6392 	int keep_locks;
6393 	int reada_slot;
6394 	int reada_count;
6395 	int for_reloc;
6396 };
6397 
6398 #define DROP_REFERENCE	1
6399 #define UPDATE_BACKREF	2
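/*
 * The tree walk below alternates between these stages: DROP_REFERENCE
 * descends and frees blocks owned solely by this tree; when do_walk_down()
 * meets a shared block whose backrefs must be converted first, it flips the
 * stage to UPDATE_BACKREF for that subtree, and walk_up_proc() flips it
 * back once the shared level has been processed.
 */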
6400 
6401 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6402 				     struct btrfs_root *root,
6403 				     struct walk_control *wc,
6404 				     struct btrfs_path *path)
6405 {
6406 	u64 bytenr;
6407 	u64 generation;
6408 	u64 refs;
6409 	u64 flags;
6410 	u32 nritems;
6411 	u32 blocksize;
6412 	struct btrfs_key key;
6413 	struct extent_buffer *eb;
6414 	int ret;
6415 	int slot;
6416 	int nread = 0;
6417 
6418 	if (path->slots[wc->level] < wc->reada_slot) {
6419 		wc->reada_count = wc->reada_count * 2 / 3;
6420 		wc->reada_count = max(wc->reada_count, 2);
6421 	} else {
6422 		wc->reada_count = wc->reada_count * 3 / 2;
6423 		wc->reada_count = min_t(int, wc->reada_count,
6424 					BTRFS_NODEPTRS_PER_BLOCK(root));
6425 	}
6426 
6427 	eb = path->nodes[wc->level];
6428 	nritems = btrfs_header_nritems(eb);
6429 	blocksize = btrfs_level_size(root, wc->level - 1);
6430 
6431 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6432 		if (nread >= wc->reada_count)
6433 			break;
6434 
6435 		cond_resched();
6436 		bytenr = btrfs_node_blockptr(eb, slot);
6437 		generation = btrfs_node_ptr_generation(eb, slot);
6438 
6439 		if (slot == path->slots[wc->level])
6440 			goto reada;
6441 
6442 		if (wc->stage == UPDATE_BACKREF &&
6443 		    generation <= root->root_key.offset)
6444 			continue;
6445 
6446 		/* We don't lock the tree block, it's OK to be racy here */
6447 		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6448 					       &refs, &flags);
6449 		/* We don't care about errors in readahead. */
6450 		if (ret < 0)
6451 			continue;
6452 		BUG_ON(refs == 0);
6453 
6454 		if (wc->stage == DROP_REFERENCE) {
6455 			if (refs == 1)
6456 				goto reada;
6457 
6458 			if (wc->level == 1 &&
6459 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6460 				continue;
6461 			if (!wc->update_ref ||
6462 			    generation <= root->root_key.offset)
6463 				continue;
6464 			btrfs_node_key_to_cpu(eb, &key, slot);
6465 			ret = btrfs_comp_cpu_keys(&key,
6466 						  &wc->update_progress);
6467 			if (ret < 0)
6468 				continue;
6469 		} else {
6470 			if (wc->level == 1 &&
6471 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6472 				continue;
6473 		}
6474 reada:
6475 		ret = readahead_tree_block(root, bytenr, blocksize,
6476 					   generation);
6477 		if (ret)
6478 			break;
6479 		nread++;
6480 	}
6481 	wc->reada_slot = slot;
6482 }
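/*
 * The readahead window above adapts to how fast the walk consumes it, e.g.
 * with reada_count == 32: re-entering behind reada_slot shrinks it to
 * 32 * 2 / 3 == 21, while catching up past it grows it to 32 * 3 / 2 == 48,
 * capped at BTRFS_NODEPTRS_PER_BLOCK().
 */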
6483 
6484 /*
6485  * helper to process tree block while walking down the tree.
6486  *
6487  * when wc->stage == UPDATE_BACKREF, this function updates
6488  * back refs for pointers in the block.
6489  *
6490  * NOTE: return value 1 means we should stop walking down.
6491  */
6492 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6493 				   struct btrfs_root *root,
6494 				   struct btrfs_path *path,
6495 				   struct walk_control *wc, int lookup_info)
6496 {
6497 	int level = wc->level;
6498 	struct extent_buffer *eb = path->nodes[level];
6499 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6500 	int ret;
6501 
6502 	if (wc->stage == UPDATE_BACKREF &&
6503 	    btrfs_header_owner(eb) != root->root_key.objectid)
6504 		return 1;
6505 
6506 	/*
6507 	 * when reference count of tree block is 1, it won't increase
6508 	 * again. once full backref flag is set, we never clear it.
6509 	 */
6510 	if (lookup_info &&
6511 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6512 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6513 		BUG_ON(!path->locks[level]);
6514 		ret = btrfs_lookup_extent_info(trans, root,
6515 					       eb->start, eb->len,
6516 					       &wc->refs[level],
6517 					       &wc->flags[level]);
6518 		BUG_ON(ret == -ENOMEM);
6519 		if (ret)
6520 			return ret;
6521 		BUG_ON(wc->refs[level] == 0);
6522 	}
6523 
6524 	if (wc->stage == DROP_REFERENCE) {
6525 		if (wc->refs[level] > 1)
6526 			return 1;
6527 
6528 		if (path->locks[level] && !wc->keep_locks) {
6529 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6530 			path->locks[level] = 0;
6531 		}
6532 		return 0;
6533 	}
6534 
6535 	/* wc->stage == UPDATE_BACKREF */
6536 	if (!(wc->flags[level] & flag)) {
6537 		BUG_ON(!path->locks[level]);
6538 		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6539 		BUG_ON(ret); /* -ENOMEM */
6540 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6541 		BUG_ON(ret); /* -ENOMEM */
6542 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6543 						  eb->len, flag, 0);
6544 		BUG_ON(ret); /* -ENOMEM */
6545 		wc->flags[level] |= flag;
6546 	}
6547 
6548 	/*
6549 	 * the block is shared by multiple trees, so it's not good to
6550 	 * keep the tree lock
6551 	 */
6552 	if (path->locks[level] && level > 0) {
6553 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6554 		path->locks[level] = 0;
6555 	}
6556 	return 0;
6557 }
6558 
6559 /*
6560  * helper to process tree block pointer.
6561  *
6562  * when wc->stage == DROP_REFERENCE, this function checks
6563  * reference count of the block pointed to. if the block
6564  * is shared and we need update back refs for the subtree
6565  * rooted at the block, this function changes wc->stage to
6566  * UPDATE_BACKREF. if the block is shared and there is no
6567  * need to update back, this function drops the reference
6568  * to the block.
6569  *
6570  * NOTE: return value 1 means we should stop walking down.
6571  */
6572 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6573 				 struct btrfs_root *root,
6574 				 struct btrfs_path *path,
6575 				 struct walk_control *wc, int *lookup_info)
6576 {
6577 	u64 bytenr;
6578 	u64 generation;
6579 	u64 parent;
6580 	u32 blocksize;
6581 	struct btrfs_key key;
6582 	struct extent_buffer *next;
6583 	int level = wc->level;
6584 	int reada = 0;
6585 	int ret = 0;
6586 
6587 	generation = btrfs_node_ptr_generation(path->nodes[level],
6588 					       path->slots[level]);
6589 	/*
6590 	 * if the lower level block was created before the snapshot
6591 	 * was created, we know there is no need to update back refs
6592 	 * for the subtree
6593 	 */
6594 	if (wc->stage == UPDATE_BACKREF &&
6595 	    generation <= root->root_key.offset) {
6596 		*lookup_info = 1;
6597 		return 1;
6598 	}
6599 
6600 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6601 	blocksize = btrfs_level_size(root, level - 1);
6602 
6603 	next = btrfs_find_tree_block(root, bytenr, blocksize);
6604 	if (!next) {
6605 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6606 		if (!next)
6607 			return -ENOMEM;
6608 		reada = 1;
6609 	}
6610 	btrfs_tree_lock(next);
6611 	btrfs_set_lock_blocking(next);
6612 
6613 	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6614 				       &wc->refs[level - 1],
6615 				       &wc->flags[level - 1]);
6616 	if (ret < 0) {
6617 		btrfs_tree_unlock(next);
6618 		return ret;
6619 	}
6620 
6621 	BUG_ON(wc->refs[level - 1] == 0);
6622 	*lookup_info = 0;
6623 
6624 	if (wc->stage == DROP_REFERENCE) {
6625 		if (wc->refs[level - 1] > 1) {
6626 			if (level == 1 &&
6627 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6628 				goto skip;
6629 
6630 			if (!wc->update_ref ||
6631 			    generation <= root->root_key.offset)
6632 				goto skip;
6633 
6634 			btrfs_node_key_to_cpu(path->nodes[level], &key,
6635 					      path->slots[level]);
6636 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6637 			if (ret < 0)
6638 				goto skip;
6639 
6640 			wc->stage = UPDATE_BACKREF;
6641 			wc->shared_level = level - 1;
6642 		}
6643 	} else {
6644 		if (level == 1 &&
6645 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6646 			goto skip;
6647 	}
6648 
6649 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
6650 		btrfs_tree_unlock(next);
6651 		free_extent_buffer(next);
6652 		next = NULL;
6653 		*lookup_info = 1;
6654 	}
6655 
6656 	if (!next) {
6657 		if (reada && level == 1)
6658 			reada_walk_down(trans, root, wc, path);
6659 		next = read_tree_block(root, bytenr, blocksize, generation);
6660 		if (!next)
6661 			return -EIO;
6662 		btrfs_tree_lock(next);
6663 		btrfs_set_lock_blocking(next);
6664 	}
6665 
6666 	level--;
6667 	BUG_ON(level != btrfs_header_level(next));
6668 	path->nodes[level] = next;
6669 	path->slots[level] = 0;
6670 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6671 	wc->level = level;
6672 	if (wc->level == 1)
6673 		wc->reada_slot = 0;
6674 	return 0;
6675 skip:
6676 	wc->refs[level - 1] = 0;
6677 	wc->flags[level - 1] = 0;
6678 	if (wc->stage == DROP_REFERENCE) {
6679 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6680 			parent = path->nodes[level]->start;
6681 		} else {
6682 			BUG_ON(root->root_key.objectid !=
6683 			       btrfs_header_owner(path->nodes[level]));
6684 			parent = 0;
6685 		}
6686 
6687 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6688 				root->root_key.objectid, level - 1, 0, 0);
6689 		BUG_ON(ret); /* -ENOMEM */
6690 	}
6691 	btrfs_tree_unlock(next);
6692 	free_extent_buffer(next);
6693 	*lookup_info = 1;
6694 	return 1;
6695 }
6696 
6697 /*
6698  * helper to process tree block while walking up the tree.
6699  *
6700  * when wc->stage == DROP_REFERENCE, this function drops
6701  * reference count on the block.
6702  *
6703  * when wc->stage == UPDATE_BACKREF, this function changes
6704  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6705  * to UPDATE_BACKREF previously while processing the block.
6706  *
6707  * NOTE: return value 1 means we should stop walking up.
6708  */
6709 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6710 				 struct btrfs_root *root,
6711 				 struct btrfs_path *path,
6712 				 struct walk_control *wc)
6713 {
6714 	int ret;
6715 	int level = wc->level;
6716 	struct extent_buffer *eb = path->nodes[level];
6717 	u64 parent = 0;
6718 
6719 	if (wc->stage == UPDATE_BACKREF) {
6720 		BUG_ON(wc->shared_level < level);
6721 		if (level < wc->shared_level)
6722 			goto out;
6723 
6724 		ret = find_next_key(path, level + 1, &wc->update_progress);
6725 		if (ret > 0)
6726 			wc->update_ref = 0;
6727 
6728 		wc->stage = DROP_REFERENCE;
6729 		wc->shared_level = -1;
6730 		path->slots[level] = 0;
6731 
6732 		/*
6733 		 * check reference count again if the block isn't locked.
6734 		 * we should start walking down the tree again if reference
6735 		 * count is one.
6736 		 */
6737 		if (!path->locks[level]) {
6738 			BUG_ON(level == 0);
6739 			btrfs_tree_lock(eb);
6740 			btrfs_set_lock_blocking(eb);
6741 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6742 
6743 			ret = btrfs_lookup_extent_info(trans, root,
6744 						       eb->start, eb->len,
6745 						       &wc->refs[level],
6746 						       &wc->flags[level]);
6747 			if (ret < 0) {
6748 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6749 				return ret;
6750 			}
6751 			BUG_ON(wc->refs[level] == 0);
6752 			if (wc->refs[level] == 1) {
6753 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6754 				return 1;
6755 			}
6756 		}
6757 	}
6758 
6759 	/* wc->stage == DROP_REFERENCE */
6760 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6761 
6762 	if (wc->refs[level] == 1) {
6763 		if (level == 0) {
6764 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6765 				ret = btrfs_dec_ref(trans, root, eb, 1,
6766 						    wc->for_reloc);
6767 			else
6768 				ret = btrfs_dec_ref(trans, root, eb, 0,
6769 						    wc->for_reloc);
6770 			BUG_ON(ret); /* -ENOMEM */
6771 		}
6772 		/* make block locked assertion in clean_tree_block happy */
6773 		if (!path->locks[level] &&
6774 		    btrfs_header_generation(eb) == trans->transid) {
6775 			btrfs_tree_lock(eb);
6776 			btrfs_set_lock_blocking(eb);
6777 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6778 		}
6779 		clean_tree_block(trans, root, eb);
6780 	}
6781 
6782 	if (eb == root->node) {
6783 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6784 			parent = eb->start;
6785 		else
6786 			BUG_ON(root->root_key.objectid !=
6787 			       btrfs_header_owner(eb));
6788 	} else {
6789 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6790 			parent = path->nodes[level + 1]->start;
6791 		else
6792 			BUG_ON(root->root_key.objectid !=
6793 			       btrfs_header_owner(path->nodes[level + 1]));
6794 	}
6795 
6796 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6797 out:
6798 	wc->refs[level] = 0;
6799 	wc->flags[level] = 0;
6800 	return 0;
6801 }
6802 
6803 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6804 				   struct btrfs_root *root,
6805 				   struct btrfs_path *path,
6806 				   struct walk_control *wc)
6807 {
6808 	int level = wc->level;
6809 	int lookup_info = 1;
6810 	int ret;
6811 
6812 	while (level >= 0) {
6813 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
6814 		if (ret > 0)
6815 			break;
6816 
6817 		if (level == 0)
6818 			break;
6819 
6820 		if (path->slots[level] >=
6821 		    btrfs_header_nritems(path->nodes[level]))
6822 			break;
6823 
6824 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
6825 		if (ret > 0) {
6826 			path->slots[level]++;
6827 			continue;
6828 		} else if (ret < 0)
6829 			return ret;
6830 		level = wc->level;
6831 	}
6832 	return 0;
6833 }
6834 
6835 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6836 				 struct btrfs_root *root,
6837 				 struct btrfs_path *path,
6838 				 struct walk_control *wc, int max_level)
6839 {
6840 	int level = wc->level;
6841 	int ret;
6842 
6843 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6844 	while (level < max_level && path->nodes[level]) {
6845 		wc->level = level;
6846 		if (path->slots[level] + 1 <
6847 		    btrfs_header_nritems(path->nodes[level])) {
6848 			path->slots[level]++;
6849 			return 0;
6850 		} else {
6851 			ret = walk_up_proc(trans, root, path, wc);
6852 			if (ret > 0)
6853 				return 0;
6854 
6855 			if (path->locks[level]) {
6856 				btrfs_tree_unlock_rw(path->nodes[level],
6857 						     path->locks[level]);
6858 				path->locks[level] = 0;
6859 			}
6860 			free_extent_buffer(path->nodes[level]);
6861 			path->nodes[level] = NULL;
6862 			level++;
6863 		}
6864 	}
6865 	return 1;
6866 }
6867 
6868 /*
6869  * drop a subvolume tree.
6870  *
6871  * this function traverses the tree, freeing any blocks that are only
6872  * referenced by the tree.
6873  *
6874  * when a shared tree block is found, this function decreases its
6875  * reference count by one.  if update_ref is true, this function
6876  * also makes sure backrefs for the shared block and all lower level
6877  * blocks are properly updated.
6878  */
6879 int btrfs_drop_snapshot(struct btrfs_root *root,
6880 			 struct btrfs_block_rsv *block_rsv, int update_ref,
6881 			 int for_reloc)
6882 {
6883 	struct btrfs_path *path;
6884 	struct btrfs_trans_handle *trans;
6885 	struct btrfs_root *tree_root = root->fs_info->tree_root;
6886 	struct btrfs_root_item *root_item = &root->root_item;
6887 	struct walk_control *wc;
6888 	struct btrfs_key key;
6889 	int err = 0;
6890 	int ret;
6891 	int level;
6892 
6893 	path = btrfs_alloc_path();
6894 	if (!path) {
6895 		err = -ENOMEM;
6896 		goto out;
6897 	}
6898 
6899 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6900 	if (!wc) {
6901 		btrfs_free_path(path);
6902 		err = -ENOMEM;
6903 		goto out;
6904 	}
6905 
6906 	trans = btrfs_start_transaction(tree_root, 0);
6907 	if (IS_ERR(trans)) {
6908 		err = PTR_ERR(trans);
6909 		goto out_free;
6910 	}
6911 
6912 	if (block_rsv)
6913 		trans->block_rsv = block_rsv;
6914 
6915 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6916 		level = btrfs_header_level(root->node);
6917 		path->nodes[level] = btrfs_lock_root_node(root);
6918 		btrfs_set_lock_blocking(path->nodes[level]);
6919 		path->slots[level] = 0;
6920 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6921 		memset(&wc->update_progress, 0,
6922 		       sizeof(wc->update_progress));
6923 	} else {
6924 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6925 		memcpy(&wc->update_progress, &key,
6926 		       sizeof(wc->update_progress));
6927 
6928 		level = root_item->drop_level;
6929 		BUG_ON(level == 0);
6930 		path->lowest_level = level;
6931 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6932 		path->lowest_level = 0;
6933 		if (ret < 0) {
6934 			err = ret;
6935 			goto out_end_trans;
6936 		}
6937 		WARN_ON(ret > 0);
6938 
6939 		/*
6940 		 * unlock our path, this is safe because only this
6941 		 * function is allowed to delete this snapshot
6942 		 */
6943 		btrfs_unlock_up_safe(path, 0);
6944 
6945 		level = btrfs_header_level(root->node);
6946 		while (1) {
6947 			btrfs_tree_lock(path->nodes[level]);
6948 			btrfs_set_lock_blocking(path->nodes[level]);
6949 
6950 			ret = btrfs_lookup_extent_info(trans, root,
6951 						path->nodes[level]->start,
6952 						path->nodes[level]->len,
6953 						&wc->refs[level],
6954 						&wc->flags[level]);
6955 			if (ret < 0) {
6956 				err = ret;
6957 				goto out_end_trans;
6958 			}
6959 			BUG_ON(wc->refs[level] == 0);
6960 
6961 			if (level == root_item->drop_level)
6962 				break;
6963 
6964 			btrfs_tree_unlock(path->nodes[level]);
6965 			WARN_ON(wc->refs[level] != 1);
6966 			level--;
6967 		}
6968 	}
6969 
6970 	wc->level = level;
6971 	wc->shared_level = -1;
6972 	wc->stage = DROP_REFERENCE;
6973 	wc->update_ref = update_ref;
6974 	wc->keep_locks = 0;
6975 	wc->for_reloc = for_reloc;
6976 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6977 
6978 	while (1) {
6979 		ret = walk_down_tree(trans, root, path, wc);
6980 		if (ret < 0) {
6981 			err = ret;
6982 			break;
6983 		}
6984 
6985 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6986 		if (ret < 0) {
6987 			err = ret;
6988 			break;
6989 		}
6990 
6991 		if (ret > 0) {
6992 			BUG_ON(wc->stage != DROP_REFERENCE);
6993 			break;
6994 		}
6995 
6996 		if (wc->stage == DROP_REFERENCE) {
6997 			level = wc->level;
6998 			btrfs_node_key(path->nodes[level],
6999 				       &root_item->drop_progress,
7000 				       path->slots[level]);
7001 			root_item->drop_level = level;
7002 		}
7003 
7004 		BUG_ON(wc->level == 0);
7005 		if (btrfs_should_end_transaction(trans, tree_root)) {
7006 			ret = btrfs_update_root(trans, tree_root,
7007 						&root->root_key,
7008 						root_item);
7009 			if (ret) {
7010 				btrfs_abort_transaction(trans, tree_root, ret);
7011 				err = ret;
7012 				goto out_end_trans;
7013 			}
7014 
7015 			btrfs_end_transaction_throttle(trans, tree_root);
7016 			trans = btrfs_start_transaction(tree_root, 0);
7017 			if (IS_ERR(trans)) {
7018 				err = PTR_ERR(trans);
7019 				goto out_free;
7020 			}
7021 			if (block_rsv)
7022 				trans->block_rsv = block_rsv;
7023 		}
7024 	}
7025 	btrfs_release_path(path);
7026 	if (err)
7027 		goto out_end_trans;
7028 
7029 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
7030 	if (ret) {
7031 		btrfs_abort_transaction(trans, tree_root, ret);
7032 		goto out_end_trans;
7033 	}
7034 
7035 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7036 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7037 					   NULL, NULL);
7038 		if (ret < 0) {
7039 			btrfs_abort_transaction(trans, tree_root, ret);
7040 			err = ret;
7041 			goto out_end_trans;
7042 		} else if (ret > 0) {
7043 			/* if we fail to delete the orphan item this time
7044 			 * around, it'll get picked up the next time.
7045 			 *
7046 			 * The most common failure here is just -ENOENT.
7047 			 */
7048 			btrfs_del_orphan_item(trans, tree_root,
7049 					      root->root_key.objectid);
7050 		}
7051 	}
7052 
7053 	if (root->in_radix) {
7054 		btrfs_free_fs_root(tree_root->fs_info, root);
7055 	} else {
7056 		free_extent_buffer(root->node);
7057 		free_extent_buffer(root->commit_root);
7058 		kfree(root);
7059 	}
7060 out_end_trans:
7061 	btrfs_end_transaction_throttle(trans, tree_root);
7062 out_free:
7063 	kfree(wc);
7064 	btrfs_free_path(path);
7065 out:
7066 	if (err)
7067 		btrfs_std_error(root->fs_info, err);
7068 	return err;
7069 }
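/*
 * Note that because drop_progress/drop_level are written back to the root
 * item every time the transaction is cycled above, an interrupted drop
 * resumes from the recorded key on the next attempt instead of restarting
 * from the tree root.
 */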
7070 
7071 /*
7072  * drop subtree rooted at tree block 'node'.
7073  *
7074  * NOTE: this function will unlock and release tree block 'node'.
7075  * Only used by relocation code.
7076  */
7077 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7078 			struct btrfs_root *root,
7079 			struct extent_buffer *node,
7080 			struct extent_buffer *parent)
7081 {
7082 	struct btrfs_path *path;
7083 	struct walk_control *wc;
7084 	int level;
7085 	int parent_level;
7086 	int ret = 0;
7087 	int wret;
7088 
7089 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7090 
7091 	path = btrfs_alloc_path();
7092 	if (!path)
7093 		return -ENOMEM;
7094 
7095 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7096 	if (!wc) {
7097 		btrfs_free_path(path);
7098 		return -ENOMEM;
7099 	}
7100 
7101 	btrfs_assert_tree_locked(parent);
7102 	parent_level = btrfs_header_level(parent);
7103 	extent_buffer_get(parent);
7104 	path->nodes[parent_level] = parent;
7105 	path->slots[parent_level] = btrfs_header_nritems(parent);
7106 
7107 	btrfs_assert_tree_locked(node);
7108 	level = btrfs_header_level(node);
7109 	path->nodes[level] = node;
7110 	path->slots[level] = 0;
7111 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7112 
7113 	wc->refs[parent_level] = 1;
7114 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7115 	wc->level = level;
7116 	wc->shared_level = -1;
7117 	wc->stage = DROP_REFERENCE;
7118 	wc->update_ref = 0;
7119 	wc->keep_locks = 1;
7120 	wc->for_reloc = 1;
7121 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7122 
7123 	while (1) {
7124 		wret = walk_down_tree(trans, root, path, wc);
7125 		if (wret < 0) {
7126 			ret = wret;
7127 			break;
7128 		}
7129 
7130 		wret = walk_up_tree(trans, root, path, wc, parent_level);
7131 		if (wret < 0)
7132 			ret = wret;
7133 		if (wret != 0)
7134 			break;
7135 	}
7136 
7137 	kfree(wc);
7138 	btrfs_free_path(path);
7139 	return ret;
7140 }
7141 
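/*
 * helper to pick the allocation profile for a chunk type, taking the
 * device count into account: a restripe target, if any, wins outright;
 * on a single rw (possibly degraded) device raid0 degrades to single
 * and raid1/raid10 degrade to dup; with several devices dup is
 * promoted to raid1.
 */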
7142 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7143 {
7144 	u64 num_devices;
7145 	u64 stripped;
7146 
7147 	/*
7148 	 * if restripe for this chunk_type is on, pick the target profile and
7149 	 * return; otherwise do the usual balance
7150 	 */
7151 	stripped = get_restripe_target(root->fs_info, flags);
7152 	if (stripped)
7153 		return extended_to_chunk(stripped);
7154 
7155 	/*
7156 	 * we add in the count of missing devices because we want
7157 	 * to make sure that any RAID levels on a degraded FS
7158 	 * continue to be honored.
7159 	 */
7160 	num_devices = root->fs_info->fs_devices->rw_devices +
7161 		root->fs_info->fs_devices->missing_devices;
7162 
7163 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
7164 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7165 
7166 	if (num_devices == 1) {
7167 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7168 		stripped = flags & ~stripped;
7169 
7170 		/* turn raid0 into single device chunks */
7171 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
7172 			return stripped;
7173 
7174 		/* turn mirroring into duplication */
7175 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7176 			     BTRFS_BLOCK_GROUP_RAID10))
7177 			return stripped | BTRFS_BLOCK_GROUP_DUP;
7178 	} else {
7179 		/* they already had raid on here, just return */
7180 		if (flags & stripped)
7181 			return flags;
7182 
7183 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7184 		stripped = flags & ~stripped;
7185 
7186 		/* switch duplicated blocks with raid1 */
7187 		if (flags & BTRFS_BLOCK_GROUP_DUP)
7188 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
7189 
7190 		/* this is drive concat, leave it alone */
7191 	}
7192 
7193 	return flags;
7194 }
7195 
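/*
 * helper to mark @cache read-only.  The group's unused bytes move into
 * the space_info's bytes_readonly counter, but only if the space_info
 * can still cover all of its commitments (plus a small headroom for
 * metadata/system chunk allocation unless @force is set).  Returns
 * -ENOSPC if the group can't be made read-only right now.
 */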
7196 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7197 {
7198 	struct btrfs_space_info *sinfo = cache->space_info;
7199 	u64 num_bytes;
7200 	u64 min_allocable_bytes;
7201 	int ret = -ENOSPC;
7202 
7204 	/*
7205 	 * We need some metadata space and system metadata space for
7206 	 * allocating chunks in some corner cases, so unless we are forced,
7207 	 * require some allocatable headroom before setting a group read-only.
7208 	 */
7209 	if ((sinfo->flags &
7210 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7211 	    !force)
7212 		min_allocable_bytes = 1 * 1024 * 1024;
7213 	else
7214 		min_allocable_bytes = 0;
7215 
7216 	spin_lock(&sinfo->lock);
7217 	spin_lock(&cache->lock);
7218 
7219 	if (cache->ro) {
7220 		ret = 0;
7221 		goto out;
7222 	}
7223 
7224 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7225 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7226 
7227 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7228 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7229 	    min_allocable_bytes <= sinfo->total_bytes) {
7230 		sinfo->bytes_readonly += num_bytes;
7231 		cache->ro = 1;
7232 		ret = 0;
7233 	}
7234 out:
7235 	spin_unlock(&cache->lock);
7236 	spin_unlock(&sinfo->lock);
7237 	return ret;
7238 }
7239 
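/*
 * mark @cache read-only, force-allocating a new chunk and retrying once
 * if the space_info is initially too tight.
 */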
7240 int btrfs_set_block_group_ro(struct btrfs_root *root,
7241 			     struct btrfs_block_group_cache *cache)
7243 {
7244 	struct btrfs_trans_handle *trans;
7245 	u64 alloc_flags;
7246 	int ret;
7247 
7248 	BUG_ON(cache->ro);
7249 
7250 	trans = btrfs_join_transaction(root);
7251 	if (IS_ERR(trans))
7252 		return PTR_ERR(trans);
7253 
7254 	alloc_flags = update_block_group_flags(root, cache->flags);
7255 	if (alloc_flags != cache->flags) {
7256 		ret = do_chunk_alloc(trans, root, alloc_flags,
7257 				     CHUNK_ALLOC_FORCE);
7258 		if (ret < 0)
7259 			goto out;
7260 	}
7261 
7262 	ret = set_block_group_ro(cache, 0);
7263 	if (!ret)
7264 		goto out;
7265 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7266 	ret = do_chunk_alloc(trans, root, alloc_flags,
7267 			     CHUNK_ALLOC_FORCE);
7268 	if (ret < 0)
7269 		goto out;
7270 	ret = set_block_group_ro(cache, 0);
7271 out:
7272 	btrfs_end_transaction(trans, root);
7273 	return ret;
7274 }
7275 
7276 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7277 			    struct btrfs_root *root, u64 type)
7278 {
7279 	u64 alloc_flags = get_alloc_profile(root, type);
7280 	return do_chunk_alloc(trans, root, alloc_flags,
7281 			      CHUNK_ALLOC_FORCE);
7282 }
7283 
7284 /*
7285  * helper to account the unused space of all the readonly block groups in the
7286  * list. Takes mirrors into account.
7287  */
7288 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7289 {
7290 	struct btrfs_block_group_cache *block_group;
7291 	u64 free_bytes = 0;
7292 	int factor;
7293 
7294 	list_for_each_entry(block_group, groups_list, list) {
7295 		spin_lock(&block_group->lock);
7296 
7297 		if (!block_group->ro) {
7298 			spin_unlock(&block_group->lock);
7299 			continue;
7300 		}
7301 
7302 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7303 					  BTRFS_BLOCK_GROUP_RAID10 |
7304 					  BTRFS_BLOCK_GROUP_DUP))
7305 			factor = 2;
7306 		else
7307 			factor = 1;
7308 
7309 		free_bytes += (block_group->key.offset -
7310 			       btrfs_block_group_used(&block_group->item)) *
7311 			       factor;
7312 
7313 		spin_unlock(&block_group->lock);
7314 	}
7315 
7316 	return free_bytes;
7317 }
7318 
7319 /*
7320  * helper to account the unused space of all the readonly block groups in the
7321  * space_info. Takes mirrors into account.
7322  */
7323 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7324 {
7325 	int i;
7326 	u64 free_bytes = 0;
7327 
7328 	spin_lock(&sinfo->lock);
7329 
7330 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7331 		if (!list_empty(&sinfo->block_groups[i]))
7332 			free_bytes += __btrfs_get_ro_block_group_free_space(
7333 						&sinfo->block_groups[i]);
7334 
7335 	spin_unlock(&sinfo->lock);
7336 
7337 	return free_bytes;
7338 }
7339 
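/*
 * undo set_block_group_ro: give the group's unused bytes back to the
 * space_info and clear the ro flag.
 */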
7340 void btrfs_set_block_group_rw(struct btrfs_root *root,
7341 			      struct btrfs_block_group_cache *cache)
7342 {
7343 	struct btrfs_space_info *sinfo = cache->space_info;
7344 	u64 num_bytes;
7345 
7346 	BUG_ON(!cache->ro);
7347 
7348 	spin_lock(&sinfo->lock);
7349 	spin_lock(&cache->lock);
7350 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7351 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7352 	sinfo->bytes_readonly -= num_bytes;
7353 	cache->ro = 0;
7354 	spin_unlock(&cache->lock);
7355 	spin_unlock(&sinfo->lock);
7356 }
7357 
7358 /*
7359  * checks to see if it's even possible to relocate this block group.
7360  *
7361  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
7362  * ok to go ahead and try.
7363  */
7364 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7365 {
7366 	struct btrfs_block_group_cache *block_group;
7367 	struct btrfs_space_info *space_info;
7368 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7369 	struct btrfs_device *device;
7370 	u64 min_free;
7371 	u64 dev_min = 1;
7372 	u64 dev_nr = 0;
7373 	u64 target;
7374 	int index;
7375 	int full = 0;
7376 	int ret = 0;
7377 
7378 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7379 
7380 	/* odd, couldn't find the block group, leave it alone */
7381 	if (!block_group)
7382 		return -1;
7383 
7384 	min_free = btrfs_block_group_used(&block_group->item);
7385 
7386 	/* no bytes used, we're good */
7387 	if (!min_free)
7388 		goto out;
7389 
7390 	space_info = block_group->space_info;
7391 	spin_lock(&space_info->lock);
7392 
7393 	full = space_info->full;
7394 
7395 	/*
7396 	 * if this is the last block group we have in this space, we can't
7397 	 * relocate it unless we're able to allocate a new chunk below.
7398 	 *
7399 	 * Otherwise, we need to make sure we have room in the space to handle
7400 	 * all of the extents from this block group.  If we can, we're good.
7401 	 */
7402 	if ((space_info->total_bytes != block_group->key.offset) &&
7403 	    (space_info->bytes_used + space_info->bytes_reserved +
7404 	     space_info->bytes_pinned + space_info->bytes_readonly +
7405 	     min_free < space_info->total_bytes)) {
7406 		spin_unlock(&space_info->lock);
7407 		goto out;
7408 	}
7409 	spin_unlock(&space_info->lock);
7410 
7411 	/*
7412 	 * OK, we don't have enough space, but maybe we have free space on our
7413 	 * devices to allocate new chunks for relocation, so loop through our
7414 	 * alloc devices and guess if we have enough space.  If this block
7415 	 * group is going to be restriped, run checks against the target
7416 	 * profile instead of the current one.
7417 	 */
7418 	ret = -1;
7419 
7420 	/*
7421 	 * index:
7422 	 *      0: raid10
7423 	 *      1: raid1
7424 	 *      2: dup
7425 	 *      3: raid0
7426 	 *      4: single
7427 	 */
7428 	target = get_restripe_target(root->fs_info, block_group->flags);
7429 	if (target) {
7430 		index = __get_block_group_index(extended_to_chunk(target));
7431 	} else {
7432 		/*
7433 		 * this is just a balance, so if we were marked as full
7434 		 * we know there is no space for a new chunk
7435 		 */
7436 		if (full)
7437 			goto out;
7438 
7439 		index = get_block_group_index(block_group);
7440 	}
7441 
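	/*
	 * work out how many devices need min_free bytes available and scale
	 * min_free by the profile: raid10 stripes each mirror across half
	 * the devices, raid1 needs two full copies, dup puts both copies on
	 * one device, and raid0 spreads the data over every rw device.
	 */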
7442 	if (index == 0) {
7443 		dev_min = 4;
7444 		/* Divide by 2 */
7445 		min_free >>= 1;
7446 	} else if (index == 1) {
7447 		dev_min = 2;
7448 	} else if (index == 2) {
7449 		/* Multiply by 2 */
7450 		min_free <<= 1;
7451 	} else if (index == 3) {
7452 		dev_min = fs_devices->rw_devices;
7453 		do_div(min_free, dev_min);
7454 	}
7455 
7456 	mutex_lock(&root->fs_info->chunk_mutex);
7457 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7458 		u64 dev_offset;
7459 
7460 		/*
7461 		 * check to make sure we can actually find a chunk with enough
7462 		 * space to fit our block group in.
7463 		 */
7464 		if (device->total_bytes > device->bytes_used + min_free) {
7465 			ret = find_free_dev_extent(device, min_free,
7466 						   &dev_offset, NULL);
7467 			if (!ret)
7468 				dev_nr++;
7469 
7470 			if (dev_nr >= dev_min)
7471 				break;
7472 
7473 			ret = -1;
7474 		}
7475 	}
7476 	mutex_unlock(&root->fs_info->chunk_mutex);
7477 out:
7478 	btrfs_put_block_group(block_group);
7479 	return ret;
7480 }
7481 
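/*
 * find the first block group item at or after key->objectid in the extent
 * tree and leave @path pointing at it.  Returns 0 if one was found, > 0
 * if there are no more block group items, < 0 on error.
 */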
7482 static int find_first_block_group(struct btrfs_root *root,
7483 		struct btrfs_path *path, struct btrfs_key *key)
7484 {
7485 	int ret = 0;
7486 	struct btrfs_key found_key;
7487 	struct extent_buffer *leaf;
7488 	int slot;
7489 
7490 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7491 	if (ret < 0)
7492 		goto out;
7493 
7494 	while (1) {
7495 		slot = path->slots[0];
7496 		leaf = path->nodes[0];
7497 		if (slot >= btrfs_header_nritems(leaf)) {
7498 			ret = btrfs_next_leaf(root, path);
7499 			if (ret == 0)
7500 				continue;
7501 			if (ret < 0)
7502 				goto out;
7503 			break;
7504 		}
7505 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7506 
7507 		if (found_key.objectid >= key->objectid &&
7508 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7509 			ret = 0;
7510 			goto out;
7511 		}
7512 		path->slots[0]++;
7513 	}
7514 out:
7515 	return ret;
7516 }
7517 
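/*
 * walk every block group and drop the reference each may still hold on
 * its free space cache inode (iref), iput()ing the inode.
 */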
7518 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7519 {
7520 	struct btrfs_block_group_cache *block_group;
7521 	u64 last = 0;
7522 
7523 	while (1) {
7524 		struct inode *inode;
7525 
7526 		block_group = btrfs_lookup_first_block_group(info, last);
7527 		while (block_group) {
7528 			spin_lock(&block_group->lock);
7529 			if (block_group->iref)
7530 				break;
7531 			spin_unlock(&block_group->lock);
7532 			block_group = next_block_group(info->tree_root,
7533 						       block_group);
7534 		}
7535 		if (!block_group) {
7536 			if (last == 0)
7537 				break;
7538 			last = 0;
7539 			continue;
7540 		}
7541 
7542 		inode = block_group->inode;
7543 		block_group->iref = 0;
7544 		block_group->inode = NULL;
7545 		spin_unlock(&block_group->lock);
7546 		iput(inode);
7547 		last = block_group->key.objectid + block_group->key.offset;
7548 		btrfs_put_block_group(block_group);
7549 	}
7550 }
7551 
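/*
 * tear everything down at unmount: stop in-flight caching, unlink every
 * block group from the rb-tree and its space_info, free the free space
 * caches, and finally free the space_info structs themselves (warning if
 * any still have pinned, reserved or may_use bytes outstanding).
 */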
7552 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7553 {
7554 	struct btrfs_block_group_cache *block_group;
7555 	struct btrfs_space_info *space_info;
7556 	struct btrfs_caching_control *caching_ctl;
7557 	struct rb_node *n;
7558 
7559 	down_write(&info->extent_commit_sem);
7560 	while (!list_empty(&info->caching_block_groups)) {
7561 		caching_ctl = list_entry(info->caching_block_groups.next,
7562 					 struct btrfs_caching_control, list);
7563 		list_del(&caching_ctl->list);
7564 		put_caching_control(caching_ctl);
7565 	}
7566 	up_write(&info->extent_commit_sem);
7567 
7568 	spin_lock(&info->block_group_cache_lock);
7569 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7570 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7571 				       cache_node);
7572 		rb_erase(&block_group->cache_node,
7573 			 &info->block_group_cache_tree);
7574 		spin_unlock(&info->block_group_cache_lock);
7575 
7576 		down_write(&block_group->space_info->groups_sem);
7577 		list_del(&block_group->list);
7578 		up_write(&block_group->space_info->groups_sem);
7579 
7580 		if (block_group->cached == BTRFS_CACHE_STARTED)
7581 			wait_block_group_cache_done(block_group);
7582 
7583 		/*
7584 		 * We haven't cached this block group, which means we may
7585 		 * have excluded extents on this block group.
7586 		 */
7587 		if (block_group->cached == BTRFS_CACHE_NO)
7588 			free_excluded_extents(info->extent_root, block_group);
7589 
7590 		btrfs_remove_free_space_cache(block_group);
7591 		btrfs_put_block_group(block_group);
7592 
7593 		spin_lock(&info->block_group_cache_lock);
7594 	}
7595 	spin_unlock(&info->block_group_cache_lock);
7596 
7597 	/* now that all the block groups are freed, go through and
7598 	 * free all the space_info structs.  This is only called during
7599 	 * the final stages of unmount, and so we know nobody is
7600 	 * using them.  We call synchronize_rcu() once before we start,
7601 	 * just to be on the safe side.
7602 	 */
7603 	synchronize_rcu();
7604 
7605 	release_global_block_rsv(info);
7606 
7607 	while (!list_empty(&info->space_info)) {
7608 		space_info = list_entry(info->space_info.next,
7609 					struct btrfs_space_info,
7610 					list);
7611 		if (space_info->bytes_pinned > 0 ||
7612 		    space_info->bytes_reserved > 0 ||
7613 		    space_info->bytes_may_use > 0) {
7614 			WARN_ON(1);
7615 			dump_space_info(space_info, 0, 0);
7616 		}
7617 		list_del(&space_info->list);
7618 		kfree(space_info);
7619 	}
7620 	return 0;
7621 }
7622 
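/* add @cache to the per-RAID-type list of its space_info */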
7623 static void __link_block_group(struct btrfs_space_info *space_info,
7624 			       struct btrfs_block_group_cache *cache)
7625 {
7626 	int index = get_block_group_index(cache);
7627 
7628 	down_write(&space_info->groups_sem);
7629 	list_add_tail(&cache->list, &space_info->block_groups[index]);
7630 	up_write(&space_info->groups_sem);
7631 }
7632 
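/*
 * called at mount time: read every block group item out of the extent
 * tree, build the in-memory caches and hook them up to their space_infos.
 */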
7633 int btrfs_read_block_groups(struct btrfs_root *root)
7634 {
7635 	struct btrfs_path *path;
7636 	int ret;
7637 	struct btrfs_block_group_cache *cache;
7638 	struct btrfs_fs_info *info = root->fs_info;
7639 	struct btrfs_space_info *space_info;
7640 	struct btrfs_key key;
7641 	struct btrfs_key found_key;
7642 	struct extent_buffer *leaf;
7643 	int need_clear = 0;
7644 	u64 cache_gen;
7645 
7646 	root = info->extent_root;
7647 	key.objectid = 0;
7648 	key.offset = 0;
7649 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7650 	path = btrfs_alloc_path();
7651 	if (!path)
7652 		return -ENOMEM;
7653 	path->reada = 1;
7654 
7655 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7656 	if (btrfs_test_opt(root, SPACE_CACHE) &&
7657 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7658 		need_clear = 1;
7659 	if (btrfs_test_opt(root, CLEAR_CACHE))
7660 		need_clear = 1;
7661 
7662 	while (1) {
7663 		ret = find_first_block_group(root, path, &key);
7664 		if (ret > 0)
7665 			break;
7666 		if (ret != 0)
7667 			goto error;
7668 		leaf = path->nodes[0];
7669 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7670 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7671 		if (!cache) {
7672 			ret = -ENOMEM;
7673 			goto error;
7674 		}
7675 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7676 						GFP_NOFS);
7677 		if (!cache->free_space_ctl) {
7678 			kfree(cache);
7679 			ret = -ENOMEM;
7680 			goto error;
7681 		}
7682 
7683 		atomic_set(&cache->count, 1);
7684 		spin_lock_init(&cache->lock);
7685 		cache->fs_info = info;
7686 		INIT_LIST_HEAD(&cache->list);
7687 		INIT_LIST_HEAD(&cache->cluster_list);
7688 
7689 		if (need_clear) {
7690 			/*
7691 			 * When we mount with an old space cache, we need to
7692 			 * set BTRFS_DC_CLEAR and set the dirty flag.
7693 			 *
7694 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7695 			 *    truncate the old free space cache inode and
7696 			 *    set up a new one.
7697 			 * b) Setting the 'dirty' flag makes sure that we flush
7698 			 *    the new space cache info onto disk.
7699 			 */
7700 			cache->disk_cache_state = BTRFS_DC_CLEAR;
7701 			if (btrfs_test_opt(root, SPACE_CACHE))
7702 				cache->dirty = 1;
7703 		}
7704 
7705 		read_extent_buffer(leaf, &cache->item,
7706 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7707 				   sizeof(cache->item));
7708 		memcpy(&cache->key, &found_key, sizeof(found_key));
7709 
7710 		key.objectid = found_key.objectid + found_key.offset;
7711 		btrfs_release_path(path);
7712 		cache->flags = btrfs_block_group_flags(&cache->item);
7713 		cache->sectorsize = root->sectorsize;
7714 
7715 		btrfs_init_free_space_ctl(cache);
7716 
7717 		/*
7718 		 * We need to exclude the super stripes now so that the space
7719 		 * info has super bytes accounted for, otherwise we'll think
7720 		 * we have more space than we actually do.
7721 		 */
7722 		exclude_super_stripes(root, cache);
7723 
7724 		/*
7725 		 * check for two cases: either we are full, and therefore
7726 		 * don't need to bother with the caching work since we won't
7727 		 * find any space, or we are empty, and we can just add all
7728 		 * the space in and be done with it.  This saves us a lot of
7729 		 * time, particularly in the full case.
7730 		 */
7731 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7732 			cache->last_byte_to_unpin = (u64)-1;
7733 			cache->cached = BTRFS_CACHE_FINISHED;
7734 			free_excluded_extents(root, cache);
7735 		} else if (btrfs_block_group_used(&cache->item) == 0) {
7736 			cache->last_byte_to_unpin = (u64)-1;
7737 			cache->cached = BTRFS_CACHE_FINISHED;
7738 			add_new_free_space(cache, root->fs_info,
7739 					   found_key.objectid,
7740 					   found_key.objectid +
7741 					   found_key.offset);
7742 			free_excluded_extents(root, cache);
7743 		}
7744 
7745 		ret = update_space_info(info, cache->flags, found_key.offset,
7746 					btrfs_block_group_used(&cache->item),
7747 					&space_info);
7748 		BUG_ON(ret); /* -ENOMEM */
7749 		cache->space_info = space_info;
7750 		spin_lock(&cache->space_info->lock);
7751 		cache->space_info->bytes_readonly += cache->bytes_super;
7752 		spin_unlock(&cache->space_info->lock);
7753 
7754 		__link_block_group(space_info, cache);
7755 
7756 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7757 		BUG_ON(ret); /* Logic error */
7758 
7759 		set_avail_alloc_bits(root->fs_info, cache->flags);
7760 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7761 			set_block_group_ro(cache, 1);
7762 	}
7763 
7764 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7765 		if (!(get_alloc_profile(root, space_info->flags) &
7766 		      (BTRFS_BLOCK_GROUP_RAID10 |
7767 		       BTRFS_BLOCK_GROUP_RAID1 |
7768 		       BTRFS_BLOCK_GROUP_DUP)))
7769 			continue;
7770 		/*
7771 		 * avoid allocating from un-mirrored block groups if there are
7772 		 * mirrored block groups.
7773 		 */
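		/* per the index table in btrfs_can_relocate, list 3 is
		 * raid0 and list 4 is single */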
7774 		list_for_each_entry(cache, &space_info->block_groups[3], list)
7775 			set_block_group_ro(cache, 1);
7776 		list_for_each_entry(cache, &space_info->block_groups[4], list)
7777 			set_block_group_ro(cache, 1);
7778 	}
7779 
7780 	init_global_block_rsv(info);
7781 	ret = 0;
7782 error:
7783 	btrfs_free_path(path);
7784 	return ret;
7785 }
7786 
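/*
 * insert the block group items for any block groups created during this
 * transaction (queued on trans->new_bgs) into the extent tree.
 */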
7787 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788 				       struct btrfs_root *root)
7789 {
7790 	struct btrfs_block_group_cache *block_group, *tmp;
7791 	struct btrfs_root *extent_root = root->fs_info->extent_root;
7792 	struct btrfs_block_group_item item;
7793 	struct btrfs_key key;
7794 	int ret = 0;
7795 
7796 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797 				 new_bg_list) {
7798 		list_del_init(&block_group->new_bg_list);
7799 
7800 		if (ret)
7801 			continue;
7802 
7803 		spin_lock(&block_group->lock);
7804 		memcpy(&item, &block_group->item, sizeof(item));
7805 		memcpy(&key, &block_group->key, sizeof(key));
7806 		spin_unlock(&block_group->lock);
7807 
7808 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809 					sizeof(item));
7810 		if (ret)
7811 			btrfs_abort_transaction(trans, extent_root, ret);
7812 	}
7813 }
7814 
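/*
 * create the in-memory cache for a freshly allocated chunk, account its
 * space and queue it on trans->new_bgs; the on-disk item is inserted
 * later by btrfs_create_pending_block_groups().
 */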
7815 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7816 			   struct btrfs_root *root, u64 bytes_used,
7817 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7818 			   u64 size)
7819 {
7820 	int ret;
7821 	struct btrfs_root *extent_root;
7822 	struct btrfs_block_group_cache *cache;
7823 
7824 	extent_root = root->fs_info->extent_root;
7825 
7826 	root->fs_info->last_trans_log_full_commit = trans->transid;
7827 
7828 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7829 	if (!cache)
7830 		return -ENOMEM;
7831 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7832 					GFP_NOFS);
7833 	if (!cache->free_space_ctl) {
7834 		kfree(cache);
7835 		return -ENOMEM;
7836 	}
7837 
7838 	cache->key.objectid = chunk_offset;
7839 	cache->key.offset = size;
7840 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7841 	cache->sectorsize = root->sectorsize;
7842 	cache->fs_info = root->fs_info;
7843 
7844 	atomic_set(&cache->count, 1);
7845 	spin_lock_init(&cache->lock);
7846 	INIT_LIST_HEAD(&cache->list);
7847 	INIT_LIST_HEAD(&cache->cluster_list);
7848 	INIT_LIST_HEAD(&cache->new_bg_list);
7849 
7850 	btrfs_init_free_space_ctl(cache);
7851 
7852 	btrfs_set_block_group_used(&cache->item, bytes_used);
7853 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7854 	cache->flags = type;
7855 	btrfs_set_block_group_flags(&cache->item, type);
7856 
7857 	cache->last_byte_to_unpin = (u64)-1;
7858 	cache->cached = BTRFS_CACHE_FINISHED;
7859 	exclude_super_stripes(root, cache);
7860 
7861 	add_new_free_space(cache, root->fs_info, chunk_offset,
7862 			   chunk_offset + size);
7863 
7864 	free_excluded_extents(root, cache);
7865 
7866 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7867 				&cache->space_info);
7868 	BUG_ON(ret); /* -ENOMEM */
7869 	update_global_block_rsv(root->fs_info);
7870 
7871 	spin_lock(&cache->space_info->lock);
7872 	cache->space_info->bytes_readonly += cache->bytes_super;
7873 	spin_unlock(&cache->space_info->lock);
7874 
7875 	__link_block_group(cache->space_info, cache);
7876 
7877 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7878 	BUG_ON(ret); /* Logic error */
7879 
7880 	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7881 
7882 	set_avail_alloc_bits(extent_root->fs_info, type);
7883 
7884 	return 0;
7885 }
7886 
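/*
 * inverse of set_avail_alloc_bits: clear the extended profile bits of
 * @flags from the relevant avail_*_alloc_bits masks.
 */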
7887 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7888 {
7889 	u64 extra_flags = chunk_to_extended(flags) &
7890 				BTRFS_EXTENDED_PROFILE_MASK;
7891 
7892 	if (flags & BTRFS_BLOCK_GROUP_DATA)
7893 		fs_info->avail_data_alloc_bits &= ~extra_flags;
7894 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
7895 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7896 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7897 		fs_info->avail_system_alloc_bits &= ~extra_flags;
7898 }
7899 
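/*
 * remove an empty, read-only block group: drop its free space cache
 * inode and item, unlink the group from the caches and its space_info,
 * and delete the block group item from the extent tree.
 */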
7900 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7901 			     struct btrfs_root *root, u64 group_start)
7902 {
7903 	struct btrfs_path *path;
7904 	struct btrfs_block_group_cache *block_group;
7905 	struct btrfs_free_cluster *cluster;
7906 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7907 	struct btrfs_key key;
7908 	struct inode *inode;
7909 	int ret;
7910 	int index;
7911 	int factor;
7912 
7913 	root = root->fs_info->extent_root;
7914 
7915 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7916 	BUG_ON(!block_group);
7917 	BUG_ON(!block_group->ro);
7918 
7919 	/*
7920 	 * Free the reserved super bytes from this block group before
7921 	 * removing it.
7922 	 */
7923 	free_excluded_extents(root, block_group);
7924 
7925 	memcpy(&key, &block_group->key, sizeof(key));
7926 	index = get_block_group_index(block_group);
7927 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7928 				  BTRFS_BLOCK_GROUP_RAID1 |
7929 				  BTRFS_BLOCK_GROUP_RAID10))
7930 		factor = 2;
7931 	else
7932 		factor = 1;
7933 
7934 	/* make sure this block group isn't part of an allocation cluster */
7935 	cluster = &root->fs_info->data_alloc_cluster;
7936 	spin_lock(&cluster->refill_lock);
7937 	btrfs_return_cluster_to_free_space(block_group, cluster);
7938 	spin_unlock(&cluster->refill_lock);
7939 
7940 	/*
7941 	 * make sure this block group isn't part of a metadata
7942 	 * allocation cluster
7943 	 */
7944 	cluster = &root->fs_info->meta_alloc_cluster;
7945 	spin_lock(&cluster->refill_lock);
7946 	btrfs_return_cluster_to_free_space(block_group, cluster);
7947 	spin_unlock(&cluster->refill_lock);
7948 
7949 	path = btrfs_alloc_path();
7950 	if (!path) {
7951 		ret = -ENOMEM;
7952 		goto out;
7953 	}
7954 
7955 	inode = lookup_free_space_inode(tree_root, block_group, path);
7956 	if (!IS_ERR(inode)) {
7957 		ret = btrfs_orphan_add(trans, inode);
7958 		if (ret) {
7959 			btrfs_add_delayed_iput(inode);
7960 			goto out;
7961 		}
7962 		clear_nlink(inode);
7963 		/* One for the block group's ref */
7964 		spin_lock(&block_group->lock);
7965 		if (block_group->iref) {
7966 			block_group->iref = 0;
7967 			block_group->inode = NULL;
7968 			spin_unlock(&block_group->lock);
7969 			iput(inode);
7970 		} else {
7971 			spin_unlock(&block_group->lock);
7972 		}
7973 		/* One for our lookup ref */
7974 		btrfs_add_delayed_iput(inode);
7975 	}
7976 
7977 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7978 	key.offset = block_group->key.objectid;
7979 	key.type = 0;
7980 
7981 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7982 	if (ret < 0)
7983 		goto out;
7984 	if (ret > 0)
7985 		btrfs_release_path(path);
7986 	if (ret == 0) {
7987 		ret = btrfs_del_item(trans, tree_root, path);
7988 		if (ret)
7989 			goto out;
7990 		btrfs_release_path(path);
7991 	}
7992 
7993 	spin_lock(&root->fs_info->block_group_cache_lock);
7994 	rb_erase(&block_group->cache_node,
7995 		 &root->fs_info->block_group_cache_tree);
7996 	spin_unlock(&root->fs_info->block_group_cache_lock);
7997 
7998 	down_write(&block_group->space_info->groups_sem);
7999 	/*
8000 	 * we must use list_del_init so people can check to see if they
8001 	 * are still on the list after taking the semaphore
8002 	 */
8003 	list_del_init(&block_group->list);
8004 	if (list_empty(&block_group->space_info->block_groups[index]))
8005 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
8006 	up_write(&block_group->space_info->groups_sem);
8007 
8008 	if (block_group->cached == BTRFS_CACHE_STARTED)
8009 		wait_block_group_cache_done(block_group);
8010 
8011 	btrfs_remove_free_space_cache(block_group);
8012 
8013 	spin_lock(&block_group->space_info->lock);
8014 	block_group->space_info->total_bytes -= block_group->key.offset;
8015 	block_group->space_info->bytes_readonly -= block_group->key.offset;
8016 	block_group->space_info->disk_total -= block_group->key.offset * factor;
8017 	spin_unlock(&block_group->space_info->lock);
8018 
8019 	memcpy(&key, &block_group->key, sizeof(key));
8020 
8021 	btrfs_clear_space_info_full(root->fs_info);
8022 
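	/*
	 * two puts: one for the reference the rb-tree held on the group
	 * (erased above) and one for the lookup reference taken at the top
	 * of this function.
	 */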
8023 	btrfs_put_block_group(block_group);
8024 	btrfs_put_block_group(block_group);
8025 
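	/* the block group item must still exist here; treat not finding it
	 * (ret > 0) as an error */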
8026 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8027 	if (ret > 0)
8028 		ret = -EIO;
8029 	if (ret < 0)
8030 		goto out;
8031 
8032 	ret = btrfs_del_item(trans, root, path);
8033 out:
8034 	btrfs_free_path(path);
8035 	return ret;
8036 }
8037 
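/*
 * set up the initial empty space_infos: system, plus either one mixed
 * metadata+data space_info or separate metadata and data ones, depending
 * on the MIXED_GROUPS incompat flag.
 */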
8038 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8039 {
8040 	struct btrfs_space_info *space_info;
8041 	struct btrfs_super_block *disk_super;
8042 	u64 features;
8043 	u64 flags;
8044 	int mixed = 0;
8045 	int ret;
8046 
8047 	disk_super = fs_info->super_copy;
8048 	if (!btrfs_super_root(disk_super))
8049 		return 1;
8050 
8051 	features = btrfs_super_incompat_flags(disk_super);
8052 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8053 		mixed = 1;
8054 
8055 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
8056 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8057 	if (ret)
8058 		goto out;
8059 
8060 	if (mixed) {
8061 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8062 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8063 	} else {
8064 		flags = BTRFS_BLOCK_GROUP_METADATA;
8065 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8066 		if (ret)
8067 			goto out;
8068 
8069 		flags = BTRFS_BLOCK_GROUP_DATA;
8070 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8071 	}
8072 out:
8073 	return ret;
8074 }
8075 
8076 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8077 {
8078 	return unpin_extent_range(root, start, end);
8079 }
8080 
8081 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8082 			       u64 num_bytes, u64 *actual_bytes)
8083 {
8084 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8085 }
8086 
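/*
 * backend for the fstrim ioctl: walk the block groups overlapping the
 * requested range, make sure each is cached, discard its free space and
 * return the total trimmed bytes in range->len.
 */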
8087 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8088 {
8089 	struct btrfs_fs_info *fs_info = root->fs_info;
8090 	struct btrfs_block_group_cache *cache = NULL;
8091 	u64 group_trimmed;
8092 	u64 start;
8093 	u64 end;
8094 	u64 trimmed = 0;
8095 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8096 	int ret = 0;
8097 
8098 	/*
8099 	 * try to trim all FS space; our first block group may start at a non-zero offset.
8100 	 */
8101 	if (range->len == total_bytes)
8102 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
8103 	else
8104 		cache = btrfs_lookup_block_group(fs_info, range->start);
8105 
8106 	while (cache) {
8107 		if (cache->key.objectid >= (range->start + range->len)) {
8108 			btrfs_put_block_group(cache);
8109 			break;
8110 		}
8111 
8112 		start = max(range->start, cache->key.objectid);
8113 		end = min(range->start + range->len,
8114 				cache->key.objectid + cache->key.offset);
8115 
8116 		if (end - start >= range->minlen) {
8117 			if (!block_group_cache_done(cache)) {
8118 				ret = cache_block_group(cache, NULL, root, 0);
8119 				if (!ret)
8120 					wait_block_group_cache_done(cache);
8121 			}
8122 			ret = btrfs_trim_block_group(cache,
8123 						     &group_trimmed,
8124 						     start,
8125 						     end,
8126 						     range->minlen);
8127 
8128 			trimmed += group_trimmed;
8129 			if (ret) {
8130 				btrfs_put_block_group(cache);
8131 				break;
8132 			}
8133 		}
8134 
8135 		cache = next_block_group(fs_info->tree_root, cache);
8136 	}
8137 
8138 	range->len = trimmed;
8139 	return ret;
8140 }
8141